Package featuregenerator :: Module languagefeaturegenerator
[hide private]
[frames] | no frames]

Source Code for Module featuregenerator.languagefeaturegenerator

  1  """ 
  2   
  3  @author: Eleftherios Avramidis 
  4  """ 
  5  from copy import deepcopy 
  6  from sentence.parallelsentence import ParallelSentence 
  7  from sentence.dataset import DataSet 
  8  from dataprocessor.input.xmlreader import XmlReader 
  9  from dataprocessor.output.xmlwriter import XmlWriter 
 10  from featuregenerator import FeatureGenerator 
 11  #from abc import ABCMeta 
 12  from sys import stderr 
 13   
class LanguageFeatureGenerator(FeatureGenerator):
    """
    Extends the base FeatureGenerator class, by providing basic
    checking/functioning for language-specific feature processes.
    This way, this class can be inherited and extended for feature categories
    that can only correspond to a particular language, specified upon the
    initialization of the object.

    @ivar lang: the language abbreviation code
    @type lang: str
    """

    def __init__(self, lang):
        """
        In order to initialize a language-specific feature generator, the
        language needs to be stored on the instance.

        @param lang: the language code of the language that the feature
            generator is capable of
        @type lang: str
        """
        self.lang = lang

    # __metaclass__ = ABCMeta

    def get_features_src(self, simplesentence, parallelsentence):
        """
        Falls back to the general simple sentence feature generation, only if
        the source language of the parallel sentence is the language supported
        by this feature generator.

        @param simplesentence: the source simple sentence to be processed
        @param parallelsentence: the parallel sentence containing it; its
            "langsrc" attribute is compared against self.lang
        @return: the result of get_features_simplesentence if the language
            matches, otherwise an empty dict
        """
        # TODO: make this format independent by adding it as an attribute
        # of the sentence objects
        src_lang = parallelsentence.get_attribute("langsrc")
        if src_lang == self.lang:
            return self.get_features_simplesentence(simplesentence, parallelsentence)
        return {}

    def get_features_tgt(self, simplesentence, parallelsentence):
        """
        Falls back to the general simple sentence feature generation, only if
        the target language of the parallel sentence is the language supported
        by this feature generator.

        @param simplesentence: the target simple sentence to be processed
        @param parallelsentence: the parallel sentence containing it; its
            "langtgt" attribute is compared against self.lang
        @return: the result of get_features_simplesentence if the language
            matches, otherwise an empty dict
        """
        # TODO: make this format independent by adding it as an attribute
        # of the sentence objects
        tgt_lang = parallelsentence.get_attribute("langtgt")
        if tgt_lang == self.lang:
            return self.get_features_simplesentence(simplesentence, parallelsentence)
        return {}

    def get_features_simplesentence(self, simplesentence, parallelsentence):
        """
        Default implementation that generates features from the plain string
        of the given simple sentence, by delegating to get_features_string.
        It may be overridden by a feature generator that needs more than the
        string, and it does not differentiate between source and target
        sentences.

        @param simplesentence: the simple sentence to be processed; its
            get_string() result is passed to get_features_string
        @param parallelsentence: the parallel sentence containing it (not
            used by this default implementation)
        @return: whatever get_features_string returns for the sentence string
        """
        return self.get_features_string(simplesentence.get_string())

    def add_features_dataset(self, dataset):
        """
        Augments the provided DataSet with features of the current feature
        generator. It fires feature generation over the included
        parallelsentences it is composed of.
        It is not compatible with SAX parsing.

        @param dataset: The DataSet whose contents will be augmented
        @type dataset: sentence.dataset.DataSet
        @rtype: sentence.dataset.DataSet
        @return: A DataSet built from the parallel sentences returned by
            add_features_batch
        """
        import sys

        parallelsentences = dataset.get_parallelsentences()
        # The default add_features_batch builds and returns a new list, so
        # its return value has to be used; otherwise the generated features
        # would be dropped.
        augmented = self.add_features_batch(parallelsentences)
        if augmented is None:
            # Tolerate overrides that modify the list in place and return
            # nothing.
            augmented = parallelsentences
        # Progress marker on stdout, written in a way that parses on both
        # Python 2 and Python 3.
        sys.stdout.write(". ")
        return DataSet(augmented)

    def add_features_batch(self, parallelsentences):
        """
        Method that may be overridden by the particular subclassed feature
        generator. It allows the generation of features over many
        parallelsentences.
        It is a flexible solution when feature generation doesn't take place
        item to item (see SAX parsing) but a whole list of parallel sentences
        needs to be processed at once. In this case, the feature generator may
        optimize better when the whole dataset is given.

        @param parallelsentences: The parallel sentences to be augmented
        @type parallelsentences: list(sentence.parallelsentence.ParallelSentence)
        @rtype: list(sentence.parallelsentence.ParallelSentence)
        @return: A new list holding the result of
            add_features_parallelsentence for every given parallel sentence
        """
        # Default behaviour, if not overridden: process the sentences one by
        # one. add_features_parallelsentence is not defined in this class;
        # presumably it is inherited from FeatureGenerator.
        return [
            self.add_features_parallelsentence(parallelsentence)
            for parallelsentence in parallelsentences
        ]

    def get_features_string(self, string):
        """
        Abstract method to be overridden by the particular subclassed feature
        generator, in order to generate features out of a plain string.

        @param string: the string of the sentence to be processed
        @raise NotImplementedError: always, unless overridden by a subclass
        """
        raise NotImplementedError(
            "%s does not implement get_features_string" % self.__class__.__name__
        )

    # TODO: remove this, as it breaks architecture
    def add_features_batch_xml(self, filename_in, filename_out):
        """
        Reads parallel sentences from an XML file, augments them through
        add_features_batch and writes the result to another XML file.

        @param filename_in: passed to XmlReader as the input file
        @param filename_out: passed to XmlWriter.write_to_file as the output file
        """
        reader = XmlReader(filename_in)
        parallelsentences = reader.get_parallelsentences()
        parallelsentences = self.add_features_batch(parallelsentences)
        # Drop the reference to the reader before writing (presumably to let
        # it be garbage-collected).
        reader = None
        writer = XmlWriter(parallelsentences)
        writer.write_to_file(filename_out)