Package featuregenerator :: Module featuregenerator
[hide private]
[frames | no frames]

Source Code for Module featuregenerator.featuregenerator

  1  """ 
  2   
  3  @author: Eleftherios Avramidis 
  4  """ 
  5  from copy import deepcopy 
  6  from sentence.parallelsentence import ParallelSentence 
  7  from sentence.dataset import DataSet 
  8  from dataprocessor.input.xmlreader import XmlReader 
  9  from dataprocessor.output.xmlwriter import XmlWriter 
 10  #from abc import ABCMeta 
 11  from sys import stderr 
 12   
class FeatureGenerator(object):
    """
    A base feature generator class, with no particular functioning.
    It just provides basic feature generator functions to be inherited (or overridden) by specific feature generators.
    If you want to code a new FeatureGenerator, it must inherit this class and override one or more of the methods
    get_features_src, get_features_tgt, get_features_simplesentence, get_features_parallelsentence, get_features_string.

    The add_features_* methods are wrappers: they call the corresponding get_features_* method, which returns
    a dictionary of features, and attach that dictionary to a copy of the given object.
    """

    def add_features_parallelsentence(self, parallelsentence):
        """
        Augments the provided ParallelSentence with features of the current feature generator.
        It fires feature generation functions over the included simplesentences it is composed of.
        @param parallelsentence: The ParallelSentence whose contents will be augmented
        @type parallelsentence: sentence.parallelsentence.ParallelSentence
        @rtype: sentence.parallelsentence.ParallelSentence
        @return: A new ParallelSentence built from the augmented source, translations and reference,
        carrying the attributes of the given one plus the features of get_features_parallelsentence
        """
        src = self.add_features_src(parallelsentence.get_source(), parallelsentence)
        tgt = [self.add_features_tgt(tgt_item, parallelsentence)
               for tgt_item in parallelsentence.get_translations()]

        reference = parallelsentence.get_reference()
        try:
            ref = self.add_features_tgt(reference, parallelsentence)
        except Exception:
            # Best effort: if features cannot be generated for the reference
            # (presumably because the sentence has none), keep it exactly as
            # get_reference() returned it. Only Exception is caught, so that
            # KeyboardInterrupt and SystemExit are not silently swallowed.
            ref = reference

        # recreate the parallelsentence with the augmented contents
        augmented = ParallelSentence(src, tgt, ref, parallelsentence.get_attributes())
        # add the global features, computed over the recreated parallelsentence
        augmented.add_attributes(self.get_features_parallelsentence(augmented))
        return augmented

    def add_features_src(self, simplesentence, parallelsentence=None):
        """
        Gets a source SimpleSentence and (optionally) its corresponding ParallelSentence and returns a SimpleSentence with the generated features.
        Operates as a wrapper around the get_features_src method, which returns a dictionary with the generated features.
        The given source SimpleSentence is deep-copied, so the object passed in is left unmodified.
        @param simplesentence: The source sentence of a ParallelSentence
        @type simplesentence: sentence.sentence.SimpleSentence
        @param parallelsentence: The parallelsentence containing the given source sentence. Can be omitted if the subclassed feature generator doesn't require the parallelsentence in order to deliver source features
        @type parallelsentence: sentence.parallelsentence.ParallelSentence
        @rtype: sentence.sentence.SimpleSentence
        @return: A source SimpleSentence object similar to the one given, but now containing the generated features.
        """
        simplesentence = deepcopy(simplesentence)
        simplesentence.add_attributes(self.get_features_src(simplesentence, parallelsentence))
        return simplesentence

    def add_features_tgt(self, simplesentence, parallelsentence=None):
        """
        Gets a target SimpleSentence and (optionally) its corresponding ParallelSentence and returns a SimpleSentence with the generated features.
        Operates as a wrapper around the get_features_tgt method, which returns a dictionary with the generated features.
        The given target SimpleSentence is deep-copied, so the object passed in is left unmodified.
        @param simplesentence: The target sentence of a ParallelSentence
        @type simplesentence: sentence.sentence.SimpleSentence
        @param parallelsentence: The parallelsentence containing the given target sentence. Can be omitted if the subclassed feature generator doesn't require the parallelsentence in order to deliver target features
        @type parallelsentence: sentence.parallelsentence.ParallelSentence
        @rtype: sentence.sentence.SimpleSentence
        @return: A target SimpleSentence object similar to the one given, but now containing the generated features.
        """
        simplesentence = deepcopy(simplesentence)
        simplesentence.add_attributes(self.get_features_tgt(simplesentence, parallelsentence))
        return simplesentence

    def add_features_simplesentence(self, simplesentence, parallelsentence=None):
        """
        Works as a generalized method covering the functionality of both add_features_src and add_features_tgt.
        It gets a SimpleSentence of any origin (source/target etc.) and (optionally) its corresponding ParallelSentence and returns a SimpleSentence with the generated features.
        Operates as a wrapper around the get_features_simplesentence method, which returns a dictionary with the generated features.
        The given SimpleSentence is deep-copied, so the object passed in is left unmodified.
        @param simplesentence: A simplesentence
        @type simplesentence: sentence.sentence.SimpleSentence
        @param parallelsentence: The parallelsentence containing the given SimpleSentence. Can be omitted if the subclassed feature generator doesn't require the parallelsentence in order to deliver simplesentence features
        @type parallelsentence: sentence.parallelsentence.ParallelSentence
        @rtype: sentence.sentence.SimpleSentence
        @return: A SimpleSentence object similar to the one given, but now containing the generated features.
        """
        simplesentence = deepcopy(simplesentence)
        simplesentence.add_attributes(self.get_features_simplesentence(simplesentence, parallelsentence))
        return simplesentence

    def get_features_parallelsentence(self, parallelsentence):
        """
        Abstract method to be overridden by the particular subclassed feature generator.
        It receives a parallel sentence and returns a dictionary of features that globally describe the parallel sentence itself.
        Features that describe source or target sentence etc should be added in functions get_features_src and get_features_tgt.
        Implementation here provides an empty dictionary, in case subclassed feature generator doesn't provide any features.
        @rtype: dict
        @return: An empty dictionary
        """
        return {}

    def get_features_src(self, simplesentence, parallelsentence):
        """
        Abstract method to be overridden by the particular subclassed feature generator.
        It receives a source simple sentence and returns a dictionary of source features.
        Implementation here falls back to the get_features_simplesentence function, for feature generators that don't differentiate between source and target features.
        """
        return self.get_features_simplesentence(simplesentence, parallelsentence)

    def get_features_tgt(self, simplesentence, parallelsentence):
        """
        Abstract method to be overridden by the particular subclassed feature generator.
        It receives a target simple sentence and returns a dictionary of target features.
        Implementation here falls back to the get_features_simplesentence function, for feature generators that don't differentiate between source and target features.
        """
        return self.get_features_simplesentence(simplesentence, parallelsentence)

    def get_features_simplesentence(self, simplesentence, parallelsentence):
        """
        Abstract method to be overridden by the particular subclassed feature generator.
        It receives a simple sentence of any type and returns a dictionary of features.
        It should be overridden by a feature generator that doesn't differentiate between source and target features.
        Implementation here passes the "string" attribute of the sentence to get_features_string.
        """
        return self.get_features_string(simplesentence.string)

    def get_features_string(self, string):
        """
        Method to be overridden by feature generators that only need the plain string of a sentence.
        Implementation here provides an empty dictionary.
        @param string: The string of a sentence
        @rtype: dict
        @return: An empty dictionary
        """
        return {}

    def add_features_dataset(self, dataset):
        """
        Augments the provided DataSet with features of the current feature generator.
        It fires feature generation over the included parallelsentences it is composed of.
        It is not compatible with SAX parsing.
        @param dataset: The DataSet whose contents will be augmented
        @type dataset: sentence.dataset.DataSet
        @rtype: sentence.dataset.DataSet
        @return: A new DataSet holding the parallelsentences augmented with features generated from the current featuregenerator
        """
        # Local import: only this method needs the live stdout stream.
        import sys

        parallelsentences = self.add_features_batch(dataset.get_parallelsentences())
        # Progress indicator on stdout, one dot per processed dataset. Written
        # through the stream so that it works on both Python 2 and Python 3.
        sys.stdout.write(". ")
        return DataSet(parallelsentences)

    def add_features_batch(self, parallelsentences):
        """
        Abstract method to be overridden by the particular subclassed feature generator.
        It allows the generation of features over many parallelsentences.
        It is a flexible solution when feature generation doesn't take place item to item (see SAX parsing) but a whole list of parallel sentences needs
        to be processed at once. In this case, feature generator may optimize better when the whole dataset is given.
        @param parallelsentences: The parallel sentences to be augmented
        @type parallelsentences: list(sentence.parallelsentence.ParallelSentence)
        @rtype: list(sentence.parallelsentence.ParallelSentence)
        @return: A list with the augmented version of every given ParallelSentence, in the same order
        """
        # Default behaviour, if not overridden: process the sentences one by one
        return [self.add_features_parallelsentence(parallelsentence)
                for parallelsentence in parallelsentences]

    def get_annotation_name(self):
        """
        Provides a name describing the set of features that each particular annotator added.
        If not overridden, generates a name out of the class name: a trailing "FeatureGenerator"
        is stripped and the remainder is lowercased; any other class name is returned unchanged.
        Note that for the base class itself this yields an empty string.
        @return: the name of the annotation
        @rtype: string
        """
        suffix = "FeatureGenerator"
        name = self.__class__.__name__
        if name.endswith(suffix):
            name = name[:-len(suffix)].lower()
        return name

    def process_dataset(self, dataset):
        """
        Alias of add_features_dataset.
        @param dataset: The DataSet whose contents will be augmented
        @type dataset: sentence.dataset.DataSet
        @rtype: sentence.dataset.DataSet
        @return: The result of add_features_dataset for the given dataset
        """
        return self.add_features_dataset(dataset)
190