1 """
2
3 @author: Eleftherios Avramidis
4 """
5 from copy import deepcopy
6 from sentence.parallelsentence import ParallelSentence
7 from sentence.dataset import DataSet
8 from dataprocessor.input.xmlreader import XmlReader
9 from dataprocessor.output.xmlwriter import XmlWriter
10 from featuregenerator import FeatureGenerator
11
12 from sys import stderr
13
15 """
16 Extends the base FeatureGenerator class, by providing basic checking/functioning for language-specific feature processes.
17 This way, this class can be inherited and extended for feature categories that can only correspond to a particular language
18 specified upon the initialization of the object.
19 @ivar lang: the language abbreviation code
20 @type lang: str
21 """
23 """
24 In order to initialize a language-specific feature generator, the language needs to be stored as an instance variable
25 @param lang: the language code of the language that the feature generator is capable of
26 @type lang: string
27 """
28 self.lang = lang
29
30
31
43
54
56 """
57 Abstract method to be overridden by the particular subclassed feature generator.
58 It receives a simple sentence of any type and returns a list of features.
59 It should be overridden by a feature generator that doesn't differentiate between source and target features
60 """
61
62
63 return self.get_features_string(simplesentence.get_string())
64
65
67 """
68 Augments the provided DataSet with features of the current feature generator.
69 It triggers feature generation over the parallelsentences that the DataSet is composed of.
70 It is not compatible with SAX parsing.
71 @param dataset: The DataSet whose contents will be augmented
72 @type dataset: sentence.dataset.DataSet
73 @rtype: sentence.dataset.DataSet
74 @return: The given DataSet augmented with features generated from the current featuregenerator
75 """
76 parallelsentences = dataset.get_parallelsentences()
77
78 self.add_features_batch(parallelsentences)
79 print ".",
80 return DataSet(parallelsentences)
81
83 """
84 Abstract method to be overridden by the particular subclassed feature generator.
85 It allows the generation of features over many parallelsentences.
86 It is a flexible solution when feature generation doesn't take place item to item (see SAX parsing) but a whole list of parallel sentences needs
87 to be processed at once. In this case, the feature generator may optimize better when the whole dataset is given.
88 @param parallelsentences: The parallel sentences to be augmented
89 @type parallelsentences: list(sentence.parallelsentence.ParallelSentence)
90 @rtype: list(sentence.parallelsentence.ParallelSentence)
91 @return: The given list of ParallelSentence which are now augmented with features generated from the current featuregenerator
92 """
93
94 parallelsentences = [self.add_features_parallelsentence(parallelsentence) for parallelsentence in parallelsentences]
95
96 return parallelsentences
97
98
100 raise NotImplementedError
101
102
103
111