1 '''
2 Created on 22 March 2011
3
4 @author: Eleftherios Avramidis
5 '''
6
7 from featuregenerator.languagefeaturegenerator import LanguageFeatureGenerator
8 from numpy import average, std
9
10
12 '''
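13 Builds parse-tree features: for every tag group of the chosen language pair, the number of matching nodes in a sentence's Berkeley parse plus the average and standard deviation of the values returned together with that count.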
14 '''
# Tag-correspondence table. Keys are two-element tuples of language codes;
# the constructor's default argument, ("de", "en"), is one of these keys
# (presumably (source language, target language) -- confirm).
# Each value is a list of pairs of tag lists. The loops further down unpack
# every pair as (src_map, tgt_map): the first list holds the labels looked up
# for the first language, the second list the corresponding labels of the
# second language. The first tag of the first list is also used to build the
# feature name, so the order inside each list matters.
15 mapping = {}
# ("de", "en") pair.
16 mapping[("de","en")] = [(["NP"], ["NP"]),
17 (["VP", "VZ"], ["VP"]),
18 (["VVFIN", "VAFIN", "VMFIN", "VAINF", "VVINF" ,"VVPP" ], ["VB", "VBZ", "VBP", "VBN", "VBG" ]),
19 (["NN", "NE"], ["NN", "NNP", "NNS"]),
20 (["PP"], ["PP"]),
21 (["$."], ["."]),
22 (["$,"], [","])]
23
# ("en", "fr") pair.
24 mapping[("en","fr")] = [(["S", "SQ"], ["SENT", "Sint"]),
25 (["SBAR"], ["Srel", "Ssub"]),
26 (["NP"], ["NP"]),
27 (["VP"], [ "VP", "VN", "VPinf", "VPpart" ]),
28 (["VB", "VBZ", "VBP", "VBN", "VBG" ], ["V"]),
29 (["NN", "NNP", "NNS"], ["N"] ),
30 (["PP"], ["PP"]),
31 (["ADVP"] , ["AdP"]),
32 (["PRP"], ["CL"]),
33 (["DT", "PRP$"], ["D"]),
34 (["RB"], ["ADV"]),
35 (["JJ"], ["A"]),
36 (["."], ["."]),
37 ([","], [","])]
38
# ("de", "fr") pair.
39 mapping[("de","fr")] = [(["NP"], ["NP"]),
40 (["S"], ["SENT", "Srel", "Ssub"]),
41 (["ART"], ["D"]),
42 (["VP", "VZ"], ["VP", "VPinf"]),
43 (["VVFIN", "VAFIN", "VMFIN", "VAINF", "VVINF" ,"VVPP" ], ["V"]),
44 (["NN", "NE"], ["N"] ),
45 (["PP"], ["PP"]),
46 (["$."], ["."]),
47 (["$,"], [","])]
48
# ("es", "en") pair.
49 mapping[("es","en")] = [(["sn"], ["NP"]),
50 (["grup.verb"], ["VP"]),
51 (["S"], ["S"]),
52 (["v" ], ["VB", "VBZ", "VBP", "VBN", "VBG" ]),
53 (["n"], ["NN", "NNP", "NNS"]),
54 (["sp"], ["PP"]),
55 (["pu"], ["."]),
56 (["conj"], ["CC"]),
57 (["a"], ["JJ"]),
58 (["d"], ["DT", "PRP$"]),
59 ([","], [","])]
60
61
62
63
64
# Constructor. `langpair` is a tuple of two language codes and defaults to
# ("de", "en"). The body is not part of this excerpt; the loops below iterate
# over self.mappings, which is presumably assigned here from
# mapping[langpair] -- confirm against the full file.
65 - def __init__(self, langpair=("de","en")):
77
78
79
93
# Body of a method whose `def` line is missing from this excerpt. It reads
# `simplesentence` and `self.mappings` and returns a dict of string-valued
# attributes computed from the sentence's "berkeley-tree" attribute.
95 attributes = {}
96 try:
97 src_parse = simplesentence.get_attribute("berkeley-tree")
# NOTE(review): a bare except hides every failure, not only a missing
# attribute; the message below is printed for all of them.
98 except:
99 print "error reading berkeley tree"
# Without a parse no features can be produced.
100 return {}
# Only the first tag list of each pair (src_map) is used here; tgt_map is
# unpacked but ignored.
101 for (src_map, tgt_map) in self.mappings:
# _count_nodetags is defined outside this excerpt. It is unpacked into a count
# and a second value that is averaged below (presumably the positions of the
# matching nodes -- confirm).
102 src_map_count, src_map_pos = self._count_nodetags(src_parse, src_map)
# The feature name is derived from the first tag of the group.
103 src_label = self._canonicalize(src_map[0])
104 attributes["parse-%s" % src_label] = str(src_map_count)
# NOTE(review): if src_map_pos can be empty, numpy's average/std yield nan
# (with a RuntimeWarning), so the stored strings would be "nan" -- confirm
# that this is intended.
105 attributes["parse-%s-pos-avg" % src_label] = str(average(src_map_pos))
106 attributes["parse-%s-pos-std" % src_label] = str(std(src_map_pos))
107 return attributes
108
109
# Body of a second method whose `def` line is missing from this excerpt. It
# reads `simplesentence`, `parallelsentence` and `self.mappings`. Features are
# produced only when both this sentence and the source sentence of
# `parallelsentence` provide a truthy "berkeley-tree" attribute; otherwise the
# returned dict is empty.
111 attributes = {}
# NOTE(review): both bare excepts below turn any failure into an empty parse,
# which silently disables the features.
112 try:
113 tgt_parse = simplesentence.get_attribute("berkeley-tree")
114 except:
115 tgt_parse = ""
116 try:
117 src_parse = parallelsentence.get_source().get_attribute("berkeley-tree")
118 except:
119 src_parse = ""
120
# src_parse is only tested for truthiness; the counts come from tgt_parse alone.
121 if tgt_parse and src_parse:
122 for (src_map, tgt_map) in self.mappings:
123
124
# _count_nodetags is defined outside this excerpt; here it is given the second
# tag list of the pair (tgt_map).
125 tgt_map_count, tgt_map_pos = self._count_nodetags(tgt_parse, tgt_map)
# NOTE(review): the label is built from src_map[0], not tgt_map[0] --
# presumably so that both sides of a sentence pair share feature names;
# confirm that this is deliberate.
126 tgt_label = self._canonicalize(src_map[0])
127 attributes["parse-%s" % tgt_label] = str(tgt_map_count)
# NOTE(review): if tgt_map_pos can be empty, numpy's average/std yield nan
# (with a RuntimeWarning) -- confirm that "nan" values are acceptable.
128 attributes["parse-%s-pos-avg" % tgt_label] = str(average(tgt_map_pos))
129 attributes["parse-%s-pos-std" % tgt_label] = str(std(tgt_map_pos))
130
131
132
133
134 return attributes
135
136
# Body of a helper whose `def` line is missing from this excerpt (presumably
# the _canonicalize method called above with a single tag). It spells out
# punctuation in `string` as words and returns the result, which the callers
# interpolate into attribute names such as "parse-%s".
# The "$."/"$," forms are handled first so that the "$" is removed as well.
138 string = string.replace("$." , "dot").replace("$," , "comma")
# Every remaining "." or "," is replaced too, including one inside a longer
# tag (e.g. "grup.verb" becomes "grupdotverb").
139 string = string.replace(".", "dot").replace("," , "comma")
140 return string
141