featuregenerator.parser.berkeley.parsermatches

'''
Tag-correspondence tables for the parse-match features.

``mapping`` is keyed by a (source language, target language) pair.  Each
value is a list of (source tags, target tags) tuples: the node labels on
the source side that are treated as corresponding to the node labels on
the target side.
'''
mapping = {
    ("de", "en"): [
        (["NP"], ["NP"]),
        (["VP", "VZ"], ["VP"]),
        (["VVFIN", "VAFIN", "VMFIN", "VAINF", "VVINF", "VVPP"],
         ["VB", "VBZ", "VBP", "VBN", "VBG"]),
        (["NN", "NE"], ["NN", "NNP", "NNS"]),
        (["PP"], ["PP"]),
        (["$."], ["."]),
        (["$,"], [","]),
    ],
    ("en", "fr"): [
        (["S", "SQ"], ["SENT", "Sint"]),
        (["SBAR"], ["Srel", "Ssub"]),
        (["NP"], ["NP"]),
        (["VP"], ["VP", "VN", "VPinf", "VPpart"]),
        (["VB", "VBZ", "VBP", "VBN", "VBG"], ["V"]),
        (["NN", "NNP", "NNS"], ["N"]),
        (["PP"], ["PP"]),
        (["ADVP"], ["AdP"]),
        (["PRP"], ["CL"]),
        (["DT", "PRP$"], ["D"]),
        (["RB"], ["ADV"]),
        (["JJ"], ["A"]),
        (["."], ["."]),
        ([","], [","]),
    ],
    ("de", "fr"): [
        (["NP"], ["NP"]),
        (["S"], ["SENT", "Srel", "Ssub"]),
        (["ART"], ["D"]),
        (["VP", "VZ"], ["VP", "VPinf"]),
        (["VVFIN", "VAFIN", "VMFIN", "VAINF", "VVINF", "VVPP"], ["V"]),
        (["NN", "NE"], ["N"]),
        (["PP"], ["PP"]),
        (["$."], ["."]),
        (["$,"], [","]),
    ],
    ("es", "en"): [
        (["sn"], ["NP"]),
        (["grup.verb"], ["VP"]),
        (["S"], ["S"]),
        (["v"], ["VB", "VBZ", "VBP", "VBN", "VBG"]),
        (["n"], ["NN", "NNP", "NNS"]),
        (["sp"], ["PP"]),
        (["pu"], ["."]),
        (["conj"], ["CC"]),
        (["a"], ["JJ"]),
        (["d"], ["DT", "PRP$"]),
        ([","], [","]),
    ],
}

65 - def __init__(self, langpair=("de","en")):

66 ''' 67 Constructor 68 ''' 69 #reverse mappings as well 70 reversed_mapping = {} 71 for (source_language, target_language), mapping in self.mapping.iteritems(): 72 73 reversed_mapping[(target_language, source_language)] = [(target_mapping, source_mapping) for (source_mapping, target_mapping) in mapping] 74 75 self.mapping.update(reversed_mapping) 76 self.mappings = self.mapping[langpair]

77 78 79

80 - def _count_nodetags(self, treestring="", taglist=[]):

81 match_count = 0 82 match_pos = [] 83 labels = treestring.split() 84 for parse_tag in taglist: 85 parse_tag = "(%s" %parse_tag #get the bracket so that you can search in the parse string 86 match_count += labels.count(parse_tag) 87 for pos, label in enumerate(labels, start=1): 88 if parse_tag == label: 89 match_pos.append(pos) 90 if not match_pos: 91 match_pos = [0] 92 return match_count, match_pos

93

def get_features_src(self, simplesentence, parallelsentence):
    '''
    Build the parse-match attributes for a source sentence.

    For each (source tags, target tags) entry in ``self.mappings`` the
    source tags are counted in the sentence's "berkeley-tree" attribute.
    Three attributes are written per entry, named after the canonicalized
    first source tag: the count, and the average and standard deviation
    of the match positions.  ``average`` and ``std`` are module-level
    names imported outside this block (presumably numpy's -- confirm).

    @param simplesentence: object offering get_attribute("berkeley-tree")
    @param parallelsentence: not used by this method
    @return: dict of attribute name -> string value; {} if the tree
             attribute cannot be read
    '''
    try:
        src_parse = simplesentence.get_attribute("berkeley-tree")
    except Exception:
        # Was a bare except, which also swallowed KeyboardInterrupt/SystemExit.
        # The call form of print behaves the same on Python 2 and 3 here.
        print("error reading berkeley tree")
        return {}

    attributes = {}
    for (src_map, _) in self.mappings:
        src_map_count, src_map_pos = self._count_nodetags(src_parse, src_map)
        src_label = self._canonicalize(src_map[0])
        attributes["parse-%s" % src_label] = str(src_map_count)
        attributes["parse-%s-pos-avg" % src_label] = str(average(src_map_pos))
        attributes["parse-%s-pos-std" % src_label] = str(std(src_map_pos))
    return attributes

108 109

def get_features_tgt(self, simplesentence, parallelsentence):
    '''
    Build the parse-match attributes for a target sentence.

    For each (source tags, target tags) entry in ``self.mappings`` the
    target tags are counted in the target's "berkeley-tree" attribute.
    Attributes are only produced when both the target tree and the tree
    of ``parallelsentence.get_source()`` are available and non-empty.
    ``average`` and ``std`` are module-level names imported outside this
    block (presumably numpy's -- confirm).

    @param simplesentence: target sentence offering
                           get_attribute("berkeley-tree")
    @param parallelsentence: object whose get_source() returns the source
                             sentence
    @return: dict of attribute name -> string value; {} if either tree is
             missing or empty
    '''
    attributes = {}
    # A tree that cannot be read is treated as empty (best effort).
    # Exception rather than a bare except lets KeyboardInterrupt and
    # SystemExit propagate.
    try:
        tgt_parse = simplesentence.get_attribute("berkeley-tree")
    except Exception:
        tgt_parse = ""
    try:
        src_parse = parallelsentence.get_source().get_attribute("berkeley-tree")
    except Exception:
        src_parse = ""

    if not (tgt_parse and src_parse):
        return attributes

    for (src_map, tgt_map) in self.mappings:
        tgt_map_count, tgt_map_pos = self._count_nodetags(tgt_parse, tgt_map)
        # The attribute name uses the *source* tag, the same key
        # get_features_src writes, although the counts are of target tags.
        tgt_label = self._canonicalize(src_map[0])
        attributes["parse-%s" % tgt_label] = str(tgt_map_count)
        attributes["parse-%s-pos-avg" % tgt_label] = str(average(tgt_map_pos))
        attributes["parse-%s-pos-std" % tgt_label] = str(std(tgt_map_pos))
    return attributes

135 136

137 - def _canonicalize(self, string):

138 string = string.replace("$." , "dot").replace("$," , "comma") 139 string = string.replace(".", "dot").replace("," , "comma") 140 return string

Source Code for Module featuregenerator.parser.berkeley.parsermatches