support.preprocessing.jcml.wmt12decodingfeatures2jcml

17 files = [(int(re.findall("(.*)\.sgml", f)[0]), f) for f in os.listdir(path) if f.endswith("sgml")] 18 atts_vector = [] 19 for sentence_id, filename in sorted(files): 20 filename = os.path.join(path, filename) 21 print filename 22 file = open(filename, 'r') 23 24 file_content = file.read() 25 file.close() 26 27 #SCORES 28 pattern = "SCORES \(UNWEIGHTED/WEIGHTED\): d: ([\d\-\.]*) w: ([\d\-\.]*) u: ([\d\-\.]*) d: ([\d\-\.]*) ([\d\-\.]*) ([\d\-\.]*) ([\d\-\.]*) ([\d\-\.]*) ([\d\-\.]*) lm: ([\d\-\.]*) tm: ([\d\-\.]*) ([\d\-\.]*) ([\d\-\.]*) ([\d\-\.]*) ([\d\-\.]*)" 29 values = re.findall(pattern, file_content)[0] 30 [score_d, score_w, score_u, score_d1, score_d2, score_d3, score_d4, score_d5, score_d6, score_lm, score_tm1, score_tm2, score_tm3, score_tm4, score_tm5] = values 31 32 sentence_atts = dict(score_d = score_d, score_w=score_w, 33 score_d1=score_d1, score_d2=score_d2, score_d3=score_d3, score_d4=score_d4, score_d5=score_d5, score_d6=score_d6, 34 score_lm=score_lm, score_tm1=score_tm1, score_tm2=score_tm2, score_tm3=score_tm3, score_tm4=score_tm4, score_tm5=score_tm5 ) 35 36 #GRAPH 37 graph_content = file_content.split("<wgraph>")[1] 38 pattern = "([\w]*)=([\d\-\.\,\ ]*)\s*" 39 40 atts = {} 41 graph_rows = re.findall(pattern, graph_content) 42 for (graph_feature_name, graph_feature_value)in graph_rows: 43 44 if graph_feature_name == "a": 45 atts.update(_split_a_vector(graph_feature_value)) 46 elif graph_feature_name == "r": 47 atts.update(_split_r_vector(graph_feature_value)) 48 49 elif graph_feature_name in ["UTTERANCE", "VERSION", "w"]: 50 pass 51 else: 52 if graph_feature_name == "pC": 53 graph_feature_value = graph_feature_value.strip()[:-1] 54 55 try: 56 atts[graph_feature_name].append(float(graph_feature_value)) 57 except KeyError: 58 atts[graph_feature_name] = [float(graph_feature_value)] 59 60 #now calculate avg 61 62 for graph_feature_name, graph_feature_values in atts.iteritems(): 63 sentence_atts["{0}_avg".format(graph_feature_name)] = "%.3f" % round(float(numpy.average(graph_feature_values)),3) 64 sentence_atts["{0}_std".format(graph_feature_name)] = "%.3f" % round(float(numpy.std(graph_feature_values)),3) 65 sentence_atts["{0}_var".format(graph_feature_name)] = "%.3f" % round(float(numpy.var(graph_feature_values)),3) 66 67 sentence_atts = dict([("d_%s" % k, v) for k,v in sentence_atts.iteritems()]) 68 atts_vector.append(sentence_atts) 69 return atts_vector

Source Code for Module support.preprocessing.jcml.wmt12decodingfeatures2jcml