1 '''
2 Created on 29 Feb 2012
3 @author: Eleftherios Avramidis
4 '''
5 import re
6 import sys
7 import os
8 import subprocess
9 import numpy
10 from featuregenerator.languagefeaturegenerator import LanguageFeatureGenerator
11
12 from dataprocessor.sax.saxps2jcml import Parallelsentence2Jcml
13 from dataprocessor.input.jcmlreader import JcmlReader
14
15
def process(path):
    """Extract decoding attributes from the per-sentence ``*.sgml`` files in *path*.

    Every directory entry whose name ends in ``sgml`` is read, in ascending
    order of the integer that precedes ``.sgml`` in its name.  For each file
    the values of the ``SCORES (UNWEIGHTED/WEIGHTED)`` line are collected as
    strings, and every ``name=value`` feature found after the first
    ``<wgraph>`` marker is summarised as average, standard deviation and
    variance (each formatted with three decimals).

    :param path: directory containing the ``<number>.sgml`` files
    :return: list with one dict per file; all keys carry a ``d_`` prefix and
        all values are strings
    :raises IndexError: if a file has no SCORES line or no ``<wgraph>``
        section, or a matching file name has no ``.sgml`` suffix
    :raises ValueError: if a file name prefix or a feature value is not numeric
    """
    # Raw strings keep the regex escapes intact (no invalid-escape warnings);
    # the patterns are compiled once instead of once per file.
    filename_re = re.compile(r"(.*)\.sgml")
    scores_re = re.compile(
        r"SCORES \(UNWEIGHTED/WEIGHTED\): d: ([\d\-\.]*) w: ([\d\-\.]*) u: ([\d\-\.]*)"
        r" d: ([\d\-\.]*) ([\d\-\.]*) ([\d\-\.]*) ([\d\-\.]*) ([\d\-\.]*) ([\d\-\.]*)"
        r" lm: ([\d\-\.]*)"
        r" tm: ([\d\-\.]*) ([\d\-\.]*) ([\d\-\.]*) ([\d\-\.]*) ([\d\-\.]*)"
    )
    feature_re = re.compile(r"([\w]*)=([\d\-\.\,\ ]*)\s*")

    # Names of the captured SCORES groups, in pattern order.
    score_names = (
        "score_d", "score_w", "score_u",
        "score_d1", "score_d2", "score_d3", "score_d4", "score_d5", "score_d6",
        "score_lm",
        "score_tm1", "score_tm2", "score_tm3", "score_tm4", "score_tm5",
    )
    ignored_features = ("UTTERANCE", "VERSION", "w")

    # Sort on the integer id so that e.g. 10.sgml follows 2.sgml.
    files = sorted(
        (int(filename_re.findall(name)[0]), name)
        for name in os.listdir(path)
        if name.endswith("sgml")
    )

    atts_vector = []
    for _sentence_id, basename in files:
        filename = os.path.join(path, basename)
        # Single-argument call form prints identically on Python 2 and 3.
        print(filename)
        with open(filename, 'r') as sgml_file:
            file_content = sgml_file.read()

        score_matches = scores_re.findall(file_content)
        if not score_matches:
            raise IndexError("no SCORES line found in {0}".format(filename))
        # NOTE(review): score_u is parsed but was never exported by the
        # original code; that is preserved here -- confirm it is intended.
        sentence_atts = dict(
            (name, value)
            for name, value in zip(score_names, score_matches[0])
            if name != "score_u"
        )

        graph_parts = file_content.split("<wgraph>")
        if len(graph_parts) < 2:
            raise IndexError("no <wgraph> section found in {0}".format(filename))
        graph_content = graph_parts[1]

        atts = {}
        for feature_name, raw_value in feature_re.findall(graph_content):
            if feature_name == "a":
                # NOTE(review): the split vectors replace earlier entries via
                # update(), so only the last "a"/"r" occurrence of a file
                # reaches the statistics below -- confirm this is intended.
                atts.update(_split_a_vector(raw_value))
            elif feature_name == "r":
                atts.update(_split_r_vector(raw_value))
            elif feature_name in ignored_features:
                continue
            else:
                if feature_name == "pC":
                    # Drop the last character of the stripped pC value
                    # (presumably a trailing separator -- verify against data).
                    raw_value = raw_value.strip()[:-1]
                number = float(raw_value)
                atts.setdefault(feature_name, []).append(number)

        for feature_name, feature_values in atts.items():
            sentence_atts["{0}_avg".format(feature_name)] = "%.3f" % round(float(numpy.average(feature_values)), 3)
            sentence_atts["{0}_std".format(feature_name)] = "%.3f" % round(float(numpy.std(feature_values)), 3)
            sentence_atts["{0}_var".format(feature_name)] = "%.3f" % round(float(numpy.var(feature_values)), 3)

        # Prefix every attribute name with "d_".
        atts_vector.append(
            dict(("d_%s" % key, value) for key, value in sentence_atts.items())
        )
    return atts_vector
70
71
def _split_a_vector(graph_feature_value):
    """Split a comma-separated vector string into numbered ``a<i>`` attributes.

    Surrounding whitespace of each entry is ignored and empty entries (an
    empty string, or a leading/trailing separator) are skipped instead of
    raising, so ``"0.5, 2.0, "`` yields the same result as ``"0.5, 2.0"``.

    :param graph_feature_value: string such as ``"0.5, -1.25, 3"``
    :return: dict mapping ``"a1"``, ``"a2"``, ... to floats, numbered in
        order of appearance
    :raises ValueError: if a non-empty entry is not a valid float
    """
    # Split on the bare comma and strip each piece, so the amount of
    # whitespace around the separator does not matter.
    entries = (entry.strip() for entry in graph_feature_value.split(","))
    numbers = [float(entry) for entry in entries if entry]
    return dict(
        ("a{0}".format(position), number)
        for position, number in enumerate(numbers, 1)
    )
80
81
83
def _split_r_vector(graph_feature_value):
    """Extract the decimal numbers of a vector string as ``r<i>`` attributes.

    Only tokens of the form ``[-]D.D`` with one to three digits on each side
    of the decimal point are picked up; anything else in the string (plain
    integers, separators, other text) is ignored.

    :param graph_feature_value: string containing decimal numbers
    :return: dict mapping ``"r1"``, ``"r2"``, ... to floats, numbered in
        order of appearance; empty when nothing matches
    """
    # Raw string: same pattern as before, without invalid escape sequences
    # in the literal.
    # NOTE(review): the {1,3} limits mean a longer number such as 1234.5678
    # is matched only partially (234.567) -- confirm the input never
    # contains such values before relaxing the pattern.
    matches = re.findall(r"(\-?\d{1,3}\.\d{1,3})", graph_feature_value)
    return dict(
        ("r{0}".format(position), float(match))
        for position, match in enumerate(matches, 1)
    )
91
92
if __name__ == '__main__':
    # Check the argument count BEFORE reading sys.argv[1..3]; otherwise a
    # missing argument raises IndexError and the usage text is never shown.
    if len(sys.argv) < 4:
        # Single-argument call form prints identically on Python 2 and 3.
        print("This script reads the supplementary decoding attributes of WMT12-quality estimation task and "
              "wraps them into the designated jcml format. Syntax: script [dir] [input.jcml] [output.jcml]\n"
              "  dir: directory where the files [1-9]*.sgml, one file for sentence are stored\n"
              "  input.jcml: original jcml file containing the rest of the data\n"
              "  output.jcml: a copy of the original jcml file, with attributes from the decoded process added")
        # Stop here: there is nothing to process without all three arguments.
        sys.exit(1)

    input_path = sys.argv[1]
    input_jcml = sys.argv[2]
    output_jcml = sys.argv[3]

    # Parse the decoder output, attach the attributes to the sentences of
    # the input jcml and write the enriched dataset to the output file.
    att_vector = process(input_path)
    dataset = JcmlReader(input_jcml).get_dataset()
    dataset.add_attribute_vector(att_vector)
    Parallelsentence2Jcml(dataset.get_parallelsentences()).write_to_file(output_jcml)
110