1 '''
2 Created on 23 Feb 2012
3
4 @author: Eleftherios Avramidis
5 '''
6 from optparse import OptionParser
7 from sentence.sentence import SimpleSentence
8 from collections import OrderedDict
9 from sentence.parallelsentence import ParallelSentence
10 from dataprocessor.sax.saxps2jcml import Parallelsentence2Jcml
11 from featuregenerator.glassbox.moses.extractor import MosesGlassboxExtractor
12
def _warn(message):
    """Write a one-line warning to standard error."""
    # Local import: the file's import header does not provide ``sys``.
    import sys
    sys.stderr.write("WARNING: {}\n".format(message))


def _open_optional(filename, description):
    """Open an optional input file for reading.

    Returns ``None`` when no filename was given. When a filename was given
    but the file cannot be opened, a warning is printed and ``None`` is
    returned, so the corresponding input is skipped rather than silently
    ignored.
    """
    if not filename:
        return None
    try:
        return open(filename, 'r')
    except (IOError, OSError) as error:
        _warn("cannot read {} file '{}' ({}); it will be ignored".format(
            description, filename, error))
        return None


def _tab_feature_attributes(feature_values, feature_names):
    """Map tab-separated target feature values to ``qb_<name>`` attributes.

    The n-th value is paired with the n-th name (both 0-based). Values that
    have no corresponding name are keyed by their column index instead.
    The column order is preserved in the returned ``OrderedDict``.
    """
    attributes = OrderedDict()
    for column, feature_value in enumerate(feature_values):
        if column < len(feature_names):
            feature_name = feature_names[column]
        else:
            feature_name = column
        attributes["qb_{}".format(feature_name)] = feature_value
    return attributes


# Command-line entry point: reads a source file and a translation file line
# by line (plus optional reference, score and feature files), wraps every
# line pair in a ParallelSentence and writes all of them to a jcml file.
if __name__ == '__main__':

    parser = OptionParser()
    parser.add_option("-s", "--source", dest="source_filename",
                      help="read one source sentence per line from FILE", metavar="FILE")

    parser.add_option("-t", "--translation", dest="target_filename",
                      help="read one translation output sentence per line from FILE", metavar="FILE")

    parser.add_option("-m", "--system", dest="system_name",
                      help="system name")

    parser.add_option("-r", "--reference", dest="reference_filename",
                      help="read one reference sentence per line from FILE", metavar="FILE")

    parser.add_option("-l", "--score", dest="score_filename",
                      help="read one score per line from FILE", metavar="FILE")

    parser.add_option("-a", "--feature-names", action="append", dest="feature_names", type="str",
                      help="a list of feature names", default=[])

    parser.add_option("-q", "--feature-files", action="append", dest="feature_files", type="str", default=[],
                      help="a list of feature FILEs in respective order")

    parser.add_option("-b", "--target-features-tab", dest="target_features_tab", type="str",
                      help="all target features in one file, tab-separated")

    parser.add_option("-n", "--target-features-tab-names", dest="target_features_tab_names", type="str",
                      help="all target feature names in one file, tab-separated")

    parser.add_option("-o", "--output", dest="output_filename",
                      help="write output to this jcml FILE", metavar="FILE")

    parser.add_option("-f", "--langsrc", dest="langsrc",
                      help="source language code")

    parser.add_option("-e", "--langtgt", dest="langtgt",
                      help="target language code")

    parser.add_option("-u", "--testset", dest="testset",
                      help="set name")

    parser.add_option("-g", "--moseslog", dest="moseslog",
                      help="verbose log of moses decoding")

    (opt, args) = parser.parse_args()

    # The source and translation files drive the main loop; fail with a
    # usage message instead of an opaque TypeError from open(None).
    if not opt.source_filename:
        parser.error("a source file is required (-s/--source)")
    if not opt.target_filename:
        parser.error("a translation file is required (-t/--translation)")

    # Names and files are paired positionally; extra entries on either side
    # are dropped by zip() below, so make that visible.
    if len(opt.feature_names) != len(opt.feature_files):
        _warn("{} feature names given for {} feature files; "
              "unpaired entries are ignored".format(
                  len(opt.feature_names), len(opt.feature_files)))

    parallelsentences = []
    # Every file opened below is registered here so that all of them get
    # closed, even if processing stops with an error.
    open_files = []
    try:
        source_file = open(opt.source_filename, 'r')
        open_files.append(source_file)
        target_file = open(opt.target_filename, 'r')
        open_files.append(target_file)

        feature_file_objects = []
        for feature_filename in opt.feature_files:
            feature_file = open(feature_filename, 'r')
            open_files.append(feature_file)
            feature_file_objects.append(feature_file)
        # Single-argument form prints the same text on Python 2 and 3.
        print("{} {}".format(opt.feature_files, opt.feature_names))

        reference_file = _open_optional(opt.reference_filename, "reference")
        score_file = _open_optional(opt.score_filename, "score")
        target_features_tabfile = _open_optional(opt.target_features_tab,
                                                 "target features")
        for optional_file in (reference_file, score_file, target_features_tabfile):
            if optional_file is not None:
                open_files.append(optional_file)

        # Only the first line of the names file is used.
        target_features_tab_names = []
        names_file = _open_optional(opt.target_features_tab_names,
                                    "target feature names")
        if names_file is not None:
            try:
                target_features_tab_names = names_file.readline().strip().split("\t")
            finally:
                names_file.close()

        glassbox_features_dicts = None
        if opt.moseslog:
            extractor = MosesGlassboxExtractor()
            glassbox_features_dicts = extractor.create_dicts_of_sentences_attributes(opt.moseslog)

        # sentence_id is 1-based and is used both as the "id" attribute and
        # (minus one) as the index into the glassbox feature list, so it
        # must not be reused by any inner loop.
        for sentence_id, source_line in enumerate(source_file, 1):
            atts = OrderedDict()
            source_line = source_line.strip()
            target_line = target_file.readline().strip()

            if reference_file is not None:
                reference_line = reference_file.readline().strip()
                reference_sentence = SimpleSentence(reference_line)
            else:
                reference_sentence = None

            if score_file is not None:
                atts["score"] = score_file.readline().strip()

            atts["system"] = opt.system_name

            if glassbox_features_dicts is not None:
                atts.update(glassbox_features_dicts[sentence_id - 1])

            if target_features_tabfile is not None:
                feature_values = target_features_tabfile.readline().strip().split("\t")
                atts.update(_tab_feature_attributes(feature_values,
                                                    target_features_tab_names))

            source_sentence = SimpleSentence(source_line)
            target_sentences = [SimpleSentence(target_line, atts)]

            # One value per sentence is read from each named feature file.
            additional_atts = {}
            for feature_name, file_object in zip(opt.feature_names, feature_file_objects):
                additional_atts[feature_name] = file_object.readline().strip()

            ps_atts = {"langsrc": opt.langsrc,
                       "langtgt": opt.langtgt,
                       "testset": opt.testset,
                       "id": str(sentence_id)}
            ps_atts.update(additional_atts)

            ps = ParallelSentence(source_sentence, target_sentences, reference_sentence, ps_atts)
            parallelsentences.append(ps)
    finally:
        for file_object in open_files:
            file_object.close()

    Parallelsentence2Jcml(parallelsentences).write_to_file(opt.output_filename)