1 '''
2
3 @author: Eleftherios Avramidis
4 '''
5 import os
6 import re
7 import codecs
8 from sentence.sentence import SimpleSentence
9 from sentence.parallelsentence import ParallelSentence
10 import logging
11
class WmtReader(object):
    '''
    Reader for a plain-text, WMT-style directory tree.

    The expected layout below ``base_dir`` is::

        plain/sources/<testset>-src.<source language>
        plain/references/<testset>-ref.<target language>
        plain/system-outputs/<testset>/<langpair>/<name>.<system>

    All files are read line by line and in parallel: line *n* of the source
    file is paired with line *n* of every system output (and, optionally, of
    the reference file).
    '''

    def __init__(self):
        '''
        Constructor; the reader keeps no state of its own.
        '''
        pass

    def read_parallelsentences(self, base_dir, langpair, extract_references=False):
        '''
        Collect the parallel sentences of one language pair over all test sets.

        @param base_dir: root directory containing the ``plain/`` tree
        @param langpair: language pair as ``"<src>-<tgt>"``, e.g. ``"de-en"``
        @param extract_references: if true, also read the reference file and
            attach one reference sentence to every parallel sentence
        @return: list of ParallelSentence objects, one per source line, each
            carrying the attributes ``id`` (1-based line number within the
            test set), ``langsrc``, ``langtgt`` and ``testset``

        Test sets that have no directory for ``langpair`` or whose source file
        cannot be opened are skipped. If the reference file cannot be opened a
        warning is logged and the sentences of that test set get no reference.

        Raises IndexError if ``langpair`` contains no ``"-"``; errors from
        listing the directories or opening a submission file propagate.
        '''
        source_dir = "%s/plain/sources/" % base_dir
        system_outputs_dir = "%s/plain/system-outputs/" % base_dir
        reference_dir = "%s/plain/references/" % base_dir

        languages = langpair.split("-")
        langsrc = languages[0]
        langtgt = languages[1]

        parallelsentences = []

        for testset in os.listdir(system_outputs_dir):
            # Check for the language pair before opening anything, so that a
            # skipped test set cannot leave file handles behind.
            testset_dir = "%s/%s" % (system_outputs_dir, testset)
            if langpair not in os.listdir(testset_dir):
                print("didn't find language pair %s" % langpair)
                continue

            source_filename = "%s/%s-src.%s" % (source_dir, testset, langsrc)
            reference_filename = "%s/%s-ref.%s" % (reference_dir, testset, langtgt)

            try:
                source_file = open(source_filename, 'r')
            except IOError:
                # Report the file *name*: the handle was never bound.
                logging.warning("Source file '{}' could not be opened".format(source_filename))
                # Without source lines there is nothing to align.
                continue

            # Every handle opened for this test set, closed in the finally below.
            opened_files = [source_file]
            try:
                reference_file = None
                if extract_references:
                    try:
                        reference_file = open(reference_filename, 'r')
                        opened_files.append(reference_file)
                    except IOError:
                        logging.warning("Reference file '{}' could not be opened".format(reference_filename))

                # Pairs of (open submission file, system name); the system
                # name is whatever follows the last dot of the filename.
                submissions = []
                langpair_dir = "%s/%s" % (testset_dir, langpair)
                for filename in os.listdir(langpair_dir):
                    match = re.search(r"\.([^.]*)$", filename)
                    if match is None:
                        logging.warning("Submission file '{}' has no system suffix, skipping".format(filename))
                        continue
                    full_filename = "%s/%s" % (langpair_dir, filename)
                    submission_file = open(full_filename, 'r')
                    opened_files.append(submission_file)
                    submissions.append((submission_file, match.group(1)))

                for sentence_id, sourceline in enumerate(source_file, 1):
                    # One line from every submission keeps all files in step
                    # with the source; readline() yields '' once a submission
                    # runs out of lines.
                    translations = [
                        SimpleSentence(submission_file.readline(), {'system': system_name})
                        for submission_file, system_name in submissions
                    ]

                    source = SimpleSentence(sourceline, {})
                    attributes = {
                        "id": str(sentence_id),
                        "langsrc": langsrc,
                        "langtgt": langtgt,
                        "testset": testset,
                    }

                    if reference_file is not None:
                        reference = SimpleSentence(reference_file.readline(), {})
                    else:
                        reference = None

                    parallelsentences.append(
                        ParallelSentence(source, translations, reference, attributes))
            finally:
                for opened_file in opened_files:
                    opened_file.close()

        return parallelsentences
91
92
if __name__ == '__main__':
    # Usage: <script> BASE_DIR OUTPUT_DIR FILE_PREFIX [--ref]
    #
    # Reads every language pair listed below from BASE_DIR and writes one
    # file named "<FILE_PREFIX>.<langpair>.jcml" per pair into OUTPUT_DIR.
    import sys
    from dataprocessor.sax.saxps2jcml import Parallelsentence2Jcml

    langpairs = ["en-de", "de-en", "en-fr", "fr-en", "en-es", "es-en", "en-cs", "cs-en", "en-ru", "ru-en"]

    # "--ref" is a flag, not a positional value: take it out before indexing
    # so that it is recognised wherever it appears on the command line.
    extract_references = "--ref" in sys.argv
    positional = [argument for argument in sys.argv[1:] if argument != "--ref"]
    if len(positional) < 3:
        sys.exit("usage: %s BASE_DIR OUTPUT_DIR FILE_PREFIX [--ref]" % sys.argv[0])
    base_dir, output_dir, file_prefix = positional[:3]

    for langpair in langpairs:
        pss = WmtReader().read_parallelsentences(base_dir, langpair, extract_references)
        filename = "{}.{}.jcml".format(file_prefix, langpair)
        filename = os.path.join(output_dir, filename)
        Parallelsentence2Jcml(pss).write_to_file(filename)
109