1 '''
2
3 @author: Eleftherios Avramidis
4 '''
5 import re
6 from sentence.sentence import SimpleSentence
7 from sentence.parallelsentence import ParallelSentence
8 from genericreader import GenericReader
9
11 '''
12 Reads and combines strings from one-sentence-per-line data
13 '''
14
15
def __init__(self, source_filename, submission_filenames, langpair, testset, pattern_name=""):
    '''
    Store the input file names and the metadata shared by all sentences.
    Nothing is opened or read here; the arguments are only kept on the instance.

    @param source_filename: Name of the file holding the source sentences, one sentence per line
    @type source_filename: str
    @param submission_filenames: Names of the files holding the MT system outputs that
    correspond to the source file, one sentence per line. The name of each file is used for
    deriving the 'system' attribute of the sentences read from it (see C{pattern_name})
    @type submission_filenames: list of str
    @param langpair: The language codes of the language pair, written source-target,
    e.g.: de-en or en-fr
    @type langpair: str
    @param testset: The name of the data set, e.g.: testset2011
    @type testset: str
    @param pattern_name: A regular expression with a bracketed group that extracts the
    system name out of a submission filename. If empty, the entire filename is used
    as the system name
    @type pattern_name: str
    '''
    # Where the sentences come from
    self.source_filename, self.submission_filenames = source_filename, submission_filenames
    # Metadata attached to the sentences that are read
    self.langpair, self.testset = langpair, testset
    # How to turn a submission filename into a system name
    self.pattern_name = pattern_name
36
37
def get_parallelsentences(self):
    '''
    Build one parallel sentence per line of the source file.

    NOTE(review): the C{def} line of this method is missing from the reviewed
    excerpt; the name and the bare C{self} signature are reconstructed from the
    body, which only reads attributes of C{self} -- confirm against the real file.

    Line k of the source file is combined with line k of every submission file.
    Each translation gets a 'system' attribute: the submission filename itself
    when C{self.pattern_name} is empty, otherwise the first match of that regular
    expression against the filename. Lines are passed on exactly as read,
    including their trailing newline. A submission file that is shorter than the
    source file yields empty strings for the missing lines, because C{readline()}
    returns '' at end of file.

    All files opened here are closed before returning, also when an error occurs.

    @return: the objects built by C{ParallelSentence(...)}, in source-file order
    @rtype: list
    @raise IndexError: if C{self.pattern_name} does not match a submission filename,
    or if C{self.langpair} contains no '-' while the source file has at least one line
    '''
    parallelsentences = []
    # Every file opened below is registered here so that the finally clause can
    # close all of them, even if a later open() or the regex lookup fails.
    opened_files = []
    try:
        source_file = open(self.source_filename, 'r')
        opened_files.append(source_file)

        # (file object, system name) pairs, in the order of submission_filenames
        submissions = []
        for filename in self.submission_filenames:
            if self.pattern_name == "":
                system_name = filename
            else:
                system_name = re.findall(self.pattern_name, filename)[0]
            submission_file = open(filename, 'r')
            opened_files.append(submission_file)
            submissions.append((submission_file, system_name))

        # Sentence ids are 1-based line numbers of the source file
        for sentence_id, sourceline in enumerate(source_file, 1):
            # Advance every submission file by exactly one line per source line
            translations = []
            for submission_file, system_name in submissions:
                translation_text = submission_file.readline()
                translations.append(SimpleSentence(translation_text, {'system': system_name}))

            source = SimpleSentence(sourceline, {})
            langcodes = self.langpair.split("-")
            attributes = {"id": str(sentence_id),
                          "langsrc": langcodes[0],
                          "langtgt": langcodes[1],
                          "testset": self.testset,
                          }
            parallelsentences.append(ParallelSentence(source, translations, None, attributes))
    finally:
        for opened_file in opened_files:
            opened_file.close()

    return parallelsentences
73
74
76 '''
77 Reads and combines strings and attributes from one-sentence-per-line data
78 '''
79
82
83
def get_parallelsentences(self):
    '''
    Read the source file and the submission files in lockstep and return the
    resulting parallel sentences.

    NOTE(review): the C{def} line of this method (and the lines just before it)
    are missing from the reviewed excerpt; the name and the bare C{self} signature
    are reconstructed from the body, which only reads attributes of C{self} --
    confirm against the real file. The only attribute attached to a translation
    in the visible code is 'system'.

    For every line of the source file, one line is read from each submission
    file. The 'system' attribute of a translation is the submission filename when
    C{self.pattern_name} is empty, otherwise the first match of that regular
    expression against the filename. Text is passed on exactly as read, including
    the trailing newline; a submission file that runs out of lines contributes
    empty strings, because C{readline()} returns '' at end of file.

    All files opened here are closed before returning, also when an error occurs.

    @return: the objects built by C{ParallelSentence(...)}, in source-file order
    @rtype: list
    @raise IndexError: if C{self.pattern_name} does not match a submission filename,
    or if C{self.langpair} contains no '-' while the source file has at least one line
    '''
    parallelsentences = []
    # Track every handle as soon as it is opened; the finally clause closes them
    # all, so nothing leaks when a later open() or the regex lookup raises.
    handles = []
    try:
        source_handle = open(self.source_filename, 'r')
        handles.append(source_handle)

        # One (file object, system name) pair per submission, in the given order
        submissions = []
        for filename in self.submission_filenames:
            if self.pattern_name == "":
                system_name = filename
            else:
                system_name = re.findall(self.pattern_name, filename)[0]
            submission_handle = open(filename, 'r')
            handles.append(submission_handle)
            submissions.append((submission_handle, system_name))

        # The sentence id is the 1-based line number in the source file
        for line_number, sourceline in enumerate(source_handle, 1):
            # Consume exactly one line from each submission per source line
            translations = []
            for submission_handle, system_name in submissions:
                translation_text = submission_handle.readline()
                translations.append(SimpleSentence(translation_text, {'system': system_name}))

            source = SimpleSentence(sourceline, {})
            languages = self.langpair.split("-")
            attributes = {"id": str(line_number),
                          "langsrc": languages[0],
                          "langtgt": languages[1],
                          "testset": self.testset,
                          }
            parallelsentences.append(ParallelSentence(source, translations, None, attributes))
    finally:
        for handle in handles:
            handle.close()

    return parallelsentences
119