'''
Created on 23 Feb 2012

@author: Eleftherios Avramidis
'''
from itertools import combinations
import sys
import os
import shutil
from dataset import DataSet
from coupledparallelsentence import CoupledParallelSentence
from dataprocessor.input.orangereader import OrangeData
from dataprocessor.sax.saxps2jcml import IncrementalJcml
from dataprocessor.input.jcmlreader import JcmlReader


class CoupledDataSet(DataSet):
    '''
    A coupled data set contains all possible couples of parallel sentences of a simple dataset
    @ivar parallelsentences: a list of the coupled parallel sentences
    @type parallelsentences: [L{CoupledParallelSentence}, ...]
    '''

    def __init__(self, **kwargs):
        '''
        @var existing_item: allows the construction of a coupled dataset from an existing simple dataset or from already coupled parallel sentences
        @type existing_item: L{DataSet} or [L{CoupledParallelSentence}, ...]
        '''
        self.parallelsentences = []
        self.attribute_names_found = False
        self.attribute_names = []

        if "construct" in kwargs:
            # couple every possible pair of parallel sentences of the given simple dataset
            dataset = kwargs["construct"]
            parallelsentences = dataset.get_parallelsentences()
            ps_combinations = combinations(parallelsentences, 2)
            self.parallelsentences = [CoupledParallelSentence(ps1, ps2) for ps1, ps2 in ps_combinations]
        elif "readfile" in kwargs:
            # load sentences that have already been coupled and stored in a JCML file
            dataset = JcmlReader(kwargs["readfile"]).get_dataset()
            already_coupled_parallelsentences = dataset.get_parallelsentences()
            self.parallelsentences = [CoupledParallelSentence(ps) for ps in already_coupled_parallelsentences]
        elif "wrap" in kwargs:
            # wrap a dataset whose entries are already coupled parallel sentences
            dataset = kwargs["wrap"]
            already_coupled_parallelsentences = dataset.get_parallelsentences()
            self.parallelsentences = [CoupledParallelSentence(ps) for ps in already_coupled_parallelsentences]

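    # Usage sketch (hypothetical variable and file names) for the three
    # construction modes handled above:
    #     CoupledDataSet(construct=simple_dataset)       # couple all sentence pairs of a DataSet
    #     CoupledDataSet(readfile="couples.jcml")        # read already coupled sentences from a JCML file
    #     CoupledDataSet(wrap=already_coupled_dataset)   # wrap a DataSet whose entries are already coupled
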
    def get_singleset(self):
        '''
        Reconstructs the original data set, with only one sentence per entry.
        @return: Simple dataset that contains the simplified parallel sentences
        @rtype: L{DataSet}
        '''
        single_parallelsentences = {}
        for coupled_parallelsentence in self.parallelsentences:
            ps1, ps2 = coupled_parallelsentence.get_couple()
            single_parallelsentences[ps1.get_tuple_id()] = ps1
            single_parallelsentences[ps2.get_tuple_id()] = ps2

        sorted_keys = sorted(single_parallelsentences)
        sorted_ps = [single_parallelsentences[key] for key in sorted_keys]
        return DataSet(sorted_ps)

    def get_singleset_with_soft_ranks(self, attribute1, attribute2, critical_attribute):
        '''
        Reconstructs the original data set, with only one sentence per entry,
        estimating a rank for each sentence from the pairwise classification probabilities.
        @return: Simple dataset that contains the simplified parallel sentences
        @rtype: L{DataSet}
        '''
        single_parallelsentences = {}
        single_parallelsentences_rank = {}
        for coupled_parallelsentence in self.parallelsentences:
            ps1, ps2 = coupled_parallelsentence.get_couple()

            if coupled_parallelsentence.get_attribute(attribute1) == "n":
                return []
            # negate the two classification probabilities and accumulate them as a score per sentence
            prob_neg = -1.00 * float(coupled_parallelsentence.get_attribute(attribute1))
            prob_pos = -1.00 * float(coupled_parallelsentence.get_attribute(attribute2))

            single_parallelsentences[ps1.get_tuple_id()] = ps1
            try:
                single_parallelsentences_rank[ps1.get_tuple_id()] += prob_neg
            except KeyError:
                single_parallelsentences_rank[ps1.get_tuple_id()] = prob_neg

            single_parallelsentences[ps2.get_tuple_id()] = ps2
            try:
                single_parallelsentences_rank[ps2.get_tuple_id()] += prob_pos
            except KeyError:
                single_parallelsentences_rank[ps2.get_tuple_id()] = prob_pos

        # convert the accumulated scores to ranks, giving tied scores the same rank
        j = 0
        prev_rank = None
        prev_j = None
        normalized_rank = {}
        for key, rank in sorted(single_parallelsentences_rank.iteritems(), key=lambda (k, v): (v, k)):
            j += 1
            if rank == prev_rank:
                normalized_rank[key] = prev_j
            else:
                normalized_rank[key] = j
                prev_j = j
                prev_rank = rank

        sorted_keys = sorted(single_parallelsentences)
        sorted_ps = []
        for key in sorted_keys:
            ps = single_parallelsentences[key]
            ps.add_attributes({critical_attribute: str(normalized_rank[key])})
            sorted_ps.append(ps)
        return DataSet(sorted_ps)

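    # The score-to-rank loop used above (and repeated in the method below) assigns
    # 1-based ranks in which tied scores share the rank of their first occurrence.
    # For example, hypothetical accumulated scores {"1": -0.9, "2": -0.9, "3": -0.4}
    # would be normalized to the ranks {"1": 1, "2": 1, "3": 3}.
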
    def get_singleset_with_hard_ranks(self, critical_attribute):
        '''
        Reconstructs the original data set, with only one sentence per entry,
        estimating a rank for each sentence from the pairwise rank decisions.
        @return: Simple dataset that contains the simplified parallel sentences
        @rtype: L{DataSet}
        '''
        single_parallelsentences = {}
        single_parallelsentences_rank = {}
        for coupled_parallelsentence in self.parallelsentences:
            ps1, ps2 = coupled_parallelsentence.get_couple()
            rank = int(coupled_parallelsentence.get_attribute(critical_attribute))

            # each pairwise decision is added to the score of the first sentence
            # and subtracted from the score of the second one
            single_parallelsentences[ps1.get_tuple_id()] = ps1
            try:
                single_parallelsentences_rank[ps1.get_tuple_id()] += rank
            except KeyError:
                single_parallelsentences_rank[ps1.get_tuple_id()] = rank

            single_parallelsentences[ps2.get_tuple_id()] = ps2
            try:
                single_parallelsentences_rank[ps2.get_tuple_id()] -= rank
            except KeyError:
                single_parallelsentences_rank[ps2.get_tuple_id()] = -1 * rank

        # convert the accumulated scores to ranks, giving tied scores the same rank
        j = 0
        prev_rank = None
        prev_j = None
        normalized_rank = {}
        for key, rank in sorted(single_parallelsentences_rank.iteritems(), key=lambda (k, v): (v, k)):
            j += 1
            if rank == prev_rank:
                normalized_rank[key] = prev_j
            else:
                normalized_rank[key] = j
                prev_j = j
                prev_rank = rank

        sorted_keys = sorted(single_parallelsentences)
        sorted_ps = []
        for key in sorted_keys:
            ps = single_parallelsentences[key]
            ps.add_attributes({critical_attribute: str(normalized_rank[key])})
            sorted_ps.append(ps)
        return DataSet(sorted_ps)


class CoupledDataSetDisk(CoupledDataSet):

    def __init__(self, existing_item):
        '''
        @var existing_item: allows the construction of a coupled dataset from an existing simple dataset or from already coupled parallel sentences
        @type existing_item: L{DataSet} or [L{CoupledParallelSentence}, ...]
        '''
        self.parallelsentences = []
        self.attribute_names_found = False
        self.attribute_names = []

        if isinstance(existing_item, DataSet):
            dataset = existing_item
            parallelsentences = dataset.get_parallelsentences()
            # keep the couples as a lazy combinations iterator instead of expanding them in memory
            self.parallelsentences = combinations(parallelsentences, 2)

    def write(self, filename, filter_att=False, diff=0, compact=False):
        # method body not shown in this excerpt
        pass
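
    # Usage sketch (hypothetical variable and file names), assuming write() stores
    # the couples under the given filename:
    #     CoupledDataSetDisk(simple_dataset).write("couples.jcml")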


class OrangeCoupledDataSet(OrangeData):
    """
    A wrapper for the orange Example Table that can be initialized upon a CoupledDataSet
    @todo: maybe change that to a function of the previous class and break it down to the parallel sentences
    """

    def _get_orange_header(self, dataset, attribute_names, desired_attributes, meta_attributes, class_name):
        # build the three header lines of an Orange tab-delimited file:
        # attribute names, attribute types and attribute flags
        line_1 = ""
        line_2 = ""
        line_3 = ""
        print "Getting attributes"

        if desired_attributes == []:
            desired_attributes = attribute_names

        print "Constructing file"

        for attribute_name in attribute_names:
            attribute_name = str(attribute_name)
            line_1 += attribute_name + "\t"

            # second header line: the type of each attribute
            if attribute_name == class_name:
                line_2 += "discrete\t"
            elif attribute_name in desired_attributes and attribute_name not in meta_attributes:
                line_2 += "continuous\t"
            else:
                line_2 += "string\t"

            # third header line: the attribute flags (c = class attribute, m = meta attribute)
            if attribute_name == class_name:
                line_3 = line_3 + "c"
            elif attribute_name not in desired_attributes or attribute_name in meta_attributes:
                line_3 = line_3 + "m"
            line_3 = line_3 + "\t"

        # source, target and reference sentence texts are appended as string meta attributes
        line_2 += "string\t"
        line_3 += "m\t"
        line_1 += "src-1\t"

        line_2 += "string\t"
        line_3 += "m\t"
        line_1 += "src-2\t"

        i = 0
        for tgt in dataset.get_parallelsentences()[0].get_translations():
            i += 1
            line_2 += "string\t"
            line_3 += "m\t"
            line_1 += "tgt-" + str(i) + "\t"

        line_2 += "string\t"
        line_3 += "m\t"
        line_1 += "ref\t"

        line_1 = line_1 + "\n"
        line_2 = line_2 + "\n"
        line_3 = line_3 + "\n"
        output = line_1 + line_2 + line_3
        return output
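
    # For illustration, with hypothetical attribute_names ["score", "rank"], empty
    # desired_attributes and meta_attributes, class_name "rank" and one translation
    # per sentence, the returned header would consist of three tab-separated lines:
    #     score        rank      src-1   src-2   tgt-1   ref
    #     continuous   discrete  string  string  string  string
    #                  c         m       m       m       m
    # (the flag field of a plain continuous attribute is left empty)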