1 '''
2 Created on 14 Jul 2012
3
4 @author: Eleftherios Avramidis
5 '''
6
7 from dataprocessor.input.jcmlreader import JcmlReader
8 from dataprocessor.sax.saxps2jcml import Parallelsentence2Jcml
9 from sentence.pairwisedataset import AnalyticPairwiseDataset, CompactPairwiseDataset, FilteredPairwiseDataset
10 import os
11 import unittest
12 from numpy.ma.testutils import assert_equal
13 from sentence.rankhandler import RankHandler
14 from sentence.dataset import DataSet
15 from dataprocessor.output.xmlwriter import XmlWriter
16
17
18
19
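'''
Tests that converting a dataset to its pairwise form and reconstructing it
preserves the number of sentences and the rank assigned to every system.
'''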
# JCML file holding the sentences used as input by the tests.
self.filename = "pairwiseparallelsentence_test.jcml"
# Parse the file into the dataset that the tests convert to pairwise form.
fixture_reader = JcmlReader(self.filename)
self.mydataset = fixture_reader.get_dataset()
27
28
29
"""
Round-trip the dataset through the pairwise representation twice.

The loaded dataset is converted to pairwise form, filtered, stripped of
ties and reconstructed into a multiclass set. That reconstructed set is
written to "reconstructed_1.jcml", read back, converted to pairwise form
again and reconstructed a second time. Both reconstructions must contain
the same number of sentences and, sentence by sentence, the same
(system, rank) pairs.
"""

def write_jcml(dataset, path):
    # Write the parallel sentences of the given dataset to a JCML file.
    sentences = dataset.get_parallelsentences()
    writer = Parallelsentence2Jcml(sentences, shuffle_translations=False, sort_attribute="system")
    writer.write_to_file(path)

def system_ranks(parallelsentence):
    # Unordered (system, rank) pairs over all translations of one sentence.
    return {
        (tgt.get_attribute("system"), tgt.get_attribute("rank"))
        for tgt in parallelsentence.get_translations()
    }

# First pass: analytic pairwise set -> filtered set -> ties removed.
first_pairwise = FilteredPairwiseDataset(AnalyticPairwiseDataset(self.mydataset), 1.00)
first_pairwise.remove_ties()
write_jcml(first_pairwise, "filtered_1.jcml")

first_multiclass = first_pairwise.get_multiclass_set()
reconstructed_path = "reconstructed_1.jcml"
write_jcml(first_multiclass, reconstructed_path)

# Second pass: start again from the reconstructed file written just above.
reloaded = JcmlReader(reconstructed_path).get_dataset()
second_pairwise = CompactPairwiseDataset(AnalyticPairwiseDataset(reloaded))
write_jcml(second_pairwise, "filtered_2.jcml")

second_multiclass = second_pairwise.get_multiclass_set()
write_jcml(second_multiclass, "reconstructed_2.jcml")

first_sentences = first_multiclass.get_parallelsentences()
second_sentences = second_multiclass.get_parallelsentences()
self.assertEqual(len(first_sentences), len(second_sentences), "The number of sentences when reconstructing the same set twice has changed")

# Lengths were asserted equal above, so zip() pairs every sentence.
for first, second in zip(first_sentences, second_sentences):
    self.assertEqual(system_ranks(first), system_ranks(second))
79
80
"""
Reconstruct the dataset once from memory and once from the pairwise file.

The loaded dataset is converted to pairwise form, filtered, stripped of
ties, written to "filtered_1.jcml" and reconstructed into a multiclass
set. The second pass reads "filtered_1.jcml" back (the pairwise output,
not the reconstructed file), converts it to pairwise form again and
reconstructs it. Both reconstructions must contain the same number of
sentences and, sentence by sentence, the same (system, rank) pairs.
"""

def write_jcml(dataset, path):
    # Write the parallel sentences of the given dataset to a JCML file.
    sentences = dataset.get_parallelsentences()
    writer = Parallelsentence2Jcml(sentences, shuffle_translations=False, sort_attribute="system")
    writer.write_to_file(path)

def system_ranks(parallelsentence):
    # Unordered (system, rank) pairs over all translations of one sentence.
    return {
        (tgt.get_attribute("system"), tgt.get_attribute("rank"))
        for tgt in parallelsentence.get_translations()
    }

# First pass: analytic pairwise set -> filtered set -> ties removed.
first_pairwise = FilteredPairwiseDataset(AnalyticPairwiseDataset(self.mydataset), 1.00)
first_pairwise.remove_ties()
write_jcml(first_pairwise, "filtered_1.jcml")

first_multiclass = first_pairwise.get_multiclass_set()
# The reconstructed set is still written out, although this test does not
# read it back.
write_jcml(first_multiclass, "reconstructed_1.jcml")

# Second pass: start from the pairwise file of the first pass.
reloaded = JcmlReader("filtered_1.jcml").get_dataset()
second_pairwise = CompactPairwiseDataset(AnalyticPairwiseDataset(reloaded))
write_jcml(second_pairwise, "filtered_2.jcml")

second_multiclass = second_pairwise.get_multiclass_set()
write_jcml(second_multiclass, "reconstructed_2.jcml")

first_sentences = first_multiclass.get_parallelsentences()
second_sentences = second_multiclass.get_parallelsentences()
self.assertEqual(len(first_sentences), len(second_sentences), "The number of sentences when reconstructing the same set twice has changed")

# Lengths were asserted equal above, so zip() pairs every sentence.
for first, second in zip(first_sentences, second_sentences):
    self.assertEqual(system_ranks(first), system_ranks(second))
130
131
132
133
134
135
136
137
138