'''
@author: Eleftherios Avramidis
'''
from dataprocessor.input.jcmlreader import JcmlReader
from dataprocessor.sax.saxps2jcml import Parallelsentence2Jcml
from sentence.pairwisedataset import AnalyticPairwiseDataset, CompactPairwiseDataset, FilteredPairwiseDataset
import os
import unittest
from sentence.rankhandler import RankHandler
from sentence.dataset import DataSet


# NOTE: the class name and the test method names below are reconstructions,
# derived from each test's docstring and body.
class TestPairwiseDatasetConversion(unittest.TestCase):
    """
    Test that the number of sentences stays the same before and after the conversion
    """

    def setUp(self):
        self.filename = "pairwiseparallelsentence_test.jcml"
        self.mydataset = JcmlReader(self.filename).get_dataset()

    def test_read_all_parallelsentences(self):
        """
        Check that the reader reads all the parallelsentences
        """
        ps_len_before = len(self.mydataset.get_parallelsentences())
        with open(self.filename, 'r') as f:
            ps_len_after = f.read().count('<judgedsentence')
        print ps_len_before, ps_len_after
        self.assertEqual(ps_len_before, ps_len_after)

    def test_dataset_not_modified_by_conversion(self):
        """
        Check that the original dataset structure is not affected by the conversion
        """
        ps_len_before = len(self.mydataset.get_parallelsentences())
        self.my_pairwise_dataset = AnalyticPairwiseDataset(self.mydataset)
        ps_len_after = len(self.mydataset.get_parallelsentences())
        print ps_len_before, ps_len_after
        self.assertEqual(ps_len_before, ps_len_after)

    def test_analytic_pairwise_size(self):
        """
        Check whether the pairwise breakdown of the multiclass sentences has the right size
        """
        # how many were there before?
        ps_len_before = len(self.mydataset.get_parallelsentences())
        self.my_pairwise_dataset = AnalyticPairwiseDataset(self.mydataset)
        # how many are there afterwards?
        pss = self.my_pairwise_dataset.get_parallelsentences()
        ps_len_after = len(pss)

        # Parallelsentence2Jcml(pss).write_to_file("%s.pairwise" % self.filename)

        # how many should there be? each sentence with n translations
        # expands to n*(n-1) directed pairs
        translation_count_vector = self.mydataset.get_translations_count_vector()
        print translation_count_vector
        pairwise_translation_count_vector = [n * (n - 1) for n in translation_count_vector]
        print pairwise_translation_count_vector
        pairwise_translations_altogether = sum(pairwise_translation_count_vector)

        print "They are", ps_len_after
        print "They should be", pairwise_translations_altogether
        self.assertEqual(ps_len_after, pairwise_translations_altogether)
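    # Added sketch, not part of the original suite: a hypothetical worked
    # example of the n*(n-1) count used above. itertools.permutations of
    # length 2 enumerates exactly the directed pairs of distinct translations.
    def test_pairwise_count_formula_sketch(self):
        from itertools import permutations
        translations = ["sysA", "sysB", "sysC"]  # hypothetical system outputs
        directed_pairs = list(permutations(translations, 2))
        # 3 translations yield 3*2 = 6 directed pairs
        self.assertEqual(len(directed_pairs),
                         len(translations) * (len(translations) - 1))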
    def test_analytic_pairwise_matches_rankhandler(self):
        pps_new = sorted(AnalyticPairwiseDataset(self.mydataset).get_parallelsentences())
        pps_old = sorted(RankHandler().get_pairwise_from_multiclass_set(self.mydataset.get_parallelsentences(), True, True, False))
        filename1 = "%s.pairnew" % self.filename
        filename2 = "%s.pairold" % self.filename
        Parallelsentence2Jcml(pps_new).write_to_file(filename1)
        Parallelsentence2Jcml(pps_old).write_to_file(filename2)
        self.assertEqual(len(pps_new), len(pps_old))
        self.assertEqual(os.path.getsize(filename1), os.path.getsize(filename2))

        # self.assertEqual(pps_new, pps_old)

    def test_remove_ties_matches_rankhandler(self):
        pd_new = AnalyticPairwiseDataset(self.mydataset)
        pd_new.remove_ties()
        pps_new = pd_new.get_parallelsentences()
        pps_old = RankHandler().get_pairwise_from_multiclass_set(self.mydataset.get_parallelsentences(), False, True)
        self.assertEqual(len(pps_new), len(pps_old))

    def test_multiclass_reconstruction(self):
        pps_original = self.mydataset.get_parallelsentences()
        pps_new = AnalyticPairwiseDataset(self.mydataset).get_parallelsentences()
        pps_rebuilt_old = RankHandler().get_multiclass_from_pairwise_set(pps_new, True)
        self.assertEqual(len(pps_original), len(pps_rebuilt_old))

    def test_filter_and_reconstruct(self):

        def equal_sentences(dataset1, dataset2):
            self.assertEqual(len(dataset1.get_parallelsentences()), len(dataset2.get_parallelsentences()))

        # first perform the typical cleanup of the test set
        analytic_testset = AnalyticPairwiseDataset(self.mydataset)
        filtered_dataset = FilteredPairwiseDataset(analytic_testset, 1.00)
        filtered_dataset.remove_ties()
        reconstructed_dataset = filtered_dataset.get_multiclass_set()
        reconstructed_dataset.remove_ties()
        output_filename = "filtered.jcml"
        Parallelsentence2Jcml(reconstructed_dataset.get_parallelsentences(), shuffle_translations=False).write_to_file(output_filename)

        # retrieve the clean test set from the file and repeat the handling
        simple_testset = JcmlReader(output_filename).get_dataset()
        analytic_testset_2 = AnalyticPairwiseDataset(simple_testset)
        compact_testset_2 = FilteredPairwiseDataset(analytic_testset_2)
        reconstructed_dataset_2 = compact_testset_2.get_multiclass_set()
        output_filename = "refiltered.jcml"
        Parallelsentence2Jcml(reconstructed_dataset_2.get_parallelsentences(), shuffle_translations=False).write_to_file(output_filename)

        # equal_sentences(reconstructed_dataset_2, reconstructed_dataset)

        # first see if the numbers of sentences in the compact sets are equal
        # self.assertEqual(pps_original, pps_new)

    def test_compact_merging_matches_rankhandler(self):
        new_analytic = AnalyticPairwiseDataset(self.mydataset)
        new_merged = CompactPairwiseDataset(new_analytic)
        new_merged_sentences = new_merged.get_parallelsentences()

        parallelsentences = self.mydataset.get_parallelsentences()
        old_unmerged_sentences = RankHandler().get_pairwise_from_multiclass_set(parallelsentences, True, True, False)
        old_merged_sentences = RankHandler().merge_overlapping_pairwise_set(old_unmerged_sentences)

        filename1 = "%s.mergednew" % self.filename
        filename2 = "%s.mergedold" % self.filename
        Parallelsentence2Jcml(new_merged_sentences).write_to_file(filename1)
        Parallelsentence2Jcml(old_merged_sentences).write_to_file(filename2)

        self.assertEqual(len(new_merged_sentences), len(old_merged_sentences), "The two ways of merging differ")
        # self.assertEqual(os.path.getsize(filename1), os.path.getsize(filename2))
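    # A minimal sketch of the merging intuition (hypothetical data, not the
    # project's API): several annotators may judge the same system pair for
    # the same source sentence, and merging collapses those repeated
    # judgments into a single pairwise entry.
    def test_merging_intuition_sketch(self):
        # (source sentence id, system pair) judgments, with repetitions
        judgments = [
            ("28", ("sysA", "sysB")),
            ("28", ("sysA", "sysB")),  # duplicate judgment of the same pair
            ("28", ("sysA", "sysC")),
        ]
        merged = set(judgments)
        self.assertEqual(len(merged), 2)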
#     def test_filter_sentence_28(self):
#         new_analytic = AnalyticPairwiseDataset(self.mydataset)
#         sentence_id = "28"
#         analytic_parallelsentences = new_analytic.get_parallelsentences()
#         analytic_parallelsentences = [ps for ps in analytic_parallelsentences if ps.get_compact_id() == sentence_id]
#
#         sentence_ids = set([ps.get_compact_id() for ps in analytic_parallelsentences])
#         rank_vector = [tuple(sorted(ps.get_system_names())) for ps in analytic_parallelsentences]
#         rank_pairs = set(rank_vector)
#
#         new_filtered = FilteredPairwiseDataset(new_analytic, 1.00)
#
#         print "Should have", unique, "and have", len(new_filtered_parallelsentences)
#         self.assertEqual(len(new_filtered_parallelsentences), rank_pairs)

    def test_sentence_28_filtering(self):
        sentence_28 = DataSet([ps for ps in self.mydataset.get_parallelsentences() if ps.get_compact_id() == "28"])
        analytic_dataset = AnalyticPairwiseDataset(sentence_28)
        analytic_parallelsentences = analytic_dataset.get_parallelsentences()

        for ps in analytic_parallelsentences:
            rank_items = [(tuple(sorted(ps.get_system_names())), ps.get_rank())]
            for rank_item in sorted(rank_items):
                print rank_item

        print
        rank_vector = [tuple(sorted(ps.get_system_names())) for ps in analytic_parallelsentences]
        unique = sorted(set(rank_vector))

        # manually checked counts for sentence 28
        self.assertEqual(len(rank_vector), 80)
        self.assertEqual(len(unique), 59)

        new_compact = CompactPairwiseDataset(analytic_dataset)
        new_compact_sentences = new_compact.get_parallelsentences()
        self.assertEqual(len(new_compact_sentences), 59)
        new_filtered_sentences = FilteredPairwiseDataset(analytic_dataset, 1.00).get_parallelsentences()
        self.assertEqual(len(new_filtered_sentences), 54)
        new_filtered_sentences = FilteredPairwiseDataset(analytic_dataset, 0.60).get_parallelsentences()
        self.assertEqual(len(new_filtered_sentences), 55)
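    # A hedged sketch of the threshold semantics the assertions above suggest
    # (the real FilteredPairwiseDataset logic may differ): a pair survives
    # when its majority direction reaches the given share of the judgments
    # for that pair, so threshold 1.00 keeps fewer contradictory pairs
    # than 0.60.
    def test_filtering_threshold_sketch(self):
        votes = {("sysA", "sysB"): [1, 1, 1],    # unanimous pair
                 ("sysA", "sysC"): [1, 1, -1]}   # 2/3 majority pair

        def kept(threshold):
            return [pair for pair, v in votes.items()
                    if max(v.count(1), v.count(-1)) / float(len(v)) >= threshold]

        self.assertEqual(len(kept(1.00)), 1)
        self.assertEqual(len(kept(0.60)), 2)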
    def test_compact_size_equals_unfiltered(self):
        new_analytic = AnalyticPairwiseDataset(self.mydataset)
        analytic_parallelsentences = new_analytic.get_parallelsentences()
        sentence_ids = set([ps.get_compact_id() for ps in analytic_parallelsentences])
        print
        unique = 0
        for sentence_id in sentence_ids:
            # get a list of the system name pairs, order irrelevant
            rank_vector = [tuple(sorted(ps.get_system_names())) for ps in analytic_parallelsentences if ps.get_compact_id() == sentence_id]
            rank_pairs = set(rank_vector)

            print "rank vector for sentence %s has %d comparisons" % (sentence_id, len(rank_vector))
            print "rank vector for sentence %s has %d unique comparisons" % (sentence_id, len(rank_pairs))
            unique += len(rank_pairs)

        # the compact set is expected to keep both directions of every unique pair
        unique = 2 * unique
        new_filtered = CompactPairwiseDataset(new_analytic)
        new_filtered_parallelsentences = new_filtered.get_parallelsentences()
        print "Should have", unique, "and have", len(new_filtered_parallelsentences)
        self.assertEqual(len(new_filtered_parallelsentences), unique)
        # for rank_tuple in rank_vector:
        #     print rank_tuple
        filename1 = "%s.filterednew" % self.filename
        Parallelsentence2Jcml(new_filtered_parallelsentences).write_to_file(filename1)

        # a zero threshold should filter nothing, giving the same size as the compact set
        new_filtered = FilteredPairwiseDataset(new_analytic, 0.00)
        new_filtered_parallelsentences = new_filtered.get_parallelsentences()
        print "Should have", unique, "and have", len(new_filtered_parallelsentences)
        self.assertEqual(len(new_filtered_parallelsentences), unique)
        filename1 = "%s.filterednew" % self.filename
        Parallelsentence2Jcml(new_filtered_parallelsentences).write_to_file(filename1)


# class TestPairwiseParallelSentenceConversion(unittest.TestCase):
#
#     def setUp(self):
#         # path = os.path.abspath(__file__)
#         mydataset = JcmlReader("pairwiseparallelsentence_test.jcml").get_dataset()
#         my_pairwise_dataset = AnalyticPairwiseDataset(mydataset)
#         my_compact_pairwise_dataset = CompactPairwiseDataset(my_pairwise_dataset)
#
#     def tearDown(self):
#         pass
#
#     def runTest(self):
#         pass
#
#     def testName(self):
#         pass


if __name__ == "__main__":
    # import sys; sys.argv = ['', 'Test.testName']
    unittest.main()