Package test :: Module pairwise_test
[hide private]
[frames] | no frames]

Source Code for Module test.pairwise_test

  1  ''' 
  2  Created on 14 Jul 2012 
  3   
  4  @author: Eleftherios Avramidis 
  5  ''' 
  6   
  7  from dataprocessor.input.jcmlreader import JcmlReader 
  8  from dataprocessor.sax.saxps2jcml import Parallelsentence2Jcml 
  9  from sentence.pairwisedataset import AnalyticPairwiseDataset, CompactPairwiseDataset, FilteredPairwiseDataset 
 10  import os 
 11  import unittest 
 12  from numpy.ma.testutils import assert_equal 
 13  from sentence.rankhandler import RankHandler 
 14  from sentence.dataset import DataSet 
 15  from dataprocessor.output.xmlwriter import XmlWriter 
 16   
 17   
 18   
 19   
class PairwiseTesting(unittest.TestCase):
    """
    Round-trip tests for the pairwise dataset classes.

    Each test converts the fixture dataset to a pairwise representation,
    reconstructs a multiclass set from it, reloads one of the files written
    along the way, repeats the conversion, and checks that both
    reconstructions agree.

    Every run writes "filtered_1.jcml", "reconstructed_1.jcml",
    "filtered_2.jcml" and "reconstructed_2.jcml" into the current working
    directory; they are left in place after the test.
    """

    def setUp(self):
        """Read the JCML fixture that every test starts from."""
        self.filename = "pairwiseparallelsentence_test.jcml"
        self.mydataset = JcmlReader(self.filename).get_dataset()

    def _write_jcml(self, parallelsentences, output_filename):
        """Write the given parallel sentences to output_filename as JCML."""
        writer = Parallelsentence2Jcml(parallelsentences,
                                       shuffle_translations=False,
                                       sort_attribute="system")
        writer.write_to_file(output_filename)

    def _system_ranks(self, parallelsentence):
        """Return the set of (system, rank) pairs of a sentence's translations."""
        return set((tgt.get_attribute("system"), tgt.get_attribute("rank"))
                   for tgt in parallelsentence.get_translations())

    def _check_double_reconstruction(self, reload_filename):
        """
        Reconstruct the fixture twice and assert that both results agree.

        The second pass starts from reload_filename, which must be one of
        the files written during the first pass ("filtered_1.jcml" or
        "reconstructed_1.jcml").
        """
        # First pass: typical cleanup of the test set.
        analytic_testset = AnalyticPairwiseDataset(self.mydataset)
        # NOTE(review): the meaning of the 1.00 argument is defined by
        # FilteredPairwiseDataset and is not visible here.
        filtered_dataset = FilteredPairwiseDataset(analytic_testset, 1.00)
        filtered_dataset.remove_ties()
        self._write_jcml(filtered_dataset.get_parallelsentences(),
                         "filtered_1.jcml")

        reconstructed_dataset = filtered_dataset.get_multiclass_set()
        # Disabled in the original: reconstructed_dataset.remove_ties()
        self._write_jcml(reconstructed_dataset.get_parallelsentences(),
                         "reconstructed_1.jcml")

        # Second pass: reload from disk and repeat the handling.
        simple_testset = JcmlReader(reload_filename).get_dataset()
        analytic_testset_2 = AnalyticPairwiseDataset(simple_testset)
        compact_testset_2 = CompactPairwiseDataset(analytic_testset_2)
        self._write_jcml(compact_testset_2.get_parallelsentences(),
                         "filtered_2.jcml")

        # Disabled in the original:
        # self.assertEqual(len(filtered_dataset.get_parallelsentences()),
        #                  len(compact_testset_2.get_parallelsentences()))

        reconstructed_dataset_2 = compact_testset_2.get_multiclass_set()
        self._write_jcml(reconstructed_dataset_2.get_parallelsentences(),
                         "reconstructed_2.jcml")

        reconstructed_1 = reconstructed_dataset.get_parallelsentences()
        reconstructed_2 = reconstructed_dataset_2.get_parallelsentences()
        # Check the lengths first: zip() below would silently stop at the
        # shorter list and hide missing sentences.
        self.assertEqual(len(reconstructed_1), len(reconstructed_2),
                         "The number of sentences when reconstructing the same set twice has changed")

        for p1, p2 in zip(reconstructed_1, reconstructed_2):
            self.assertEqual(self._system_ranks(p1), self._system_ranks(p2))

    # NOTE(review): the original "def" line of this test is missing from the
    # extracted listing; the method name below is a placeholder to confirm.
    def test_double_reconstruction_from_clean_set(self):
        """
        Load a dataset, convert it to pairwise and reconstruct it, then
        reload the reconstructed ("clean") set from "reconstructed_1.jcml"
        and reconstruct it once more. Both reconstructions must contain the
        same number of sentences and the same (system, rank) pairs.
        """
        self._check_double_reconstruction("reconstructed_1.jcml")

    # NOTE(review): the original "def" line of this test is missing from the
    # extracted listing; the method name below is a placeholder to confirm.
    def test_double_reconstruction_from_pairwise_set(self):
        """
        Load a dataset, convert it to pairwise and reconstruct it, then
        reload the pairwise set from "filtered_1.jcml" and reconstruct it
        once more. Both reconstructions must contain the same number of
        sentences and the same (system, rank) pairs.
        """
        self._check_double_reconstruction("filtered_1.jcml")

    # Leftover notes from the original author, kept for reference:
    # rank_vector_1
    # equal_sentences(reconstructed_dataset_2, reconstructed_dataset)
    # first see if number of sentences at the compact sets are equal