1 '''
2 Remove from a parallel document sentences that are found in a test set
3
4 Created on 9 Nov 2012
5
6 @author: Eleftherios Avramidis
7 '''
8
9 import sys
10
'''
Command-line parameters, in order:
 1. source-language side of the parallel set
 2. target-language side of the parallel set
 3. source-language side of the test set
 4. the literal flag --filter, which precedes the two output file names
 5. preferred name of the source-language side of the parallel set after the test sentences are removed
 6. preferred name of the target-language side of the parallel set after the test sentences are removed

'''
20
21
22
def _overlap(test_tokens, src_tokens):
    """Return ``(shared, ratio)`` for one test sentence against one training sentence.

    ``shared`` is the number of distinct tokens the two sentences have in
    common; ``ratio`` is ``shared`` divided by the number of distinct tokens
    of the test sentence (0.0 when the test sentence has no tokens).
    """
    shared = len(test_tokens & src_tokens)
    if not test_tokens:
        return shared, 0.0
    return shared, float(shared) / len(test_tokens)


def _filter_corpus(src_lines, tgt_file, test_lines, threshold, min_length,
                   out_src=None, out_tgt=None):
    """Scan a parallel corpus and drop the pairs that overlap with the test set.

    Parameters:
        src_lines   -- iterable of source-side training lines
        tgt_file    -- file-like object with the target-side training lines,
                       read in lockstep with ``src_lines`` through ``readline``
        test_lines  -- list of source-side test-set lines
        threshold   -- a pair is rejected when the overlap ratio with any
                       test sentence is strictly greater than this value
        min_length  -- pairs whose source or target side has fewer tokens
                       are skipped (a falsy value disables the check)
        out_src, out_tgt -- optional writable files receiving the kept pairs

    Returns ``(highmatched, matched)``: ``highmatched`` holds one tuple
    ``(train_index, test_index, ratio, train_line, test_line)`` per rejected
    pair, ``matched`` the shared-token count of every comparison performed.
    """
    # Tokenise the test set once instead of re-reading it for every pair.
    test_token_sets = [set(line.lower().split()) for line in test_lines]

    matched = []
    highmatched = []
    approved = set()  # cleaned source sentences that were already kept

    for k, src_line in enumerate(src_lines):
        tgt_line = tgt_file.readline()

        src_clean = src_line.lower().strip()
        src_tokens = src_clean.split()
        if min_length and len(src_tokens) < min_length:
            print("{} sentence in src file too small:  {}".format(k, len(src_tokens)))
            continue
        if src_clean in approved:
            print("{} line already there in src".format(k))
            continue

        tgt_tokens = tgt_line.lower().split()
        if min_length and len(tgt_tokens) < min_length:
            print("{} sentence in tgt too small:  {}".format(k, len(tgt_tokens)))
            continue

        src_set = set(src_tokens)
        keep = True
        for i, test_set in enumerate(test_token_sets):
            shared, ratio = _overlap(test_set, src_set)
            matched.append(shared)
            if ratio > threshold:
                print("{} {} overlap:  {}".format(k, i, ratio))
                highmatched.append((k, i, ratio, src_line, test_lines[i]))
                keep = False
                break  # the first sufficiently similar test sentence decides

        if keep:
            # Remember the sentence itself so that later duplicates are skipped.
            approved.add(src_clean)
            if out_src is not None:
                out_src.write(src_line)
            if out_tgt is not None:
                out_tgt.write(tgt_line)

    return highmatched, matched


def main(argv, threshold=0.8, min_length=1):
    """Report, and optionally remove, training pairs that overlap with a test set.

    ``argv`` follows the command line of the script:
    ``PROG SRC TGT TEST_SRC [--filter OUT_SRC OUT_TGT]``.  The filtered corpus
    is written only when ``--filter`` and both output names are given;
    otherwise only the overlap report is printed.

    Returns the list of rejected pairs, one tuple
    ``(train_index, test_index, ratio, train_line, test_line)`` each.
    Exits with a usage message when fewer than three input files are given.
    """
    if len(argv) < 4:
        sys.exit("usage: {} SRC TGT TEST_SRC [--filter OUT_SRC OUT_TGT]".format(
            argv[0] if argv else "prog"))

    filtering = len(argv) >= 7 and argv[4] == '--filter'

    with open(argv[3], 'r') as test_file:
        test_lines = test_file.readlines()

    handles = []
    try:
        src_file = open(argv[1], 'r')
        handles.append(src_file)
        tgt_file = open(argv[2], 'r')
        handles.append(tgt_file)

        out_src = out_tgt = None
        if filtering:
            out_src = open(argv[5], 'w')
            handles.append(out_src)
            out_tgt = open(argv[6], 'w')
            handles.append(out_tgt)

        highmatched, matched = _filter_corpus(
            src_file, tgt_file, test_lines, threshold, min_length,
            out_src, out_tgt)
    finally:
        # Close whatever was opened, even if the scan failed half-way.
        for handle in handles:
            handle.close()

    for record in highmatched:
        print("\t".join(str(field) for field in record))

    # Distinct test sentences that caused at least one rejection.
    found = set(record[1] for record in highmatched)
    percentage = 100.0 * len(found) / len(test_lines) if test_lines else 0.0
    print("{} % of the test-set sentences were found in the training set".format(percentage))

    # Average number of shared tokens over all comparisons performed.
    print(float(sum(matched)) / len(matched) if matched else 0.0)

    return highmatched


if __name__ == '__main__':
    main(sys.argv)
138