Package support :: Package preprocessing :: Module check_overlap
Source Code for Module support.preprocessing.check_overlap

  1  '''
 
  2  Remove from a parallel document sentences that are found in a test set
 
  3  
 
  4  Created on 9 Nov 2012
 
  5  
 
  6  @author: Eleftherios Avramidis
 
  7  ''' 
  8  
 
  9  import sys 
 10  
 
'''
Command-line parameters (in this order):

1. source-language side of the parallel set
2. target-language side of the parallel set
3. source-language side of the test set
4. the literal flag --filter
5. preferred name of the source-language side of the parallel set after test sentences are removed
6. preferred name of the target-language side of the parallel set after test sentences are removed
'''
 20  
 
 21  
 
 22  
 
 23  if __name__ == '__main__': 
 24      #open files    
 
 25      file1_src = open(sys.argv[1], 'r') 
 26      file1_tgt = open(sys.argv[2], 'r') 
 27      file2 = open(sys.argv[3], 'r') 
 28      
 
 29      #these parameters should be always given
 
 30      try: 
 31         filtering = (sys.argv[4] == '--filter') 
 32         filteredfile_src = open(sys.argv[5], 'w') 
 33         filteredfile_tgt = open(sys.argv[6], 'w') 
 34      except: 
 35         filtering = False  
 36  
 
 37      len_file2 = len(file2.readlines()) 
 38      file2.close() 
 39      file2 = open(sys.argv[3], 'r') 
 40  
 
 41      #basic settings
 
 42      matched = [] 
 43      threshold = 0.8 
 44      min_length = 1 
 45      highmatched = [] 
 46      k = -1 
 47  #    print  "Length of file" ,len(file1.readlines())
 
 48      highmatchedlines = [] 
 49      nonmatchedlines = [] 
 50      approvedlines = [] 
 51  
 
 52      #browse the sentences of the big corpus one by one
 
 53      for line1 in file1_src: 
 54          line1_tgt = file1_tgt.readline() 
 55          i=0 
 56          k+=1 
 57          file2.seek(0) 
 58          
 
 59          # process src sentence
 
 60          line1_clean = line1.lower().strip() 
 61          set1 = set(line1_clean.split()) 
 62          list1 = line1_clean.split() 
 63          if min_length and len(list1) < min_length: 
 64              print k, "sentence in src file too small: ", len(list1) 
 65              continue 
 66          if line1_clean in approvedlines: 
 67              print k, "line already there in src" 
 68              continue 
 69          
 
 70          # process tgt sentence
 
 71          line1_tgt_clean = line1_tgt.lower().strip() 
 72          set1_tgt = set(line1_tgt_clean.split()) 
 73          list1_tgt = line1_tgt_clean.split() 
 74          if min_length and len(list1_tgt) < min_length: 
 75              print k, "sentence in tgt too small: ", len(list1_tgt) 
 76              continue 
 77              
 
 78          approvedline = True 
 79          
 
 80          line2 = file2.readline() 
 81          
 
 82          #if line is not too small or dupped, compare it one by one with the sentences of the second set
 
 83          while approvedline and line2: 
 84          
 
 85              line2_clean = line2.lower().strip() 
 86  #            print k, i  
 
 87              set2 = set(line2_clean.split()) 
 88              intersection = set2.intersection(set1) 
 89              matched.append(len(intersection)) 
 90              try: 
 91                  overlap = (1.00*len(intersection))/len(set2) 
 92              except: 
 93                  overlap = 0 
 94              
 
 95              if overlap > threshold: 
 96              #    highmatched.append((k, i, 1.00*len(intersection)/len(set2), line1, line2 ))
 
 97              #if line1_clean == line2_clean:
 
 98               
 
 99                  print k, i, "overlap: ", 1.00*len(intersection)/len(set2) 
100                  
 
101                  highmatched.append((k, i, 1.00*len(intersection)/len(set2), line1, line2 )) 
102                  highmatchedlines.append(k) 
103                  approvedline = False 
104                  
 
105              i+=1 
106              line2 = file2.readline() 
107  
 
108          if approvedline:  
109              filteredfile_src.write("{}".format(line1)) 
110              approvedlines.append(approvedline) 
111              filteredfile_tgt.write("{}".format(line1_tgt)) 
112                  
 
113          
 
114    
 
115  
 
116      file1_src.close() 
117      file1_tgt.close() 
118      file2.close() 
119  
 
120              
 
121      
 
122      filteredfile_src.close() 
123      filteredfile_tgt.close() 
124  
 
125  
 
126   
 
127      for h in highmatched: 
128          h = [str(j) for j in h] 
129          print "\t".join(h) 
130  
 
131      targetcount = set() 
132      for k,i,p,l1,l2 in highmatched: 
133          targetcount.add(i) 
134          #targetcount[i] = True #targetcount.setdefault(i, 0) + 1
 
135      print (100.00*len(targetcount))/(1.00*len_file2) , "% of the test-set sentences were found in the training set" 
136      
 
137      print 1.00*sum(matched)/len(matched) 
138