Package support :: Package preprocessing :: Module oolinkage
[hide private]
[frames | no frames]

Source Code for Module support.preprocessing.oolinkage

 1  ''' 
 2  Created on Nov 16, 2011 
 3   
 4  @author: jogin 
 5  ''' 
 6   
 7  import sys 
 8  sys.path.append("/home/Lukas Poustka/taraxuscripts") 
 9   
10  from featuregenerator.levenshtein.levenshtein import levenshtein_tok 
11   
12   
class OOLinkage():
    """
    Compare sentences from Open Office 2010 with sentences from Open Office
    2011 (OpenOffice3) according to Levenshtein distance.

    All work happens in the constructor: for a fixed window of sentences of
    the old file, every sentence of the new file whose distance (as returned
    by C{levenshtein_tok}) is at most L{MAX_DISTANCE} is reported on standard
    output and appended to L{LOG_FILENAME}.
    """

    # Candidate pairs with a distance above this value are skipped.
    MAX_DISTANCE = 4
    # 1-based number of the first old sentence that is compared; earlier
    # sentences are skipped.
    FIRST_SNT = 42
    # The scan stops right after processing the first old sentence whose
    # 1-based number exceeds this value (i.e. number 61 is still processed).
    STOP_AFTER_SNT = 60
    # Report file, created (and truncated) in the current working directory.
    LOG_FILENAME = 'log.txt'

    def __init__(self, oldOOSnts, newOOSnts, newOOLinks='', oldOOLinks=''):
        """
        Read both sentence files and report close matches.

        @param oldOOSnts: old Open Office file with aligned sentences
        @type oldOOSnts: string
        @param newOOSnts: new Open Office file with aligned sentences
        @type newOOSnts: string
        @param newOOLinks: new Open Office file with source links of aligned
            sentences (accepted but not used by this method)
        @type newOOLinks: string
        @param oldOOLinks: filename for saving links of sentence and source
            (accepted but not used by this method)
        @type oldOOLinks: string
        """
        oldSnts = self._readLines(oldOOSnts)
        newSnts = self._readLines(newOOSnts)

        # Author's note: Levenshtein distance 4 or less:
        # 9, 14, 15, 20
        with open(self.LOG_FILENAME, 'w') as log:
            for sntNo, oldSnt in enumerate(oldSnts, 1):
                if sntNo < self.FIRST_SNT:
                    continue
                print(sntNo)

                matchNo = 0
                for newSnt in newSnts:
                    # Compute the distance once and reuse it for the filter,
                    # the console report and the log entry.
                    distance = levenshtein_tok(oldSnt, newSnt)
                    if distance > self.MAX_DISTANCE:
                        continue
                    matchNo += 1

                    # Single-argument print calls produce the same output
                    # under Python 2 (statement) and Python 3 (function).
                    print('%s - %s :' % (sntNo, matchNo))
                    print('Ref:\n' + oldSnt)
                    print('Candidate:\n' + newSnt)
                    print('Levenshtein: %s \n' % distance)

                    # NOTE(review): the log entry has no newline after the
                    # "N - M:" header nor after either sentence, so fields
                    # run together. Kept byte-for-byte as before; confirm
                    # whether separators were intended.
                    log.write('%s - %s:' % (sntNo, matchNo))
                    log.write('Ref:\n' + oldSnt)
                    log.write('Candidate:\n' + newSnt)
                    log.write('Levenshtein: %s\n' % distance)

                # NOTE(review): assumed to belong to the outer loop, so it
                # ends the whole scan; confirm against the original layout.
                if sntNo > self.STOP_AFTER_SNT:
                    break

    @staticmethod
    def _readLines(filename):
        """Return the content of C{filename} split on newline characters."""
        with open(filename) as snts:
            return snts.read().split('\n')
# Driver: this call is executed whenever the module is loaded.
# The commented-out lines around it are other argument sets kept for
# reference.

# '/share/taraxu/data/KDE4/aligned/cs-en_src.txt', '/share/taraxu/data/KDE4/aligned/cs-en_tgt.txt', 100, '/share/taraxu/data/KDE4/aligned/
OOLinkage(
    '/media/DATA/Arbeit/DFKI/111102_OpenOffice/oolinkage/esde_src.c40.detok.txt',
    '/media/DATA/Arbeit/DFKI/111102_OpenOffice/oolinkage/de-es_tgt_detok.txt',
)

#'/home/Lukas Poustka/oo/selected/openoffice.de-en.sel104.detok.de', '/home/Lukas Poustka/oo/aligned/esde_src_detok.txt', '/share/taraxu/data/OpenOffice3/aligned/de-en_GB_src_links.txt', '/home/Lukas Poustka/oo/selected/openoffice.de-en.sel104.detok.de_links.txt'

#'/home/Lukas Poustka/oo/selected/openoffice.de-en.sel104.detok.de', '/share/taraxu/data/OpenOffice3/aligned/de-en_GB_src_detok.txt', '/share/taraxu/data/OpenOffice3/aligned/de-en_GB_src_links.txt', '/home/Lukas Poustka/oo/selected/openoffice.de-en.sel104.detok.de_links.txt'

#'/home/Lukas Poustka/oo/selected/esde_tgt.c40.detok.txt', '/share/taraxu/data/OpenOffice3/aligned/de-es_src_detok.txt', '/share/taraxu/data/OpenOffice3/aligned/de-es_src_links.txt', '/home/Lukas Poustka/oo/selected/esde_tgt.c40.detok_links.txt'

#'/home/Lukas Poustka/oo/selected/esde_src.c40.detok.txt', '/share/taraxu/data/OpenOffice3/aligned/de-es_tgt_detok.txt', '/share/taraxu/data/OpenOffice3/aligned/de-es_tgt_links.txt', '/home/Lukas Poustka/oo/selected/esde_src.c40.detok_links.txt'