Package featuregenerator :: Module preprocessor

Source Code for Module featuregenerator.preprocessor

'''
Created on 24 Mar 2012

@author: Eleftherios Avramidis
'''

from featuregenerator import FeatureGenerator
import subprocess
import util
import codecs
import os

# Queue, threading and DataSet are referenced only by code that is currently
# commented out (the threaded output reader and the batch implementation below).
import Queue
import threading
from sentence.dataset import DataSet
class Preprocessor(FeatureGenerator):
    """
    Base class for feature generators that modify the text of the sentences
    themselves, rather than adding feature attributes.
    """
    def __init__(self, lang):
        self.lang = lang
    def add_features_src(self, simplesentence, parallelsentence=None):
        src_lang = parallelsentence.get_attribute("langsrc") #TODO: make this format independent by adding it as an attribute of the sentence objects
        if src_lang == self.lang:
            simplesentence.string = self.process_string(simplesentence.string)
        return simplesentence

    def add_features_tgt(self, simplesentence, parallelsentence=None):
        tgt_lang = parallelsentence.get_attribute("langtgt")
        if tgt_lang == self.lang:
            simplesentence.string = self.process_string(simplesentence.string)
        return simplesentence

    def process_string(self, string):
        raise NotImplementedError


class CommandlinePreprocessor(Preprocessor):
    """
    Pipes sentences line by line through an external command-line tool,
    which is kept running as a persistent subprocess.
    """

    def _enqueue_output(self, stdout, queue):
        # Helper for the (currently disabled) threaded reader set up below.
        for line in iter(stdout.readline, ''):
            print "thread received response: ", line
            queue.put(line)

    def __init__(self, path, lang, params=None, command_template=""):
        # Note: the original used a mutable default argument (params={}),
        # a Python pitfall since the dict would be shared across calls.
        if params is None:
            params = {}
        self.lang = lang
        params["lang"] = lang
        params["path"] = path
        self.command = command_template.format(**params)
        command_items = self.command.split(' ')
        self.output = []
        self.running = True

        self.process = subprocess.Popen(command_items,
                                        shell=False,
                                        bufsize=1,
                                        stdin=subprocess.PIPE,
                                        stdout=subprocess.PIPE,
                                        )

        # Disabled: read the tool's output asynchronously from a daemon thread.
        #self.q = Queue.Queue()
        #t = threading.Thread(target=self._enqueue_output, args=(self.process.stdout, self.q))
        #t.daemon = True
        #t.start()

        # Disabled: wrap the pipes so unicode strings can be written/read directly.
        #self.process.stdin = codecs.getwriter('utf-8')(self.process.stdin)
        #self.process.stdout = codecs.getreader('utf-8')(self.process.stdout)
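
# For example (values taken from the subclasses below), the template
# "perl {path} -b -l {lang}" with path=".../tokenizer.perl" and lang="en"
# expands via str.format to the command:
#   perl .../tokenizer.perl -b -l en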

    def process_string(self, string):
        # Pad the line heavily with spaces, apparently to defeat the tool's
        # output buffering, then send it and read one processed line back.
        self.process.stdin.write('{0}{1}\n'.format(string, ' '*10240))
        self.process.stdin.flush()
        self.process.stdout.flush()

        output = self.process.stdout.readline().strip()

        # Some preprocessors occasionally return an empty string; in that case read once more.
        if output == "" and len(string) > 1:
            output = self.process.stdout.readline().strip()

        return output
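
# Example usage (a sketch; the Tokenizer subclass is defined further below):
#   tok = Tokenizer("en")
#   tok.process_string("Hello, world!")   # returns one tokenized line
#   tok.close()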

    def close(self):
        # Shut down the wrapped subprocess; ignore errors if it already exited.
        self.running = False
        try:
            self.process.stdin.close()
            self.process.terminate()
        except:
            pass

    def __del__(self):
        self.close()

    def _get_temporary_file(self, strings):
        # Write the given strings to a fresh temporary file, one per line,
        # and return its filename.
        import tempfile

        f, filename = tempfile.mkstemp(text=True)
        os.close(f)
        f = open(filename, 'w')
        for string in strings:
            f.write(string)
            f.write('\n')
        f.close()
        return filename

    def _get_tool_output(self, strings):
        # Run the tool once over a temporary file containing all strings
        # and return its output lines.
        tmpfilename = self._get_temporary_file(strings)
        tmpfile = open(tmpfilename, 'r')
        commanditems = self.command.split(' ')
        output = subprocess.check_output(commanditems, stdin=tmpfile).split('\n')
        tmpfile.close()
        os.remove(tmpfilename)  # was commented out and passed the file object by mistake
        return output

    # Disabled batch implementation, kept for reference:
    #def add_features_batch(self, parallelsentences):
    #    dataset = DataSet(parallelsentences)
    #
    #    if dataset.get_parallelsentences()[0].get_attribute("langsrc") == self.lang:
    #        sourcestrings = dataset.get_singlesource_strings()
    #        processed_sourcestrings = self._get_tool_output(sourcestrings)
    #        dataset.modify_singlesource_strings(processed_sourcestrings)
    #
    #    if dataset.get_parallelsentences()[0].get_attribute("langtgt") == self.lang:
    #        targetstringlists = dataset.get_target_strings()
    #        processed_targetstringslist = [self._get_tool_output(targetstrings) for targetstrings in targetstringlists]
    #        dataset.modify_target_strings(processed_targetstringslist)
    #
    #    return dataset.get_parallelsentences()

class Normalizer(CommandlinePreprocessor):
    """Punctuation normalization via the normalize-punctuation.perl script bundled in util."""
    def __init__(self, lang):
        path = util.__path__[0]
        path = os.path.join(path, "normalize-punctuation.perl")
        command_template = "perl {path} -b -l {lang}"
        super(Normalizer, self).__init__(path, lang, {}, command_template)

class Tokenizer(CommandlinePreprocessor):
    """Tokenization via the tokenizer.perl script bundled in util."""
    def __init__(self, lang):
        path = util.__path__[0]
        path = os.path.join(path, "tokenizer.perl")
        command_template = "perl {path} -b -l {lang}"
        super(Tokenizer, self).__init__(path, lang, {}, command_template)

class Detokenizer(CommandlinePreprocessor):
    """Detokenization via the detokenizer.perl script bundled in util."""
    def __init__(self, lang):
        path = util.__path__[0]
        path = os.path.join(path, "detokenizer.perl")
        command_template = "perl {path} -l {lang}"
        super(Detokenizer, self).__init__(path, lang, {}, command_template)

class Truecaser(CommandlinePreprocessor):
    """Truecasing via the truecase.perl script bundled in util, given a trained model."""
    def __init__(self, lang, model):
        path = util.__path__[0]
        path = os.path.join(path, "truecase.perl")
        command_template = "perl {path} -model {model}"
        super(Truecaser, self).__init__(path, lang, {"model": model}, command_template)
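
# Example usage (hypothetical model path; truecasing models are trained separately):
#   truecaser = Truecaser("en", "/path/to/truecase-model.en")
#   truecaser.process_string("the house is small .")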


if __name__ == '__main__':
    from dataprocessor.input.jcmlreader import JcmlReader
    from dataprocessor.sax.saxps2jcml import Parallelsentence2Jcml
    #path = "/home/Eleftherios Avramidis/taraxu_tools/scripts/tokenizer/tokenizer.perl"
    #command_template = "{path} -b -l {lang}"
    #path = "/home/Eleftherios Avramidis/taraxu_tools/scripts/tokenizer/normalize-punctuation.perl"
    #command_template = "perl {path} -l {lang} -b"
    tokenizer = Tokenizer("en")
    parallelsentences = JcmlReader("/home/Eleftherios Avramidis/taraxu_data/jcml-latest/clean/wmt2011.newstest.en-de.rank-clean.jcml").get_parallelsentences()
    tokenized = tokenizer.add_features_batch(parallelsentences)
    #tokenizer.close()
    Parallelsentence2Jcml(tokenized).write_to_file("/home/Eleftherios Avramidis/taraxu_data/jcml-latest/tok/wmt2011.newstest.en-de.rank-clean.jcml")