Trees | Indices | Help |
|
---|
|
1 ''' 2 Created on 24 Mar 2012 3 4 @author: Eleftherios Avramidis 5 ''' 6 7 from featuregenerator import FeatureGenerator 8 import subprocess 9 import util 10 import codecs 11 import os 12 13 import Queue 14 import threading 15 from sentence.dataset import DataSet 1618 """ 19 """ 2238 3924 src_lang = parallelsentence.get_attribute("langsrc") #TODO: make this format independent by adding it as an attribute of the sentence objects 25 if src_lang == self.lang: 26 simplesentence.string = self.process_string(simplesentence.string) 27 return simplesentence2830 tgt_lang = parallelsentence.get_attribute("langtgt") 31 if tgt_lang == self.lang: 32 simplesentence.string = self.process_string(simplesentence.string) 33 return simplesentence34 3541 42136 137 # def add_features_batch(self, parallelsentences): 138 # dataset = DataSet(parallelsentences) 139 # 140 # if dataset.get_parallelsentences()[0].get_attribute("langsrc") == self.lang: 141 # sourcestrings = dataset.get_singlesource_strings() 142 # processed_sourcestrings = self._get_tool_output(sourcestrings) 143 # dataset.modify_singlesource_strings(processed_sourcestrings) 144 # 145 # 146 # if dataset.get_parallelsentences()[0].get_attribute("langtgt") == self.lang: 147 # targetstringlists = dataset.get_target_strings() 148 # processed_targetstringslist = [self._get_tool_output(targetstrings) for targetstrings in targetstringlists] 149 # dataset.modify_target_strings(processed_targetstringslist) 150 # 151 # return dataset.get_parallelsentences() 152 # 153 160 167 174 175 182 183 184 185 if __name__ == '__main__': 186 from dataprocessor.input.jcmlreader import JcmlReader 187 from dataprocessor.sax.saxps2jcml import Parallelsentence2Jcml 188 #path = "/home/Eleftherios Avramidis/taraxu_tools/scripts/tokenizer/tokenizer.perl" 189 #command_template = "{path} -b -l {lang}" 190 # path = "/home/Eleftherios Avramidis/taraxu_tools/scripts/tokenizer/normalize-punctuation.perl" 191 # command_template = "perl {path} -l {lang} -b" 192 tokenizer = Tokenizer("en") 193 parallelsentences = JcmlReader("/home/Eleftherios Avramidis/taraxu_data/jcml-latest/clean/wmt2011.newstest.en-de.rank-clean.jcml").get_parallelsentences() 194 tokenized = tokenizer.add_features_batch(parallelsentences) 195 #tokenizer.close() 196 Parallelsentence2Jcml(tokenized).write_to_file("/home/Eleftherios Avramidis/taraxu_data/jcml-latest/tok/wmt2011.newstest.en-de.rank-clean.jcml") 19744 out = 0 45 for line in iter(stdout.readline, ''): 46 print "thread received response: ", line 47 queue.put(line)48 # break 49 50 51 52 53 54 55 5658 self.lang = lang 59 params["lang"] = lang 60 params["path"] = path 61 self.command = command_template.format(**params) 62 command_items = self.command.split(' ') 63 self.output = [] 64 self.running = True 65 66 self.process = subprocess.Popen(command_items, 67 shell=False, 68 bufsize=1, 69 stdin=subprocess.PIPE, 70 stdout=subprocess.PIPE, 71 )72 73 74 # self.q = Queue.Queue() 75 # t = threading.Thread(target = self._enqueue_output, args = (self.process.stdout, self.q)) 76 # 77 # t.daemon = True 78 # t.start() 79 80 81 82 83 84 #self.process.stdin = codecs.getwriter('utf-8')(self.process.stdin) 85 #self.process.stdout = codecs.getreader('utf-8')(self.process.stdout) 8688 #string = string.decode('utf-8') 89 90 #string = string.encode('utf-8') 91 self.process.stdin.write('{0}{1}\n'.format(string, ' '*10240)) 92 self.process.stdin.flush() 93 self.process.stdout.flush() 94 95 output = self.process.stdout.readline().strip() 96 97 #some preprocessors occasionally return an empty string. In that case read once more 98 if output == "" and len(string) > 1: 99 output = self.process.stdout.readline().strip() 100 101 return output102104 self.running = False 105 try: 106 self.process.stdin.close() 107 self.process.terminate() 108 except: 109 pass110112 self.close()113 114116 import tempfile 117 118 f, filename = tempfile.mkstemp(text=True) 119 os.close(f) 120 print filename 121 f = open(filename, 'w') 122 for string in strings: 123 f.write(string) 124 f.write('\n') 125 f.close() 126 return filename127
Trees | Indices | Help |
|
---|
Generated by Epydoc 3.0.1 on Fri Jul 18 11:46:17 2014 | http://epydoc.sourceforge.net |