Package featuregenerator :: Package languagechecker :: Module languagetool
[hide private]
[frames] | [no frames]

Source Code for Module featuregenerator.languagechecker.languagetool

  1  ''' 
  2  Created on 24 Mar 2012 
  3   
  4  @author: Eleftherios Avramidis 
  5  ''' 
  6   
  7  import subprocess 
  8  import sys 
  9  import re 
 10  import codecs 
 11  import time 
 12  from threading  import Thread 
 13  import os 
 14   
 15  from featuregenerator.languagefeaturegenerator import LanguageFeatureGenerator 
 16   
class LanguageToolFeatureGenerator(LanguageFeatureGenerator):
    '''
    Feature generator that runs the LanguageTool grammar checker (a Java
    jar) on sentences and converts the reported rule matches into
    per-sentence count features.
    '''

    def print_output(self, out):
        # Background-thread reader: keep draining the given subprocess
        # stream into self.output until close() flips self.running.
        while self.running:
            self.output.append(out.readline())

    def __init__(self, path, lang, params=None, command_template='java -jar {path} -v -l {lang} -b --api'):
        '''
        Prepare the LanguageTool command line.

        @param path: filesystem path to the LanguageTool jar
        @param lang: language code handed to LanguageTool via -l
        @param params: optional extra values for the command template;
            copied internally so the caller's dict is never mutated
        @param command_template: format template producing the command
        '''
        # NOTE(review): the original used a mutable default ``params={}``
        # and wrote into it, leaking "lang"/"path" across instances that
        # relied on the default; take a private copy instead.
        params = dict(params) if params else {}
        self.lang = lang
        params["lang"] = lang
        params["path"] = path
        command = command_template.format(**params)
        self.command = command
        self.output = []
        self.running = True
        # Persistent-pipe mode is disabled; _get_tool_output launches a
        # fresh LanguageTool process per request instead.
        # self.process = subprocess.Popen(command.split(' '),
        #                                 shell=False, bufsize=0,
        #                                 stdin=subprocess.PIPE,
        #                                 stdout=subprocess.PIPE,
        #                                 stderr=subprocess.PIPE)
        # self.process.stdin = codecs.getwriter('utf-8')(self.process.stdin)
        # self.process.stdout = codecs.getreader('utf-8')(self.process.stdout)
        # Thread(target=self.print_output, args=(self.process.stdout,)).start()
        self.i = 0

    def _get_temporary_file(self, strings):
        # Write one string per line into a unique temporary file and
        # return its name.  mkstemp (unlike the original mktemp) creates
        # the file atomically and avoids the filename race condition.
        import tempfile
        handle, filename = tempfile.mkstemp()
        os.close(handle)
        with open(filename, 'w') as outfile:
            for string in strings:
                outfile.write(string)
                outfile.write('\n')
        return filename

    def _get_tool_output(self, strings):
        # Feed the sentences to LanguageTool via a temp file on stdin and
        # capture its combined stdout/stderr.  The temp file is removed
        # even if the subprocess fails.
        tmpfilename = self._get_temporary_file(strings)
        try:
            with open(tmpfilename, 'r') as tmpfile:
                output = subprocess.check_output(self.command.split(' '),
                                                 stdin=tmpfile,
                                                 stderr=subprocess.STDOUT)
        finally:
            os.remove(tmpfilename)
        return output

    def get_features_string(self, string):
        '''Run LanguageTool on one sentence and return its feature dict.'''
        return self.postprocess_output(self._get_tool_output([string]))

    def get_features_string_pipe(self, string):
        # Pipe-based variant for the (currently disabled) persistent
        # process; relies on self.process being created in __init__.
        # Equivalent of the original ``print >>stdin, string + "\n"``.
        self.process.stdin.write(string + "\n\n")
        self.process.stdin.flush()
        self.i += 1
        output = []
        self.process.stderr.readline()
        self.process.stderr.readline()
        # Crude synchronisation: give the reader thread time to collect
        # the response lines before harvesting them.
        time.sleep(0.3)
        output.extend(self.output)
        self.output = []
        return self.postprocess_output("\n".join(output))

    def close(self):
        '''Stop the background reader thread (if one is running).'''
        self.running = False
        # self.process.stdin.close()
        # self.process.terminate()

    def __del__(self):
        self.close()

    def postprocess_output(self, output):
        '''
        Parse LanguageTool's --api (XML) output into feature counts.

        @param output: raw text emitted by LanguageTool
        @return: dict mapping feature names to *string* values: one
            occurrence count (``lgt_<ruleid>``) and one character count
            (``lgt_<ruleid>_chars``) per rule id, plus overall
            ``lt_errors`` and ``lt_errors_chars`` totals.
        '''
        # One match per reported error: capture the rule id and the
        # length (in characters) of the offending span.
        pattern = r'ruleId="([^"]*)".*errorlength="([^"]*)"'
        errors = re.findall(pattern, output)

        # Accumulate occurrence and character counts per rule id.
        atts = {}
        counts = {}
        for error_id, error_length in errors:
            error_label = "lgt_{0}".format(error_id).lower()
            error_count_label = "lgt_{0}_chars".format(error_id).lower()
            try:
                atts[error_label] += 1
                counts[error_count_label] += int(error_length)
            except KeyError:
                atts[error_label] = 1
                counts[error_count_label] = int(error_length)

        # All feature values are serialized as strings downstream.
        atts = dict([(k, str(v)) for (k, v) in atts.items()])
        atts["lt_errors"] = str(len(errors))
        atts["lt_errors_chars"] = str(sum(counts.values()))
        counts = dict([(k, str(v)) for (k, v) in counts.items()])
        atts.update(counts)
        return atts
class LanguageCheckerCmd(LanguageFeatureGenerator):
    '''
    Offline wrapper around LanguageTool's plain command-line report:
    reads an existing report file and attaches per-sentence error counts
    to a jcml dataset.
    '''

    def __init__(self, path, lang, params=None, command_template='java -jar {path} -v -l {lang} -b --api'):
        # NOTE(review): path/params/command_template are currently unused;
        # kept for signature compatibility with LanguageToolFeatureGenerator.
        # The original's mutable default ``params={}`` is replaced by None.
        self.lang = lang

    def add_features_batch(self, parallelsentences):
        # NOTE(review): placeholder — spawns a dummy command and never
        # annotates or returns the sentences; confirm before relying on it.
        process = subprocess.Popen(['commandline', 'test2.py'], shell=False, stdin=subprocess.PIPE)
        # process.communicate(parallelsentence.get_)

    def offline_process(self, filename_input, filename_output, existing_jcml):
        '''
        Merge LanguageTool report features into an existing jcml corpus.

        @param filename_input: text file with LanguageTool's report
        @param filename_output: where the augmented jcml is written
        @param existing_jcml: jcml file providing the sentences
        '''
        # NOTE(review): JcmlReader/Parallelsentence2Jcml are only imported
        # in the __main__ guard of this module — verify they are in scope
        # when this method is called from elsewhere.
        dataset = JcmlReader(existing_jcml).get_dataset()
        size = dataset.get_size()
        with open(filename_input, 'r') as file_input:
            file_content = file_input.read()
        att_vector = self._get_att_vector(file_content, size)
        dataset.add_attribute_vector(att_vector)
        Parallelsentence2Jcml(dataset.get_parallelsentences()).write_to_file(filename_output)

    def _get_att_vector(self, file_content, size):
        '''
        Parse the plain-text LanguageTool report into one attribute dict
        per sentence.

        @param file_content: full text of the report, with entries like
            "1.) Line 5, column 3, Rule ID: SOME_RULE"
        @param size: number of sentences in the dataset
        @return: list of dicts mapping ``lgt_<ruleid>`` labels to counts
        '''
        # One entry per reported error: the 1-based line (= sentence)
        # number and the rule id.
        pattern = r"\d*.\) Line (\d*), column \d*, Rule ID: (.*)\n"
        feature_entries = re.findall(pattern, file_content)
        # Square brackets are replaced because they are not safe in
        # attribute names downstream.
        feature_entries = [(int(key), value.replace("[", "_").replace("]", "_"))
                           for (key, value) in feature_entries]

        errors_per_sentence = {}
        possible_error_ids = set()
        # First collect the list of error ids per (0-based) sentence index.
        for sentence_id, error_id in feature_entries:
            possible_error_ids.add(error_id)
            try:
                errors_per_sentence[sentence_id - 1].append(error_id)
            except KeyError:
                errors_per_sentence[sentence_id - 1] = [error_id]

        # Then build a count vector, emitting every seen rule id for every
        # sentence so the attribute set is uniform across sentences.
        # NOTE(review): range(0, size+1) yields size+1 dicts for size
        # sentences — looks like an off-by-one; preserved for
        # compatibility with existing consumers.
        vector_atts = []
        for i in range(0, size + 1):
            atts = {}
            for error_id in possible_error_ids:
                error_label = "lgt_{0}".format(error_id).lower()
                atts[error_label] = 0
            try:
                for error_id in errors_per_sentence[i]:
                    error_label = "lgt_{0}".format(error_id).lower()
                    atts[error_label] += 1
            except KeyError:
                pass
            vector_atts.append(atts)

        return vector_atts
if __name__ == '__main__':
    # Smoke-test / batch-annotation entry point with hard-coded local paths.
    path = "/home/Eleftherios Avramidis/taraxu_tools/LanguageTool-1.6/LanguageTool.jar"
    cmdfg = LanguageToolFeatureGenerator(path, 'en')
    # Imported here (not at module top) so the module can be loaded even
    # without the dataprocessor package installed.
    from dataprocessor.input.jcmlreader import JcmlReader
    from dataprocessor.sax.saxps2jcml import Parallelsentence2Jcml
    parallelsentences = JcmlReader("/home/Eleftherios Avramidis/taraxu_data/selection-mechanism/ml4hmt/app/autoranking/4/wmt00-test-devpart.orig.jcml").get_parallelsentences()
    # NOTE(review): add_features_batch is not defined on
    # LanguageToolFeatureGenerator in this file — it must come from the
    # LanguageFeatureGenerator base class; confirm it returns the
    # annotated sentences (not None) before trusting the output file.
    annotated = cmdfg.add_features_batch(parallelsentences)
    cmdfg.close()
    Parallelsentence2Jcml(annotated).write_to_file("/home/Eleftherios Avramidis/taraxu_data/selection-mechanism/ml4hmt/app/autoranking/4/training-sample.lt.jcml")