'''
Created on 24 Mar 2012

@author: Eleftherios Avramidis
'''

import subprocess
import sys
import re
import codecs
import time
from threading import Thread
import os

from featuregenerator.languagefeaturegenerator import LanguageFeatureGenerator


class LanguageToolFeatureGenerator(LanguageFeatureGenerator):
    '''
    Feature generator that runs the LanguageTool grammar checker over
    sentences and turns the reported rule violations into count features.
    '''

    def __init__(self, path, lang, params={},
                 command_template='java -jar {path} -v -l {lang} -b --api'):
        '''
        Constructor
        '''
        self.lang = lang
        params["lang"] = lang
        params["path"] = path
        command = command_template.format(**params)
        self.command = command
        command_items = command.split(' ')
        self.output = []
        self.running = True

        # A persistent LanguageTool process fed via stdin was used in an
        # earlier version and is kept here commented out.
#        self.process = subprocess.Popen(command_items,
#                                        shell=False,
#                                        bufsize=0,
#                                        stdin=subprocess.PIPE,
#                                        stdout=subprocess.PIPE,
#                                        stderr=subprocess.PIPE
#                                        )
##        self.process.stdout.readline()
##        self.process.stdout.readline()
#        self.process.stdin = codecs.getwriter('utf-8')(self.process.stdin)
#        self.process.stdout = codecs.getreader('utf-8')(self.process.stdout)
#        #Thread(target=self.print_output, args=(self.process.stderr,)).start()
#        Thread(target=self.print_output, args=(self.process.stdout,)).start()
        self.i = 0

    def _get_temporary_file(self, strings):
        # write the given strings, one per line, into a temporary file
        import tempfile

        filename = tempfile.mktemp()
        file = open(filename, 'w')
        for string in strings:
            file.write(string)
            file.write('\n')
        file.close()
        return filename

    # The name of this method is not preserved in the generated listing;
    # "_get_output_batch" is a placeholder. It pipes a batch of strings to
    # LanguageTool through a temporary file and returns the raw output.
    def _get_output_batch(self, strings):
        tmpfilename = self._get_temporary_file(strings)
        tmpfile = open(tmpfilename, 'r')
        commanditems = self.command.split(' ')
        output = subprocess.check_output(commanditems, stdin=tmpfile, stderr=subprocess.STDOUT)
        tmpfile.close()
        os.remove(tmpfilename)
        return output

    # Again a placeholder name: this method sends a single sentence to the
    # (commented-out) persistent LanguageTool process and collects its output.
    def get_features_string(self, string):
        print >> self.process.stdin, string + "\n"
        #print string
        self.process.stdin.flush()
        self.i += 1
        output = []

        self.process.stderr.readline()
        self.process.stderr.readline()
        time.sleep(0.3)
        output.extend(self.output)
        self.output = []

        #print self.i
        #print "\n".join(output)
        return self.postprocess_output("\n".join(output))

    def close(self):
        # only needed when the persistent process above is enabled
#        self.process.stdin.close()
#        self.process.terminate()
        pass

    # Method name reconstructed: the listing only shows the self.close() call,
    # which fits a destructor.
    def __del__(self):
        self.close()

    def postprocess_output(self, output):
        #pattern that matches one error appearance in the XML (--api) output
        pattern = 'ruleId="([^"]*)".*errorlength="([^"]*)"'
        #get a list of the error appearances
        errors = re.findall(pattern, output)

        #construct a dictionary of counts per rule id
        atts = {}
        counts = {}

        for error_id, error_length in errors:
            error_label = "lgt_{0}".format(error_id).lower()
            error_count_label = "lgt_{0}_chars".format(error_id).lower()
            try:
                atts[error_label] += 1
                counts[error_count_label] += int(error_length)
            except KeyError:
                atts[error_label] = 1
                counts[error_count_label] = int(error_length)
            #print counts[error_count_label]

        atts = dict([(k, str(v)) for (k, v) in atts.iteritems()])
        atts["lt_errors"] = str(len(errors))
        atts["lt_errors_chars"] = str(sum(counts.values()))
        counts = dict([(k, str(v)) for (k, v) in counts.iteritems()])
        atts.update(counts)
        #print atts
        return atts


# The name of this second class is not preserved in the generated listing;
# "LanguageToolCommandlineFeatureGenerator" is a placeholder. It works on the
# plain-text report of a LanguageTool command-line run instead of the XML API.
class LanguageToolCommandlineFeatureGenerator(LanguageFeatureGenerator):

    def __init__(self, path, lang, params={},
                 command_template='java -jar {path} -v -l {lang} -b --api'):
        # apparently leftover experimental code; the arguments are unused
        process = subprocess.Popen(['commandline', 'test2.py'], shell=False, stdin=subprocess.PIPE)
#        process.communicate(parallelsentence.get_)

    # Placeholder name: annotates an existing JCML file with the attribute
    # vector extracted from a LanguageTool plain-text report.
    def add_features_file(self, filename_input, filename_output, existing_jcml):
        # these imports are not visible in the listing but are required here
        from dataprocessor.input.jcmlreader import JcmlReader
        from dataprocessor.sax.saxps2jcml import Parallelsentence2Jcml

        dataset = JcmlReader(existing_jcml).get_dataset()
        size = dataset.get_size()
        file_input = open(filename_input, 'r')
        file_content = file_input.read()
        att_vector = self._get_att_vector(file_content, size)
        dataset.add_attribute_vector(att_vector)

        Parallelsentence2Jcml(dataset.get_parallelsentences()).write_to_file(filename_output)

    def _get_att_vector(self, file_content, size):
        # pattern matching one error line of LanguageTool's plain-text report
        pattern = "\d*.\) Line (\d*), column \d*, Rule ID: (.*)\n"

        feature_entries = re.findall(pattern, file_content)
        feature_entries = [(int(key), value.replace("[", "_").replace("]", "_")) for (key, value) in feature_entries]
        errors_per_sentence = {}
        possible_error_ids = set()
        #first make one list of error ids per sentence
        for sentence_id, error_id in feature_entries:
            possible_error_ids.add(error_id)
            try:
                errors_per_sentence[sentence_id - 1].append(error_id)
            except KeyError:
                errors_per_sentence[sentence_id - 1] = [error_id]

        #construct a vector of dictionaries with counts
        vector_atts = []
        for i in range(0, size + 1):
            atts = {}
            for error_id in possible_error_ids:
                error_label = "lgt_{0}".format(error_id).lower()
                atts[error_label] = 0
            try:
                for error_id in errors_per_sentence[i]:
                    error_label = "lgt_{0}".format(error_id).lower()
                    atts[error_label] += 1
            except KeyError:
                pass
            vector_atts.append(atts)

        return vector_atts


if __name__ == '__main__':
    path = "/home/Eleftherios Avramidis/taraxu_tools/LanguageTool-1.6/LanguageTool.jar"
    cmdfg = LanguageToolFeatureGenerator(path, 'en')
    from dataprocessor.input.jcmlreader import JcmlReader
    from dataprocessor.sax.saxps2jcml import Parallelsentence2Jcml
    parallelsentences = JcmlReader("/home/Eleftherios Avramidis/taraxu_data/selection-mechanism/ml4hmt/app/autoranking/4/wmt00-test-devpart.orig.jcml").get_parallelsentences()
    annotated = cmdfg.add_features_batch(parallelsentences)
    cmdfg.close()
    Parallelsentence2Jcml(annotated).write_to_file("/home/Eleftherios Avramidis/taraxu_data/selection-mechanism/ml4hmt/app/autoranking/4/training-sample.lt.jcml")
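
Note on postprocess_output: it only depends on the ruleId and errorlength attributes that its regular expression extracts from LanguageTool's --api (XML) output. The snippet below is a minimal illustrative sketch, not part of the original module; the XML fragment and rule IDs are fabricated to mimic the shape the regular expression expects rather than taken from a real LanguageTool run. It works because the constructor only builds the command string and does not launch a process.

    # Illustrative only: fabricated fragment shaped like the expected XML output.
    sample_output = ('<error ruleId="EN_A_VS_AN" errorlength="2"/>\n'
                     '<error ruleId="EN_A_VS_AN" errorlength="3"/>\n'
                     '<error ruleId="UPPERCASE_SENTENCE_START" errorlength="5"/>')
    fg = LanguageToolFeatureGenerator("LanguageTool.jar", "en")  # no process is started
    print fg.postprocess_output(sample_output)
    # yields (key order may differ):
    # {'lgt_en_a_vs_an': '2', 'lgt_en_a_vs_an_chars': '5',
    #  'lgt_uppercase_sentence_start': '1', 'lgt_uppercase_sentence_start_chars': '5',
    #  'lt_errors': '3', 'lt_errors_chars': '10'}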