Package app :: Package autoranking :: Module annotate_batch
[hide private]
[frames | no frames]

Source Code for Module app.autoranking.annotate_batch

  1  ''' 
  2  Created on 17 Jan 2012 
  3  Modified 22 Mar 2012 for autoranking app 
  4  @author: Eleftherios Avramidis 
  5  ''' 
  6   
  7  import shutil 
  8  import os 
  9  import re 
 10  import sys 
 11   
 12  #pipeline essentials 
 13  from ruffus import * 
 14  #from multiprocessing import Process, Manager  
 15  from ruffus.task import pipeline_printout_graph, pipeline_printout 
 16   
 17  #internal code classes 
 18  import bootstrap  
 19  cfg = bootstrap.get_cfg() 
 20  from dataprocessor.input.jcmlreader import JcmlReader 
 21  from dataprocessor.sax.saxps2jcml import Parallelsentence2Jcml  
 22  from dataprocessor.sax import saxjcml 
 23  from featuregenerator.parser.berkeley.parsermatches import ParserMatches 
 24  from featuregenerator.parser.berkeley.cfgrules import CfgRulesExtractor 
 25  from featuregenerator.lengthfeaturegenerator import LengthFeatureGenerator 
 26  from featuregenerator.ratio_generator import RatioGenerator 
 27  from featuregenerator.ibm1featuregenerator import Ibm1FeatureGenerator 
 28  from featuregenerator.levenshtein.levenshtein_generator import LevenshteinGenerator 
 29  from featuregenerator.bleu.bleugenerator import CrossBleuGenerator, BleuGenerator 
 30  from featuregenerator.meteor.meteor import CrossMeteorGenerator, MeteorGenerator 
 31  from featuregenerator.attribute_rank import AttributeRankGenerator 
 32  from dataprocessor.input.xmlreader import XmlReader 
 33  from featuregenerator.languagechecker.languagetool_socket import LanguageToolSocketFeatureGenerator 
 34  from featuregenerator.preprocessor import Normalizer 
 35  from featuregenerator.preprocessor import Tokenizer 
 36   
 37   
 38   
 39   
 40   
 41   
 42   
 43  gateway = cfg.java_init() 
 44   
 45  cores = int(cfg.get("general", "cores")) 
 46  parallel_feature_functions = [] 
 47  sys.stderr.write("running with {} cores\n".format(cores))  
 48   
 49  path = cfg.get_path() 
 50  os.chdir(path) 
 51  source_language =  cfg.get("general", "source_language") 
 52  target_language =  cfg.get("general", "target_language") 
 53  training_sets = cfg.get("training", "filenames").split(",") 
 54  testing_set = cfg.get("testing", "filename") 
 55  all_sets = training_sets 
 56  #all_sets.append(testing_set) 
 57   
 58  print all_sets 
def get_basename(filename):
    """
    Return the part of the file name (without directories) that precedes ".jcml".

    The extracted name is also printed to stdout.

    @param filename: path of a file whose name contains ".jcml"
    @return: the file name up to (not including) the last ".jcml"
    @raise IndexError: if the file name does not contain ".jcml"
    """
    name = os.path.basename(filename)
    # Raw string: "\." is not a valid escape in a plain string literal.
    matches = re.findall(r"(.*)\.jcml", name)
    if not matches:
        # Same exception type as the former bare [0] lookup, but with a
        # message that names the offending file.
        raise IndexError("File name '{0}' does not contain '.jcml'".format(name))
    basename = matches[0]
    print(basename)
    return basename
# Job list for data_fetch: one [external file, local copy name] pair per set.
# The local name is the basename with "." replaced by "-", plus ".orig.jcml".
params = []
for external_file in all_sets:
    basename = get_basename(external_file)
    print("Found basename")
    basename = basename.replace(".", "-")
    output_file = "{0}.{1}".format(basename, "orig.jcml")
    params.append([external_file, output_file])
@files(params)
def data_fetch(external_file, output_file):
    """
    Fetch training file and place it comfortably in the working directory.
    Files are expected to contain the set name, followed by the ending .jcml

    @param external_file: path of the dataset to copy
    @param output_file: name of the local copy (taken from the params list)
    """
    # The output name is no longer derived here; it is computed at module
    # level while building params.
    print("%s %s" % ("Moving here external file ", external_file))
    print("%s %s" % ("output", output_file))
    shutil.copy(external_file, output_file)
# Optional setting: fall back to an empty list when it cannot be read.
try:
    annotated_filenames = cfg.get("training", "annotated_filenames").split(",")
except Exception:
    # Deliberately best-effort, but unlike a bare "except:" this lets
    # KeyboardInterrupt and SystemExit propagate.
    annotated_filenames = []
# Disabled task, kept for reference:
#@split(data_fetch,"*.ext.f.jcml", annotated_filenames)
#def add_externally_annotated_sets(input_file, output_files, external_files):
##    input_basename = get_basename(input_file)
#    for external_file in external_files:
#        external_basename = get_basename(external_file)
##        if input_basename == external_basename:
#        shutil.copy(external_file, "%s.ext.f.jcml" % external_basename)
#
#if (cfg.exists_parser(target_language)):
#    parallel_feature_functions.append(add_externally_annotated_sets)

@transform(data_fetch, suffix("orig.jcml"), "tok.jcml")
def preprocess_data(input_file, output_file):
    """
    Run normalizers and tokenizers for both languages over a fetched dataset.

    The generators are applied in this order: source normalizer, target
    normalizer, source tokenizer, target tokenizer.

    @param input_file: "*orig.jcml" file produced by data_fetch
    @param output_file: corresponding "*tok.jcml" file to write
    """
    languages = (source_language, target_language)
    fgs = [generator_class(language)
           for generator_class in (Normalizer, Tokenizer)
           for language in languages]

    # Batch alternative, kept for reference:
    # parallelsentences = JcmlReader(input_file).get_parallelsentences()
    # for fg in fgs:
    #     parallelsentences = fg.add_features_batch(parallelsentences)
    # Parallelsentence2Jcml(parallelsentences).write_to_file(output_file)

    # NOTE(review): the meaning of the trailing True flag is defined by
    # saxjcml.run_features_generator -- confirm there.
    saxjcml.run_features_generator(input_file, output_file, fgs, True)
@jobs_limit(1, "checker")
@active_if(cfg.exists_checker(source_language))
@transform(data_fetch, suffix(".orig.jcml"), ".iq.%s.f.jcml" % source_language, source_language)
def features_checker_source(input_file, output_file, source_language):
    """
    Annotate a fetched dataset with the checker configured for the source language.

    @param input_file: "*.orig.jcml" file produced by data_fetch
    @param output_file: "*.iq.<source_language>.f.jcml" file to write
    @param source_language: language code used to look up the checker in cfg
    """
    # features_checker(input_file, output_file, language_checker_source)
    # saxjcml.run_features_generator(input_file, output_file, [cfg.get_checker(source_language)])
    #ATTENTION: for some reason, the checker has to be initialized via suds in the same thread as it is being run
    checker = cfg.get_checker(source_language)
    checker.add_features_batch_xml(input_file, output_file)


# Only gather this task's output when a source-language checker is configured.
if cfg.exists_checker(source_language):
    parallel_feature_functions.append(features_checker_source)
#language_checker_target = cfg.get_checker(target_language)


@jobs_limit(1, "checker")
@active_if(cfg.exists_checker(target_language))
@transform(data_fetch, suffix(".orig.jcml"), ".iq.%s.f.jcml" % target_language, target_language)
def features_checker_target(input_file, output_file, target_language):
    """
    Annotate a fetched dataset with the checker configured for the target language.

    @param input_file: "*.orig.jcml" file produced by data_fetch
    @param output_file: "*.iq.<target_language>.f.jcml" file to write
    @param target_language: language code used to look up the checker in cfg
    """
    # features_checker(input_file, output_file, language_checker_target)
    # saxjcml.run_features_generator(input_file, output_file, [cfg.get_checker(target_language)])
    checker = cfg.get_checker(target_language)
    checker.add_features_batch_xml(input_file, output_file)


# Only gather this task's output when a target-language checker is configured.
if cfg.exists_checker(target_language):
    parallel_feature_functions.append(features_checker_target)
#def features_checker(input_file, output_file, language_checker):
#    saxjcml.run_features_generator(input_file, output_file, [language_checker])


@jobs_limit(1, "ltool") #Dunno why, but only one language tool at a time
@active_if(cfg.has_section("languagetool"))
@transform(data_fetch, suffix(".orig.jcml"), ".lt.%s.f.jcml" % source_language, source_language)
def features_langtool_source(input_file, output_file, language):
    """
    Source-language LanguageTool task; delegates to features_langtool.

    @param input_file: "*.orig.jcml" file produced by data_fetch
    @param output_file: "*.lt.<source_language>.f.jcml" file to write
    @param language: the source language, supplied by the decorator
    """
    features_langtool(input_file, output_file, language)
@jobs_limit(1, "ltool")
@active_if(cfg.has_section("languagetool"))
@transform(data_fetch, suffix(".orig.jcml"), ".lt.%s.f.jcml" % target_language, target_language)
def features_langtool_target(input_file, output_file, language):
    """
    Target-language LanguageTool task; delegates to features_langtool.

    @param input_file: "*.orig.jcml" file produced by data_fetch
    @param output_file: "*.lt.<target_language>.f.jcml" file to write
    @param language: the target language, supplied by the decorator
    """
    features_langtool(input_file, output_file, language)


# Both LanguageTool tasks are registered together, target first.
if cfg.has_section("languagetool"):
    parallel_feature_functions.append(features_langtool_target)
    parallel_feature_functions.append(features_langtool_source)
def features_langtool(input_file, output_file, language):
    """
    Run a LanguageToolSocketFeatureGenerator for the given language over a file.

    @param input_file: jcml file to annotate
    @param output_file: jcml file to write
    @param language: language code handed to the feature generator
    """
    # NOTE(review): this reads cfg.gateway, not the module-level `gateway`
    # returned by cfg.java_init() -- confirm both refer to the same gateway.
    languagetool = LanguageToolSocketFeatureGenerator(language, cfg.gateway)
    saxjcml.run_features_generator(input_file, output_file, [languagetool])
@split(preprocess_data, "*.part.jcml", cores)
def original_data_split(input_files, output_files, parts):
    """
    Split the datasets to parts, in order to perform heavy tasks

    @param input_files: tokenized files produced by preprocess_data
    @param output_files: supplied by the @split decorator; not used in the body
    @param parts: number of parts to write per file (the configured cores)
    """
    # The pattern is the same for every file, so it is defined once.
    re_split = r"([^.]*)\.tok\.(jcml)"
    for input_file in input_files:
        print("%s %s" % ("splitting file", input_file))
        XmlReader(input_file).split_and_write(parts, re_split)
@active_if(cfg.exists_parser(source_language))
@transform(original_data_split, suffix("part.jcml"), "part.parsed.%s.f.jcml" % source_language, source_language, cfg.get_parser_name(source_language))
def features_berkeley_source(input_file, output_file, source_language, parser_name):
    """
    Parse the source side of one dataset part; delegates to features_berkeley.

    @param input_file: "*part.jcml" file produced by original_data_split
    @param output_file: "*part.parsed.<source_language>.f.jcml" file to write
    @param source_language: language whose parser is used
    @param parser_name: supplied by the decorator; not used in the body
    """
    features_berkeley(input_file, output_file, source_language)
@active_if(cfg.exists_parser(target_language))
@transform(original_data_split, suffix("part.jcml"), "part.parsed.%s.f.jcml" % target_language, target_language, cfg.get_parser_name(target_language))
def features_berkeley_target(input_file, output_file, target_language, parser_name):
    """
    Parse the target side of one dataset part; delegates to features_berkeley.

    @param input_file: "*part.jcml" file produced by original_data_split
    @param output_file: "*part.parsed.<target_language>.f.jcml" file to write
    @param target_language: language whose parser is used
    @param parser_name: supplied by the decorator; not used in the body
    """
    features_berkeley(input_file, output_file, target_language)
def features_berkeley(input_file, output_file, language):
    """
    Parsing

    @param input_file: jcml file to annotate
    @param output_file: jcml file to write
    @param language: language whose parser is fetched from cfg
    """
    #this is bypassing the architecture, but avoids wasting memory for the loaded parser
    berkeley_parser = cfg.get_parser(language)
    saxjcml.run_features_generator(input_file, output_file, [berkeley_parser])
# Former XML-RPC parser variant, kept for reference:
#    parser = BerkeleyXMLRPCFeatureGenerator(parser_url, language, parser_tokenize)
#    saxjcml.run_features_generator(input_file, output_file, [parser])

@active_if(cfg.exists_parser(source_language))
#@merge(features_berkeley_source, "parsed.%s.f.jcml" % source_language)
@collate(features_berkeley_source, regex(r"([^.]+)\.\s?(\d+)\.part.parsed.([^.]+).f.jcml"), r"\1.parsed.\3.f.jcml")
def merge_parse_parts_source(inputs, output):
    """
    Join the parsed source-language parts of one dataset into a single file.

    @param inputs: parsed part files that share a dataset name and language
    @param output: "<dataset>.parsed.<language>.f.jcml" file to write
    """
    merge_parts(inputs, output)


# Only gather this task's output when a source-language parser is configured.
if cfg.exists_parser(source_language):
    parallel_feature_functions.append(merge_parse_parts_source)
@active_if(cfg.exists_parser(target_language))
#@merge(features_berkeley_target, "parsed.%s.f.jcml" % target_language)
@collate(features_berkeley_target, regex(r"([^.]+)\.\s?(\d+)\.part.parsed.([^.]+).f.jcml"), r"\1.parsed.\3.f.jcml")
def merge_parse_parts_target(inputs, output):
    """
    Join the parsed target-language parts of one dataset into a single file.

    @param inputs: parsed part files that share a dataset name and language
    @param output: "<dataset>.parsed.<language>.f.jcml" file to write
    """
    merge_parts(inputs, output)


# Only gather this task's output when a target-language parser is configured.
if cfg.exists_parser(target_language):
    parallel_feature_functions.append(merge_parse_parts_target)
def _part_sort_key(filename):
    """
    Return a sort key that orders part files by their numeric part index.

    The index is the number in front of ".part.", as matched by the @collate
    patterns of the merge_parse_parts_* tasks. Names without such an index
    sort by the plain file name.
    """
    match = re.search(r"\.\s?(\d+)\.part\.", filename)
    if match is None:
        return (filename, -1, filename)
    return (filename[:match.start()], int(match.group(1)), filename)


def merge_parts(inputs, output):
    """
    Concatenate the parallel sentences of several jcml part files into one file.

    Parts are read in numeric part order. Plain lexicographic sorting would
    put part 10 before part 2 once there are ten or more parts.

    @param inputs: names of the part files to join
    @param output: name of the jcml file to write
    """
    print(inputs)
    parallelsentences = []
    for inp in sorted(inputs, key=_part_sort_key):
        parallelsentences.extend(JcmlReader(inp).get_parallelsentences())
    Parallelsentence2Jcml(parallelsentences).write_to_file(output)
@transform(preprocess_data, suffix(".tok.jcml"), ".tc.%s.jcml" % source_language, source_language, cfg.get_truecaser_model(source_language))
def truecase_source(input_file, output_file, language, model):
    """
    Truecase a tokenized dataset with the source-language model; delegates to truecase.

    @param input_file: "*.tok.jcml" file produced by preprocess_data
    @param output_file: "*.tc.<source_language>.jcml" file to write
    @param language: the source language, supplied by the decorator
    @param model: truecaser model obtained from cfg for that language
    """
    truecase(input_file, output_file, language, model)
@transform(preprocess_data, suffix(".tok.jcml"), ".tc.%s.jcml" % target_language, target_language, cfg.get_truecaser_model(target_language))
def truecase_target(input_file, output_file, language, model):
    """
    Truecase a tokenized dataset with the target-language model; delegates to truecase.

    @param input_file: "*.tok.jcml" file produced by preprocess_data
    @param output_file: "*.tc.<target_language>.jcml" file to write
    @param language: the target language, supplied by the decorator
    @param model: truecaser model obtained from cfg for that language
    """
    truecase(input_file, output_file, language, model)
def truecase(input_file, output_file, language, model):
    """
    Run a Truecaser built from the given language and model over a jcml file.

    @param input_file: jcml file to read
    @param output_file: jcml file to write
    @param language: language code handed to the Truecaser
    @param model: truecaser model handed to the Truecaser
    """
    # Imported here rather than at module level, as in the original code.
    from featuregenerator.preprocessor import Truecaser
    generators = [Truecaser(language, model)]
    saxjcml.run_features_generator(input_file, output_file, generators)
@transform(truecase_target, suffix(".tc.%s.jcml" % target_language), ".bleu.%s.f.jcml" % target_language)
def cross_bleu(input_file, output_file):
    """
    Run a CrossBleuGenerator over the target-truecased dataset.

    @param input_file: "*.tc.<target_language>.jcml" file from truecase_target
    @param output_file: "*.bleu.<target_language>.f.jcml" file to write
    """
    generators = [CrossBleuGenerator()]
    saxjcml.run_features_generator(input_file, output_file, generators)


# This task is always registered; it has no configuration switch.
parallel_feature_functions.append(cross_bleu)
@active_if(cfg.has_section("meteor"))
@transform(truecase_target, suffix(".tc.%s.jcml" % target_language), ".meteor.%s.f.jcml" % target_language, target_language, cfg.get_classpath()[0], cfg.get_classpath()[1])
def cross_meteor(input_file, output_file, target_language, classpath, dir_path):
    """
    Run a CrossMeteorGenerator over the target-truecased dataset.

    @param input_file: "*.tc.<target_language>.jcml" file from truecase_target
    @param output_file: "*.meteor.<target_language>.f.jcml" file to write
    @param target_language: language code handed to the generator
    @param classpath: first element of cfg.get_classpath()
    @param dir_path: second element of cfg.get_classpath()
    """
    meteor = CrossMeteorGenerator(target_language, classpath, dir_path)
    saxjcml.run_features_generator(input_file, output_file, [meteor])


# Only gather this task's output when a "meteor" section is configured.
if cfg.has_section("meteor"):
    parallel_feature_functions.append(cross_meteor)
# Batch alternative for truecasing, kept for reference:
#    parallelsentences = JcmlReader(input_file).get_parallelsentences()
#    parallelsentences = truecaser.add_features_batch(parallelsentences)
#    Parallelsentence2Jcml(parallelsentences).write_to_file(output_file)


@active_if(cfg.exists_lm(source_language))
@transform(truecase_source, suffix(".tc.%s.jcml" % source_language), ".lm.%s.f.jcml" % source_language, source_language, cfg.get_lm_name(source_language))
def features_lm_source(input_file, output_file, language, lm_name):
    """
    Add language-model features for the source language; delegates to features_lm.

    @param input_file: "*.tc.<source_language>.jcml" file from truecase_source
    @param output_file: "*.lm.<source_language>.f.jcml" file to write
    @param language: the source language, supplied by the decorator
    @param lm_name: value of cfg.get_lm_name for that language
    """
    #saxjcml.run_features_generator(input_file, output_file, [srilm_ngram])
    features_lm(input_file, output_file, language, lm_name)


# Only gather this task's output when a source-language LM is configured.
if cfg.exists_lm(source_language):
    parallel_feature_functions.append(features_lm_source)
@active_if(cfg.exists_lm(target_language))
@transform(truecase_target, suffix(".tc.%s.jcml" % target_language), ".lm.%s.f.jcml" % target_language, target_language, cfg.get_lm_name(target_language))
def features_lm_target(input_file, output_file, language, lm_name):
    """
    Add language-model features for the target language; delegates to features_lm.

    @param input_file: "*.tc.<target_language>.jcml" file from truecase_target
    @param output_file: "*.lm.<target_language>.f.jcml" file to write
    @param language: the target language, supplied by the decorator
    @param lm_name: value of cfg.get_lm_name for that language
    """
    features_lm(input_file, output_file, language, lm_name)


# Only gather this task's output when a target-language LM is configured.
if cfg.exists_lm(target_language):
    parallel_feature_functions.append(features_lm_target)
def features_lm(input_file, output_file, language, lm_name):
    """
    Add language-model features to a file; currently always uses the batch variant.

    @param input_file: jcml file to read
    @param output_file: jcml file to write
    @param language: language whose LM is used
    @param lm_name: passed through unchanged to features_lm_batch
    """
    features_lm_batch(input_file, output_file, language, lm_name)
def features_lm_batch(input_file, output_file, language, lm_name):
    """
    Read a whole jcml file, add LM features in one batch and write the result.

    @param input_file: jcml file to read
    @param output_file: jcml file to write
    @param language: language whose LM generator is fetched from cfg
    @param lm_name: not used in the body
    """
    lm_generator = cfg.get_lm(language)
    sentences = JcmlReader(input_file).get_parallelsentences()
    annotated_sentences = lm_generator.add_features_batch(sentences)
    writer = Parallelsentence2Jcml(annotated_sentences)
    writer.write_to_file(output_file)
#unimplemented
def features_lm_single(input_file, output_file, language, lm_url, lm_tokenize, lm_lowercase):
    """Placeholder for a non-batch LM annotation; does nothing and returns None."""
    return None
#language_checker_source = cfg.get_checker(source_language)

@transform(preprocess_data, suffix(".tok.jcml"), ".l.f.jcml")
def features_length(input_file, output_file):
    """
    Run a LengthFeatureGenerator over a tokenized dataset.

    @param input_file: "*.tok.jcml" file produced by preprocess_data
    @param output_file: "*.l.f.jcml" file to write
    """
    generators = [LengthFeatureGenerator()]
    saxjcml.run_features_generator(input_file, output_file, generators)


# This task is always registered; it has no configuration switch.
parallel_feature_functions.append(features_length)


# Disabled task, kept for reference:
#@active_if(False)
#def features_ibm(input_file, output_file, ibm1lexicon):
#    ibmfeaturegenerator = Ibm1FeatureGenerator(ibm1lexicon)
#    saxjcml.run_features_generator(input_file, output_file, [ibmfeaturegenerator])

"""
Quest
"""
@transform(truecase_source, suffix(".tc.%s.jcml" % source_language), ".tc.%s-%s.jcml" % (source_language, target_language), target_language, cfg.get_truecaser_model(target_language))
def truecase_target_append(input_file, output_file, language, model):
    """
    Apply the target-language truecaser to the output of truecase_source.

    @param input_file: "*.tc.<source_language>.jcml" file from truecase_source
    @param output_file: "*.tc.<source_language>-<target_language>.jcml" file to write
    @param language: the target language, supplied by the decorator
    @param model: truecaser model obtained from cfg for the target language
    """
    truecase(input_file, output_file, language, model)
@active_if(cfg.has_section('quest'))
@transform(truecase_target_append, suffix(".tc.%s-%s.jcml" % (source_language, target_language)), ".quest.f.jcml", source_language, target_language, cfg.get('quest', 'commandline'))
def features_quest(input_file, output_file, source_language, target_language, commandline):
    """
    Run the external quest command on a dataset truecased for both languages.

    The command is executed from the directory configured as quest/path. It
    writes to a temporary "<output>.tmp" file that is moved to output_file
    only after the command has succeeded.

    @param input_file: "*.tc.<source>-<target>.jcml" file from truecase_target_append
    @param output_file: "*.quest.f.jcml" file to write
    @param source_language: value for the {sourcelang} placeholder
    @param target_language: value for the {targetlang} placeholder
    @param commandline: command template with the placeholders {sourcelang},
        {targetlang}, {inputfile} and {outputfile}; it is split on whitespace
    @raise subprocess.CalledProcessError: if the command exits with a non-zero status
    """
    # NOTE(review): cfg.get('quest', 'commandline') in the decorator is
    # evaluated at import time even when there is no 'quest' section -- confirm
    # that cfg.get tolerates a missing section.
    import subprocess

    # Absolute paths, because the command runs from another directory.
    input_file = os.path.abspath(input_file)
    output_file = os.path.abspath(output_file)
    output_file_tmp = "{}.tmp".format(output_file)
    command = commandline.format(sourcelang=source_language,
                                 targetlang=target_language,
                                 inputfile=input_file,
                                 outputfile=output_file_tmp).split()

    previous_path = os.getcwd()
    os.chdir(cfg.get('quest', 'path'))
    try:
        subprocess.check_call(command)
    finally:
        # Restore the working directory even when the command fails, so that
        # later tasks in this process still resolve their relative file names.
        os.chdir(previous_path)
    shutil.move(output_file_tmp, output_file)


# Only gather this task's output when a 'quest' section is configured.
if cfg.has_section('quest'):
    parallel_feature_functions.append(features_quest)
@active_if(cfg.getboolean("annotation", "reference_features"))
@transform(data_fetch, suffix(".orig.jcml"), ".ref.f.jcml", cfg.get("annotation", "moreisbetter").split(","), cfg.get("annotation", "lessisbetter").split(","), cfg.get_classpath()[0], cfg.get_classpath()[1])
def reference_features(input_file, output_file, moreisbetter_atts, lessisbetter_atts, classpath, dir_path):
    """
    Add reference-based features (Levenshtein, BLEU and optionally METEOR).

    The MeteorGenerator is now added to the analyzer list before the
    generators are run. It used to be appended after the run, so it was never
    applied.

    @param input_file: "*.orig.jcml" file produced by data_fetch
    @param output_file: "*.ref.f.jcml" file to write
    @param moreisbetter_atts: supplied by the decorator; not used in the body
    @param lessisbetter_atts: supplied by the decorator; not used in the body
    @param classpath: first element of cfg.get_classpath(), used for METEOR
    @param dir_path: second element of cfg.get_classpath(), used for METEOR
    """
    analyzers = [LevenshteinGenerator(),
                 BleuGenerator()]
    if cfg.has_section("meteor"):
        analyzers.append(MeteorGenerator(target_language, classpath, dir_path))
    saxjcml.run_features_generator(input_file, output_file, analyzers)


# Only gather this task's output when reference features are switched on.
if cfg.getboolean("annotation", "reference_features"):
    parallel_feature_functions.append(reference_features)
# Disabled analyzers, kept for reference:
#    analyzers.append(RatioGenerator())

#    for attribute in moreisbetter_atts:
#        analyzers.append(AttributeRankGenerator(attribute, None, True))
#    for attribute in lessisbetter_atts:
#        analyzers.append(AttributeRankGenerator(attribute))
#

#active_parallel_feature_functions = [function for function in parallel_feature_functions if function.is_active]

#first part of the regular expression is the basename of the dataset
@collate(parallel_feature_functions, regex(r"([^.]+)\.(.+)\.f.jcml"), r"\1.all.f.jcml")
def features_gather(singledataset_annotations, gathered_singledataset_annotations):
    """
    Merge all feature files of one dataset into a single "<dataset>.all.f.jcml".

    The first file is the base; every other file is merged into it with
    merge_dataset_symmetrical on the "id" attribute.

    @param singledataset_annotations: "*.f.jcml" files that share a dataset name
    @param gathered_singledataset_annotations: name of the merged file to write
    """
    print("%s %s" % ("gathering features from tasks ", parallel_feature_functions))

    base_file = singledataset_annotations[0]
    merged_dataset = JcmlReader(base_file).get_dataset()
    for extra_file in singledataset_annotations[1:]:
        extra_dataset = JcmlReader(extra_file).get_dataset()
        merged_dataset.merge_dataset_symmetrical(extra_dataset, {}, "id")

    merged_sentences = merged_dataset.get_parallelsentences()
    Parallelsentence2Jcml(merged_sentences).write_to_file(gathered_singledataset_annotations)
@transform(features_gather, suffix(".all.f.jcml"), ".all.analyzed.f.jcml", cfg.get("general", "source_language"), cfg.get("general", "target_language"))
def analyze_external_features(input_file, output_file, source_language, target_language):
    """
    Run the analyzers that work on the gathered features of a dataset.

    Applies ParserMatches (for the language pair), CfgRulesExtractor and
    RatioGenerator, in that order. This is the final target of the pipeline.

    @param input_file: "*.all.f.jcml" file produced by features_gather
    @param output_file: "*.all.analyzed.f.jcml" file to write
    @param source_language: source language, read from cfg by the decorator
    @param target_language: target language, read from cfg by the decorator
    """
    langpair = (source_language, target_language)
    analyzers = [ParserMatches(langpair)]
    analyzers.append(CfgRulesExtractor())
    analyzers.append(RatioGenerator())
    saxjcml.run_features_generator(input_file, output_file, analyzers)
def create_ranks():
    """Placeholder; does nothing and returns None."""
    return None
if __name__ == '__main__':
    # Draw the task graph, then run the pipeline up to its final task.
    try:
        pipeline_printout_graph("flowchart.pdf", "pdf", [analyze_external_features])

        pipeline_run([analyze_external_features], multiprocess = cores, verbose = 5)
        #pipeline_run([original_data_split], multiprocess = 2)

        print("Done!")
    finally:
        # The Java gateway was started at import time by cfg.java_init();
        # shut it down even when the pipeline raises.
        cfg.java_terminate()