Package app :: Package autoranking :: Module annotate_batch
[hide private]
[frames | no frames]

Module annotate_batch

source code

Created on 17 Jan 2012. Modified on 22 Mar 2012 for the autoranking app.


Author: Eleftherios Avramidis

Functions [hide private]
 
get_basename(filename) source code
 
data_fetch(external_file, output_file)
Fetch the training file and place it in the working directory. Files are expected to contain the set name, followed by the ending .jcml.
source code
 
preprocess_data(input_file, output_file) source code
 
features_checker_source(input_file, output_file, source_language) source code
 
features_checker_target(input_file, output_file, target_language) source code
 
features_langtool_source(input_file, output_file, language) source code
 
features_langtool_target(input_file, output_file, language) source code
 
features_langtool(input_file, output_file, language) source code
 
original_data_split(input_files, output_files, parts)
Split the datasets into parts in order to perform heavy tasks.
source code
 
features_berkeley_source(input_file, output_file, source_language, parser_name) source code
 
features_berkeley_target(input_file, output_file, target_language, parser_name) source code
 
features_berkeley(input_file, output_file, language)
Parsing
source code
 
merge_parse_parts_source(inputs, output) source code
 
merge_parse_parts_target(inputs, output) source code
 
merge_parts(inputs, output) source code
 
truecase_source(input_file, output_file, language, model) source code
 
truecase_target(input_file, output_file, language, model) source code
 
truecase(input_file, output_file, language, model) source code
 
cross_bleu(input_file, output_file) source code
 
cross_meteor(input_file, output_file, target_language, classpath, dir_path) source code
 
features_lm_source(input_file, output_file, language, lm_name) source code
 
features_lm_target(input_file, output_file, language, lm_name) source code
 
features_lm(input_file, output_file, language, lm_name) source code
 
features_lm_batch(input_file, output_file, language, lm_name) source code
 
features_lm_single(input_file, output_file, language, lm_url, lm_tokenize, lm_lowercase) source code
 
features_length(input_file, output_file) source code
 
truecase_target_append(input_file, output_file, language, model) source code
 
features_quest(input_file, output_file, source_language, target_language, commandline) source code
 
reference_features(input_file, output_file, moreisbetter_atts, lessisbetter_atts, classpath, dir_path) source code
 
features_gather(singledataset_annotations, gathered_singledataset_annotations) source code
 
analyze_external_features(input_file, output_file, source_language, target_language) source code
 
create_ranks() source code
Variables [hide private]
  cfg = bootstrap.get_cfg()
  gateway = cfg.java_init()
  cores = int(cfg.get("general", "cores"))
  parallel_feature_functions = []
  path = cfg.get_path()
  source_language = cfg.get("general", "source_language")
  target_language = cfg.get("general", "target_language")
  training_sets = cfg.get("training", "filenames").split(",")
  testing_set = cfg.get("testing", "filename")
  all_sets = cfg.get("training", "filenames").split(",")
  params = []
  annotated_filenames = []
Function Details [hide private]

data_fetch(external_file, output_file)

source code 

Fetch the training file and place it in the working directory. Files are expected to contain the set name, followed by the ending .jcml.

Decorators:
  • @files(params)

preprocess_data(input_file, output_file)

source code 
Decorators:
  • @transform(data_fetch, suffix("orig.jcml"), "tok.jcml")

features_checker_source(input_file, output_file, source_language)

source code 
Decorators:
  • @jobs_limit(1, "checker")
  • @active_if(cfg.exists_checker(source_language))
  • @transform(data_fetch, suffix(".orig.jcml"), ".iq.%s.f.jcml" % source_language, source_language)

features_checker_target(input_file, output_file, target_language)

source code 
Decorators:
  • @jobs_limit(1, "checker")
  • @active_if(cfg.exists_checker(target_language))
  • @transform(data_fetch, suffix(".orig.jcml"), ".iq.%s.f.jcml" % target_language, target_language)

features_langtool_source(input_file, output_file, language)

source code 
Decorators:
  • @jobs_limit(1, "ltool")
  • @active_if(cfg.has_section("languagetool"))
  • @transform(data_fetch, suffix(".orig.jcml"), ".lt.%s.f.jcml" % source_language, source_language)

features_langtool_target(input_file, output_file, language)

source code 
Decorators:
  • @jobs_limit(1, "ltool")
  • @active_if(cfg.has_section("languagetool"))
  • @transform(data_fetch, suffix(".orig.jcml"), ".lt.%s.f.jcml" % target_language, target_language)

original_data_split(input_files, output_files, parts)

source code 

Split the datasets into parts in order to perform heavy tasks.

Decorators:
  • @split(preprocess_data, "*.part.jcml", cores)

features_berkeley_source(input_file, output_file, source_language, parser_name)

source code 
Decorators:
  • @active_if(cfg.exists_parser(source_language))
  • @transform(original_data_split, suffix("part.jcml"), "part.parsed.%s.f.jcml" % source_language, source_language, cfg.get_parser_name(source_language))

features_berkeley_target(input_file, output_file, target_language, parser_name)

source code 
Decorators:
  • @active_if(cfg.exists_parser(target_language))
  • @transform(original_data_split, suffix("part.jcml"), "part.parsed.%s.f.jcml" % target_language, target_language, cfg.get_parser_name(target_language))

merge_parse_parts_source(inputs, output)

source code 
Decorators:
  • @active_if(cfg.exists_parser(source_language))
  • @collate(features_berkeley_source, regex(r"([^.]+)\.\s?(\d+)\.part.parsed.([^.]+).f.jcml"), r"\1.parsed.\3.f.jcml")

merge_parse_parts_target(inputs, output)

source code 
Decorators:
  • @active_if(cfg.exists_parser(target_language))
  • @collate(features_berkeley_target, regex(r"([^.]+)\.\s?(\d+)\.part.parsed.([^.]+).f.jcml"), r"\1.parsed.\3.f.jcml")

truecase_source(input_file, output_file, language, model)

source code 
Decorators:
  • @transform(preprocess_data, suffix(".tok.jcml"), ".tc.%s.jcml" % source_language, source_language, cfg.get_truecaser_model(source_language))

truecase_target(input_file, output_file, language, model)

source code 
Decorators:
  • @transform(preprocess_data, suffix(".tok.jcml"), ".tc.%s.jcml" % target_language, target_language, cfg.get_truecaser_model(target_language))

cross_bleu(input_file, output_file)

source code 
Decorators:
  • @transform(truecase_target, suffix(".tc.%s.jcml" % target_language), ".bleu.%s.f.jcml" % target_language)

cross_meteor(input_file, output_file, target_language, classpath, dir_path)

source code 
Decorators:
  • @active_if(cfg.has_section("meteor"))
  • @transform(truecase_target, suffix(".tc.%s.jcml" % target_language), ".meteor.%s.f.jcml" % target_language, target_language, cfg.get_classpath() [0], cfg.get_classpath() [1])

features_lm_source(input_file, output_file, language, lm_name)

source code 
Decorators:
  • @active_if(cfg.exists_lm(source_language))
  • @transform(truecase_source, suffix(".tc.%s.jcml" % source_language), ".lm.%s.f.jcml" % source_language, source_language, cfg.get_lm_name(source_language))

features_lm_target(input_file, output_file, language, lm_name)

source code 
Decorators:
  • @active_if(cfg.exists_lm(target_language))
  • @transform(truecase_target, suffix(".tc.%s.jcml" % target_language), ".lm.%s.f.jcml" % target_language, target_language, cfg.get_lm_name(target_language))

features_length(input_file, output_file)

source code 
Decorators:
  • @transform(preprocess_data, suffix(".tok.jcml"), ".l.f.jcml")

truecase_target_append(input_file, output_file, language, model)

source code 
Decorators:
  • @transform(truecase_source, suffix(".tc.%s.jcml" % source_language), ".tc.%s-%s.jcml" %(source_language, target_language), target_language, cfg.get_truecaser_model(target_language))

features_quest(input_file, output_file, source_language, target_language, commandline)

source code 
Decorators:
  • @active_if(cfg.has_section('quest'))
  • @transform(truecase_target_append, suffix(".tc.%s-%s.jcml" %(source_language, target_language)), ".quest.f.jcml", source_language, target_language, cfg.get('quest', 'commandline'))

reference_features(input_file, output_file, moreisbetter_atts, lessisbetter_atts, classpath, dir_path)

source code 
Decorators:
  • @active_if(cfg.getboolean("annotation", "reference_features"))
  • @transform(data_fetch, suffix(".orig.jcml"), ".ref.f.jcml", cfg.get("annotation", "moreisbetter").split(","), cfg.get("annotation", "lessisbetter").split(","), cfg.get_classpath() [0], cfg.get_classpath() [1])

features_gather(singledataset_annotations, gathered_singledataset_annotations)

source code 
Decorators:
  • @collate(parallel_feature_functions, regex(r"([^.]+)\.(.+)\.f.jcml"), r"\1.all.f.jcml")

analyze_external_features(input_file, output_file, source_language, target_language)

source code 
Decorators:
  • @transform(features_gather, suffix(".all.f.jcml"), ".all.analyzed.f.jcml", cfg.get("general", "source_language"), cfg.get("general", "target_language"))