Package app :: Package autoranking :: Module annotate_batch

[frames | no frames]

Module annotate_batch

source code

Created on 17 Jan 2012. Modified on 22 Mar 2012 for the autoranking app.

Author: Eleftherios Avramidis

Functions

[hide private]

get_basename(filename)

source code

data_fetch(external_file, output_file)
Fetch the training file and place it in the working directory. Files are expected to contain the set name, followed by the ending .jcml.

source code

preprocess_data(input_file, output_file)

source code

features_checker_source(input_file, output_file, source_language)

source code

features_checker_target(input_file, output_file, target_language)

source code

features_langtool_source(input_file, output_file, language)

source code

features_langtool_target(input_file, output_file, language)

source code

features_langtool(input_file, output_file, language)

source code

original_data_split(input_files, output_files, parts)
Split the datasets into parts, in order to perform heavy tasks.

source code

features_berkeley_source(input_file, output_file, source_language, parser_name)

source code

features_berkeley_target(input_file, output_file, target_language, parser_name)

source code

features_berkeley(input_file, output_file, language)
Parsing

source code

merge_parse_parts_source(inputs, output)

source code

merge_parse_parts_target(inputs, output)

source code

merge_parts(inputs, output)

source code

truecase_source(input_file, output_file, language, model)

source code

truecase_target(input_file, output_file, language, model)

source code

truecase(input_file, output_file, language, model)

source code

cross_bleu(input_file, output_file)

source code

cross_meteor(input_file, output_file, target_language, classpath, dir_path)

source code

features_lm_source(input_file, output_file, language, lm_name)

source code

features_lm_target(input_file, output_file, language, lm_name)

source code

features_lm(input_file, output_file, language, lm_name)

source code

features_lm_batch(input_file, output_file, language, lm_name)

source code

features_lm_single(input_file, output_file, language, lm_url, lm_tokenize, lm_lowercase)

source code

features_length(input_file, output_file)

source code

truecase_target_append(input_file, output_file, language, model)

source code

features_quest(input_file, output_file, source_language, target_language, commandline)

source code

reference_features(input_file, output_file, moreisbetter_atts, lessisbetter_atts, classpath, dir_path)

source code

features_gather(singledataset_annotations, gathered_singledataset_annotations)

source code

analyze_external_features(input_file, output_file, source_language, target_language)

source code

create_ranks()

source code

Variables

[hide private]

cfg = bootstrap.get_cfg()

gateway = cfg.java_init()

cores = int(cfg.get("general", "cores"))

parallel_feature_functions = []

path = cfg.get_path()

source_language = cfg.get("general", "source_language")

target_language = cfg.get("general", "target_language")

training_sets = cfg.get("training", "filenames").split(",")

testing_set = cfg.get("testing", "filename")

all_sets = cfg.get("training", "filenames").split(",")

params = []

annotated_filenames = []

Function Details

[hide private]

data_fetch(external_file, output_file)

source code

Fetch the training file and place it in the working directory. Files are expected to contain the set name, followed by the ending .jcml.

Decorators:

@files(params)

preprocess_data(input_file, output_file)

source code

Decorators:

@transform(data_fetch, suffix("orig.jcml"), "tok.jcml")

features_checker_source(input_file, output_file, source_language)

source code

Decorators:

@jobs_limit(1, "checker")
@active_if(cfg.exists_checker(source_language))
@transform(data_fetch, suffix(".orig.jcml"), ".iq.%s.f.jcml" % source_language, source_language)

features_checker_target(input_file, output_file, target_language)

source code

Decorators:

@jobs_limit(1, "checker")
@active_if(cfg.exists_checker(target_language))
@transform(data_fetch, suffix(".orig.jcml"), ".iq.%s.f.jcml" % target_language, target_language)

features_langtool_source(input_file, output_file, language)

source code

Decorators:

@jobs_limit(1, "ltool")
@active_if(cfg.has_section("languagetool"))
@transform(data_fetch, suffix(".orig.jcml"), ".lt.%s.f.jcml" % source_language, source_language)

features_langtool_target(input_file, output_file, language)

source code

Decorators:

@jobs_limit(1, "ltool")
@active_if(cfg.has_section("languagetool"))
@transform(data_fetch, suffix(".orig.jcml"), ".lt.%s.f.jcml" % target_language, target_language)

original_data_split(input_files, output_files, parts)

source code

Split the datasets into parts, in order to perform heavy tasks.

Decorators:

@split(preprocess_data, "*.part.jcml", cores)

features_berkeley_source(input_file, output_file, source_language, parser_name)

source code

Decorators:

@active_if(cfg.exists_parser(source_language))
@transform(original_data_split, suffix("part.jcml"), "part.parsed.%s.f.jcml" % source_language, source_language, cfg.get_parser_name(source_language))

features_berkeley_target(input_file, output_file, target_language, parser_name)

source code

Decorators:

@active_if(cfg.exists_parser(target_language))
@transform(original_data_split, suffix("part.jcml"), "part.parsed.%s.f.jcml" % target_language, target_language, cfg.get_parser_name(target_language))

merge_parse_parts_source(inputs, output)

source code

Decorators:

@active_if(cfg.exists_parser(source_language))
@collate(features_berkeley_source, regex(r"([^.]+)\.\s?(\d+)\.part.parsed.([^.]+).f.jcml"), r"\1.parsed.\3.f.jcml")

merge_parse_parts_target(inputs, output)

source code

Decorators:

@active_if(cfg.exists_parser(target_language))
@collate(features_berkeley_target, regex(r"([^.]+)\.\s?(\d+)\.part.parsed.([^.]+).f.jcml"), r"\1.parsed.\3.f.jcml")

truecase_source(input_file, output_file, language, model)

source code

Decorators:

@transform(preprocess_data, suffix(".tok.jcml"), ".tc.%s.jcml" % source_language, source_language, cfg.get_truecaser_model(source_language))

truecase_target(input_file, output_file, language, model)

source code

Decorators:

@transform(preprocess_data, suffix(".tok.jcml"), ".tc.%s.jcml" % target_language, target_language, cfg.get_truecaser_model(target_language))

cross_bleu(input_file, output_file)

source code

Decorators:

@transform(truecase_target, suffix(".tc.%s.jcml" % target_language), ".bleu.%s.f.jcml" % target_language)

cross_meteor(input_file, output_file, target_language, classpath, dir_path)

source code

Decorators:

@active_if(cfg.has_section("meteor"))
@transform(truecase_target, suffix(".tc.%s.jcml" % target_language), ".meteor.%s.f.jcml" % target_language, target_language, cfg.get_classpath()[0], cfg.get_classpath()[1])

features_lm_source(input_file, output_file, language, lm_name)

source code

Decorators:

@active_if(cfg.exists_lm(source_language))
@transform(truecase_source, suffix(".tc.%s.jcml" % source_language), ".lm.%s.f.jcml" % source_language, source_language, cfg.get_lm_name(source_language))

features_lm_target(input_file, output_file, language, lm_name)

source code

Decorators:

@active_if(cfg.exists_lm(target_language))
@transform(truecase_target, suffix(".tc.%s.jcml" % target_language), ".lm.%s.f.jcml" % target_language, target_language, cfg.get_lm_name(target_language))

features_length(input_file, output_file)

source code

Decorators:

@transform(preprocess_data, suffix(".tok.jcml"), ".l.f.jcml")

truecase_target_append(input_file, output_file, language, model)

source code

Decorators:

@transform(truecase_source, suffix(".tc.%s.jcml" % source_language), ".tc.%s-%s.jcml" %(source_language, target_language), target_language, cfg.get_truecaser_model(target_language))

features_quest(input_file, output_file, source_language, target_language, commandline)

source code

Decorators:

@active_if(cfg.has_section('quest'))
@transform(truecase_target_append, suffix(".tc.%s-%s.jcml" %(source_language, target_language)), ".quest.f.jcml", source_language, target_language, cfg.get('quest', 'commandline'))

reference_features(input_file, output_file, moreisbetter_atts, lessisbetter_atts, classpath, dir_path)

source code

Decorators:

@active_if(cfg.getboolean("annotation", "reference_features"))
@transform(data_fetch, suffix(".orig.jcml"), ".ref.f.jcml", cfg.get("annotation", "moreisbetter").split(","), cfg.get("annotation", "lessisbetter").split(","), cfg.get_classpath()[0], cfg.get_classpath()[1])

features_gather(singledataset_annotations, gathered_singledataset_annotations)

source code

Decorators:

@collate(parallel_feature_functions, regex(r"([^.]+)\.(.+)\.f.jcml"), r"\1.all.f.jcml")

analyze_external_features(input_file, output_file, source_language, target_language)

source code

Decorators:

@transform(features_gather, suffix(".all.f.jcml"), ".all.analyzed.f.jcml", cfg.get("general", "source_language"), cfg.get("general", "target_language"))