Package app :: Package autoranking :: Module application
[hide private]
[frames | no frames]

Source Code for Module app.autoranking.application

  1  # -*- coding: utf-8 -*- 
  2   
  3  ''' 
  4  This script provides 
  5   (a) the class that wraps the functionality of the ranking mechanism 
  6   (b) a command-line interactive interface for testing installation 
  7   
  8  Created on 2 Aug 2013 
  9   
 10  @author: Eleftherios Avramidis 
 11  ''' 
 12   
 13  import time 
 14  import sys 
 15   
 16  from featuregenerator.parser.berkeley.berkeleyclient import BerkeleySocketFeatureGenerator 
 17  from sentence.sentence import SimpleSentence 
 18   
 19  from ml.lib.orange import OrangeRuntimeRanker  
 20  from sentence.parallelsentence import ParallelSentence 
 21   
 22  from bootstrap import ExperimentConfigParser 
 23  from featuregenerator.parser.berkeley.parsermatches import ParserMatches 
 24  from featuregenerator.lengthfeaturegenerator import LengthFeatureGenerator 
 25  from featuregenerator.meteor.meteor import CrossMeteorGenerator 
 26  from featuregenerator.preprocessor import Normalizer 
 27  from featuregenerator.preprocessor import Tokenizer 
 28  from featuregenerator.preprocessor import Truecaser 
 29   
 30  from py4j.java_gateway import GatewayClient, JavaGateway  
 31   
 32   
 33   
class Autoranking:
    """
    A class that demonstrates the use of a simple ranking pipeline. It provides
    the method 'rank' that receives source and translation strings and
    returns the ranks of the translations
    @ivar gateway: Java gateway object obtained from the configuration
    @ivar featuregenerators: List of initialized feature generator objects in the order that will be used
    @type featuregenerators: [featuregenerator.featuregenerator.FeatureGenerator, ...]
    @ivar ranker: Machine Learning class that handles ranking of items
    @type ranker: ml.lib.orange.OrangeRuntimeRanker
    @ivar source_language: Language code for source language
    @type source_language: str
    @ivar target_language: Language code for target language
    @type target_language: str
    """

    def __init__(self, configfilenames, classifiername):
        """
        Initialize the class.
        @param configfilenames: a list of annotation configuration files that contain
         the settings for all feature generators etc.
        @type configfilenames: list(str)
        @param classifiername: the filename of a pickled classifier object
        @type classifiername: str
        """
        cfg = ExperimentConfigParser()
        # all files are read into the same parser object, one after the other
        for config_filename in configfilenames:
            cfg.read(config_filename)

        # the gateway must exist before the feature generators are built
        self.gateway = cfg.java_init()

        self.featuregenerators = self.initialize_featuregenerators(cfg)
        self.ranker = OrangeRuntimeRanker(classifiername)
        self.source_language = cfg.get("general", "source_language")
        self.target_language = cfg.get("general", "target_language")

    @staticmethod
    def _system_index(ranked_item):
        """
        Sort key that recovers the input position of a ranked item.
        The position is stored by 'rank' as a string in the "system" attribute,
        so it has to be compared numerically; a plain string comparison would
        place "10" before "2".
        @param ranked_item: a (rank, sentence) pair as returned by the ranker
        @return: the 1-based input position of the sentence
        @rtype: int
        """
        return int(ranked_item[1].get_attribute("system"))

    def rank(self, source, translations):
        """
        Rank translations according to estimated quality
        @param source: The source sentence whose translations are ranked
        @type source: str
        @param translations: The translations to be ranked
        @type translations: list(str)
        @return: the rank of every translation, in the order the translations
         were given, and the description provided by the ranker extended with
         the final ranking
        """
        sourcesentence = SimpleSentence(source)

        # every translation is tagged with its 1-based input position, so that
        # the original order can be restored after ranking
        translationsentences = [SimpleSentence(t, {"system": "{}".format(i + 1)})
                                for i, t in enumerate(translations)]
        atts = {"langsrc": self.source_language, "langtgt": self.target_language}
        parallelsentence = ParallelSentence(sourcesentence, translationsentences, None, atts)

        #annotate the parallelsentence
        annotated_parallelsentence = self._annotate(parallelsentence)
        print("line annotated")
        ranking, description = self.ranker.rank_sentence(annotated_parallelsentence)

        #put things in the original order given by the user
        #because the ranker scrambles the order
        ranking.sort(key=self._system_index)

        #return only ranks without system ids
        description += "\n Final ranking: {}".format([(r[0], r[1].get_string()) for r in ranking])
        ranking = [r[0] for r in ranking]
        return ranking, description

    def _annotate(self, parallelsentence):
        """
        Pass the given parallel sentence through all feature generators, in the
        order they appear in self.featuregenerators
        @param parallelsentence: the parallel sentence to be annotated
        @return: the object returned by the last feature generator, or the
         given object if there are no feature generators
        """
        #before parallelizing take care of diverse dependencies on preprocessing
        for featuregenerator in self.featuregenerators:
            sys.stderr.write("Running {} \n".format(str(featuregenerator)))
            parallelsentence = featuregenerator.add_features_parallelsentence(parallelsentence)
            # NOTE(review): the pause and the trace message are assumed to
            # happen once per feature generator -- confirm
            time.sleep(1)
            print("got sentence")
        return parallelsentence

    def _get_parser(self, cfg, language):
        """
        Initialize a socket parser for the given language, based on the first
        configuration section starting with "parser:" that declares this language
        @param cfg: the loaded configuration object
        @param language: the language code the parser should support
        @return: the initialized parser feature generator, or None if no
         parser section declares the given language
        """
        for parser_name in cfg.sections():
            if not parser_name.startswith("parser:"):
                continue
            if cfg.get(parser_name, "language") == language:
                grammarfile = cfg.get(parser_name, "grammarfile")
                sys.stderr.write("initializing socket parser with grammar file {}\n".format(grammarfile))
                return BerkeleySocketFeatureGenerator(language, grammarfile, self.gateway)

    def _get_java_gateway(self, cfg):
        """
        Connect a Java gateway to the socket of self.jvm, if the configuration
        provides a Java classpath
        @param cfg: the loaded configuration object
        @return: the gateway stored in self.gateway
        """
        java_classpath, dir_path = cfg.get_classpath()

        if java_classpath:
            # NOTE(review): self.jvm is not assigned anywhere in this class
            # (the assignment below is commented out), so this branch raises
            # AttributeError unless the attribute has been set from outside
            #self.jvm = JVM(java_classpath)
            socket_no = self.jvm.socket_no
            self.gatewayclient = GatewayClient('localhost', socket_no)
            self.gateway = JavaGateway(self.gatewayclient, auto_convert=True, auto_field=True)
            sys.stderr.write("Initialized global Java gateway with pid {} in socket {}\n".format(self.jvm.pid, socket_no))
        return self.gateway

    def initialize_featuregenerators(self, cfg):
        """
        Initialize the featuregenerators that handle superficial analysis of given translations
        @param cfg: the loaded configuration object
        @return: the feature generators, in the order they should be applied
        @rtype: list
        """
        source_language = cfg.get("general", "source_language")
        target_language = cfg.get("general", "target_language")

        src_parser = cfg.get_parser(source_language)
        tgt_parser = cfg.get_parser(target_language)

        langpair = (source_language, target_language)

        featuregenerators = [
            Normalizer(source_language),
            Normalizer(target_language),
            Tokenizer(source_language),
            Tokenizer(target_language),

            src_parser,
            tgt_parser,

            ParserMatches(langpair),

            #truecase only for the language model
            Truecaser(source_language, cfg.get_truecaser_model(source_language)),
            Truecaser(target_language, cfg.get_truecaser_model(target_language)),

            cfg.get_lm(source_language),
            cfg.get_lm(target_language),

            CrossMeteorGenerator(target_language, cfg.get_classpath()[0], cfg.get_classpath()[1]),
            LengthFeatureGenerator(),
        ]

        return featuregenerators
if __name__ == "__main__":
    # Interactive command-line test: reads a source sentence and its
    # translations from the user and prints the estimated ranking.
    try:
        # e.g. "/share/taraxu/selection-mechanism/wmt13/sentenceranking/autoranking_wmt13_newfeatures1_de_en/class_nameranklangpairde-eninclude_references0.0ties0.0trainset_modeannotatedattattset_24classifierLogReg/classifier.clsf"
        classifier_filename = sys.argv[1]
    except IndexError:
        sys.exit("Syntax: python application.py <classifier_filename> <pipeline.config.1> [<pipeline.config.2> ...]")

    # all remaining arguments are configuration files, e.g.
    #[
    #'/home/Eleftherios Avramidis/workspace/qualitative/src/app/autoranking/config/pipeline.cfg',
    #'/home/Eleftherios Avramidis/workspace/qualitative/src/app/autoranking/config/pipeline.wmt13metric.blade6.de.de-en.cfg'
    #]
    configfilenames = sys.argv[2:]

    autoranker = Autoranking(configfilenames, classifier_filename)

    while True:
        source = raw_input("Source sentence (or 'exit') > ")
        if source == "exit":
            sys.exit("Exit requested")

        # collect translations until the user enters an empty line
        translations = []
        while True:
            translation = raw_input("Translation (or empty to continue) > ")
            if translation == "":
                break
            translations.append(translation)

        result, description = autoranker.rank(source, translations)
        print(description)
        # two spaces after "is" reproduce the output of the original
        # two-argument print statement
        print("The right order of the given sentences is  {}".format(result))