Package app :: Package autoranking :: Module bootstrap
[hide private]
[frames] | no frames]

Source Code for Module app.autoranking.bootstrap

  1  """bootstrap.py 
  2      bootstrapping module to initialize necessary objects for annotation 
  3  """ 
  4   
  5   
  6  import StringIO 
  7  from ConfigParser import ConfigParser, NoOptionError 
  8  from featuregenerator.parser.berkeley.berkeleyclient import BerkeleySocketFeatureGenerator, BerkeleyXMLRPCFeatureGenerator 
  9  from featuregenerator.iq.acrolinxclient import IQFeatureGenerator 
 10  from featuregenerator.lm.srilm.srilm_ngram import SRILMngramGenerator  
 11  import os 
 12  import re 
 13  import sys 
 14  import time 
 15  import random 
 16  import argparse 
 17  import fnmatch 
 18  import socket 
 19  from util.jvm import JVM 
 20  from py4j.java_gateway import GatewayClient, JavaGateway 
 21   
 22  # --- config and options--- 
 23  CONFIG_FILENAME = os.path.abspath(os.path.join(os.path.dirname(__name__), 'config/pipeline.cfg')) 
 24  print 'config', CONFIG_FILENAME  
 25  CONFIG_TEMPLATE = """ 
 26  """ 
 27   
 28   
class ExperimentConfigParser(ConfigParser):
    """
    An extension of the ConfigParser that initializes the objects needed by
    the annotation pipelines, according to the settings specified in the
    loaded configuration files.

    Sections are addressed by a type prefix in their name (``parser:``,
    ``checker:``, ``lm:``, ``tc:``) and by their ``language`` option.
    """

    # Class-level counter; nothing in this class modifies it.
    checker = 0

    # ------------------------------------------------------------------
    # private helpers
    # ------------------------------------------------------------------

    def _sections_with_prefix(self, prefix):
        """Return the names of all sections starting with `prefix`, in file order."""
        return [section for section in self.sections() if section.startswith(prefix)]

    def _sections_for_language(self, prefix, language):
        """
        Lazily yield the `prefix` sections whose ``language`` option equals `language`.

        The ``language`` option is read only as far as the caller iterates, so
        a later section lacking that option does not raise once an earlier
        match has been consumed.
        """
        for section in self._sections_with_prefix(prefix):
            if self.get(section, "language") == language:
                yield section

    def _find_section(self, prefix, language):
        """Return the first `prefix` section configured for `language`, or None."""
        return next(self._sections_for_language(prefix, language), None)

    # ------------------------------------------------------------------
    # Java gateway handling
    # ------------------------------------------------------------------

    def java_init(self):
        """
        Start a JVM and open a py4j gateway to it, if any section declares a
        ``java_classpath``.

        On success the objects are stored as ``self.jvm``, ``self.gatewayclient``
        and ``self.gateway``.

        @return: the JavaGateway, or None when no classpath is configured
        """
        # collect java classpath entries from all sections
        java_classpath, _ = self.get_classpath()
        if not java_classpath:
            return None

        self.jvm = JVM(java_classpath)
        socket_no = self.jvm.socket_no
        self.gatewayclient = GatewayClient('localhost', socket_no)
        self.gateway = JavaGateway(self.gatewayclient, auto_convert=True, auto_field=True)
        sys.stderr.write("Initialized global Java gateway with pid {} in socket {}\n".format(self.jvm.pid, socket_no))
        return self.gateway

    def get_classpath(self):
        """
        Collect the distinct ``java_classpath`` values of all sections.

        When at least one section defines a classpath, the directory containing
        this module is appended as an additional entry.

        @return: tuple ``(entries, module_dir)``; ``([], None)`` when no section
                 defines ``java_classpath``. The order of ``entries`` is not defined.
        """
        java_classpath = set()
        for section in self.sections():
            try:
                java_classpath.add(self.get(section, "java_classpath"))
            except NoOptionError:
                # this section simply does not need Java
                pass

        if not java_classpath:
            return [], None

        #@todo: change location of the JavaServer to sth more universal
        dir_path = os.path.dirname(os.path.abspath(__file__))
        java_classpath.add(dir_path)
        return list(java_classpath), dir_path

    def get_gatewayclient(self):
        """
        Return the gateway client, or None when no gateway has been set up.

        The previous implementation read ``self.socket``, an attribute this
        class never assigns, so it always yielded None. That attribute is still
        honoured first in case external code sets it; otherwise the
        ``gatewayclient`` stored by java_init() is returned.
        """
        legacy_client = getattr(self, "socket", None)
        if legacy_client is not None:
            return legacy_client
        return getattr(self, "gatewayclient", None)

    def java_terminate(self):
        """
        Terminate the JVM started by java_init(), if there is one.

        Deliberately best-effort: this also runs from __del__, so a missing JVM
        or a failing shutdown must not raise.
        """
        jvm = getattr(self, "jvm", None)
        if jvm is None:
            return
        try:
            jvm.terminate()
        except Exception:
            pass

    # ------------------------------------------------------------------
    # training settings
    # ------------------------------------------------------------------

    def getlearner(self):
        """
        Resolve the learner named by ``[training] classifier`` plus the suffix
        "Learner".

        @return: whatever object that name evaluates to in this module
        """
        classifier_name = self.get("training", "classifier") + "Learner"
        # NOTE(review): eval() executes text taken from the configuration file
        # and resolves names in this module's namespace; only use trusted
        # configuration files.
        return eval(classifier_name)

    def get_classifier_params(self):
        """
        Evaluate the ``[training] params_<classifier>`` option and store the
        result in ``self.classifier_params``.

        @return: the evaluated parameters (also kept on the instance)
        """
        option_name = "params_%s" % self.get("training", "classifier")
        # NOTE(review): eval() on configuration text, see getlearner().
        self.classifier_params = eval(self.get("training", option_name))
        return self.classifier_params

    # ------------------------------------------------------------------
    # parsers
    # ------------------------------------------------------------------

    def exists_parser(self, language):
        """Return True if a ``parser:`` section is configured for `language`."""
        return self._find_section("parser:", language) is not None

    def get_parser(self, language):
        """
        Build the parser feature generator configured for `language`.

        Matching ``parser:`` sections are tried in order; a section whose
        ``type`` is neither "xmlrpc" nor "socket" is skipped. The "socket" type
        uses ``self.gateway``, so java_init() must have been called first.

        @return: a Berkeley feature generator, or False when none could be built
        """
        #this is reading the configuration, maybe move elsewhere
        for parser_name in self._sections_for_language("parser:", language):
            tokenize = self.getboolean(parser_name, "tokenize")
            parser_type = self.get(parser_name, "type")
            if parser_type == "xmlrpc":
                url = self.get(parser_name, "url")
                return BerkeleyXMLRPCFeatureGenerator(url, language, tokenize)
            if parser_type == "socket":
                grammarfile = self.get(parser_name, "grammarfile")
                sys.stderr.write("initializing socket parser with grammar file {}\n".format(grammarfile))
                return BerkeleySocketFeatureGenerator(language, grammarfile, self.gateway)
        return False

    def get_parser_name(self, language):
        """Return the name of the first ``parser:`` section for `language`, or None."""
        return self._find_section("parser:", language)

    # ------------------------------------------------------------------
    # checkers
    # ------------------------------------------------------------------

    def exists_checker(self, language):
        """Return True if a ``checker:`` section is configured for `language`."""
        return self._find_section("checker:", language) is not None

    def _get_checker_settings(self, checker_name):
        """
        Return the ``setting_*`` options of section `checker_name` as a dict
        keyed by the option name without the ``setting_`` prefix.
        """
        prefix = "setting_"
        return {
            option[len(prefix):]: self.get(checker_name, option)
            for option in self.options(checker_name)
            if option.startswith(prefix)
        }

    def get_checker(self, language):
        """
        Build the IQFeatureGenerator configured for `language`.

        The host name of this machine is used as the user id and as the stem of
        the ``.dat`` file name handed to the generator.

        @return: the feature generator, or None when no ``checker:`` section
                 matches `language`
        """
        #@todo: see how to generalize this. also pass parameters read by the pipeline, currently hardcoded
        for checker_name in self._sections_with_prefix("checker:"):
            print("looking on checker  %s %s" % (checker_name, language))
            if self.get(checker_name, "language") != language:
                continue

            # Random pause of 1-15 seconds before creating the client.
            # NOTE(review): presumably staggers concurrent clients - confirm.
            time.sleep(random.randint(1, 15))
            #TODO: if KenLM gets wrapped up, add a type: setting

            settings = self._get_checker_settings(checker_name)
            user_id = socket.gethostname()

            feature_generator = IQFeatureGenerator(
                language,
                settings,
                user_id,
                self.get(checker_name, "host"),
                self.get(checker_name, "wsdl_path"),
                self.get(checker_name, "protocol"),
                "%s.dat" % user_id
            )
            print("returning feature generator with user_id %s" % (user_id,))
            return feature_generator

        print("Failure with checker for %s" % (language,))
        return None

    # ------------------------------------------------------------------
    # general settings
    # ------------------------------------------------------------------

    def get_source_language(self):
        """Return the ``[general] source_language`` option."""
        return self.get("general", "source_language")

    def get_target_language(self):
        """Return the ``[general] target_language`` option."""
        return self.get("general", "target_language")

    # ------------------------------------------------------------------
    # language models and truecasers
    # ------------------------------------------------------------------

    def exists_lm(self, language):
        """Return True if an ``lm:`` section is configured for `language`."""
        return self._find_section("lm:", language) is not None

    def get_lm(self, language):
        """
        Build the SRILMngramGenerator for the first ``lm:`` section matching
        `language`.

        @return: the generator, or None when no section matches
        """
        #TODO: probably establish sth like ExternalProcessor object and wrap all these params there
        lm_name = self._find_section("lm:", language)
        if lm_name is None:
            return None
        #TODO: if KenLM gets wrapped up, add a type: setting
        lm_url = self.get(lm_name, "url")
        lm_tokenize = self.getboolean(lm_name, "tokenize")
        lm_lowercase = self.getboolean(lm_name, "lowercase")
        return SRILMngramGenerator(lm_url, language, lm_lowercase, lm_tokenize)

    def get_lm_name(self, language):
        """Return the name of the first ``lm:`` section for `language`, or ""."""
        lm_name = self._find_section("lm:", language)
        return "" if lm_name is None else lm_name

    def get_truecaser_model(self, language):
        """Return the ``model`` option of the first ``tc:`` section for `language`, or ""."""
        tc_name = self._find_section("tc:", language)
        if tc_name is None:
            return ""
        return self.get(tc_name, "model")

    # ------------------------------------------------------------------
    # working directory handling
    # ------------------------------------------------------------------

    def get_path(self):
        """Return the working directory set by prepare_dir() (which must have run)."""
        return self.path

    def prepare_dir(self, continue_step = None):
        """
        Select the working directory for this run, change into it and dump the
        current configuration there as ``app.cfg``.

        The pool directory is ``[general] path`` and is created when it cannot
        be listed. Without `continue_step` a new sub-directory named with the
        next free integer id is created; with `continue_step` the sub-directory
        of that name is used and must already exist.

        @param continue_step: name/id of an existing step directory to resume
        @return: the path of the working directory (also stored in self.path)
        """
        path = self.get("general", "path")

        #first check whether the path of the "pool" exists or create it
        try:
            existing_files = os.listdir(path)
        except OSError:
            os.makedirs(path)
            existing_files = []

        if continue_step:
            current_step_id = continue_step
            path = os.path.join(path, str(current_step_id))
        else:
            current_step_id = self._get_new_step_id(existing_files)
            path = os.path.join(path, str(current_step_id))
            os.mkdir(path)

        os.chdir(path)
        #copy all configuration settings to the new directory; the context
        #manager closes the file even if writing fails
        with open("app.cfg", 'w') as new_configfile:
            self.write(new_configfile)

        self.path = path
        sys.stderr.write("Working in path {}\n".format(path))
        sys.stderr.write("System process pid: {}\n".format(os.getpid()))
        return path

    def _get_new_step_id(self, existing_files):
        """
        Return the id for a new step: one more than the highest integer-named
        entry in `existing_files`, or 1 when there is none.
        """
        #subdirectories should only have as name the integer id of the app
        filename_ids = []
        for filename in existing_files:  #@todo add check if is directory or do better listing
            try:
                filename_ids.append(int(filename))
            except (TypeError, ValueError):
                # not an integer name, so not a step directory
                pass

        #add one to get the id of this app
        current_step_id = max(filename_ids) + 1 if filename_ids else 1
        sys.stderr.write("Running app as step {0}\n".format(current_step_id))
        return current_step_id

    def __del__(self):
        """Shut the JVM down (best-effort) when the parser is garbage-collected."""
        self.java_terminate()
273 274 275 #try: 276 # configfilename = os.sys.argv[1] 277 #except IndexError: 278 # configfilename = CONFIG_FILENAME 279
def get_cfg_files(config_filenames):
    """
    Create an ExperimentConfigParser and load the given configuration files
    into it one after the other, in the order given.

    @param config_filenames: iterable of configuration file names
    @return: the populated ExperimentConfigParser
    """
    parser = ExperimentConfigParser()
    for filename in config_filenames:
        parser.read(filename)
    return parser
285
def get_cfg(argv=None):
    """
    Build the global configuration from the command line.

    Loads CONFIG_TEMPLATE as defaults, reads every file given with
    ``--config`` on top of it, applies the command-line overrides and finally
    calls prepare_dir(), which creates or selects the working directory and
    changes into it.

    When ``--sourcelang``, ``--targetlang`` and ``--selectpath`` are all given,
    the files in the selected directory whose name contains
    ``<source>-<target>`` become ``[training] filenames`` and the two language
    codes are stored in ``[general]``.

    @param argv: list of command-line arguments to parse; None (the default)
                 makes argparse read them from sys.argv, as before
    @return: the populated ExperimentConfigParser
    """
    # global configuration
    cfg = ExperimentConfigParser()
    cfg.readfp(StringIO.StringIO(CONFIG_TEMPLATE))  # set up defaults

    parser = argparse.ArgumentParser(description='')
    parser.add_argument('--config', nargs='*', default=['cfg/pipeline.cfg'], help="Configuration files")
    parser.add_argument('--sourcelang', '-s', help="Source language code")
    parser.add_argument('--targetlang', '-t', help="Target language code")
    parser.add_argument('--selectpath', help="""If source and target language are set,
                                                then use all files in the indicated directory
                                                that have these language codes in their filename""")
    parser.add_argument('--cont', help="""If you want to resume an existing app,
                                          specify its folder name heres. This must be
                                          an existing dir name""")
    parser.add_argument('--cores', help='How many cores should be parallelized')

    args = parser.parse_args(argv)

    # add user-specified settings on top of the defaults
    for config_filename in args.config:
        cfg.read(config_filename)

    if args.sourcelang and args.targetlang and args.selectpath:
        #source-target lang code separated with hyphen
        filepattern = "*{}-{}*".format(args.sourcelang, args.targetlang)
        available_files = os.listdir(args.selectpath)
        print(available_files)
        chosen_files = fnmatch.filter(available_files, filepattern)
        print(chosen_files)
        #prepend path
        chosen_files = [os.path.join(args.selectpath, f) for f in chosen_files]
        cfg.set("general", "source_language", args.sourcelang)
        cfg.set("general", "target_language", args.targetlang)
        cfg.set("training", "filenames", ",".join(chosen_files))

    if args.cores:
        cfg.set("general", "cores", args.cores)

    # creates/selects the step directory and changes into it; resumes the
    # step named by --cont when given
    cfg.prepare_dir(args.cont)
    return cfg
332