app.autoranking.bootstrap

30 """ 31 An extension of the ConfigParser that initializes the necessary objects for annotation pipelines 32 according to the settings specified by the configuration files 33 """ 34 checker = 0 35

def java_init(self):
    """
    Start a JVM for the configured classpath and connect a gateway to it.

    The classpath entries are gathered from all configuration sections via
    get_classpath(). When there are none, nothing is started, no attribute
    is set on the instance and None is returned.

    On success the instance gets three attributes: jvm, gatewayclient and
    gateway. A one-line status message (JVM pid and socket number) is
    written to stderr.

    Returns the JavaGateway object, or None when no classpath is configured.
    """
    # get_classpath() also returns the directory of this module; it is not
    # needed here.
    classpath_entries, _ = self.get_classpath()

    if not classpath_entries:
        return None

    self.jvm = JVM(classpath_entries)
    port = self.jvm.socket_no
    self.gatewayclient = GatewayClient('localhost', port)
    self.gateway = JavaGateway(self.gatewayclient, auto_convert=True, auto_field=True)
    sys.stderr.write("Initialized global Java gateway with pid {} in socket {}\n".format(self.jvm.pid, port))
    return self.gateway

50 # wait so that server starts 51 # time.sleep(2) 52

def get_classpath(self):
    """
    Collect the "java_classpath" option of every configuration section.

    Sections that do not define the option are skipped. When at least one
    entry is found, the directory that contains this module is added as an
    additional entry.

    Returns a pair (entries, module_dir). entries is a list of the distinct
    classpath strings (it is built from a set, so its order is not defined)
    and module_dir is the directory of this file. When no section defines
    the option the result is ([], None).
    """
    entries = set()
    for section_name in self.sections():
        try:
            entry = self.get(section_name, "java_classpath")
        except NoOptionError:
            continue
        entries.add(entry)

    if not entries:
        return [], None

    #@todo: change location of the JavaServer to sth more universal
    module_dir = os.path.dirname(os.path.abspath(__file__))
    entries.add(module_dir)
    return list(entries), module_dir

66

def get_gatewayclient(self):
    """
    Return the object stored in self.socket, or None if it was never set.

    NOTE(review): java_init() stores its client as self.gatewayclient, and
    nothing in the visible code assigns self.socket, so this may always
    return None. Confirm which attribute was intended before relying on it.
    """
    try:
        return self.socket
    except AttributeError:
        # Only a missing attribute means "no client"; anything else is a
        # real error and should not be hidden.
        return None

72 73

def java_terminate(self):
    """
    Shut down the JVM held in self.jvm, if there is one.

    This is a best-effort cleanup (it is also invoked from __del__): a
    missing self.jvm or a failing terminate() call is ignored. Only
    Exception subclasses are suppressed, so KeyboardInterrupt and SystemExit
    still propagate.
    """
    try:
        self.jvm.terminate()
    except Exception:
        # Nothing useful can be done if the JVM was never started or is
        # already gone.
        pass

79 80

def getlearner(self):
    """
    Resolve the learner object named by the configuration.

    The "classifier" option of the [training] section is suffixed with
    "Learner" and the resulting expression is evaluated, so the name must be
    resolvable in the scope of this module.

    Returns whatever the evaluated name refers to.
    """
    suffix = "Learner"
    classifier_name = self.get("training", "classifier") + suffix
    # NOTE(review): eval() executes whatever expression the configuration
    # contains; only use this with trusted configuration files.
    return eval(classifier_name)

84

def get_classifier_params(self):
    """
    Load the parameters of the configured classifier into self.classifier_params.

    The [training] option "params_<classifier>" (where <classifier> is the
    value of the "classifier" option) is evaluated as a Python expression
    and the result is stored on the instance. Nothing is returned.
    """
    classifier = self.get("training", "classifier")
    params_option = "params_%s" % classifier
    # NOTE(review): eval() executes whatever expression the configuration
    # contains; only use this with trusted configuration files.
    self.classifier_params = eval(self.get("training", params_option))

87 88 # def get_classifier(self, name = None): 89 # if not name: 90 # name = self.get("training", "classifier") 91 # package = classifier 92 # prefix = package.__name__ + '.' 93 # for importer, modname, ispkg in pkgutil.iter_modules(package.__path__, prefix): 94 # module = __import__(modname, fromlist="dummy") 95 # try: 96 # return getattr(module, name) 97 # except: 98 # pass 99 # return getattr(Orange, name) 100

def exists_parser(self, language):
    """
    Tell whether a parser is configured for the given language.

    Looks at every section whose name starts with "parser:" and compares its
    "language" option with the argument.

    Returns True on the first match, False if there is none.
    """
    return any(
        self.get(section, "language") == language
        for section in self.sections()
        if section.startswith("parser:")
    )

106

def get_parser(self, language):
    """
    Build the parser feature generator configured for the given language.

    Every "parser:" section whose "language" option equals the argument is
    considered in order. Its "tokenize" flag is read first, then its "type":
      - "xmlrpc": a BerkeleyXMLRPCFeatureGenerator for the section's "url"
      - "socket": a BerkeleySocketFeatureGenerator for the section's
        "grammarfile", using self.gateway (a message is written to stderr)
    A section with any other type is skipped and the search continues.

    Returns the feature generator, or False when no usable section exists.
    """
    #this is reading the configuration, maybe move elsewhere
    for section in self.sections():
        if not section.startswith("parser:"):
            continue
        if self.get(section, "language") != language:
            continue

        tokenize = self.getboolean(section, "tokenize")
        parser_type = self.get(section, "type")

        if parser_type == "xmlrpc":
            return BerkeleyXMLRPCFeatureGenerator(self.get(section, "url"), language, tokenize)

        if parser_type == "socket":
            grammarfile = self.get(section, "grammarfile")
            sys.stderr.write("initializing socket parser with grammar file {}\n".format(grammarfile))
            return BerkeleySocketFeatureGenerator(language, grammarfile, self.gateway)

    return False

122 123

def get_parser_name(self, language):
    """
    Return the name of the first "parser:" section for the given language.

    A section matches when its "language" option equals the argument.
    Returns None when no section matches.
    """
    matching_sections = (
        section
        for section in self.sections()
        if section.startswith("parser:") and self.get(section, "language") == language
    )
    return next(matching_sections, None)

129 130

def exists_checker(self, language):
    """
    Tell whether a checker is configured for the given language.

    Looks at every section whose name starts with "checker:" and compares
    its "language" option with the argument.

    Returns True on the first match, False if there is none.
    """
    for section in self.sections():
        if not section.startswith("checker:"):
            continue
        if self.get(section, "language") == language:
            return True
    return False

136 137

def _get_checker_settings(self, checker_name):
    """
    Extract the "setting_*" options of a checker section.

    checker_name is the name of the configuration section to read.

    Returns a dict that maps each option name, with the leading "setting_"
    removed, to the option's value. Options without that prefix are ignored.
    """
    prefix = "setting_"
    return dict(
        (option[len(prefix):], self.get(checker_name, option))
        for option in self.options(checker_name)
        if option.startswith(prefix)
    )

146 147

def get_checker(self, language):
    """
    Build the IQFeatureGenerator configured for the given language.

    Every section whose name starts with "checker:" is inspected in order
    (a progress line is printed to stdout for each). For the first section
    whose "language" option equals the argument, the method sleeps for a
    random 1-15 seconds, collects the section's "setting_*" options and
    constructs the generator from them plus the section's "host",
    "wsdl_path" and "protocol" options.

    Returns the feature generator, or None (after printing a failure line)
    when no checker section matches.
    """
    #@todo: see how to generalize this. also pass parameters read by the pipeline, currently hardcoded
    for checker_name in [section for section in self.sections() if section.startswith("checker:")]:
        print "looking on checker ", checker_name , language
        if self.get(checker_name, "language") == language:
            # NOTE(review): random delay before contacting the service;
            # presumably staggers parallel workers - confirm the intent.
            wtime = random.randint(1, 15)
            time.sleep(wtime)
            #TODO: if KenLM gets wrapped up, add a type: setting

            settings = self._get_checker_settings(checker_name)

            # Earlier ways of choosing the user id, kept for reference:
            #user_id = "{}{}".format(self.get(checker_name, "user_id"), ExperimentConfigParser.checker)
            # user_id = self.get(checker_name, "user_id")
            #user_id = os.path.basename(tempfile.mktemp())
            # The host name of this machine is used as the user id.
            user_id = socket.gethostname()

            # The last argument is a "<user_id>.dat" name derived from the user id.
            feature_generator = IQFeatureGenerator(language,
                                                   settings,
                                                   user_id,
                                                   self.get(checker_name, "host"),
                                                   self.get(checker_name, "wsdl_path"),
                                                   self.get(checker_name, "protocol"),
                                                   "%s.dat" % user_id
                                                   )
            print "returning feature generator with user_id", user_id
            return feature_generator
    print "Failure with checker for", language
    return None

176 177

def get_source_language(self):
    """Return the "source_language" option of the [general] section."""
    section, option = "general", "source_language"
    return self.get(section, option)

180

def get_target_language(self):
    """Return the "target_language" option of the [general] section."""
    section, option = "general", "target_language"
    return self.get(section, option)

183 184 185

def exists_lm(self, language):
    """
    Tell whether a language model is configured for the given language.

    Looks at every section whose name starts with "lm:" and compares its
    "language" option with the argument.

    Returns True on the first match, False if there is none.
    """
    return any(
        self.get(section, "language") == language
        for section in self.sections()
        if section.startswith("lm:")
    )

191 192 193

def get_lm(self, language):
    """
    Build the SRILM n-gram generator configured for the given language.

    The first "lm:" section whose "language" option equals the argument
    supplies the "url" option and the boolean "tokenize" and "lowercase"
    options for the generator.

    Returns the SRILMngramGenerator, or None when no section matches.
    """
    #TODO: probably establish sth like ExternalProcessor object and wrap all these params there
    for section in self.sections():
        if not section.startswith("lm:"):
            continue
        if self.get(section, "language") != language:
            continue
        #TODO: if KenLM gets wrapped up, add a type: setting
        url = self.get(section, "url")
        tokenize = self.getboolean(section, "tokenize")
        lowercase = self.getboolean(section, "lowercase")
        return SRILMngramGenerator(url, language, lowercase, tokenize)
    return None

205 206

def get_lm_name(self, language):
    """
    Return the name of the first "lm:" section for the given language.

    A section matches when its "language" option equals the argument.
    Returns an empty string when no section matches.
    """
    matching_sections = (
        section
        for section in self.sections()
        if section.startswith("lm:") and self.get(section, "language") == language
    )
    return next(matching_sections, "")

212

def get_truecaser_model(self, language):
    """
    Return the truecaser model configured for the given language.

    The first "tc:" section whose "language" option equals the argument
    supplies the value of its "model" option.

    Returns that value, or an empty string when no section matches.
    """
    for section in self.sections():
        if not section.startswith("tc:"):
            continue
        if self.get(section, "language") == language:
            return self.get(section, "model")
    return ""

218

def get_path(self):
    """
    Return the working directory stored in self.path.

    The attribute is assigned by prepare_dir(); presumably that method must
    have run first, otherwise the attribute lookup fails.
    """
    working_dir = self.path
    return working_dir

221 222 223 224

def prepare_dir(self, continue_step=None):
    """
    Create (or re-enter) the numbered working directory of this run.

    The pool directory is the "path" option of the [general] section; it is
    created when it cannot be listed. Without continue_step, a new
    sub-directory named after the id from _get_new_step_id() is created.
    With continue_step, the sub-directory of that name is reused and is not
    created here.

    The process then changes its current directory to that sub-directory,
    writes the complete configuration to "app.cfg" inside it, stores the
    directory in self.path and reports the path and process id on stderr.

    continue_step: name or id of an existing step directory to resume, or a
        false value to start a new step.

    Returns the step directory, joined onto the configured pool path as-is
    (it is not made absolute).

    Raises OSError when the pool or step directory cannot be created or
    entered.
    """
    pool_path = self.get("general", "path")

    # First check whether the path of the "pool" exists, or create it. Only
    # a filesystem error means "missing"; anything else should surface.
    try:
        existing_files = os.listdir(pool_path)
    except OSError:
        os.makedirs(pool_path)
        existing_files = []

    if continue_step:
        path = os.path.join(pool_path, str(continue_step))
    else:
        path = os.path.join(pool_path, str(self._get_new_step_id(existing_files)))
        os.mkdir(path)

    os.chdir(path)
    # Copy all configuration settings to the new directory; the with-block
    # closes the file even when writing fails.
    with open("app.cfg", 'w') as new_configfile:
        self.write(new_configfile)
    self.path = path
    sys.stderr.write("Working in path {}\n".format(path))
    sys.stderr.write("System process pid: {}\n".format(os.getpid()))
    return path

253

def _get_new_step_id(self, existing_files):
    """
    Choose the id for a new step directory.

    existing_files: names found in the pool directory. Only names that
        parse as an integer are taken into account; all others are ignored.

    Returns the highest integer found plus one, or 1 when there is none.
    The chosen id is also reported on stderr.
    """
    # Subdirectories should only have as name the integer id of the app.
    step_ids = []
    for filename in existing_files:  #@todo add check if is directory or do better listing
        try:
            step_ids.append(int(filename))
        except ValueError:
            # Not a step directory (e.g. a stray file); skip it.
            continue

    # Add one to the highest existing id to get the id of this app.
    current_step_id = max(step_ids) + 1 if step_ids else 1
    sys.stderr.write("Running app as step {0}\n".format(current_step_id))
    return current_step_id

270

def __del__(self):
    """Finalizer: ask java_terminate() to stop the JVM when the object is destroyed."""
    shutdown = self.java_terminate
    shutdown()

287 288 # global configuration 289 cfg = ExperimentConfigParser() 290 cfg.readfp(StringIO.StringIO(CONFIG_TEMPLATE)) # set up defaults 291 #cfg.read(CONFIG_FILENAME) # add user-specified settings 292 #cfg.read(configfilename) # add user-specified settings 293 294 parser = argparse.ArgumentParser(description='') 295 parser.add_argument('--config', nargs='*', default=['cfg/pipeline.cfg'], help="Configuration files") 296 parser.add_argument('--sourcelang', '-s', help="Source language code") 297 parser.add_argument('--targetlang', '-t', help="Target language code") 298 parser.add_argument('--selectpath', help="""If source and target language are set, 299 then use all files in the indicated directory 300 that have these language codes in their filename""") 301 parser.add_argument('--cont', help="""If you want to resume an existing app, 302 specify its folder name heres. This must be 303 an existing dir name""") 304 parser.add_argument('--cores', help='How many cores should be parallelized') 305 306 args = parser.parse_args() 307 308 for config_filename in args.config: 309 cfg.read(config_filename) 310 311 continue_experiment = args.cont 312 if args.sourcelang and args.targetlang and args.selectpath: 313 #source-target lang code separated with hyphen 314 filepattern = "*{}-{}*".format(args.sourcelang, args.targetlang) 315 available_files = os.listdir(args.selectpath) 316 print available_files 317 chosen_files = fnmatch.filter(available_files, filepattern) 318 print chosen_files 319 #prepend path 320 chosen_files = [os.path.join(args.selectpath, f) for f in chosen_files] 321 cfg.set("general", "source_language", args.sourcelang) 322 cfg.set("general", "target_language", args.targetlang) 323 cfg.set("training", "filenames", ",".join(chosen_files)) 324 325 if args.cores: 326 cfg.set("general", "cores", args.cores) 327 328 path = cfg.prepare_dir(continue_experiment) 329 330 #os.chdir(path) 331 return cfg

Source Code for Module app.autoranking.bootstrap