1 """bootstrap.py
2 bootstrapping module to initialize necessary objects for annotation
3 """
4
5
6 import StringIO
7 from ConfigParser import ConfigParser, NoOptionError
8 from featuregenerator.parser.berkeley.berkeleyclient import BerkeleySocketFeatureGenerator, BerkeleyXMLRPCFeatureGenerator
9 from featuregenerator.iq.acrolinxclient import IQFeatureGenerator
10 from featuregenerator.lm.srilm.srilm_ngram import SRILMngramGenerator
11 import os
12 import re
13 import sys
14 import time
15 import random
16 import argparse
17 import fnmatch
18 import socket
19 from util.jvm import JVM
20 from py4j.java_gateway import GatewayClient, JavaGateway
21
22
# Absolute path of 'config/pipeline.cfg' joined onto os.path.dirname(__name__).
# NOTE(review): __name__ is the module's name, not a file path, so dirname()
# of it is normally '' and the result is relative to the current working
# directory. If "next to this file" was intended, __file__ is needed -- confirm.
23 CONFIG_FILENAME = os.path.abspath(os.path.join(os.path.dirname(__name__), 'config/pipeline.cfg'))
# Python 2 print statement; runs as a side effect whenever the module is imported.
24 print 'config', CONFIG_FILENAME
# Template text that is loaded into the parser (via StringIO) before any
# configuration file is read; it currently holds no sections or options.
25 CONFIG_TEMPLATE = """
26 """
27
28
# Docstring and first attribute of a class whose `class` line is missing from
# this listing. Presumably this is the ExperimentConfigParser that the code
# near the end of the file instantiates -- confirm against the real source.
30 """
31 An extension of the ConfigParser that initializes necessary object for annotation pipelines
32 according to the settings specified by the configuration files
33 """
# Class-level attribute; nothing else in the visible code reads or writes it.
34 checker = 0
35
37
38
# Body of a method whose `def` line is missing from this listing (indentation
# is lost as well, so the nesting described here is inferred).
# Asks self.get_classpath() for the classpath entries and, when there are any,
# starts a JVM wrapper, connects a py4j gateway to it and returns the gateway.
# dir_path is unpacked but not used in the visible lines.
39 java_classpath, dir_path = self.get_classpath()
40
# Nothing is started when no section declares a classpath; in that case the
# visible code reaches no return statement.
41 if java_classpath:
42
# JVM comes from the project's util.jvm module; only its socket_no and pid
# attributes are read here.
43 self.jvm = JVM(java_classpath)
44 socket_no = self.jvm.socket_no
45
# Both the client and the gateway are kept on self; the socket-parser code
# elsewhere in this file passes self.gateway on to its feature generator.
46 self.gatewayclient = GatewayClient('localhost', socket_no)
47 self.gateway = JavaGateway(self.gatewayclient, auto_convert=True, auto_field=True)
48 sys.stderr.write("Initialized global Java gateway with pid {} in socket {}\n".format(self.jvm.pid, socket_no))
49 return self.gateway
50
51
52
def get_classpath(self):
    """Collect the Java classpath entries declared in the configuration.

    NOTE(review): the ``def`` line is absent from the listing this was
    restored from; the name and the ``self``-only signature are taken from
    the ``self.get_classpath()`` call elsewhere in this file. Confirm them
    against the original source, and re-indent to method level if this is
    spliced back into its class.

    Every section is asked for a ``java_classpath`` option; sections that
    do not define one are skipped. When at least one entry is found, the
    directory containing this file is added to the entries as well.

    Returns:
        A ``(entries, dir_path)`` tuple. ``entries`` is a list of the
        distinct classpath strings (in no particular order) including
        ``dir_path``, the directory of this file. When no section declares
        a classpath the result is ``([], None)``.
    """
    java_classpath = set()
    for section in self.sections():
        try:
            java_classpath.add(self.get(section, "java_classpath"))
        except NoOptionError:
            # A section without the option simply contributes nothing.
            continue

    if not java_classpath:
        return [], None

    # The module's own directory is only appended when there is something
    # else on the classpath, mirroring the original control flow.
    dir_path = os.path.dirname(os.path.abspath(__file__))
    java_classpath.add(dir_path)
    return list(java_classpath), dir_path
66
# Body of a method whose `def` line is missing from this listing.
# Returns self.socket when that attribute can be read.
# NOTE(review): the bare `except:` hides every kind of error, not only a
# missing attribute, and the `None` on the last line is an expression
# statement rather than `return None`, so that branch only falls through.
# Nothing in the visible code assigns self.socket.
68 try:
69 return self.socket
70 except:
71 None
72
73
79
80
# Body of a method whose `def` line is missing from this listing.
# Appends "Learner" to the "classifier" option of the [training] section and
# returns whatever that name evaluates to. Presumably a learner class; no
# *Learner name is imported in the visible code, so confirm where it comes from.
# NOTE(review): eval() runs arbitrary Python taken from the configuration file.
82 classifier_name = self.get("training", "classifier") + "Learner"
83 return eval(classifier_name)
84
# Body of a method whose `def` line is missing from this listing.
# Looks up the [training] option named "params_<classifier>", evaluates its
# text as a Python expression and stores the result on self.classifier_params.
# No return statement is visible.
# NOTE(review): eval() runs arbitrary Python taken from the configuration file.
86 self.classifier_params = eval(self.get("training", "params_%s" % self.get("training", "classifier")))
87
88
89
90
91
92
93
94
95
96
97
98
99
100
# Body of a method whose `def` line is missing from this listing; `language`
# is not assigned here, so it is presumably a parameter.
# Reports whether any section named "parser:..." has a "language" option equal
# to `language`: True on the first match, False when none matches
# (nesting inferred, indentation is lost in this listing).
102 for parser_name in [section for section in self.sections() if section.startswith("parser:")]:
103 if self.get(parser_name, "language") == language:
104 return True
105 return False
106
108
# Body of a method whose `def` line is missing from this listing; `language`
# is presumably a parameter. Nesting below is inferred (indentation is lost).
# Builds a Berkeley parser feature generator from the first "parser:..."
# section whose "language" option matches and whose "type" is recognised.
109 for parser_name in [section for section in self.sections() if section.startswith("parser:")]:
110 if self.get(parser_name, "language") == language:
# "tokenize" is read for every matching section but only handed to the
# XML-RPC generator.
111 tokenize = self.getboolean(parser_name, "tokenize")
112 if self.get(parser_name, "type") == "xmlrpc":
113 url = self.get(parser_name, "url")
114 return BerkeleyXMLRPCFeatureGenerator(url, language, tokenize)
115 elif self.get(parser_name, "type") == "socket":
116 grammarfile = self.get(parser_name, "grammarfile")
117 sys.stderr.write("initializing socket parser with grammar file {}\n".format(grammarfile))
118
119
# Relies on self.gateway having been set beforehand (the Java gateway
# initialisation code in this file assigns it).
120 return BerkeleySocketFeatureGenerator(language, grammarfile, self.gateway)
# No usable section found: the fallback value is False, not None.
121 return False
122
123
# Body of a method whose `def` line is missing from this listing; `language`
# is presumably a parameter.
# Returns the name of the first "parser:..." section whose "language" option
# equals `language`, or None when there is no such section (nesting inferred).
125 for parser_name in [section for section in self.sections() if section.startswith("parser:")]:
126 if self.get(parser_name, "language") == language:
127 return parser_name
128 return None
129
130
# Body of a method whose `def` line is missing from this listing; `language`
# is presumably a parameter.
# Reports whether any section named "checker:..." has a "language" option
# equal to `language`: True on the first match, otherwise False
# (nesting inferred, indentation is lost in this listing).
132 for checker_name in [section for section in self.sections() if section.startswith("checker:")]:
133 if self.get(checker_name, "language") == language:
134 return True
135 return False
136
137
139 settings = {}
140 for option in self.options(checker_name):
141 if option.startswith("setting_"):
142 setting_name = re.findall("setting_(.*)", option)[0]
143 setting_value = self.get(checker_name, option)
144 settings[setting_name] = setting_value
145 return settings
146
147
149
# Body of a method whose `def` line is missing from this listing; `language`
# is presumably a parameter. Nesting below is inferred (indentation is lost).
# Builds an IQFeatureGenerator from the first "checker:..." section whose
# "language" option equals `language`.
150 for checker_name in [section for section in self.sections() if section.startswith("checker:")]:
151 print "looking on checker ", checker_name , language
152 if self.get(checker_name, "language") == language:
# Pauses for a random 1-15 seconds before contacting the checker. The reason
# is not stated in the code; presumably it staggers concurrent clients.
153 wtime = random.randint(1, 15)
154 time.sleep(wtime)
155
156
157 settings = self._get_checker_settings(checker_name)
158
159
160
161
# The local host name doubles as the user id and as the stem of the "<id>.dat"
# name passed as the generator's last argument.
162 user_id = socket.gethostname()
163
164 feature_generator = IQFeatureGenerator(language,
165 settings,
166 user_id,
167 self.get(checker_name, "host"),
168 self.get(checker_name, "wsdl_path"),
169 self.get(checker_name, "protocol"),
170 "%s.dat" % user_id
171 )
172 print "returning feature generator with user_id", user_id
173 return feature_generator
# Reached only when no checker section matched the language.
174 print "Failure with checker for", language
175 return None
176
177
# Body of a method whose `def` line is missing from this listing.
# Returns the "source_language" option of the [general] section.
179 return self.get("general", "source_language")
180
# Body of a method whose `def` line is missing from this listing.
# Returns the "target_language" option of the [general] section.
182 return self.get("general", "target_language")
183
184
185
# Body of a method whose `def` line is missing from this listing; `language`
# is presumably a parameter.
# Reports whether any section named "lm:..." has a "language" option equal to
# `language`: True on the first match, otherwise False (nesting inferred).
187 for lm_name in [section for section in self.sections() if section.startswith("lm:")]:
188 if self.get(lm_name, "language") == language:
189 return True
190 return False
191
192
193
195
# Body of a method whose `def` line is missing from this listing; `language`
# is presumably a parameter. Nesting below is inferred (indentation is lost).
# Builds an SRILMngramGenerator from the first "lm:..." section whose
# "language" option equals `language`; returns None when none matches.
196 for lm_name in [section for section in self.sections() if section.startswith("lm:")]:
197 if self.get(lm_name, "language") == language:
198
199 lm_url = self.get(lm_name, "url")
200 lm_tokenize = self.getboolean(lm_name, "tokenize")
201 lm_lowercase = self.getboolean(lm_name, "lowercase")
# Note the argument order: lowercase is passed before tokenize.
202 srilm_generator = SRILMngramGenerator(lm_url, language, lm_lowercase, lm_tokenize)
203 return srilm_generator
204 return None
205
206
# Body of a method whose `def` line is missing from this listing; `language`
# is presumably a parameter.
# Returns the name of the first "lm:..." section whose "language" option equals
# `language`. The fallback is an empty string, unlike the parser-name lookup
# in this file, which falls back to None.
208 for lm_name in [section for section in self.sections() if section.startswith("lm:")]:
209 if self.get(lm_name, "language") == language:
210 return lm_name
211 return ""
212
# Body of a method whose `def` line is missing from this listing; `language`
# is presumably a parameter.
# Returns the "model" option of the first "tc:..." section whose "language"
# option equals `language`, or an empty string when there is none.
# ("tc" presumably stands for truecaser -- confirm.)
214 for tc_name in [section for section in self.sections() if section.startswith("tc:")]:
215 if self.get(tc_name, "language") == language:
216 return self.get(tc_name, "model")
217 return ""
218
221
222
223
224
226
def prepare_dir(self, continue_step=None):
    """Create (or re-enter) the working directory of one run and move into it.

    NOTE(review): the ``def`` line is absent from the listing this was
    restored from; the name comes from the ``cfg.prepare_dir(...)`` call
    elsewhere in this file and the parameter name from its use in the
    body. The ``None`` default is an assumption that keeps the positional
    call working either way. Confirm against the original source, and
    re-indent to method level if this is spliced back into its class.

    The base directory is the ``path`` option of the ``[general]`` section
    and is created when it cannot be listed. A fresh run gets a new
    numbered sub-directory; a continued run reuses the sub-directory named
    by ``continue_step``, which must already exist. The process then
    changes into that directory and dumps the current configuration to
    ``app.cfg`` there.

    Args:
        continue_step: name or number of an existing step directory to
            resume; any false value starts a new step.

    Returns:
        The path of the step directory, which is also stored on
        ``self.path``.

    Raises:
        OSError: if the base directory cannot be created, or the step
            directory cannot be created or entered.
    """
    base_path = self.get("general", "path")

    try:
        existing_files = os.listdir(base_path)
    except OSError:
        # The base directory could not be listed (typically because it does
        # not exist yet): create it and start from an empty listing.
        os.makedirs(base_path)
        existing_files = []

    if continue_step:
        current_step_id = continue_step
        path = os.path.join(base_path, str(current_step_id))
    else:
        current_step_id = self._get_new_step_id(existing_files)
        path = os.path.join(base_path, str(current_step_id))
        # Only a brand-new step needs its directory created.
        os.mkdir(path)

    os.chdir(path)

    # Snapshot the effective configuration next to the run's outputs; the
    # context manager closes the file even if writing fails.
    with open("app.cfg", 'w') as new_configfile:
        self.write(new_configfile)

    self.path = path
    sys.stderr.write("Working in path {}\n".format(path))
    sys.stderr.write("System process pid: {}\n".format(os.getpid()))
    return path
253
255
256 filename_ids = []
257 for filename in existing_files:
258 try:
259 filename_ids.append(int(filename))
260 except:
261 pass
262 current_step_id = 1
263
264
265 if filename_ids:
266 highestnum = max(filename_ids)
267 current_step_id = highestnum + 1
268 sys.stderr.write("Running app as step {0}\n".format(current_step_id))
269 return current_step_id
270
273
274
275
276
277
278
279
285
287
288
# Body of a function whose `def` line is missing from this listing (nesting is
# inferred because indentation is lost). It builds the experiment
# configuration from the built-in template, the configuration files and the
# command-line options, prepares the working directory and returns the parser.
289 cfg = ExperimentConfigParser()
# Load the template first so that files read later can override it.
290 cfg.readfp(StringIO.StringIO(CONFIG_TEMPLATE))
291
292
293
294 parser = argparse.ArgumentParser(description='')
295 parser.add_argument('--config', nargs='*', default=['cfg/pipeline.cfg'], help="Configuration files")
296 parser.add_argument('--sourcelang', '-s', help="Source language code")
297 parser.add_argument('--targetlang', '-t', help="Target language code")
298 parser.add_argument('--selectpath', help="""If source and target language are set,
299 then use all files in the indicated directory
300 that have these language codes in their filename""")
301 parser.add_argument('--cont', help="""If you want to resume an existing app,
302 specify its folder name heres. This must be
303 an existing dir name""")
304 parser.add_argument('--cores', help='How many cores should be parallelized')
305
# Reads the options from sys.argv.
306 args = parser.parse_args()
307
# Files are read in the order given; cfg.read() silently skips files it
# cannot open.
308 for config_filename in args.config:
309 cfg.read(config_filename)
310
311 continue_experiment = args.cont
# Only when all three options are present: pick every file in --selectpath
# whose name contains "<source>-<target>" and store the choice in the config.
312 if args.sourcelang and args.targetlang and args.selectpath:
313
314 filepattern = "*{}-{}*".format(args.sourcelang, args.targetlang)
315 available_files = os.listdir(args.selectpath)
316 print available_files
317 chosen_files = fnmatch.filter(available_files, filepattern)
318 print chosen_files
319
320 chosen_files = [os.path.join(args.selectpath, f) for f in chosen_files]
# cfg.set() raises NoSectionError unless the [general] and [training]
# sections already exist, so they must come from the files read above.
321 cfg.set("general", "source_language", args.sourcelang)
322 cfg.set("general", "target_language", args.targetlang)
323 cfg.set("training", "filenames", ",".join(chosen_files))
324
# --cores is stored as the string given on the command line.
325 if args.cores:
326 cfg.set("general", "cores", args.cores)
327
# prepare_dir changes the working directory; the returned path is not used
# again in the visible lines.
328 path = cfg.prepare_dir(continue_experiment)
329
330
331 return cfg
332