Package dataprocessor :: Package sax :: Module utils
[hide private]
[frames | no frames]

Source Code for Module dataprocessor.sax.utils

 1  from dataprocessor.sax.saxps2jcml import IncrementalJcml 
 2  from dataprocessor.ce.cejcml import CEJcmlReader 
 3  import logging as log 
 4   
def join_jcml(filenames, output_filename):
    """Concatenate several JCML files into a single output file.

    Every parallel sentence yielded by a ``CEJcmlReader`` over each input
    file is passed, in order, to one ``IncrementalJcml`` writer opened on
    ``output_filename``.

    :param filenames: iterable of input JCML file names, read in order
    :param output_filename: name of the file the joined output is written to
    """
    writer = IncrementalJcml(output_filename)
    # NOTE(review): the source listing lost its indentation; close() is
    # assumed to run once, after all input files were processed -- confirm.
    try:
        for filename in filenames:
            reader = CEJcmlReader(filename, all_general=True, all_target=True)
            for parallelsentence in reader.get_parallelsentences():
                writer.add_parallelsentence(parallelsentence)
    finally:
        # Close the writer even when reading or writing fails, so the
        # output handle is not leaked.
        writer.close()
13
def filter_jcml(input_filename, output_filename, callback, **kwargs):
    """Copy the parallel sentences accepted by ``callback`` to a new file.

    Each parallel sentence read from ``input_filename`` is passed to
    ``callback`` together with ``kwargs``; it is written to
    ``output_filename`` only when the callback returns a truthy value.
    The number of kept sentences and the total number read are logged at
    INFO level.

    :param input_filename: name of the JCML file to read
    :param output_filename: name of the JCML file to write
    :param callback: callable invoked as ``callback(parallelsentence, **kwargs)``
    :param kwargs: extra keyword arguments forwarded to ``callback``
    """
    reader = CEJcmlReader(input_filename, all_general=True, all_target=True)
    writer = IncrementalJcml(output_filename)
    count = 0
    everything = 0
    # NOTE(review): the source listing lost its indentation; the summary log
    # and close() are assumed to run once, after the loop -- confirm.
    try:
        for parallelsentence in reader.get_parallelsentences():
            everything += 1
            if callback(parallelsentence, **kwargs):
                writer.add_parallelsentence(parallelsentence)
                count += 1
        log.info("Left {} out of {}".format(count, everything))
    finally:
        # Close the writer even when the callback, reader or writer raises,
        # so the output handle is not leaked.
        writer.close()
26 27
def join_filter_jcml(filenames, output_filename, callback, **kwargs):
    """Join several JCML files, keeping only sentences accepted by ``callback``.

    Each input file is read in order; every parallel sentence is passed to
    ``callback`` together with ``kwargs`` and written to the single output
    file only when the callback returns a truthy value. Progress and the
    final counts are logged at INFO level.

    :param filenames: iterable of input JCML file names, read in order
    :param output_filename: name of the JCML file to write
    :param callback: callable invoked as ``callback(parallelsentence, **kwargs)``
    :param kwargs: extra keyword arguments forwarded to ``callback``
    :return: tuple ``(count, everything)`` -- sentences written and
        sentences read across all input files
    """
    writer = IncrementalJcml(output_filename)
    count = 0
    everything = 0
    # NOTE(review): the source listing lost its indentation; the summary log
    # and close() are assumed to run once, after all files -- confirm.
    try:
        for filename in filenames:
            log.info("Filtering and joining filename {}".format(filename))
            reader = CEJcmlReader(filename, all_general=True, all_target=True)
            for parallelsentence in reader.get_parallelsentences():
                everything += 1
                if callback(parallelsentence, **kwargs):
                    writer.add_parallelsentence(parallelsentence)
                    count += 1
        log.info("Left {} out of {}".format(count, everything))
    finally:
        # Close the writer even when the callback, a reader or the writer
        # raises, so the output handle is not leaked.
        writer.close()
    return count, everything
43