Package support :: Package preprocessing :: Package jcml :: Module txt2jcml
[hide private]
[frames | no frames]

Source Code for Module support.preprocessing.jcml.txt2jcml

  1  ''' 
  2  Created on 23 Feb 2012 
  3   
  4  @author: Eleftherios Avramidis 
  5  ''' 
  6  from optparse import OptionParser 
  7  from sentence.sentence import SimpleSentence 
  8  from collections import OrderedDict 
  9  from sentence.parallelsentence import ParallelSentence 
 10  from dataprocessor.sax.saxps2jcml import Parallelsentence2Jcml 
 11  from featuregenerator.glassbox.moses.extractor import MosesGlassboxExtractor 
 12   
 13  if __name__ == '__main__': 
 14   
 15      parser = OptionParser() 
 16      parser.add_option("-s", "--source", dest="source_filename", 
 17                        help="read one source sentence per line from FILE", metavar="FILE") 
 18       
 19      parser.add_option("-t", "--translation", dest="target_filename", 
 20                        help="read one translation output sentence per line from FILE", metavar="FILE") 
 21       
 22      parser.add_option("-m", "--system", dest="system_name", 
 23                        help="system name") 
 24       
 25     
 26      parser.add_option("-r", "--reference", dest="reference_filename", 
 27                        help="read one reference sentence per line from FILE", metavar="FILE") 
 28       
 29      parser.add_option("-l", "--score", dest="score_filename", 
 30                        help="read one score per line from FILE", metavar="FILE") 
 31       
 32      parser.add_option("-a", "--feature-names", action="append", dest="feature_names", type="str", 
 33                        help="a list of feature names", default=[]) 
 34       
 35      parser.add_option("-q", "--feature-files", action="append", dest="feature_files", type="str", default=[], 
 36                        help="a list of feature FILEs in respective order") 
 37   
 38      parser.add_option("-b", "--target-features-tab", dest="target_features_tab", type="str",  
 39                        help="all target features in one file, tab-separated") 
 40       
 41      parser.add_option("-n", "--target-features-tab-names", dest="target_features_tab_names", type="str",  
 42                        help="all target feature names in one file, tab-separated") 
 43   
 44       
 45      parser.add_option("-o", "--output", dest="output_filename", 
 46                        help="write output to this jcml FILE", metavar="FILE") 
 47       
 48      parser.add_option("-f", "--langsrc", dest="langsrc", 
 49                        help="source language code") 
 50       
 51      parser.add_option("-e", "--langtgt", dest="langtgt", 
 52                        help="target language code") 
 53       
 54      parser.add_option("-u", "--testset", dest="testset", 
 55                        help="set name") 
 56       
 57      parser.add_option("-g", "--moseslog", dest="moseslog", 
 58                        help="verbose log of moses decoding") 
 59       
 60      (opt, args) = parser.parse_args() 
 61       
 62      source_file = open(opt.source_filename, 'r') 
 63      target_file = open(opt.target_filename, 'r') 
 64       
 65      feature_file_objects = [open(f, 'r') for f in opt.feature_files] 
 66      print opt.feature_files, opt.feature_names 
 67      try: 
 68          reference_file = open(opt.reference_filename, 'r') 
 69      except: 
 70          reference_file = None 
 71      try: 
 72          score_file = open(opt.score_filename) 
 73      except: 
 74          score_file = None 
 75           
 76      try: 
 77          target_features_tabfile = open(opt.target_features_tab) 
 78      except: 
 79          target_features_tabfile = None 
 80       
 81      try: 
 82          target_features_tab_names_file = open(opt.target_features_tab_names) 
 83          target_features_tab_names = target_features_tab_names_file.readline().strip().split("\t") 
 84          target_features_tab_names_file.close() 
 85      except: 
 86          target_features_tab_names = [] 
 87  #    print "Feature tab names", target_features_tab_names  
 88       
 89      if opt.moseslog: 
 90          extractor = MosesGlassboxExtractor() 
 91          glassbox_features_dicts = extractor.create_dicts_of_sentences_attributes(opt.moseslog) 
 92   
 93      parallelsentences = [] 
 94      i = 0 
 95       
 96      for source_line in source_file: 
 97          i+=1 
 98          atts = OrderedDict() 
 99          source_line = source_line.strip() 
100          target_line = target_file.readline().strip() 
101           
102           
103          if reference_file: 
104              reference_line = reference_file.readline().strip() 
105              reference_sentence = SimpleSentence(reference_line) 
106          else: 
107              reference_sentence = None 
108           
109          #target sentence features         
110          if score_file: 
111              score = score_file.readline().strip() 
112              atts["score"] = score 
113           
114          atts["system"] = opt.system_name 
115           
116          #process glass-box features 
117          if opt.moseslog: 
118              atts.update(glassbox_features_dicts[i-1]) 
119           
120          #process tab-separated features file 
121          if target_features_tabfile: 
122              feature_values = target_features_tabfile.readline().strip().split("\t") 
123              for i, feature_value in enumerate(feature_values): 
124                  try: 
125                      feature_name = target_features_tab_names[i-1] 
126                  except: 
127                      feature_name = i 
128                  atts["qb_{}".format(feature_name)] = feature_value 
129                   
130          source_sentence = SimpleSentence(source_line) 
131          target_sentences = [SimpleSentence(target_line, atts)] 
132           
133          additional_atts = {} 
134          for feature_name, file_object in zip(opt.feature_names, feature_file_objects): 
135              value = file_object.readline().strip() 
136              additional_atts[feature_name] = value 
137               
138                   
139          ps_atts =  {"langsrc" : opt.langsrc , 
140                       "langtgt" : opt.langtgt , 
141                       "testset" : opt.testset , 
142                       "id" : str(i)} 
143           
144          ps_atts.update(additional_atts) 
145           
146          ps = ParallelSentence(source_sentence, target_sentences, reference_sentence, ps_atts) 
147          parallelsentences.append(ps) 
148       
149      for file_object in feature_file_objects: 
150          file_object.close() 
151       
152      Parallelsentence2Jcml(parallelsentences).write_to_file(opt.output_filename) 
153