Package dataprocessor :: Package input :: Module wmtreader
[hide private]
[frames | no frames]

Source Code for Module dataprocessor.input.wmtreader

  1  ''' 
  2   
  3  @author: Eleftherios Avramidis 
  4  ''' 
  5  import os 
  6  import re 
  7  import codecs 
  8  from sentence.sentence import SimpleSentence 
  9  from sentence.parallelsentence import ParallelSentence 
 10  import logging 
 11   
class WmtReader():
    '''
    Reader for plain-text WMT evaluation data.

    The reader walks the following directory layout below a base directory:
    C{plain/sources/<testset>-src.<srclang>},
    C{plain/references/<testset>-ref.<tgtlang>} and
    C{plain/system-outputs/<testset>/<langpair>/<submission files>}.
    '''

    def __init__(self):
        '''
        Constructor
        '''

    def read_parallelsentences(self, base_dir, langpair, extract_references=False):
        '''
        Read every test set found under C{<base_dir>/plain/system-outputs}
        and pair each source line with the same-numbered line of every
        system output submitted for the given language pair.

        Test sets without a directory for C{langpair}, or whose source file
        cannot be opened, are skipped. If references are requested but the
        reference file of a test set cannot be opened, a warning is logged
        and the sentences of that test set get no reference.

        @param base_dir: directory that contains the C{plain} sub-directory
        @param langpair: language pair written as C{"<source>-<target>"},
            e.g. C{"en-de"}; it is also the name of the submissions directory
        @param extract_references: if True, read one reference line per
            source line from the reference file of each test set
        @return: list of ParallelSentence objects, one per source line; the
            "id" attribute is the 1-based line number within its test set
        '''
        plain_dir = os.path.join(base_dir, "plain")
        source_dir = os.path.join(plain_dir, "sources")
        system_outputs_dir = os.path.join(plain_dir, "system-outputs")
        reference_dir = os.path.join(plain_dir, "references")

        # Split once instead of once per sentence
        langpair_parts = langpair.split("-")
        langsrc = langpair_parts[0]
        langtgt = langpair_parts[1]

        parallelsentences = []

        # os.listdir returns entries in arbitrary order; sorting keeps the
        # order of the produced sentences reproducible between runs
        for testset in sorted(os.listdir(system_outputs_dir)):
            testset_dir = os.path.join(system_outputs_dir, testset)
            if langpair not in os.listdir(testset_dir):
                print("didn't find language pair %s" % langpair)
                continue
            langpair_dir = os.path.join(testset_dir, langpair)

            source_filename = os.path.join(source_dir, "%s-src.%s" % (testset, langsrc))
            reference_filename = os.path.join(reference_dir, "%s-ref.%s" % (testset, langtgt))

            # Every handle opened for this test set is collected here so that
            # all of them are closed again, even if reading fails half-way
            open_files = []
            try:
                try:
                    source_file = open(source_filename, 'r')
                except (IOError, OSError):
                    logging.warning("Source file '{}' could not be opened".format(source_filename))
                    continue
                open_files.append(source_file)

                reference_file = None
                if extract_references:
                    try:
                        reference_file = open(reference_filename, 'r')
                    except (IOError, OSError):
                        # Carry on without references for this test set
                        logging.warning("Reference file '{}' could not be opened".format(reference_filename))
                    else:
                        open_files.append(reference_file)

                # (file handle, system name) for every submitted system output
                submissions = []
                for filename in sorted(os.listdir(langpair_dir)):
                    # The system name is whatever follows the last dot
                    match = re.search(r"\.([^.]*)$", filename)
                    if match is None:
                        logging.warning("Skipping submission file '{}': no system name suffix".format(filename))
                        continue
                    submission_file = open(os.path.join(langpair_dir, filename), 'r')
                    open_files.append(submission_file)
                    submissions.append((submission_file, match.group(1)))

                for sentence_id, sourceline in enumerate(source_file, 1):
                    # All files are read in lockstep: line n of each
                    # submission belongs to line n of the source
                    translations = []
                    for system_file, system_name in submissions:
                        translation_text = system_file.readline()
                        translations.append(SimpleSentence(translation_text, {'system': system_name}))

                    source = SimpleSentence(sourceline, {})
                    attributes = {"id": str(sentence_id),
                                  "langsrc": langsrc,
                                  "langtgt": langtgt,
                                  "testset": testset
                                  }

                    if reference_file is not None:
                        reference = SimpleSentence(reference_file.readline(), {})
                    else:
                        reference = None

                    parallelsentences.append(ParallelSentence(source, translations, reference, attributes))
            finally:
                for open_file in open_files:
                    open_file.close()

        return parallelsentences
if __name__ == '__main__':
    # Command-line driver: <base_dir> <output_dir> <file_prefix> [--ref]
    # Writes one "<file_prefix>.<langpair>.jcml" file per language pair
    # into <output_dir>.
    import sys
    from dataprocessor.sax.saxps2jcml import Parallelsentence2Jcml

    base_dir = sys.argv[1]
    output_dir = sys.argv[2]
    file_prefix = sys.argv[3]

    # References are read only if "--ref" appears somewhere on the command line
    extract_references = "--ref" in sys.argv

    langpairs = [
        "en-de", "de-en",
        "en-fr", "fr-en",
        "en-es", "es-en",
        "en-cs", "cs-en",
        "en-ru", "ru-en",
    ]

    for langpair in langpairs:
        reader = WmtReader()
        sentences = reader.read_parallelsentences(base_dir, langpair, extract_references)
        target_path = os.path.join(output_dir, "{}.{}.jcml".format(file_prefix, langpair))
        writer = Parallelsentence2Jcml(sentences)
        writer.write_to_file(target_path)