Package support :: Package preprocessing :: Package jcml :: Module wmt12decodingfeatures2jcml
[hide private]
[frames] | [no frames]

Source Code for Module support.preprocessing.jcml.wmt12decodingfeatures2jcml

  1  ''' 
  2  Created on 29 Feb 2012 
  3  @author: Eleftherios Avramidis 
  4  ''' 
  5  import re 
  6  import sys 
  7  import os 
  8  import subprocess 
  9  import numpy 
 10  from featuregenerator.languagefeaturegenerator import LanguageFeatureGenerator 
 11   
 12  from dataprocessor.sax.saxps2jcml import Parallelsentence2Jcml 
 13  from dataprocessor.input.jcmlreader import JcmlReader 
 14   
 15   
def process(path):
    """Collect decoder-derived attributes from every ``<n>.sgml`` file in *path*.

    Each file is expected to contain one ``SCORES (UNWEIGHTED/WEIGHTED)`` line
    and a ``<wgraph>`` section made of ``name=value`` entries.  Files are
    processed in ascending numeric order of their basename.

    For every file a dictionary is built that holds:

    * the raw score strings from the SCORES line (``score_d``, ``score_w``,
      ``score_d1``..``score_d6``, ``score_lm``, ``score_tm1``..``score_tm5``);
    * for every word-graph feature, its average, standard deviation and
      variance, formatted with three decimals, under ``<name>_avg``,
      ``<name>_std`` and ``<name>_var``.

    All keys are finally prefixed with ``d_``.

    :param path: directory containing the ``*.sgml`` decoding files
    :return: list of attribute dictionaries (string keys and string values),
        one per file, ordered by numeric file name
    :raises ValueError: if an ``.sgml`` file name is not an integer, or a
        word-graph value cannot be converted to float
    :raises IndexError: if a file has no SCORES line or no ``<wgraph>`` marker
    """
    # Raw strings keep the regex escapes (\(, \d, \-) away from Python's own
    # string-escape processing; patterns are compiled once, outside the loop.
    number = r"([\d\-\.]*)"
    scores_regex = re.compile(
        r"SCORES \(UNWEIGHTED/WEIGHTED\): "
        + "d: " + number
        + " w: " + number
        + " u: " + number
        + " d: " + " ".join([number] * 6)
        + " lm: " + number
        + " tm: " + " ".join([number] * 5)
    )
    graph_regex = re.compile(r"([\w]*)=([\d\-\.\,\ ]*)\s*")

    score_names = (
        "score_d", "score_w", "score_u",
        "score_d1", "score_d2", "score_d3", "score_d4", "score_d5", "score_d6",
        "score_lm",
        "score_tm1", "score_tm2", "score_tm3", "score_tm4", "score_tm5",
    )

    # Pair every file with its integer id so that sorting is numeric
    # (2.sgml before 10.sgml) rather than lexicographic.
    files = [(int(re.findall(r"(.*)\.sgml", f)[0]), f)
             for f in os.listdir(path) if f.endswith("sgml")]

    atts_vector = []
    for _sentence_id, basename in sorted(files):
        filename = os.path.join(path, basename)
        print(filename)
        # The context manager closes the handle even if reading fails.
        with open(filename, 'r') as handle:
            file_content = handle.read()

        # SCORES
        values = scores_regex.findall(file_content)[0]
        # NOTE(review): score_u is parsed but has never been exported as an
        # attribute; kept that way to leave the output unchanged -- confirm
        # whether the omission is intentional.
        sentence_atts = dict(
            (name, value)
            for name, value in zip(score_names, values)
            if name != "score_u"
        )

        # GRAPH
        graph_content = file_content.split("<wgraph>")[1]
        atts = {}
        for feature_name, feature_value in graph_regex.findall(graph_content):
            if feature_name == "a":
                # NOTE(review): update() replaces a1..aN on every "a=" entry,
                # so only the last entry survives and its std/var are 0.
                # The same applies to "r=" below -- confirm this is intended.
                atts.update(_split_a_vector(feature_value))
            elif feature_name == "r":
                atts.update(_split_r_vector(feature_value))
            elif feature_name in ("UTTERANCE", "VERSION", "w"):
                continue
            else:
                if feature_name == "pC":
                    # Drop the last character of the stripped value before
                    # the float conversion.
                    feature_value = feature_value.strip()[:-1]
                atts.setdefault(feature_name, []).append(float(feature_value))

        # Now calculate the statistics of every collected feature.
        for feature_name, feature_values in atts.items():
            sentence_atts["{0}_avg".format(feature_name)] = "%.3f" % round(float(numpy.average(feature_values)), 3)
            sentence_atts["{0}_std".format(feature_name)] = "%.3f" % round(float(numpy.std(feature_values)), 3)
            sentence_atts["{0}_var".format(feature_name)] = "%.3f" % round(float(numpy.var(feature_values)), 3)

        atts_vector.append(dict(("d_%s" % k, v) for k, v in sentence_atts.items()))
    return atts_vector
72 -def _split_a_vector(graph_feature_value):
73 values = graph_feature_value.split(", ") 74 atts = {} 75 i = 0 76 for value in values: 77 i+=1 78 atts["a{0}".format(i)] = float(value.strip()) 79 return atts
80 81
82 -def _split_r_vector(graph_feature_value):
83 #input data have a bug 84 values = re.findall("(\-?\d{1,3}\.\d{1,3})", graph_feature_value) 85 atts = {} 86 i = 0 87 for value in values: 88 i+=1 89 atts["r{0}".format(i)] = float(value.strip()) 90 return atts
if __name__ == '__main__':
    # Check the argument count BEFORE reading sys.argv[1:]; otherwise a
    # missing argument raises IndexError and the usage text is never shown.
    if len(sys.argv) < 4:
        print("This script reads the supplementary decoding attributes of WMT12-quality estimation task and "
              "wraps them into the designated jcml format. Syntax: script [dir] [input.jcml] [output.jcml]\n "
              "dir: directory where the files [1-9]*.sgml, one file for sentence are stored "
              "input.jcml: original jcml file containing the rest of the data "
              "output.jcml: a copy of the original jcml file, with attributes from the decoded process added ")
        # Nothing can be processed without all three arguments.
        sys.exit(1)

    input_path = sys.argv[1]   # e.g. "/home/Eleftherios Avramidis/taraxu_data/wmt12/quality-estimation/training_set/decoding"
    input_jcml = sys.argv[2]   # e.g. "/home/Eleftherios Avramidis/taraxu_data/wmt12/quality-estimation/training_set/training.jcml"
    output_jcml = sys.argv[3]  # e.g. "/home/Eleftherios Avramidis/taraxu_data/wmt12/quality-estimation/training_set/training.decoding.es.f.jcml"

    # Extract the decoding attributes, attach them to the dataset read from
    # the input jcml, and write the result to the output jcml.
    att_vector = process(input_path)
    dataset = JcmlReader(input_jcml).get_dataset()
    dataset.add_attribute_vector(att_vector)
    Parallelsentence2Jcml(dataset.get_parallelsentences()).write_to_file(output_jcml)