Package support :: Package preprocessing :: Package jcml :: Module jcml2arff
[hide private]
[frames | no frames]

Source Code for Module support.preprocessing.jcml.jcml2arff

  1  ''' 
  2  Created on 06 Mar 2012 
  3   
  4  @author: Eleftherios Avramidis 
  5  ''' 
  6  from dataprocessor.input.jcmlreader import JcmlReader 
  7   
class ArffWriter:
    '''
    Write a header and data rows to a text file in ARFF layout.

    Columns are always emitted in the same order: numeric attributes, string
    attributes, nominal attributes and finally the class value.
    write_header() and write_instance() iterate the very same containers, so
    header and rows stay aligned as long as those containers are not modified
    between calls.

    NOTE(review): values are written verbatim, without ARFF quoting or
    escaping; values containing commas or spaces are presumably not expected
    here -- confirm against the data.
    '''

    def __init__(self, output_filename, class_type, class_values=None, numeric_attribute_names=None, string_attribute_names=None, nominal_attributes=None):
        '''
        Open the output file and remember the attribute layout.

        @param output_filename: path of the file to create (opened with mode 'w')
        @param class_type: "numeric" or "nominal"; with any other value
            write_header() emits no class declaration
        @param class_values: allowed class labels, used only for a nominal class
        @param numeric_attribute_names: iterable of names declared as NUMERIC
        @param string_attribute_names: iterable of names declared as STRING
        @param nominal_attributes: dict mapping attribute name to an iterable
            of its allowed string values
        '''
        # Defined before open() so that close() and __del__ stay safe even
        # when open() raises.
        self.f = None
        self.class_type = class_type
        # None stands in for "empty": a literal []/{} default would be a
        # single object shared by every instance created without that argument.
        self.class_values = [] if class_values is None else class_values
        self.numeric_attribute_names = [] if numeric_attribute_names is None else numeric_attribute_names
        self.string_attribute_names = [] if string_attribute_names is None else string_attribute_names
        self.nominal_attributes = {} if nominal_attributes is None else nominal_attributes
        self.f = open(output_filename, 'w')

    def __enter__(self):
        '''Allow use in a "with" statement; the file is closed on exit.'''
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        # Returning False lets any exception from the with-body propagate.
        return False

    def write_header(self, relation_name):
        '''
        Write the @RELATION line, one @ATTRIBUTE line per attribute, the class
        declaration and the @DATA marker.

        @param relation_name: name placed after @RELATION
        '''
        self.f.write("@RELATION {0}\n\n".format(relation_name))

        for attribute_name in self.numeric_attribute_names:
            self.f.write("@ATTRIBUTE {0}\tNUMERIC\n".format(attribute_name))
        for attribute_name in self.string_attribute_names:
            self.f.write("@ATTRIBUTE {0}\tSTRING\n".format(attribute_name))
        # items() exists on both Python 2 and Python 3 (iteritems() does not).
        for attribute_name, values in self.nominal_attributes.items():
            self.f.write("@ATTRIBUTE %s\t{%s}\n" % (attribute_name, ",".join(values)))

        if self.class_type == "numeric":
            self.f.write("@ATTRIBUTE class\tNUMERIC\n")
        elif self.class_type == "nominal":
            self.f.write("@ATTRIBUTE class\t{%s}\n\n" % (",".join(self.class_values)))
        self.f.write("@DATA\n")

    def _nan_to_zero(self, value):
        '''
        Return "0" for the exact marker "nan", otherwise the value unchanged.

        Only a whole-value match is replaced: a substring replacement would
        corrupt ordinary text that merely contains "nan" (e.g. "banana") and
        make nominal values differ from the ones declared in the header.
        '''
        if value == "nan":
            return "0"
        return value

    def write_instance(self, attributes=None, class_value=""):
        '''
        Write one data row.

        @param attributes: dict mapping attribute name to its string value.
            Missing numeric and string attributes are written as "0"; a
            missing nominal attribute is written as the first of its declared
            values.
        @param class_value: value written in the last column
        '''
        if attributes is None:
            attributes = {}

        fields = []
        for attribute_name in self.numeric_attribute_names:
            if attribute_name in attributes:
                # "inf"/"-inf" and "nan" are mapped to a large number and 0.
                fields.append(attributes[attribute_name].replace("inf", "999999").replace("nan", "0"))
            else:
                fields.append("0")
        for attribute_name in self.string_attribute_names:
            if attribute_name in attributes:
                fields.append(self._nan_to_zero(attributes[attribute_name]))
            else:
                fields.append("0")
        for attribute_name, values in self.nominal_attributes.items():
            if attribute_name in attributes:
                fields.append(self._nan_to_zero(attributes[attribute_name]))
            else:
                fields.append(list(values)[0])

        # The row is assembled first and written once, so a failure while
        # building it cannot leave a partial line in the file.
        row = "".join("{0},".format(field) for field in fields)
        self.f.write(row + "{0}\n".format(class_value))

    def close(self):
        '''Close the output file; safe to call repeatedly or after a failed __init__.'''
        f = getattr(self, "f", None)
        if f is not None:
            f.close()

    def __del__(self):
        self.close()
57
class Jcml2Arff:
    '''
    Convert a JCML file into an ARFF file: the dataset is read with
    JcmlReader and written with ArffWriter.
    '''

    def process(self, jcml_filename, arff_filename,
                hidden_attribute_names, discrete_attribute_names, string_attribute_names,
                relation_name, class_name, class_type, class_values):
        '''
        Read the JCML dataset and write it as ARFF, one row per parallel sentence.

        @param jcml_filename: path of the JCML file handed to JcmlReader
        @param arff_filename: path of the ARFF file to create
        @param hidden_attribute_names: attribute names left out of the output
        @param discrete_attribute_names: attribute names written as nominal
        @param string_attribute_names: attribute names written as STRING
        @param relation_name: name used on the @RELATION line
        @param class_name: attribute whose value becomes the class column
        @param class_type: "numeric" or "nominal", passed on to ArffWriter
        @param class_values: allowed labels for a nominal class
        @raise KeyError: when a parallel sentence has no class_name attribute
        '''
        dataset = JcmlReader(jcml_filename).get_dataset()
        visible_names = set(dataset.get_all_attribute_names()) - set(hidden_attribute_names)
        nominal_attributes = dataset.get_discrete_attribute_values(discrete_attribute_names)

        # Everything that is neither nominal nor string is treated as numeric.
        # String attributes are removed so they are not declared twice, and
        # the class attribute is removed because it gets its own "class"
        # column (left in, it would become an extra column of zeros).
        numeric_names = visible_names - set(discrete_attribute_names) - set(string_attribute_names)
        numeric_names.discard(class_name)
        # Sorted so that the column order is reproducible between runs
        # instead of depending on set iteration order.
        numeric_attribute_names = sorted(numeric_names)

        arff = ArffWriter(arff_filename, class_type, class_values, numeric_attribute_names, string_attribute_names, nominal_attributes)
        try:
            arff.write_header(relation_name)

            for ps in dataset.get_parallelsentences():
                # Nested attributes are merged last, so they override
                # same-named top-level attributes.
                atts = {}
                atts.update(ps.attributes)
                atts.update(ps.get_nested_attributes())
                # The class value is taken out before the hidden attributes
                # are dropped, because the class may itself be hidden.
                class_value = atts.pop(class_name)
                for attname in hidden_attribute_names:
                    atts.pop(attname, None)
                arff.write_instance(atts, class_value)
        finally:
            # Release the file handle even if reading or writing fails.
            arff.close()
if __name__ == '__main__':
    import sys

    # Attribute names passed to Jcml2Arff.process() as hidden_attribute_names.
    # The default class attribute "tgt-1_score" is listed here as well.
    hidden_attributes = ["tgt-1_berkeley-tree", "src_berkeley-tree", "ref_berkeley-tree",
                         "testset", "judgment-id", "langsrc", "langtgt", "ps1_judgement_id",
                         "id", "tgt-1_score", "tgt-1_system",
                         ]
    # Attribute names passed to Jcml2Arff.process() as discrete_attribute_names.
    discrete_attributes = ["src_reuse_status",
                           "src_terminologyAdmitted_status",
                           "src_total_status",
                           "src_spelling_status",
                           "src_style_status",
                           "src_grammar_status",
                           "src_terminology_status",
                           "src_resultStats_projectStatus",
                           ]

    # Input and output paths are mandatory; fail with a usage message
    # instead of an IndexError traceback.
    if len(sys.argv) < 3:
        sys.exit("usage: {0} input_jcml output_arff [class_name [class_type]]".format(sys.argv[0]))

    input_sgml = sys.argv[1]
    output_arff = sys.argv[2]
    # Optional arguments fall back to their defaults only when absent;
    # no other error is masked.
    class_name = sys.argv[3] if len(sys.argv) > 3 else "tgt-1_score"
    class_type = sys.argv[4] if len(sys.argv) > 4 else "numeric"

    # No string attributes and no class values are passed from the command line.
    Jcml2Arff().process(input_sgml,
                        output_arff,
                        hidden_attributes, discrete_attributes, [], "qe", class_name, class_type, [])