1 '''
2 Created on 06 Mar 2012
3
4 @author: Eleftherios Avramidis
5 '''
6 from dataprocessor.input.jcmlreader import JcmlReader
7
def __init__(self, output_filename, class_type, class_values=None,
             numeric_attribute_names=None, string_attribute_names=None,
             nominal_attributes=None):
    '''
    Open the output file and store the attribute layout on the instance.

    The collection arguments default to None instead of a literal [] / {}:
    a literal default is created once and shared by every call that omits
    the argument, so a mutation made through one instance would leak into
    all later ones.

    @param output_filename: path of the file to write; it is opened in 'w'
        mode, which truncates an existing file
    @param class_type: type of the class attribute, stored as given
    @param class_values: values of the class attribute (default: new empty list)
    @param numeric_attribute_names: names of the numeric attributes
        (default: new empty list)
    @param string_attribute_names: names of the string attributes
        (default: new empty list)
    @param nominal_attributes: mapping from nominal attribute name to its
        values (default: new empty dict)
    @raise IOError: if output_filename cannot be opened for writing
    '''
    # The handle is kept open on self.f; this method does not close it.
    self.f = open(output_filename, 'w')
    self.class_type = class_type
    # Caller-supplied collections are stored by reference, exactly as before;
    # only an omitted argument gets a fresh, unshared container.
    self.class_values = [] if class_values is None else class_values
    self.numeric_attribute_names = (
        [] if numeric_attribute_names is None else numeric_attribute_names)
    self.string_attribute_names = (
        [] if string_attribute_names is None else string_attribute_names)
    self.nominal_attributes = (
        {} if nominal_attributes is None else nominal_attributes)
16
# Writes the ARFF header to self.f: the @RELATION line, one @ATTRIBUTE line per
# numeric, string and nominal attribute, the class attribute, and finally @DATA.
# NOTE(review): the `def` line that owns these statements is not visible in this
# view; `relation_name` is presumably one of its parameters -- confirm.
18 self.f.write("@RELATION {0}\n\n".format(relation_name))
19
# Attribute declarations are emitted in the order numeric, string, nominal.
20 for attribute_name in self.numeric_attribute_names:
21 self.f.write("@ATTRIBUTE {0}\tNUMERIC\n".format(attribute_name))
22 for attribute_name in self.string_attribute_names:
23 self.f.write("@ATTRIBUTE {0}\tSTRING\n".format(attribute_name))
# NOTE(review): iteritems() is the Python 2 dict API; a Python 3 dict has no
# such method -- confirm the targeted interpreter.
24 for attribute_name, values in self.nominal_attributes.iteritems():
# The allowed values are joined with commas inside braces; they are passed to
# str.join, so they are assumed to be strings.
25 self.f.write("@ATTRIBUTE %s\t{%s}\n" % (attribute_name, ",".join(values)))
26
# A class_type other than "numeric" or "nominal" writes no class attribute at all.
27 if self.class_type == "numeric":
28 self.f.write("@ATTRIBUTE class\tNUMERIC\n")
29 elif self.class_type == "nominal":
30 self.f.write("@ATTRIBUTE class\t{%s}\n\n" % (",".join(self.class_values)))
31 self.f.write("@DATA\n")
32
34
# Writes one data row to self.f: the numeric values, then the string values,
# then the nominal values, each followed by a comma, and last the class value
# followed by a newline.
# NOTE(review): the `def` line that owns these statements is not visible in this
# view; `attributes` (indexed by attribute name) and `class_value` are presumably
# its parameters -- confirm.
35 for attribute_name in self.numeric_attribute_names:
36 try:
# "inf" becomes 999999 and "nan" becomes 0 through substring replacement; the
# values are assumed to be strings, since .replace() is called on them.
37 self.f.write("{0},".format(attributes[attribute_name].replace("inf", "999999").replace("nan", "0")))
38 except KeyError:
# An attribute that is absent from `attributes` is written as 0.
39 self.f.write("0,")
40 for attribute_name in self.string_attribute_names:
41 try:
# NOTE(review): this is a substring replace, so any string value that merely
# contains "nan" is altered as well -- confirm this is intended.
42 self.f.write("{0},".format(attributes[attribute_name].replace("nan", "0")))
43 except KeyError:
44 self.f.write("0,")
# NOTE(review): iteritems() is the Python 2 dict API -- confirm the targeted
# interpreter.
45 for attribute_name, values in self.nominal_attributes.iteritems():
46 try:
47 self.f.write("{0},".format(attributes[attribute_name].replace("nan", "0")))
48 except KeyError:
# A missing nominal attribute falls back to the first element of its values.
49 self.f.write("{0},".format(list(values)[0]))
50 self.f.write("{0}\n".format(class_value))
51
57
59 - def process(self, jcml_filename, arff_filename,
60 hidden_attribute_names, discrete_attribute_names, string_attribute_names,
61 relation_name, class_name, class_type, class_values):
84
85
if __name__ == '__main__':
    import sys

    # Attribute names handed to process() as its hidden_attribute_names argument.
    hidden_attributes = [
        "tgt-1_berkeley-tree", "src_berkeley-tree", "ref_berkeley-tree",
        "testset", "judgment-id", "langsrc", "langtgt", "ps1_judgement_id",
        "id", "tgt-1_score", "tgt-1_system",
    ]
    # Attribute names handed to process() as its discrete_attribute_names argument.
    discrete_attributes = [
        "src_reuse_status",
        "src_terminologyAdmitted_status",
        "src_total_status",
        "src_spelling_status",
        "src_style_status",
        "src_grammar_status",
        "src_terminology_status",
        "src_resultStats_projectStatus",
    ]

    # Command line: input file, output file, optional class name, optional
    # class type. Fail with a readable message instead of an IndexError
    # traceback when a mandatory argument is missing.
    if len(sys.argv) < 3:
        sys.exit("Usage: {0} input_jcml output_arff [class_name [class_type]]".format(sys.argv[0]))
    input_sgml = sys.argv[1]
    output_arff = sys.argv[2]

    # Explicit length checks replace the former bare `except:` clauses, which
    # would also have swallowed unrelated errors such as KeyboardInterrupt.
    class_name = sys.argv[3] if len(sys.argv) > 3 else "tgt-1_score"
    class_type = sys.argv[4] if len(sys.argv) > 4 else "numeric"

    Jcml2Arff().process(input_sgml,
                        output_arff,
                        hidden_attributes, discrete_attributes, [], "qe", class_name, class_type, [])
116