1 '''
2 Created on Sep 17, 2012
3
4 @author: jogin
5 '''
6 from xml.etree.cElementTree import iterparse
7 import numpy
8
'''
First, this class extracts the attribute values of the tgt elements in each
sentence and stores them in a list.
Second, the extracted values are meant to be used with the SVMlight classifier.
'''
23
24
# Parse the input XML incrementally and collect, for every TAG_SENT element,
# the attribute values of each TAG_TGT element that follows it.
#
# Reads:  self.input_filename, self.TAG_SENT, self.TAG_TGT
# Writes: self.attribute_values (one inner list per TAG_SENT element, holding
#         one list of attribute values per TAG_TGT element)
#
# The file is opened in a ``with`` block so the handle is always closed, even
# if parsing raises. iterparse() reads lazily, so the whole loop has to stay
# inside the block.
with open(self.input_filename, "r") as source_xml_file:
    context = iter(iterparse(source_xml_file, events=("start", "end")))

    # The first event delivered is the "start" of the outermost element.
    # next() is used instead of the Python-2-only .next() method.
    event, root = next(context)
    print(root)

    for event, elem in context:
        # Only "start" events are acted on; attributes are already
        # available when an element's start tag has been seen.
        if event != "start":
            continue

        if elem.tag == self.TAG_SENT:
            # Open a new group for the sentence that is starting.
            self.attribute_values.append([])
        elif elem.tag == self.TAG_TGT:
            # list() guarantees a real, mutable list on every Python
            # version; the positional pops done afterwards need one.
            values = list(elem.attrib.values())
            self.attribute_values[-1].append(values)
            # Defined outside this view; called right after each append so
            # it can act on the list that was just added.
            self.remove_not_numbers()
44
45
46
47
48
49
# Drop nine fixed positions from the most recently appended value list, i.e.
# the last entry of the last group in self.attribute_values.
# The positions are visited from highest to lowest so that each removal leaves
# the positions that are still to be removed unchanged.
# NOTE(review): presumably these positions hold the non-numeric attributes -
# confirm against the input format; the enclosing definition is not visible.
latest_values = self.attribute_values[-1][-1]
for position in (24, 23, 22, 19, 11, 9, 8, 6, 1):
    latest_values.pop(position)
60
61
'''
TODO (Python 2.7, corresponding shogun libraries, ...)
'''
68
69 from shogun.Features import StringCharFeatures, DNA, BinaryLabels
70 from shogun.Kernel import WeightedDegreeStringKernel
71 from shogun.Classifier import SVMLight
72
73 feats_train=StringCharFeatures(DNA)
74 feats_train.set_features(fm_train_dna)
75 feats_test=StringCharFeatures(DNA)
76 feats_test.set_features(fm_test_dna)
77
78 kernel=WeightedDegreeStringKernel(feats_train, feats_train, degree)
79
80 labels=BinaryLabels(label_train_dna)
81
82 svm=SVMLight(C, kernel, labels)
83 svm.set_qpsize(3)
84 svm.set_linear_term(-numpy.array([1,2,3,4,5,6,7,8,7,6], dtype=numpy.double));
85 svm.set_epsilon(epsilon)
86 svm.parallel.set_num_threads(num_threads)
87 svm.train()
88
89 kernel.init(feats_train, feats_test)
90 out = svm.apply().get_labels()
91 return out,kernel
92
# Instantiate Jcml2Shogun with a hard-coded input file name.
# NOTE(review): this appears to be a module-level statement, so it would run
# whenever the file is executed or imported - confirm. The class definition
# is not visible here, so what the constructor does with the path is unknown.
Jcml2Shogun('wmt08.if.partial.jcml')
94