Package dataprocessor :: Package sax :: Module jcml2Shogun
[hide private]
[frames | no frames]

Source Code for Module dataprocessor.sax.jcml2Shogun

 1  ''' 
 2  Created on Sep 17, 2012 
 3   
 4  @author: jogin 
 5  ''' 
 6  from xml.etree.cElementTree import iterparse 
 7  import numpy 
 8   
 9  ''' 
10  First, this class extracts the attribute values of tgt sentences and saves them into a list. 
11  Second, the extracted values are intended as input for the SVMlight classifier.  
12  ''' 
class Jcml2Shogun():
    """Collect ``tgt`` attribute values from a JCML file for an SVMlight run.

    Constructing an instance parses ``input_xml_filename`` immediately and
    fills ``attribute_values`` with one list per ``judgedsentence`` element;
    each of those lists holds one list of attribute values per ``tgt``
    element found inside that sentence.
    """

    # Positions dropped from every ``tgt`` value list (the original code
    # labelled them as infinities or strings).  They are kept in descending
    # order so that removing one entry never shifts a position that is still
    # waiting to be removed.
    _NON_NUMERIC_POSITIONS = (24, 23, 22, 19, 11, 9, 8, 6, 1)

    def __init__(self, input_xml_filename):
        """Remember the input path, set up the tag names and parse the file.

        :param input_xml_filename: path of the JCML (XML) file to read.
        """
        self.input_filename = input_xml_filename
        self.TAG_SENT = 'judgedsentence'
        self.TAG_SRC = 'src'
        self.TAG_TGT = 'tgt'
        self.TAG_DOC = 'jcml'
        self.attribute_values = []

        self.get_jcml_attribute_values()

    def get_jcml_attribute_values(self):
        """Stream through the input file and populate ``attribute_values``.

        The root element is printed once.  Every ``judgedsentence`` start tag
        opens a new (empty) sentence list and every ``tgt`` start tag appends
        its filtered attribute values to the most recent sentence list.
        """
        # ``with`` guarantees the handle is released even if parsing fails.
        with open(self.input_filename, "r") as source_xml_file:
            context = iter(iterparse(source_xml_file,
                                     events=("start", "end")))
            # The first event is consumed here, so the root element itself is
            # never matched against the sentence/target tags below.
            event, root = next(context)
            print(root)

            for event, elem in context:
                if event != "start":
                    continue
                if elem.tag == self.TAG_SENT:
                    # create new list
                    self.attribute_values.append([])
                elif elem.tag == self.TAG_TGT:
                    # Copy into a real list: on Python 3 ``values()`` is a
                    # view without ``pop``, which remove_not_numbers needs.
                    self.attribute_values[-1].append(
                        list(elem.attrib.values()))
                    # remove infinities and strings
                    self.remove_not_numbers()

    def remove_not_numbers(self):
        """Drop the fixed non-numeric positions from the newest value list.

        Works in place on ``self.attribute_values[-1][-1]``.

        :raises IndexError: if that list is too short to contain one of the
            positions (same as the original sequence of ``pop`` calls).
        """
        newest_values = self.attribute_values[-1][-1]
        for position in self._NON_NUMERIC_POSITIONS:
            newest_values.pop(position)

    def classifier_svmlight_linear_term_modular(self, fm_train_dna=None,
                                                fm_test_dna=None,
                                                label_train_dna=None,
                                                degree=3, C=10, epsilon=1e-5,
                                                num_threads=1):
        """Train an SVMLight classifier with a linear term and apply it.

        TODO (Python 2.7, shogun corresponding libraries, ...)

        The three data arguments used to default to module-level names that
        are not defined in this module, which made the class statement itself
        fail with ``NameError``.  They now default to ``None`` and must be
        supplied by the caller.

        :param fm_train_dna: training data handed to
            ``StringCharFeatures(DNA).set_features``.
        :param fm_test_dna: test data handed to
            ``StringCharFeatures(DNA).set_features``.
        :param label_train_dna: training labels handed to ``BinaryLabels``.
        :param degree: degree passed to ``WeightedDegreeStringKernel``.
        :param C: first argument of the ``SVMLight`` constructor.
        :param epsilon: value passed to ``svm.set_epsilon``.
        :param num_threads: value passed to ``svm.parallel.set_num_threads``.
        :returns: tuple ``(out, kernel)`` where ``out`` is
            ``svm.apply().get_labels()``.
        :raises ValueError: if any of the three data arguments is missing.
        """
        required = (("fm_train_dna", fm_train_dna),
                    ("fm_test_dna", fm_test_dna),
                    ("label_train_dna", label_train_dna))
        missing = [name for name, value in required if value is None]
        if missing:
            raise ValueError("missing required argument(s): "
                             + ", ".join(missing))

        # Imported lazily so the parsing part of the class stays usable
        # without shogun installed.
        from shogun.Features import StringCharFeatures, DNA, BinaryLabels
        from shogun.Kernel import WeightedDegreeStringKernel
        from shogun.Classifier import SVMLight

        feats_train = StringCharFeatures(DNA)
        feats_train.set_features(fm_train_dna)
        feats_test = StringCharFeatures(DNA)
        feats_test.set_features(fm_test_dna)

        kernel = WeightedDegreeStringKernel(feats_train, feats_train, degree)

        labels = BinaryLabels(label_train_dna)

        svm = SVMLight(C, kernel, labels)
        svm.set_qpsize(3)
        # NOTE(review): the linear term has ten hard-coded entries, which
        # presumably ties this method to ten training examples -- confirm.
        svm.set_linear_term(
            -numpy.array([1, 2, 3, 4, 5, 6, 7, 8, 7, 6], dtype=numpy.double))
        svm.set_epsilon(epsilon)
        svm.parallel.set_num_threads(num_threads)
        svm.train()

        kernel.init(feats_train, feats_test)
        out = svm.apply().get_labels()
        return out, kernel
# Module-level driver: building the object parses the partial WMT08 JCML file
# right away (the constructor calls get_jcml_attribute_values itself).
Jcml2Shogun(input_xml_filename='wmt08.if.partial.jcml')