'''
Created on Jul 13, 2014

@author: Eleftherios Avramidis
'''
import logging

from featuregenerator.featuregenerator import FeatureGenerator

# NOTE(review): the class and method header lines are missing from the damaged
# listing. The names `Rule`, `__init__` and `__str__` are reconstructed from
# how the rest of the file uses the class (`Rule()` with no arguments, and
# instances rendered through "{}".format) -- confirm against the original file.
class Rule(object):
    '''
    A single CFG production: one left-hand-side label (lhs) and the
    ordered list of right-hand-side labels (rhs)
    '''

    # Ordered (text, replacement) pairs applied by __str__.
    # The order matters: the two-character tags "$," and "$." have to be
    # rewritten before the single-character rules for ",", "." and "$"
    # would otherwise split them.
    _REPLACEMENTS = (
        ("$,", "COMMA"),
        (",", "COMMA"),
        ("$.", "DOT"),
        (".", "DOT"),
        (";", "DOT"),
        ("$", "DLR"),
        ("*", "_"),
        (":", "PUNCT"),
    )

    def __init__(self):
        '''
        Create an empty rule, with no left-hand side and an empty
        right-hand side
        '''
        self.lhs = None
        self.rhs = []

    def __str__(self):
        '''
        Render the rule as "LHS_RHS1-RHS2-...", with the characters
        , . ; $ * : replaced by COMMA, DOT, DOT, DLR, _ and PUNCT
        @return: the string form of the rule
        @rtype: str
        '''
        string = "{}_{}".format(self.lhs, "-".join(self.rhs))
        for text, replacement in self._REPLACEMENTS:
            string = string.replace(text, replacement)
        return string

# NOTE(review): the signature line is missing from the damaged listing. The
# function name is taken from the visible call get_cfg_rules(parsestring) and
# the `terminals` parameter from its use in the body; its default is presumed
# to be False because that caller passes one argument -- confirm against the
# original file.
def get_cfg_rules(string, terminals=False):
    '''
    Parse the bracketed format from a Berkeley PCFG parse
    and extract the CFG rules included
    @param string: the parse in a bracketed format
    @type string: str
    @param terminals: if true, also keep the rules whose right-hand side
    is empty (nodes that have no bracketed children)
    @type terminals: bool
    @return: the CFG rules, in the order in which their brackets close
    @rtype: list of Rule
    '''
    # rules whose closing bracket has not been reached yet, innermost last
    stack = []
    # characters of the label that is currently being read
    label = []
    # rule whose right-hand side is being filled; it starts as an unlabeled
    # root that only collects the top-level labels and is never stored
    previousrule = Rule()
    # finished rules
    rules = []

    prevchar = None
    for char in string:
        logging.debug(char)

        if char == "(":
            # an opening bracket starts a new constituent; its label follows
            nextrule = Rule()
            label = []

        elif char == " " and prevchar != ")":
            # the label just ended: it is one more child of the enclosing
            # rule and the left-hand side of the rule that was just opened
            labelstr = "".join(label)
            previousrule.rhs.append(labelstr)
            nextrule.lhs = labelstr
            logging.debug("Next rule: %s", nextrule)

            # keep the enclosing rule aside until its bracket closes
            stack.append(previousrule)
            logging.debug("Stacking previous rule: %s", previousrule)

            # descend into the new constituent
            previousrule = nextrule
            label = []

        elif char == ")" and stack:
            # the current constituent is complete; rules without children
            # are only kept when terminals are requested
            if previousrule.rhs or terminals:
                rules.append(previousrule)
                logging.debug("Previous rule getting stored: %s", previousrule)

            # go back to the enclosing constituent
            previousrule = stack.pop()
            logging.debug("Popping previousrule: %s", previousrule)
            label = []

        else:
            # any other character belongs to a label or a word; a space that
            # follows ")" also lands here and is discarded by the next bracket
            label.append(char)
        logging.debug("---")

        prevchar = char
    return rules


# NOTE(review): this section is a damaged listing. The `class ...:` line and
# both `def ...:` lines are missing, indentation is lost and every line starts
# with a leftover listing number. The class and method names cannot be
# recovered from the visible text, so the code below is left untouched.
# Presumably the class derives from the imported FeatureGenerator -- confirm
# against the original file.
104 '''
105 Handle the extraction of features out of CFG rules
106 '''
107
# Presumably the constructor body: it performs no initialisation.
109 '''
110 Constructor
111 '''
112 pass
113
# Body of the feature-extraction method: reads the "berkeley-tree" attribute
# of `simplesentence`, extracts its CFG rules with get_cfg_rules and returns a
# dict that maps each rule to the number of times it was seen.
115 '''
116 Count the CFG rules appearing in the parse
117 '''
# If the attribute cannot be read, report it and return no features.
# NOTE(review): the bare `except:` also swallows KeyboardInterrupt and
# SystemExit, and the `print` statement below is Python 2-only syntax.
118 try:
119 parsestring = simplesentence.get_attribute("berkeley-tree")
120 except:
121 print "error reading berkeley tree"
122 return {}
123 cfg_rules = get_cfg_rules(parsestring)
124 atts = {}
# setdefault seeds an unseen key with 0 before it is incremented.
# NOTE(review): the keys are the objects returned by get_cfg_rules. If those
# are Rule instances without value-based __eq__/__hash__, every key is
# distinct and each count stays at 1; presumably str(rule) was intended as
# the key -- verify.
125 for rule in cfg_rules:
126 atts[rule] = atts.setdefault(rule, 0) + 1
127 return atts
128