Package featuregenerator :: Package parser :: Package berkeley :: Module cfgrules
[hide private]
[frames | no frames]

Source Code for Module featuregenerator.parser.berkeley.cfgrules

  1  ''' 
  2  Created on Jul 13, 2014 
  3   
  4  @author: Eleftherios Avramidis 
  5  ''' 
  6  import logging 
  7  from featuregenerator.featuregenerator import FeatureGenerator 
  8   
class Rule:
    '''
    One CFG production: a head label (LHS) and the ordered list of
    the labels of its children (RHS)
    '''

    def __init__(self):
        '''
        Start with no children and no head label
        '''
        self.rhs = []
        self.lhs = None

    def __str__(self):
        '''
        Render the rule as "LHS_child1-child2-..." and spell out the
        symbols listed below as words
        @return: the rule as one string
        @rtype: str
        '''
        #the order matters: the two-character german tags have to be
        #rewritten before the single characters they contain
        substitutions = (
            ("$,", "COMMA"),  #german grammar
            (",", "COMMA"),
            ("$.", "DOT"),  #german grammar
            (".", "DOT"),
            (";", "DOT"),
            ("$", "DLR"),
            ("*", "_"),
            (":", "PUNCT"),
        )
        children = "-".join(self.rhs)
        rendered = "{}_{}".format(self.lhs, children)
        for symbol, name in substitutions:
            rendered = rendered.replace(symbol, name)
        return rendered

def get_cfg_rules(string, terminals=False):
    '''
    Parse the bracketed format from a Berkeley PCFG parse
    and extract the CFG rules included
    @param string: the parse in a bracketed format; an empty string
    or None yields no rules
    @type string: str
    @param terminals: if True, the rules that gathered no children
    (the nodes directly above a word) are delivered as well; their RHS
    stays empty, because the word itself is never recorded
    @type terminals: bool
    @return: the CFG rules, in the order their closing brackets are met
    @rtype: [Rule]
    '''
    if not string:
        return []

    #a stack stores the rules met upper on the tree and may
    #have remained incomplete
    stack = []

    #the label gathers the characters of the labels as they appear
    #one by one
    label = []

    #as we go, we keep track of the previous (unfinished) rule
    previousrule = Rule()

    #the rule opened by the latest bracket, as long as it is still
    #waiting for its label; None when no bracket is pending
    nextrule = None

    #the rules that are ready get in this list
    rules = []

    #get characters one by one (to catch-up with brackets)
    prevchar = None
    for char in string:
        logging.debug(char)

        #opening bracket initiates a rule (remains open)
        if char == "(":
            nextrule = Rule()
            label = []

        #space indicates the label is finished and can be
        #attached to the previous rule as a child (RHS)
        #and the new rule as a head (LHS);
        #this only applies when a bracket is pending: a space met
        #elsewhere (before the first bracket, doubled, or inside a
        #leaf) is handled as an ordinary character further down
        elif char == " " and prevchar != ")" and nextrule is not None:
            labelstr = "".join(label)
            previousrule.rhs.append(labelstr)
            nextrule.lhs = labelstr
            #arguments are passed separately, so that the rule is
            #only rendered when debug logging is enabled
            logging.debug("Next rule: %s", nextrule)

            #previous rule from upper nodes goes to the stack
            stack.append(previousrule)
            logging.debug("Stacking previous rule: %s", previousrule)
            #and next rule (still open) becomes current rule
            #waiting for children
            previousrule = nextrule
            #the pending rule is consumed; keeping the reference would
            #let a later space overwrite the label of the current rule
            nextrule = None
            label = []

        #closing bracket indicates that a rule is closed/ready
        #and can be delivered
        elif char == ")" and stack:

            #deliver rule but maybe exclude leaves
            if previousrule.rhs or terminals:
                rules.append(previousrule)
                logging.debug("Previous rule getting stored: %s", previousrule)

            #we need to pop the rule from the node above, because
            #it may get more RHS in the next loop
            previousrule = stack.pop()
            logging.debug("Popping previousrule: %s", previousrule)
            label = []

        #get characters for the label
        else:
            label.append(char)
        logging.debug("---")

        #remember the previous character, to consider space after
        #closing bracket
        prevchar = char
    return rules


class CfgRulesExtractor(FeatureGenerator):
    '''
    Handle the extraction of features out of CFG rules
    '''

    def __init__(self, params):
        '''
        Constructor
        @param params: not used by this constructor
        '''
        pass

    def get_features_simplesentence(self, simplesentence, parallelsentence):
        '''
        Count the CFG rules appearing in the parse
        @param simplesentence: the sentence whose "berkeley-tree"
        attribute holds the bracketed parse
        @param parallelsentence: not used by this method
        @return: the string form of every CFG rule found, mapped to the
        number of times it occurs; empty if the parse cannot be read
        @rtype: {str: int}
        '''
        try:
            parsestring = simplesentence.get_attribute("berkeley-tree")
        except Exception as error:
            #best effort: a sentence without a readable parse just
            #contributes no features
            logging.warning("error reading berkeley tree: %s", error)
            return {}

        atts = {}
        for rule in get_cfg_rules(parsestring):
            #count by the string form; the rule objects themselves are
            #all distinct dictionary keys, even for the same production
            name = str(rule)
            atts[name] = atts.get(name, 0) + 1
        return atts
