Package featuregenerator :: Module lengthfeaturegenerator
[hide private]
[frames | no frames]

Source Code for Module featuregenerator.lengthfeaturegenerator

 1  """ 
 2   
 3  @author: Eleftherios Avramidis 
 4  """ 
 5  from featuregenerator import FeatureGenerator 
 6  from nltk.tokenize.punkt import PunktWordTokenizer 
 7   
 8   
class LengthFeatureGenerator(FeatureGenerator):
    """
    Feature generator that counts the tokens and characters of the given
    simplesentences.
    """

    def get_features_simplesentence(self, simplesentence, parallelsentence=None):
        """
        Count the whitespace-separated tokens and the characters of the given
        simplesentence and return them as string-valued features.

        @param simplesentence: The SimpleSentence whose words are to be counted
        @type simplesentence: sentence.sentence.SimpleSentence
        @param parallelsentence: Not used by this generator
        @rtype: dict
        @return: dictionary containing the length attributes: "l_tokens"
            (number of tokens), "l_chars" (number of characters of the
            stripped sentence string, inner whitespace included) and
            "l_avgchars" (characters per token, 3 significant digits)
        """
        sent_string = simplesentence.get_string().strip()

        # split() with no argument splits on any run of whitespace, so
        # repeated spaces or tabs do not produce empty pseudo-tokens and an
        # empty sentence yields zero tokens instead of one.
        tokens = len(sent_string.split())
        chars = len(sent_string)

        # An empty sentence has no tokens; report an average of 0.0 rather
        # than dividing by zero.
        if tokens:
            avg_chars = float(chars) / tokens
        else:
            avg_chars = 0.0

        return {
            "l_tokens": str(tokens),
            "l_chars": str(chars),
            "l_avgchars": "{:.3}".format(avg_chars),
        }
32