featuregenerator.lengthfeaturegenerator

10 """ 11 Class that provides a feature generator able to count the number of the tokens in the given simplesentences 12 """ 13 14

def get_features_simplesentence(self, simplesentence, parallelsentence=None):
    """
    Count the whitespace-separated tokens and the characters of a sentence.

    The sentence string is stripped of leading/trailing whitespace and then
    split on runs of whitespace (no external tokenizer is used), so repeated
    spaces, tabs or newlines between words do not inflate the token count.

    @param simplesentence: The SimpleSentence whose words are to be counted
    @type simplesentence: sentence.sentence.SimpleSentence
    @param parallelsentence: Unused by this generator; accepted so the
        signature stays compatible with existing callers
    @rtype: dict
    @return: dictionary of string-valued length attributes: "l_tokens"
        (number of tokens), "l_chars" (number of characters in the stripped
        string) and "l_avgchars" (average characters per token, formatted
        with 3 significant digits; "0.0" for an empty sentence)
    """
    sent_string = simplesentence.get_string().strip()

    # split() without an argument collapses consecutive whitespace and returns
    # an empty list for an empty string; split(' ') would instead report one
    # token for "" and count every extra space as an additional token.
    tokens = len(sent_string.split())
    chars = len(sent_string)

    # An empty sentence has zero tokens: report an average of 0.0 rather than
    # dividing by zero. float() keeps true division on Python 2 as well.
    avg_chars = float(chars) / tokens if tokens else 0.0

    return {
        "l_tokens": str(tokens),
        "l_chars": str(chars),
        "l_avgchars": "{:.3}".format(avg_chars),
    }

Source Code for Module featuregenerator.lengthfeaturegenerator