1 """
2
3 @author: Eleftherios Avramidis
4 """
5 from featuregenerator import FeatureGenerator
6 from nltk.tokenize.punkt import PunktWordTokenizer
7
8
10 """
11 Class that provides a feature generator able to count the tokens and characters of the given simplesentences
12 """
13
14
16 """
17 Splits the stripped string of the given simplesentence on single spaces and provides features with the number of tokens, the number of characters and the average number of characters per token
18 @param simplesentence: The SimpleSentence whose words are to be counted
19 @type simplesentence: sentence.sentence.SimpleSentence
20 @rtype: dict
21 @return: dictionary containing the length attributes l_tokens, l_chars and l_avgchars, all as strings
22 """
23 sent_string = simplesentence.get_string().strip()
24 tokens = len(sent_string.split(' '))
25 chars = len(sent_string)
26 avg_chars = 1.000 * chars / tokens
27
28 return {"l_tokens" : str(tokens),
29 "l_chars" : str(chars),
30 "l_avgchars" : "{:.3}".format(avg_chars)
31 }
32