/*
 * Decompiled with CFR 0.152.
 */
package com.wcohen.ss.tokens;

import com.wcohen.ss.api.Token;
import com.wcohen.ss.api.Tokenizer;
import com.wcohen.ss.tokens.BasicToken;
import com.wcohen.ss.tokens.SimpleTokenizer;
import java.util.ArrayList;

/**
 * Tokenizer that wraps another {@link Tokenizer} and emits character n-grams
 * of every token the wrapped tokenizer produces.
 *
 * <p>Each inner token's value is first bracketed as {@code "^" + value + "$"};
 * the n-grams are substrings of that bracketed string, with lengths ranging
 * from {@code minNGramSize} to {@code maxNGramSize} inclusive.
 */
public class NGramTokenizer
implements Tokenizer {
    private final int minNGramSize;
    private final int maxNGramSize;
    private final boolean keepOldTokens;
    private final Tokenizer innerTokenizer;

    /** Shared instance: n-grams of length 3 to 5, bracketed tokens kept, built on {@code SimpleTokenizer.DEFAULT_TOKENIZER}. */
    public static NGramTokenizer DEFAULT_TOKENIZER = new NGramTokenizer(3, 5, true, SimpleTokenizer.DEFAULT_TOKENIZER);

    /**
     * Creates an n-gram tokenizer.
     *
     * @param minNGramSize   smallest n-gram length to emit
     * @param maxNGramSize   largest n-gram length to emit (inclusive)
     * @param keepOldTokens  if true, the bracketed form {@code "^value$"} of each
     *                       inner token is emitted ahead of that token's n-grams
     * @param innerTokenizer tokenizer that splits the input into the tokens to be
     *                       broken into n-grams; also used to intern every emitted string
     */
    public NGramTokenizer(int minNGramSize, int maxNGramSize, boolean keepOldTokens, Tokenizer innerTokenizer) {
        this.minNGramSize = minNGramSize;
        this.maxNGramSize = maxNGramSize;
        this.keepOldTokens = keepOldTokens;
        this.innerTokenizer = innerTokenizer;
    }

    /**
     * Splits {@code input} with the inner tokenizer and returns the character
     * n-grams of each resulting token, in token order and, within a token, by
     * start offset and then by increasing length.
     *
     * @param input the string to tokenize
     * @return the emitted tokens; the array is allocated as {@code BasicToken[]},
     *         so this assumes the inner tokenizer only interns {@code BasicToken}
     *         instances
     */
    public Token[] tokenize(String input) {
        Token[] initialTokens = this.innerTokenizer.tokenize(input);
        ArrayList<Token> tokens = new ArrayList<Token>();
        for (int i = 0; i < initialTokens.length; ++i) {
            // Bracket the token so that n-grams can be anchored to the token boundary.
            String str = "^" + initialTokens[i].getValue() + "$";
            if (this.keepOldTokens) {
                tokens.add(this.intern(str));
            }
            int strLength = str.length();
            // Start at offset 0 so that n-grams beginning with the '^' marker are
            // produced; starting at 1 would make the prepended marker pointless.
            for (int lo = 0; lo < strLength; ++lo) {
                for (int len = this.minNGramSize; len <= this.maxNGramSize; ++len) {
                    // NOTE(review): the strict '<' means an n-gram never extends to the
                    // last character, so the trailing '$' is never part of any n-gram.
                    // Kept as-is to preserve existing output; confirm whether intended.
                    if (lo + len < strLength) {
                        tokens.add(this.innerTokenizer.intern(str.substring(lo, lo + len)));
                    }
                }
            }
        }
        return tokens.toArray(new BasicToken[tokens.size()]);
    }

    /**
     * Interns a string by delegating to the inner tokenizer.
     *
     * @param s the string to intern
     * @return the token the inner tokenizer returns for {@code s}
     */
    public Token intern(String s) {
        return this.innerTokenizer.intern(s);
    }

    /**
     * Demo entry point: tokenizes a fixed sample string into 1-grams and prints
     * each token's running number, index and value to standard output.
     *
     * @param argv ignored
     */
    public static void main(String[] argv) {
        NGramTokenizer tokenizer = new NGramTokenizer(1, 1, false, SimpleTokenizer.DEFAULT_TOKENIZER);
        int n = 0;
        Token[] tokens = tokenizer.tokenize("merlin in berlin");
        for (int j = 0; j < tokens.length; ++j) {
            System.out.println("token " + ++n + ":" + " id=" + tokens[j].getIndex() + " value: '" + tokens[j].getValue() + "'");
        }
    }
}

