/*
 * Decompiled with CFR 0.152.
 */
package com.wcohen.ss;

import com.wcohen.ss.AbstractStatisticalTokenDistance;
import com.wcohen.ss.BagOfTokens;
import com.wcohen.ss.BasicStringWrapper;
import com.wcohen.ss.BasicStringWrapperIterator;
import com.wcohen.ss.api.StringWrapper;
import com.wcohen.ss.api.Token;
import com.wcohen.ss.api.Tokenizer;
import com.wcohen.ss.tokens.NGramTokenizer;
import com.wcohen.ss.tokens.SimpleTokenizer;
import java.util.ArrayList;
import java.util.Iterator;

public class TFIDF
extends AbstractStatisticalTokenDistance {
    public TFIDF(Tokenizer tokenizer) {
        super(tokenizer);
    }

    public TFIDF() {
    }

    public double score(StringWrapper s, StringWrapper t) {
        this.checkTrainingHasHappened(s, t);
        UnitVector sBag = this.asUnitVector(s);
        UnitVector tBag = this.asUnitVector(t);
        double sim = 0.0;
        Iterator i = sBag.tokenIterator();
        while (i.hasNext()) {
            Token tok = (Token)i.next();
            if (!tBag.contains(tok)) continue;
            double tfidf = sBag.getWeight(tok) * tBag.getWeight(tok);
            sim += tfidf;
            logger.debug((Object)("Common tokens: " + tok.getValue() + ":\t\t " + sBag.getWeight(tok) + " " + tBag.getWeight(tok) + " tfidf (normalized) = " + tfidf));
        }
        return sim;
    }

    protected UnitVector asUnitVector(StringWrapper w) {
        if (w instanceof UnitVector) {
            return (UnitVector)w;
        }
        if (w instanceof BagOfTokens) {
            return new UnitVector((BagOfTokens)w);
        }
        return new UnitVector(w.unwrap(), this.tokenizer.tokenize(w.unwrap()));
    }

    public StringWrapper prepare(String s) {
        return new UnitVector(s, this.tokenizer.tokenize(s));
    }

    public String explainScore(StringWrapper s, StringWrapper t) {
        BagOfTokens sBag = (BagOfTokens)s;
        BagOfTokens tBag = (BagOfTokens)t;
        StringBuffer buf = new StringBuffer("");
        buf.append("Common tokens: ");
        Iterator i = sBag.tokenIterator();
        while (i.hasNext()) {
            Token tok = (Token)i.next();
            if (!tBag.contains(tok)) continue;
            buf.append(" " + tok.getValue() + ": ");
            buf.append(sBag.getWeight(tok));
            buf.append("*");
            buf.append(tBag.getWeight(tok));
        }
        buf.append("\nscore = " + this.score(s, t));
        return buf.toString();
    }

    public String toString() {
        return "[TFIDF]";
    }

    public static void main(String[] argv) {
        TFIDF tfidf = new TFIDF(SimpleTokenizer.DEFAULT_TOKENIZER);
        String str1 = "service hotel city locator";
        String str2 = "service country capital";
        String str3 = "country hospital finder";
        ArrayList<BasicStringWrapper> corpus = new ArrayList<BasicStringWrapper>();
        String[] words = new String[]{str1, str2, str3};
        int i = 0;
        while (i < words.length) {
            corpus.add(new BasicStringWrapper(words[i]));
            ++i;
        }
        BasicStringWrapperIterator iterator = new BasicStringWrapperIterator(corpus.iterator());
        tfidf.train(iterator);
        System.out.println("### Preparing string ###");
        StringWrapper w1 = tfidf.prepare(str1);
        System.out.println("### Preparing string ###");
        StringWrapper w2 = tfidf.prepare(str2);
        tfidf.score(w1, w2);
        System.out.println(tfidf.explainScore(w1, w2));
        str1 = "Programming";
        str2 = "Semantics";
        NGramTokenizer tokenizer = new NGramTokenizer(1, 1, false, SimpleTokenizer.DEFAULT_TOKENIZER);
        tfidf = new TFIDF(tokenizer);
        corpus = new ArrayList();
        String[] words2 = new String[]{str1, str2};
        int i2 = 0;
        while (i2 < words2.length) {
            corpus.add(new BasicStringWrapper(words2[i2]));
            ++i2;
        }
        iterator = new BasicStringWrapperIterator(corpus.iterator());
        tfidf.train(iterator);
        System.out.println("### Preparing string ###");
        w1 = tfidf.prepare(str1);
        System.out.println("### Preparing string ###");
        w2 = tfidf.prepare(str2);
        tfidf.score(w1, w2);
        System.out.println(tfidf.explainScore(w1, w2));
    }

    protected class UnitVector
    extends BagOfTokens {
        public UnitVector(String s, Token[] tokens) {
            super(s, tokens);
            this.termFreq2TFIDF();
        }

        public UnitVector(BagOfTokens bag) {
            this(bag.unwrap(), bag.getTokens());
            this.termFreq2TFIDF();
        }

        private void termFreq2TFIDF() {
            Token tok;
            double normalizer = 0.0;
            Iterator i = this.tokenIterator();
            while (i.hasNext()) {
                tok = (Token)i.next();
                if (TFIDF.this.collectionSize > 0) {
                    Integer dfInteger = (Integer)TFIDF.this.documentFrequency.get(tok);
                    double df = dfInteger == null ? 1.0 : (double)dfInteger.intValue();
                    double localWeight = Math.log(this.getWeight(tok) + 1.0);
                    double globalWeight = Math.log((double)TFIDF.this.collectionSize / df);
                    double w = localWeight * globalWeight;
                    this.setWeight(tok, w);
                    logger.debug((Object)(String.valueOf(tok.getValue()) + ":\t\t tf = " + localWeight + " idf = " + globalWeight + " tfidf (unnormalized) = " + w));
                    normalizer += w * w;
                    continue;
                }
                this.setWeight(tok, 1.0);
                normalizer += 1.0;
            }
            normalizer = Math.sqrt(normalizer);
            logger.debug((Object)("normalizer: " + normalizer));
            i = this.tokenIterator();
            while (i.hasNext()) {
                tok = (Token)i.next();
                this.setWeight(tok, this.getWeight(tok) / normalizer);
            }
        }
    }
}

