org.tallison.lucene.contrast.QueryToCorpusContraster.java Source code

Introduction

Here is the source code for org.tallison.lucene.contrast.QueryToCorpusContraster.java. The class runs a query against a Lucene index and ranks the terms that occur in the matching documents by TF-IDF against the corpus as a whole, so that the terms which best distinguish the result set rise to the top. Per-document terms are read from stored term vectors when available; otherwise the stored field values are re-analyzed with a caller-supplied Analyzer. A usage sketch follows the listing.

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.tallison.lucene.contrast;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArrayMap;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.Version;
import org.apache.lucene.util.mutable.MutableValueInt;
import org.tallison.lucene.corpus.stats.IDFIndexCalc;
import org.tallison.lucene.corpus.stats.TFIDFPriorityQueue;
import org.tallison.lucene.corpus.stats.TermIDF;

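/**
 * Contrasts the documents that match a query against the corpus as a
 * whole: the terms that occur in the matching documents are ranked by
 * TF-IDF so that the most distinctive terms are returned first.
 */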
public class QueryToCorpusContraster {

    private final int maxDocs;
    private final IndexSearcher searcher;
    private final Version version;
    private boolean ignoreCase = true;
    //used only if a field lacks term vectors; see setAnalyzer(Analyzer, int)
    private Analyzer analyzer = null;
    //maximum number of tokens to analyze per field value; < 0 means no limit
    private int maxTokens = 10000;

    //ignore a term unless it appears in at least this many of the matching docs
    private int minTermFreq = 10;

    public QueryToCorpusContraster(Version version, IndexSearcher searcher, int maxDocs) {
        this.searcher = searcher;
        this.maxDocs = maxDocs;
        this.version = version;
    }

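    /**
     * Searches for the top {@code maxDocs} hits for {@code query} and
     * returns up to {@code numResults} terms from those documents,
     * ranked by TF-IDF against the corpus.
     *
     * @param query      query to run against the index
     * @param fieldName  field whose terms should be contrasted
     * @param numResults maximum number of terms to return
     * @return terms ranked by TF-IDF, most distinctive first
     * @throws IOException on error reading the index
     */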
    public List<TermIDF> contrast(Query query, String fieldName, int numResults) throws IOException {
        TopScoreDocCollector results = TopScoreDocCollector.create(maxDocs, maxDocs + 10000);
        searcher.search(query, results);

        ScoreDoc[] scoreDocs = results.topDocs().scoreDocs;
        //if fewer docs matched than minTermFreq, no term can
        //meet the threshold; return an empty list now
        if (scoreDocs.length < minTermFreq) {
            return new ArrayList<TermIDF>();
        }

        //rough initial capacity: assume ~100 unique terms per result doc
        int initialSize = scoreDocs.length * 100;
        CharArrayMap<MutableValueInt> map = new CharArrayMap<MutableValueInt>(initialSize, ignoreCase);
        CharArraySet tmpSet = new CharArraySet(100, ignoreCase);
        Set<String> selector = new HashSet<String>();
        selector.add(fieldName);

        for (ScoreDoc scoreDoc : scoreDocs) {
            //get terms from doc
            processDoc(scoreDoc.doc, fieldName, selector, tmpSet);
            //now update global doc freqs
            Iterator<Object> it = tmpSet.iterator();
            while (it.hasNext()) {
                char[] token = (char[]) it.next();
                MutableValueInt docCount = map.get(token, 0, token.length);
                if (docCount == null) {
                    docCount = new MutableValueInt();
                    docCount.value = 1;
                    //first sighting: add the counter to the map
                    map.put(token, docCount);
                } else {
                    //the map already holds this counter; just increment it
                    docCount.value++;
                }
            }
            tmpSet.clear();
        }

        return getResults(fieldName, map, numResults);
    }

    private List<TermIDF> getResults(String fieldName, CharArrayMap<MutableValueInt> map, int numResults) {
        TFIDFPriorityQueue queue = new TFIDFPriorityQueue(numResults);
        IDFIndexCalc idfCalc = new IDFIndexCalc(searcher.getIndexReader());
        int tf = -1;
        double idf = -1.0;
        int minTf = minTermFreq;
        String text = null;
        //TODO: reuse a single Term instance here to cut per-entry allocation
        for (Map.Entry<Object, MutableValueInt> entry : map.entrySet()) {

            tf = entry.getValue().value;
            if (tf < minTf)
                continue;

            text = new String((char[]) entry.getKey());
            //calculate the corpus-wide idf for this term
            try {
                idf = idfCalc.singleTermIDF(new Term(fieldName, text));
            } catch (IOException e) {
                throw new RuntimeException("Error trying to calculate IDF", e);
            }
            int estimatedDF = (int) Math.max(1, Math.round(idfCalc.unIDF(idf)));

            TermIDF r = new TermIDF(text, estimatedDF, tf, idf);

            queue.insertWithOverflow(r);
        }
        List<TermIDF> results = new LinkedList<TermIDF>();

        //pop() returns the lowest-ranked term first, so prepend to put best first
        while (queue.size() > 0) {
            results.add(0, queue.pop());
        }
        return results;
    }

    /**
     * Collects the unique terms for a single document into {@code set},
     * reading the stored term vector when available and otherwise
     * re-analyzing the document's stored field values.
     */
    private void processDoc(int docid, String fieldName, Set<String> selector, CharArraySet set)
            throws IOException {
        Terms terms = searcher.getIndexReader().getTermVector(docid, fieldName);
        if (terms != null) {
            TermsEnum te = terms.iterator();
            BytesRef bytes = te.next();
            while (bytes != null) {
                //store the term text (adding the raw BytesRef would store
                //its toString() form, not the term), then advance the enum
                set.add(bytes.utf8ToString());
                bytes = te.next();
            }
        } else if (analyzer != null) {
            Document document = searcher.doc(docid, selector);
            IndexableField[] fields = document.getFields(fieldName);
            if (fields == null) {
                return;
            }
            for (IndexableField field : fields) {
                String s = field.stringValue();
                //stringValue() is null for binary and other non-string fields
                if (s == null) {
                    continue;
                }
                processFieldEntry(fieldName, s, set);
            }

        } else {
            throw new IllegalArgumentException(
                    "The field must have a term vector or the analyzer must not be null.");
        }
    }

    private void processFieldEntry(String fieldName, String s, CharArraySet set) throws IOException {
        TokenStream ts = analyzer.tokenStream(fieldName, s);
        CharTermAttribute cattr = ts.getAttribute(CharTermAttribute.class);
        ts.reset();
        int tokensSeen = 0;
        while (ts.incrementToken()) {
            set.add(cattr.toString());
            //enforce the cap documented on setAnalyzer; < 0 means analyze all tokens
            if (maxTokens >= 0 && ++tokensSeen >= maxTokens) {
                break;
            }
        }
        ts.end();
        ts.close();
    }

    /**
     * Sets the analyzer to be used if term vectors are not stored.
     *
     * @param analyzer  analyzer to be used if term vectors are not stored
     * @param maxTokens maximum number of tokens to analyze. If &lt; 0,
     *                  all tokens will be analyzed.
     */
    public void setAnalyzer(Analyzer analyzer, int maxTokens) {
        this.analyzer = analyzer;
        this.maxTokens = maxTokens;
    }
}
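
Usage

Here is a minimal usage sketch. It is not part of the original source: the index path, the "body" field, the query term, and the choice of StandardAnalyzer are placeholder assumptions, and exact constructor and collector signatures vary somewhat across Lucene versions. The sketch opens a searcher, contrasts the top 1000 hits for a term query against the corpus, and prints the 20 most distinctive terms.

import java.nio.file.Paths;
import java.util.List;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.tallison.lucene.contrast.QueryToCorpusContraster;
import org.tallison.lucene.corpus.stats.TermIDF;

public class ContrasterExample {
    public static void main(String[] args) throws Exception {
        //open a searcher over an existing index (the path is a placeholder)
        try (DirectoryReader reader =
                     DirectoryReader.open(FSDirectory.open(Paths.get("/path/to/index")))) {
            IndexSearcher searcher = new IndexSearcher(reader);

            //contrast the top 1000 hits against the corpus
            QueryToCorpusContraster contraster =
                    new QueryToCorpusContraster(Version.LATEST, searcher, 1000);
            //fall back to re-analysis if the field has no term vector;
            //cap analysis at 10000 tokens per field value
            contraster.setAnalyzer(new StandardAnalyzer(), 10000);

            List<TermIDF> terms = contraster.contrast(
                    new TermQuery(new Term("body", "apache")), "body", 20);
            for (TermIDF termIDF : terms) {
                System.out.println(termIDF);
            }
        }
    }
}

Note that contrast(...) returns an empty list whenever fewer documents match than minTermFreq (default 10), since no term could then meet the frequency threshold; very selective queries may therefore yield no terms.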