it.unibz.instasearch.indexing.SearchResultDoc.java Source code

Java tutorial

Introduction

Here is the source code for it.unibz.instasearch.indexing.SearchResultDoc.java

Source

/*
 * Copyright (c) 2009 Andrejs Jermakovics.
 * 
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Eclipse Public License v1.0
 * which accompanies this distribution, and is available at
 * http://www.eclipse.org/legal/epl-v10.html
 *
 * Contributors:
 *     Andrejs Jermakovics - initial implementation
 */
package it.unibz.instasearch.indexing;

import it.unibz.instasearch.InstaSearchPlugin;

import java.io.IOException;
import java.util.Collection;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermFreqVector;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.store.Directory;
import org.eclipse.core.resources.IFile;
import org.eclipse.core.resources.IWorkspaceRoot;
import org.eclipse.core.runtime.IPath;
import org.eclipse.core.runtime.Path;

/**
 * A single search hit: the matched Lucene {@link Document}, its document id in
 * the index, the score assigned by the search, and lazily computed per-term
 * statistics (term frequency vector, tf-idf score vector, match count) for the
 * {@code CONTENTS} field.
 * <p>
 * The term vectors are read from the index only when first requested and are
 * cached on this instance afterwards.
 */
public class SearchResultDoc {

    private final Document doc;
    private final int docId;
    private final float score;
    private final Directory indexDir;

    /** Cached term frequency vector; stays {@code null} if the index returned none for this document. */
    private TermFreqVector termFreqVector;
    /** Cached tf-idf scores, parallel to {@link #termFreqVector}; {@code null} until first computed. */
    private float[] termScoreVector;
    private int matchCount;

    /**
     * @param dir   index directory used to open readers when term vectors are requested
     * @param doc   the matched document
     * @param docId id of the document in the index
     * @param score score of this hit
     */
    public SearchResultDoc(Directory dir, Document doc, int docId, float score) {
        this.indexDir = dir;
        this.docId = docId;
        this.doc = doc;
        this.score = score;
        this.matchCount = 0;
    }

    private String getFieldValue(Field field) {
        return doc.get(field.toString());
    }

    /**
     * @return value of the {@code FILE} field, or {@code null} if the document has none
     */
    public String getFilePath() {
        return getFieldValue(Field.FILE);
    }

    /**
     * @return value of the {@code NAME} field, or {@code null} if the document has none
     */
    public String getFileName() {
        return getFieldValue(Field.NAME);
    }

    /**
     * @return value of the {@code EXT} field, or {@code null} if the document has none
     */
    public String getFileExtension() {
        return getFieldValue(Field.EXT);
    }

    /**
     * Tells whether this document was indexed from inside a jar, i.e. its
     * {@code JAR} field is present, is not {@link StorageIndexer#NO_VALUE} and
     * ends with {@code .jar} (case-insensitive).
     *
     * @return {@code true} if the document refers to a jar entry
     */
    public boolean isInJar() {
        if (doc.getField(Field.JAR.toString()) == null) {
            return false;
        }

        String jarField = getFieldValue(Field.JAR);

        // The field may exist without a string value; treat that as "not in a jar"
        if (jarField == null || StorageIndexer.NO_VALUE.equals(jarField)) {
            return false;
        }

        return jarField.toLowerCase().endsWith(".jar");
    }

    /**
     * @return value of the {@code JAR} field if {@link #isInJar()}, otherwise {@code null}
     */
    public String getJarName() {
        if (isInJar()) {
            return getFieldValue(Field.JAR);
        }

        return null;
    }

    /**
     * @return the {@code PROJ} field value as a path
     */
    public IPath getProject() {
        return new Path(getFieldValue(Field.PROJ));
    }

    /**
     * @return last segment of {@link #getProject()}
     */
    public String getProjectName() {
        return getProject().lastSegment();
    }

    /**
     * Returns the cached tf-idf score vector for all terms of this document,
     * computing it on first use. The same reader is used to load the term
     * frequency vector if it has not been loaded yet, so after this method
     * returns {@link #termFreqVector} holds whatever the index provided
     * (possibly {@code null}).
     *
     * @return score vector parallel to the term frequency vector; empty if the
     *         document has no term frequency vector
     * @throws IOException if the index cannot be opened or read
     */
    private float[] getTermScoreVector() throws IOException {
        if (termScoreVector == null) {
            IndexReader reader = IndexReader.open(indexDir, true);
            try {
                if (termFreqVector == null) {
                    createFreqVect(reader);
                }

                termScoreVector = createTermScoreVector(termFreqVector, reader);
            } finally {
                // Close even when reading fails so the reader is never leaked
                reader.close();
            }
        }

        return termScoreVector;
    }

    /**
     * Reads the term frequency vector of the {@code CONTENTS} field for this
     * document from the given reader and caches it (obtained only when requested).
     */
    private void createFreqVect(IndexReader reader) throws IOException {
        termFreqVector = reader.getTermFreqVector(docId, Field.CONTENTS.toString());
    }

    /**
     * Returns a vector of given term scores (tf-idf).
     * The size of the vector is the number of terms in this document.
     * The term positions in the vector are the same as in the term frequency vector;
     * positions of terms that are not in {@code terms} are left at 0.
     *
     * @param terms terms whose scores should be filled in
     * @return TermScoreVector; empty if the document has no term frequency vector
     * @throws IOException if the index cannot be opened or read
     */
    public float[] getTermScoreVector(Collection<String> terms) throws IOException {
        // Loads both the score vector and the frequency vector with a single reader
        float[] allTermScoreVect = getTermScoreVector();
        float[] termScoreVect = new float[allTermScoreVect.length];
        TermFreqVector freqVector = termFreqVector;

        if (freqVector == null) {
            return termScoreVect; // no term vector for this document: nothing to score
        }

        for (String term : terms) {
            int idx = freqVector.indexOf(term); // does a binary search
            if (idx < 0) {
                continue;
            }
            termScoreVect[idx] = allTermScoreVect[idx];
        }

        return termScoreVect;
    }

    /**
     * Returns the tf-idf score of a single term in this document.
     *
     * @param term term text to look up in the {@code CONTENTS} field
     * @return the term's score, or 0 if the term does not occur or the document
     *         has no term frequency vector
     * @throws IOException if the index cannot be opened or read
     */
    public double getTermScore(String term) throws IOException {
        // Loads both the score vector and the frequency vector with a single reader
        float[] allTermScoreVect = getTermScoreVector();
        TermFreqVector freqVector = termFreqVector;

        if (freqVector == null) {
            return 0;
        }

        int idx = freqVector.indexOf(term); // does a binary search
        if (idx < 0) {
            return 0;
        }
        return allTermScoreVect[idx];
    }

    /**
     * Computes {@code tf * idf} for every term of the given frequency vector
     * using {@code Searcher.SIMILARITY}. The reader's {@code maxDoc()} is used
     * as the document count passed to {@code idf}.
     *
     * @param vect   term frequency vector, may be {@code null}
     * @param reader open reader used to look up document frequencies
     * @return scores in the same order as {@code vect.getTerms()}; empty if {@code vect} is {@code null}
     * @throws IOException if document frequencies cannot be read
     */
    private float[] createTermScoreVector(TermFreqVector vect, IndexReader reader) throws IOException {
        if (vect == null) {
            return new float[0];
        }

        int[] termFrequencies = vect.getTermFrequencies();
        String[] terms = vect.getTerms();
        float[] scores = new float[terms.length];

        int numDocs = reader.maxDoc();
        Similarity sim = Searcher.SIMILARITY;
        String contentsField = Field.CONTENTS.toString();

        for (int i = 0; i < terms.length; i++) {
            Term term = new Term(contentsField, terms[i]);

            float termFreq = sim.tf(termFrequencies[i]);
            float idf = sim.idf(reader.docFreq(term), numDocs);

            scores[i] = termFreq * idf;
        }

        return scores;
    }

    /**
     * Resolves this document to a workspace file.
     *
     * @return the file, or {@code null} if the document is inside a jar
     */
    public IFile getFile() {
        if (isInJar()) {
            return null;
        }

        Path path = new Path(getFilePath());
        IWorkspaceRoot workspaceRoot = InstaSearchPlugin.getWorkspaceRoot();
        IFile file = workspaceRoot.getFile(path);

        // Fall back to treating the path as a file system location
        if (file == null || file.getRawLocation() == null) {
            file = workspaceRoot.getFileForLocation(path);
        }

        return file;
    }

    /**
     * @return the score
     */
    public float getScore() {
        return score;
    }

    /**
     * @return the doc
     */
    public Document getDoc() {
        return doc;
    }

    /**
     * @return the docId
     */
    public int getDocId() {
        return docId;
    }

    /**
     * @return the matchCount; 0 until {@link #computeMatchCount(IndexReader, Collection)} has stored a value
     */
    public int getMatchCount() {
        return matchCount;
    }

    /**
     * Computes match count as SUM( tf ) of all query terms in the document.
     * Accesses the index thus affects performance.
     * Leaves the match count unchanged if the document has no term frequency vector.
     *
     * @param reader     open reader used to load the term frequency vector if not cached yet
     * @param queryTerms terms whose frequencies are summed
     * @throws IOException if the term frequency vector cannot be read
     */
    public void computeMatchCount(IndexReader reader, Collection<String> queryTerms) throws IOException {
        if (termFreqVector == null) {
            createFreqVect(reader);
        }

        if (termFreqVector == null) {
            return;
        }

        int[] freqs = termFreqVector.getTermFrequencies();
        int freqSum = 0;

        for (String term : queryTerms) {
            int idx = termFreqVector.indexOf(term); // does a binary search
            if (idx < 0) {
                continue;
            }
            freqSum += freqs[idx];
        }

        matchCount = freqSum;
    }

    @Override
    public String toString() {
        return getFilePath();
    }
}