ro.ranking.technique.bm25.BM25FTermScorer.java Source code

Java tutorial

Introduction

Here is the source code for ro.ranking.technique.bm25.BM25FTermScorer.java

Source

package ro.ranking.technique.bm25;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.search.TermQuery;

/**
 * Scores a single query term over several fields with the BM25F ranking
 * function.
 * <p>
 * One {@link TermDocs} postings cursor is opened per field for the same term
 * text. The scorer walks the union of those postings in increasing document
 * order and, for each document, combines the per-field term frequencies
 * (weighted by the field boost and normalised by the field length) before
 * applying the BM25 saturation and the IDF factor.
 * <p>
 * The saturation parameter k1, the field used for the document frequency and
 * the average field lengths are read from {@link BM25FParameters}; the
 * per-field boosts and b parameters are supplied by the caller.
 *
 * @see BM25FParameters
 */
public class BM25FTermScorer extends Scorer {

    /** Postings cursor for the term in each field (parallel to {@link #fields}). */
    private final TermDocs[] termDocs;
    /** {@code true} while the matching cursor is positioned on a valid document. */
    private final boolean[] termDocsNext;
    /** Names of the fields the term is searched in. */
    private final String[] fields;
    /** Boost applied to each field's contribution. */
    private final float[] boosts;
    /** BM25F length-normalisation parameter b for each field. */
    private final float[] bParam;
    /** Encoded norms of each field, indexed by document id. */
    private final byte[][] norms;
    /** Average length of each field as reported by {@link BM25FParameters}. */
    private final float[] averageLengths;
    /** BM25 saturation parameter k1. */
    private final float K1;
    /** Number of fields. */
    private final int len;
    /** Boost of the query term itself. */
    private final float termBoost;
    /** Inverse document frequency of the term. */
    private float idf;
    /** Current document; {@code Integer.MAX_VALUE} until the first document is found. */
    private int doc = Integer.MAX_VALUE;
    /** Whether the cursors have been positioned on their first document yet. */
    private boolean initialized = false;

    /**
     * Creates a scorer for {@code term} over the given fields.
     *
     * @param reader     index to read postings, norms and statistics from
     * @param term       query term; only its text and boost are used, the
     *                   field is replaced by each entry of {@code fields}
     * @param fields     fields to search the term text in
     * @param boosts     boost of each field, parallel to {@code fields}
     * @param bParams    b parameter of each field, parallel to {@code fields}
     * @param similarity similarity used to compute the IDF and decode norms
     * @throws IllegalStateException if the index cannot be read; the
     *                               underlying {@link IOException} is the cause
     */
    public BM25FTermScorer(IndexReader reader, TermQuery term, String[] fields, float[] boosts, float[] bParams,
            Similarity similarity) {
        super(similarity);
        this.fields = fields;
        this.boosts = boosts;
        this.bParam = bParams;
        this.len = fields.length;
        this.termDocs = new TermDocs[len];
        this.termDocsNext = new boolean[len];
        this.norms = new byte[len][];
        this.averageLengths = new float[len];
        this.K1 = BM25FParameters.getK1();
        this.termBoost = term.getBoost();

        final String termText = term.getTerm().text();
        final int numDocs = reader.numDocs();

        try {
            // The document frequency is taken from a single dedicated field.
            final int docFreq = reader.docFreq(new Term(BM25FParameters.getIdfField(), termText));
            for (int i = 0; i < len; i++) {
                final String field = this.fields[i];
                this.termDocs[i] = reader.termDocs(new Term(field, termText));
                this.norms[i] = reader.norms(field);
                this.averageLengths[i] = BM25FParameters.getAverageLength(field);
            }
            this.idf = this.getSimilarity().idf(docFreq, numDocs);
        } catch (IOException e) {
            // A half-built scorer would only fail later with a NullPointerException;
            // fail here instead and keep the real cause.
            throw new IllegalStateException("Unable to read postings for term '" + termText + "'", e);
        }
    }

    /*
     * (non-Javadoc)
     *
     * @see org.apache.lucene.search.Scorer#docID()
     */
    @Override
    public int docID() {
        return this.doc;
    }

    /**
     * Positions every cursor on its first document and sets the current
     * document to the smallest of them.
     *
     * @return {@code true} if at least one field has a document for the term
     * @throws IOException if the postings cannot be read
     */
    private boolean init() throws IOException {
        boolean found = false;
        for (int i = 0; i < len; i++) {
            termDocsNext[i] = termDocs[i].next();
            if (termDocsNext[i] && termDocs[i].doc() < doc) {
                found = true;
                doc = termDocs[i].doc();
            }
        }
        return found;
    }

    /*
     * (non-Javadoc)
     *
     * @see org.apache.lucene.search.Scorer#nextDoc()
     */
    @Override
    public int nextDoc() throws IOException {
        if (!initialized) {
            initialized = true;
            return init() ? doc : NO_MORE_DOCS;
        }

        int min = NO_MORE_DOCS;
        for (int i = 0; i < len; i++) {
            // Step only the cursors sitting on the document just consumed.
            if (termDocsNext[i] && termDocs[i].doc() == doc) {
                termDocsNext[i] = termDocs[i].next();
            }
            // The next document is the smallest among the live cursors.
            if (termDocsNext[i] && termDocs[i].doc() < min) {
                min = termDocs[i].doc();
            }
        }
        doc = min;
        return doc;
    }

    /**
     * Computes the BM25F score of the current document.
     * <p>
     * For every field whose cursor is on the current document the weighted
     * term frequency
     * {@code termBoost * boost * freq / ((1 - b) + b * length / avgLength)}
     * is accumulated; the sum {@code w} is then turned into
     * {@code idf * w / (k1 + w)}.
     *
     * @return the score of the document returned by {@link #docID()}
     * @throws IOException if the postings cannot be read
     */
    @Override
    public float score() throws IOException {
        float acum = 0f;

        for (int i = 0; i < len; i++) {
            // Skip exhausted or not-yet-positioned cursors: their doc() value is
            // stale and must not be matched against the current document.
            if (termDocsNext[i] && termDocs[i].doc() == doc) {
                float avgLength = averageLengths[i];
                float fieldNorm = this.getSimilarity().decodeNormValue(norms[i][this.docID()]);
                // Recovers the field length from the norm; presumably relies on
                // the norm being 1/sqrt(length) - confirm against the Similarity
                // used at index time.
                float length = 1 / (fieldNorm * fieldNorm);

                float normalizer = bParam[i] * length / avgLength;
                normalizer += (1 - bParam[i]);

                acum += (termBoost * boosts[i] * termDocs[i].freq()) / normalizer;
            }
        }

        acum /= (K1 + acum);
        acum *= idf;
        return acum;
    }

    /**
     * Moves to the first document after the current one whose id is greater
     * than or equal to {@code target}, by repeatedly calling
     * {@link #nextDoc()}.
     *
     * @param target smallest acceptable document id
     * @return the new current document, or {@code NO_MORE_DOCS} if none is left
     * @throws IOException if the postings cannot be read
     */
    @Override
    public int advance(int target) throws IOException {
        if (target == NO_MORE_DOCS) {
            // Keep docID() consistent with the value returned here.
            doc = NO_MORE_DOCS;
            return NO_MORE_DOCS;
        }

        int current;
        do {
            current = nextDoc();
            // NO_MORE_DOCS is Integer.MAX_VALUE, so the loop also stops on exhaustion.
        } while (current < target);

        return current;
    }
}