org.apache.lucene.search.similarities.SimilarityBase.java Source code

Introduction

Here is the source code for org.apache.lucene.search.similarities.SimilarityBase.java
Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.similarities;

import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.util.SmallFloat;

/**
 * A subclass of {@code Similarity} that provides a simplified API for its
 * descendants. Subclasses are only required to implement the {@link #score}
 * and {@link #toString()} methods. Implementing
 * {@link #explain(List, BasicStats, double, double)} is optional,
 * inasmuch as SimilarityBase already provides a basic explanation of the score
 * and the term frequency. However, implementers of a subclass are encouraged to
 * include as much detail about the scoring method as possible.
 * <p>
 * Note: multi-word queries such as phrase queries are scored in a different way
 * than Lucene's default ranking algorithm: whereas it "fakes" an IDF value for
 * the phrase as a whole (since it does not know it), this class instead scores
 * phrases as a summation of the individual term scores.
 * @lucene.experimental
 */
public abstract class SimilarityBase extends Similarity {
    /** For {@link #log2(double)}. Precomputed for efficiency reasons. */
    private static final double LOG_2 = Math.log(2);

    /**
     * True if overlap tokens (tokens with a position increment of zero) are
     * discounted from the document's length.
     */
    protected boolean discountOverlaps = true;

    /**
     * Sole constructor. (For invocation by subclass
     * constructors, typically implicit.)
     */
    public SimilarityBase() {
    }

    /** Determines whether overlap tokens (Tokens with
     *  0 position increment) are ignored when computing
     *  norm.  By default this is true, meaning overlap
     *  tokens do not count when computing norms.
     *
     *  @param v {@code true} to discount overlap tokens, {@code false} to count them
     *
     *  @lucene.experimental
     *
     *  @see #computeNorm
     */
    public void setDiscountOverlaps(boolean v) {
        discountOverlaps = v;
    }

    /**
     * Returns true if overlap tokens are discounted from the document's length.
     * @see #setDiscountOverlaps
     */
    public boolean getDiscountOverlaps() {
        return discountOverlaps;
    }

    /**
     * Builds one {@link BasicSimScorer} per entry of {@code termStats}, each backed
     * by its own {@link BasicStats} created through {@link #newStats} and populated
     * through {@link #fillBasicStats}.
     *
     * @param boost the boost handed to {@link #newStats} for every term.
     * @param collectionStats the collection-level statistics of the field.
     * @param termStats the statistics of each term to score.
     * @return the single scorer when exactly one term was given, otherwise a
     *         {@code MultiSimilarity.MultiSimScorer} wrapping all per-term scorers.
     */
    @Override
    public final SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
        SimScorer[] weights = new SimScorer[termStats.length];
        for (int i = 0; i < termStats.length; i++) {
            BasicStats stats = newStats(collectionStats.field(), boost);
            fillBasicStats(stats, collectionStats, termStats[i]);
            weights[i] = new BasicSimScorer(stats);
        }
        if (weights.length == 1) {
            // Single term: no need for the combining wrapper.
            return weights[0];
        } else {
            return new MultiSimilarity.MultiSimScorer(weights);
        }
    }

    /**
     * Factory method to return a custom stats object.
     *
     * @param field the name of the field the statistics belong to.
     * @param boost the boost to store in the statistics.
     * @return a new {@link BasicStats}; subclasses may return a specialised subtype.
     */
    protected BasicStats newStats(String field, double boost) {
        return new BasicStats(field, boost);
    }

    /**
     * Fills all member fields defined in {@code BasicStats} in {@code stats}.
     * Subclasses can override this method to fill additional stats.
     * <p>
     * The average field length is computed as
     * {@code sumTotalTermFreq / docCount} of {@code collectionStats}.
     *
     * @param stats the statistics object to populate.
     * @param collectionStats the collection-level statistics of the field.
     * @param termStats the statistics of the term being scored.
     */
    protected void fillBasicStats(BasicStats stats, CollectionStatistics collectionStats,
            TermStatistics termStats) {
        // TODO: validate this for real, somewhere else
        assert termStats.totalTermFreq() <= collectionStats.sumTotalTermFreq();
        assert termStats.docFreq() <= collectionStats.sumDocFreq();

        // TODO: add sumDocFreq for field (numberOfFieldPostings)
        stats.setNumberOfDocuments(collectionStats.docCount());
        stats.setNumberOfFieldTokens(collectionStats.sumTotalTermFreq());
        // Cast to double so the average is not truncated by integer division.
        stats.setAvgFieldLength(collectionStats.sumTotalTermFreq() / (double) collectionStats.docCount());
        stats.setDocFreq(termStats.docFreq());
        stats.setTotalTermFreq(termStats.totalTermFreq());
    }

    /**
     * Scores the document {@code doc}.
     * <p>Subclasses must apply their scoring formula in this method.</p>
     * @param stats the corpus level statistics.
     * @param freq the term frequency.
     * @param docLen the document length.
     * @return the score.
     */
    protected abstract double score(BasicStats stats, double freq, double docLen);

    /**
     * Subclasses should implement this method to explain the score. The
     * enclosing explanation built by
     * {@link #explain(BasicStats, Explanation, double)} already reports the
     * score, the name of the class and the term frequency; subclasses can add
     * additional clauses to {@code subExpls} to explain details of their
     * scoring formulae.
     * <p>The default implementation does nothing.</p>
     *
     * @param subExpls the list of details of the explanation to extend
     * @param stats the corpus level statistics.
     * @param freq the term frequency.
     * @param docLen the document length.
     */
    protected void explain(List<Explanation> subExpls, BasicStats stats, double freq, double docLen) {
    }

    /**
     * Explains the score. The implementation here provides a basic explanation
     * in the format <em>score(name-of-similarity, freq=term-frequency),
     * computed from:</em>, and attaches the score (computed via the
     * {@link #score(BasicStats, double, double)} method) together with any
     * details contributed by
     * {@link #explain(List, BasicStats, double, double)}. Subclasses content
     * with this format may add additional details in that method.
     *
     * @param stats the corpus level statistics.
     * @param freq the term frequency and its explanation.
     * @param docLen the document length.
     * @return the explanation.
     */
    protected Explanation explain(BasicStats stats, Explanation freq, double docLen) {
        // The frequency is deliberately narrowed to float before being widened
        // to double, so the explained value matches what is passed to score().
        final double termFreq = freq.getValue().floatValue();

        List<Explanation> subs = new ArrayList<>();
        explain(subs, stats, termFreq, docLen);

        return Explanation.match((float) score(stats, termFreq, docLen),
                "score(" + getClass().getSimpleName() + ", freq=" + freq.getValue() + "), computed from:", subs);
    }

    /**
     * Subclasses must override this method to return the name of the Similarity
     * and preferably the values of parameters (if any) as well.
     */
    @Override
    public abstract String toString();

    // ------------------------------ Norm handling ------------------------------

    /**
     * Cache of decoded norm bytes: entry {@code i} holds
     * {@code SmallFloat.byte4ToInt((byte) i)} for every possible byte value.
     */
    private static final float[] LENGTH_TABLE = new float[256];

    static {
        for (int i = 0; i < 256; i++) {
            LENGTH_TABLE[i] = SmallFloat.byte4ToInt((byte) i);
        }
    }

    /**
     * Encodes the document length in the same way as {@link BM25Similarity}.
     * <p>
     * The length is the unique term count when the field is indexed with
     * {@link IndexOptions#DOCS} and the index was created with major version 8
     * or later; otherwise it is the field length, minus the number of overlap
     * tokens when {@link #getDiscountOverlaps()} is true.
     *
     * @param state the inversion state of the field being indexed.
     * @return the length encoded with {@code SmallFloat.intToByte4}.
     */
    @Override
    public final long computeNorm(FieldInvertState state) {
        final int numTerms;
        if (state.getIndexOptions() == IndexOptions.DOCS && state.getIndexCreatedVersionMajor() >= 8) {
            numTerms = state.getUniqueTermCount();
        } else if (discountOverlaps) {
            numTerms = state.getLength() - state.getNumOverlap();
        } else {
            numTerms = state.getLength();
        }
        return SmallFloat.intToByte4(numTerms);
    }

    // ----------------------------- Static methods ------------------------------

    /**
     * Returns the base two logarithm of {@code x}.
     *
     * @param x the value whose logarithm is taken.
     * @return {@code Math.log(x)} divided by the natural logarithm of 2.
     */
    public static double log2(double x) {
        // Put this to a 'util' class if we need more of these.
        return Math.log(x) / LOG_2;
    }

    // --------------------------------- Classes ---------------------------------

    /** Delegates the {@link #score(float, long)} and
     * {@link #explain(Explanation, long)} methods to
     * {@link SimilarityBase#score(BasicStats, double, double)} and
     * {@link SimilarityBase#explain(BasicStats, Explanation, double)},
     * respectively.
     */
    final class BasicSimScorer extends SimScorer {
        /** The statistics handed to the enclosing similarity on every call. */
        final BasicStats stats;

        BasicSimScorer(BasicStats stats) {
            this.stats = stats;
        }

        /**
         * Decodes a norm into a document length by looking up its low byte,
         * treated as unsigned, in {@code LENGTH_TABLE}.
         */
        double getLengthValue(long norm) {
            return LENGTH_TABLE[Byte.toUnsignedInt((byte) norm)];
        }

        @Override
        public float score(float freq, long norm) {
            return (float) SimilarityBase.this.score(stats, freq, getLengthValue(norm));
        }

        @Override
        public Explanation explain(Explanation freq, long norm) {
            return SimilarityBase.this.explain(stats, freq, getLengthValue(norm));
        }

    }
}