org.hibernate.search.query.dsl.impl.MoreLikeThisBuilder.java Source code

Introduction

Here is the source code for org.hibernate.search.query.dsl.impl.MoreLikeThisBuilder.java.

Source

/*
 * Hibernate Search, full-text search for your domain model
 *
 * License: GNU Lesser General Public License (LGPL), version 2.1 or later
 * See the lgpl.txt file in the root directory or <http://www.gnu.org/licenses/lgpl-2.1.html>.
 */

package org.hibernate.search.query.dsl.impl;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.queries.mlt.MoreLikeThis;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.similarities.TFIDFSimilarity;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.PriorityQueue;
import org.apache.lucene.util.UnicodeUtil;
import org.hibernate.search.analyzer.impl.LuceneAnalyzerReference;
import org.hibernate.search.annotations.Store;
import org.hibernate.search.bridge.FieldBridge;
import org.hibernate.search.bridge.builtin.NumericFieldBridge;
import org.hibernate.search.bridge.util.impl.ContextualExceptionBridgeHelper;
import org.hibernate.search.engine.impl.DocumentBuilderHelper;
import org.hibernate.search.engine.integration.impl.ExtendedSearchIntegrator;
import org.hibernate.search.engine.metadata.impl.DocumentFieldMetadata;
import org.hibernate.search.engine.spi.DocumentBuilderIndexedEntity;
import org.hibernate.search.exception.AssertionFailure;
import org.hibernate.search.query.engine.spi.EntityInfo;
import org.hibernate.search.query.engine.spi.HSQuery;
import org.hibernate.search.util.impl.PassThroughAnalyzer;
import org.hibernate.search.util.logging.impl.Log;
import org.hibernate.search.util.logging.impl.LoggerFactory;

import static org.hibernate.search.query.dsl.impl.ConnectedMoreLikeThisQueryBuilder.INPUT_TYPE.ENTITY;
import static org.hibernate.search.query.dsl.impl.ConnectedMoreLikeThisQueryBuilder.INPUT_TYPE.ID;

/**
 * Class inspired by, and code copied from, the Apache Lucene MoreLikeThis class.
 * The Apache Lucene code is copyright the Apache Software Foundation and released under the
 * Apache Software License 2.0.
 *
 * @author Emmanuel Bernard
 */
public class MoreLikeThisBuilder<T> {

    private static final Log log = LoggerFactory.make();

    private final int minWordLen = MoreLikeThis.DEFAULT_MIN_WORD_LENGTH;
    private final int maxNumTokensParsed = MoreLikeThis.DEFAULT_MAX_NUM_TOKENS_PARSED;
    private final int maxWordLen = MoreLikeThis.DEFAULT_MAX_WORD_LENGTH;
    private final Set<?> stopWords = MoreLikeThis.DEFAULT_STOP_WORDS;
    private final DocumentBuilderIndexedEntity documentBuilder;
    // We lower the min defaults to 1 because, unlike the original MoreLikeThis, we don't merge the freq of *all* fields
    // TODO: is that hurting performance? Could we guess "small fields" and only lower these?
    private final int minTermFreq = 1; //MoreLikeThis.DEFAULT_MIN_TERM_FREQ;
    private final int minDocFreq = 1; //MoreLikeThis.DEFAULT_MIN_DOC_FREQ;
    private final int maxDocFreq = MoreLikeThis.DEFAULT_MAX_DOC_FREQ;
    private final int maxQueryTerms = MoreLikeThis.DEFAULT_MAX_QUERY_TERMS;
    private boolean boost = MoreLikeThis.DEFAULT_BOOST;
    private float boostFactor = 1;
    private TFIDFSimilarity similarity;
    private Integer documentNumber;
    private String[] compatibleFieldNames;
    private IndexReader indexReader;
    private FieldsContext fieldsContext;
    private Object input;
    private QueryBuildingContext queryContext;
    private boolean excludeEntityCompared;
    private ConnectedMoreLikeThisQueryBuilder.INPUT_TYPE inputType;
    private TermQuery findById;

    public MoreLikeThisBuilder(DocumentBuilderIndexedEntity documentBuilder,
            ExtendedSearchIntegrator searchIntegrator) {
        this.documentBuilder = documentBuilder;
        Similarity configuredSimilarity = searchIntegrator.getIndexBindings().get(documentBuilder.getBeanClass())
                .getSimilarity();
        if (configuredSimilarity instanceof TFIDFSimilarity) {
            this.similarity = (TFIDFSimilarity) configuredSimilarity;
        } else {
            throw log.requireTFIDFSimilarity(documentBuilder.getBeanClass());
        }
    }

    public MoreLikeThisBuilder indexReader(IndexReader indexReader) {
        this.indexReader = indexReader;
        return this;
    }

    public MoreLikeThisBuilder compatibleFieldNames(String... compatibleFieldNames) {
        this.compatibleFieldNames = compatibleFieldNames;
        return this;
    }

    public MoreLikeThisBuilder otherMoreLikeThisContext(MoreLikeThisQueryContext moreLikeThisContext) {
        this.boost = moreLikeThisContext.isBoostTerms();
        this.boostFactor = moreLikeThisContext.getTermBoostFactor();
        this.excludeEntityCompared = moreLikeThisContext.isExcludeEntityUsedForComparison();
        return this;
    }

    /**
     * Build the more-like-this query for the configured input.
     *
     * @return a query that will return docs like the passed Lucene document ID
     */
    public Query createQuery() {
        try {
            documentNumber = getLuceneDocumentIdFromIdAsTermOrNull(documentBuilder);
            return maybeExcludeComparedEntity(createQuery(retrieveTerms()));
        } catch (IOException e) {
            throw log.ioExceptionOnIndexOfEntity(e, documentBuilder.getBeanClass());
        }
    }

    /**
     * Try to retrieve the Lucene document id from the input. Returns null when that fails
     * and a fallback approach exists (building the document from the entity instance).
     */
    private Integer getLuceneDocumentIdFromIdAsTermOrNull(DocumentBuilderIndexedEntity documentBuilder) {
        String id;
        if (inputType == ID) {
            id = documentBuilder.getIdBridge().objectToString(input);
        } else if (inputType == ENTITY) {
            // Try and extract the id, if failing the id will be null
            try {
                // A two-way bridge is expected to return null for a null input
                id = documentBuilder.getIdBridge().objectToString(documentBuilder.getId(input));
            } catch (IllegalStateException e) {
                id = null;
            }
        } else {
            throw new AssertionFailure("We don't support string and reader for MoreLikeThis");
        }
        if (id == null) {
            return null;
        }
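        // Keep the id query around: maybeExcludeComparedEntity() reuses it to filter out the compared entity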
        findById = new TermQuery(new Term(documentBuilder.getIdFieldName(), id));
        HSQuery query = queryContext.getFactory().createHSQuery(findById, queryContext.getEntityType());
        List<EntityInfo> entityInfos = query.maxResults(1).projection(HSQuery.DOCUMENT_ID).queryEntityInfos();
        if (entityInfos.isEmpty()) {
            if (inputType == ID) {
                throw log.entityWithIdNotFound(queryContext.getEntityType(), id);
            } else {
                return null;
            }
        }
        return (Integer) entityInfos.iterator().next().getProjection()[0];
    }

    private Query maybeExcludeComparedEntity(Query query) {
        // It would be better to attach a collector to exclude a document by its id
        // but at this stage we could have documents reordered and thus with a different id
        // Maybe a Filter would be more efficient?
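        // The exclusion yields a query of the shape: +<more-like-this query> -ConstantScore(<id term query>)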
        if (excludeEntityCompared && documentNumber != null) {
            return new BooleanQuery.Builder().add(query, BooleanClause.Occur.MUST)
                    .add(new ConstantScoreQuery(findById), BooleanClause.Occur.MUST_NOT).build();
        } else {
            return query;
        }
    }

    /**
     * Create the More Like This query from the list of per-field priority queues.
     */
    private Query createQuery(List<PriorityQueue<Object[]>> q) {
        //In the original algorithm, the number of terms is limited to maxQueryTerms
        //In the current implementation, we do nbrOfFields * maxQueryTerms
        int length = fieldsContext.size();
        if (length == 0) {
            throw new AssertionFailure("Querying MoreLikeThis on 0 fields.");
        } else if (length == 1) {
            return createQuery(q.get(0), fieldsContext.getFirst());
        } else {
            BooleanQuery.Builder queryBuilder = new BooleanQuery.Builder();
            //the fieldsContext indexes are aligned with the priority queue's
            Iterator<FieldContext> fieldsContextIterator = fieldsContext.iterator();
            for (PriorityQueue<Object[]> queue : q) {
                try {
                    queryBuilder.add(createQuery(queue, fieldsContextIterator.next()), BooleanClause.Occur.SHOULD);
                } catch (BooleanQuery.TooManyClauses ignore) {
                    break;
                }
            }
            return queryBuilder.build();
        }
    }

    private Query createQuery(PriorityQueue<Object[]> q, FieldContext fieldContext) {
        if (q == null) {
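            // q is null when the field was rejected by isCompatibleField();
            // the checks below diagnose the likely cause and raise a specific error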
            final FieldBridge fieldBridge = fieldContext.getFieldBridge() != null ? fieldContext.getFieldBridge()
                    : documentBuilder.getBridge(fieldContext.getField());
            if (fieldBridge instanceof NumericFieldBridge) {
                // We could probably do something here
                //TODO: how to build the query when we don't have the value?
                throw log.numericFieldCannotBeUsedInMoreLikeThis(fieldContext.getField(),
                        documentBuilder.getBeanClass());
            }
            DocumentFieldMetadata fieldMetadata = documentBuilder.getTypeMetadata()
                    .getDocumentFieldMetadataFor(fieldContext.getField());
            if (fieldMetadata == null) {
                throw log.unknownFieldNameForMoreLikeThisQuery(fieldContext.getField(),
                        documentBuilder.getBeanClass().getName());
            }
            boolean hasTermVector = fieldMetadata.getTermVector() != Field.TermVector.NO;
            boolean isStored = fieldMetadata.getStore() != Store.NO;
            if (!(hasTermVector || isStored)) {
                throw log.fieldNotStoredNorTermVectorCannotBeUsedInMoreLikeThis(fieldContext.getField(),
                        documentBuilder.getBeanClass());
            }
            boolean isIdOrEmbeddedId = fieldMetadata.isId() || fieldMetadata.isIdInEmbedded();
            if (isIdOrEmbeddedId) {
                throw log.fieldIdCannotBeUsedInMoreLikeThis(fieldContext.getField(),
                        documentBuilder.getBeanClass());
            }
        }

        BooleanQuery.Builder queryBuilder = new BooleanQuery.Builder();
        Object[] ar;
        int qterms = 0;
        float bestScore = 0;
        while ((ar = q.pop()) != null) {
            TermQuery tq = new TermQuery(new Term((String) ar[1], (String) ar[0]));

            if (boost) {
                if (qterms == 0) {
                    bestScore = ((Float) ar[2]);
                }
                float myScore = ((Float) ar[2]);

                tq.setBoost(boostFactor * myScore / bestScore);
            }

            try {
                queryBuilder.add(tq, BooleanClause.Occur.SHOULD);
            } catch (BooleanQuery.TooManyClauses ignore) {
                break;
            }

            qterms++;
            if (maxQueryTerms > 0 && qterms >= maxQueryTerms) {
                break;
            }
        }
        // Apply field adjustments
        return fieldContext.getFieldCustomizer().setWrappedQuery(queryBuilder.build()).createQuery();
    }

    /**
     * Find the words to use in a more-like-this query.
     * Store them per field name, following the order of the field names defined in {@link #fieldsContext}.
     * If a field name is not compatible with term retrieval, the queue is null for that index.
     */
    private List<PriorityQueue<Object[]>> retrieveTerms() throws IOException {
        int size = fieldsContext.size();
        Map<String, Map<String, Int>> termFreqMapPerFieldname = new HashMap<String, Map<String, Int>>(size);
        final Fields vectors;
        Document maybeDocument = null;
        if (documentNumber == null && size > 0) {
            //build the document from the entity instance

            //first build the list of fields we are interested in
            String[] fieldNames = new String[size];
            Iterator<FieldContext> fieldsContextIterator = fieldsContext.iterator();
            for (int index = 0; index < size; index++) {
                fieldNames[index] = fieldsContextIterator.next().getField();
            }
            //TODO should we keep the fieldToAnalyzerMap around to pass to the analyzer?
            Map<String, String> fieldToAnalyzerMap = new HashMap<String, String>();
            //FIXME by calling documentBuilder we don't honor .comparingField("foo").ignoreFieldBridge(): probably not a problem in practice though
            maybeDocument = documentBuilder.getDocument(null, input, null, fieldToAnalyzerMap, null,
                    new ContextualExceptionBridgeHelper(), fieldNames);
            vectors = null;
        } else {
            vectors = indexReader.getTermVectors(documentNumber);
        }
        for (FieldContext fieldContext : fieldsContext) {
            String fieldName = fieldContext.getField();
            if (isCompatibleField(fieldName)) {
                Map<String, Int> termFreqMap = new HashMap<String, Int>();
                termFreqMapPerFieldname.put(fieldName, termFreqMap);
                final Terms vector;
                if (vectors != null) {
                    vector = vectors.terms(fieldName);
                } else {
                    vector = null;
                }

                // field does not store term vector info
                if (vector == null) {
                    if (maybeDocument == null) {
                        maybeDocument = indexReader.document(documentNumber);
                    }
                    IndexableField[] fields = maybeDocument.getFields(fieldName);
                    for (IndexableField field : fields) {
                        //TODO numbers
                        final String stringValue = DocumentBuilderHelper.extractStringFromFieldable(field);
                        if (stringValue != null) {
                            addTermFrequencies(new StringReader(stringValue), termFreqMap, fieldContext);
                        }
                    }
                } else {
                    addTermFrequencies(termFreqMap, vector);
                }
            } else {
                //place null as the field is not compatible
                termFreqMapPerFieldname.put(fieldName, null);
            }
        }
        List<PriorityQueue<Object[]>> results = new ArrayList<PriorityQueue<Object[]>>(size);
        // iterate fieldsContext rather than the map's entry set: the queues must stay
        // aligned with the field order assumed by createQuery()
        for (FieldContext fieldContext : fieldsContext) {
            String fieldName = fieldContext.getField();
            results.add(createQueue(fieldName, termFreqMapPerFieldname.get(fieldName)));
        }
        return results;
    }

    private boolean isCompatibleField(String fieldName) {
        for (String compatibleFieldName : compatibleFieldNames) {
            if (compatibleFieldName.equals(fieldName)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Create a PriorityQueue from a word->tf map.
     *
     * @param fieldName the field the words were extracted from
     * @param words a map of words keyed on the word (String) with Int objects as the values
     */
    private PriorityQueue<Object[]> createQueue(String fieldName, Map<String, Int> words) throws IOException {
        if (words == null) {
            //incompatible field name
            return null;
        }
        // have collected all words in doc and their freqs
        int numDocs = indexReader.numDocs();
        FreqQ res = new FreqQ(words.size()); // will order words by score

        for (Map.Entry<String, Int> entry : words.entrySet()) { // for every word
            String word = entry.getKey();
            int tf = entry.getValue().x; // term freq in the source doc
            if (minTermFreq > 0 && tf < minTermFreq) {
                continue; // filter out words that don't occur enough times in the source
            }

            // The original algorithm looks for all field names and finds the top frequency
            // and only consider this field for the query
            // "go through all the fields and find the largest document frequency"
            Term term = new Term(fieldName, word);
            int freq = indexReader.docFreq(term);

            if (minDocFreq > 0 && freq < minDocFreq) {
                continue; // filter out words that don't occur in enough docs
            }

            if (freq > maxDocFreq) {
                continue; // filter out words that occur in too many docs
            }

            if (freq == 0) {
                continue; // index update problem?
            }

            float idf = similarity.idf(freq, numDocs);
            float score = tf * idf;
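            // With Lucene's ClassicSimilarity for instance, idf = 1 + ln(numDocs / (docFreq + 1)),
            // so rare terms contribute higher scores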

            // only the first 3 entries are really needed; the others are for troubleshooting
            res.insertWithOverflow(new Object[] { word, // the word
                    fieldName, // the top field
                    score, // overall score
                    idf, // idf
                    freq, // freq in all docs
                    tf });
        }
        return res;
    }

    /**
     * Adds the terms and frequencies found in the vector into the Map termFreqMap.
     *
     * @param termFreqMap a Map of terms and their frequencies
     * @param vector the terms and their frequencies for a doc/field
     */
    private void addTermFrequencies(Map<String, Int> termFreqMap, Terms vector) throws IOException {
        final TermsEnum termsEnum = vector.iterator();
        char[] charBuffer = new char[0];
        CharsRef outputReference = new CharsRef();
        BytesRef text;
        while ((text = termsEnum.next()) != null) {
            charBuffer = ArrayUtil.grow(charBuffer, text.length);
            final int stringLength = UnicodeUtil.UTF8toUTF16(text, charBuffer);
            outputReference.chars = charBuffer;
            outputReference.length = stringLength;
            final String term = outputReference.toString();
            if (isNoiseWord(term)) {
                continue;
            }
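            // the Terms instance comes from a single document's term vector, so
            // totalTermFreq() is the term's frequency within that document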
            final int freq = (int) termsEnum.totalTermFreq();

            // increment frequency
            Int cnt = termFreqMap.get(term);
            if (cnt == null) {
                cnt = new Int();
                termFreqMap.put(term, cnt);
                cnt.x = freq;
            } else {
                cnt.x += freq;
            }
        }
    }

    /**
     * Adds term frequencies found by tokenizing text from the reader into the Map termFreqMap.
     *
     * @param r a source of text to be tokenized
     * @param termFreqMap a Map of terms and their frequencies
     * @param fieldContext carries the field name, used by the analyzer for any special per-field analysis
     */
    private void addTermFrequencies(Reader r, Map<String, Int> termFreqMap, FieldContext fieldContext)
            throws IOException {
        String fieldName = fieldContext.getField();
        Analyzer analyzer = queryContext.getQueryAnalyzerReference().unwrap(LuceneAnalyzerReference.class)
                .getAnalyzer();
        if (!fieldContext.applyAnalyzer()) {
            // essentially does the Reader to String conversion for us
            analyzer = PassThroughAnalyzer.INSTANCE;
        }
        TokenStream ts = analyzer.tokenStream(fieldName, r);
        try {
            int tokenCount = 0;
            // for every token
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                String word = termAtt.toString();
                tokenCount++;
                if (tokenCount > maxNumTokensParsed) {
                    break;
                }
                if (isNoiseWord(word)) {
                    continue;
                }

                // increment frequency
                Int cnt = termFreqMap.get(word);
                if (cnt == null) {
                    termFreqMap.put(word, new Int());
                } else {
                    cnt.x++;
                }
            }
            ts.end();
        } finally {
            IOUtils.closeWhileHandlingException(ts);
        }
    }

    /**
     * Determines if the passed term is likely to be of interest in "more like" comparisons.
     *
     * @param term the word being considered
     *
     * @return true if the term should be ignored, false if it should be used in further analysis
     */
    private boolean isNoiseWord(String term) {
        int len = term.length();
        if (minWordLen > 0 && len < minWordLen) {
            return true;
        }
        if (maxWordLen > 0 && len > maxWordLen) {
            return true;
        }
        return stopWords != null && stopWords.contains(term);
    }

    public MoreLikeThisBuilder fieldsContext(FieldsContext fieldsContext) {
        this.fieldsContext = fieldsContext;
        return this;
    }

    public MoreLikeThisBuilder input(Object input) {
        this.input = input;
        return this;
    }

    public MoreLikeThisBuilder queryContext(QueryBuildingContext queryContext) {
        this.queryContext = queryContext;
        return this;
    }

    public MoreLikeThisBuilder idAsTerm(String idAsTerm) {
        return this;
    }

    public MoreLikeThisBuilder inputType(ConnectedMoreLikeThisQueryBuilder.INPUT_TYPE inputType) {
        this.inputType = inputType;
        return this;
    }

    /**
     * PriorityQueue that orders words by score.
     */
    private static class FreqQ extends PriorityQueue<Object[]> {
        FreqQ(int s) {
            super(s);
        }

        @Override
        protected boolean lessThan(Object[] aa, Object[] bb) {
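            // Inverted comparison: a higher score counts as "less", so the queue's pop()
            // returns terms in descending score order (the best-scored term comes out first)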
            Float fa = (Float) aa[2];
            Float fb = (Float) bb[2];
            return fa > fb;
        }
    }

    /**
     * Used to count frequencies while avoiding the allocation of new Integer objects.
     */
    private static class Int {
        int x;

        Int() {
            x = 1;
        }

        @Override
        public String toString() {
            return "Int{" + x + '}';
        }
    }
}
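
The builder lives in an .impl package, so application code normally reaches it through the MoreLikeThis DSL rather than directly. Still, the fluent setters above imply a call chain along the following lines. This is only a sketch: MyEntity, entityId and the field names are hypothetical, and documentBuilder, searchIntegrator, indexReader, fieldsContext, queryContext and mltContext are assumed to be supplied by the surrounding Hibernate Search engine code.

    // hypothetical wiring of MoreLikeThisBuilder, using only the setters defined above
    MoreLikeThisBuilder<MyEntity> builder =
            new MoreLikeThisBuilder<MyEntity>(documentBuilder, searchIntegrator);
    Query mltQuery = builder
            .indexReader(indexReader)                     // reader over the target index
            .compatibleFieldNames("title", "description") // fields usable for term retrieval
            .fieldsContext(fieldsContext)                 // fields to compare, in order
            .queryContext(queryContext)                   // access to the analyzer and entity type
            .inputType(ConnectedMoreLikeThisQueryBuilder.INPUT_TYPE.ID)
            .input(entityId)                              // compare against the entity with this id
            .otherMoreLikeThisContext(mltContext)         // term boosting and self-exclusion settings
            .createQuery();

createQuery() then resolves the Lucene document for the id, retrieves per-field terms (from term vectors or stored fields), and assembles the boolean more-like-this query, optionally excluding the compared entity itself.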