org.apache.lucene.search.suggest.DocumentDictionary.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.lucene.search.suggest.DocumentDictionary.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.suggest;

import java.io.IOException;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.MultiBits;
import org.apache.lucene.index.MultiDocValues;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.search.spell.Dictionary;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;

/**
 * <p>
 * Dictionary with terms, weights, payload (optional) and contexts (optional)
 * information taken from stored/indexed fields in a Lucene index.
 * </p>
 * <b>NOTE:</b>
 *  <ul>
 *    <li>
 *      The term field has to be stored; if it is missing, the document is skipped.
 *      Every value of a multi-valued term field is returned as its own entry, and all
 *      entries of one document share that document's weight, payload and contexts.
 *    </li>
 *    <li>
 *      The payload and contexts fields are optional. When payloads are enabled, a document
 *      without a payload value yields an empty payload; when contexts are enabled, a
 *      document without context values yields an empty context set.
 *    </li>
 *    <li>
 *      The weight field can be stored or can be a {@link NumericDocValues}.
 *      If the weight field is not defined, the value of the weight is <code>0</code>
 *    </li>
 *  </ul>
 */
public class DocumentDictionary implements Dictionary {

    /** {@link IndexReader} to load documents from */
    protected final IndexReader reader;

    /** Field to read payload from */
    protected final String payloadField;
    /** Field to read contexts from */
    protected final String contextsField;
    /** Field to read terms from */
    private final String field;
    /** Field to read weights from */
    private final String weightField;

    /**
     * Creates a new dictionary with the contents of the fields named <code>field</code>
     * for the terms and <code>weightField</code> for the weights that will be used for
     * the corresponding terms.
     *
     * @param reader      index reader the documents are loaded from
     * @param field       name of the field holding the terms
     * @param weightField name of the field holding the weights
     */
    public DocumentDictionary(IndexReader reader, String field, String weightField) {
        this(reader, field, weightField, null);
    }

    /**
     * Creates a new dictionary with the contents of the fields named <code>field</code>
     * for the terms, <code>weightField</code> for the weights that will be used for
     * the corresponding terms and <code>payloadField</code> for the corresponding payloads
     * for the entry.
     *
     * @param reader       index reader the documents are loaded from
     * @param field        name of the field holding the terms
     * @param weightField  name of the field holding the weights
     * @param payloadField name of the field holding the payloads, or <code>null</code>
     *                     to iterate without payloads
     */
    public DocumentDictionary(IndexReader reader, String field, String weightField, String payloadField) {
        this(reader, field, weightField, payloadField, null);
    }

    /**
     * Creates a new dictionary with the contents of the fields named <code>field</code>
     * for the terms, <code>weightField</code> for the weights that will be used for
     * the corresponding terms, <code>payloadField</code> for the corresponding payloads
     * for the entry and <code>contextsField</code> for associated contexts.
     *
     * @param reader        index reader the documents are loaded from
     * @param field         name of the field holding the terms
     * @param weightField   name of the field holding the weights
     * @param payloadField  name of the field holding the payloads, or <code>null</code>
     *                      to iterate without payloads
     * @param contextsField name of the field holding the contexts, or <code>null</code>
     *                      to iterate without contexts
     */
    public DocumentDictionary(IndexReader reader, String field, String weightField, String payloadField,
            String contextsField) {
        this.reader = reader;
        this.field = field;
        this.weightField = weightField;
        this.payloadField = payloadField;
        this.contextsField = contextsField;
    }

    /**
     * Returns a fresh iterator over the entries of this dictionary. Payloads and contexts
     * are enabled exactly when the respective field name is non-null.
     */
    @Override
    public InputIterator getEntryIterator() throws IOException {
        return new DocumentInputIterator(payloadField != null, contextsField != null);
    }

    /**
     * Converts a field value to bytes: the binary value is preferred, otherwise the
     * string value is encoded into a new {@link BytesRef}.
     *
     * @return the bytes of <code>value</code>, or <code>null</code> if it has neither a
     *         binary nor a string value
     */
    private static BytesRef toBytesRef(IndexableField value) {
        final BytesRef binary = value.binaryValue();
        if (binary != null) {
            return binary;
        }
        final String text = value.stringValue();
        return (text != null) ? new BytesRef(text) : null;
    }

    /** Implements {@link InputIterator} from stored fields. */
    protected class DocumentInputIterator implements InputIterator {

        /** Id of the last document to visit (<code>maxDoc - 1</code>); -1 for an empty reader. */
        private final int docCount;
        /** Names of the stored fields that are loaded for every visited document. */
        private final Set<String> relevantFields;
        private final boolean hasPayloads;
        private final boolean hasContexts;
        /** Live-docs bits used to skip deleted documents; <code>null</code> means no filtering. */
        private final Bits liveDocs;
        private int currentDocId = -1;
        private long currentWeight = 0;
        private BytesRef currentPayload = null;
        private Set<BytesRef> currentContexts;
        /** Doc-values source for the weight; <code>null</code> if unavailable. */
        private final NumericDocValues weightValues;
        /** Term field values of the document currently being emitted. */
        IndexableField[] currentDocFields = new IndexableField[0];
        /** Index into {@link #currentDocFields} of the next value to emit. */
        int nextFieldsPosition = 0;

        /**
         * Creates an iterator over term, weight, payload and context fields from the lucene
         * index. Setting <code>hasPayloads</code> to false implies an iterator without
         * payloads; setting <code>hasContexts</code> to false implies an iterator without
         * contexts.
         *
         * @param hasPayloads whether payloads are read from the payload field
         * @param hasContexts whether contexts are read from the contexts field
         */
        public DocumentInputIterator(boolean hasPayloads, boolean hasContexts) throws IOException {
            this.hasPayloads = hasPayloads;
            this.hasContexts = hasContexts;
            docCount = reader.maxDoc() - 1;
            weightValues = (weightField != null) ? MultiDocValues.getNumericValues(reader, weightField) : null;
            liveDocs = (reader.leaves().size() > 0) ? MultiBits.getLiveDocs(reader) : null;
            relevantFields = getRelevantFields(new String[] { field, weightField, payloadField, contextsField });
        }

        @Override
        public long weight() {
            return currentWeight;
        }

        /**
         * Returns the next term, or <code>null</code> once all documents have been visited.
         * Deleted documents and documents without a value in the term field are skipped;
         * term values that have neither a binary nor a string value are skipped as well.
         */
        @Override
        public BytesRef next() throws IOException {
            while (true) {
                // Emit the remaining term values of the current document first.
                while (nextFieldsPosition < currentDocFields.length) {
                    final BytesRef term = toBytesRef(currentDocFields[nextFieldsPosition++]);
                    if (term != null) {
                        return term;
                    }
                }

                if (currentDocId == docCount) {
                    // Iterated over all the documents.
                    return null;
                }

                currentDocId++;
                if (liveDocs != null && !liveDocs.get(currentDocId)) {
                    continue;
                }

                final Document doc = reader.document(currentDocId, relevantFields);

                currentDocFields = doc.getFields(field);
                nextFieldsPosition = 0;
                if (currentDocFields.length == 0) { // no values in this document
                    continue;
                }

                // Publish the per-document state BEFORE any term of this document is emitted,
                // so that no value of this document is ever returned together with the weight,
                // payload or contexts of a previously visited document (which could happen if
                // the first value is unusable but a later one is not).
                currentPayload = readPayload(doc);
                currentContexts = readContexts(doc);
                currentWeight = getWeight(doc, currentDocId);
            }
        }

        /**
         * Extracts the payload of <code>doc</code>.
         *
         * @return <code>null</code> if payloads are disabled; otherwise the payload bytes,
         *         or an empty {@link BytesRef} if the document has no usable payload value
         */
        private BytesRef readPayload(Document doc) {
            if (!hasPayloads) {
                return null;
            }
            final IndexableField payload = doc.getField(payloadField);
            final BytesRef bytes = (payload != null) ? toBytesRef(payload) : null;
            // in case that the iterator has payloads configured, use empty values
            // instead of null for payload
            return (bytes != null) ? bytes : new BytesRef();
        }

        /**
         * Extracts the contexts of <code>doc</code>.
         *
         * @return an immutable empty set if contexts are disabled; otherwise a new set with
         *         every usable value of the contexts field (possibly empty)
         */
        private Set<BytesRef> readContexts(Document doc) {
            if (!hasContexts) {
                return Collections.emptySet();
            }
            final Set<BytesRef> contexts = new HashSet<>();
            for (IndexableField contextField : doc.getFields(contextsField)) {
                final BytesRef context = toBytesRef(contextField);
                if (context != null) {
                    contexts.add(context);
                }
            }
            return contexts;
        }

        @Override
        public BytesRef payload() {
            return currentPayload;
        }

        @Override
        public boolean hasPayloads() {
            return hasPayloads;
        }

        /**
         * Returns the value of the <code>weightField</code> for the current document.
         * Retrieves the value for the <code>weightField</code> if it's stored (using <code>doc</code>)
         * or if it's indexed as {@link NumericDocValues} (using <code>docId</code>) for the document.
         * If no value is found, then the weight is 0.
         *
         * @param doc   the loaded stored fields of the current document
         * @param docId the id of the current document
         * @return the weight of the document, or 0 if none is available
         */
        protected long getWeight(Document doc, int docId) throws IOException {
            IndexableField weight = doc.getField(weightField);
            if (weight != null) { // found weight as stored
                return (weight.numericValue() != null) ? weight.numericValue().longValue() : 0;
            } else if (weightValues != null) { // found weight as NumericDocValue
                // only move forward when the doc-values iterator is behind the requested document
                if (weightValues.docID() < docId) {
                    weightValues.advance(docId);
                }
                if (weightValues.docID() == docId) {
                    return weightValues.longValue();
                } else {
                    // missing
                    return 0;
                }
            } else { // fall back
                return 0;
            }
        }

        /** Collects the non-null names among <code>fields</code> into a set. */
        private Set<String> getRelevantFields(String... fields) {
            Set<String> relevantFields = new HashSet<>();
            for (String relevantField : fields) {
                if (relevantField != null) {
                    relevantFields.add(relevantField);
                }
            }
            return relevantFields;
        }

        /**
         * Returns the contexts of the current entry, or <code>null</code> if this iterator
         * was created without contexts.
         */
        @Override
        public Set<BytesRef> contexts() {
            if (hasContexts) {
                return currentContexts;
            }
            return null;
        }

        @Override
        public boolean hasContexts() {
            return hasContexts;
        }
    }
}