org.pageseeder.flint.lucene.search.Terms.java Source code

Introduction

Here is the source code for org.pageseeder.flint.lucene.search.Terms.java
Source

/*
 * Copyright 2015 Allette Systems (Australia)
 * http://www.allette.com.au
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.pageseeder.flint.lucene.search;

import org.apache.lucene.index.Fields;
import org.apache.lucene.index.*;
import org.apache.lucene.search.FuzzyTermsEnum;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.pageseeder.flint.lucene.util.Beta;
import org.pageseeder.flint.lucene.util.Bucket;
import org.pageseeder.flint.lucene.util.Bucket.Entry;
import org.pageseeder.xmlwriter.XMLWriter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.*;

/**
 * A collection of utility methods to manipulate and extract terms.
 *
 * @author Christophe Lauret
 * @version 18 March 2011
 */
public final class Terms {

    /**
     * Logger for this class.
     */
    private static final Logger LOGGER = LoggerFactory.getLogger(Terms.class);

    /**
     * Orders terms by their text only; the field a term belongs to plays no part in the comparison.
     */
    private static final Comparator<Term> TEXT_COMPARATOR = new Comparator<Term>() {
        /** {@inheritDoc} */
        @Override
        public int compare(Term left, Term right) {
            final String leftText = left.text();
            final String rightText = right.text();
            return leftText.compareTo(rightText);
        }
    };

    /** Private constructor: this utility class cannot be instantiated from outside. */
    private Terms() {
    }

    /**
     * Returns the shared comparator that orders terms using their text value.
     *
     * <p>The same comparator instance is returned on every call; the field of the terms being
     * compared is not taken into account.
     *
     * @return a comparator to order terms using their text value.
     */
    public static Comparator<Term> textComparator() {
        return TEXT_COMPARATOR;
    }

    /**
     * Builds one term for every (field, text) combination of the two lists.
     *
     * <p>The result holds (number of fields) x (number of texts) terms, grouped by field in the
     * order of the {@code fields} list and, within a field, in the order of the {@code texts} list.
     *
     * @param fields The list of fields.
     * @param texts  The list of texts.
     *
     * @return A new list containing the corresponding terms.
     */
    public static List<Term> terms(List<String> fields, List<String> texts) {
        final List<Term> product = new ArrayList<>();
        final Iterator<String> fieldNames = fields.iterator();
        while (fieldNames.hasNext()) {
            final String name = fieldNames.next();
            for (final String value : texts) {
                product.add(new Term(name, value));
            }
        }
        return product;
    }

    /**
     * Returns a new list holding the fuzzy terms found for the given term with the specified index reader.
     *
     * <p>The list is filled by {@link #fuzzy(IndexReader, List, Term)}.
     *
     * @param reader Index reader to use.
     * @param term   The term to use.
     *
     * @return The corresponding list of fuzzy terms.
     *
     * @throws IOException If an error is thrown by the fuzzy term enumeration.
     */
    public static List<String> fuzzy(IndexReader reader, Term term) throws IOException {
        final List<String> matches = new ArrayList<>();
        fuzzy(reader, matches, term);
        return matches;
    }

    /**
     * Returns a new list holding the prefix terms found for the given term with the specified index reader.
     *
     * <p>The list is filled by {@link #prefix(IndexReader, List, Term)}.
     *
     * @param reader Index reader to use.
     * @param term   The term to use.
     *
     * @return The corresponding list of prefix terms.
     *
     * @throws IOException If an error is thrown by the prefix term enumeration.
     */
    public static List<String> prefix(IndexReader reader, Term term) throws IOException {
        final List<String> matches = new ArrayList<>();
        prefix(reader, matches, term);
        return matches;
    }

    /**
     * Loads all the fuzzy terms in the list of terms given the reader.
     *
     * <p>Delegates to {@link #fuzzy(IndexReader, List, Term, int)} with a {@code minSimilarity} of 2.
     *
     * @param reader Index reader to use.
     * @param values The list of terms to load.
     * @param term   The term to use.
     *
     * @throws IOException If an error is thrown by the fuzzy term enumeration.
     */
    public static void fuzzy(IndexReader reader, List<String> values, Term term) throws IOException {
        fuzzy(reader, values, term, 2);
    }

    /**
     * Collects the text of the terms that fuzzily match the given term within its field.
     *
     * <p>Nothing is added when the reader exposes no fields or when the field of the term has no
     * terms. A match whose bytes are identical to the searched term is left out.
     *
     * <p>NOTE(review): the last argument given to the {@link FuzzyTermsEnum} constructor is
     * {@code false} here but {@code true} in the {@code Bucket} overload - confirm that the
     * difference is intended.
     *
     * @param reader        Index reader to use.
     * @param values        The list receiving the text of the matching terms.
     * @param term          The term to use.
     * @param minSimilarity Handed as is to the {@link FuzzyTermsEnum} constructor, along with a prefix length of 0.
     *
     * @throws IOException If an error is thrown by the fuzzy term enumeration.
     */
    public static void fuzzy(IndexReader reader, List<String> values, Term term, int minSimilarity)
            throws IOException {
        final AttributeSource attributes = new AttributeSource();
        final Fields allFields = MultiFields.getFields(reader);
        if (allFields == null) {
            return;
        }
        final org.apache.lucene.index.Terms fieldTerms = allFields.terms(term.field());
        if (fieldTerms == null) {
            return;
        }
        final FuzzyTermsEnum candidates = new FuzzyTermsEnum(fieldTerms, attributes, term, minSimilarity, 0, false);
        final BytesRef target = term.bytes();
        for (BytesRef match = candidates.next(); match != null; match = candidates.next()) {
            // the searched term itself is not reported
            if (target.bytesEquals(match)) {
                continue;
            }
            values.add(match.utf8ToString());
        }
    }

    /**
     * Loads all the fuzzy terms in the bucket given the reader.
     *
     * <p>Delegates to {@link #fuzzy(IndexReader, Bucket, Term, int)} with a {@code minSimilarity} of 2.
     *
     * @param reader  Index reader to use.
     * @param bucket  Where to store the terms.
     * @param term    The term to use.
     *
     * @throws IOException If an error is thrown by the fuzzy term enumeration.
     */
    @Beta
    public static void fuzzy(IndexReader reader, Bucket<Term> bucket, Term term) throws IOException {
        fuzzy(reader, bucket, term, 2);
    }

    /**
     * Adds to the bucket the terms that fuzzily match the given term within its field.
     *
     * <p>Each match is added as a new term on the same field, together with the document frequency
     * reported by the reader for that term. Nothing is added when the reader exposes no fields or
     * when the field of the term has no terms. A match whose bytes are identical to the searched
     * term is left out.
     *
     * @param reader        Index reader to use.
     * @param bucket        Where to store the terms.
     * @param term          The term to use.
     * @param minSimilarity Handed as is to the {@link FuzzyTermsEnum} constructor, along with a prefix length of 0.
     *
     * @throws IOException If an error is thrown by the fuzzy term enumeration.
     */
    @Beta
    public static void fuzzy(IndexReader reader, Bucket<Term> bucket, Term term, int minSimilarity)
            throws IOException {
        final AttributeSource attributes = new AttributeSource();
        final Fields allFields = MultiFields.getFields(reader);
        if (allFields == null) {
            return;
        }
        final org.apache.lucene.index.Terms fieldTerms = allFields.terms(term.field());
        if (fieldTerms == null) {
            return;
        }
        final FuzzyTermsEnum candidates = new FuzzyTermsEnum(fieldTerms, attributes, term, minSimilarity, 0, true);
        final BytesRef target = term.bytes();
        for (BytesRef match = candidates.next(); match != null; match = candidates.next()) {
            // the searched term itself is not reported
            if (target.bytesEquals(match)) {
                continue;
            }
            // copy the bytes so the new term does not share the enumeration's value
            final Term found = new Term(term.field(), BytesRef.deepCopyOf(match));
            bucket.add(found, reader.docFreq(found));
        }
    }

    /**
     * Loads into the list the text of every term of the term's field that starts with the term's bytes.
     *
     * <p>Nothing is added when the reader exposes no fields or when the field of the term has no
     * terms. A term exactly equal to the prefix is included.
     *
     * <p>The automaton built by {@link PrefixQuery#toAutomaton(BytesRef)} works on bytes, so it is
     * compiled as a binary automaton. Compiling it as a Unicode automaton would re-encode its
     * labels to UTF-8 and miss terms with non-ASCII prefix bytes or with characters above U+00FF
     * after the prefix.
     *
     * @param reader  Index reader to use.
     * @param values  The list of values to load.
     * @param term    The term providing the field to enumerate and the prefix to match.
     *
     * @throws IOException If an error is thrown by the prefix term enumeration.
     */
    public static void prefix(IndexReader reader, List<String> values, Term term) throws IOException {
        Fields fields = MultiFields.getFields(reader);
        org.apache.lucene.index.Terms terms = fields == null ? null : fields.terms(term.field());
        if (terms == null) {
            return;
        }
        // isBinary = true: the prefix automaton already has byte labels; it is small and deterministic
        // so no limit on determinized states is needed
        CompiledAutomaton automaton = new CompiledAutomaton(
                PrefixQuery.toAutomaton(term.bytes()), null, true, Integer.MAX_VALUE, true);
        // getTermsEnum also handles an automaton simplified to "match all" (empty prefix),
        // which Terms.intersect rejects
        TermsEnum prefixes = automaton.getTermsEnum(terms);
        BytesRef val;
        while ((val = prefixes.next()) != null) {
            values.add(val.utf8ToString());
        }
    }

    /**
     * Adds to the bucket every term of the term's field that starts with the term's bytes.
     *
     * <p>Each match is added as a new term on the same field, together with the document frequency
     * reported by the reader for that term. Nothing is added when the reader exposes no fields or
     * when the field of the term has no terms. A term exactly equal to the prefix is not added.
     *
     * <p>The automaton built by {@link PrefixQuery#toAutomaton(BytesRef)} works on bytes, so it is
     * compiled as a binary automaton. Compiling it as a Unicode automaton would re-encode its
     * labels to UTF-8 and miss terms with non-ASCII prefix bytes or with characters above U+00FF
     * after the prefix.
     *
     * @param reader  Index reader to use.
     * @param bucket  Where to store the terms.
     * @param term    The term providing the field to enumerate and the prefix to match.
     *
     * @throws IOException If an error is thrown by the prefix term enumeration.
     */
    public static void prefix(IndexReader reader, Bucket<Term> bucket, Term term) throws IOException {
        Fields fields = MultiFields.getFields(reader);
        org.apache.lucene.index.Terms terms = fields == null ? null : fields.terms(term.field());
        if (terms == null) {
            return;
        }
        BytesRef prefix = term.bytes();
        // isBinary = true: the prefix automaton already has byte labels; it is small and deterministic
        // so no limit on determinized states is needed
        CompiledAutomaton automaton = new CompiledAutomaton(
                PrefixQuery.toAutomaton(prefix), null, true, Integer.MAX_VALUE, true);
        // getTermsEnum also handles an automaton simplified to "match all" (empty prefix),
        // which Terms.intersect rejects
        TermsEnum prefixes = automaton.getTermsEnum(terms);
        BytesRef val;
        while ((val = prefixes.next()) != null) {
            // only terms strictly after the prefix are wanted; every match starts with the prefix,
            // so the exact match is the only one to skip
            if (!prefix.bytesEquals(val)) {
                Term t = new Term(term.field(), BytesRef.deepCopyOf(val));
                bucket.add(t, reader.docFreq(t));
            }
        }
    }

    /**
     * Lists the names of the fields exposed by the specified reader.
     *
     * <p>An empty list is returned when the reader exposes no fields.
     *
     * @param reader The index reader
     *
     * @return a new list of field names
     *
     * @throws IOException should any IO error be reported by the {@link MultiFields#getFields(IndexReader)} method.
     */
    @Beta
    public static List<String> fields(IndexReader reader) throws IOException {
        LOGGER.debug("Loading fields");
        final List<String> names = new ArrayList<>();
        final Fields indexed = MultiFields.getFields(reader);
        if (indexed != null) {
            for (final String name : indexed) {
                names.add(name);
            }
        }
        return names;
    }

    /**
     * Returns the list of terms for the specified field.
     *
     * <p>The terms are returned in the order produced by the term enumeration. An empty list is
     * returned when the field has no terms.
     *
     * @param reader The index reader
     * @param field  The field
     *
     * @return the list of terms for this field
     *
     * @throws IOException should any IO error be reported.
     */
    @Beta
    public static List<Term> terms(IndexReader reader, String field) throws IOException {
        LOGGER.debug("Loading terms for field {}", field);
        org.apache.lucene.index.Terms terms = MultiFields.getTerms(reader, field);
        if (terms == null) {
            return Collections.emptyList();
        }
        TermsEnum termsEnum = terms.iterator();
        if (termsEnum == TermsEnum.EMPTY) {
            return Collections.emptyList();
        }
        // Collected in a list rather than a map keyed on the enumerated BytesRef: the enumeration
        // may reuse and overwrite that instance on the next call, which makes it unsafe as a hash key.
        List<Term> termsList = new ArrayList<>();
        for (BytesRef t = termsEnum.next(); t != null; t = termsEnum.next()) {
            // copy the bytes so the term keeps its value once the enumeration moves on
            termsList.add(new Term(field, BytesRef.deepCopyOf(t)));
        }
        return termsList;
    }

    /**
     * Filters the candidate fields, keeping those reported as found in the search results of the query.
     *
     * <p>The query is run once per candidate with a new {@code FieldDocumentChecker} for that field;
     * the field is kept when the checker reports it as found. The order of the candidates is preserved.
     *
     * @param searcher   a searcher on the index desired
     * @param query      the base query
     * @param candidates the list of candidate fields
     *
     * @return a new list of the fields with search results
     *
     * @throws IOException should any IO error be reported when querying the index.
     */
    @Beta
    public static List<String> fields(IndexSearcher searcher, Query query, List<String> candidates)
            throws IOException {
        LOGGER.debug("Loading fields for query {}", query);
        final List<String> matched = new ArrayList<>();
        final Iterator<String> remaining = candidates.iterator();
        while (remaining.hasNext()) {
            final String candidate = remaining.next();
            final FieldDocumentChecker checker = new FieldDocumentChecker(candidate);
            searcher.search(query, checker);
            if (!checker.fieldFound()) {
                continue;
            }
            matched.add(candidate);
        }
        return matched;
    }

    /**
     * Lists the text of every term of the specified field.
     *
     * <p>The values are returned in the order produced by the term enumeration. An empty list is
     * returned when the field has no terms.
     *
     * @param reader The index reader to use
     * @param field  The field
     *
     * @return a new list of the term values for this field
     *
     * @throws IOException should any IO error be reported.
     */
    @Beta
    public static List<String> values(IndexReader reader, String field) throws IOException {
        LOGGER.debug("Loading term values for field {}", field);
        final List<String> texts = new ArrayList<>();
        final org.apache.lucene.index.Terms indexed = MultiFields.getTerms(reader, field);
        if (indexed != null) {
            final TermsEnum cursor = indexed.iterator();
            if (cursor != TermsEnum.EMPTY) {
                while (cursor.next() != null) {
                    final BytesRef current = cursor.term();
                    if (current == null) {
                        break;
                    }
                    texts.add(current.utf8ToString());
                }
            }
        }
        return texts;
    }

    // XML Serialisers ==============================================================================

    /**
     * Writes each term of the list as XML, in list order.
     *
     * <p>Every term is serialised by {@link #toXML(XMLWriter, Term)}.
     *
     * @param xml   The XML writer.
     * @param terms The list of terms to serialise as XML.
     *
     * @throws IOException Any I/O error thrown by the XML writer.
     */
    public static void toXML(XMLWriter xml, List<Term> terms) throws IOException {
        final Iterator<Term> remaining = terms.iterator();
        while (remaining.hasNext()) {
            toXML(xml, remaining.next());
        }
    }

    /**
     * Writes each entry of the bucket as XML.
     *
     * <p>Every entry is serialised by {@link #toXML(XMLWriter, Term, int)} using the entry's item
     * as the term and the entry's count as the frequency.
     *
     * @param xml   The XML writer.
     * @param terms The bucket of terms to serialise as XML.
     *
     * @throws IOException Any I/O error thrown by the XML writer.
     */
    public static void toXML(XMLWriter xml, Bucket<Term> terms) throws IOException {
        for (Entry<Term> entry : terms.entrySet()) {
            final Term item = entry.item();
            toXML(xml, item, entry.count());
        }
    }

    /**
     * Writes a term as a {@code term} element with {@code field} and {@code text} attributes.
     *
     * @param xml The XML writer.
     * @param t   Term to serialise as XML.
     *
     * @throws IOException Any I/O error thrown by the XML writer.
     */
    public static void toXML(XMLWriter xml, Term t) throws IOException {
        xml.openElement("term");
        final String fieldName = t.field();
        xml.attribute("field", fieldName);
        final String termText = t.text();
        xml.attribute("text", termText);
        xml.closeElement();
    }

    /**
     * Writes a term as a {@code term} element with {@code field}, {@code text} and
     * {@code frequency} attributes.
     *
     * @param xml       The XML writer.
     * @param t         Term to serialise as XML.
     * @param frequency The term document frequency.
     *
     * @throws IOException Any I/O error thrown by the XML writer.
     */
    public static void toXML(XMLWriter xml, Term t, int frequency) throws IOException {
        xml.openElement("term");
        final String fieldName = t.field();
        xml.attribute("field", fieldName);
        final String termText = t.text();
        xml.attribute("text", termText);
        xml.attribute("frequency", frequency);
        xml.closeElement();
    }

}