de.unihildesheim.iw.lucene.index.FDRIndexDataProvider.java Source code

Introduction

Here is the source code for de.unihildesheim.iw.lucene.index.FDRIndexDataProvider.java
Source

/*
 * Copyright (C) 2015 Jens Bertram (code@jens-bertram.net)
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

package de.unihildesheim.iw.lucene.index;

import de.unihildesheim.iw.Buildable;
import de.unihildesheim.iw.lucene.document.DocumentModel;
import de.unihildesheim.iw.lucene.util.BytesRefUtils.MergingBytesRefHash;
import de.unihildesheim.iw.lucene.util.StreamUtils;
import edu.umd.cs.findbugs.annotations.SuppressFBWarnings;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.MultiTerms;
import org.apache.lucene.index.ReaderSlice;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefHash;
import org.apache.lucene.util.FixedBitSet;
import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.stream.IntStream;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;

/**
 * @author Jens Bertram (code@jens-bertram.net)
 */
public final class FDRIndexDataProvider implements IndexDataProvider {
    /**
     * Logger instance for this class.
     */
    private static final Logger LOG = LoggerFactory.getLogger(FDRIndexDataProvider.class);
    /**
     * Object wrapping Lucene index information.
     */
    private final LuceneIndex index;
    /**
     * Maximum number of entries kept in the document-model LRU cache.
     */
    private static final int CACHE_DOCMOD_SIZE = 10000;
    /**
     * LRU cache of document models, keyed by document id.
     */
    private final Map<Integer, DocumentModel> cache_docmod = newLruCache(CACHE_DOCMOD_SIZE);
    /**
     * Maximum number of entries kept in the term-frequency LRU cache.
     */
    private static final int CACHE_TF_SIZE = 10000;
    /**
     * LRU cache of term-frequency values, keyed by term.
     */
    private final Map<BytesRef, Long> cache_tf = newLruCache(CACHE_TF_SIZE);
    /**
     * Maximum number of entries kept in the document-frequency LRU cache.
     */
    private static final int CACHE_DF_SIZE = 10000;
    /**
     * LRU cache of document-frequency values, keyed by term.
     */
    private final Map<BytesRef, Integer> cache_df = newLruCache(CACHE_DF_SIZE);

    /**
     * Creates a synchronized, access-ordered map that evicts its eldest entry
     * as soon as it holds more than {@code maxEntries} entries.
     *
     * @param maxEntries Maximum number of entries to keep
     * @param <K> Key type
     * @param <V> Value type
     * @return Empty synchronized LRU map
     */
    @SuppressWarnings({ "AnonymousInnerClassMayBeStatic", "CloneableClassWithoutClone", "serial" })
    private static <K, V> Map<K, V> newLruCache(final int maxEntries) {
        // capacity is one above the limit, because the new entry is inserted
        // before the eldest one gets removed
        return Collections.synchronizedMap(new LinkedHashMap<K, V>(maxEntries + 1, .75F, true) {
            @Override
            protected boolean removeEldestEntry(final Map.Entry<K, V> eldest) {
                return size() > maxEntries;
            }
        });
    }

    /**
     * Create instance by using {@link Builder}.
     * <p>
     * Gathers all base data from the index, so construction cost depends on
     * the size of the index.
     *
     * @param builder Builder instance
     * @throws IOException Thrown on low-level I/O-errors
     */
    @SuppressWarnings("WeakerAccess")
    FDRIndexDataProvider(final Builder builder) throws IOException {
        // first initialize the Lucene index
        assert builder.idxReader != null;

        LOG.info("Initializing index & gathering base data..");
        this.index = new LuceneIndex(builder.idxReader);

        // Only report values that are already collected. Looking up a
        // hard-coded sample term here would scan the index and seed the cache.
        LOG.debug("index.TTF {} index.UT {}", this.index.ttf, this.index.uniqueTerms);
    }

    /**
     * Get the frequency of all terms in the index, as collected when this
     * provider was created.
     *
     * @return Summed total term frequency over all document fields
     */
    @Override
    public long getTermFrequency() {
        final long totalTermFrequency = this.index.ttf;
        return totalTermFrequency;
    }

    /**
     * Get the frequency of a single term, summed over all fields of all
     * sub-readers that contain documents. Results are kept in an LRU cache.
     *
     * @param term Term to look up
     * @return Summed total term frequency of the term
     * @throws UncheckedIOException Wraps low-level I/O-errors
     */
    @SuppressFBWarnings("EXS_EXCEPTION_SOFTENING_NO_CONSTRAINTS")
    @Override
    public long getTermFrequency(@NotNull final BytesRef term) {
        // a cached value short-cuts the index lookup
        @Nullable
        final Long cached = this.cache_tf.get(term);
        if (cached != null) {
            return cached;
        }

        long sum = 0L;
        for (final LeafReaderContext ctx : this.index.reader.leaves()) {
            final LeafReader leaf = ctx.reader();
            if (leaf.numDocs() <= 0) {
                // nothing visible in this sub-reader
                continue;
            }
            try {
                for (final String fieldName : leaf.fields()) {
                    @Nullable
                    final Terms fieldTerms = leaf.terms(fieldName);
                    if (fieldTerms == null) {
                        continue;
                    }
                    final TermsEnum te = fieldTerms.iterator(null);
                    if (te.seekExact(term)) {
                        sum += te.totalTermFreq();
                    }
                }
            } catch (final IOException e) {
                throw new UncheckedIOException(e);
            }
        }

        // store a private copy of the term as the cache key
        this.cache_tf.put(BytesRef.deepCopyOf(term), sum);
        return sum;
    }

    /**
     * Get the frequency of a term relative to the frequency of all terms in
     * the index.
     *
     * @param term Term to look up
     * @return Relative term frequency, or {@code 0} if the term frequency is
     * zero
     */
    @Override
    public double getRelativeTermFrequency(@NotNull final BytesRef term) {
        final long termFrequency = getTermFrequency(term);
        if (termFrequency == 0L) {
            return 0d;
        }
        return (double) termFrequency / (double) this.index.ttf;
    }

    /**
     * Get the document frequency of a term. For each sub-reader containing
     * documents the highest per-field document frequency is taken; these
     * values are summed over all sub-readers. Results are kept in an LRU
     * cache.
     *
     * @param term Term to look up
     * @return Document frequency of the term
     * @throws UncheckedIOException Wraps low-level I/O-errors
     */
    @SuppressFBWarnings("EXS_EXCEPTION_SOFTENING_NO_CONSTRAINTS")
    @Override
    public int getDocumentFrequency(@NotNull final BytesRef term) {
        final Integer cached = this.cache_df.get(term);
        if (cached != null) {
            return cached;
        }

        int total = 0;
        for (final LeafReaderContext ctx : this.index.reader.leaves()) {
            final LeafReader leaf = ctx.reader();
            if (leaf.numDocs() <= 0) {
                continue;
            }
            // highest document frequency among all fields of this sub-reader
            int leafMax = 0;
            try {
                for (final String fieldName : leaf.fields()) {
                    @Nullable
                    final Terms fieldTerms = leaf.terms(fieldName);
                    if (fieldTerms == null) {
                        continue;
                    }
                    final TermsEnum te = fieldTerms.iterator(null);
                    if (te.seekExact(term)) {
                        leafMax = Math.max(leafMax, te.docFreq());
                    }
                }
            } catch (final IOException e) {
                throw new UncheckedIOException(e);
            }
            total += leafMax;
        }

        // store a private copy of the term as the cache key
        this.cache_df.put(BytesRef.deepCopyOf(term), total);
        return total;
    }

    /**
     * Get the document frequency of a term relative to the number of visible
     * documents.
     *
     * @param term Term to look up
     * @return Relative document frequency, or {@code 0} if there are no
     * documents
     */
    @Override
    public double getRelativeDocumentFrequency(final BytesRef term) {
        final int documents = this.index.docCount;
        if (documents == 0) {
            // avoid 0/0 = NaN; mirrors the zero guard of getRelativeTermFrequency
            return 0d;
        }
        return (double) getDocumentFrequency(term) / (double) documents;
    }

    /**
     * Get the ids of all documents visible through this provider.
     *
     * @return Stream over the document ids flagged in the document bit set
     */
    @Override
    @NotNull
    public IntStream getDocumentIds() {
        final FixedBitSet visibleDocs = this.index.docIds;
        return StreamUtils.stream(visibleDocs);
    }

    /**
     * Get the model for a single document. Models are created on first
     * request from the terms of the document and kept in an LRU cache.
     *
     * @param docId Document id
     * @return Document model for the given id
     */
    @SuppressFBWarnings("RCN_REDUNDANT_NULLCHECK_OF_NONNULL_VALUE")
    @NotNull
    @Override
    public DocumentModel getDocumentModel(final int docId) {
        final DocumentModel cached = this.cache_docmod.get(docId);
        if (cached != null) {
            return cached;
        }

        // cache miss: build the model from the document's terms
        final Map<BytesRef, Long> termFrequencies = getDocumentTerms(docId);
        final DocumentModel created = new DocumentModel.Builder(docId).setTermFrequency(termFrequencies).build();
        this.cache_docmod.put(docId, created);
        return created;
    }

    /**
     * Get a mapping of all terms in a specific document to their
     * within-document frequency. Frequencies of a term occurring in several
     * fields are summed up.
     * <p>
     * Fields without term vectors are skipped with a warning.
     *
     * @param docId Document id
     * @return Mapping of term to (within document) term frequency value
     * @throws UncheckedIOException Wraps low-level I/O-errors
     */
    private Map<BytesRef, Long> getDocumentTerms(final int docId) {
        final Map<BytesRef, Long> termFreqs = new HashMap<>();
        for (final String fieldName : this.index.fields) {
            try {
                @Nullable
                final Terms terms = this.index.reader.getTermVector(docId, fieldName);
                if (terms == null) {
                    LOG.warn("No Term Vectors for field {} in document {}.", fieldName, docId);
                    if (LOG.isDebugEnabled()) {
                        LOG.debug("Field exists? {}", this.index.reader.document(docId).getField(fieldName) != null);
                    }
                    continue;
                }

                final TermsEnum termsEnum = terms.iterator(null);
                BytesRef term;
                while ((term = termsEnum.next()) != null) {
                    // A term vector is a single-document index, so the total
                    // term frequency is the frequency inside this document.
                    // Counting enum entries would only count the fields
                    // containing the term. Fall back to 1 if no frequency is
                    // reported.
                    final long freq = Math.max(1L, termsEnum.totalTermFreq());
                    // the enum may reuse the returned instance, so store a copy
                    termFreqs.merge(BytesRef.deepCopyOf(term), freq, Long::sum);
                }
            } catch (final IOException e) {
                throw new UncheckedIOException(e);
            }
        }
        return termFreqs;
    }

    /**
     * Check if a document with the given id is visible through this
     * provider.
     *
     * @param docId Document id
     * @return {@code true} if the id is flagged in the document bit set,
     * {@code false} otherwise (including ids outside the range of the set)
     */
    @Override
    public boolean hasDocument(final int docId) {
        // reject out-of-range ids (negative ones, too) before touching the bit set
        return docId >= 0 && docId < this.index.docIds.length() && this.index.docIds.get(docId);
    }

    /**
     * Get the terms stored in the term vectors of a document, restricted to
     * the given fields.
     *
     * @param docId Document id
     * @param field Names of the fields to collect terms from. The passed
     * array is not modified.
     * @return Stream of terms collected from the requested fields; empty if
     * the document has no term vectors
     * @throws UncheckedIOException Wraps low-level I/O-errors
     */
    @SuppressFBWarnings({ "EXS_EXCEPTION_SOFTENING_NO_CONSTRAINTS", "EXS_EXCEPTION_SOFTENING_NO_CHECKED" })
    @Override
    public Stream<BytesRef> getDocumentTerms(final int docId, @NotNull final String... field) {
        // binary search needs sorted input; sort a copy, so the array owned by
        // the caller keeps its order
        final String[] sortedFields = field.clone();
        Arrays.sort(sortedFields);

        final Fields fields;
        try {
            fields = this.index.reader.getTermVectors(docId);
        } catch (final IOException e) {
            throw new UncheckedIOException(e);
        }

        if (fields == null) {
            return Stream.empty();
        }

        final BytesRefHash terms = new BytesRefHash();
        try {
            for (final String fieldName : fields) {
                // filter for required fields
                if (Arrays.binarySearch(sortedFields, fieldName) < 0) {
                    continue;
                }
                @Nullable
                final Terms fieldTerms = fields.terms(fieldName);
                if (fieldTerms == null) {
                    continue;
                }
                final TermsEnum te = fieldTerms.iterator(null);
                BytesRef term;
                while ((term = te.next()) != null) {
                    terms.add(term);
                }
            }
        } catch (final IOException e) {
            throw new UncheckedIOException(e);
        }

        return StreamUtils.stream(terms);
    }

    /**
     * Get the terms stored in the term vectors of all given documents. The
     * terms of each document are gathered from all of its fields and merged
     * into a single {@link MergingBytesRefHash}. Documents without term
     * vectors are skipped.
     *
     * @param docIds Ids of the documents to collect terms from
     * @return Stream over the merged terms
     * @throws UncheckedIOException Wraps low-level I/O-errors
     */
    @SuppressFBWarnings("EXS_EXCEPTION_SOFTENING_NO_CONSTRAINTS")
    @Override
    @NotNull
    public Stream<BytesRef> getDocumentsTerms(@NotNull final DocIdSet docIds) {
        try {
            return StreamUtils.stream(docIds).mapToObj(id -> {
                try {
                    return this.index.reader.getTermVectors(id);
                } catch (final IOException e) {
                    throw new UncheckedIOException(e);
                }
            }).filter(vectors -> vectors != null).map(vectors -> {
                // gather the terms of every field of a single document
                final BytesRefHash docTerms = new BytesRefHash();
                try {
                    for (final String fieldName : vectors) {
                        final Terms fieldTerms = vectors.terms(fieldName);
                        if (fieldTerms == null) {
                            continue;
                        }
                        final TermsEnum te = fieldTerms.iterator(null);
                        BytesRef current;
                        while ((current = te.next()) != null) {
                            docTerms.add(current);
                        }
                    }
                } catch (final IOException e) {
                    throw new UncheckedIOException(e);
                }
                return docTerms;
            }).collect(MergingBytesRefHash::new, MergingBytesRefHash::addAll, MergingBytesRefHash::addAll).stream();
        } catch (final IOException e) {
            throw new UncheckedIOException(e);
        }
    }

    /**
     * Get the number of documents visible through this provider.
     *
     * @return Number of visible documents
     */
    @Override
    public long getDocumentCount() {
        // int widens to long implicitly
        return this.index.docCount;
    }

    /**
     * Get the names of all document fields known to this provider.
     *
     * @return Copy of the field names; changes to the returned array do not
     * affect this provider
     */
    @Override
    @NotNull
    public String[] getDocumentFields() {
        // hand out a copy, so callers cannot reorder or overwrite internal state
        return this.index.fields.clone();
    }

    /**
     * Information about the provided Lucene index. All values are gathered
     * once by the constructor.
     */
    private static final class LuceneIndex {
        /**
         * Logger instance for this class.
         */
        private static final Logger LOG = LoggerFactory.getLogger(LuceneIndex.class);
        /**
         * {@link IndexReader} to access the Lucene index.
         */
        final FilteredDirectoryReader reader;
        /**
         * Bit set flagging the document-ids of all visible documents.
         */
        final FixedBitSet docIds;
        /**
         * Number of documents visible.
         */
        final int docCount;
        /**
         * Frequency of all terms in index.
         */
        final long ttf;
        /**
         * Number of unique terms in index (respects active fields).
         */
        final long uniqueTerms;
        /**
         * Document fields.
         */
        final String[] fields;
        /**
         * Constant value if no document fields are available.
         */
        private static final String[] NO_FIELDS = new String[0];

        /**
         * Initialize the index information store.
         *
         * @param r IndexReader
         * @throws IOException Thrown on low-level I/O-errors
         * @throws IllegalStateException Thrown, if any field lacks term vectors
         */
        @SuppressFBWarnings("EXS_EXCEPTION_SOFTENING_NO_CONSTRAINTS")
        LuceneIndex(@NotNull final FilteredDirectoryReader r) throws IOException {
            this.reader = r;

            // collect the ids of all documents in the index
            if (LOG.isDebugEnabled()) {
                LOG.debug("Estimating index size");
            }
            final int numDocs = this.reader.numDocs();

            LOG.info("Collecting all ({}) documents from index.", numDocs);
            this.docIds = collectDocIds(this.reader, numDocs);
            this.docCount = this.docIds.cardinality();
            if (LOG.isDebugEnabled()) {
                LOG.debug("DocIds c={}", this.docCount);
            }
            // both counts should be equal, since there are no deletions
            assert this.docCount == numDocs;

            // collect summed total term frequency of all terms in the index
            LOG.info("Collecting term counts (TTF)");
            @Nullable
            final Fields fields = MultiFields.getFields(this.reader);
            if (fields == null) {
                LOG.warn("Reader does not contain any postings.");
                this.ttf = 0L;
                this.fields = NO_FIELDS;
            } else {
                this.fields = StreamSupport.stream(fields.spliterator(), false).toArray(String[]::new);
                this.ttf = Arrays.stream(this.fields).mapToLong(f -> {
                    try {
                        return this.reader.getSumTotalTermFreq(f);
                    } catch (final IOException e) {
                        throw new UncheckedIOException(e);
                    }
                }).sum();
            }

            // check for TermVectors
            if (this.fields.length > 0) {
                final boolean termVectorsMissing = this.reader.getContext().leaves().stream()
                        .filter(arc -> arc.reader().getFieldInfos().size() > 0)
                        .flatMap(arc -> StreamSupport.stream(arc.reader().getFieldInfos().spliterator(), false))
                        .anyMatch(fi -> {
                            if (!fi.hasVectors()) {
                                LOG.error("TermVector missing. f={}", fi.name);
                                return true;
                            }
                            return false;
                        });
                if (termVectorsMissing) {
                    throw new IllegalStateException("TermVectors are not present for all fields.");
                }
            }

            // collect all unique terms from the index
            LOG.info("Collecting term counts (unique terms)");
            if (this.fields.length == 0) {
                // still no postings
                this.uniqueTerms = 0L;
            } else {
                // gather sub-reader which have documents
                final LeafReaderContext[] leaves = this.reader.leaves().stream()
                        .filter(lrc -> lrc.reader().numDocs() > 0).toArray(LeafReaderContext[]::new);

                // collect slices for all sub-readers
                final ReaderSlice[] slices = IntStream.range(0, leaves.length)
                        // create slice for each sub-reader
                        .mapToObj(i -> new ReaderSlice(leaves[i].docBase, leaves[i].reader().maxDoc(), i))
                        .toArray(ReaderSlice[]::new);

                // iterate all terms in all fields in all sub-readers
                this.uniqueTerms = Arrays.stream(this.fields)
                        // collect terms instances from all sub-readers
                        .map(f -> IntStream.range(0, leaves.length).mapToObj(i -> {
                            try {
                                return leaves[i].reader().terms(f);
                            } catch (final IOException e) {
                                throw new UncheckedIOException(e);
                            }
                        })
                                // exclude empty terms
                                .filter(t -> t != null).toArray(Terms[]::new))
                        // A field without terms in any sub-reader contributes
                        // nothing. Skip it instead of building a MultiTerms
                        // over zero subs (presumably rejected by Lucene; verify).
                        .filter(t -> t.length > 0)
                        .flatMap(t -> {
                            try {
                                final MultiTerms mTerms = new MultiTerms(t, slices);
                                return StreamUtils.stream(mTerms.iterator(null));
                            } catch (final IOException e) {
                                throw new UncheckedIOException(e);
                            }
                        }).distinct().count();
            }
        }

        /**
         * Collects the ids of all documents matched by a match-all query.
         *
         * @param reader Reader to collect the documents from
         * @param numDocs Number of documents the reader reports
         * @return Bit set with a bit set for every collected document id; an
         * empty set, if there are no documents
         * @throws IOException Thrown on low-level I/O-errors
         */
        private static FixedBitSet collectDocIds(@NotNull final FilteredDirectoryReader reader, final int numDocs)
                throws IOException {
            if (numDocs == 0) {
                // nothing to collect, so skip the search and return an empty set
                return new FixedBitSet(0);
            }

            final Query q = new MatchAllDocsQuery();
            final IndexSearcher searcher = IndexUtils.getSearcher(reader);
            final TopDocs matches = searcher.search(q, numDocs);

            if (LOG.isDebugEnabled()) {
                LOG.debug("Query returned {} matching documents.", matches.totalHits);
            }
            final int[] ids = Arrays.stream(matches.scoreDocs).mapToInt(sd -> sd.doc).sorted().toArray();
            // The highest id defines the size of the bit set. An empty result
            // has no highest id, and indexing it would fail.
            final FixedBitSet bits = new FixedBitSet(ids.length == 0 ? 0 : ids[ids.length - 1] + 1);
            Arrays.stream(ids).forEach(bits::set);
            return bits;
        }
    }

    /**
     * Builder for creating a new {@link FDRIndexDataProvider}.
     */
    @SuppressWarnings("PublicInnerClass")
    public static final class Builder implements Buildable<FDRIndexDataProvider> {
        /**
         * {@link FilteredDirectoryReader} to use for accessing the Lucene index.
         * Stays {@code null} until {@link #indexReader(FilteredDirectoryReader)}
         * gets called.
         */
        @Nullable
        FilteredDirectoryReader idxReader;

        /**
         * Set the {@link FilteredDirectoryReader} to use.
         *
         * @param reader {@link FilteredDirectoryReader} instance
         * @return Self reference
         */
        public Builder indexReader(@NotNull final FilteredDirectoryReader reader) {
            this.idxReader = reader;
            return this;
        }

        /**
         * Create a new {@link FDRIndexDataProvider} instance. The configuration
         * gets validated first.
         *
         * @return new {@link FDRIndexDataProvider} instance
         * @throws BuildException Thrown on any error during construction
         * @throws ConfigurationException Thrown on configuration errors
         */
        @NotNull
        @Override
        public FDRIndexDataProvider build() throws BuildException, ConfigurationException {
            validate();
            try {
                return new FDRIndexDataProvider(this);
            } catch (final IOException e) {
                throw new BuildException("Failed to create instance.", e);
            }
        }

        /**
         * Check that a reader was set.
         *
         * @throws ConfigurationException Thrown, if no reader was set
         */
        @Override
        public void validate() throws ConfigurationException {
            // The field is statically typed as FilteredDirectoryReader, so a
            // null check is the only validation that can fail.
            if (this.idxReader == null) {
                throw new ConfigurationException("IndexReader not set.");
            }
        }
    }
}