org.voyanttools.trombone.input.index.LuceneIndexer.java Source code

Introduction

Here is the source code for org.voyanttools.trombone.input.index.LuceneIndexer.java

Source

/*******************************************************************************
 * Trombone is a flexible text processing and analysis library used
 * primarily by Voyant Tools (voyant-tools.org).
 * 
 * Copyright (c) 2007-2012 Stéfan Sinclair & Geoffrey Rockwell
 * 
 * This file is part of Trombone.
 * 
 * Trombone is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * 
 * Trombone is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with Trombone.  If not, see <http://www.gnu.org/licenses/>.
 ******************************************************************************/
package org.voyanttools.trombone.input.index;

import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import java.text.NumberFormat;
import java.time.Instant;
import java.time.temporal.ChronoUnit;
import java.time.temporal.Temporal;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Collection;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.math3.stat.descriptive.DescriptiveStatistics;
import org.apache.lucene.LucenePackage;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.facet.FacetsConfig;
import org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetField;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.SlowCompositeReaderWrapper;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.util.BytesRef;
import org.voyanttools.trombone.input.source.InputSource;
import org.voyanttools.trombone.input.source.InputStreamInputSource;
import org.voyanttools.trombone.lucene.LuceneManager;
import org.voyanttools.trombone.model.DocumentMetadata;
import org.voyanttools.trombone.model.StoredDocumentSource;
import org.voyanttools.trombone.model.TokenType;
import org.voyanttools.trombone.storage.Storage;
import org.voyanttools.trombone.storage.StoredDocumentSourceStorage;
import org.voyanttools.trombone.util.FlexibleParameters;
import org.voyanttools.trombone.util.TextUtils;

/**
 * @author sgs
 *
 */
public class LuceneIndexer implements Indexer {

    private static final int VERSION = 4; // helps give each indexed document a unique
    // version based not only on the Lucene version but also on this code; the actual
    // number doesn't matter, it is simply incremented whenever uniqueness is needed

    private Storage storage;
    private FlexibleParameters parameters;

    public LuceneIndexer(Storage storage, FlexibleParameters parameters) {
        this.storage = storage;
        this.parameters = parameters;
    }

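    /**
     * Indexes the provided stored document sources and returns the corpus id.
     * If tokenization or language parameters are set, derivative stored document
     * sources are created first; the list of document ids is stored and its storage
     * id becomes the corpus id; documents whose ids are already present in the
     * Lucene directory for this corpus are skipped.
     */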
    public String index(List<StoredDocumentSource> storedDocumentSources) throws IOException {

        // let's check if we need to create new sources because of tokenization parameters
        if (parameters.getParameterValue("tokenization", "").isEmpty() == false
                || parameters.getParameterValue("language", "").isEmpty() == false) {
            StoredDocumentSourceStorage sourceDocumentSourceStorage = storage.getStoredDocumentSourceStorage();
            String tokenizationParam = parameters.getParameterValue("tokenization", "");
            String langParam = parameters.getParameterValue("language", "");
            for (int i = 0, len = storedDocumentSources.size(); i < len; i++) {
                StoredDocumentSource storedDocumentSource = storedDocumentSources.get(i);
                String id = storedDocumentSource.getId();
                String newId = DigestUtils.md5Hex(id + tokenizationParam + langParam);
                InputStream inputStream = sourceDocumentSourceStorage.getStoredDocumentSourceInputStream(id);
                DocumentMetadata metadata = storedDocumentSource.getMetadata();
                metadata.setLastTokenPositionIndex(TokenType.lexical, 0); // this is crucial to ensure that the document is re-analyzed and its metadata re-written
                InputSource inputSource = new InputStreamInputSource(newId, metadata, inputStream);
                storedDocumentSources.set(i, sourceDocumentSourceStorage.getStoredDocumentSource(inputSource));
                inputStream.close();
            }
        }

        List<String> ids = new ArrayList<String>();
        for (StoredDocumentSource storedDocumentSource : storedDocumentSources) {
            ids.add(storedDocumentSource.getId());
        }
        String corpusId = storage.storeStrings(ids, Storage.Location.object);

        // determine if we need to modify the Lucene index
        Collection<StoredDocumentSource> storedDocumentSourceForLucene = new ArrayList<StoredDocumentSource>();
        if (storage.getLuceneManager().directoryExists(corpusId)) {
            LeafReader reader = SlowCompositeReaderWrapper
                    .wrap(storage.getLuceneManager().getDirectoryReader(corpusId));
            Terms terms = reader.terms("id");
            if (terms == null) {
                storedDocumentSourceForLucene.addAll(storedDocumentSources);
            } else {
                TermsEnum termsEnum = terms.iterator();
                for (StoredDocumentSource storedDocumentSource : storedDocumentSources) {
                    String id = storedDocumentSource.getId();
                    if (!termsEnum.seekExact(new BytesRef(id))) {
                        storedDocumentSourceForLucene.add(storedDocumentSource);
                    }
                }
            }
        } else {
            storedDocumentSourceForLucene.addAll(storedDocumentSources);
        }

        if (storedDocumentSourceForLucene.isEmpty() == false) {
            //         Temporal start = Instant.now();
            //         System.out.println(start);
            //         if (parameters.getParameterBooleanValue("stream")) {
            //            System.out.println("stream");
            //            indexStream(storedDocumentSourceForLucene, corpusId);
            //         } else {
            //            System.out.println("executor");
            indexExecutorService(storedDocumentSourceForLucene, corpusId);
            //         }
            //         Temporal end = Instant.now();
            //         ChronoUnit.SECONDS.between(start, end);
            //         System.out.println(ChronoUnit.SECONDS.between(start, end));
        }

        return corpusId;

    }

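    /**
     * Parallel-stream variant of the two-phase indexing. Currently not invoked by
     * {@link #index(List)}, which uses {@link #indexExecutorService(Collection, String)}
     * instead (see the commented-out "stream" branch above).
     */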
    private void indexStream(Collection<StoredDocumentSource> storedDocumentSourceForLucene, String corpusId)
            throws CorruptIndexException, LockObtainFailedException, IOException {
        // index the documents (or at least add the corpus to a document if it's not already there); we need to get a new writer
        IndexWriter indexWriter = storage.getLuceneManager().getIndexWriter(corpusId);
        DirectoryReader indexReader = DirectoryReader.open(indexWriter);
        IndexSearcher indexSearcher = new IndexSearcher(indexReader);
        boolean verbose = parameters.getParameterBooleanValue("verbose");
        try {
            storedDocumentSourceForLucene.parallelStream().forEach(storedDocumentSource -> {
                Runnable runnable;
                try {
                    runnable = new StoredDocumentSourceIndexer(storage, indexWriter, indexSearcher,
                            storedDocumentSource, corpusId, verbose);
                    runnable.run();
                } catch (IOException e) {
                    // TODO Auto-generated catch block
                    e.printStackTrace();
                }
            });
        } catch (Exception e) {
            e.printStackTrace();
        }

        if (parameters.containsKey("forceMerge")) {
            indexWriter.forceMerge(parameters.getParameterIntValue("forceMerge"));
        }

        indexReader = DirectoryReader.open(indexWriter);
        storage.getLuceneManager().setDirectoryReader(corpusId, indexReader); // make sure it's available afterwards            

        // now determine which documents need to be analyzed
        Collection<StoredDocumentSource> storedDocumentSourceForAnalysis = new ArrayList<StoredDocumentSource>();
        for (StoredDocumentSource storedDocumentSource : storedDocumentSourceForLucene) {
            if (storedDocumentSource.getMetadata().getLastTokenPositionIndex(TokenType.lexical) == 0) { // don't re-analyze
                storedDocumentSourceForAnalysis.add(storedDocumentSource);
            }
        }

        if (storedDocumentSourceForAnalysis.isEmpty() == false) {
            IndexSearcher indexSearcher2 = new IndexSearcher(indexReader);
            try {
                storedDocumentSourceForAnalysis.parallelStream().forEach(storedDocumentSource -> {
                    if (storedDocumentSource.getMetadata().getLastTokenPositionIndex(TokenType.lexical) == 0) { // don't re-analyze
                        Runnable worker;
                        try {
                            worker = new IndexedDocumentAnalyzer(storage, indexSearcher2, storedDocumentSource,
                                    corpusId, verbose);
                            worker.run();
                        } catch (IOException e) {
                            // TODO Auto-generated catch block
                            e.printStackTrace();
                        }
                    }
                });
            } catch (Exception e) {
                e.printStackTrace();
            }
        }

    }

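    /**
     * Indexes documents in two phases using fixed thread pools: first each stored
     * document source is added to the Lucene index (skipping ids already present),
     * then every document that has not yet been analyzed (last token position of 0)
     * has its term vector statistics computed and written back to its metadata.
     */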
    private void indexExecutorService(Collection<StoredDocumentSource> storedDocumentSourceForLucene,
            String corpusId) throws CorruptIndexException, LockObtainFailedException, IOException {
        // index the documents (or at least add the corpus to a document if it's not already there); we need to get a new writer
        IndexWriter indexWriter = storage.getLuceneManager().getIndexWriter(corpusId);
        DirectoryReader indexReader = DirectoryReader.open(indexWriter);
        IndexSearcher indexSearcher = new IndexSearcher(indexReader);
        boolean verbose = parameters.getParameterBooleanValue("verbose");
        int processors = Runtime.getRuntime().availableProcessors();
        ExecutorService executor;

        // index
        executor = Executors.newFixedThreadPool(processors);
        for (StoredDocumentSource storedDocumentSource : storedDocumentSourceForLucene) {
            Runnable worker = new StoredDocumentSourceIndexer(storage, indexWriter, indexSearcher,
                    storedDocumentSource, corpusId, verbose);
            try {
                executor.execute(worker);
            } catch (Exception e) {
                executor.shutdown();
                throw e;
            }
        }
        executor.shutdown();
        try {
            if (!executor.awaitTermination(parameters.getParameterIntValue("luceneIndexingTimeout", 60 * 10),
                    TimeUnit.SECONDS)) { // default 10 minutes
                executor.shutdownNow();
                throw new InterruptedException("Lucene indexing has run out of time.");
            }
        } catch (InterruptedException e) {
            executor.shutdownNow();
            Thread.currentThread().interrupt();
            throw new RuntimeException("Lucene indexing has been interrupted.", e);
        } finally {

            try {
                indexWriter.commit();
            } catch (IOException e) {
                indexWriter.close(); // this may also throw an exception, but docs say to close on commit error
                throw e;
            }
        }

        if (parameters.containsKey("forceMerge")) {
            indexWriter.forceMerge(parameters.getParameterIntValue("forceMerge"));
        }

        indexReader = DirectoryReader.open(indexWriter);
        storage.getLuceneManager().setDirectoryReader(corpusId, indexReader); // make sure it's available afterwards            

        // now determine which documents need to be analyzed
        Collection<StoredDocumentSource> storedDocumentSourceForAnalysis = new ArrayList<StoredDocumentSource>();
        for (StoredDocumentSource storedDocumentSource : storedDocumentSourceForLucene) {
            if (storedDocumentSource.getMetadata().getLastTokenPositionIndex(TokenType.lexical) == 0) { // don't re-analyze
                storedDocumentSourceForAnalysis.add(storedDocumentSource);
            }
        }

        if (storedDocumentSourceForAnalysis.isEmpty() == false) {
            indexSearcher = new IndexSearcher(indexReader);
            executor = Executors.newFixedThreadPool(processors);
            for (StoredDocumentSource storedDocumentSource : storedDocumentSourceForAnalysis) {
                if (storedDocumentSource.getMetadata().getLastTokenPositionIndex(TokenType.lexical) == 0) { // don't re-analyze
                    Runnable worker = new IndexedDocumentAnalyzer(storage, indexSearcher, storedDocumentSource,
                            corpusId, verbose);
                    executor.execute(worker);
                }
            }
            executor.shutdown();
            try {
                if (!executor.awaitTermination(parameters.getParameterIntValue("luceneAnalysisTimeout", 60 * 10),
                        TimeUnit.SECONDS)) { // default 10 minutes
                    throw new InterruptedException("Lucene analysis has run out of time.");
                }
            } catch (InterruptedException e) {
                throw new RuntimeException("Lucene document analysis run out of time", e);
            }
        }

    }

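    /**
     * Walks the "lexical" term vector of an indexed document to compute token and
     * type counts, the frequency mean and standard deviation, and the last token
     * position and offset, then updates the stored document source metadata.
     */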
    private class IndexedDocumentAnalyzer implements Runnable {

        private Storage storage;
        private StoredDocumentSource storedDocumentSource;
        private IndexReader indexReader;
        private IndexSearcher indexSearcher;
        private String corpusId;
        private String id;
        private boolean verbose;

        public IndexedDocumentAnalyzer(Storage storage, IndexSearcher indexSearcher,
                StoredDocumentSource storedDocumentSource, String corpusId, boolean verbose) throws IOException {
            this.storage = storage;
            this.indexReader = indexSearcher.getIndexReader();
            this.indexSearcher = indexSearcher;
            this.storedDocumentSource = storedDocumentSource;
            this.corpusId = corpusId;
            this.id = storedDocumentSource.getId();
            this.verbose = verbose;
        }

        @Override
        public void run() {

            if (verbose) {
                //            System.out.println("analyzing indexed document "+storedDocumentSource.getMetadata());
            }

            Query query = new TermQuery(new Term("id", id));
            TopDocs topDocs;

            try {
                topDocs = indexSearcher.search(query, 1); // there may be multiple documents in the index but they should have the same text
                int docId = topDocs.scoreDocs[0].doc;
                Terms terms = indexReader.getTermVector(docId, "lexical");
                int totalTokens = 0;
                int totalTypes = 0;
                int lastOffset = 0;
                int lastPosition = 0;
                DescriptiveStatistics stats = new DescriptiveStatistics();
                if (terms != null) {
                    TermsEnum termsEnum = terms.iterator();
                    while (true) {
                        BytesRef term = termsEnum.next();
                        if (term != null) {
                            totalTypes++;
                            PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.OFFSETS);
                            while (true) {
                                int doc = postingsEnum.nextDoc();
                                if (doc != PostingsEnum.NO_MORE_DOCS) {
                                    int freq = postingsEnum.freq();
                                    stats.addValue(freq);
                                    totalTokens += freq;
                                    for (int i = 0; i < freq; i++) {
                                        int pos = postingsEnum.nextPosition();
                                        if (pos > lastPosition) {
                                            lastPosition = pos;
                                        }
                                        int offset = postingsEnum.startOffset();
                                        if (offset > lastOffset) {
                                            lastOffset = offset;
                                        }
                                    }
                                } else {
                                    break;
                                }
                            }
                        } else {
                            break;
                        }
                    }
                }
                DocumentMetadata metadata = storedDocumentSource.getMetadata();
                metadata.setTypesCount(TokenType.lexical, totalTypes);
                metadata.setTokensCount(TokenType.lexical, totalTokens);
                metadata.setTypesCountMean(TokenType.lexical, (float) stats.getMean());
                metadata.setTypesCountStdDev(TokenType.lexical, (float) stats.getStandardDeviation());
                metadata.setLastTokenPositionIndex(TokenType.lexical, lastPosition);
                metadata.setLastTokenOffsetIndex(TokenType.lexical, lastOffset);
                storage.getStoredDocumentSourceStorage().updateStoredDocumentSourceMetadata(id, metadata);
            } catch (IOException e) {
                throw new RuntimeException("Unable to query document during index analysis.", e);
            }
        }

    }

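    /**
     * Builds and adds the Lucene document for a stored document source: the "lexical"
     * field with term vectors, any metadata fields and facets, and optional "lemma"
     * and "stem" fields, skipping sources whose id is already in the index.
     */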
    private class StoredDocumentSourceIndexer implements Runnable {

        private Storage storage;
        private StoredDocumentSource storedDocumentSource;
        private IndexWriter indexWriter;
        private IndexSearcher indexSearcher;
        private LuceneManager luceneManager;
        private String corpusId;
        private String id;
        private String string = null;
        private boolean verbose;

        public StoredDocumentSourceIndexer(Storage storage, IndexWriter indexWriter, IndexSearcher indexSearcher,
                StoredDocumentSource storedDocumentSource, String corpusId, boolean verbose) throws IOException {
            this.storage = storage;
            this.indexWriter = indexWriter;
            this.indexSearcher = indexSearcher;
            this.storedDocumentSource = storedDocumentSource;
            this.luceneManager = storage.getLuceneManager();
            this.corpusId = corpusId;
            this.id = storedDocumentSource.getId();
            this.verbose = verbose;
        }

        private String getString() throws IOException {
            if (this.string == null) {
                InputStream is = null;
                try {
                    is = storage.getStoredDocumentSourceStorage().getStoredDocumentSourceInputStream(id);
                    StringWriter sw = new StringWriter();
                    IOUtils.copy(is, sw);
                    string = sw.toString();
                } finally {
                    if (is != null)
                        is.close();
                }
            }
            return string;
        }

        @Override
        public void run() {

            if (verbose) {
                //            System.out.println("indexing "+storedDocumentSource.getMetadata());
            }

            try {

                TopDocs topDocs = indexSearcher.search(new TermQuery(new Term("id", id)), 1);
                if (topDocs.totalHits > 0) { // already indexed
                    return;
                }

                // this is used by lexical and the metadata (expecting term vectors to be present)
                FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
                ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
                ft.setStoreTermVectors(true);
                ft.setStoreTermVectorOffsets(true);
                ft.setStoreTermVectorPositions(true);

                // create the lexical document
                Document document = new Document();
                document.add(new StringField("id", id, Field.Store.NO));
                //            document.add(new StringField("corpus", corpusId, Field.Store.NO));
                document.add(new StringField("version", LucenePackage.get().getImplementationVersion() + "-"
                        + String.valueOf(LuceneIndexer.VERSION), Field.Store.YES));

                FlexibleParameters p = new FlexibleParameters();
                p.setParameter("language", storedDocumentSource.getMetadata().getLanguageCode());
                if (parameters.getParameterValue("tokenization", "").isEmpty() == false) {
                    p.setParameter("tokenization", parameters.getParameterValue("tokenization"));
                }
                document.add(new Field("lexical", getString() + "<!-- " + p.getAsQueryString() + " -->", ft));
                //            System.err.println(id+": "+getString());

                FlexibleParameters params = storedDocumentSource.getMetadata().getFlexibleParameters();
                FacetsConfig config = new FacetsConfig();
                for (String key : params.getKeys()) {
                    // store term vector so that we can build term DB, combine multiple values into one
                    String v = StringUtils.join(params.getParameterValues(key), " ");
                    if (v != null && v.trim().isEmpty() == false) {
                        document.add(new Field(key, v, ft));
                    }
                    for (String value : params.getParameterValues(key)) {
                        String facet = "facet." + key;
                        config.setMultiValued(facet, true);
                        config.setIndexFieldName(key, facet);
                        if (value.trim().isEmpty() == false) {
                            // store as facet field
                            document.add(new SortedSetDocValuesFacetField(facet, value));
                        }
                    }
                }

                if (parameters.getParameterBooleanValue("lemmatize")) {
                    // pass in parameters, including language, used by the lemmatizer
                    document.add(new Field("lemma", getString() + "<!-- " + p.getAsQueryString() + " -->", ft));
                }

                if (parameters.getParameterBooleanValue("stem")) {
                    // pass in parameters, including language, used by the stemmer
                    document.add(new Field("stem", getString() + "<!-- " + p.getAsQueryString() + " -->", ft));
                }

                // TODO: add lemmatization
                /*
                if (storedDocumentSource.getMetadata().getLanguageCode().equals("en")) {
                   // FIXME: deal with other lemmatization languages
                   document.add(new Field("lemmatized-en", getString(), ft));
                }
                else {
                   // next look for stemmed index if needed
                   String lang = storedDocumentSource.getMetadata().getLanguageCode();
                   StemmableLanguage stemmableLanguage = StemmableLanguage.fromCode(lang);
                   if (stemmableLanguage!=null) {
                      document.add(new Field("stemmed-"+lang, getString(), ft));      
                   }
                }
                */

                // approximate the number of sentences
                List<String> sentences = TextUtils.getSentences(getString(),
                        storedDocumentSource.getMetadata().getLanguageCode());
                storedDocumentSource.getMetadata().setSentencesCount(sentences.size());

                indexWriter.addDocument(config.build(document));

            } catch (IOException e) {
                throw new RuntimeException("Unable to index stored document: " + storedDocumentSource, e);
            }
        }

    }
}
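
For orientation, here is a minimal usage sketch (not part of the original source). It assumes a concrete Storage implementation and a list of StoredDocumentSource objects produced by the rest of the Trombone input pipeline; the wrapper class and method names below are hypothetical, while the parameter names ("language", "forceMerge") are the ones read by the code above.

import java.io.IOException;
import java.util.List;

import org.voyanttools.trombone.input.index.LuceneIndexer;
import org.voyanttools.trombone.model.StoredDocumentSource;
import org.voyanttools.trombone.storage.Storage;
import org.voyanttools.trombone.util.FlexibleParameters;

// hypothetical wrapper class, not part of Trombone
public class LuceneIndexerExample {

    // storage and storedDocumentSources are assumed to come from the rest of the
    // Trombone pipeline (a concrete Storage implementation and already-stored sources)
    public static String indexSources(Storage storage,
            List<StoredDocumentSource> storedDocumentSources) throws IOException {

        FlexibleParameters parameters = new FlexibleParameters();
        // a non-empty "language" (or "tokenization") parameter makes the indexer
        // create derivative stored document sources so the text is re-tokenized
        parameters.setParameter("language", "en");
        // optional: passed to IndexWriter.forceMerge after indexing
        parameters.setParameter("forceMerge", "1");

        LuceneIndexer indexer = new LuceneIndexer(storage, parameters);

        // returns the corpus id under which the list of document ids was stored
        return indexer.index(storedDocumentSources);
    }
}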