org.meresco.lucene.suggestion.SuggestionIndex.java Source code

Java tutorial

Introduction

Here is the source code for org.meresco.lucene.suggestion.SuggestionIndex.java

Source

/* begin license *
 *
 * "Meresco Lucene" is a set of components and tools to integrate Lucene (based on PyLucene) into Meresco
 *
 * Copyright (C) 2015 Koninklijke Bibliotheek (KB) http://www.kb.nl
 * Copyright (C) 2015-2016 Seecr (Seek You Too B.V.) http://seecr.nl
 *
 * This file is part of "Meresco Lucene"
 *
 * "Meresco Lucene" is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * "Meresco Lucene" is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with "Meresco Lucene"; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 *
 * end license */

package org.meresco.lucene.suggestion;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.meresco.lucene.suggestion.SuggestionNGramIndex.Reader;

/**
 * Maintains a Lucene index of records and builds a {@link SuggestionNGramIndex} from it.
 *
 * <p>Each record is stored under an identifier together with a numeric key and
 * zero or more values of the form {@code type + CONCAT_MARKER + creator + CONCAT_MARKER + value}.
 * {@link #createSuggestionNGramIndex(boolean, boolean)} rebuilds the n-gram index
 * in a temporary directory and then swaps it into place.
 *
 * <p>The rebuild may run on a background thread; the fields that thread writes
 * are {@code volatile} so other threads observe the updates. No further
 * thread-safety is provided by this class itself.
 */
public class SuggestionIndex {
    private static final String ID_FIELD = "__id__";

    /** Separator placed between type, creator and value in every indexed record value. */
    public static final String CONCAT_MARKER = "$$--$$";

    private static final String RECORD_VALUE_FIELDNAME = "__record_value__";
    private static final String KEY_FIELDNAME = "__key__";
    // Commit count handed to every SuggestionNGramIndex created by this class.
    private static final int MAX_COMMIT_COUNT_SUGGESTION = 1000000;

    /** Indexed, untokenized, not stored; keeps term frequencies and norms. */
    public static final FieldType SIMPLE_NOT_STORED_STRING_FIELD = new FieldType();
    /** Indexed, untokenized and stored; documents only, norms omitted. */
    public static final FieldType SIMPLE_STORED_STRING_FIELD = new FieldType();
    static {
        SIMPLE_NOT_STORED_STRING_FIELD.setIndexed(true);
        SIMPLE_NOT_STORED_STRING_FIELD.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
        SIMPLE_NOT_STORED_STRING_FIELD.setOmitNorms(false);
        SIMPLE_NOT_STORED_STRING_FIELD.setStored(false);
        SIMPLE_NOT_STORED_STRING_FIELD.setTokenized(false);
        SIMPLE_NOT_STORED_STRING_FIELD.freeze();

        SIMPLE_STORED_STRING_FIELD.setIndexed(true);
        SIMPLE_STORED_STRING_FIELD.setIndexOptions(IndexOptions.DOCS_ONLY);
        SIMPLE_STORED_STRING_FIELD.setOmitNorms(true);
        SIMPLE_STORED_STRING_FIELD.setStored(true);
        SIMPLE_STORED_STRING_FIELD.setTokenized(false);
        SIMPLE_STORED_STRING_FIELD.freeze();
    }

    /**
     * Progress of the n-gram index rebuild that is currently running, or
     * {@code null} when no rebuild is in progress. Written by the rebuild thread.
     */
    public volatile IndexingState indexingState = null;

    private final IndexWriter writer;
    private final ShingleAnalyzer shingleAnalyzer;
    private final FSDirectory suggestionIndexDir;
    private final int maxCommitCount;

    // Number of add/delete operations since the last commit.
    private int commitCount = 0;

    // Replaced by the rebuild thread once a new n-gram index has been swapped in.
    private volatile SuggestionNGramIndex suggestionNGramIndex;
    private final String suggestionNGramIndexDir;
    // Created lazily; may also be created or reopened by the rebuild thread.
    private volatile Reader currentReader;
    private final Map<String, DocIdSet> filterKeySets = new HashMap<>();

    /**
     * Opens the index with a commit count of 1, i.e. every add or delete is
     * committed immediately.
     *
     * @param suggestionIndexDir directory of the record index
     * @param suggestionNGramIndexDir directory of the n-gram index
     * @param minShingleSize minimum shingle size passed to the {@code ShingleAnalyzer}
     * @param maxShingleSize maximum shingle size passed to the {@code ShingleAnalyzer}
     * @throws IOException if one of the indexes cannot be opened
     */
    public SuggestionIndex(String suggestionIndexDir, String suggestionNGramIndexDir, int minShingleSize,
            int maxShingleSize) throws IOException {
        this(suggestionIndexDir, suggestionNGramIndexDir, minShingleSize, maxShingleSize, 1);
    }

    /**
     * Opens (and initially commits) the record index and opens the n-gram index.
     *
     * @param suggestionIndexDir directory of the record index
     * @param suggestionNGramIndexDir directory of the n-gram index
     * @param minShingleSize minimum shingle size passed to the {@code ShingleAnalyzer}
     * @param maxShingleSize maximum shingle size passed to the {@code ShingleAnalyzer}
     * @param commitCount number of add/delete operations after which the record index is committed
     * @throws IOException if one of the indexes cannot be opened
     */
    public SuggestionIndex(String suggestionIndexDir, String suggestionNGramIndexDir, int minShingleSize,
            int maxShingleSize, int commitCount) throws IOException {
        this.maxCommitCount = commitCount;

        this.shingleAnalyzer = new ShingleAnalyzer(minShingleSize, maxShingleSize);

        this.suggestionIndexDir = FSDirectory.open(new File(suggestionIndexDir));
        IndexWriterConfig config = new IndexWriterConfig(Version.LATEST, new StandardAnalyzer());
        this.writer = new IndexWriter(this.suggestionIndexDir, config);
        // Commit right away so that a DirectoryReader can be opened on a fresh directory.
        this.writer.commit();
        this.suggestionNGramIndexDir = suggestionNGramIndexDir;
        this.suggestionNGramIndex = new SuggestionNGramIndex(this.suggestionNGramIndexDir,
                MAX_COMMIT_COUNT_SUGGESTION);
    }

    /**
     * Adds a record, replacing any existing record with the same identifier.
     *
     * <p>{@code types} and {@code creators} are indexed in parallel with
     * {@code values} and must therefore be at least as long; individual
     * {@code null} entries in them are treated as empty strings.
     *
     * @param identifier unique identifier of the record
     * @param key numeric key stored as a doc value with the record
     * @param values suggestion values of the record
     * @param types type per value
     * @param creators creator per value
     * @throws IOException if the record index cannot be updated
     */
    public void add(String identifier, int key, String[] values, String[] types, String[] creators)
            throws IOException {
        Document recordDoc = new Document();
        recordDoc.add(new Field(ID_FIELD, identifier, SIMPLE_NOT_STORED_STRING_FIELD));
        recordDoc.add(new NumericDocValuesField(KEY_FIELDNAME, key));
        for (int i = 0; i < values.length; i++) {
            String value = (types[i] != null ? types[i] : "") + CONCAT_MARKER
                    + (creators[i] != null ? creators[i] : "") + CONCAT_MARKER + values[i];
            recordDoc.add(new Field(RECORD_VALUE_FIELDNAME, value, SIMPLE_NOT_STORED_STRING_FIELD));
        }
        this.writer.updateDocument(new Term(ID_FIELD, identifier), recordDoc);
        maybeCommitAfterUpdate();
    }

    /**
     * Deletes the record with the given identifier.
     *
     * @param identifier identifier of the record to delete
     * @throws IOException if the record index cannot be updated
     */
    public void delete(String identifier) throws IOException {
        this.writer.deleteDocuments(new Term(ID_FIELD, identifier));
        maybeCommitAfterUpdate();
    }

    /**
     * Registers a named key set; all registered sets are handed to the
     * suggestions reader when it is created.
     *
     * @param name name of the filter
     * @param keySet the keys belonging to the filter
     * @throws IOException never thrown here; kept for interface compatibility
     */
    public void registerFilterKeySet(String name, DocIdSet keySet) throws IOException {
        this.filterKeySets.put(name, keySet);
    }

    /**
     * Commits the record index and rebuilds the n-gram index from it.
     *
     * <p>The new index is built in {@code <ngramDir>~}; afterwards the live
     * directory is moved to {@code <ngramDir>~~}, the new one is moved into
     * place, and the old one is deleted. An {@link IOException} during the
     * rebuild is printed, not propagated.
     *
     * @param wait if {@code true} the rebuild runs on the calling thread,
     *             otherwise on a newly started thread
     * @param verbose if {@code true} timing statistics are printed to stdout
     * @throws IOException if the initial commit fails
     */
    public synchronized void createSuggestionNGramIndex(boolean wait, final boolean verbose) throws IOException {
        this.commit();

        Thread create = new Thread() {
            @Override
            public void run() {
                // Keep a local reference: the statistics below must not depend
                // on the shared field, which may be reset to null elsewhere.
                final IndexingState state = new IndexingState();
                indexingState = state;
                try {
                    String tempDir = suggestionNGramIndexDir + "~";
                    String tempTempDir = suggestionNGramIndexDir + "~~";
                    // try-with-resources closes the reader even when building fails.
                    try (DirectoryReader reader = DirectoryReader.open(suggestionIndexDir)) {
                        deleteIndexDirectory(tempDir);
                        deleteIndexDirectory(tempTempDir);
                        SuggestionNGramIndex newSuggestionNGramIndex = new SuggestionNGramIndex(tempDir,
                                MAX_COMMIT_COUNT_SUGGESTION);
                        newSuggestionNGramIndex.createSuggestions(reader, RECORD_VALUE_FIELDNAME, KEY_FIELDNAME,
                                state);
                        newSuggestionNGramIndex.close();
                    }

                    // Swap directories: live -> "~~", "~" -> live.
                    suggestionNGramIndex.close();
                    renameOrWarn(new File(suggestionNGramIndexDir), new File(tempTempDir));
                    renameOrWarn(new File(tempDir), new File(suggestionNGramIndexDir));

                    suggestionNGramIndex = new SuggestionNGramIndex(suggestionNGramIndexDir,
                            MAX_COMMIT_COUNT_SUGGESTION);
                    deleteIndexDirectory(tempTempDir);

                    if (currentReader != null) {
                        currentReader.reopen();
                    } else {
                        createSuggestionsReader();
                    }
                } catch (IOException e) {
                    e.printStackTrace();
                } finally {
                    long totalTime = (System.currentTimeMillis() - state.started) / 1000;
                    long averageSpeed = totalTime > 0 ? state.count / totalTime : 0;
                    if (verbose) {
                        System.out.println("Creating " + state.count + " suggestions took: " + totalTime
                                + "s" + "; Average: " + averageSpeed + "/s");
                        System.out.flush();
                    }
                    indexingState = null;
                }
            }
        };
        if (wait) {
            // Deliberately run() rather than start(): execute on the calling thread.
            create.run();
        } else {
            create.start();
        }
    }

    /**
     * Deletes the files directly inside {@code dir} and then {@code dir} itself.
     * Does nothing when {@code dir} cannot be listed (e.g. it does not exist).
     * Not recursive.
     */
    private static void deleteIndexDirectory(String dir) {
        File directory = new File(dir);
        File[] files = directory.listFiles();
        if (files == null) {
            return;
        }
        // Iterate the array that was null-checked instead of listing again.
        for (File currentFile : files) {
            currentFile.delete();
        }
        directory.delete();
    }

    /**
     * Renames {@code from} to {@code to} and reports a failure on stderr.
     * Processing continues either way, so a failed swap is visible but not fatal.
     */
    private static void renameOrWarn(File from, File to) {
        if (!from.renameTo(to)) {
            System.err.println("SuggestionIndex: could not rename " + from + " to " + to);
        }
    }

    /**
     * @return the state of the running rebuild, or {@code null} if none is running
     */
    public IndexingState indexingState() {
        return indexingState;
    }

    /**
     * @return the number of documents in the last committed state of the record index
     * @throws IOException if the record index cannot be read
     */
    public int numDocs() throws IOException {
        try (DirectoryReader reader = DirectoryReader.open(this.suggestionIndexDir)) {
            return reader.numDocs();
        }
    }

    /**
     * @return the last-modified time of the n-gram index directory, as reported by {@link File#lastModified()}
     */
    public long ngramIndexTimestamp() {
        return new File(suggestionNGramIndexDir).lastModified();
    }

    /**
     * Creates a new suggestions reader using the currently registered filter key sets.
     *
     * @throws IOException if the reader cannot be created
     */
    public void createSuggestionsReader() throws IOException {
        this.currentReader = this.suggestionNGramIndex.createReader(this.filterKeySets);
    }

    /**
     * @return the current suggestions reader, created on first use
     * @throws IOException if the reader has to be created and that fails
     */
    public SuggestionNGramIndex.Reader getSuggestionsReader() throws IOException {
        if (this.currentReader == null) {
            createSuggestionsReader();
        }
        return this.currentReader;
    }

    // Counts an update and commits once maxCommitCount updates have accumulated.
    private void maybeCommitAfterUpdate() throws IOException {
        this.commitCount++;
        if (this.commitCount >= this.maxCommitCount) {
            this.commit();
        }
    }

    /**
     * Commits the record index and resets the pending-update counter.
     *
     * @throws IOException if the commit fails
     */
    public void commit() throws IOException {
        this.writer.commit();
        this.commitCount = 0;
    }

    /**
     * Closes the record index writer and the n-gram index.
     *
     * @throws IOException if closing either of them fails
     */
    public void close() throws IOException {
        try {
            this.writer.close();
        } finally {
            // Close the n-gram index even when closing the writer failed.
            this.suggestionNGramIndex.close();
        }
    }

    /**
     * Runs the shingle analyzer over {@code s}.
     *
     * @param s the text to analyze
     * @return the terms produced by the analyzer, in order
     * @throws IOException if analysis fails
     */
    public List<String> shingles(String s) throws IOException {
        List<String> shingles = new ArrayList<String>();
        try (TokenStream stream = this.shingleAnalyzer.tokenStream("ignored", s)) {
            stream.reset();
            CharTermAttribute termAttribute = stream.getAttribute(CharTermAttribute.class);
            while (stream.incrementToken()) {
                shingles.add(termAttribute.toString());
            }
            // TokenStream workflow: end() must be called after the last token and before close().
            stream.end();
        }
        return shingles;
    }

    /** Progress information for one n-gram index rebuild. */
    public class IndexingState {
        /** Value of {@link System#currentTimeMillis()} at construction. */
        public long started;
        /** Counter, initially 0; reported as the number of created suggestions. */
        public int count;

        public IndexingState() {
            started = System.currentTimeMillis();
            count = 0;
        }
    }
}