org.zephyrsoft.sdb2.service.IndexerServiceImpl.java Source code

Introduction

Here is the source code for org.zephyrsoft.sdb2.service.IndexerServiceImpl.java
Source

/*
 * This file is part of the Song Database (SDB).
 * 
 * SDB is free software: you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 2 of the License, or
 * (at your option) any later version.
 * 
 * SDB is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with SDB. If not, see <http://www.gnu.org/licenses/>.
 */
package org.zephyrsoft.sdb2.service;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.custom.CustomAnalyzer;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.lucene.analysis.util.TokenizerFactory;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.zephyrsoft.sdb2.model.Song;

import com.google.common.base.Stopwatch;

/**
 * An in-memory index based on Lucene.
 * <p>
 * Indexing is executed asynchronously on a single background thread; each call to
 * {@link #index(IndexType, Collection)} builds a fresh {@link RAMDirectory} and publishes it for the given
 * index type once the run is finished. Searches are executed on the calling thread against the most recently
 * published directory.
 * 
 * @author Mathis Dirksen-Thedens
 */
public class IndexerServiceImpl implements IndexerService<Song> {

    private static final Logger LOG = LoggerFactory.getLogger(IndexerServiceImpl.class);

    /** Runs of these characters are replaced by a single space before title and lyrics are indexed. */
    private static final String SIMPLIFY_INDEXING_REGEX = "[-\r\n\t.,;:_/+'\"!?()\\[\\]]++";
    /** Runs of these characters separate the individual terms of a search string. */
    public static final String TERM_SPLIT_REGEX = "[- .,;:_/+'\"!?()\\[\\]]++";

    /**
     * Lookup from the UUID stored in the index to the song itself. Written by the indexing thread and read by
     * searching threads, hence a concurrent map.
     */
    private final Map<String, Song> songByUuid = new ConcurrentHashMap<>();

    /** The published index per index type; only accessed while holding {@link #INDEXES_LOCK}. */
    private final Map<IndexType, Directory> indexes = new HashMap<>();
    private static final Object INDEXES_LOCK = new Object();

    private final ExecutorService executor = Executors.newSingleThreadExecutor();

    /**
     * @return the directory published for the index type, or {@code null} if none was published yet
     */
    private Directory getIndex(IndexType indexType) {
        synchronized (INDEXES_LOCK) {
            return indexes.get(indexType);
        }
    }

    /** Publishes the directory for the index type, replacing any previously published one. */
    private void putIndex(IndexType indexType, Directory directory) {
        synchronized (INDEXES_LOCK) {
            indexes.put(indexType, directory);
        }
    }

    /**
     * Schedules (re-)indexing of the given songs on the background thread and returns immediately.
     * <p>
     * The newly built directory is published for {@code indexType} when the run ends, even if indexing failed
     * part-way (failures are logged as warnings).
     * 
     * @param indexType
     *            the index to (re-)build
     * @param songs
     *            the songs to put into the index; iterated on the background thread
     */
    @Override
    public void index(final IndexType indexType, final Collection<Song> songs) {
        executor.execute(new Runnable() {
            @Override
            public void run() {
                Stopwatch stopwatch = Stopwatch.createStarted();

                Directory directory = new RAMDirectory();
                try {
                    writeSongs(directory, songs);
                } finally {
                    putIndex(indexType, directory);
                    stopwatch.stop();
                    LOG.info("indexing songs in background thread took {}", stopwatch.toString());
                }
            }
        });
    }

    /**
     * Writes one document per song into the directory and registers every written song in
     * {@link #songByUuid}. I/O problems are logged and not propagated.
     */
    private void writeSongs(Directory directory, Collection<Song> songs) {
        LOG.debug("available tokenizers: {}", TokenizerFactory.availableTokenizers());
        LOG.debug("available token filters: {}", TokenFilterFactory.availableTokenFilters());
        // the analyzer is only needed while writing, so it is closed together with the writer
        try (Analyzer analyzer = createAnalyzer()) {
            IndexWriterConfig config = new IndexWriterConfig(analyzer);
            try (IndexWriter writer = new IndexWriter(directory, config)) {
                for (Song song : songs) {
                    Document document = createDocument(song);
                    writer.addDocument(document);
                    songByUuid.put(song.getUUID(), song);
                }
            } catch (IOException e) {
                LOG.warn("couldn't index songs", e);
            }
        } catch (IOException e1) {
            LOG.warn("couldn't create analyzer", e1);
        }
    }

    /**
     * Creates the analyzer used for indexing: standard tokenizer, lower-casing and n-grams of 1 to 25
     * characters.
     */
    private Analyzer createAnalyzer() throws IOException {
        return CustomAnalyzer.builder().withTokenizer("standard")
                .addTokenFilter("lowercase")
                .addTokenFilter("ngram", "minGramSize", "1", "maxGramSize", "25").build();
    }

    /**
     * Creates the index document for a song: the UUID is stored verbatim, title and lyrics are analyzed but
     * not stored.
     */
    private Document createDocument(Song song) {
        Document document = new Document();
        document.add(new StringField(FieldName.UUID.name(), song.getUUID(), Store.YES));
        document.add(new TextField(FieldName.TITLE.name(), simplify(song.getTitle()), Store.NO));
        document.add(new TextField(FieldName.LYRICS.name(), simplify(song.getLyrics()), Store.NO));
        return document;
    }

    /**
     * Replaces every run of punctuation / line break characters with a single space.
     * 
     * @return the simplified content, or an empty string for {@code null}
     */
    private String simplify(String content) {
        return content == null ? "" : content.replaceAll(SIMPLIFY_INDEXING_REGEX, " ");
    }

    /**
     * Splits a search string into lower-cased, non-empty terms.
     * <p>
     * Empty terms have to be dropped: {@link String#split(String)} yields a leading empty string when the
     * input starts with a delimiter, and an empty term inside a phrase query would prevent any match.
     * 
     * @return the terms in their original order; empty if the search string is {@code null} or contains no
     *         term
     */
    private static List<String> splitIntoTerms(String searchString) {
        List<String> terms = new ArrayList<>();
        if (searchString != null) {
            // locale-independent lower-casing so that results don't vary with the JVM's default locale
            for (String term : searchString.toLowerCase(Locale.ROOT).split(TERM_SPLIT_REGEX)) {
                if (!term.isEmpty()) {
                    terms.add(term);
                }
            }
        }
        return terms;
    }

    /**
     * Searches the published index of the given type.
     * <p>
     * For every requested field a phrase query over all search terms is built and boosted with the field's
     * boost; a song matches if at least one field matches.
     * 
     * @param indexType
     *            the index to search in
     * @param searchString
     *            the user's input; split into terms using {@link #TERM_SPLIT_REGEX}
     * @param fieldsToSearchIn
     *            the fields which should be taken into account
     * @return the matching songs in the order Lucene returned them; an empty list if nothing matched, if no
     *         index is available (yet), if the search string contains no term or if a problem occurred
     */
    @Override
    public List<Song> search(IndexType indexType, String searchString, FieldName... fieldsToSearchIn) {
        Directory directory = getIndex(indexType);
        if (directory == null) {
            // indexing runs in the background, so a search may arrive before the first index is published
            LOG.debug("no index available for {}", indexType);
            return new ArrayList<>(0);
        }

        List<String> searchTerms = splitIntoTerms(searchString);
        if (searchTerms.isEmpty()) {
            return new ArrayList<>(0);
        }

        // the reader holds resources of the index and has to be closed after use
        try (IndexReader indexReader = DirectoryReader.open(directory)) {
            IndexSearcher indexSearcher = new IndexSearcher(indexReader);

            BooleanQuery outerBooleanQuery = new BooleanQuery();
            for (FieldName field : fieldsToSearchIn) {
                PhraseQuery query = new PhraseQuery();
                for (String searchTerm : searchTerms) {
                    query.add(new Term(field.name(), searchTerm));
                }
                query.setBoost(field.getBoost());
                outerBooleanQuery.add(query, Occur.SHOULD);
            }
            TopDocs hits = indexSearcher.search(outerBooleanQuery, Integer.MAX_VALUE);

            LOG.debug("{} hits for filter \"{}\"", hits.totalHits, outerBooleanQuery);

            List<Song> ret = new ArrayList<>(hits.scoreDocs.length);
            for (ScoreDoc scoreDocument : hits.scoreDocs) {
                Document document = indexSearcher.doc(scoreDocument.doc);
                String uuid = document.get(FieldName.UUID.name());
                Song song = uuid == null ? null : songByUuid.get(uuid);
                if (song != null) {
                    // never hand out null entries to the caller
                    ret.add(song);
                }
            }
            return ret;
        } catch (Exception e) {
            LOG.warn("problem while searching", e);
            return new ArrayList<>(0);
        }
    }

}