org.splevo.vpm.analyzer.semantic.lucene.Indexer.java Source code

Introduction

Here is the source code for org.splevo.vpm.analyzer.semantic.lucene.Indexer.java

Source

/*******************************************************************************
 * Copyright (c) 2014
 *
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Eclipse Public License v1.0
 * which accompanies this distribution, and is available at
 * http://www.eclipse.org/legal/epl-v10.html
 *
 * Contributors:
 *    Daniel Kojic - initial API and implementation and/or initial documentation
 *    Benjamin Klatt
 *******************************************************************************/
package org.splevo.vpm.analyzer.semantic.lucene;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;

import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import org.splevo.vpm.analyzer.semantic.Config;

import com.google.common.base.Strings;

/**
 * This class handles the indexing. It creates one single main index. By providing two
 * {@link Analyzer}s, comments can be handled differently than regular code. This class offers
 * methods to add text to the index, to clear the main index, and to get read access to the index.
 *
 * DesignDecision: Singleton as required by Lucene infrastructure
 *
 * This class is a singleton because there should be only one index holding all information at any
 * time. This class has the ability to create that index. To ensure there is only one index, the
 * singleton pattern is used.
 */
public final class Indexer {

    private static final Version LUCENE_VERSION = Version.LUCENE_47;

    private static Logger logger = Logger.getLogger(Indexer.class);

    /** The name of the Lucene index field the variation point ID is stored in. */
    public static final String INDEX_VARIATIONPOINT = "VP";

    /** The name of the Lucene index field the content is stored in. */
    public static final String INDEX_CONTENT = "CONTENT";

    /** The name of the Lucene index field the comments are stored in. */
    public static final String INDEX_COMMENT = "COMMENT";

    /** The {@link Directory} for the index. */
    private Directory directory;

    /** Singleton instance. */
    private static Indexer instance;

    /** Specifies how to store the code fragments in the index. */
    private static final FieldType TYPE_STORED_CONTENT = new FieldType();

    /** Specifies how to store the comment fragments in the index. */
    private static final FieldType TYPE_STORED_ID = new FieldType();

    /** Specifies whether to split on case-change or not. */
    private boolean splitCamelCase;

    /** Contains the stop-words to be filtered out. */
    private String[] stopWords;

    /** Option specifying which stemming to use. */
    private Stemming stemming;

    /** Option specifying which words to preserve during tokenization. */
    private Set<String> featureTermSet;

    /** Option to consider featured terms only if any have been defined. */
    private boolean featuredTermsOnly;

    /**
     * Defines the field type with which the content text is added to the index. To allow term
     * extraction, DOCS_AND_FREQS has to be stored in the index.
     */
    static {
        TYPE_STORED_CONTENT.setIndexed(true);
        TYPE_STORED_CONTENT.setTokenized(true);
        TYPE_STORED_CONTENT.setStored(true);
        TYPE_STORED_CONTENT.setStoreTermVectors(true);
        TYPE_STORED_CONTENT.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
    }

    /** Defines the field type with which the IDs are added to the index. */
    static {
        TYPE_STORED_ID.setIndexed(false);
        TYPE_STORED_ID.setTokenized(false);
        TYPE_STORED_ID.setStored(true);
        TYPE_STORED_ID.setStoreTermVectors(false);
        TYPE_STORED_ID.setIndexOptions(IndexOptions.DOCS_ONLY);
    }

    /**
     * Private constructor to prevent this class from being instantiated multiple times.
     */
    private Indexer() {
        this.splitCamelCase = true;
        this.stopWords = Config.DEFAULT_STOP_WORDS.split(" ");
        this.stemming = Stemming.valueOf(Config.DEFAULT_STEMMING);

        // Use RAMDirectory to use an in-memory index.
        directory = new RAMDirectory();
    }

    /**
     * Sets the stop-words that should be used by the {@link LuceneCodeAnalyzer}.
     *
     * @param stopWords
     *            The stop-words.
     */
    public void setStopWords(String[] stopWords) {
        this.stopWords = stopWords;
    }

    /**
     * Sets the option whether to split text on case-change or not.
     *
     * @param splitCamelCase
     *            True to split on case-change; False otherwise.
     */
    public void setSplitCamelCase(boolean splitCamelCase) {
        this.splitCamelCase = splitCamelCase;
    }

    /**
     * Sets the stemming strategy to be used.
     *
     * @param stemming
     *            The {@link Stemming} option to use.
     */
    public void setStemming(Stemming stemming) {
        this.stemming = stemming;
    }

    /**
     * Gets the singleton instance.
     *
     * @return The singleton instance.
     */
    public static synchronized Indexer getInstance() {
        // Return singleton, create new if not existing.
        if (instance == null) {
            instance = new Indexer();
        }

        return instance;
    }

    /**
     * @return A {@link DirectoryReader} to search the main index.
     * @throws IOException
     *             Throws an {@link IOException} if there are problems opening the index.
     */
    public DirectoryReader getIndexReader() throws IOException {
        return DirectoryReader.open(directory);
    }

    /**
     * @return the featureTermSet
     */
    public Set<String> getFeatureTermSet() {
        return featureTermSet;
    }

    /**
     * @param featureTermSet the featureTermSet to set
     */
    public void setFeatureTermSet(Set<String> featureTermSet) {
        this.featureTermSet = featureTermSet;
    }

    /**
     * @return the featuredTermsOnly
     */
    public boolean isFeaturedTermsOnly() {
        return featuredTermsOnly;
    }

    /**
     * @param featuredTermsOnly the featuredTermsOnly to set
     */
    public void setFeaturedTermsOnly(boolean featuredTermsOnly) {
        this.featuredTermsOnly = featuredTermsOnly;
    }

    /**
     * Adds content to the main index.
     *
     * @param variationPointId
     *            The ID of the {@link VariationPoint} to be linked with its content.
     * @param code
     *            The text content of the {@link VariationPoint}.
     * @param comments
     *            The text comments of the {@link VariationPoint}.
     * @throws IOException
     *             Thrown if the document could not be added to the index.
     */
    public void addToIndex(String variationPointId, String code, String comments) throws IOException {
        if (variationPointId == null || variationPointId.length() == 0 || (code == null && comments == null)) {
            throw new IllegalArgumentException("A variation point id and at least code or comments must be provided.");
        }

        if (!Strings.isNullOrEmpty(code) || !Strings.isNullOrEmpty(comments)) {
            addDocument(variationPointId, code, comments);
        }
    }

    /**
     * Adds a {@link Document} to the index with the given VP ID and the text.
     *
     * @param variationPointId
     *            The ID to be stored.
     * @param content
     *            The text to be added to the index.
     * @param comments
     *            The text comments of the {@link VariationPoint}.
     * @throws IOException
     *             If the index cannot be opened or there are problems adding the document to the
     *             index.
     */
    private void addDocument(String variationPointId, String content, String comments) throws IOException {

        if (variationPointId == null) {
            logger.warn("Tried to add undefined variation point to index");
            return;
        }
        if (Strings.isNullOrEmpty(content) && Strings.isNullOrEmpty(comments)) {
            logger.warn("Tried to add document without semantic content");
            return;
        }

        // Create the document with the variation point ID and the available content/comment fields.
        Document doc = new Document();
        doc.add(new Field(INDEX_VARIATIONPOINT, variationPointId, TYPE_STORED_ID));

        if (!Strings.isNullOrEmpty(content)) {
            doc.add(new Field(INDEX_CONTENT, content, TYPE_STORED_CONTENT));
        }
        if (!Strings.isNullOrEmpty(comments)) {
            doc.add(new Field(INDEX_COMMENT, comments, TYPE_STORED_CONTENT));
        }

        IndexWriter indexWriter = createIndexWriter();
        indexWriter.addDocument(doc);
        indexWriter.close();
    }

    /**
     * Deletes all contents from the main index.
     *
     * @throws IOException
     *             Thrown if the index could not be cleared.
     */
    public void clearIndex() throws IOException {
        IndexWriter indexWriter = createIndexWriter();
        indexWriter.deleteAll();
        indexWriter.close();
        directory.close();
        directory = new RAMDirectory();
    }

    /**
     * Factory method to get a new writer for accessing the lucene index.
     *
     * @return The prepared index writer.
     * @throws IOException
     *             Thrown if initializing the index access fails.
     */
    private IndexWriter createIndexWriter() throws IOException {
        PerFieldAnalyzerWrapper wrapper = createAnalyzerWrapper();
        IndexWriterConfig config = new IndexWriterConfig(LUCENE_VERSION, wrapper);
        IndexWriter indexWriter = new IndexWriter(directory, config);
        return indexWriter;
    }

    /**
     * This method creates a wrapper that handles each index field with a specific {@link Analyzer}.
     * The content field is analyzed with the {@link LuceneCodeAnalyzer}, configured with the
     * current stop-words, camel-case splitting, stemming and feature term options, while the
     * comment field is analyzed with a {@link StandardAnalyzer}.
     *
     * @return A {@link PerFieldAnalyzerWrapper} for the index fields.
     */
    private PerFieldAnalyzerWrapper createAnalyzerWrapper() {
        Map<String, Analyzer> analyzerPerField = new HashMap<String, Analyzer>();
        analyzerPerField.put(Indexer.INDEX_CONTENT,
                new LuceneCodeAnalyzer(stopWords, splitCamelCase, stemming, featureTermSet, featuredTermsOnly));
        analyzerPerField.put(Indexer.INDEX_COMMENT, new StandardAnalyzer(LUCENE_VERSION));

        PerFieldAnalyzerWrapper aWrapper = new PerFieldAnalyzerWrapper(
                new LuceneCodeAnalyzer(stopWords, splitCamelCase, stemming, featureTermSet, featuredTermsOnly),
                analyzerPerField);
        return aWrapper;
    }
}
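
Example usage

The following sketch is not part of the original source; it illustrates how the Indexer singleton above might be used under the assumption that the LuceneCodeAnalyzer and Stemming classes from the same package are on the classpath. The surrounding class IndexerUsageExample, the variation point IDs and the code/comment snippets are made-up placeholders; only the public Indexer API and standard Lucene 4.7 calls shown in the listing are relied on.

import java.io.IOException;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
import org.splevo.vpm.analyzer.semantic.lucene.Indexer;

public class IndexerUsageExample {

    public static void main(String[] args) throws IOException {
        // Obtain the singleton and (optionally) adjust the tokenization options.
        Indexer indexer = Indexer.getInstance();
        indexer.setSplitCamelCase(true);

        // Add two variation points; IDs and texts are made-up placeholders.
        indexer.addToIndex("vp-1", "int customerId = resolveCustomerId();", "Resolve the customer id.");
        indexer.addToIndex("vp-2", "String customerName = lookupCustomerName(customerId);", null);

        // Read the index back: each document carries the VP ID and, thanks to
        // TYPE_STORED_CONTENT storing term vectors, the extractable terms of its content.
        DirectoryReader reader = indexer.getIndexReader();
        try {
            for (int docId = 0; docId < reader.maxDoc(); docId++) {
                String vpId = reader.document(docId).get(Indexer.INDEX_VARIATIONPOINT);
                Terms termVector = reader.getTermVector(docId, Indexer.INDEX_CONTENT);
                if (termVector == null) {
                    continue; // Document had no content field.
                }
                TermsEnum termsEnum = termVector.iterator(null);
                BytesRef term;
                while ((term = termsEnum.next()) != null) {
                    System.out.println(vpId + ": " + term.utf8ToString());
                }
            }
        } finally {
            reader.close();
            // Remove all documents again once the analysis is done.
            indexer.clearIndex();
        }
    }
}

Because the index lives in a RAMDirectory, all documents are lost once the JVM exits; clearIndex() simply resets the in-memory directory so the singleton can be reused for another analysis run.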