com.ibm.watson.developer_cloud.professor_languo.ingestion.indexing.LuceneIndexer.java Source code

Java tutorial

Introduction

Here is the source code for com.ibm.watson.developer_cloud.professor_languo.ingestion.indexing.LuceneIndexer.java

Source

/*
 * Copyright IBM Corp. 2015
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 * in compliance with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License
 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing permissions and limitations under
 * the License.
 */

package com.ibm.watson.developer_cloud.professor_languo.ingestion.indexing;

import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Properties;
import java.util.List;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.RAMDirectory;

import com.ibm.watson.developer_cloud.professor_languo.configuration.ConfigurationConstants;
import com.ibm.watson.developer_cloud.professor_languo.configuration.Messages;
import com.ibm.watson.developer_cloud.professor_languo.exception.IngestionException;
import com.ibm.watson.developer_cloud.professor_languo.model.stack_exchange.StackExchangeThread;
import com.ibm.watson.developer_cloud.professor_languo.model.stack_exchange.StackExchangeConstants.IndexStatFieldName;
import com.ibm.watson.developer_cloud.professor_languo.pipeline.primary_search.LuceneSearcher;
import com.ibm.watson.developer_cloud.professor_languo.pipeline.primary_search.SingletonAnalyzer;

/**
 * An implementation of the {@link Indexer} API that creates a Lucene index from a collection of
 * {@link StackExchangeThread} objects.
 *
 * <p>
 * Expected usage: call {@link #initialize(Properties)} first to select the index directory, then
 * {@link #indexCorpus(String)} to build the index and write the statistics file.
 */
public class LuceneIndexer implements Indexer {
    private static final Logger logger = LogManager.getLogger(LuceneIndexer.class.getName());

    /**
     * Used by Lucene to write documents into the index. Created lazily by {@link #getIndexWriter()}
     * and reset to {@code null} once it has been closed or rolled back.
     */
    private IndexWriter indexWriter = null;

    /**
     * Every {@link LuceneIndexer} has a {@link DocumentMapper} associated with it, which is used to
     * convert a StackExchangeThread into a Lucene document.
     */
    private DocumentMapper documentMapper = null;

    /**
     * The (RAM or disk) directory used to store the Lucene index files, which needs to be accessed by
     * the {@link LuceneSearcher} to perform search.
     */
    private Directory indexDir = null;
    private String indexDirPath = null;

    /**
     * The path of the index statistics file which contains the summary of the indexing procedure.
     */
    private String indexStatPath = null;
    /**
     * The summary of the statistics gathered during the indexing procedure.
     */
    private IndexingStats indexStat = null;

    /**
     * Configures the statistics file path and the index directory from the given properties.
     * The directory type is matched case-insensitively against
     * {@link ConfigurationConstants.IndexDirTypes}; for the file-system type, any files already
     * present in the index directory are deleted.
     *
     * @param properties - the ingestion configuration
     * @throws IngestionException if the directory type is missing or unrecognized, or the
     *         file-system directory cannot be opened or cleared
     */
    @Override
    public void initialize(Properties properties) throws IngestionException {
        String resDirPath = properties.getProperty(ConfigurationConstants.INGESTION_BASE_DIR) + File.separator;
        indexStatPath = resDirPath + properties.getProperty(ConfigurationConstants.INDEX_STAT_PATH);

        // Read once; this is null when the property is not configured. equalsIgnoreCase accepts a
        // null argument, so a missing value falls through to the "Unrecognized" error below
        // instead of failing with a NullPointerException.
        String dirType = properties.getProperty(ConfigurationConstants.INDEX_DIR_TYPE);

        if (ConfigurationConstants.IndexDirTypes.RAM.toString().equalsIgnoreCase(dirType)) {
            indexDir = new RAMDirectory();
        } else if (ConfigurationConstants.IndexDirTypes.FS.toString().equalsIgnoreCase(dirType)) {
            indexDirPath = resDirPath + properties.getProperty(ConfigurationConstants.INDEX_DIR);
            try {
                indexDir = FSDirectory.open(new File(indexDirPath).toPath());
            } catch (IOException e) {
                throw new IngestionException(e);
            }
            // clear the previous index files in the folder before a new
            // indexing process begins
            clearIndexDirectory(indexDir);
        } else {
            throw new IngestionException(
                    "Unrecognized " + ConfigurationConstants.INDEX_DIR_TYPE + ": " + dirType);
        }
    }

    /**
     * Indexes every serialized thread found in the given folder, stores the resulting statistics
     * in this indexer and writes them to the statistics file.
     *
     * @param uniqueThreadDirPath - the path of the folder which stores the unique threads
     * @return the statistics gathered during the indexing period
     * @throws IngestionException if the indexer is not initialized, the folder cannot be listed,
     *         or reading, indexing or writing the statistics fails
     */
    @Override
    public IndexingStats indexCorpus(String uniqueThreadDirPath) throws IngestionException {
        IndexWriter writer = getIndexWriter();
        DocumentMapper docMapper = getDocumentMapper();
        indexStat = indexCorpus(uniqueThreadDirPath, writer, docMapper);
        saveIndexStatToDisk();
        return indexStat;
    }

    /**
     * Given a built corpus (a set of StackExchangeThreads without duplicates), an index writer and a
     * document mapper, write the documents to the index and record the statistics of the indexing
     * period. The writer is closed on success and rolled back on failure, so it is never left open.
     * 
     * @param uniqueThreadDirPath - the path of the folder which stores the unique threads
     * @param writer - an index writer which can write document units to the index
     * @param docMapper - document mapper which maps the StackExchange instance to the document unit
     * @return the statistics gathered during the indexing period
     * @throws IngestionException if the folder cannot be listed or a thread cannot be read or indexed
     */
    private IndexingStats indexCorpus(String uniqueThreadDirPath, IndexWriter writer, DocumentMapper docMapper)
            throws IngestionException {
        List<Integer> indexThreadIds = new ArrayList<Integer>();
        long startTime;
        long endTime;
        int indexDocNum;
        boolean committed = false;

        try {
            // listFiles() returns null when the path is not a directory or cannot be read
            File[] serFiles = new File(uniqueThreadDirPath).listFiles();
            if (serFiles == null) {
                throw new IngestionException(
                        "Cannot list serialized threads in directory: " + uniqueThreadDirPath);
            }

            startTime = System.currentTimeMillis();
            // restore the unique StackExchangeThreads from the .ser files and
            // index them
            for (File serFile : serFiles) {
                StackExchangeThread thread =
                        StackExchangeThreadSerializer.deserializeThreadFromBinFile(serFile.getPath());
                Document doc = docMapper.createDocument(thread);
                writer.addDocument(doc);
                indexThreadIds.add(thread.getId());
            }
            endTime = System.currentTimeMillis();

            // must be read before the writer is closed
            indexDocNum = writer.numDocs();
            closeIndexWriter();
            committed = true;
        } catch (IOException e) {
            throw new IngestionException(e);
        } finally {
            if (!committed) {
                // Any failure (checked or unchecked) must still release the writer and its
                // write lock; the partially added documents are discarded.
                rollbackIndexWriter();
            }
        }

        return createIndexingStats(indexDocNum, indexThreadIds, endTime - startTime);
    }

    /**
     * Create an {@link IndexingStats} instance to record the statistics of the indexing period.
     * 
     * @param indexDocNum - the number of documents indexed
     * @param indexThreadIds - the post ids of the threads indexed
     * @param period - the time consumed in the indexing period, in milliseconds
     * @return the populated statistics object
     */
    private IndexingStats createIndexingStats(int indexDocNum, List<Integer> indexThreadIds, long period) {
        IndexingStats indexingStats = new IndexingStats();
        indexingStats.addStatistic(IndexStatFieldName.INDEX_DOC_NUM.toString(), indexDocNum);
        indexingStats.addStatistic(IndexStatFieldName.INDEX_THREAD_IDS.toString(), indexThreadIds);
        indexingStats.addStatistic(IndexStatFieldName.INDEX_TIME.toString(), period);
        return indexingStats;
    }

    /**
     * Set the mapper of the indexer
     * 
     * @param mapper - the mapper associated with the indexer
     */
    public void setDocumentMapper(DocumentMapper mapper) {
        documentMapper = mapper;
    }

    /**
     * Get the mapper of the indexer. When no mapper has been set, a new
     * {@link LuceneDocumentMapper} is returned on each call (it is not stored in this indexer).
     * 
     * @return the mapper associated with the indexer, or a default mapper if none was set
     */
    public DocumentMapper getDocumentMapper() {
        if (documentMapper == null) {
            return new LuceneDocumentMapper();
        }
        return documentMapper;
    }

    /**
     * Get the index writer in order to add documents to the index. Initialize the index writer if
     * it hasn't been created, or if the previous one has been closed.
     * 
     * @return the index writer which can add the documents to the index
     * @throws IngestionException if the indexer has not been initialized or the writer cannot be
     *         opened on the index directory
     */
    private IndexWriter getIndexWriter() throws IngestionException {
        if (indexWriter == null) {
            if (indexDir == null) {
                throw new IngestionException("The index directory is not set; call initialize() first");
            }
            try {
                IndexWriterConfig config = new IndexWriterConfig(SingletonAnalyzer.getAnalyzer());
                indexWriter = new IndexWriter(indexDir, config);
            } catch (IOException e) {
                logger.fatal(Messages.getString("RetrieveAndRank.DIR_OPEN_FAIL")); //$NON-NLS-1$
                throw new IngestionException(e);
            }
        }
        return indexWriter;
    }

    /**
     * Close the {@link IndexWriter} after all the documents have been added to the index. Flush the
     * index to make it take effect! The writer reference is cleared afterwards so that a later
     * indexing run opens a fresh writer instead of reusing a closed one.
     * 
     * @throws IOException if closing the writer fails
     */
    public void closeIndexWriter() throws IOException {
        if (indexWriter != null) {
            try {
                indexWriter.close();
            } finally {
                indexWriter = null;
            }
        }
    }

    /**
     * Roll back and release the {@link IndexWriter} after a failed indexing run. A failure of the
     * rollback itself is only logged so that it does not mask the exception that caused it.
     */
    private void rollbackIndexWriter() {
        if (indexWriter == null) {
            return;
        }
        try {
            indexWriter.rollback();
        } catch (IOException e) {
            logger.error("Failed to roll back the index writer after an indexing failure", e);
        } finally {
            indexWriter = null;
        }
    }

    /**
     * Get the Lucene directory which stores the index files
     * 
     * @return the directory which stores the index files, or {@code null} before initialization
     */
    public Directory getIndexDir() {
        return indexDir;
    }

    /**
     * clear all the index files in the Lucene directory
     * 
     * @param indexDir - Lucene directory to store index file; nothing is done when it is null
     * @throws IngestionException if the files cannot be listed or deleted
     */
    private void clearIndexDirectory(Directory indexDir) throws IngestionException {
        if (indexDir != null) {
            try {
                String[] files = indexDir.listAll();
                for (String file : files) {
                    indexDir.deleteFile(file);
                }
            } catch (IOException e) {
                throw new IngestionException(e);
            }
        }
    }

    /**
     * save the index statistics file into a disk folder, creating the parent folders if needed
     * 
     * @throws IngestionException if the file cannot be opened or written
     */
    private void saveIndexStatToDisk() throws IngestionException {
        File txtFile = new File(indexStatPath);
        File parentDir = txtFile.getParentFile();
        if (parentDir != null) {
            parentDir.mkdirs();
        }
        // PrintWriter(File) creates or truncates the file itself
        try (PrintWriter out = new PrintWriter(txtFile)) {
            out.println(indexStat.toString());
            // PrintWriter never throws on write errors; checkError() flushes and reports them
            if (out.checkError()) {
                throw new IngestionException("Failed to write index statistics to " + indexStatPath);
            }
        } catch (IOException e) {
            throw new IngestionException(e);
        }
    }

    /**
     * @return the path of the index statistics file, or {@code null} before initialization
     */
    public String getIndexStatPath() {
        return indexStatPath;
    }

    /**
     * @return the statistics of the last completed indexing run, or {@code null} if none has
     *         completed
     */
    public IndexingStats getIndexStat() {
        return indexStat;
    }

}