psidev.psi.mi.search.index.PsimiIndexWriter.java Source code

Java tutorial

Introduction

Here is the source code for psidev.psi.mi.search.index.PsimiIndexWriter.java

Source

/**
 * Copyright 2007 The European Bioinformatics Institute, and others.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package psidev.psi.mi.search.index;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.hupo.psi.calimocho.io.IllegalFieldException;
import org.hupo.psi.calimocho.model.Row;
import psidev.psi.mi.search.util.DocumentBuilder;
import psidev.psi.mi.tab.PsimiTabException;
import psidev.psi.mi.tab.converter.txt2tab.MitabLineException;
import psidev.psi.mi.tab.model.BinaryInteraction;
import psidev.psi.mi.xml.converter.ConverterException;

import java.io.*;

/**
 * TODO comment this!
 *
 * @author Bruno Aranda (baranda@ebi.ac.uk)
 * @version $Id$
 */
public class PsimiIndexWriter {

    private static final Log log = LogFactory.getLog(PsimiIndexWriter.class);

    /**
    * Determines how often segment indices are merged by addDocument().
    * With smaller values, less RAM is used while indexing, and searches
    * on unoptimized indices are faster, but indexing speed is slower.
    * With larger values, more RAM is used during indexing, and while
    * searches on unoptimized indices are slower, indexing is faster.
    * Thus larger values (> 10) are best for batch index creation, and
    * smaller values (< 10) for indices that are interactively maintained.
    * This must never be less than 2. The default value is 10.
    */
    public static final int MERGE_FACTOR = 30;

    private DocumentBuilder documentBuilder;

    public PsimiIndexWriter(DocumentBuilder documentBuilder) {
        this.documentBuilder = documentBuilder;
    }

    public void index(String indexDir, InputStream is, boolean createIndex, boolean hasHeaderLine)
            throws IOException, ConverterException, MitabLineException {
        Directory directory = FSDirectory.open(new File(indexDir));
        index(directory, is, createIndex, hasHeaderLine);
    }

    public void index(File indexDir, InputStream is, boolean createIndex, boolean hasHeaderLine)
            throws IOException, ConverterException, MitabLineException {
        Directory directory = FSDirectory.open(indexDir);
        index(directory, is, createIndex, hasHeaderLine);
    }

    public void index(Directory directory, InputStream is, boolean createIndex, boolean hasHeaderLine)
            throws IOException, ConverterException, MitabLineException {
        IndexWriterConfig writerConfig = new IndexWriterConfig(Version.LUCENE_30,
                new StandardAnalyzer(Version.LUCENE_30));
        LogMergePolicy policy = new LogDocMergePolicy();
        policy.setMergeFactor(MERGE_FACTOR);
        policy.setMaxMergeDocs(Integer.MAX_VALUE);
        writerConfig.setMergePolicy(policy);

        IndexWriter indexWriter = new IndexWriter(directory, writerConfig);

        if (createIndex) {
            indexWriter.commit();
            indexWriter.deleteAll();
            indexWriter.commit();
        }

        index(indexWriter, is, hasHeaderLine);
        indexWriter.close();
    }

    public void index(IndexWriter indexModifier, InputStream is, boolean hasHeaderLine)
            throws IOException, ConverterException, MitabLineException {
        if (log.isInfoEnabled())
            log.info("Starting index creation: " + indexModifier);
        long startTime = System.currentTimeMillis();

        BufferedReader reader = new BufferedReader(new InputStreamReader(is));

        if (hasHeaderLine) {
            reader.readLine();
        }

        String line;
        while ((line = reader.readLine()) != null) {
            if (log.isTraceEnabled())
                log.trace("\tIndexing: " + line);

            addLineToIndex(indexModifier, line);
        }

        // close reader
        reader.close();

        if (log.isInfoEnabled()) {
            long elapsedTime = (System.currentTimeMillis() - startTime) / 1000;
            log.info("Index created. Time: " + elapsedTime + "s");
        }
    }

    /**
     * @deprecated Use addBinaryInteractionToIndex(IndexWriter, BinaryInteraction) method instead
     */
    @Deprecated
    public void addLineToIndex(IndexWriter indexWriter, String line) throws IOException, MitabLineException {
        try {
            addBinaryInteractionToIndex(indexWriter, documentBuilder.getMitabReader().readLine(line));
        } catch (PsimiTabException e) {
            throw new MitabLineException(e);
        }
    }

    public void addBinaryInteractionToIndex(IndexWriter indexWriter, BinaryInteraction binaryInteraction)
            throws IOException, MitabLineException {
        indexWriter.addDocument(documentBuilder.createDocument(binaryInteraction));
    }

    public void addBinaryInteractionToIndex(IndexWriter indexWriter, Row row)
            throws IOException, IllegalFieldException {
        indexWriter.addDocument(documentBuilder.createDocument(row));
    }

    public DocumentBuilder getDocumentBuilder() {
        return documentBuilder;
    }
}