org.apache.jackrabbit.core.query.lucene.AbstractIndex.java Source code

Introduction

Here is the source code for org.apache.jackrabbit.core.query.lucene.AbstractIndex.java. A few illustrative sketches of the patterns it uses follow the listing.

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.jackrabbit.core.query.lucene;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.io.OutputStream;
import java.io.PrintStream;
import java.io.StringReader;
import java.util.BitSet;
import java.util.Iterator;

/**
 * Implements common functionality for a lucene index.
 * <p/>
 * Note on synchronization: This class is not entirely thread-safe. Certain
 * concurrent access is, however, allowed. Read-only access to this index via
 * {@link #getReadOnlyIndexReader()} is thread-safe. That is, multiple threads
 * may call that method concurrently and use the returned IndexReader at the
 * same time.<br/>
 * Modifying threads must be synchronized externally so that only one thread
 * at a time uses the IndexReader and IndexWriter instances returned by
 * {@link #getIndexReader()} and {@link #getIndexWriter()}.<br/>
 * Concurrent access by <b>one</b> modifying thread and multiple read-only
 * threads is safe!
 */
abstract class AbstractIndex {

    /** The logger instance for this class */
    private static final Logger log = LoggerFactory.getLogger(AbstractIndex.class);

    /** PrintStream that pipes all calls to println(String) into log.debug() */
    private static final LoggingPrintStream STREAM_LOGGER = new LoggingPrintStream();

    /** The currently set IndexWriter or <code>null</code> if none is set */
    private IndexWriter indexWriter;

    /** The currently set IndexReader or <code>null</code> if none is set */
    private CommittableIndexReader indexReader;

    /** The underlying Directory where the index is stored */
    private Directory directory;

    /** Analyzer we use to tokenize text */
    private Analyzer analyzer;

    /** Compound file flag */
    private boolean useCompoundFile = true;

    /** minMergeDocs config parameter */
    private int minMergeDocs = SearchIndex.DEFAULT_MIN_MERGE_DOCS;

    /** maxMergeDocs config parameter */
    private int maxMergeDocs = SearchIndex.DEFAULT_MAX_MERGE_DOCS;

    /** mergeFactor config parameter */
    private int mergeFactor = SearchIndex.DEFAULT_MERGE_FACTOR;

    /** maxFieldLength config parameter */
    private int maxFieldLength = SearchIndex.DEFAULT_MAX_FIELD_LENGTH;

    /**
     * The document number cache, or <code>null</code> if this index does not use one.
     */
    private DocNumberCache cache;

    /** The shared IndexReader for all read-only IndexReaders */
    private SharedIndexReader sharedReader;

    /**
     * The indexing queue.
     */
    private IndexingQueue indexingQueue;

    /**
     * Constructs an index with an <code>analyzer</code> and a
     * <code>directory</code>.
     *
     * @param analyzer      the analyzer for text tokenizing.
     * @param directory     the underlying directory.
     * @param cache         the document number cache if this index should use
     *                      one; otherwise <code>cache</code> is
     *                      <code>null</code>.
     * @param indexingQueue the indexing queue.
     * @throws IOException if the index cannot be initialized.
     */
    AbstractIndex(Analyzer analyzer, Directory directory, DocNumberCache cache, IndexingQueue indexingQueue)
            throws IOException {
        this.analyzer = analyzer;
        this.directory = directory;
        this.cache = cache;
        this.indexingQueue = indexingQueue;

        if (!IndexReader.indexExists(directory)) {
            indexWriter = new IndexWriter(directory, analyzer);
            // immediately close, now that index has been created
            indexWriter.close();
            indexWriter = null;
        }
    }

    /**
     * Default implementation returns the same instance as passed
     * in the constructor.
     *
     * @return the directory instance passed in the constructor
     * @throws IOException if the directory cannot be obtained.
     */
    Directory getDirectory() throws IOException {
        return directory;
    }

    /**
     * Adds a document to this index and invalidates the shared reader.
     *
     * @param doc the document to add.
     * @throws IOException if an error occurs while writing to the index.
     */
    void addDocument(Document doc) throws IOException {
        // check if text extractor completed its work
        doc = getFinishedDocument(doc);
        getIndexWriter().addDocument(doc);
        invalidateSharedReader();
    }

    /**
     * Removes the document from this index. This call will not invalidate
     * the shared reader. If a subclass wishes to do so, it should override
     * this method and call {@link #invalidateSharedReader()}.
     *
     * @param idTerm the id term of the document to remove.
     * @throws IOException if an error occurs while removing the document.
     * @return number of documents deleted
     */
    int removeDocument(Term idTerm) throws IOException {
        return getIndexReader().deleteDocuments(idTerm);
    }

    /**
     * Returns an <code>IndexReader</code> on this index. This index reader
     * may be used to delete documents.
     *
     * @return an <code>IndexReader</code> on this index.
     * @throws IOException if the reader cannot be obtained.
     */
    protected synchronized IndexReader getIndexReader() throws IOException {
        if (indexWriter != null) {
            indexWriter.close();
            log.debug("closing IndexWriter.");
            indexWriter = null;
        }
        if (indexReader == null) {
            indexReader = new CommittableIndexReader(IndexReader.open(getDirectory()));
        }
        return indexReader;
    }

    /**
     * Returns a read-only index reader, that can be used concurrently with
     * other threads writing to this index. The returned index reader is
     * read-only, that is, any attempt to delete a document from the index
     * will throw an <code>UnsupportedOperationException</code>.
     *
     * @return a read-only index reader.
     * @throws IOException if an error occurs while obtaining the index reader.
     */
    synchronized ReadOnlyIndexReader getReadOnlyIndexReader() throws IOException {
        // get current modifiable index reader
        IndexReader modifiableReader = getIndexReader();
        // capture snapshot of deleted documents
        BitSet deleted = new BitSet(modifiableReader.maxDoc());
        for (int i = 0; i < modifiableReader.maxDoc(); i++) {
            if (modifiableReader.isDeleted(i)) {
                deleted.set(i);
            }
        }
        if (sharedReader == null) {
            // create new shared reader
            CachingIndexReader cr = new CachingIndexReader(IndexReader.open(getDirectory()), cache);
            sharedReader = new SharedIndexReader(cr);
        }
        return new ReadOnlyIndexReader(sharedReader, deleted);
    }

    /**
     * Returns an <code>IndexWriter</code> on this index.
     * @return an <code>IndexWriter</code> on this index.
     * @throws IOException if the writer cannot be obtained.
     */
    protected synchronized IndexWriter getIndexWriter() throws IOException {
        if (indexReader != null) {
            indexReader.close();
            log.debug("closing IndexReader.");
            indexReader = null;
        }
        if (indexWriter == null) {
            indexWriter = new IndexWriter(getDirectory(), analyzer);
            // since Lucene 2.0, setMaxBufferedDocs is equivalent to the former minMergeDocs attribute
            indexWriter.setMaxBufferedDocs(minMergeDocs);
            indexWriter.setMaxMergeDocs(maxMergeDocs);
            indexWriter.setMergeFactor(mergeFactor);
            indexWriter.setMaxFieldLength(maxFieldLength);
            indexWriter.setUseCompoundFile(useCompoundFile);
            indexWriter.setInfoStream(STREAM_LOGGER);
        }
        return indexWriter;
    }

    /**
     * Commits all pending changes to the underlying <code>Directory</code>.
     * @throws IOException if an error occurs while committing changes.
     */
    protected void commit() throws IOException {
        commit(false);
    }

    /**
     * Commits all pending changes to the underlying <code>Directory</code>.
     *
     * @param optimize if <code>true</code> the index is optimized after the
     *                 commit.
     * @throws IOException if an error occurs while committing changes.
     */
    protected synchronized void commit(boolean optimize) throws IOException {
        // if index is not locked there are no pending changes
        if (!IndexReader.isLocked(getDirectory())) {
            return;
        }

        if (indexReader != null) {
            indexReader.commitDeleted();
        }
        if (indexWriter != null) {
            log.debug("committing IndexWriter.");
            indexWriter.close();
            indexWriter = null;
        }
        // optimize if requested
        if (optimize) {
            IndexWriter writer = getIndexWriter();
            writer.optimize();
            writer.close();
            indexWriter = null;
        }
    }

    /**
     * Closes this index, releasing all held resources.
     */
    synchronized void close() {
        if (indexWriter != null) {
            try {
                indexWriter.close();
            } catch (IOException e) {
                log.warn("Exception closing index writer: " + e.toString());
            }
            indexWriter = null;
        }
        if (indexReader != null) {
            try {
                indexReader.close();
            } catch (IOException e) {
                log.warn("Exception closing index reader: " + e.toString());
            }
            indexReader = null;
        }
        if (sharedReader != null) {
            try {
                sharedReader.close();
            } catch (IOException e) {
                log.warn("Exception closing index reader: " + e.toString());
            }
        }
        if (directory != null) {
            try {
                directory.close();
            } catch (IOException e) {
                log.warn("Exception closing directory: " + e.toString());
            }
            directory = null;
        }
    }

    /**
     * Closes the shared reader.
     *
     * @throws IOException if an error occurs while closing the reader.
     */
    protected synchronized void invalidateSharedReader() throws IOException {
        // invalidate shared reader
        if (sharedReader != null) {
            sharedReader.close();
            sharedReader = null;
        }
    }

    /**
     * Returns a document that is finished with text extraction and is ready to
     * be added to the index.
     *
     * @param doc the document to check.
     * @return <code>doc</code> if it is finished already, or a stripped-down
     *         copy of <code>doc</code> where reader-valued (text extraction)
     *         fields are replaced with empty content.
     * @throws IOException if the document cannot be added to the indexing
     *                     queue.
     */
    private Document getFinishedDocument(Document doc) throws IOException {
        if (!Util.isDocumentReady(doc)) {
            Document copy = new Document();
            for (Iterator fields = doc.getFields().iterator(); fields.hasNext();) {
                Field f = (Field) fields.next();
                Field field = null;
                Field.TermVector tv = getTermVectorParameter(f);
                Field.Store stored = getStoreParameter(f);
                Field.Index indexed = getIndexParameter(f);
                if (f.readerValue() != null) {
                    // replace all readers with empty string reader
                    field = new Field(f.name(), new StringReader(""), tv);
                } else if (f.stringValue() != null) {
                    field = new Field(f.name(), f.stringValue(), stored, indexed, tv);
                } else if (f.isBinary()) {
                    field = new Field(f.name(), f.binaryValue(), stored);
                }
                if (field != null) {
                    field.setOmitNorms(f.getOmitNorms());
                    copy.add(field);
                }
            }
            // schedule the original document for later indexing
            Document existing = indexingQueue.addDocument(doc);
            if (existing != null) {
                // the queue already contained a pending document for this
                // node. -> dispose the document
                Util.disposeDocument(existing);
            }
            // use the stripped down copy for now
            doc = copy;
        }
        return doc;
    }

    //-------------------------< properties >-----------------------------------

    /**
     * The lucene index writer property: useCompoundFile
     */
    void setUseCompoundFile(boolean b) {
        useCompoundFile = b;
        if (indexWriter != null) {
            indexWriter.setUseCompoundFile(b);
        }
    }

    /**
     * The lucene index writer property: minMergeDocs
     */
    void setMinMergeDocs(int minMergeDocs) {
        this.minMergeDocs = minMergeDocs;
        if (indexWriter != null) {
            // since Lucene 2.0, setMaxBufferedDocs is equivalent to the former minMergeDocs attribute
            indexWriter.setMaxBufferedDocs(minMergeDocs);
        }
    }

    /**
     * The lucene index writer property: maxMergeDocs
     */
    void setMaxMergeDocs(int maxMergeDocs) {
        this.maxMergeDocs = maxMergeDocs;
        if (indexWriter != null) {
            indexWriter.setMaxMergeDocs(maxMergeDocs);
        }
    }

    /**
     * The lucene index writer property: mergeFactor
     */
    void setMergeFactor(int mergeFactor) {
        this.mergeFactor = mergeFactor;
        if (indexWriter != null) {
            indexWriter.setMergeFactor(mergeFactor);
        }
    }

    /**
     * The lucene index writer property: maxFieldLength
     */
    void setMaxFieldLength(int maxFieldLength) {
        this.maxFieldLength = maxFieldLength;
        if (indexWriter != null) {
            indexWriter.setMaxFieldLength(maxFieldLength);
        }
    }

    //------------------------------< internal >--------------------------------

    /**
     * Returns the index parameter set on <code>f</code>.
     *
     * @param f a lucene field.
     * @return the index parameter on <code>f</code>.
     */
    private Field.Index getIndexParameter(Field f) {
        if (!f.isIndexed()) {
            return Field.Index.NO;
        } else if (f.isTokenized()) {
            return Field.Index.TOKENIZED;
        } else {
            return Field.Index.UN_TOKENIZED;
        }
    }

    /**
     * Returns the store parameter set on <code>f</code>.
     *
     * @param f a lucene field.
     * @return the store parameter on <code>f</code>.
     */
    private Field.Store getStoreParameter(Field f) {
        if (f.isCompressed()) {
            return Field.Store.COMPRESS;
        } else if (f.isStored()) {
            return Field.Store.YES;
        } else {
            return Field.Store.NO;
        }
    }

    /**
     * Returns the term vector parameter set on <code>f</code>.
     *
     * @param f a lucene field.
     * @return the term vector parameter on <code>f</code>.
     */
    private Field.TermVector getTermVectorParameter(Field f) {
        if (f.isStorePositionWithTermVector() && f.isStoreOffsetWithTermVector()) {
            return Field.TermVector.WITH_POSITIONS_OFFSETS;
        } else if (f.isStorePositionWithTermVector()) {
            return Field.TermVector.WITH_POSITIONS;
        } else if (f.isStoreOffsetWithTermVector()) {
            return Field.TermVector.WITH_OFFSETS;
        } else if (f.isTermVectorStored()) {
            return Field.TermVector.YES;
        } else {
            return Field.TermVector.NO;
        }
    }

    /**
     * Adapter to pipe info messages from lucene into log messages.
     */
    private static final class LoggingPrintStream extends PrintStream {

        /** Buffer print calls until a newline is written */
        private StringBuffer buffer = new StringBuffer();

        public LoggingPrintStream() {
            super(new OutputStream() {
                public void write(int b) {
                    // do nothing
                }
            });
        }

        public void print(String s) {
            buffer.append(s);
        }

        public void println(String s) {
            buffer.append(s);
            log.debug(buffer.toString());
            buffer.setLength(0);
        }
    }
}
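
Examples

The listing's core discipline is that at most one of indexWriter and indexReader is open at a time: getIndexWriter() closes any open reader, and getIndexReader() closes any open writer. In this era of Lucene, additions go through IndexWriter while deletions go through IndexReader, and both need the index write lock, so the two handles must be toggled. Here is a minimal standalone sketch of that pattern, assuming nothing from Jackrabbit; it uses only the Lucene 2.x-era calls that also appear above, and RAMDirectory and StandardAnalyzer are illustrative choices.

import java.io.IOException;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

/** Sketch: at most one of writer/reader is open at any time. */
class ToggleIndexSketch {

    private final Directory dir = new RAMDirectory();
    private IndexWriter writer;
    private IndexReader reader;

    ToggleIndexSketch() throws IOException {
        // create an empty index, as the AbstractIndex constructor does
        new IndexWriter(dir, new StandardAnalyzer()).close();
    }

    synchronized IndexWriter getWriter() throws IOException {
        if (reader != null) {
            reader.close();      // commits pending deletes, releases the write lock
            reader = null;
        }
        if (writer == null) {
            writer = new IndexWriter(dir, new StandardAnalyzer());
        }
        return writer;
    }

    synchronized IndexReader getReader() throws IOException {
        if (writer != null) {
            writer.close();      // flushes pending additions
            writer = null;
        }
        if (reader == null) {
            reader = IndexReader.open(dir);
        }
        return reader;
    }

    synchronized int delete(Term idTerm) throws IOException {
        // deletions go through the reader, as in removeDocument(Term) above
        return getReader().deleteDocuments(idTerm);
    }
}

Both accessors are synchronized, but as the class comment above notes, that only makes individual calls safe; callers must still ensure there is only one modifying thread.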
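getReadOnlyIndexReader() gives query threads a stable view while the single modifying thread keeps working: it copies the reader's deletion state into a BitSet, so documents deleted after the call do not suddenly disappear from readers that were handed out earlier. A sketch of just that snapshot step (the class and method names here are made up for illustration):

import java.util.BitSet;

import org.apache.lucene.index.IndexReader;

final class DeletionSnapshot {

    private DeletionSnapshot() {
    }

    /** Copies the current deletion state of <code>reader</code> into a BitSet. */
    static BitSet snapshot(IndexReader reader) {
        int maxDoc = reader.maxDoc();
        BitSet deleted = new BitSet(maxDoc);
        for (int i = 0; i < maxDoc; i++) {
            if (reader.isDeleted(i)) {
                deleted.set(i);
            }
        }
        return deleted;
    }
}

A reader wrapper that consults the returned BitSet instead of calling isDeleted(i) on the live reader is isolated from later deletions, which is what ReadOnlyIndexReader does with the BitSet passed to its constructor.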
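getIndexWriter() re-applies the merge configuration every time it opens a writer, since the settings do not survive a close. The sketch below exercises the same Lucene 2.x-era setters on a throwaway writer; the values are illustrative and are not Jackrabbit's defaults (those come from the SearchIndex.DEFAULT_* constants).

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.RAMDirectory;

public class WriterTuningSketch {

    public static void main(String[] args) throws Exception {
        IndexWriter writer = new IndexWriter(new RAMDirectory(), new StandardAnalyzer());
        writer.setMaxBufferedDocs(100);   // docs buffered in RAM before a segment is flushed
                                          // (since Lucene 2.0 this replaces minMergeDocs)
        writer.setMaxMergeDocs(100000);   // segments beyond this size are left unmerged
        writer.setMergeFactor(10);        // number of segments merged at a time
        writer.setMaxFieldLength(10000);  // terms per field beyond this are dropped
        writer.setUseCompoundFile(true);  // pack each segment into one compound file
        writer.close();
    }
}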
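LoggingPrintStream exists because IndexWriter.setInfoStream(PrintStream) only accepts a PrintStream: the adapter discards the raw byte stream, buffers print(String) calls, and forwards each completed line to SLF4J. The variant below is self-contained and shows the wiring; the logger name is an assumption.

import java.io.OutputStream;
import java.io.PrintStream;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.RAMDirectory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/** Pipes Lucene's infoStream diagnostics into an SLF4J logger. */
class Slf4jInfoStream extends PrintStream {

    // logger name is illustrative, not part of the original class
    private static final Logger log = LoggerFactory.getLogger("lucene.infoStream");

    /** Buffer print calls until a newline is written. */
    private final StringBuffer buffer = new StringBuffer();

    Slf4jInfoStream() {
        super(new OutputStream() {
            public void write(int b) {
                // discard; all output arrives via print/println below
            }
        });
    }

    public void print(String s) {
        buffer.append(s);
    }

    public void println(String s) {
        buffer.append(s);
        log.debug(buffer.toString());
        buffer.setLength(0);
    }

    public static void main(String[] args) throws Exception {
        IndexWriter writer = new IndexWriter(new RAMDirectory(), new StandardAnalyzer());
        writer.setInfoStream(new Slf4jInfoStream()); // Lucene diagnostics now go to SLF4J
        writer.close();
    }
}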