gate.creole.ir.lucene.LuceneIndexManager.java Source code

Java tutorial

Introduction

Here is the source code for gate.creole.ir.lucene.LuceneIndexManager.java

Source

/*
 *  LuceneIndexManager.java
 *
 *  Copyright (c) 1995-2012, The University of Sheffield. See the file
 *  COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
 *
 *  This file is part of GATE (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Library General Public License,
 *  Version 2, June 1991 (in the distribution as file licence.html,
 *  and also available at http://gaI enjoy seeing
the occasional update on Facebook :te.ac.uk/gate/licence.html).
 *
 *  Rosen Marinov, 19/Apr/2002
 *
 */

package gate.creole.ir.lucene;

import gate.Corpus;
import gate.Document;
import gate.creole.ir.IndexDefinition;
import gate.creole.ir.IndexException;
import gate.creole.ir.IndexField;
import gate.creole.ir.IndexManager;
import gate.util.GateRuntimeException;

import java.io.File;
import java.util.Iterator;
import java.util.List;

import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/** This class represents Lucene implementation of IndexManeager interface.*/
public class LuceneIndexManager implements IndexManager {

    /** used in Lucene Documents as a key for gate document ID value. */
    public final static String DOCUMENT_ID = "DOCUMENT_ID";

    /** IndexDefinition - location, type, fields, etc.*/
    private IndexDefinition indexDefinition;

    /** An corpus for indexing*/
    private Corpus corpus;

    /* Niraj */
    /** constant that ensures that corpus is indexed with IR plugin */
    public final static String CORPUS_INDEX_FEATURE = "CorpusIndexFeature";
    public final static String CORPUS_INDEX_FEATURE_VALUE = "IR";
    /* End */

    /** Constructor of the class. */
    public LuceneIndexManager() {
    }

    /** Creates index directory and indexing all
     *  documents in the corpus. */
    @Override
    public void createIndex() throws IndexException {
        if (indexDefinition == null)
            throw new GateRuntimeException("Index definition is null!");
        if (corpus == null)
            throw new GateRuntimeException("Corpus is null!");

        String location = indexDefinition.getIndexLocation();
        try {
            File file = new File(location);
            if (file.exists()) {
                if (file.isDirectory() && file.listFiles().length > 0) {
                    throw new IndexException(location + " is not empty directory");
                }
                if (!file.isDirectory()) {
                    throw new IndexException("Only empty directory can be index path");
                }
            }

            /* Niraj */
            // ok so lets put the corpus index feature
            corpus.getFeatures().put(CORPUS_INDEX_FEATURE, CORPUS_INDEX_FEATURE_VALUE);
            /* End */

            IndexWriter writer = new IndexWriter(FSDirectory.open(new File(location)),
                    new IndexWriterConfig(Version.LUCENE_31, new SimpleAnalyzer(Version.LUCENE_30))
                            .setOpenMode(OpenMode.CREATE));

            /*IndexWriter writer = new IndexWriter(
                    FSDirectory.open(new File(location)),
                    new SimpleAnalyzer(Version.LUCENE_30), 
                    true,
                    new IndexWriter.MaxFieldLength(IndexWriter.DEFAULT_MAX_FIELD_LENGTH)
                    );*/

            for (int i = 0; i < corpus.size(); i++) {
                boolean isLoaded = corpus.isDocumentLoaded(i);
                gate.Document gateDoc = corpus.get(i);
                writer.addDocument(getLuceneDoc(gateDoc));
                if (!isLoaded) {
                    corpus.unloadDocument(gateDoc);
                    gate.Factory.deleteResource(gateDoc);
                }
            } //for (all documents)

            writer.commit();
            writer.close();
            corpus.sync();
        } catch (java.io.IOException ioe) {
            throw new IndexException(ioe.getMessage());
        } catch (gate.persist.PersistenceException pe) {
            pe.printStackTrace();
        }
    }

    /** Optimize existing index. */
    @Override
    public void optimizeIndex() throws IndexException {
        if (indexDefinition == null)
            throw new GateRuntimeException("Index definition is null!");
        try {

            IndexWriter writer = new IndexWriter(FSDirectory.open(new File(indexDefinition.getIndexLocation())),
                    new IndexWriterConfig(Version.LUCENE_31, new SimpleAnalyzer(Version.LUCENE_30))
                            .setOpenMode(OpenMode.APPEND));

            /*IndexWriter writer = new IndexWriter(
                    FSDirectory.open(new File(indexDefinition.getIndexLocation())),
                    new SimpleAnalyzer(Version.LUCENE_30), 
                    false,
                    new IndexWriter.MaxFieldLength(IndexWriter.DEFAULT_MAX_FIELD_LENGTH));*/

            //writer.optimize();
            writer.forceMerge(1, true);

            writer.commit();
            writer.close();
        } catch (java.io.IOException ioe) {
            throw new IndexException(ioe.getMessage());
        }
    }

    /** Delete index. */
    @Override
    public void deleteIndex() throws IndexException {
        if (indexDefinition == null)
            throw new GateRuntimeException("Index definition is null!");
        boolean isDeleted = true;
        File dir = new File(indexDefinition.getIndexLocation());
        if (dir.exists() && dir.isDirectory()) {
            File[] files = dir.listFiles();
            for (int i = 0; i < files.length; i++) {
                File f = files[i];
                isDeleted = f.delete();
            }
        }
        dir.delete();
        if (!isDeleted) {
            throw new IndexException("Can't delete directory" + indexDefinition.getIndexLocation());
        }
    }

    /** Reindexing changed documents, removing removed documents and
     *  add to the index new corpus documents. */
    @Override
    public void sync(List<Document> added, List<String> removedIDs, List<Document> changed) throws IndexException {
        String location = indexDefinition.getIndexLocation();
        try {

            IndexReader reader = IndexReader.open(FSDirectory.open(new File(location)), false);

            for (int i = 0; i < removedIDs.size(); i++) {
                String id = removedIDs.get(i).toString();
                org.apache.lucene.index.Term term = new org.apache.lucene.index.Term(DOCUMENT_ID, id);
                reader.deleteDocuments(term);
            } //for (remove all removed documents)

            for (int i = 0; i < changed.size(); i++) {
                gate.Document gateDoc = changed.get(i);
                String id = gateDoc.getLRPersistenceId().toString();
                org.apache.lucene.index.Term term = new org.apache.lucene.index.Term(DOCUMENT_ID, id);
                reader.deleteDocuments(term);
            } //for (remove all changed documents)

            reader.close();

            /*IndexWriter writer = new IndexWriter(
                    FSDirectory.open(new File(location)),
                    new SimpleAnalyzer(Version.LUCENE_30), 
                    false,
                    new IndexWriter.MaxFieldLength(IndexWriter.DEFAULT_MAX_FIELD_LENGTH)
                    );*/
            IndexWriter writer = new IndexWriter(FSDirectory.open(new File(location)),
                    new IndexWriterConfig(Version.LUCENE_31, new SimpleAnalyzer(Version.LUCENE_30))
                            .setOpenMode(OpenMode.APPEND));

            for (int i = 0; i < added.size(); i++) {
                gate.Document gateDoc = added.get(i);
                writer.addDocument(getLuceneDoc(gateDoc));
            } //for (add all added documents)

            for (int i = 0; i < changed.size(); i++) {
                gate.Document gateDoc = changed.get(i);
                writer.addDocument(getLuceneDoc(gateDoc));
            } //for (add all changed documents)

            writer.close();
        } catch (java.io.IOException ioe) {
            throw new IndexException(ioe.getMessage());
        }
    }

    private org.apache.lucene.document.Document getLuceneDoc(gate.Document gateDoc) {
        org.apache.lucene.document.Document luceneDoc = new org.apache.lucene.document.Document();

        Iterator<IndexField> fields = indexDefinition.getIndexFields();

        //    luceneDoc.add(Field.Keyword(DOCUMENT_ID,
        //                                gateDoc.getLRPersistenceId().toString()));

        // update version of Lucene
        luceneDoc.add(new Field(DOCUMENT_ID, gateDoc.getLRPersistenceId().toString(), Field.Store.YES,
                Field.Index.NOT_ANALYZED));

        while (fields.hasNext()) {
            IndexField field = fields.next();
            String valueForIndexing;

            if (field.getReader() == null) {
                valueForIndexing = gateDoc.getFeatures().get(field.getName()).toString();
            } else {
                valueForIndexing = field.getReader().getPropertyValue(gateDoc);
            } //if-else reader or feature

            if (field.isPreseved()) {
                luceneDoc.add(
                        new Field(field.getName(), valueForIndexing, Field.Store.YES, Field.Index.NOT_ANALYZED));
                // luceneDoc.add(Field.Keyword(field.getName(),valueForIndexing));
            } else {
                luceneDoc.add(new Field(field.getName(), valueForIndexing, Field.Store.NO, Field.Index.ANALYZED));
                // luceneDoc.add(Field.UnStored(field.getName(),valueForIndexing));
            } // if-else keyword or text

        } // while (add all fields)

        return luceneDoc;
    }

    @Override
    public Corpus getCorpus() {
        return corpus;
    }

    @Override
    public void setCorpus(Corpus corpus) {
        this.corpus = corpus;
    }

    @Override
    public IndexDefinition getIndexDefinition() {
        return indexDefinition;
    }

    @Override
    public void setIndexDefinition(IndexDefinition indexDefinition) {
        this.indexDefinition = indexDefinition;
    }

}