edu.cmu.cs.in.hoop.hoops.save.HoopDocumentIndexer.java Source code

Introduction

Here is the source code for edu.cmu.cs.in.hoop.hoops.save.HoopDocumentIndexer.java
Source

/** 
 * Author: Martin van Velsen <vvelsen@cs.cmu.edu>
 * 
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Lesser General Public License as 
 *  published by the Free Software Foundation, either version 3 of the 
 *  License, or (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 * 
 */

package edu.cmu.cs.in.hoop.hoops.save;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.LongField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

import edu.cmu.cs.in.base.HoopLink;
import edu.cmu.cs.in.base.kv.HoopKV;
import edu.cmu.cs.in.base.kv.HoopKVDocument;
import edu.cmu.cs.in.hoop.hoops.base.HoopBase;
import edu.cmu.cs.in.hoop.hoops.base.HoopSaveBase;

/**
* 
*/
public class HoopDocumentIndexer extends HoopSaveBase {
    private static final long serialVersionUID = -8495052993868801211L;

    // Lucene variables ...

    private IndexWriter writer = null;
    private boolean create = false; // This will setup an append if the index is already available
    private String searchStore = "";

    /**
     *
     */
    public HoopDocumentIndexer() {
        setClassName("HoopDocumentIndexer");
        debug("HoopDocumentIndexer ()");

        setHoopDescription("Lucene Document Indexer");
    }

    /**
     * 
     */
    public void reset() {
        debug("reset ()");

        super.reset();

        initSearch();
    }

    /**
     *
     */
    public Boolean runHoop(HoopBase inHoop) {
        debug("runHoop ()");

        ArrayList<HoopKV> inData = inHoop.getData();

        if (inData == null) {
            this.setErrorString("Error: no input data to work with");
            return (false);
        }

        for (int t = 0; t < inData.size(); t++) {
            HoopKV aKV = inData.get(t);

            if (aKV instanceof HoopKVDocument) {
                HoopKVDocument newDocument = (HoopKVDocument) aKV;

                addToSearchIndex(newDocument);
            }

            updateProgressStatus(t, inData.size());
        }

        return (true);
    }

    /**
     * 
     */
    private void initSearch() {
        debug("initSearch ()");

        if (searchStore.isEmpty() == true) {
            searchStore = getProjectPath() + "/system/search";

            File checker = new File(searchStore);

            if (checker.exists() == false) {
                if (HoopLink.fManager.createDirectory(searchStore) == false) {
                    debug("Error creating search directory: " + searchStore);
                    return;
                }
            } else
                debug("Document search directory exists, excellent");
        }

        debug("Indexing to directory '" + searchStore + "'...");

        if (writer == null) {
            Directory dir = null;

            try {
                dir = FSDirectory.open(new File(searchStore));
            } catch (IOException e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
                return;
            }

            Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_40);
            IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_40, analyzer);

            if (create) {
                // Create a new index in the directory, removing any
                // previously indexed documents:
                iwc.setOpenMode(OpenMode.CREATE);
            } else {
                // Add new documents to an existing index:
                iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
            }

            // Optional: for better indexing performance, if you
            // are indexing many documents, increase the RAM
            // buffer.  But if you do this, increase the max heap
            // size to the JVM (eg add -Xmx512m or -Xmx1g):
            //
            // iwc.setRAMBufferSizeMB(256.0);

            try {
                writer = new IndexWriter(dir, iwc);
            } catch (IOException e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
                writer = null; // Disable
            }
        }
    }

    /**
     * 
     */
    protected void addToSearchIndex(HoopKVDocument aDocument) {
        debug("addToSearchIndex ()");

        // make a new, empty document
        Document doc = new Document();

        // Add the path of the file as a field named "path".  Use a
        // field that is indexed (i.e. searchable), but don't tokenize 
        // the field into separate words and don't index term frequency
        // or positional information:

        //Field pathField = new StringField("path", file.getPath(), Field.Store.YES);
        Field pathField = new StringField("path", aDocument.url.getValue(), Field.Store.YES);
        doc.add(pathField);

        // Add the last modified date of the file a field named "modified".
        // Use a LongField that is indexed (i.e. efficiently filterable with
        // NumericRangeFilter).  This indexes to milli-second resolution, which
        // is often too fine.  You could instead create a number based on
        // year/month/day/hour/minutes/seconds, down the resolution you require.
        // For example the long value 2011021714 would mean
        // February 17, 2011, 2-3 PM.

        //doc.add(new LongField("modified", file.lastModified(), Field.Store.NO));
        doc.add(new LongField("modified", aDocument.modifiedDate.getValueSize(), Field.Store.NO));

        // Add the contents of the file to a field named "contents".  Specify a Reader,
        // so that the text of the file is tokenized and indexed, but not stored.
        // Note that FileReader expects the file to be in UTF-8 encoding.
        // If that's not the case searching for special characters will fail.

        doc.add(new TextField("contents", aDocument.text.getValue(), Field.Store.NO));

        if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
            // New index, so we just add the document (no old document can be there):
            // System.out.println("adding " + file);
            debug("Adding " + aDocument.url.getValue() + " to search index");

            try {
                writer.addDocument(doc);
            } catch (IOException e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            }
        } else {
            // Existing index (an old copy of this document may have been indexed) so 
            // we use updateDocument instead to replace the old one matching the exact 
            // path, if present:

            debug("Updating " + aDocument.url.getValue() + " in search index");
            try {
                writer.updateDocument(new Term("path", aDocument.url.getValue()), doc);
            } catch (IOException e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            }
        }
    }

    /**
     * 
     */
    public HoopBase copy() {
        return (new HoopDocumentIndexer());
    }
}