net.ontopia.infoset.fulltext.impl.lucene.LuceneIndexer.java Source code

Introduction

Here is the source code for net.ontopia.infoset.fulltext.impl.lucene.LuceneIndexer.java, the Lucene-based full-text indexer from the Ontopia Engine. The class implements the IndexerIF interface and also provides a command-line entry point for indexing a topic map.
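
For illustration, the command-line entry point (see the main() and usage() methods in the source) could be invoked roughly like this; the index directory and topic map URL below are placeholders, not values taken from the file:

    java net.ontopia.infoset.fulltext.impl.lucene.LuceneIndexer --syntax=xtm /tmp/ft-index file:topicmap.xtm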

Source

/*
 * #!
 * Ontopia Engine
 * #-
 * Copyright (C) 2001 - 2013 The Ontopia Project
 * #-
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * !#
 */

package net.ontopia.infoset.fulltext.impl.lucene;

import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.io.StringWriter;
import java.util.Iterator;

import net.ontopia.infoset.core.LocatorIF;
import net.ontopia.infoset.fulltext.core.DocumentIF;
import net.ontopia.infoset.fulltext.core.FieldIF;
import net.ontopia.infoset.fulltext.core.IndexerIF;
import net.ontopia.infoset.fulltext.topicmaps.DefaultTopicMapIndexer;
import net.ontopia.topicmaps.core.TopicMapIF;
import net.ontopia.topicmaps.core.TopicMapReaderIF;
import net.ontopia.topicmaps.utils.ImportExportUtils;
import net.ontopia.topicmaps.utils.ltm.LTMTopicMapReader;
import net.ontopia.topicmaps.xml.XTMTopicMapReader;
import net.ontopia.utils.CmdlineOptions;
import net.ontopia.utils.CmdlineUtils;
import net.ontopia.utils.FileUtils;
import net.ontopia.utils.OntopiaRuntimeException;
import net.ontopia.utils.URIUtils;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/**
 * INTERNAL: The Lucene indexer implementation. This indexer uses the
 * Lucene search engine to index documents.<p>
 */

public class LuceneIndexer implements IndexerIF {

    // Define a logging category.
    static Logger log = LoggerFactory.getLogger(LuceneIndexer.class.getName());

    static {
        try {
            net.ontopia.net.data.Handler.install();
        } catch (SecurityException e) {
            // Fail silently if there are security issues.
        }
    }

    protected String path;
    protected Directory dir;

    protected Analyzer analyzer;
    protected IndexWriter writer;
    protected IndexReader reader;

    /**
     * INTERNAL: Creates an indexer instance that will store its index in
     * the given file system directory. Tokenization will be done using
     * the StandardAnalyzer.<p>
     *
     * @param path The file system directory where the index is located.
     * @param create If true the indexer will create a new index (replacing
     *               the old one, if there is one). If false the indexed
     *               information will be added to the existing index.
     */
    public LuceneIndexer(String path, boolean create) throws IOException {
        this(path, new StandardAnalyzer(Version.LUCENE_36), create);
    }

    /**
     * INTERNAL: Creates an indexer instance that will store its index in
     * the given file system directory and use the specified token
     * stream analyzer.<p>
     *
     * @param path The file system directory where the index is located.
     * @param analyzer The token stream analyzer that the searcher is to use.
     * @param create If true the indexer will create a new index (replacing
     *               the old one, if there is one). If false the indexed
     *               information will be added to the existing index.
     */
    public LuceneIndexer(String path, Analyzer analyzer, boolean create) throws IOException {
        this(FSDirectory.open(new File(path)), analyzer, create);
        this.path = path;
    }

    /**
     * INTERNAL: Creates an indexer instance that will store its index in
     * the given lucene directory and use the default token stream
     * analyzer.<p>
     *
     * @param dir The lucene directory where the index is located.
     * @param create If true the indexer will create a new index (replacing
     *               the old one, if there is one). If false the indexed
     *               information will be added to the existing index.
     * @since 3.0
     */
    public LuceneIndexer(Directory dir, boolean create) throws IOException {
        this(dir, new StandardAnalyzer(Version.LUCENE_36), create);
    }

    /**
     * INTERNAL: Creates an indexer instance that will store its index in
     * the given lucene directory and use the specified token stream
     * analyzer.<p>
     *
     * @param dir The lucene directory where the index is located.
     * @param analyzer The token stream analyzer that the searcher is to use.
     * @param create If true the indexer will create a new index (replacing
     *               the old one, if there is one). If false the indexed
     *               information will be added to the existing index.
     */
    public LuceneIndexer(Directory dir, Analyzer analyzer, boolean create) throws IOException {
        this.dir = dir;
        this.analyzer = analyzer;
        if (create)
            this.writer = new IndexWriter(dir, new IndexWriterConfig(Version.LUCENE_36, analyzer));
    }
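
    // Illustrative example (not part of the original source): the constructors
    // above accept any Lucene Directory, so an in-memory index could be created
    // like this, using Lucene 3.6's RAMDirectory; the analyzer choice here is
    // just an assumption:
    //
    //   LuceneIndexer indexer =
    //       new LuceneIndexer(new org.apache.lucene.store.RAMDirectory(),
    //                         new StandardAnalyzer(Version.LUCENE_36), true);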

    protected IndexWriter getWriter() {
        if (writer == null) {
            try {
                if (reader != null) {
                    reader.close();
                    reader = null;
                }
                IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_36, analyzer);
                config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
                writer = new IndexWriter(dir, config);
            } catch (IOException e) {
                throw new OntopiaRuntimeException(e);
            }
        }
        return writer;
    }

    /**
     * INTERNAL: Returns the path where the index used is stored.<p>
     */
    public String getPath() {
        return path;
    }

    /**
     * INTERNAL: Returns the number of documents stored in the index.
     */
    public synchronized int getDocs() throws IOException {
        return getWriter().numDocs();
    }

    public synchronized void index(DocumentIF document) throws IOException {
        getWriter().addDocument(getDocument(document));
    }

    public synchronized void delete(String field, String value) throws IOException {
        getWriter().deleteDocuments(new Term(field, value));
    }

    public synchronized void flush() throws IOException {
        // No-op: buffered changes are persisted when the IndexWriter is closed in close().
    }

    public synchronized void delete() throws IOException {
        // Close indexer
        close();

        // If this is a FSDirectory we can delete the directory
        // recursively.
        if (getPath() != null)
            FileUtils.delete(getPath(), true);
    }

    public synchronized void close() throws IOException {
        if (reader != null) {
            reader.close();
            reader = null;
        }
        if (writer != null) {
            writer.close();
            writer = null;
        }
    }

    protected Document getDocument(DocumentIF document) throws IOException {
        Document lucene_document = new Document();
        Iterator<FieldIF> iter = document.getFields().iterator();
        while (iter.hasNext()) {
            FieldIF field = iter.next();
            lucene_document.add(getField(field));
        }
        return lucene_document;
    }

    protected Field getField(FieldIF field) throws IOException {
        Field lucene_field;
        if (field.getReader() != null) {
            if (!field.isStored() && field.isIndexed() && field.isTokenized())
                lucene_field = new Field(field.getName(), field.getReader()); // Reader based field
            else {
                lucene_field = new Field(field.getName(), getStringValue(field.getReader()), getStoreSetting(field),
                        getIndexSetting(field));
            }
        } else {
            lucene_field = new Field(field.getName(), field.getValue(), getStoreSetting(field),
                    getIndexSetting(field));
        }
        return lucene_field;
    }

    protected Field.Store getStoreSetting(FieldIF field) {
        if (field.isStored()) {
            return Field.Store.YES;
        } else {
            return Field.Store.NO;
        }
    }

    protected Field.Index getIndexSetting(FieldIF field) {
        if (field.isIndexed()) {
            if (field.isTokenized()) {
                return Field.Index.ANALYZED;
            } else {
                return Field.Index.NOT_ANALYZED;
            }
        } else {
            return Field.Index.NO;
        }
    }

    protected String getStringValue(Reader reader) throws IOException {
        // Read the reader contents into a string
        StringWriter swriter = new StringWriter();
        int c;
        while ((c = reader.read()) != -1)
            swriter.write(c);
        return swriter.getBuffer().toString();
    }

    // -----------------------------------------------------------------------------
    // Command line
    // -----------------------------------------------------------------------------

    private static Analyzer createAnalyzer(String acname) {
        try {
            Class<?> aclass = Class.forName(acname);
            return (Analyzer) aclass.newInstance();
        } catch (Throwable e) {
            throw new OntopiaRuntimeException("Could not create analyzer: " + acname, e);
        }
    }

    /**
     * INTERNAL: Command line version of the indexer.
     */
    public static void main(String[] argv) {

        // Initialize logging
        CmdlineUtils.initializeLogging();

        // Initialize command line option parser and listeners
        CmdlineOptions options = new CmdlineOptions("LuceneIndexer", argv);
        OptionsListener ohandler = new OptionsListener();

        // Register local options
        options.addLong(ohandler, "props", 'p', true);
        options.addLong(ohandler, "syntax", 's', true);
        options.addLong(ohandler, "external", 'e');
        options.addLong(ohandler, "timeout", 't', true);
        options.addLong(ohandler, "maxthreads", 'x', true);
        options.addLong(ohandler, "preloaddir", 'd', true);
        options.addLong(ohandler, "analyzer", 'a', true);

        // Register logging options
        CmdlineUtils.registerLoggingOptions(options);

        // Parse command line options
        try {
            options.parse();
        } catch (CmdlineOptions.OptionsException e) {
            System.err.println("Error: " + e.getMessage());
            usage();
            System.exit(1);
        }

        // Get command line arguments
        String[] args = options.getArguments();
        if (args.length != 2) {
            usage();
            System.exit(1);
        }

        try {
            // Create topic map reader
            String urlstring = args[1];
            LocatorIF url = URIUtils.getURI(urlstring);
            TopicMapReaderIF reader;

            if (ohandler.syntax == null) {
                if (ohandler.propfile == null) {
                    reader = ImportExportUtils.getReader(url);
                } else {
                    reader = ImportExportUtils.getReader(ohandler.propfile, urlstring);
                }
            } else if (ohandler.syntax.equals("ltm"))
                reader = new LTMTopicMapReader(url);
            else if (ohandler.syntax.equals("xtm")) {
                reader = new XTMTopicMapReader(url);
            } else
                throw new OntopiaRuntimeException("Unknown syntax: " + ohandler.syntax);

            // HACK: disable validation
            if (reader instanceof XTMTopicMapReader)
                ((XTMTopicMapReader) reader).setValidation(false);

            // Load the topic map
            TopicMapIF topicmap = reader.read();

            // Create a Lucene indexer
            IndexerIF lucene_indexer = null;
            if (ohandler.acname == null)
                lucene_indexer = new LuceneIndexer(args[0], true);
            else
                lucene_indexer = new LuceneIndexer(args[0], createAnalyzer(ohandler.acname), true);

            // Creates an instance of the default topic map indexer.
            DefaultTopicMapIndexer imanager = new DefaultTopicMapIndexer(lucene_indexer, ohandler.external,
                    ohandler.preloaddir);

            // Indexes the topic map
            imanager.index(topicmap);

            // Close indexer and index manager
            imanager.close();
            lucene_indexer.close();

        } catch (Exception e) {
            e.printStackTrace();
            System.exit(3);
        }
    }

    protected static void usage() {
        System.out.println(
                "java net.ontopia.infoset.fulltext.impl.lucene.LuceneIndexer [options] <index> <url>");
        System.out.println();
        System.out.println("  Indexes the specified topic map using the Lucene fulltext indexer.");
        System.out.println();
        System.out.println("  Options:");
        CmdlineUtils.printLoggingOptionsUsage(System.out);
        System.out.println("    --props=<propfile>: the topic map store properties file");
        System.out.println("    --syntax=[xtm|ltm]: the syntax of input document");
        System.out.println("    --external: download and index external resources (default: false)");
        System.out.println("    --timeout=<ms>: timeout in milliseconds when downloading a resource (default: 60000)");
        System.out.println("    --maxthreads=<number>: maximum concurrent preloaders (default: 10)");
        System.out.println("    --preloaddir=<dir>: directory in which preloaded documents are stored");
        System.out.println("                  (default: $user.dir/preloader)");
        System.out.println("    --analyzer=<classname>: class name of the analyzer to use");
        System.out.println();
        System.out.println("  <index>: directory that is to contain the result index.");
        System.out.println("  <url>: source topic map");
    }

    private static class OptionsListener implements CmdlineOptions.ListenerIF {
        String propfile;
        String syntax;
        boolean external = false;
        int timeout = 60000;
        int max_threads = 10;
        String preloaddir = System.getProperty("user.dir") + File.separator + "preloader";
        String acname;

        public void processOption(char option, String value) throws CmdlineOptions.OptionsException {
            if (option == 'e')
                external = true;
            else if (option == 's')
                syntax = value;
            else if (option == 'p')
                propfile = value;
            else if (option == 't')
                timeout = Integer.parseInt(value);
            else if (option == 'd')
                preloaddir = value;
            else if (option == 'x')
                max_threads = Integer.parseInt(value);
            else if (option == 'a')
                acname = value;
        }
    }

}
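
The following is a minimal, separate sketch (not part of the Ontopia distribution) showing how an index produced by this class might be searched with the plain Lucene 3.6 API. The index path, the field name "content", and the query string are assumptions; the actual field names depend on the DocumentIF instances that were indexed.

import java.io.File;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

public class SearchIndexSketch {

    public static void main(String[] args) throws Exception {
        // Open the directory that LuceneIndexer wrote to (path is a placeholder).
        IndexReader reader = IndexReader.open(FSDirectory.open(new File("/tmp/ft-index")));
        IndexSearcher searcher = new IndexSearcher(reader);

        // "content" is a hypothetical field name; use the names of the FieldIF
        // fields that were actually indexed.
        QueryParser parser = new QueryParser(Version.LUCENE_36, "content",
                new StandardAnalyzer(Version.LUCENE_36));

        // Run a sample query and print the stored fields of the top ten hits.
        TopDocs hits = searcher.search(parser.parse("opera"), 10);
        for (ScoreDoc hit : hits.scoreDocs) {
            Document doc = searcher.doc(hit.doc);
            System.out.println(doc);
        }

        searcher.close();
        reader.close();
    }
}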