org.dyndns.andreasbaumann.LuceneAnalyzer.java Source code


Introduction

Here is the source code for org.dyndns.andreasbaumann.LuceneAnalyzer.java

Source

/*
 *   LuceneAnalyzer - Lucene Index Analyzer
 *
 *   Copyright (C) 2006  Andreas Baumann
 *
 *   This program is free software; you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation; either version 2 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program; if not, write to the Free Software
 *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

package org.dyndns.andreasbaumann;

import java.io.File;
import java.io.IOException;
import java.util.Collection;
import java.util.Iterator;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;

import jargs.gnu.CmdLineParser;
import jargs.gnu.CmdLineParser.Option;
import jargs.gnu.CmdLineParser.OptionException;

import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexReader.FieldOption;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.index.TermPositions;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;

import java.util.logging.LogManager;
import java.util.logging.Logger;
import org.apache.solr.core.CoreContainer;
import org.apache.solr.core.CoreDescriptor;
import org.apache.solr.core.SolrResourceLoader;
import org.apache.solr.core.SolrConfig;
import org.apache.solr.core.SolrCore;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.search.SolrIndexSearcher;
import javax.xml.parsers.ParserConfigurationException;
import org.xml.sax.SAXException;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.schema.FieldType;

/**
 * Lucene index analyzer. Works for file system indexes only (not
 * for indexes held entirely in RAM or in other persistence systems
 * such as a JDBCDirectory).
 *
 * Note: requires at least Lucene 3.0!
 *
 * @author Andreas Baumann, <abaumann@yahoo.com>
 * @version $Id$
 */

public class LuceneAnalyzer {
    private static final String programName = "lucenanalyzer";
    private static final String versionString = "0.0.4";

    static {
        // disable all java.util.logging output so that library log messages don't clutter the report
        LogManager.getLogManager().reset();
        Logger globalLogger = Logger.getLogger("" /* java.util.logging.Logger.GLOBAL_LOGGER_NAME */);
        globalLogger.setLevel(java.util.logging.Level.OFF);
    }

    private static void printGlobalInfo(IndexReader indexReader, boolean printHeaders, boolean isSolr,
            SolrIndexSearcher solrSearch) throws IOException {
        if (printHeaders) {
            System.out.println("Global Information:");
            System.out.println("===================");
        }

        System.out.println("\tnumber of documents: " + indexReader.numDocs());

        // We should obtain the number of features differently; this is inefficient, but Lucene
        // keeps no global statistics (because the default weighting scheme doesn't make use of them!)
        int nofFeatures = 0;
        int nofTokens = 0;
        TermEnum terms = indexReader.terms();
        while (terms.next()) {
            Term term = terms.term();
            int df = terms.docFreq();
            nofFeatures++;
            nofTokens += df;
        }
        terms.close();
        System.out.println("\ttotal number of features: " + nofFeatures);
        System.out.println("\ttotal number of tokens: " + nofTokens);

        System.out.println("\tversion: " + indexReader.getVersion());
        System.out.println("\tstill current: " + indexReader.isCurrent());

        //TODO: we don't get segment information!
        //System.out.println( "is optimized:" + segmentInfos.size( ) == 1 && !indexReader.hasDeletions( ) );
        System.out.println("\tmaximal document number: " + indexReader.maxDoc());
        System.out.println("\thas deletions: " + indexReader.hasDeletions());

        if (isSolr) {
            System.out.println("\tSolr version: " + solrSearch.getVersion());
        }

        System.out.println("");
    }

    private static void printFieldInfoPerFieldOption(IndexReader indexReader, IndexReader.FieldOption fieldOption) {
        System.out.println("Fields of type '" + fieldOption + "':");
        Collection<String> fields = indexReader.getFieldNames(fieldOption);
        for (String field : fields) {
            if (field != null && !field.equals("")) {
                // TODO: define data type here!
                System.out.println("\t" + field);
            }
        }
    }

    private static void printFieldInfo(IndexReader indexReader, boolean printHeaders, boolean isSolr,
            SolrIndexSearcher solrSearch) throws IOException {
        if (printHeaders) {
            System.out.println("Field Information:");
            System.out.println("==================");
        }

        // print info per Lucene field type
        printFieldInfoPerFieldOption(indexReader, IndexReader.FieldOption.ALL);
        printFieldInfoPerFieldOption(indexReader, IndexReader.FieldOption.INDEXED);
        printFieldInfoPerFieldOption(indexReader, IndexReader.FieldOption.INDEXED_NO_TERMVECTOR);
        printFieldInfoPerFieldOption(indexReader, IndexReader.FieldOption.INDEXED_WITH_TERMVECTOR);
        printFieldInfoPerFieldOption(indexReader, IndexReader.FieldOption.OMIT_POSITIONS);
        printFieldInfoPerFieldOption(indexReader, IndexReader.FieldOption.OMIT_TERM_FREQ_AND_POSITIONS);
        printFieldInfoPerFieldOption(indexReader, IndexReader.FieldOption.STORES_PAYLOADS);
        printFieldInfoPerFieldOption(indexReader, IndexReader.FieldOption.TERMVECTOR);
        printFieldInfoPerFieldOption(indexReader, IndexReader.FieldOption.TERMVECTOR_WITH_OFFSET);
        printFieldInfoPerFieldOption(indexReader, IndexReader.FieldOption.TERMVECTOR_WITH_POSITION);
        printFieldInfoPerFieldOption(indexReader, IndexReader.FieldOption.TERMVECTOR_WITH_POSITION_OFFSET);
        printFieldInfoPerFieldOption(indexReader, IndexReader.FieldOption.UNINDEXED);

        System.out.println("");
    }

    private static void printTerms(IndexReader indexReader, boolean printHeaders, boolean isSolr,
            SolrIndexSearcher solrSearch, boolean printDocNumbers, boolean printPositions) throws IOException {
        if (printHeaders) {
            System.out.println("Terms:");
            System.out.println("======");
        }
        TermEnum terms = indexReader.terms();
        while (terms.next()) {
            Term term = terms.term();
            // the df is stored in the iterator and not in the term, weird...
            int df = terms.docFreq();
            String field = term.field();
            String text = term.text();
            if (isSolr) {
                IndexSchema schema = solrSearch.getSchema();
                SchemaField schemaField = schema.getField(field);
                FieldType fieldType = schemaField.getType();
                text = fieldType.indexedToReadable(text);
            }
            if (!printDocNumbers && !printPositions) {
                System.out.print(field + "\t" + text + "\t" + df);
            } else {
                System.out.print(field + "\t" + text);
            }

            if (printDocNumbers) {
                TermDocs termDocs = indexReader.termDocs(term);
                boolean first = true;
                while (termDocs.next()) {
                    if (first) {
                        System.out.print("\t" + termDocs.doc());
                        first = false;
                    } else {
                        System.out.print("," + termDocs.doc());
                    }
                }
                termDocs.close();
            } else if (printPositions) {
                TermPositions termPositions = indexReader.termPositions(term);
                boolean first = true;
                while (termPositions.next()) {
                    if (first) {
                        System.out.print("\t" + termPositions.doc());
                        first = false;
                    } else {
                        System.out.print("," + termPositions.doc());
                    }

                    // positions are printed as a bracketed, comma-separated list per document
                    int freq = termPositions.freq();
                    for (int i = 0; i < freq; i++) {
                        int position = termPositions.nextPosition();
                        if (i == 0) {
                            System.out.print("[");
                        }
                        System.out.print(position);
                        if (i < freq - 1) {
                            System.out.print(",");
                        } else {
                            System.out.print("]");
                        }
                    }
                }
                termPositions.close();
            }

            System.out.println("");
        }
        terms.close();
        System.out.println("");
    }

    // help strings collected while the options are registered; printUsage() prints them in this order
    private static List<String> optionHelpStrings = new ArrayList<String>();

    private static Option addHelp(Option option, String helpString) {
        if (option.shortForm() != null) {
            optionHelpStrings.add(" -" + option.shortForm() + "/--" + option.longForm() + ": " + helpString);
        } else {
            optionHelpStrings.add(" --" + option.longForm() + ": " + helpString);
        }
        return option;
    }

    private static void printUsage() {
        System.err.println("Usage: " + programName + " [options] <lucene index dir>\n");
        for (Iterator i = optionHelpStrings.iterator(); i.hasNext();) {
            System.err.println(i.next());
        }
    }

    private static void printVersion() {
        System.out.println("Version " + LuceneAnalyzer.class.getName() + " " + versionString);
    }

    public static void main(String[] args) throws IOException {
        CmdLineParser parser = new CmdLineParser();

        // default options, well-known, should always be around
        Option verbose = addHelp(parser.addBooleanOption('v', "verbose"), "print extra verbosity information");
        Option help = addHelp(parser.addBooleanOption('h', "help"), "print this help message");
        Option version = addHelp(parser.addBooleanOption("version"), "print version information");
        Option globals = addHelp(parser.addBooleanOption('g', "globals"), "print global statistics");
        Option fields = addHelp(parser.addBooleanOption('f', "fields"), "print field information");
        Option terms = addHelp(parser.addBooleanOption('t', "terms"), "print statistics per term");
        Option headers = addHelp(parser.addBooleanOption('H', "headers"), "print headers for sections");
        Option solr = addHelp(parser.addBooleanOption('s', "solr"),
                "treat index as a Solr index, indexDir is the Solr base dir");

        // read the command line options
        try {
            parser.parse(args);
        } catch (OptionException e) {
            System.err.println(e.getMessage());
            printUsage();
            System.exit(1);
        }

        if ((Boolean) parser.getOptionValue(help, Boolean.FALSE)) {
            printUsage();
            System.exit(0);
        }

        if ((Boolean) parser.getOptionValue(version, Boolean.FALSE)) {
            printVersion();
            System.exit(0);
        }

        // verbosity is a level, increased by repeating the flag (e.g. -vvv)
        int verbosity = 0;
        while (parser.getOptionValue(verbose) != null) {
            verbosity++;
        }

        boolean printHeaders = false;
        if ((Boolean) parser.getOptionValue(headers, Boolean.FALSE)) {
            printHeaders = true;
        }

        boolean isSolr = false;
        if ((Boolean) parser.getOptionValue(solr, Boolean.FALSE)) {
            isSolr = true;
        }

        // read command line arguments
        String[] otherArgs = parser.getRemainingArgs();

        if (otherArgs.length != 1) {
            System.err.println("Expected exactly one argument: the Lucene index directory");
            printUsage();
            System.exit(1);
        }

        String basePath = otherArgs[0];
        String indexPath = otherArgs[0];
        if (isSolr) {
            indexPath += "/data/index";
        }
        File indexDir = new File(indexPath);
        if (!indexDir.exists()) {
            System.err.println(indexPath + " doesn't exist");
            System.exit(1);
        }
        if (!indexDir.isDirectory()) {
            System.err.println(indexPath + " is not a directory");
            System.exit(1);
        }

        SolrIndexSearcher solrSearcher = null;
        Directory luceneDirectory = new SimpleFSDirectory(indexDir);
        IndexReader indexReader = IndexReader.open(luceneDirectory);
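        // When analyzing a Solr index, additionally bootstrap an embedded core so that
        // printTerms() can use the schema's field types to turn indexed terms back into readable text.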
        if (isSolr) {
            try {
                Properties p = System.getProperties();
                p.setProperty("solr.solr.home", basePath);

                CoreContainer cores = new CoreContainer(new SolrResourceLoader(basePath));
                SolrConfig solrConfig = new SolrConfig(basePath, SolrConfig.DEFAULT_CONF_FILE, null);
                CoreDescriptor descrCore = new CoreDescriptor(cores, "",
                        solrConfig.getResourceLoader().getInstanceDir());
                IndexSchema solrSchema = new IndexSchema(solrConfig, basePath + "/conf/schema.xml", null);
                SolrCore solrCore = new SolrCore(basePath, solrSchema);
                solrSearcher = new SolrIndexSearcher(solrCore, solrSchema, "test", luceneDirectory, true, false);
            } catch (javax.xml.parsers.ParserConfigurationException e) {
                System.err.println("Illegal Solr configuration: " + e);
                System.exit(1);
            } catch (org.xml.sax.SAXException e) {
                System.err.println("Illegal Solr configuration: " + e);
                System.exit(1);
            }
        }

        if ((Boolean) parser.getOptionValue(globals, Boolean.FALSE)) {
            printGlobalInfo(indexReader, printHeaders, isSolr, solrSearcher);
        }
        if ((Boolean) parser.getOptionValue(fields, Boolean.FALSE)) {
            printFieldInfo(indexReader, printHeaders, isSolr, solrSearcher);
        }
        if ((Boolean) parser.getOptionValue(terms, Boolean.FALSE)) {
            printTerms(indexReader, printHeaders, isSolr, solrSearcher, verbosity == 1, verbosity >= 2);
        }

        indexReader.close();

        System.exit(0);
    }
}
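
Example

The core of the tool is the term enumeration done in printGlobalInfo() and printTerms(). The following stand-alone sketch is not part of the original program; the class name and index path are placeholders. It uses the same Lucene 3.x calls the tool relies on (SimpleFSDirectory, IndexReader.open() and IndexReader.terms()) to count distinct terms and token occurrences in a file system index:

// TermStatsExample.java - illustrative sketch only, assumes a Lucene 3.x index on disk
import java.io.File;
import java.io.IOException;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.SimpleFSDirectory;

public class TermStatsExample {
    public static void main(String[] args) throws IOException {
        Directory dir = new SimpleFSDirectory(new File("/path/to/lucene/index"));
        IndexReader reader = IndexReader.open(dir);
        try {
            int nofFeatures = 0; // number of distinct terms
            int nofTokens = 0;   // sum of document frequencies
            TermEnum terms = reader.terms();
            while (terms.next()) {
                nofFeatures++;
                nofTokens += terms.docFreq();
            }
            terms.close();
            System.out.println("documents: " + reader.numDocs());
            System.out.println("features:  " + nofFeatures);
            System.out.println("tokens:    " + nofTokens);
        } finally {
            reader.close();
            dir.close();
        }
    }
}

The full tool above is typically run against an index directory, with the -g, -f and -t options selecting global, field and per-term output respectively; see printUsage() for the complete option list.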