// Java tutorial
/* * LuceneAnalyzer - Lucene Index Analyzer * * Copyright (C) 2006 Andreas Baumann * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ package org.dyndns.andreasbaumann; import java.io.File; import java.io.IOException; import java.util.Collection; import java.util.Iterator; import java.util.ArrayList; import java.util.List; import java.util.Properties; import jargs.gnu.CmdLineParser; import jargs.gnu.CmdLineParser.Option; import jargs.gnu.CmdLineParser.OptionException; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.store.SimpleFSDirectory; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexReader.FieldOption; import org.apache.lucene.index.TermEnum; import org.apache.lucene.index.Term; import org.apache.lucene.index.TermDocs; import org.apache.lucene.index.TermPositions; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import java.util.logging.LogManager; import java.util.logging.Logger; import org.apache.solr.core.CoreContainer; import org.apache.solr.core.CoreDescriptor; import org.apache.solr.core.SolrResourceLoader; import org.apache.solr.core.SolrConfig; import org.apache.solr.core.SolrCore; import org.apache.solr.schema.IndexSchema; import org.apache.solr.search.SolrIndexSearcher; import 
javax.xml.parsers.ParserConfigurationException; import org.xml.sax.SAXException; import org.apache.solr.schema.SchemaField; import org.apache.solr.schema.FieldType; /** * Lucene index analyzer. Works for file system indexes only (not * for indexes fully in RAM or in different persistence systems as * a JDBCDirectory. * * Note: requires at least lucene 3.0! * * @author Andreas Baumann, <abaumann@yahoo.com> * @version $Id$ */ public class LuceneAnalyzer { private static final String programName = "lucenanalyzer"; private static final String versionString = "0.0.4"; static { LogManager.getLogManager().reset(); Logger globalLogger = Logger.getLogger("" /* java.util.logging.Logger.GLOBAL_LOGGER_NAME */ ); globalLogger.setLevel(java.util.logging.Level.OFF); } private static void printGlobalInfo(IndexReader indexReader, boolean printHeaders, boolean isSolr, SolrIndexSearcher solrSearch) throws IOException { if (printHeaders) { System.out.println("Global Information:"); System.out.println("==================="); } System.out.println("\tnumber of documents: " + indexReader.numDocs()); // we should get the number of features differently, this is inefficient, but Lucene // has no notion of global statistics (because the default weighting schema doesn't // make use of it!) int nofFeatures = 0; int nofTokens = 0; TermEnum terms = indexReader.terms(); while (terms.next()) { Term term = terms.term(); int df = terms.docFreq(); nofFeatures++; nofTokens += df; } System.out.println("\ttotal number of features: " + nofFeatures); System.out.println("\ttotal number of tokens: " + nofTokens); System.out.println("\tversion: " + indexReader.getVersion()); System.out.println("\tstill current: " + indexReader.isCurrent()); //TODO: we don't get segment information! 
//System.out.println( "is optimized:" + segmentInfos.size( ) == 1 && !indexReader.hasDeletions( ) ); System.out.println("\tmaximal document number: " + indexReader.maxDoc()); System.out.println("\thas deletions: " + indexReader.hasDeletions()); if (isSolr) { System.out.println("\tSolr version: " + solrSearch.getVersion()); } System.out.println(""); } private static void printFieldInfoPerFieldOption(IndexReader indexReader, IndexReader.FieldOption fieldOption) { System.out.println("Fields of type '" + fieldOption + "':"); Collection fields = indexReader.getFieldNames(fieldOption); Iterator fieldIterator = fields.iterator(); while (fieldIterator.hasNext()) { String field = (String) fieldIterator.next(); if (field != null && !field.equals("")) { // TODO: define data type here! System.out.println("\t" + field.toString()); } } } private static void printFieldInfo(IndexReader indexReader, boolean printHeaders, boolean isSolr, SolrIndexSearcher solrSearch) throws IOException { if (printHeaders) { System.out.println("Field Information:"); System.out.println("=================="); } // print info per Lucene field type printFieldInfoPerFieldOption(indexReader, IndexReader.FieldOption.ALL); printFieldInfoPerFieldOption(indexReader, IndexReader.FieldOption.INDEXED); printFieldInfoPerFieldOption(indexReader, IndexReader.FieldOption.INDEXED_NO_TERMVECTOR); printFieldInfoPerFieldOption(indexReader, IndexReader.FieldOption.INDEXED_WITH_TERMVECTOR); printFieldInfoPerFieldOption(indexReader, IndexReader.FieldOption.OMIT_POSITIONS); printFieldInfoPerFieldOption(indexReader, IndexReader.FieldOption.OMIT_TERM_FREQ_AND_POSITIONS); printFieldInfoPerFieldOption(indexReader, IndexReader.FieldOption.STORES_PAYLOADS); printFieldInfoPerFieldOption(indexReader, IndexReader.FieldOption.TERMVECTOR); printFieldInfoPerFieldOption(indexReader, IndexReader.FieldOption.TERMVECTOR_WITH_OFFSET); printFieldInfoPerFieldOption(indexReader, IndexReader.FieldOption.TERMVECTOR_WITH_POSITION); 
printFieldInfoPerFieldOption(indexReader, IndexReader.FieldOption.TERMVECTOR_WITH_POSITION_OFFSET); printFieldInfoPerFieldOption(indexReader, IndexReader.FieldOption.UNINDEXED); System.out.println(""); } private static void printTerms(IndexReader indexReader, boolean printHeaders, boolean isSolr, SolrIndexSearcher solrSearch, boolean printDocNumbers, boolean printPositions) throws IOException { if (printHeaders) { System.out.println("Terms:"); System.out.println("======"); } TermEnum terms = indexReader.terms(); while (terms.next()) { Term term = terms.term(); // the df is stored in the iterator and not in the term, weird... int df = terms.docFreq(); String field = term.field(); String text = term.text(); if (isSolr) { IndexSchema schema = solrSearch.getSchema(); SchemaField schemaField = schema.getField(field); FieldType fieldType = schemaField.getType(); text = fieldType.indexedToReadable(text); } if (!printDocNumbers && !printPositions) { System.out.print(field + "\t" + text + "\t" + df); } else { System.out.print(field + "\t" + text); } if (printDocNumbers) { TermDocs termDocs = indexReader.termDocs(term); boolean first = true; while (termDocs.next()) { if (first) { System.out.print("\t" + termDocs.doc()); first = false; } else { System.out.print("," + termDocs.doc()); } } termDocs.close(); } else if (printPositions) { TermPositions termPositions = indexReader.termPositions(term); boolean first = true; while (termPositions.next()) { if (first) { System.out.print("\t" + termPositions.doc()); first = false; } else { System.out.print("," + termPositions.doc()); } for (int i = 0; i < termPositions.freq(); i++) { int position = termPositions.nextPosition(); if (i == 0) { System.out.print("["); } System.out.print(position); if (i < termPositions.freq() - 1) { System.out.print(","); } if (i == termPositions.freq() - 1) { System.out.print("]"); } } } termPositions.close(); } System.out.println(""); } System.out.println(""); } private static List optionHelpStrings = new 
ArrayList(); private static Option addHelp(Option option, String helpString) { if (option.shortForm() != null) { optionHelpStrings.add(" -" + option.shortForm() + "/--" + option.longForm() + ": " + helpString); } else { optionHelpStrings.add(" --" + option.longForm() + ": " + helpString); } return option; } private static void printUsage() { System.err.println("Usage: " + programName + " [options] <lucene index dir>\n"); for (Iterator i = optionHelpStrings.iterator(); i.hasNext();) { System.err.println(i.next()); } } private static void printVersion() { System.out.println("Version " + LuceneAnalyzer.class.getName() + " " + versionString); } public static void main(String[] args) throws IOException { CmdLineParser parser = new CmdLineParser(); // default options, well-known, should always be around Option verbose = addHelp(parser.addBooleanOption('v', "verbose"), "print extra verbosity information"); Option help = addHelp(parser.addBooleanOption('h', "help"), "print this help message"); Option version = addHelp(parser.addBooleanOption("version"), "print version information"); Option globals = addHelp(parser.addBooleanOption('g', "globals"), "print global statistics"); Option fields = addHelp(parser.addBooleanOption('f', "fields"), "print field information"); Option terms = addHelp(parser.addBooleanOption('t', "terms"), "print statistics per term"); Option headers = addHelp(parser.addBooleanOption('H', "headers"), "print headers for sections"); Option solr = addHelp(parser.addBooleanOption('s', "solr"), "treat index as a Solr index, indexDir is the Solr base dir"); // read the command line options try { parser.parse(args); } catch (OptionException e) { System.err.println(e.getMessage()); printUsage(); System.exit(1); } if ((Boolean) parser.getOptionValue(help, Boolean.FALSE)) { printUsage(); System.exit(0); } if ((Boolean) parser.getOptionValue(version, Boolean.FALSE)) { printVersion(); System.exit(0); } // verbosity as a level, increased with -vvv int verbosity = 0; 
while (true) { Boolean verboseValue = (Boolean) parser.getOptionValue(verbose); if (verboseValue == null) { break; } else { verbosity++; } } boolean printHeaders = false; if ((Boolean) parser.getOptionValue(headers, Boolean.FALSE)) { printHeaders = true; } boolean isSolr = false; if ((Boolean) parser.getOptionValue(solr, Boolean.FALSE)) { isSolr = true; } // read command line arguments String[] otherArgs = parser.getRemainingArgs(); if (otherArgs.length != 1) { System.err.println("Missing a lucene index directory as first argument"); printUsage(); System.exit(1); } String basePath = otherArgs[0]; String indexPath = otherArgs[0]; if (isSolr) { indexPath += "/data/index"; } File indexDir = new File(indexPath); if (!indexDir.exists()) { System.err.println(indexPath + " doesn't exist"); System.exit(1); } if (!indexDir.isDirectory()) { System.err.println(indexPath + " is not a directory"); System.exit(1); } SolrIndexSearcher solrSearcher = null; Directory luceneDirectory = new SimpleFSDirectory(indexDir); IndexReader indexReader = IndexReader.open(luceneDirectory); if (isSolr) { try { Properties p = System.getProperties(); p.setProperty("solr.solr.home", basePath); CoreContainer cores = new CoreContainer(new SolrResourceLoader(basePath)); SolrConfig solrConfig = new SolrConfig(basePath, SolrConfig.DEFAULT_CONF_FILE, null); CoreDescriptor descrCore = new CoreDescriptor(cores, "", solrConfig.getResourceLoader().getInstanceDir()); IndexSchema solrSchema = new IndexSchema(solrConfig, basePath + "/conf/schema.xml", null); SolrCore solrCore = new SolrCore(basePath, solrSchema); solrSearcher = new SolrIndexSearcher(solrCore, solrSchema, "test", luceneDirectory, true, false); } catch (javax.xml.parsers.ParserConfigurationException e) { System.err.println("Illegal Solr configuration: " + e); System.exit(1); } catch (org.xml.sax.SAXException e) { System.err.println("Illegal Solr configuration: " + e); System.exit(1); } } if ((Boolean) parser.getOptionValue(globals, 
Boolean.FALSE)) { printGlobalInfo(indexReader, printHeaders, isSolr, solrSearcher); } if ((Boolean) parser.getOptionValue(fields, Boolean.FALSE)) { printFieldInfo(indexReader, printHeaders, isSolr, solrSearcher); } if ((Boolean) parser.getOptionValue(terms, Boolean.FALSE)) { printTerms(indexReader, printHeaders, isSolr, solrSearcher, verbosity == 1, verbosity >= 2); } indexReader.close(); System.exit(0); } }