Example usage for org.apache.lucene.benchmark.byTask.feeds DocMaker DocMaker

List of usage examples for org.apache.lucene.benchmark.byTask.feeds DocMaker DocMaker

Introduction

On this page you can find example usage of the org.apache.lucene.benchmark.byTask.feeds.DocMaker constructor, DocMaker().

Prototype

public DocMaker() 

Source Link

Usage

From source file:info.boytsov.lucene.CreateIndex.java

License:Open Source License

/**
 * Builds a Lucene index from a benchmark content source.
 *
 * <p>Expected arguments:
 * <ol>
 *   <li>{@code args[0]} - index/source type, passed to {@code CreateSource}</li>
 *   <li>{@code args[1]} - source location, passed to {@code CreateSource}</li>
 *   <li>{@code args[2]} - output directory for the index (created if missing)</li>
 *   <li>{@code args[3]} - optional commit interval in documents (default 1000000);
 *       must be a positive integer</li>
 * </ol>
 *
 * <p>On a wrong argument count, an invalid commit interval, or a source that
 * cannot be created, the usage text is printed and the JVM exits with status 1.
 * Problems with the output directory are reported on standard output and the
 * method simply returns.
 *
 * @param args command-line arguments as described above
 * @throws Exception if indexing fails; the writer is rolled back and the
 *                   directory is closed before the exception propagates
 */
public static void main(String[] args) throws Exception {
    if (args.length != 3 && args.length != 4) {
        printUsage();
        System.exit(1);
    }
    String indexType = args[0];
    String indexSource = args[1];
    int commitInterval = 1000000;

    if (args.length == 4) {
        try {
            commitInterval = Integer.parseInt(args[3]);
        } catch (NumberFormatException e) {
            System.err.println("Commit interval is not a valid integer: " + args[3]);
            printUsage();
            System.exit(1);
        }
        // A zero interval would make "count % commitInterval" throw
        // ArithmeticException on the first indexed document.
        if (commitInterval <= 0) {
            System.err.println("Commit interval must be positive: " + args[3]);
            printUsage();
            System.exit(1);
        }
    }

    System.out.println("Commiting after indexing " + commitInterval + " docs");

    File outputDir = new File(args[2]);
    if (!outputDir.exists()) {
        if (!outputDir.mkdirs()) {
            System.out.println("couldn't create " + outputDir.getAbsolutePath());
            return;
        }
    }
    if (!outputDir.isDirectory()) {
        System.out.println(outputDir.getAbsolutePath() + " is not a directory!");
        return;
    }
    if (!outputDir.canWrite()) {
        System.out.println("Can't write to " + outputDir.getAbsolutePath());
        return;
    }

    Properties properties = new Properties();
    // Parse each document only once instead of cycling through the source.
    properties.setProperty("content.source.forever", "false");
    properties.setProperty("doc.index.props", "true");
    // We want to store small-size fields like URL or even title  ...
    properties.setProperty("doc.stored", "true");
    // but not the large one (great savings, 3x reduction in space)!
    properties.setProperty("doc.body.stored", "false");

    // Validate the source before the index is opened for writing, so a bad
    // source argument never exits with an open IndexWriter.
    ContentSource source = CreateSource(indexType, indexSource, properties);
    if (source == null) {
        System.err.println("Failed to create a source: " + indexType + "(" + indexSource + ")");
        printUsage();
        System.exit(1);
    }

    FSDirectory dir = FSDirectory.open(outputDir);
    try {
        // StandardAnalyzer with its default stop words.
        StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_46);
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_46, analyzer);
        // CREATE overwrites an existing index if needed.
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
        IndexWriter indexWriter = new IndexWriter(dir, config);

        boolean succeeded = false;
        try {
            DocMaker docMaker = new DocMaker();
            Config c = new Config(properties);
            source.setConfig(c);
            // Though this does not seem needed, it is (gets the file opened?).
            source.resetInputs();
            docMaker.setConfig(c, source);

            try {
                System.out.println("Starting Indexing of " + indexType + " source " + indexSource);
                long start = System.currentTimeMillis();
                int count = indexAllDocuments(docMaker, indexWriter, commitInterval, start);
                long finish = System.currentTimeMillis();

                System.out.println("Indexing " + count + " documents took " + (finish - start) + " ms");
                System.out.println("Total data processed: " + source.getTotalBytesCount() + " bytes");
                System.out.println("Index should be located at " + dir.getDirectory().getAbsolutePath());
            } finally {
                docMaker.close();
            }

            indexWriter.commit();
            succeeded = true;
        } finally {
            if (succeeded) {
                indexWriter.close();
            } else {
                // On failure keep only what was explicitly committed and
                // make sure the writer is released.
                indexWriter.rollback();
            }
        }
    } finally {
        dir.close();
    }
}

/**
 * Feeds every document produced by {@code docMaker} into {@code indexWriter},
 * reporting progress every 5000 documents and committing every
 * {@code commitInterval} documents.
 *
 * @param docMaker       configured document producer
 * @param indexWriter    open writer receiving the documents
 * @param commitInterval number of documents between commits; must be positive
 * @param start          start timestamp in milliseconds, used for progress output
 * @return the number of documents added to the writer
 * @throws Exception if document creation or indexing fails
 */
private static int indexAllDocuments(DocMaker docMaker, IndexWriter indexWriter, int commitInterval,
        long start) throws Exception {
    int count = 0;
    Document doc;
    try {
        while ((doc = docMaker.makeDocument()) != null) {
            indexWriter.addDocument(doc);
            ++count;
            if (count % 5000 == 0) {
                System.out.println(
                        "Indexed " + count + " documents in " + (System.currentTimeMillis() - start) + " ms");
            }
            if (count % commitInterval == 0) {
                indexWriter.commit();
                System.out.println("Committed");
            }
        }
    } catch (org.apache.lucene.benchmark.byTask.feeds.NoMoreDataException nmd) {
        // The source signals exhaustion with this exception; treat it as normal completion.
        System.out.println("Caught NoMoreDataException! -- Finishing"); // All done
    }
    return count;
}