Example usage for org.apache.lucene.benchmark.byTask.feeds ContentSource getTotalBytesCount

List of usage examples for org.apache.lucene.benchmark.byTask.feeds ContentSource getTotalBytesCount

Introduction

In this page you can find the example usage for org.apache.lucene.benchmark.byTask.feeds ContentSource getTotalBytesCount.

Prototype

public final long getTotalBytesCount() 

Source Link

Document

Returns the total number of bytes that were generated by this source.

Usage

From source file:info.boytsov.lucene.CreateIndex.java

License:Open Source License

public static void main(String[] args) throws Exception {
    if (args.length != 3 && args.length != 4) {
        printUsage();/*from w w  w.  ja  v a 2s. c  o  m*/
        System.exit(1);
    }
    String indexType = args[0];
    String indexSource = args[1];
    int commitInterval = 1000000;

    if (args.length >= 4) {
        commitInterval = Integer.parseInt(args[3]);
    }

    System.out.println("Commiting after indexing " + commitInterval + " docs");

    File outputDir = new File(args[2]);
    if (!outputDir.exists()) {
        if (!outputDir.mkdirs()) {
            System.out.println("couldn't create " + outputDir.getAbsolutePath());
            return;
        }
    }
    if (!outputDir.isDirectory()) {
        System.out.println(outputDir.getAbsolutePath() + " is not a directory!");
        return;
    }
    if (!outputDir.canWrite()) {
        System.out.println("Can't write to " + outputDir.getAbsolutePath());
        return;
    }

    FSDirectory dir = FSDirectory.open(outputDir);

    StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_46);// default
                                                                        // stop
                                                                        // words
    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_46, analyzer);
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);// overwrites
                                                          // if
                                                          // needed
    IndexWriter indexWriter = new IndexWriter(dir, config);

    DocMaker docMaker = new DocMaker();
    Properties properties = new Properties();
    properties.setProperty("content.source.forever", "false"); // will
                                                               // parse
                                                               // each
                                                               // document
                                                               // only
                                                               // once
    properties.setProperty("doc.index.props", "true");
    // We want to store small-size fields like URL or even title  ...
    properties.setProperty("doc.stored", "true");
    // but not the large one (great savings, 3x reduction in space)!
    properties.setProperty("doc.body.stored", "false");

    ContentSource source = CreateSource(indexType, indexSource, properties);

    if (source == null) {
        System.err.println("Failed to create a source: " + indexType + "(" + indexSource + ")");
        printUsage();
        System.exit(1);
    }

    Config c = new Config(properties);
    source.setConfig(c);
    source.resetInputs();// though this does not seem needed, it is
                         // (gets the file opened?)
    docMaker.setConfig(c, source);
    int count = 0;
    System.out.println("Starting Indexing of " + indexType + " source " + indexSource);

    long start = System.currentTimeMillis();
    Document doc;
    try {
        while ((doc = docMaker.makeDocument()) != null) {
            indexWriter.addDocument(doc);
            ++count;
            if (count % 5000 == 0) {
                System.out.println(
                        "Indexed " + count + " documents in " + (System.currentTimeMillis() - start) + " ms");
            }
            if (count % commitInterval == 0) {
                indexWriter.commit();
                System.out.println("Committed");
            }
        }
    } catch (org.apache.lucene.benchmark.byTask.feeds.NoMoreDataException nmd) {
        System.out.println("Caught NoMoreDataException! -- Finishing"); // All done
    }
    long finish = System.currentTimeMillis();
    System.out.println("Indexing " + count + " documents took " + (finish - start) + " ms");
    System.out.println("Total data processed: " + source.getTotalBytesCount() + " bytes");
    System.out.println("Index should be located at " + dir.getDirectory().getAbsolutePath());
    docMaker.close();
    indexWriter.commit();
    indexWriter.close();

}