Example usage for org.apache.lucene.index IndexWriterConfig setRAMBufferSizeMB

List of usage examples for org.apache.lucene.index IndexWriterConfig setRAMBufferSizeMB

Introduction

On this page you can find example usages of org.apache.lucene.index IndexWriterConfig setRAMBufferSizeMB.

Prototype

@Override
    public IndexWriterConfig setRAMBufferSizeMB(double ramBufferSizeMB) 

Source Link

Usage

From source file:edu.virginia.cs.index.UserIndexer.java

/**
 * Creates the initial index files on disk
 *
 * @param indexPath/* w  w  w  .j a  v  a  2 s  .  c o  m*/
 * @return
 * @throws IOException
 */
private static IndexWriter setupIndex(String indexPath) throws IOException {
    Analyzer analyzer = new SpecialAnalyzer();
    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_46, analyzer);
    config.setOpenMode(OpenMode.CREATE);
    config.setRAMBufferSizeMB(2048.0);

    FSDirectory dir = FSDirectory.open(new File(indexPath));
    IndexWriter writer = new IndexWriter(dir, config);

    return writer;
}

From source file:eu.eexcess.sourceselection.redde.indexer.BinaryIndexResource.java

License:Apache License

/**
 * Opens the sample index at {@code outIndexPath} for writing and stores the
 * resulting writer in {@code outIndexWriter}. The writer is configured with
 * {@code OpenMode.CREATE}.
 *
 * @param ramBufferSizeMB
 *            RAM buffer size, in MB, handed to the writer configuration
 * @throws IOException
 *             if the index cannot be opened or created; the failure is
 *             logged at SEVERE level before being rethrown
 */
void openOutIndex(double ramBufferSizeMB) throws IOException {
    try {
        final Directory target = FSDirectory.open(new File(outIndexPath));

        final IndexWriterConfig settings = new IndexWriterConfig(luceneVersion, new EnglishAnalyzer());
        settings.setOpenMode(OpenMode.CREATE);
        settings.setRAMBufferSizeMB(ramBufferSizeMB);

        outIndexWriter = new IndexWriter(target, settings);
    } catch (IOException failure) {
        logger.log(Level.SEVERE, "unable to open/create index at [" + outIndexPath + "]", failure);
        throw failure;
    }
}

From source file:eu.eexcess.sourceselection.redde.indexer.TrecToLuceneIndexBuilder.java

License:Apache License

/**
 * Builds the Lucene index at {@code indexPath} from the TREC documents found
 * under {@code documentsPath}. The writer is opened with
 * {@code OpenMode.CREATE}.
 *
 * <p>If the documents directory does not exist or is not readable, the
 * problem is logged and nothing is indexed. I/O failures during indexing are
 * logged rather than propagated. When indexing does not complete, the writer
 * is rolled back so that it is closed (releasing the index write lock)
 * without committing a partial index.
 */
public void index() {
    Date startTimestamp = new Date();
    final File documentsDirectory = new File(documentsPath);

    if (!documentsDirectory.exists() || !documentsDirectory.canRead()) {
        logger.severe("cannot access document directory [" + documentsDirectory.getAbsolutePath() + "]");
        return;
    }

    try {
        logger.info("processing directory [" + documentsPath + "] to index [" + indexPath + "]");

        Directory indexDirectory = FSDirectory.open(new File(indexPath));
        Analyzer analyzer = new EnglishAnalyzer();
        IndexWriterConfig writerConfig = new IndexWriterConfig(luceneVersion, analyzer);

        writerConfig.setOpenMode(OpenMode.CREATE);
        writerConfig.setRAMBufferSizeMB(ramBufferSize);

        IndexWriter indexWriter = new IndexWriter(indexDirectory, writerConfig);
        boolean finished = false;
        try {
            indexDocs(indexWriter, documentsDirectory);

            indexWriter.commit();
            indexWriter.close();
            finished = true;
        } finally {
            if (!finished) {
                // Indexing, commit or close failed: never leave the writer
                // open, and discard whatever was not committed.
                try {
                    indexWriter.rollback();
                } catch (IOException rollbackFailure) {
                    logger.log(Level.SEVERE, "failed to roll back index writer", rollbackFailure);
                }
            }
        }

        Date stopTimestamp = new Date();
        logger.info("processed [" + dirsCount + "] dirs [" + filesCount + "] files [" + documentsTotal
                + "] documents [" + filesSkipped + "] files skipped in ["
                + (stopTimestamp.getTime() - startTimestamp.getTime()) + "] ms]");

    } catch (IOException e) {
        logger.log(Level.SEVERE, "failed indexing documents", e);
    }
}

From source file:indexer.LuceneIndexer.java

/**
 * Indexes the files under {@code global.dataDir} into the Lucene index at
 * {@code global.indexDir} and prints a timing summary when done.
 *
 * <p>If the data directory is not configured or is not readable, a message
 * is printed and the method returns without touching the index.
 *
 * @param global supplies the configuration read here: {@code dataDir},
 *        {@code indexDir}, {@code RAM_BUFFER_SIZE}, {@code MAX_BUFFERED_DOCS},
 *        {@code MERGE_FACTOR}, the {@code merge} flag (reset to false after a
 *        forced merge) and the {@code sdf} date formatter
 * @param createIndex if true the writer is opened with {@code OpenMode.CREATE};
 *        otherwise with {@code OpenMode.CREATE_OR_APPEND}
 */
public static void IndexFiles(Global global, Boolean createIndex) {
    String dataDir = global.dataDir;
    String indexDir = global.indexDir;

    // Without a data directory there is nothing to index; Paths.get(null)
    // below would throw a NullPointerException.
    if (dataDir == null) {
        System.err.println("Data Directory Is not accessable, Unable to Index files.");
        return;
    }

    // Stop early when the data directory cannot be read.
    final Path docDir = Paths.get(dataDir);
    if (!Files.isReadable(docDir)) {
        System.out.println("Document directory '" + docDir.toAbsolutePath()
                + "' does not exist or is not readable, please check the path");
        return;
    }

    startTime = new Date();
    System.out.println("Indexing to directory '" + indexDir + "'...");

    // Directory and analyzer are closed automatically, also on failure.
    try (Directory dir = FSDirectory.open(Paths.get(indexDir));
            Analyzer analyzer = new StandardAnalyzer()) {

        IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
        if (createIndex) {
            // Create a new index in the directory, removing any
            // previously indexed documents:
            iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
        } else {
            // Add new documents to an existing index:
            iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
        }
        iwc.setRAMBufferSizeMB(global.RAM_BUFFER_SIZE);
        iwc.setMaxBufferedDocs(global.MAX_BUFFERED_DOCS);

        LogDocMergePolicy ldmp = new LogDocMergePolicy();
        ldmp.setMergeFactor(global.MERGE_FACTOR);
        iwc.setMergePolicy(ldmp);

        try (IndexWriter writer = new IndexWriter(dir, iwc)) {
            hm.clear();
            indexDocs(writer, docDir, global);

            // Merging down to one segment is costly, so it only runs when
            // it has been requested through global.merge.
            if (global.merge) {
                System.out.println("Starting Merge");
                writer.forceMerge(1);
                global.merge = false;
            }
        }

        finishTime = new Date();
        long millis = finishTime.getTime() - startTime.getTime();
        // Hours, then the minutes and seconds left over after removing the
        // larger units.
        totalTime = String.format("%02dhr %02dmin %02dsec", TimeUnit.MILLISECONDS.toHours(millis),
                TimeUnit.MILLISECONDS.toMinutes(millis)
                        - TimeUnit.HOURS.toMinutes(TimeUnit.MILLISECONDS.toHours(millis)),
                TimeUnit.MILLISECONDS.toSeconds(millis)
                        - TimeUnit.MINUTES.toSeconds(TimeUnit.MILLISECONDS.toMinutes(millis)));
        System.out.println("");
        System.out.println("");
        System.out.println("Start Time:          " + global.sdf.format(startTime.getTime()));
        System.out.println("Building List Time:  " + listBuildTime);
        System.out.println("Indexing Time:       " + indexingTime);
        System.out.println("Total Time:          " + totalTime);
        System.out.println("Number of Documents: " + amountOfDocuments);
        System.out.println("Finish Time:         " + global.sdf.format(finishTime.getTime()));
        System.out.println("");
    } catch (IOException e) {
        System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage());
        log.fatal(" caught a " + e.getClass() + "\n with message: " + e.getMessage());
    }
}

From source file:intelligentWebAlgorithms.algos.search.lucene.LuceneIndexBuilder.java

License:Apache License

/**
 * Opens an {@link IndexWriter} on the given location using
 * {@code OpenMode.CREATE_OR_APPEND}, a Lucene 4.4 {@code StandardAnalyzer}
 * and the RAM buffer size held in {@code RamBufferSizeMB}.
 *
 * @param file location of the index directory
 * @return a newly opened writer
 * @throws IOException if the directory or the writer cannot be opened
 */
private IndexWriter getIndexWriter(File file) throws IOException {
    final FSDirectory indexDir = FSDirectory.open(file);

    final StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_44);
    final IndexWriterConfig writerConfig = new IndexWriterConfig(Version.LUCENE_44, analyzer);
    writerConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);
    writerConfig.setRAMBufferSizeMB(RamBufferSizeMB);

    final IndexWriter writer = new IndexWriter(indexDir, writerConfig);
    return writer;
}

From source file:io.anserini.index.IndexClueWeb09b.java

License:Apache License

/**
 * Indexes the WARC files discovered under {@code docDir} with a fixed pool
 * of worker threads, then commits (and optionally force-merges) the index.
 *
 * <p>When {@code doclimit} is positive and more files than that were
 * discovered, only the first {@code doclimit} files are indexed.
 *
 * @param numThreads number of worker threads in the pool
 * @return the value of {@code writer.maxDoc()} read after the workers have
 *         stopped and before the final commit
 * @throws IOException if opening, committing, merging or closing the index fails
 * @throws InterruptedException kept in the signature for existing callers;
 *         an interrupt while waiting is handled by cancelling the pool and
 *         restoring the interrupt flag
 */
public int indexWithThreads(int numThreads) throws IOException, InterruptedException {

    System.out.println(
            "Indexing with " + numThreads + " threads to directory '" + indexPath.toAbsolutePath() + "'...");

    final Directory dir = FSDirectory.open(indexPath);

    final IndexWriterConfig iwc = new IndexWriterConfig(analyzer());

    iwc.setSimilarity(new BM25Similarity());
    iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    iwc.setRAMBufferSizeMB(256.0);
    iwc.setUseCompoundFile(false);
    iwc.setMergeScheduler(new ConcurrentMergeScheduler());

    final IndexWriter writer = new IndexWriter(dir, iwc);

    final ExecutorService executor = Executors.newFixedThreadPool(numThreads);

    List<Path> warcFiles = discoverWarcFiles(docDir);
    // Truncate only when there are MORE files than the limit; subList with
    // an end index beyond the list size throws IndexOutOfBoundsException.
    if (doclimit > 0 && warcFiles.size() > doclimit) {
        warcFiles = warcFiles.subList(0, doclimit);
    }

    for (Path f : warcFiles) {
        executor.execute(new IndexerThread(writer, f));
    }

    // shutdown() only rejects new submissions; tasks that are already
    // queued still run, so no delay is needed before calling it.
    executor.shutdown();

    try {
        // Wait for existing tasks to terminate
        while (!executor.awaitTermination(5, TimeUnit.MINUTES)) {
            Thread.sleep(1000);
        }
    } catch (InterruptedException ie) {
        // (Re-)Cancel if current thread also interrupted
        executor.shutdownNow();
        // Preserve interrupt status
        Thread.currentThread().interrupt();
    }

    int numIndexed = writer.maxDoc();

    try {
        writer.commit();
        if (optimize) {
            writer.forceMerge(1);
        }
    } finally {
        writer.close();
    }

    return numIndexed;
}

From source file:io.anserini.index.IndexCollection.java

License:Apache License

/**
 * Runs the indexing job: opens the index at {@code indexPath}, hands every
 * file segment of the collection to a fixed-size thread pool, waits for the
 * pool to drain while logging progress once a minute, then commits
 * (optionally force-merging) and logs the counters and the elapsed time.
 *
 * @throws IOException if opening, committing or merging the index fails
 * @throws InterruptedException declared by the method signature
 * @throws RuntimeException if the number of completed tasks differs from
 *         the number of file segments
 */
public void run() throws IOException, InterruptedException {
    final long start = System.nanoTime();
    LOG.info("Starting indexer...");

    int numThreads = args.threads;

    final Directory dir = FSDirectory.open(indexPath);

    // An empty stop-word set keeps every term when stop words are to be kept.
    final EnglishAnalyzer analyzer;
    if (args.keepStopwords) {
        analyzer = new EnglishAnalyzer(CharArraySet.EMPTY_SET);
    } else {
        analyzer = new EnglishAnalyzer();
    }

    final IndexWriterConfig config = new IndexWriterConfig(analyzer);
    config.setSimilarity(new BM25Similarity());
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    config.setRAMBufferSizeMB(args.memorybufferSize);
    config.setUseCompoundFile(false);
    config.setMergeScheduler(new ConcurrentMergeScheduler());

    final IndexWriter writer = new IndexWriter(dir, config);

    final ThreadPoolExecutor executor = (ThreadPoolExecutor) Executors.newFixedThreadPool(numThreads);
    final List<Path> segmentPaths = collection.getFileSegmentPaths();

    final int segmentCnt = segmentPaths.size();
    LOG.info(segmentCnt + " files found in " + collectionPath.toString());

    // One task per file segment, submitted in list order.
    for (Path segment : segmentPaths) {
        executor.execute(new IndexerThread(writer, collection, segment));
    }

    executor.shutdown();

    try {
        // Report progress every minute until all submitted tasks are done.
        boolean drained = executor.awaitTermination(1, TimeUnit.MINUTES);
        while (!drained) {
            final double fraction = (double) executor.getCompletedTaskCount() / segmentCnt;
            LOG.info(String.format("%.2f percent completed", fraction * 100.0d));
            drained = executor.awaitTermination(1, TimeUnit.MINUTES);
        }
    } catch (InterruptedException ie) {
        // Cancel outstanding work and keep the interrupt flag set.
        executor.shutdownNow();
        Thread.currentThread().interrupt();
    }

    if (segmentCnt != executor.getCompletedTaskCount()) {
        throw new RuntimeException("totalFiles = " + segmentCnt + " is not equal to completedTaskCount =  "
                + executor.getCompletedTaskCount());
    }

    int numIndexed = writer.maxDoc();

    try {
        writer.commit();
        if (args.optimize) {
            writer.forceMerge(1);
        }
    } finally {
        try {
            writer.close();
        } catch (IOException e) {
            // Nothing useful can be done about a failed close at this
            // point, so the error is only logged.
            LOG.error(e);
        }
    }

    LOG.info("Indexed documents: " + counters.indexedDocuments.get());
    LOG.info("Empty documents: " + counters.emptyDocuments.get());
    LOG.info("Errors: " + counters.errors.get());

    final long durationMillis = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - start);
    LOG.info("Total " + numIndexed + " documents indexed in "
            + DurationFormatUtils.formatDuration(durationMillis, "HH:mm:ss"));
}

From source file:io.anserini.index.IndexWebCollection.java

License:Apache License

/**
 * Indexes the WARC files discovered under {@code docDir} with a fixed pool
 * of worker threads, then commits (and optionally force-merges) the index.
 *
 * <p>At most 2000 tasks are submitted up front; further files are submitted
 * as earlier tasks complete. Once every file has been submitted the pool is
 * shut down so that the wait loop can end.
 *
 * <p>When {@code doclimit} is positive and more files than that were
 * discovered, files are removed from the front of the queue until only
 * {@code doclimit} remain.
 *
 * @param numThreads number of worker threads in the pool
 * @return the value of {@code writer.maxDoc()} read after the workers have
 *         stopped and before the final commit
 * @throws IOException if opening, committing, merging or closing the index fails
 * @throws InterruptedException kept in the signature for existing callers;
 *         an interrupt while waiting is handled by cancelling the pool and
 *         restoring the interrupt flag
 * @throws RuntimeException if the number of completed tasks differs from
 *         the number of files to index
 */
public int indexWithThreads(int numThreads) throws IOException, InterruptedException {

    LOG.info("Indexing with " + numThreads + " threads to directory '" + indexPath.toAbsolutePath() + "'...");

    final Directory dir = FSDirectory.open(indexPath);

    final IndexWriterConfig iwc = new IndexWriterConfig(new EnglishAnalyzer());

    iwc.setSimilarity(new BM25Similarity());
    iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    iwc.setRAMBufferSizeMB(512);
    iwc.setUseCompoundFile(false);
    iwc.setMergeScheduler(new ConcurrentMergeScheduler());

    final IndexWriter writer = new IndexWriter(dir, iwc);

    final ThreadPoolExecutor executor = (ThreadPoolExecutor) Executors.newFixedThreadPool(numThreads);
    final String suffix = Collection.GOV2.equals(collection) ? ".gz" : ".warc.gz";
    final Deque<Path> warcFiles = discoverWarcFiles(docDir, suffix);

    // Trim the queue down to doclimit entries. The size is re-read on every
    // pass because each removal shrinks the deque.
    if (doclimit > 0) {
        while (warcFiles.size() > doclimit) {
            warcFiles.removeFirst();
        }
    }

    long totalWarcFiles = warcFiles.size();
    LOG.info(totalWarcFiles + " many " + suffix + " files found under the docs path : " + docDir.toString());

    // Seed the pool with at most 2000 tasks.
    for (int i = 0; i < 2000 && !warcFiles.isEmpty(); i++) {
        executor.execute(new IndexerThread(writer, warcFiles.removeFirst()));
    }
    // Everything submitted already (including the case of exactly 2000
    // files): shut down now. shutdown() still runs the queued tasks.
    if (warcFiles.isEmpty()) {
        executor.shutdown();
    }

    long first = 0;

    try {
        // Wait for existing tasks to terminate
        while (!executor.awaitTermination(1, TimeUnit.MINUTES)) {

            final long completedTaskCount = executor.getCompletedTaskCount();

            LOG.info(String.format("%.2f percentage completed",
                    (double) completedTaskCount / totalWarcFiles * 100.0d));

            // Submit one new file for every task that finished since the
            // previous pass.
            for (long i = first; i < completedTaskCount && !warcFiles.isEmpty(); i++) {
                executor.execute(new IndexerThread(writer, warcFiles.removeFirst()));
            }

            // Shut down as soon as the queue is drained, even when it ran
            // empty exactly on the last submission above; otherwise
            // awaitTermination could never report termination.
            if (warcFiles.isEmpty() && !executor.isShutdown()) {
                executor.shutdown();
            }

            first = completedTaskCount;
            Thread.sleep(1000);
        }
    } catch (InterruptedException ie) {
        // (Re-)Cancel if current thread also interrupted
        executor.shutdownNow();
        // Preserve interrupt status
        Thread.currentThread().interrupt();
    }

    if (totalWarcFiles != executor.getCompletedTaskCount()) {
        throw new RuntimeException("totalWarcFiles = " + totalWarcFiles
                + " is not equal to completedTaskCount =  " + executor.getCompletedTaskCount());
    }

    int numIndexed = writer.maxDoc();

    try {
        writer.commit();
        if (optimize) {
            writer.forceMerge(1);
        }
    } finally {
        writer.close();
    }

    return numIndexed;
}

From source file:io.anserini.IndexerCW09B.java

License:Apache License

/**
 * Indexes every WARC file discovered under {@code docDir} with a fixed pool
 * of worker threads and commits the result.
 *
 * @param numThreads number of worker threads in the pool
 * @return the value of {@code writer.maxDoc()} read after the workers have
 *         stopped and before the final commit
 * @throws IOException if opening, committing or closing the index fails
 * @throws InterruptedException if the thread is interrupted during the
 *         initial delay before the pool is shut down
 */
public int indexWithThreads(int numThreads) throws IOException, InterruptedException {

    final String banner = "Indexing with " + numThreads + " threads to directory '"
            + indexPath.toAbsolutePath() + "'...";
    System.out.println(banner);

    final Directory indexDirectory = FSDirectory.open(indexPath);

    final IndexWriterConfig config = new IndexWriterConfig(analyzer());
    config.setSimilarity(new BM25Similarity());
    config.setIndexDeletionPolicy(NoDeletionPolicy.INSTANCE);
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    config.setRAMBufferSizeMB(256.0);
    config.setUseCompoundFile(false);
    config.setMergeScheduler(new ConcurrentMergeScheduler());

    final IndexWriter writer = new IndexWriter(indexDirectory, config);

    final ExecutorService pool = Executors.newFixedThreadPool(numThreads);

    // One indexing task per discovered file.
    for (Path warcFile : discoverWarcFiles(docDir)) {
        pool.execute(new IndexerThread(writer, warcFile));
    }

    // Pause before refusing further submissions.
    Thread.sleep(30000);
    pool.shutdown();

    try {
        // Poll in five-minute slices until every task has finished.
        boolean finished = pool.awaitTermination(5, TimeUnit.MINUTES);
        while (!finished) {
            Thread.sleep(1000);
            finished = pool.awaitTermination(5, TimeUnit.MINUTES);
        }
    } catch (InterruptedException ie) {
        // Cancel outstanding work and keep the interrupt flag set.
        pool.shutdownNow();
        Thread.currentThread().interrupt();
    }

    final int numIndexed = writer.maxDoc();

    try {
        writer.commit();
    } finally {
        writer.close();
    }

    return numIndexed;
}

From source file:io.druid.extension.lucene.LuceneDruidSegment.java

License:Apache License

/**
 * Creates an {@link IndexWriter} on the given RAM directory with merging
 * and commit deletion switched off.
 *
 * @param dir in-memory directory the writer writes to
 * @param analyzer analyzer used by the writer
 * @param maxDocsPerSegment segment size hint; the buffered-document limit is
 *        set to twice this value
 * @return the newly opened writer
 * @throws IOException if the writer cannot be opened
 */
private static IndexWriter buildRamWriter(RAMDirectory dir, Analyzer analyzer, int maxDocsPerSegment)
        throws IOException {
    // Buffer limits are arbitrary large numbers.
    final IndexWriterConfig config = new IndexWriterConfig(analyzer)
            .setOpenMode(OpenMode.CREATE_OR_APPEND)
            .setMaxBufferedDocs(maxDocsPerSegment * 2)
            .setRAMBufferSizeMB(5000)
            .setUseCompoundFile(false)
            .setCommitOnClose(true)
            .setIndexDeletionPolicy(NoDeletionPolicy.INSTANCE)
            .setMergePolicy(NoMergePolicy.INSTANCE)
            .setMergeScheduler(NoMergeScheduler.INSTANCE);
    return new IndexWriter(dir, config);
}