List of usage examples for org.apache.lucene.index.IndexWriter.addIndexes
public long addIndexes(CodecReader... readers) throws IOException
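Note that IndexWriter also exposes an overload taking whole directories, public long addIndexes(Directory... dirs) throws IOException, and most of the examples below use that form. As a quick orientation before the full examples, here is a minimal, self-contained sketch of both overloads; it assumes a recent Lucene release (8.x/9.x), and the class name and index paths are illustrative only.

import java.io.IOException;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.CodecReader;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.SlowCodecReaderWrapper;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class AddIndexesSketch {

    public static void main(String[] args) throws IOException {
        // Build a small source index in memory.
        Directory source = new ByteBuffersDirectory();
        try (IndexWriter sourceWriter = new IndexWriter(source, new IndexWriterConfig(new StandardAnalyzer()))) {
            Document doc = new Document();
            doc.add(new TextField("text", "some example text", Field.Store.YES));
            sourceWriter.addDocument(doc);
        }

        // Overload 1: addIndexes(Directory...) copies segment files verbatim into the target.
        // The source directories must not be open in another IndexWriter while this runs.
        Directory target1 = FSDirectory.open(Paths.get("merged-by-directory"));
        try (IndexWriter writer = new IndexWriter(target1, new IndexWriterConfig(new StandardAnalyzer()))) {
            writer.addIndexes(source);
            writer.commit();
        }

        // Overload 2: addIndexes(CodecReader...) re-imports documents through readers, which
        // allows wrapping or filtering the source (see the IndexerTest and FieldRemover
        // examples below).
        Directory target2 = FSDirectory.open(Paths.get("merged-by-reader"));
        try (DirectoryReader reader = DirectoryReader.open(source);
                IndexWriter writer = new IndexWriter(target2, new IndexWriterConfig(new StandardAnalyzer()))) {
            List<CodecReader> codecReaders = new ArrayList<>();
            for (LeafReaderContext ctx : reader.leaves()) {
                codecReaders.add(SlowCodecReaderWrapper.wrap(ctx.reader()));
            }
            writer.addIndexes(codecReaders.toArray(new CodecReader[0]));
            writer.commit();
        }
    }
}

The Directory form copies existing segments as-is and is the cheapest way to merge whole indexes; the CodecReader form re-imports through readers, which is what the filtering examples in this list rely on.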
From source file:dk.netarkivet.harvester.indexserver.CrawlLogIndexCache.java
License:Open Source License
/**
 * Combine a number of crawl.log files into one Lucene index. This index is placed as gzip files under the
 * directory returned by getCacheFile().
 *
 * @param rawfiles The map from job ID into crawl.log contents. No null values are allowed in this map.
 */
protected void combine(Map<Long, File> rawfiles) {
    ++indexingJobCount;
    long datasetSize = rawfiles.values().size();
    log.info("Starting combine task #{}. This combines a dataset with {} crawl logs (thread = {})",
            indexingJobCount, datasetSize, Thread.currentThread().getName());

    File resultDir = getCacheFile(rawfiles.keySet());
    Set<File> tmpfiles = new HashSet<File>();
    String indexLocation = resultDir.getAbsolutePath() + ".luceneDir";
    ThreadPoolExecutor executor = null;
    try {
        DigestIndexer indexer = createStandardIndexer(indexLocation);
        final boolean verboseIndexing = false;
        DigestOptions indexingOptions = new DigestOptions(this.useBlacklist, verboseIndexing, this.mimeFilter);
        long count = 0;
        Set<IndexingState> outstandingJobs = new HashSet<IndexingState>();
        final int maxThreads = Settings.getInt(HarvesterSettings.INDEXSERVER_INDEXING_MAXTHREADS);
        executor = new ThreadPoolExecutor(maxThreads, maxThreads, 0L, TimeUnit.MILLISECONDS,
                new LinkedBlockingQueue<Runnable>());
        executor.setRejectedExecutionHandler(new ThreadPoolExecutor.CallerRunsPolicy());

        for (Map.Entry<Long, File> entry : rawfiles.entrySet()) {
            Long jobId = entry.getKey();
            File crawlLog = entry.getValue();
            // Generate UUID to ensure a unique filedir for the index.
            File tmpFile = new File(FileUtils.getTempDir(), UUID.randomUUID().toString());
            tmpfiles.add(tmpFile);
            String localindexLocation = tmpFile.getAbsolutePath();
            Long cached = cdxcache.cache(jobId);
            if (cached == null) {
                log.warn("Skipping the ingest of logs for job {}. Unable to retrieve cdx-file for job.",
                        entry.getKey());
                continue;
            }
            File cachedCDXFile = cdxcache.getCacheFile(cached);

            // Dispatch this indexing task to a separate thread that
            // handles the sorting of the logfiles and the generation
            // of a lucene index for this crawllog and cdxfile.
            ++count;
            String taskID = count + " out of " + datasetSize;
            log.debug("Making subthread for indexing job " + jobId + " - task " + taskID);
            Callable<Boolean> task = new DigestIndexerWorker(localindexLocation, jobId, crawlLog, cachedCDXFile,
                    indexingOptions, taskID);
            Future<Boolean> result = executor.submit(task);
            outstandingJobs.add(new IndexingState(jobId, localindexLocation, result));
        }

        // wait for all the outstanding subtasks to complete.
        Set<Directory> subindices = new HashSet<Directory>();

        // Deadline for the combine-task
        long combineTimeout = Settings.getLong(HarvesterSettings.INDEXSERVER_INDEXING_TIMEOUT);
        long timeOutTime = System.currentTimeMillis() + combineTimeout;

        // The indexwriter for the totalindex.
        IndexWriter totalIndex = indexer.getIndex();
        int subindicesInTotalIndex = 0;

        // Max number of segments in totalindex.
        int maxSegments = Settings.getInt(HarvesterSettings.INDEXSERVER_INDEXING_MAX_SEGMENTS);
        final int ACCUMULATED_SUBINDICES_BEFORE_MERGING = 200;

        while (outstandingJobs.size() > 0) {
            log.info("Outstanding jobs in combine task #{} is now {}", indexingJobCount, outstandingJobs.size());
            Iterator<IndexingState> iterator = outstandingJobs.iterator();
            if (timeOutTime < System.currentTimeMillis()) {
                log.warn("Max indexing time exceeded for one index ({}). Indexing stops here, "
                        + "although missing subindices for {} jobs",
                        TimeUtils.readableTimeInterval(combineTimeout), outstandingJobs.size());
                break;
            }
            while (iterator.hasNext() && subindices.size() < ACCUMULATED_SUBINDICES_BEFORE_MERGING) {
                Future<Boolean> nextResult;
                IndexingState next = iterator.next();
                if (next.getResultObject().isDone()) {
                    nextResult = next.getResultObject();
                    try {
                        // check, if the indexing failed
                        if (nextResult.get()) {
                            subindices.add(new SimpleFSDirectory(new File(next.getIndex())));
                        } else {
                            log.warn("Indexing of job {} failed.", next.getJobIdentifier());
                        }
                    } catch (InterruptedException e) {
                        log.warn("Unable to get Result back from indexing thread", e);
                    } catch (ExecutionException e) {
                        log.warn("Unable to get Result back from indexing thread", e);
                    }
                    // remove the done object from the set
                    iterator.remove();
                }
            }

            if (subindices.size() >= ACCUMULATED_SUBINDICES_BEFORE_MERGING) {
                log.info("Adding {} subindices to main index. Forcing index to contain max {} files (related to combine task #{})",
                        subindices.size(), maxSegments, indexingJobCount);
                totalIndex.addIndexes(subindices.toArray(new Directory[0]));
                totalIndex.forceMerge(maxSegments);
                totalIndex.commit();
                for (Directory luceneDir : subindices) {
                    luceneDir.close();
                }
                subindicesInTotalIndex += subindices.size();
                log.info("Completed adding {} subindices to main index, now containing {} subindices(related to combine task #{})",
                        subindices.size(), subindicesInTotalIndex, indexingJobCount);
                subindices.clear();
            } else {
                sleepAwhile();
            }
        }

        log.info("Adding the final {} subindices to main index. "
                + "Forcing index to contain max {} files (related to combine task #{})",
                subindices.size(), maxSegments, indexingJobCount);
        totalIndex.addIndexes(subindices.toArray(new Directory[0]));
        totalIndex.forceMerge(maxSegments);
        totalIndex.commit();
        for (Directory luceneDir : subindices) {
            luceneDir.close();
        }
        subindices.clear();

        log.info("Adding operation completed (combine task #{})!", indexingJobCount);
        long docsInIndex = totalIndex.numDocs();

        indexer.close();
        log.info("Closed index (related to combine task #{}", indexingJobCount);

        // Now the index is made, gzip it up.
        File totalIndexDir = new File(indexLocation);
        log.info("Gzip-compressing the individual {} index files of combine task # {}",
                totalIndexDir.list().length, indexingJobCount);
        ZipUtils.gzipFiles(totalIndexDir, resultDir);
        log.info("Completed combine task #{} that combined a dataset with {} crawl logs (entries in combined index: {}) - compressed index has size {}",
                indexingJobCount, datasetSize, docsInIndex, FileUtils.getHumanReadableFileSize(resultDir));
    } catch (IOException e) {
        throw new IOFailure("Error setting up craw.log index framework for " + resultDir.getAbsolutePath(), e);
    } finally {
        // close down Threadpool-executor
        closeDownThreadpoolQuietly(executor);
        FileUtils.removeRecursively(new File(indexLocation));
        for (File temporaryFile : tmpfiles) {
            FileUtils.removeRecursively(temporaryFile);
        }
    }
}
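Stripped of the NetarchiveSuite plumbing, the Lucene-specific step in combine() is: collect a batch of subindex Directories, hand them to addIndexes in a single call, cap the segment count with forceMerge, commit, and close the sources. A minimal sketch of that step, using the same Lucene 4.x-era classes as the example (method and variable names are illustrative):

import java.io.IOException;
import java.util.Set;

import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;

void mergeBatch(IndexWriter totalIndex, Set<Directory> subindices, int maxSegments) throws IOException {
    // Copy the segments of every subindex in this batch into the main index.
    totalIndex.addIndexes(subindices.toArray(new Directory[0]));
    // Keep the merged index from accumulating an unbounded number of segments.
    totalIndex.forceMerge(maxSegments);
    totalIndex.commit();
    // The source directories are no longer needed once their segments have been copied.
    for (Directory subindex : subindices) {
        subindex.close();
    }
    subindices.clear();
}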
From source file:fi.passiba.services.bibledata.sword.index.lucene.LuceneIndex.java
License:Open Source License
/**
 * Generate an index to use, telling the job about progress as you go.
 *
 * @throws BookException If we fail to read the index files
 */
public LuceneIndex(Book book, URI storage, boolean create) throws BookException {
    assert create;

    this.book = book;
    File finalPath = null;
    try {
        finalPath = NetUtil.getAsFile(storage);
        this.path = finalPath.getCanonicalPath();
    } catch (IOException ex) {
        throw new BookException(UserMsg.LUCENE_INIT, ex);
    }
    System.out.println("index path " + finalPath.getAbsolutePath());

    // Indexing the book is a good way to police data errors.
    DataPolice.setBook(book.getBookMetaData());

    IndexStatus finalStatus = IndexStatus.UNDONE;
    Analyzer analyzer = new LuceneAnalyzer(book);
    List errors = new ArrayList();
    File tempPath = new File(path + '.' + IndexStatus.CREATING.toString());

    try {
        synchronized (CREATING) {
            book.setIndexStatus(IndexStatus.CREATING);

            // An index is created by opening an IndexWriter with the create argument set to true.
            //IndexWriter writer = new IndexWriter(tempPath.getCanonicalPath(), analyzer, true);

            // Create the index in core.
            RAMDirectory ramDir = new RAMDirectory();
            IndexWriter writer = new IndexWriter(ramDir, analyzer, true);

            generateSearchIndexImpl(errors, writer, book.getGlobalKeyList(), 0);

            // Consolidate the index into the minimum number of files.
            // writer.optimize(); /* Optimize is done by addIndexes */
            writer.close();

            // Write the core index to disk.
            IndexWriter fsWriter = new IndexWriter(tempPath.getCanonicalPath(), analyzer, true);
            fsWriter.addIndexes(new Directory[] { ramDir });
            fsWriter.close();

            // Free up the space used by the ram directory
            ramDir.close();

            tempPath.renameTo(finalPath);
            if (finalPath.exists()) {
                finalStatus = IndexStatus.DONE;
            }
            if (errors.size() > 0) {
                StringBuffer buf = new StringBuffer();
                Iterator iter = errors.iterator();
                while (iter.hasNext()) {
                    buf.append(iter.next());
                    buf.append('\n');
                }
                Reporter.informUser(this, UserMsg.BAD_VERSE, buf);
            }
        }
    } catch (IOException ex) {
        throw new BookException(UserMsg.LUCENE_INIT, ex);
    } finally {
        book.setIndexStatus(finalStatus);
    }
}
From source file:io.anserini.integration.IndexerTest.java
License:Apache License
@Test
public void testCloneIndex() throws Exception {
    System.out.println("Cloning index:");
    Directory dir1 = FSDirectory.open(tempDir1);
    IndexReader reader = DirectoryReader.open(dir1);

    Directory dir2 = FSDirectory.open(tempDir2);
    IndexWriterConfig config = new IndexWriterConfig(new EnglishAnalyzer());
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    IndexWriter writer = new IndexWriter(dir2, config);

    LeafReader leafReader = reader.leaves().get(0).reader();
    CodecReader codecReader = SlowCodecReaderWrapper.wrap(leafReader);
    writer.addIndexes(new MyFilterCodecReader(codecReader));
    writer.commit();
    writer.forceMerge(1);
    writer.close();
    reader.close();

    // Open up the cloned index and verify it.
    reader = DirectoryReader.open(dir2);
    assertEquals(3, reader.numDocs());
    assertEquals(1, reader.leaves().size());

    System.out.println("Dumping out postings...");
    dumpPostings(reader);

    assertEquals(2, reader.docFreq(new Term("text", "here")));
    assertEquals(2, reader.docFreq(new Term("text", "more")));
    assertEquals(1, reader.docFreq(new Term("text", "some")));
    assertEquals(1, reader.docFreq(new Term("text", "test")));
    assertEquals(2, reader.docFreq(new Term("text", "text")));

    reader.close();
}
From source file:io.datalayer.lucene.helper.AosUtil.java
License:Apache License
public void addIndexes() throws Exception {
    Directory otherDir = null;
    Directory ramDir = null;
    IndexWriter writer = new IndexWriter(otherDir,
            new IndexWriterConfig(Version.LUCENE_46, new SimpleAnalyzer(Version.LUCENE_46)));
    writer.addIndexes(new Directory[] { ramDir });
}
From source file:io.yucca.lucene.FieldRemover.java
License:Apache License
/**
 * Remove fields from an index. All readers and writer are closed on
 * completion or on an exception.
 *
 * @param reader
 *            IndexReader
 * @param writer
 *            IndexWriter for the destination index directory
 * @param fields
 *            String[] fields to remove
 */
public void removeFields(IndexReader reader, IndexWriter writer, String[] fields) {
    Set<String> removals = toTrimmedSet(fields);
    List<AtomicReaderContext> leaves = reader.leaves();
    AtomicReader wrappedLeaves[] = new AtomicReader[leaves.size()];
    for (int i = 0; i < leaves.size(); i++) {
        wrappedLeaves[i] = new FieldFilterAtomicReader(leaves.get(i).reader(), removals, true);
    }
    try {
        MultiReader mr = new MultiReader(wrappedLeaves, true);
        writer.addIndexes(mr);
        writer.commit();
        writer.close();
        mr.close();
    } catch (IOException e) {
        log.error("Writing new index failed.", e);
    } finally {
        IOUtils.closeWhileHandlingException(reader);
        IOUtils.closeWhileHandlingException(writer);
        IOUtils.closeWhileHandlingException(writer.getDirectory());
    }
}
From source file:lius.lucene.LuceneActions.java
License:Apache License
public void addIndexes(Directory[] directoriesToIndex, String indexDir, LiusConfig lc) {
    Analyzer analyzer = AnalyzerFactory.getAnalyzer(lc);
    IndexWriter writer = null;
    try {
        boolean createIndex = createIndexValue(lc.getCreateIndex(), indexDir);
        Directory fsDir = FSDirectory.getDirectory(indexDir, createIndex);
        writer = new IndexWriter(fsDir, analyzer, createIndex);
        setIndexWriterProps(writer, lc);
        writer.addIndexes(directoriesToIndex);
    } catch (Exception e) {
        logger.error(e.getMessage());
    } finally {
        try {
            if (writer != null) {
                writer.close();
            }
        } catch (IOException e) {
            logger.error(e.getMessage());
        }
    }
}
From source file:net.bible.service.sword.IndexMerger.java
License:Open Source License
/** Merge all indexes found under a directory into a single index. */
public static void main(String[] args) {
    if (args.length != 2) {
        System.out.println("Usage: java -jar IndexMerger.jar existing_indexes_dir merged_index_dir");
        System.out.println("  existing_indexes_dir: A directory where the indexes that have to merged exist");
        System.out.println("    e.g. indexes/");
        System.out.println("      e.g. index1");
        System.out.println("      e.g. index2");
        System.out.println("      e.g. index3");
        System.out.println("  merged_index_dir: A directory where the merged index will be stored");
        System.out.println("    e.g. merged_indexes");
        System.exit(1);
    }

    File INDEXES_DIR = new File(args[0]);
    File INDEX_DIR = new File(args[1]);
    INDEX_DIR.mkdir();

    Date start = new Date();

    try {
        IndexWriter writer = new IndexWriter(INDEX_DIR, new StandardAnalyzer(), true);
        writer.setMergeFactor(1000);
        writer.setRAMBufferSizeMB(50);

        Directory indexes[] = new Directory[INDEXES_DIR.list().length];
        for (int i = 0; i < INDEXES_DIR.list().length; i++) {
            System.out.println("Adding: " + INDEXES_DIR.list()[i]);
            indexes[i] = FSDirectory.getDirectory(INDEXES_DIR.getAbsolutePath() + "/" + INDEXES_DIR.list()[i]);
        }

        System.out.print("Merging added indexes...");
        writer.addIndexes(indexes);
        System.out.println("done");

        System.out.print("Optimizing index...");
        writer.optimize();
        writer.close();
        System.out.println("done");

        Date end = new Date();
        System.out.println("It took: " + ((end.getTime() - start.getTime()) / 1000) + "\"");
    } catch (IOException e) {
        e.printStackTrace();
    }
}
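This merger targets a pre-4.0 Lucene API (the IndexWriter(File, Analyzer, boolean) constructor, setMergeFactor, optimize), none of which exists in current releases. A rough modern equivalent of the same merge loop, assuming Lucene 8.x/9.x and with an illustrative class name, is sketched below; forceMerge(1) replaces the removed optimize() call, and merge tuning now lives on IndexWriterConfig.

import java.io.File;
import java.io.IOException;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class ModernIndexMerger {

    public static void main(String[] args) throws IOException {
        File indexesDir = new File(args[0]);   // directory containing the indexes to merge
        try (Directory mergedDir = FSDirectory.open(Paths.get(args[1]));
                IndexWriter writer = new IndexWriter(mergedDir,
                        new IndexWriterConfig(new StandardAnalyzer())
                                .setOpenMode(IndexWriterConfig.OpenMode.CREATE)
                                .setRAMBufferSizeMB(50))) {
            // Open every existing index under indexesDir as a source Directory.
            List<Directory> sources = new ArrayList<>();
            for (String name : indexesDir.list()) {
                sources.add(FSDirectory.open(new File(indexesDir, name).toPath()));
            }
            writer.addIndexes(sources.toArray(new Directory[0]));
            writer.forceMerge(1);   // replaces the pre-4.0 optimize()
            for (Directory source : sources) {
                source.close();
            }
        }
    }
}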
From source file:org.apache.blur.manager.writer.IndexImporter.java
License:Apache License
private IndexAction getIndexAction(final HdfsDirectory directory, final FileSystem fileSystem) {
    return new IndexAction() {

        @Override
        public void performMutate(IndexSearcherCloseable searcher, IndexWriter writer) throws IOException {
            LOG.info("About to import [{0}] into [{1}/{2}]", directory, _shard, _table);
            boolean emitDeletes = searcher.getIndexReader().numDocs() != 0;
            applyDeletes(directory, writer, _shard, emitDeletes);
            LOG.info("Add index [{0}] [{1}/{2}]", directory, _shard, _table);
            writer.addIndexes(directory);
            LOG.info("Removing delete markers [{0}] on [{1}/{2}]", directory, _shard, _table);
            writer.deleteDocuments(new Term(BlurConstants.DELETE_MARKER, BlurConstants.DELETE_MARKER_VALUE));
            LOG.info("Finishing import [{0}], commiting on [{1}/{2}]", directory, _shard, _table);
        }

        @Override
        public void doPreCommit(IndexSearcherCloseable indexSearcher, IndexWriter writer) throws IOException {
        }

        @Override
        public void doPostCommit(IndexWriter writer) throws IOException {
            Path path = directory.getPath();
            fileSystem.delete(new Path(path, INPROGRESS), false);
            LOG.info("Import complete on [{0}/{1}]", _shard, _table);
            writer.maybeMerge();
        }

        @Override
        public void doPreRollback(IndexWriter writer) throws IOException {
            LOG.info("Starting rollback on [{0}/{1}]", _shard, _table);
        }

        @Override
        public void doPostRollback(IndexWriter writer) throws IOException {
            LOG.info("Finished rollback on [{0}/{1}]", _shard, _table);
            Path path = directory.getPath();
            String name = path.getName();
            fileSystem.rename(path, new Path(path.getParent(), rename(name, BADROWIDS)));
        }
    };
}
From source file:org.apache.blur.mapreduce.lib.GenericBlurRecordWriter.java
License:Apache License
private void copyAndOptimizeInFlightDir() throws IOException {
    CopyRateDirectory copyRateDirectory = new CopyRateDirectory(_finalDir, _copyRateCounter);
    copyRateDirectory.setLockFactory(NoLockFactory.getNoLockFactory());
    DirectoryReader reader = DirectoryReader.open(_localDir);
    IndexWriter writer = new IndexWriter(copyRateDirectory, _conf.clone());
    writer.addIndexes(reader);
    writer.close();
    rm(_localPath);
}
From source file:org.apache.blur.store.hdfs.HdfsDirectorySymlinkTest.java
License:Apache License
@Test
public void testSymlinkWithIndexes() throws IOException {
    HdfsDirectory dir1 = new HdfsDirectory(_configuration, new Path(_base, "dir1"));
    IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_43, new KeywordAnalyzer());
    IndexWriter writer1 = new IndexWriter(dir1, conf.clone());
    writer1.addDocument(getDoc());
    writer1.close();

    HdfsDirectory dir2 = new HdfsDirectory(_configuration, new Path(_base, "dir2"));
    IndexWriter writer2 = new IndexWriter(dir2, conf.clone());
    writer2.addIndexes(dir1);
    writer2.close();

    DirectoryReader reader1 = DirectoryReader.open(dir1);
    DirectoryReader reader2 = DirectoryReader.open(dir2);

    assertEquals(1, reader1.maxDoc());
    assertEquals(1, reader2.maxDoc());
    assertEquals(1, reader1.numDocs());
    assertEquals(1, reader2.numDocs());

    Document document1 = reader1.document(0);
    Document document2 = reader2.document(0);

    assertEquals(document1.get("id"), document2.get("id"));
}