Usage examples for org.apache.lucene.index.IndexWriter.forceMerge(int)
public void forceMerge(int maxNumSegments) throws IOException
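Before the project-specific examples below, here is a minimal, self-contained sketch of the call itself, assuming a recent Lucene release (5.x-or-later API); the index path, field name, and class name are illustrative placeholders, not taken from any example on this page. forceMerge(1) collapses the index into a single segment, which is an expensive, blocking operation and is normally reserved for indexes that will no longer change.

import java.io.IOException;
import java.nio.file.Paths;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class ForceMergeSketch {
    public static void main(String[] args) throws IOException {
        // Hypothetical index location and field name, chosen for illustration only.
        try (Directory dir = FSDirectory.open(Paths.get("/tmp/forcemerge-demo"));
             IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()))) {
            for (int i = 0; i < 1000; i++) {
                Document doc = new Document();
                doc.add(new TextField("content", "document " + i, Field.Store.NO));
                writer.addDocument(doc);
            }
            writer.commit();
            // Collapse the index down to a single segment before it is handed off
            // to searchers; skip this for indexes that are still being updated.
            writer.forceMerge(1);
        }
    }
}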
From source file:com.rocana.lucene.codec.v1.TestRocanaPerFieldPostingsFormat2.java
License:Apache License
@Test
public void testMergeUnusedPerFieldCodec() throws IOException {
    Directory dir = newDirectory();
    IndexWriterConfig iwconf = newIndexWriterConfig(new MockAnalyzer(random()))
            .setOpenMode(OpenMode.CREATE)
            .setCodec(new MockCodec());
    IndexWriter writer = newWriter(dir, iwconf);

    addDocs(writer, 10);
    writer.commit();
    addDocs3(writer, 10);
    writer.commit();
    addDocs2(writer, 10);
    writer.commit();
    assertEquals(30, writer.maxDoc());
    TestUtil.checkIndex(dir);
    writer.forceMerge(1);
    assertEquals(30, writer.maxDoc());
    writer.close();
    dir.close();
}
From source file:com.rocana.lucene.codec.v1.TestRocanaPerFieldPostingsFormat2.java
License:Apache License
@Test
public void testChangeCodecAndMerge() throws IOException {
    Directory dir = newDirectory();
    if (VERBOSE) {
        System.out.println("TEST: make new index");
    }
    IndexWriterConfig iwconf = newIndexWriterConfig(new MockAnalyzer(random()))
            .setOpenMode(OpenMode.CREATE)
            .setCodec(new MockCodec());
    iwconf.setMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH);
    //((LogMergePolicy) iwconf.getMergePolicy()).setMergeFactor(10);
    IndexWriter writer = newWriter(dir, iwconf);

    addDocs(writer, 10);
    writer.commit();
    assertQuery(new Term("content", "aaa"), dir, 10);
    if (VERBOSE) {
        System.out.println("TEST: addDocs3");
    }
    addDocs3(writer, 10);
    writer.commit();
    writer.close();

    assertQuery(new Term("content", "ccc"), dir, 10);
    assertQuery(new Term("content", "aaa"), dir, 10);
    Codec codec = iwconf.getCodec();
    iwconf = newIndexWriterConfig(new MockAnalyzer(random()))
            .setOpenMode(OpenMode.APPEND)
            .setCodec(codec);
    //((LogMergePolicy) iwconf.getMergePolicy()).setNoCFSRatio(0.0);
    //((LogMergePolicy) iwconf.getMergePolicy()).setMergeFactor(10);
    iwconf.setMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH);

    iwconf.setCodec(new MockCodec2()); // uses standard for field content
    writer = newWriter(dir, iwconf);
    // swap in new codec for currently written segments
    if (VERBOSE) {
        System.out.println("TEST: add docs w/ Standard codec for content field");
    }
    addDocs2(writer, 10);
    writer.commit();
    codec = iwconf.getCodec();
    assertEquals(30, writer.maxDoc());
    assertQuery(new Term("content", "bbb"), dir, 10);
    assertQuery(new Term("content", "ccc"), dir, 10);
    //// assertQuery(new Term("content", "aaa"), dir, 10);
    if (VERBOSE) {
        System.out.println("TEST: add more docs w/ new codec");
    }
    addDocs2(writer, 10);
    writer.commit();
    assertQuery(new Term("content", "ccc"), dir, 10);
    assertQuery(new Term("content", "bbb"), dir, 20);
    assertQuery(new Term("content", "aaa"), dir, 10);
    assertEquals(40, writer.maxDoc());

    if (VERBOSE) {
        System.out.println("TEST: now optimize");
    }
    writer.forceMerge(1);
    assertEquals(40, writer.maxDoc());
    writer.close();

    assertQuery(new Term("content", "ccc"), dir, 10);
    assertQuery(new Term("content", "bbb"), dir, 20);
    assertQuery(new Term("content", "aaa"), dir, 10);

    dir.close();
}
From source file:com.rocana.lucene.codec.v1.TestRocanaPerFieldPostingsFormat2.java
License:Apache License
@Test
public void testStressPerFieldCodec() throws IOException {
    Directory dir = newDirectory(random());
    final int docsPerRound = 97;
    int numRounds = atLeast(1);
    for (int i = 0; i < numRounds; i++) {
        int num = TestUtil.nextInt(random(), 30, 60);
        IndexWriterConfig config = newIndexWriterConfig(random(), new MockAnalyzer(random()));
        config.setOpenMode(OpenMode.CREATE_OR_APPEND);
        IndexWriter writer = newWriter(dir, config);
        for (int j = 0; j < docsPerRound; j++) {
            final Document doc = new Document();
            for (int k = 0; k < num; k++) {
                FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
                customType.setTokenized(random().nextBoolean());
                customType.setOmitNorms(random().nextBoolean());
                Field field = newField("" + k, TestUtil.randomRealisticUnicodeString(random(), 128), customType);
                doc.add(field);
            }
            writer.addDocument(doc);
        }
        if (random().nextBoolean()) {
            writer.forceMerge(1);
        }
        writer.commit();
        assertEquals((i + 1) * docsPerRound, writer.maxDoc());
        writer.close();
    }
    dir.close();
}
From source file:com.senseidb.abacus.api.codec.CodecTest.java
License:Apache License
static Directory buildIndex(Iterable<String> datasrc, Codec codec) throws Exception {
    String idxname = codec == null ? "lucene" : codec.getName();
    Directory dir = FSDirectory.open(new File("/tmp/codectest", idxname)); //new RAMDirectory();
    //Directory dir = new RAMDirectory();
    IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_44, new StandardAnalyzer(Version.LUCENE_44));
    conf.setUseCompoundFile(false);
    if (codec != null) {
        conf.setCodec(codec);
    }
    IndexWriter writer = new IndexWriter(dir, conf);

    for (String doc : datasrc) {
        if (doc == null)
            break;
        doc = doc.trim();
        if (doc.length() == 0)
            continue;
        Document d = new Document();
        FieldType ft = new FieldType();
        ft.setIndexed(true);
        ft.setStored(false);
        ft.setIndexOptions(IndexOptions.DOCS_ONLY);
        ft.setOmitNorms(true);
        Field f = new Field(FIELD, doc, ft);
        d.add(f);
        writer.addDocument(d);
    }

    writer.forceMerge(1);
    writer.commit();
    writer.close();
    return dir;
}
From source file:dk.dma.msinm.lucene.AbstractLuceneIndex.java
License:Open Source License
/**
 * Updates the Lucene index
 *
 * @param maxIndexCount max number of entities to index at a time
 * @param force update even if the locked flag is set
 * @return the number of updates
 */
public int updateLuceneIndex(int maxIndexCount, boolean force) {
    // Check if we are in the middle of re-indexing
    if (!force && locked) {
        return 0;
    }

    Date lastUpdated = getLastUpdated();
    long t0 = System.currentTimeMillis();
    log.debug(String.format("Indexing at most %d changed entities since %s", maxIndexCount, lastUpdated));

    IndexWriter writer = null;
    try {
        // Find all customers changed since the lastUpdated time stamp
        List<T> updatedEntities = findUpdatedEntities(lastUpdated, maxIndexCount);
        if (updatedEntities.size() == 0) {
            return 0;
        }

        // Create a new index writer
        writer = getNewWriter();

        // Update the index with the changes
        for (T entity : updatedEntities) {
            indexEntity(writer, entity);
            if (entity.getUpdated().after(lastUpdated)) {
                lastUpdated = entity.getUpdated();
            }
        }

        // Update the last-updated flag
        setLastUpdated(lastUpdated, writer);

        // Commit the changes
        writer.commit();

        // Re-open the reader from the writer
        refreshReader(writer);

        // Check if we need to optimize the index
        optimizeIndexCount += updatedEntities.size();
        if (optimizeIndexCount > OPTIMIZE_INDEX_COUNT) {
            writer.forceMerge(MAX_NUM_SEGMENTS);
            optimizeIndexCount = 0;
        }

        log.info("Indexed " + updatedEntities.size() + " entities in "
                + (System.currentTimeMillis() - t0) + " ms");
        return updatedEntities.size();
    } catch (Exception ex) {
        log.error("Error updating Lucene index: " + ex.getMessage(), ex);
        return 0;
    } finally {
        closeWriter(writer);
    }
}
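The example above only force-merges after enough updates have accumulated, and merges down to a bounded segment count rather than a single segment. A minimal sketch of that pattern with hypothetical constants and an already-open writer (not taken verbatim from the source above):

import java.io.IOException;

import org.apache.lucene.index.IndexWriter;

/** Hypothetical helper illustrating the "merge only after N updates" pattern above. */
class PeriodicOptimizer {
    private static final int OPTIMIZE_INDEX_COUNT = 1000; // hypothetical threshold
    private static final int MAX_NUM_SEGMENTS = 4;        // merge target; > 1 is cheaper than a full merge

    private int optimizeIndexCount = 0;

    void afterBatch(IndexWriter writer, int updatesInBatch) throws IOException {
        writer.commit();
        optimizeIndexCount += updatesInBatch;
        if (optimizeIndexCount > OPTIMIZE_INDEX_COUNT) {
            // Bound the segment count without paying for a merge to a single segment.
            writer.forceMerge(MAX_NUM_SEGMENTS);
            optimizeIndexCount = 0;
        }
    }
}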
From source file:dk.netarkivet.harvester.indexserver.CrawlLogIndexCache.java
License:Open Source License
/**
 * Combine a number of crawl.log files into one Lucene index. This index is placed as gzip files under the
 * directory returned by getCacheFile().
 *
 * @param rawfiles The map from job ID into crawl.log contents. No null values are allowed in this map.
 */
protected void combine(Map<Long, File> rawfiles) {
    ++indexingJobCount;
    long datasetSize = rawfiles.values().size();
    log.info("Starting combine task #{}. This combines a dataset with {} crawl logs (thread = {})",
            indexingJobCount, datasetSize, Thread.currentThread().getName());

    File resultDir = getCacheFile(rawfiles.keySet());
    Set<File> tmpfiles = new HashSet<File>();
    String indexLocation = resultDir.getAbsolutePath() + ".luceneDir";
    ThreadPoolExecutor executor = null;
    try {
        DigestIndexer indexer = createStandardIndexer(indexLocation);
        final boolean verboseIndexing = false;
        DigestOptions indexingOptions = new DigestOptions(this.useBlacklist, verboseIndexing, this.mimeFilter);
        long count = 0;
        Set<IndexingState> outstandingJobs = new HashSet<IndexingState>();
        final int maxThreads = Settings.getInt(HarvesterSettings.INDEXSERVER_INDEXING_MAXTHREADS);
        executor = new ThreadPoolExecutor(maxThreads, maxThreads, 0L, TimeUnit.MILLISECONDS,
                new LinkedBlockingQueue<Runnable>());
        executor.setRejectedExecutionHandler(new ThreadPoolExecutor.CallerRunsPolicy());

        for (Map.Entry<Long, File> entry : rawfiles.entrySet()) {
            Long jobId = entry.getKey();
            File crawlLog = entry.getValue();
            // Generate UUID to ensure a unique filedir for the index.
            File tmpFile = new File(FileUtils.getTempDir(), UUID.randomUUID().toString());
            tmpfiles.add(tmpFile);
            String localindexLocation = tmpFile.getAbsolutePath();
            Long cached = cdxcache.cache(jobId);
            if (cached == null) {
                log.warn("Skipping the ingest of logs for job {}. Unable to retrieve cdx-file for job.",
                        entry.getKey());
                continue;
            }
            File cachedCDXFile = cdxcache.getCacheFile(cached);

            // Dispatch this indexing task to a separate thread that
            // handles the sorting of the logfiles and the generation
            // of a lucene index for this crawllog and cdxfile.
            ++count;
            String taskID = count + " out of " + datasetSize;
            log.debug("Making subthread for indexing job " + jobId + " - task " + taskID);
            Callable<Boolean> task = new DigestIndexerWorker(localindexLocation, jobId, crawlLog, cachedCDXFile,
                    indexingOptions, taskID);
            Future<Boolean> result = executor.submit(task);
            outstandingJobs.add(new IndexingState(jobId, localindexLocation, result));
        }

        // wait for all the outstanding subtasks to complete.
        Set<Directory> subindices = new HashSet<Directory>();

        // Deadline for the combine-task
        long combineTimeout = Settings.getLong(HarvesterSettings.INDEXSERVER_INDEXING_TIMEOUT);
        long timeOutTime = System.currentTimeMillis() + combineTimeout;

        // The indexwriter for the totalindex.
        IndexWriter totalIndex = indexer.getIndex();
        int subindicesInTotalIndex = 0;
        // Max number of segments in totalindex.
        int maxSegments = Settings.getInt(HarvesterSettings.INDEXSERVER_INDEXING_MAX_SEGMENTS);

        final int ACCUMULATED_SUBINDICES_BEFORE_MERGING = 200;

        while (outstandingJobs.size() > 0) {
            log.info("Outstanding jobs in combine task #{} is now {}", indexingJobCount, outstandingJobs.size());
            Iterator<IndexingState> iterator = outstandingJobs.iterator();
            if (timeOutTime < System.currentTimeMillis()) {
                log.warn("Max indexing time exceeded for one index ({}). Indexing stops here, "
                        + "although missing subindices for {} jobs",
                        TimeUtils.readableTimeInterval(combineTimeout), outstandingJobs.size());
                break;
            }
            while (iterator.hasNext() && subindices.size() < ACCUMULATED_SUBINDICES_BEFORE_MERGING) {
                Future<Boolean> nextResult;
                IndexingState next = iterator.next();
                if (next.getResultObject().isDone()) {
                    nextResult = next.getResultObject();
                    try {
                        // check, if the indexing failed
                        if (nextResult.get()) {
                            subindices.add(new SimpleFSDirectory(new File(next.getIndex())));
                        } else {
                            log.warn("Indexing of job {} failed.", next.getJobIdentifier());
                        }
                    } catch (InterruptedException e) {
                        log.warn("Unable to get Result back from indexing thread", e);
                    } catch (ExecutionException e) {
                        log.warn("Unable to get Result back from indexing thread", e);
                    }
                    // remove the done object from the set
                    iterator.remove();
                }
            }

            if (subindices.size() >= ACCUMULATED_SUBINDICES_BEFORE_MERGING) {
                log.info("Adding {} subindices to main index. Forcing index to contain max {} files "
                        + "(related to combine task #{})",
                        subindices.size(), maxSegments, indexingJobCount);
                totalIndex.addIndexes(subindices.toArray(new Directory[0]));
                totalIndex.forceMerge(maxSegments);
                totalIndex.commit();
                for (Directory luceneDir : subindices) {
                    luceneDir.close();
                }
                subindicesInTotalIndex += subindices.size();
                log.info("Completed adding {} subindices to main index, now containing {} subindices "
                        + "(related to combine task #{})",
                        subindices.size(), subindicesInTotalIndex, indexingJobCount);
                subindices.clear();
            } else {
                sleepAwhile();
            }
        }

        log.info("Adding the final {} subindices to main index. "
                + "Forcing index to contain max {} files (related to combine task #{})",
                subindices.size(), maxSegments, indexingJobCount);

        totalIndex.addIndexes(subindices.toArray(new Directory[0]));
        totalIndex.forceMerge(maxSegments);
        totalIndex.commit();
        for (Directory luceneDir : subindices) {
            luceneDir.close();
        }
        subindices.clear();

        log.info("Adding operation completed (combine task #{})!", indexingJobCount);
        long docsInIndex = totalIndex.numDocs();

        indexer.close();
        log.info("Closed index (related to combine task #{}", indexingJobCount);

        // Now the index is made, gzip it up.
        File totalIndexDir = new File(indexLocation);
        log.info("Gzip-compressing the individual {} index files of combine task # {}",
                totalIndexDir.list().length, indexingJobCount);
        ZipUtils.gzipFiles(totalIndexDir, resultDir);
        log.info("Completed combine task #{} that combined a dataset with {} crawl logs "
                + "(entries in combined index: {}) - compressed index has size {}",
                indexingJobCount, datasetSize, docsInIndex, FileUtils.getHumanReadableFileSize(resultDir));
    } catch (IOException e) {
        throw new IOFailure("Error setting up craw.log index framework for " + resultDir.getAbsolutePath(), e);
    } finally {
        // close down Threadpool-executor
        closeDownThreadpoolQuietly(executor);
        FileUtils.removeRecursively(new File(indexLocation));
        for (File temporaryFile : tmpfiles) {
            FileUtils.removeRecursively(temporaryFile);
        }
    }
}
From source file:edu.utsa.sifter.Indexer.java
License:Apache License
public static void main(String[] args) throws IOException, InterruptedException {
    if (args.length != 1 && args.length != 2) {
        System.err.println(
                "Wrong number of args supplied. Takes path to index directory and optional stop words file path.");
        return;
    }
    final long begin = System.currentTimeMillis();

    final File evPath = new File(args[0]);
    final File indexPath = new File(evPath, "primary-idx");
    if (!evPath.mkdir()) {
        System.err.println("Could not create directory " + evPath.toString());
        return;
    }

    final SifterConfig conf = new SifterConfig();
    conf.loadFromXMLFile("sifter_props.xml");

    FSRipReader ripper = new FSRipReader(conf.THREAD_POOL_SIZE, conf.LARGE_FILE_THRESHOLD, conf.TEMP_DIR,
            conf.FILETYPE_MODEL_FILE);
    try {
        final IndexWriter index = getIndexWriter(indexPath.toString(), args.length == 2 ? args[1] : null, conf);
        boolean ret = ripper.readData(System.in, index);
        if (ret) {
            System.out.println("Optimizing index");
            index.forceMerge(1);
            System.out.println("Successful finish");
        }
        index.close();
    } finally {
        System.out.println("FilesRead: " + ripper.FilesRead);
        System.out.println("BytesRead: " + ripper.BytesRead);
        System.out.println("FileBytesRead: " + ripper.FileBytesRead);
        System.out.println("Duration: " + ((System.currentTimeMillis() - begin) / 1000) + " seconds");
    }
}
From source file:edu.utsa.sifter.som.MainSOM.java
License:Apache License
public static void main(String[] args)
        throws IOException, InterruptedException, CorruptIndexException, NoSuchFieldException {
    final File evPath = new File(args[0]);
    final File idxPath = new File(evPath, "primary-idx");

    final long begin = System.currentTimeMillis();

    // createIndex(path);
    final Path outPath = new Path(new Path(evPath.toString()), "docVectors.seq");
    final Configuration hadoopConf = new Configuration();
    final LocalFileSystem fs = FileSystem.getLocal(hadoopConf);
    final SequenceFile.Writer file = SequenceFile.createWriter(fs, hadoopConf, outPath, LongWritable.class,
            IntArrayWritable.class);

    final DirectoryReader dirReader = DirectoryReader.open(FSDirectory.open(idxPath));

    final SifterConfig conf = new SifterConfig();
    InputStream xmlProps = null;
    try {
        xmlProps = new FileInputStream("sifter_props.xml");
    } catch (FileNotFoundException ex) {
        ; // swallow exeption
    }
    conf.loadFromXML(xmlProps); // safe with null

    final MainSOM builder = new MainSOM(dirReader, conf);

    IndexWriter writer = null;
    FileOutputStream somJSFile = null;
    try {
        builder.initTerms();
        builder.writeVectors(file);
        file.close();

        final SequenceFile.Reader seqRdr = new SequenceFile.Reader(fs, outPath, hadoopConf);
        writer = builder.createWriter(new File(evPath, "som-idx"), conf);

        somJSFile = new FileOutputStream(new File(evPath, "som.js"));
        final CharsetEncoder utf8 = Charset.forName("UTF-8").newEncoder();
        utf8.onMalformedInput(CodingErrorAction.IGNORE);
        final Writer somJS = new BufferedWriter(new OutputStreamWriter(somJSFile, utf8));
        builder.makeSOM(conf, seqRdr, writer, somJS);
        writer.forceMerge(1);
    } catch (Exception e) {
        e.printStackTrace(System.err);
    } finally {
        file.close();
        if (writer != null) {
            writer.close();
        }
        if (somJSFile != null) {
            somJSFile.close();
        }
        dirReader.close();

        System.out.println("Number of docs written: " + builder.getNumDocs());
        System.out.println("Number of outlier docs: " + builder.getNumOutliers());
        System.out.println("Total term dimensions: " + builder.getTermsMap().size());
        System.out.println("Max terms per doc: " + builder.getMaxDocTerms());
        System.out.println("Avg terms per doc: " + builder.getAvgDocTerms());
        System.out.println("Duration: " + ((System.currentTimeMillis() - begin) / 1000) + " seconds");

        conf.storeToXML(new FileOutputStream("sifter_props.xml"));
    }
}
From source file:io.anserini.embeddings.IndexW2V.java
License:Apache License
public void indexEmbeddings() throws IOException, InterruptedException {
    LOG.info("Starting indexer...");
    long startTime = System.currentTimeMillis();

    final WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer();
    final IndexWriterConfig config = new IndexWriterConfig(analyzer);
    final IndexWriter writer = new IndexWriter(directory, config);

    BufferedReader bRdr = new BufferedReader(new FileReader(args.input));
    String line = null;
    bRdr.readLine();
    Document document = new Document();
    ByteArrayOutputStream byteStream = new ByteArrayOutputStream();
    int cnt = 0;

    while ((line = bRdr.readLine()) != null) {
        String[] termEmbedding = line.trim().split("\t");
        document.add(new StringField(LuceneDocumentGenerator.FIELD_ID, termEmbedding[0], Field.Store.NO));
        String[] parts = termEmbedding[1].split(" ");
        for (int i = 0; i < parts.length; ++i) {
            byteStream.write(ByteBuffer.allocate(4).putFloat(Float.parseFloat(parts[i])).array());
        }
        document.add(new StoredField(FIELD_BODY, byteStream.toByteArray()));

        byteStream.flush();
        byteStream.reset();
        writer.addDocument(document);
        document.clear();
        cnt++;

        if (cnt % 100000 == 0) {
            LOG.info(cnt + " terms indexed");
        }
    }
    LOG.info(String.format("Total of %s terms added", cnt));

    try {
        writer.commit();
        writer.forceMerge(1);
    } finally {
        try {
            writer.close();
        } catch (IOException e) {
            LOG.error(e);
        }
    }

    LOG.info("Total elapsed time: " + (System.currentTimeMillis() - startTime) + "ms");
}
From source file:io.anserini.index.IndexClueWeb09b.java
License:Apache License
public int indexWithThreads(int numThreads) throws IOException, InterruptedException {
    System.out.println(
            "Indexing with " + numThreads + " threads to directory '" + indexPath.toAbsolutePath() + "'...");

    final Directory dir = FSDirectory.open(indexPath);

    final IndexWriterConfig iwc = new IndexWriterConfig(analyzer());

    iwc.setSimilarity(new BM25Similarity());
    iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    iwc.setRAMBufferSizeMB(256.0);
    iwc.setUseCompoundFile(false);
    iwc.setMergeScheduler(new ConcurrentMergeScheduler());

    final IndexWriter writer = new IndexWriter(dir, iwc);

    final ExecutorService executor = Executors.newFixedThreadPool(numThreads);

    List<Path> warcFiles = discoverWarcFiles(docDir);
    if (doclimit > 0 && warcFiles.size() < doclimit)
        warcFiles = warcFiles.subList(0, doclimit);

    for (Path f : warcFiles)
        executor.execute(new IndexerThread(writer, f));

    //add some delay to let some threads spawn by scheduler
    Thread.sleep(30000);
    executor.shutdown(); // Disable new tasks from being submitted

    try {
        // Wait for existing tasks to terminate
        while (!executor.awaitTermination(5, TimeUnit.MINUTES)) {
            Thread.sleep(1000);
        }
    } catch (InterruptedException ie) {
        // (Re-)Cancel if current thread also interrupted
        executor.shutdownNow();
        // Preserve interrupt status
        Thread.currentThread().interrupt();
    }

    int numIndexed = writer.maxDoc();

    try {
        writer.commit();
        if (optimize)
            writer.forceMerge(1);
    } finally {
        writer.close();
    }

    return numIndexed;
}