Example usage for org.apache.lucene.index IndexWriter commit

List of usage examples for org.apache.lucene.index IndexWriter commit

Introduction

On this page you can find example usage for org.apache.lucene.index IndexWriter commit.

Prototype

@Override
public final long commit() throws IOException 

Document

Commits all pending changes (added and deleted documents, segment merges, added indexes, etc.) to the index, and syncs all referenced index files, such that a reader will see the changes and the index updates will survive an OS or machine crash or power loss.
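
Before the project examples below, here is a minimal, self-contained sketch of the usual pattern around commit(): open a writer, add a document, commit, and close. It assumes a Lucene 4.x-style setup similar to the examples on this page; the in-memory directory, analyzer, field names, and the Version.LUCENE_41 constant are illustrative choices and are not taken from any of the projects listed here.

import java.io.IOException;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

public class CommitExample {
    public static void main(String[] args) throws IOException {
        // In-memory directory for the sketch; a real application would typically use an FSDirectory.
        Directory dir = new RAMDirectory();
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_41, new StandardAnalyzer(Version.LUCENE_41));
        IndexWriter writer = new IndexWriter(dir, config);
        try {
            Document doc = new Document();
            doc.add(new TextField("title", "hello lucene", Field.Store.YES));
            writer.addDocument(doc);
            // Flush the pending change to the directory and make it durable;
            // a reader opened after this point will see the new document.
            writer.commit();
        } finally {
            writer.close();
        }
    }
}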

Usage

From source file:demo.jaxrs.search.server.Catalog.java

License:Apache License

private void initIndex() throws IOException {
    final IndexWriter writer = getIndexWriter();

    try {
        writer.commit();
    } finally {
        writer.close();
    }
}
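
Since IndexWriter implements java.io.Closeable, the same initIndex() pattern can also be written with try-with-resources (Java 7+). The following is a sketch of that alternative form, not code from the Catalog class itself:

private void initIndex() throws IOException {
    // try-with-resources closes the writer even if commit() throws
    try (IndexWriter writer = getIndexWriter()) {
        writer.commit();
    }
}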

From source file:demo.jaxrs.search.server.Catalog.java

License:Apache License

private void storeAndIndex(final LuceneDocumentMetadata metadata, final byte[] content) throws IOException {

    try (BufferedInputStream in = new BufferedInputStream(new ByteArrayInputStream(content))) {

        final Document document = extractor.extract(in, metadata);
        if (document != null) {
            final IndexWriter writer = getIndexWriter();

            try {
                storage.addDocument(metadata.getSource(), content);
                writer.addDocument(document);
                writer.commit();
            } finally {
                writer.close();
            }
        }
    }
}

From source file:demo.jaxrs.search.server.Indexer.java

License:Apache License

public void storeAndIndex(final LuceneDocumentMetadata metadata, final byte[] content) throws IOException {
    BufferedInputStream in = null;

    try {
        in = new BufferedInputStream(new ByteArrayInputStream(content));

        final Document document = extractor.extract(in, metadata);
        if (document != null) {
            final IndexWriter writer = getIndexWriter();

            try {
                storage.addDocument(metadata.getSource(), content);
                writer.addDocument(document);
                writer.commit();
            } finally {
                writer.close();
            }
        }
    } finally {
        if (in != null) {
            try {
                in.close();
            } catch (IOException ex) {
                /* do nothing */
            }
        }
    }
}

From source file:dk.dbc.opensearch.fedora.search.PidCollectorTest.java

License:Open Source License

private AtomicReader populateIndexAndGetIndexReader(Document... docs) throws IOException {
    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_41, new SimpleAnalyzer(Version.LUCENE_41));
    IndexWriter indexWriter = new IndexWriter(index, config);
    for (Document doc : docs) {
        indexWriter.addDocument(doc);
    }
    indexWriter.commit();
    indexWriter.close();
    return SlowCompositeReaderWrapper.wrap(DirectoryReader.open(index));
}

From source file:dk.dma.msinm.lucene.AbstractLuceneIndex.java

License:Open Source License

/**
 * Deletes the current index
 * @throws IOException
 */
public void deleteIndex() throws IOException {
    // Delete the index
    IndexWriter writer = null;
    try {
        writer = getNewWriter();
        writer.deleteAll();
        writer.setCommitData(new HashMap<>());
        writer.commit();
    } finally {
        closeWriter(writer);
    }
}

From source file:dk.dma.msinm.lucene.AbstractLuceneIndex.java

License:Open Source License

/**
 * Updates the Lucene index
 *
 * @param maxIndexCount max number of entities to index at a time
 * @param force update even if the locked flag is set
 * @return the number of updates
 */
public int updateLuceneIndex(int maxIndexCount, boolean force) {
    // Check if we are in the middle of re-indexing
    if (!force && locked) {
        return 0;
    }

    Date lastUpdated = getLastUpdated();

    long t0 = System.currentTimeMillis();
    log.debug(String.format("Indexing at most %d changed entities since %s", maxIndexCount, lastUpdated));

    IndexWriter writer = null;
    try {
        // Find all entities changed since the lastUpdated time stamp
        List<T> updatedEntities = findUpdatedEntities(lastUpdated, maxIndexCount);
        if (updatedEntities.size() == 0) {
            return 0;
        }

        // Create a new index writer
        writer = getNewWriter();

        // Update the index with the changes
        for (T entity : updatedEntities) {
            indexEntity(writer, entity);
            if (entity.getUpdated().after(lastUpdated)) {
                lastUpdated = entity.getUpdated();
            }
        }

        // Update the last-updated flag
        setLastUpdated(lastUpdated, writer);

        // Commit the changes
        writer.commit();

        // Re-open the reader from the writer
        refreshReader(writer);

        // Check if we need to optimize the index
        optimizeIndexCount += updatedEntities.size();
        if (optimizeIndexCount > OPTIMIZE_INDEX_COUNT) {
            writer.forceMerge(MAX_NUM_SEGMENTS);
            optimizeIndexCount = 0;
        }

        log.info("Indexed " + updatedEntities.size() + " entities in " + (System.currentTimeMillis() - t0)
                + " ms");

        return updatedEntities.size();
    } catch (Exception ex) {
        log.error("Error updating Lucene index: " + ex.getMessage(), ex);
        return 0;
    } finally {
        closeWriter(writer);
    }
}

From source file:dk.netarkivet.harvester.indexserver.CrawlLogIndexCache.java

License:Open Source License

/**
 * Combine a number of crawl.log files into one Lucene index. This index is placed as gzip files under the directory
 * returned by getCacheFile().
 *
 * @param rawfiles The map from job ID into crawl.log contents. No null values are allowed in this map.
 */
protected void combine(Map<Long, File> rawfiles) {
    ++indexingJobCount;
    long datasetSize = rawfiles.values().size();
    log.info("Starting combine task #{}. This combines a dataset with {} crawl logs (thread = {})",
            indexingJobCount, datasetSize, Thread.currentThread().getName());

    File resultDir = getCacheFile(rawfiles.keySet());
    Set<File> tmpfiles = new HashSet<File>();
    String indexLocation = resultDir.getAbsolutePath() + ".luceneDir";
    ThreadPoolExecutor executor = null;
    try {
        DigestIndexer indexer = createStandardIndexer(indexLocation);
        final boolean verboseIndexing = false;
        DigestOptions indexingOptions = new DigestOptions(this.useBlacklist, verboseIndexing, this.mimeFilter);
        long count = 0;
        Set<IndexingState> outstandingJobs = new HashSet<IndexingState>();
        final int maxThreads = Settings.getInt(HarvesterSettings.INDEXSERVER_INDEXING_MAXTHREADS);
        executor = new ThreadPoolExecutor(maxThreads, maxThreads, 0L, TimeUnit.MILLISECONDS,
                new LinkedBlockingQueue<Runnable>());

        executor.setRejectedExecutionHandler(new ThreadPoolExecutor.CallerRunsPolicy());

        for (Map.Entry<Long, File> entry : rawfiles.entrySet()) {
            Long jobId = entry.getKey();
            File crawlLog = entry.getValue();
            // Generate UUID to ensure a unique filedir for the index.
            File tmpFile = new File(FileUtils.getTempDir(), UUID.randomUUID().toString());
            tmpfiles.add(tmpFile);
            String localindexLocation = tmpFile.getAbsolutePath();
            Long cached = cdxcache.cache(jobId);
            if (cached == null) {
                log.warn("Skipping the ingest of logs for job {}. Unable to retrieve cdx-file for job.",
                        entry.getKey());
                continue;
            }
            File cachedCDXFile = cdxcache.getCacheFile(cached);

            // Dispatch this indexing task to a separate thread that
            // handles the sorting of the logfiles and the generation
            // of a lucene index for this crawllog and cdxfile.
            ++count;
            String taskID = count + " out of " + datasetSize;
            log.debug("Making subthread for indexing job " + jobId + " - task " + taskID);
            Callable<Boolean> task = new DigestIndexerWorker(localindexLocation, jobId, crawlLog, cachedCDXFile,
                    indexingOptions, taskID);
            Future<Boolean> result = executor.submit(task);
            outstandingJobs.add(new IndexingState(jobId, localindexLocation, result));
        }

        // wait for all the outstanding subtasks to complete.
        Set<Directory> subindices = new HashSet<Directory>();

        // Deadline for the combine-task
        long combineTimeout = Settings.getLong(HarvesterSettings.INDEXSERVER_INDEXING_TIMEOUT);
        long timeOutTime = System.currentTimeMillis() + combineTimeout;

        // The indexwriter for the totalindex.
        IndexWriter totalIndex = indexer.getIndex();
        int subindicesInTotalIndex = 0;
        // Max number of segments in totalindex.
        int maxSegments = Settings.getInt(HarvesterSettings.INDEXSERVER_INDEXING_MAX_SEGMENTS);

        final int ACCUMULATED_SUBINDICES_BEFORE_MERGING = 200;

        while (outstandingJobs.size() > 0) {
            log.info("Outstanding jobs in combine task #{} is now {}", indexingJobCount,
                    outstandingJobs.size());
            Iterator<IndexingState> iterator = outstandingJobs.iterator();
            if (timeOutTime < System.currentTimeMillis()) {
                log.warn(
                        "Max indexing time exceeded for one index ({}). Indexing stops here, "
                                + "although missing subindices for {} jobs",
                        TimeUtils.readableTimeInterval(combineTimeout), outstandingJobs.size());
                break;
            }
            while (iterator.hasNext() && subindices.size() < ACCUMULATED_SUBINDICES_BEFORE_MERGING) {
                Future<Boolean> nextResult;
                IndexingState next = iterator.next();
                if (next.getResultObject().isDone()) {
                    nextResult = next.getResultObject();
                    try {
                        // check, if the indexing failed
                        if (nextResult.get()) {
                            subindices.add(new SimpleFSDirectory(new File(next.getIndex())));
                        } else {
                            log.warn("Indexing of job {} failed.", next.getJobIdentifier());
                        }

                    } catch (InterruptedException e) {
                        log.warn("Unable to get Result back from indexing thread", e);
                    } catch (ExecutionException e) {
                        log.warn("Unable to get Result back from indexing thread", e);
                    }
                    // remove the done object from the set
                    iterator.remove();
                }
            }

            if (subindices.size() >= ACCUMULATED_SUBINDICES_BEFORE_MERGING) {

                log.info(
                        "Adding {} subindices to main index. Forcing index to contain max {} files (related to combine task #{})",
                        subindices.size(), maxSegments, indexingJobCount);
                totalIndex.addIndexes(subindices.toArray(new Directory[0]));
                totalIndex.forceMerge(maxSegments);
                totalIndex.commit();
                for (Directory luceneDir : subindices) {
                    luceneDir.close();
                }
                subindicesInTotalIndex += subindices.size();
                log.info(
                        "Completed adding {} subindices to main index, now containing {} subindices(related to combine task #{})",
                        subindices.size(), subindicesInTotalIndex, indexingJobCount);
                subindices.clear();
            } else {
                sleepAwhile();
            }
        }

        log.info(
                "Adding the final {} subindices to main index. "
                        + "Forcing index to contain max {} files (related to combine task #{})",
                subindices.size(), maxSegments, indexingJobCount);

        totalIndex.addIndexes(subindices.toArray(new Directory[0]));
        totalIndex.forceMerge(maxSegments);
        totalIndex.commit();
        for (Directory luceneDir : subindices) {
            luceneDir.close();
        }
        subindices.clear();

        log.info("Adding operation completed (combine task #{})!", indexingJobCount);
        long docsInIndex = totalIndex.numDocs();

        indexer.close();
        log.info("Closed index (related to combine task #{}", indexingJobCount);

        // Now the index is made, gzip it up.
        File totalIndexDir = new File(indexLocation);
        log.info("Gzip-compressing the individual {} index files of combine task # {}",
                totalIndexDir.list().length, indexingJobCount);
        ZipUtils.gzipFiles(totalIndexDir, resultDir);
        log.info(
                "Completed combine task #{} that combined a dataset with {} crawl logs (entries in combined index: {}) - compressed index has size {}",
                indexingJobCount, datasetSize, docsInIndex, FileUtils.getHumanReadableFileSize(resultDir));
    } catch (IOException e) {
        throw new IOFailure("Error setting up craw.log index framework for " + resultDir.getAbsolutePath(), e);
    } finally {
        // close down Threadpool-executor
        closeDownThreadpoolQuietly(executor);
        FileUtils.removeRecursively(new File(indexLocation));
        for (File temporaryFile : tmpfiles) {
            FileUtils.removeRecursively(temporaryFile);
        }
    }
}

From source file:dk.statsbiblioteket.netark.dvenabler.DVReaderTest.java

License:Apache License

private static File generateIndex(int documents) throws IOException {
    final File INDEX = new File("target/testindex.deletefreely." + documents);
    final long seed = new Random().nextLong();
    Random random = new Random(seed);
    log.info("Testing with random seed" + seed);
    Analyzer analyzer = new StandardAnalyzer(LUCENE_VERSION);

    final FieldType SINGLE_F = new FieldType();
    SINGLE_F.setIndexed(true);
    SINGLE_F.setStored(true);

    final FieldType MULTI_F = new FieldType();
    MULTI_F.setIndexed(true);
    MULTI_F.setStored(true);

    final FieldType SEARCH_F = new FieldType();
    SEARCH_F.setIndexed(true);

    final FieldType LONG_F = new FieldType();
    LONG_F.setIndexed(true);
    LONG_F.setStored(true);
    LONG_F.setNumericType(FieldType.NumericType.LONG);

    final FieldType DOUBLE_F = new FieldType();
    DOUBLE_F.setIndexed(true);
    DOUBLE_F.setStored(true);
    DOUBLE_F.setNumericType(FieldType.NumericType.DOUBLE);

    IndexWriter indexWriter = new IndexWriter(MMapDirectory.open(INDEX),
            new IndexWriterConfig(LUCENE_VERSION, analyzer));
    for (int docID = 0; docID < documents; docID++) {
        Document document = new Document();
        document.add(new Field(ID, Integer.toString(docID), SINGLE_F));
        document.add(new Field(SEARCH, SEARCH_CONTENT + "_" + docID, SEARCH_F));
        if (random.nextInt(5) > 0) {
            document.add(new Field(SINGLE, SINGLE_CONTENT + "_r" + random.nextInt(), SINGLE_F));
        }
        if (random.nextInt(5) > 0) {
            document.add(new Field(MULTI, MULTI_CONTENT_1 + "_" + docID, MULTI_F));
            if (random.nextInt(3) > 0) {
                document.add(new Field(MULTI, MULTI_CONTENT_2 + "_random" + random.nextInt(5), MULTI_F));
            }
        }
        if (random.nextInt(5) > 0) {
            document.add(new LongField(LONG, random.nextLong(), LONG_F));
        }
        if (random.nextInt(5) > 0) {
            document.add(new DoubleField(DOUBLE, random.nextDouble(), DOUBLE_F));
        }
        indexWriter.addDocument(document);
        if (docID == documents / 3) {
            indexWriter.commit(); // Ensure multi-segment
        }
    }
    indexWriter.commit();
    indexWriter.close();
    return INDEX;
}

From source file:dk.statsbiblioteket.netark.dvenabler.DVReaderTest.java

License:Apache License

public static File generateIndex() throws IOException {
    final File INDEX = new File("target/testindex.deletefreely");
    Analyzer analyzer = new StandardAnalyzer(LUCENE_VERSION);

    final FieldType SINGLE_F = new FieldType();
    SINGLE_F.setIndexed(true);
    SINGLE_F.setStored(true);

    final FieldType MULTI_F = new FieldType();
    MULTI_F.setIndexed(true);
    MULTI_F.setStored(true);

    final FieldType SEARCH_F = new FieldType();
    SEARCH_F.setIndexed(true);

    final FieldType LONG_F = new FieldType();
    LONG_F.setIndexed(true);
    LONG_F.setStored(true);
    LONG_F.setNumericType(FieldType.NumericType.LONG);

    /*        final FieldType DOUBLE_F = new FieldType();
            DOUBLE_F.setIndexed(true);
            DOUBLE_F.setStored(true);
            DOUBLE_F.setNumericType(FieldType.NumericType.DOUBLE);
            
            final FieldType FLOAT_F = new FieldType();
            FLOAT_F.setIndexed(true);
            FLOAT_F.setStored(true);
            FLOAT_F.setNumericType(FieldType.NumericType.FLOAT);
      */

    /*        final FieldType STR_DV = new FieldType();
            STR_DV.setIndexed(true);
            STR_DV.setStored(true);
            STR_DV.setDocValueType(FieldInfo.DocValuesType.SORTED);*/

    IndexWriter indexWriter = new IndexWriter(MMapDirectory.open(INDEX),
            new IndexWriterConfig(LUCENE_VERSION, analyzer));
    {
        Document document = new Document();
        document.add(new Field(ID, "1", MULTI_F));
        document.add(new Field(SEARCH, SEARCH_CONTENT, SEARCH_F));
        document.add(new Field(SINGLE, SINGLE_CONTENT, MULTI_F));
        document.add(new Field(MULTI, MULTI_CONTENT_1, MULTI_F));
        document.add(new Field(MULTI, MULTI_CONTENT_2, MULTI_F));
        document.add(new LongField(LONG, LONG_CONTENT, LONG_F));
        //            document.add(new DoubleField(DOUBLE, DOUBLE_CONTENT, DOUBLE_F));
        //            document.add(new FloatField(FLOAT, FLOAT_CONTENT, FLOAT_F));
        document.add(new SortedDocValuesField(DV, new BytesRef(DV_CONTENT)));
        indexWriter.addDocument(document);
    }
    indexWriter.commit();
    indexWriter.close();
    return INDEX;
}

From source file:edu.cmu.lti.oaqa.knn4qa.apps.LuceneIndexer.java

License:Apache License

public static void main(String[] args) {
    Options options = new Options();

    options.addOption(CommonParams.ROOT_DIR_PARAM, null, true, CommonParams.ROOT_DIR_DESC);
    options.addOption(CommonParams.SUB_DIR_TYPE_PARAM, null, true, CommonParams.SUB_DIR_TYPE_DESC);
    options.addOption(CommonParams.MAX_NUM_REC_PARAM, null, true, CommonParams.MAX_NUM_REC_DESC);
    options.addOption(CommonParams.SOLR_FILE_NAME_PARAM, null, true, CommonParams.SOLR_FILE_NAME_DESC);
    options.addOption(CommonParams.OUT_INDEX_PARAM, null, true, CommonParams.OUT_MINDEX_DESC);

    CommandLineParser parser = new org.apache.commons.cli.GnuParser();

    try {
        CommandLine cmd = parser.parse(options, args);

        String rootDir = null;

        rootDir = cmd.getOptionValue(CommonParams.ROOT_DIR_PARAM);

        if (null == rootDir)
            Usage("Specify: " + CommonParams.ROOT_DIR_DESC, options);

        String outputDirName = cmd.getOptionValue(CommonParams.OUT_INDEX_PARAM);

        if (null == outputDirName)
            Usage("Specify: " + CommonParams.OUT_MINDEX_DESC, options);

        String subDirTypeList = cmd.getOptionValue(CommonParams.SUB_DIR_TYPE_PARAM);

        if (null == subDirTypeList || subDirTypeList.isEmpty())
            Usage("Specify: " + CommonParams.SUB_DIR_TYPE_DESC, options);

        String solrFileName = cmd.getOptionValue(CommonParams.SOLR_FILE_NAME_PARAM);
        if (null == solrFileName)
            Usage("Specify: " + CommonParams.SOLR_FILE_NAME_DESC, options);

        int maxNumRec = Integer.MAX_VALUE;

        String tmp = cmd.getOptionValue(CommonParams.MAX_NUM_REC_PARAM);

        if (tmp != null) {
            try {
                maxNumRec = Integer.parseInt(tmp);
                if (maxNumRec <= 0) {
                    Usage("The maximum number of records should be a positive integer", options);
                }
            } catch (NumberFormatException e) {
                Usage("The maximum number of records should be a positive integer", options);
            }
        }

        File outputDir = new File(outputDirName);
        if (!outputDir.exists()) {
            if (!outputDir.mkdirs()) {
                System.out.println("couldn't create " + outputDir.getAbsolutePath());
                System.exit(1);
            }
        }
        if (!outputDir.isDirectory()) {
            System.out.println(outputDir.getAbsolutePath() + " is not a directory!");
            System.exit(1);
        }
        if (!outputDir.canWrite()) {
            System.out.println("Can't write to " + outputDir.getAbsolutePath());
            System.exit(1);
        }

        String subDirs[] = subDirTypeList.split(",");

        int docNum = 0;

        // No English analyzer here, all language-related processing is done already,
        // here we simply white-space tokenize and index tokens verbatim.
        Analyzer analyzer = new WhitespaceAnalyzer();
        FSDirectory indexDir = FSDirectory.open(outputDir);
        IndexWriterConfig indexConf = new IndexWriterConfig(analyzer.getVersion(), analyzer);

        System.out.println("Creating a new Lucene index, maximum # of docs to process: " + maxNumRec);
        indexConf.setOpenMode(OpenMode.CREATE);
        IndexWriter indexWriter = new IndexWriter(indexDir, indexConf);

        for (int subDirId = 0; subDirId < subDirs.length && docNum < maxNumRec; ++subDirId) {
            String inputFileName = rootDir + "/" + subDirs[subDirId] + "/" + solrFileName;

            System.out.println("Input file name: " + inputFileName);

            BufferedReader inpText = new BufferedReader(
                    new InputStreamReader(CompressUtils.createInputStream(inputFileName)));
            String docText = XmlHelper.readNextXMLIndexEntry(inpText);

            for (; docText != null && docNum < maxNumRec; docText = XmlHelper.readNextXMLIndexEntry(inpText)) {
                ++docNum;
                Map<String, String> docFields = null;

                Document luceneDoc = new Document();

                try {
                    docFields = XmlHelper.parseXMLIndexEntry(docText);
                } catch (Exception e) {
                    System.err.println(String.format("Parsing error, offending DOC #%d:\n%s", docNum, docText));
                    System.exit(1);
                }

                String id = docFields.get(UtilConst.TAG_DOCNO);

                if (id == null) {
                    System.err.println(String.format("No ID tag '%s', offending DOC #%d:\n%s",
                            UtilConst.TAG_DOCNO, docNum, docText));
                }

                luceneDoc.add(new StringField(UtilConst.TAG_DOCNO, id, Field.Store.YES));

                for (Map.Entry<String, String> e : docFields.entrySet())
                    if (!e.getKey().equals(UtilConst.TAG_DOCNO)) {
                        luceneDoc.add(new TextField(e.getKey(), e.getValue(), Field.Store.YES));
                    }
                indexWriter.addDocument(luceneDoc);
                if (docNum % 1000 == 0)
                    System.out.println("Indexed " + docNum + " docs");
            }
            System.out.println("Indexed " + docNum + " docs");
        }

        indexWriter.commit();
        indexWriter.close();

    } catch (ParseException e) {
        Usage("Cannot parse arguments", options);
    } catch (Exception e) {
        System.err.println("Terminating due to an exception: " + e);
        System.exit(1);
    }

}