Usage examples for org.apache.lucene.index.IndexWriter.forceMerge(int)
public void forceMerge(int maxNumSegments) throws IOException
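Before the project-specific examples below, here is a minimal, self-contained sketch of the call itself, assuming a recent Lucene release (5.x-or-later API); the index path, field name, and class name are illustrative placeholders, not taken from any example on this page. forceMerge(1) collapses the index into a single segment, which is an expensive, blocking operation and is normally reserved for indexes that will no longer change.

import java.io.IOException;
import java.nio.file.Paths;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class ForceMergeSketch {
    public static void main(String[] args) throws IOException {
        // Hypothetical index location and field name, chosen for illustration only.
        try (Directory dir = FSDirectory.open(Paths.get("/tmp/forcemerge-demo"));
             IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()))) {
            for (int i = 0; i < 1000; i++) {
                Document doc = new Document();
                doc.add(new TextField("content", "document " + i, Field.Store.NO));
                writer.addDocument(doc);
            }
            writer.commit();
            // Collapse the index down to a single segment before it is handed off
            // to searchers; skip this for indexes that are still being updated.
            writer.forceMerge(1);
        }
    }
}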
From source file:com.rocana.lucene.codec.v1.TestRocanaPerFieldPostingsFormat2.java
License:Apache License
@Test
public void testMergeUnusedPerFieldCodec() throws IOException {
    Directory dir = newDirectory();
    IndexWriterConfig iwconf = newIndexWriterConfig(new MockAnalyzer(random()))
            .setOpenMode(OpenMode.CREATE)
            .setCodec(new MockCodec());
    IndexWriter writer = newWriter(dir, iwconf);

    addDocs(writer, 10);
    writer.commit();
    addDocs3(writer, 10);
    writer.commit();
    addDocs2(writer, 10);
    writer.commit();
    assertEquals(30, writer.maxDoc());
    TestUtil.checkIndex(dir);
    writer.forceMerge(1);
    assertEquals(30, writer.maxDoc());
    writer.close();
    dir.close();
}
From source file:com.rocana.lucene.codec.v1.TestRocanaPerFieldPostingsFormat2.java
License:Apache License
@Test
public void testChangeCodecAndMerge() throws IOException {
    Directory dir = newDirectory();
    if (VERBOSE) {
        System.out.println("TEST: make new index");
    }
    IndexWriterConfig iwconf = newIndexWriterConfig(new MockAnalyzer(random()))
            .setOpenMode(OpenMode.CREATE)
            .setCodec(new MockCodec());
    iwconf.setMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH);
    //((LogMergePolicy) iwconf.getMergePolicy()).setMergeFactor(10);
    IndexWriter writer = newWriter(dir, iwconf);

    addDocs(writer, 10);
    writer.commit();
    assertQuery(new Term("content", "aaa"), dir, 10);
    if (VERBOSE) {
        System.out.println("TEST: addDocs3");
    }
    addDocs3(writer, 10);
    writer.commit();
    writer.close();

    assertQuery(new Term("content", "ccc"), dir, 10);
    assertQuery(new Term("content", "aaa"), dir, 10);
    Codec codec = iwconf.getCodec();
    iwconf = newIndexWriterConfig(new MockAnalyzer(random()))
            .setOpenMode(OpenMode.APPEND)
            .setCodec(codec);
    //((LogMergePolicy) iwconf.getMergePolicy()).setNoCFSRatio(0.0);
    //((LogMergePolicy) iwconf.getMergePolicy()).setMergeFactor(10);
    iwconf.setMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH);

    iwconf.setCodec(new MockCodec2()); // uses standard for field content
    writer = newWriter(dir, iwconf);
    // swap in new codec for currently written segments
    if (VERBOSE) {
        System.out.println("TEST: add docs w/ Standard codec for content field");
    }
    addDocs2(writer, 10);
    writer.commit();
    codec = iwconf.getCodec();
    assertEquals(30, writer.maxDoc());
    assertQuery(new Term("content", "bbb"), dir, 10);
    assertQuery(new Term("content", "ccc"), dir, 10);
    //// assertQuery(new Term("content", "aaa"), dir, 10);
    if (VERBOSE) {
        System.out.println("TEST: add more docs w/ new codec");
    }
    addDocs2(writer, 10);
    writer.commit();
    assertQuery(new Term("content", "ccc"), dir, 10);
    assertQuery(new Term("content", "bbb"), dir, 20);
    assertQuery(new Term("content", "aaa"), dir, 10);
    assertEquals(40, writer.maxDoc());

    if (VERBOSE) {
        System.out.println("TEST: now optimize");
    }
    writer.forceMerge(1);
    assertEquals(40, writer.maxDoc());
    writer.close();

    assertQuery(new Term("content", "ccc"), dir, 10);
    assertQuery(new Term("content", "bbb"), dir, 20);
    assertQuery(new Term("content", "aaa"), dir, 10);

    dir.close();
}
From source file:com.rocana.lucene.codec.v1.TestRocanaPerFieldPostingsFormat2.java
License:Apache License
@Test
public void testStressPerFieldCodec() throws IOException {
    Directory dir = newDirectory(random());
    final int docsPerRound = 97;
    int numRounds = atLeast(1);
    for (int i = 0; i < numRounds; i++) {
        int num = TestUtil.nextInt(random(), 30, 60);
        IndexWriterConfig config = newIndexWriterConfig(random(), new MockAnalyzer(random()));
        config.setOpenMode(OpenMode.CREATE_OR_APPEND);
        IndexWriter writer = newWriter(dir, config);
        for (int j = 0; j < docsPerRound; j++) {
            final Document doc = new Document();
            for (int k = 0; k < num; k++) {
                FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
                customType.setTokenized(random().nextBoolean());
                customType.setOmitNorms(random().nextBoolean());
                Field field = newField("" + k, TestUtil.randomRealisticUnicodeString(random(), 128), customType);
                doc.add(field);
            }
            writer.addDocument(doc);
        }
        if (random().nextBoolean()) {
            writer.forceMerge(1);
        }
        writer.commit();
        assertEquals((i + 1) * docsPerRound, writer.maxDoc());
        writer.close();
    }
    dir.close();
}
From source file:com.senseidb.abacus.api.codec.CodecTest.java
License:Apache License
static Directory buildIndex(Iterable<String> datasrc, Codec codec) throws Exception {
    String idxname = codec == null ? "lucene" : codec.getName();
    Directory dir = FSDirectory.open(new File("/tmp/codectest", idxname)); //new RAMDirectory();
    //Directory dir = new RAMDirectory();
    IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_44, new StandardAnalyzer(Version.LUCENE_44));
    conf.setUseCompoundFile(false);
    if (codec != null) {
        conf.setCodec(codec);
    }
    IndexWriter writer = new IndexWriter(dir, conf);

    for (String doc : datasrc) {
        if (doc == null)
            break;
        doc = doc.trim();
        if (doc.length() == 0)
            continue;
        Document d = new Document();
        FieldType ft = new FieldType();
        ft.setIndexed(true);
        ft.setStored(false);
        ft.setIndexOptions(IndexOptions.DOCS_ONLY);
        ft.setOmitNorms(true);
        Field f = new Field(FIELD, doc, ft);
        d.add(f);
        writer.addDocument(d);
    }

    writer.forceMerge(1);
    writer.commit();
    writer.close();
    return dir;
}
From source file:dk.dma.msinm.lucene.AbstractLuceneIndex.java
License:Open Source License
/**
 * Updates the Lucene index
 *
 * @param maxIndexCount max number of entities to index at a time
 * @param force update even if the locked flag is set
 * @return the number of updates
 */
public int updateLuceneIndex(int maxIndexCount, boolean force) {
    // Check if we are in the middle of re-indexing
    if (!force && locked) {
        return 0;
    }

    Date lastUpdated = getLastUpdated();
    long t0 = System.currentTimeMillis();
    log.debug(String.format("Indexing at most %d changed entities since %s", maxIndexCount, lastUpdated));

    IndexWriter writer = null;
    try {
        // Find all customers changed since the lastUpdated time stamp
        List<T> updatedEntities = findUpdatedEntities(lastUpdated, maxIndexCount);
        if (updatedEntities.size() == 0) {
            return 0;
        }

        // Create a new index writer
        writer = getNewWriter();

        // Update the index with the changes
        for (T entity : updatedEntities) {
            indexEntity(writer, entity);
            if (entity.getUpdated().after(lastUpdated)) {
                lastUpdated = entity.getUpdated();
            }
        }

        // Update the last-updated flag
        setLastUpdated(lastUpdated, writer);

        // Commit the changes
        writer.commit();

        // Re-open the reader from the writer
        refreshReader(writer);

        // Check if we need to optimize the index
        optimizeIndexCount += updatedEntities.size();
        if (optimizeIndexCount > OPTIMIZE_INDEX_COUNT) {
            writer.forceMerge(MAX_NUM_SEGMENTS);
            optimizeIndexCount = 0;
        }

        log.info("Indexed " + updatedEntities.size() + " entities in "
                + (System.currentTimeMillis() - t0) + " ms");
        return updatedEntities.size();
    } catch (Exception ex) {
        log.error("Error updating Lucene index: " + ex.getMessage(), ex);
        return 0;
    } finally {
        closeWriter(writer);
    }
}
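The example above only force-merges after enough updates have accumulated, and merges down to a bounded segment count rather than a single segment. A minimal sketch of that pattern with hypothetical constants and an already-open writer (not taken verbatim from the source above):

import java.io.IOException;

import org.apache.lucene.index.IndexWriter;

/** Hypothetical helper illustrating the "merge only after N updates" pattern above. */
class PeriodicOptimizer {
    private static final int OPTIMIZE_INDEX_COUNT = 1000; // hypothetical threshold
    private static final int MAX_NUM_SEGMENTS = 4;        // merge target; > 1 is cheaper than a full merge

    private int optimizeIndexCount = 0;

    void afterBatch(IndexWriter writer, int updatesInBatch) throws IOException {
        writer.commit();
        optimizeIndexCount += updatesInBatch;
        if (optimizeIndexCount > OPTIMIZE_INDEX_COUNT) {
            // Bound the segment count without paying for a merge to a single segment.
            writer.forceMerge(MAX_NUM_SEGMENTS);
            optimizeIndexCount = 0;
        }
    }
}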
From source file:dk.netarkivet.harvester.indexserver.CrawlLogIndexCache.java
License:Open Source License
/**
 * Combine a number of crawl.log files into one Lucene index. This index is placed as gzip files under the
 * directory returned by getCacheFile().
 *
 * @param rawfiles The map from job ID into crawl.log contents. No null values are allowed in this map.
 */
protected void combine(Map<Long, File> rawfiles) {
    ++indexingJobCount;
    long datasetSize = rawfiles.values().size();
    log.info("Starting combine task #{}. This combines a dataset with {} crawl logs (thread = {})",
            indexingJobCount, datasetSize, Thread.currentThread().getName());

    File resultDir = getCacheFile(rawfiles.keySet());
    Set<File> tmpfiles = new HashSet<File>();
    String indexLocation = resultDir.getAbsolutePath() + ".luceneDir";
    ThreadPoolExecutor executor = null;
    try {
        DigestIndexer indexer = createStandardIndexer(indexLocation);
        final boolean verboseIndexing = false;
        DigestOptions indexingOptions = new DigestOptions(this.useBlacklist, verboseIndexing, this.mimeFilter);
        long count = 0;
        Set<IndexingState> outstandingJobs = new HashSet<IndexingState>();
        final int maxThreads = Settings.getInt(HarvesterSettings.INDEXSERVER_INDEXING_MAXTHREADS);
        executor = new ThreadPoolExecutor(maxThreads, maxThreads, 0L, TimeUnit.MILLISECONDS,
                new LinkedBlockingQueue<Runnable>());
        executor.setRejectedExecutionHandler(new ThreadPoolExecutor.CallerRunsPolicy());

        for (Map.Entry<Long, File> entry : rawfiles.entrySet()) {
            Long jobId = entry.getKey();
            File crawlLog = entry.getValue();
            // Generate UUID to ensure a unique filedir for the index.
            File tmpFile = new File(FileUtils.getTempDir(), UUID.randomUUID().toString());
            tmpfiles.add(tmpFile);
            String localindexLocation = tmpFile.getAbsolutePath();
            Long cached = cdxcache.cache(jobId);
            if (cached == null) {
                log.warn("Skipping the ingest of logs for job {}. Unable to retrieve cdx-file for job.",
                        entry.getKey());
                continue;
            }
            File cachedCDXFile = cdxcache.getCacheFile(cached);

            // Dispatch this indexing task to a separate thread that
            // handles the sorting of the logfiles and the generation
            // of a lucene index for this crawllog and cdxfile.
            ++count;
            String taskID = count + " out of " + datasetSize;
            log.debug("Making subthread for indexing job " + jobId + " - task " + taskID);
            Callable<Boolean> task = new DigestIndexerWorker(localindexLocation, jobId, crawlLog, cachedCDXFile,
                    indexingOptions, taskID);
            Future<Boolean> result = executor.submit(task);
            outstandingJobs.add(new IndexingState(jobId, localindexLocation, result));
        }

        // wait for all the outstanding subtasks to complete.
        Set<Directory> subindices = new HashSet<Directory>();

        // Deadline for the combine-task
        long combineTimeout = Settings.getLong(HarvesterSettings.INDEXSERVER_INDEXING_TIMEOUT);
        long timeOutTime = System.currentTimeMillis() + combineTimeout;

        // The indexwriter for the totalindex.
        IndexWriter totalIndex = indexer.getIndex();
        int subindicesInTotalIndex = 0;
        // Max number of segments in totalindex.
        int maxSegments = Settings.getInt(HarvesterSettings.INDEXSERVER_INDEXING_MAX_SEGMENTS);

        final int ACCUMULATED_SUBINDICES_BEFORE_MERGING = 200;

        while (outstandingJobs.size() > 0) {
            log.info("Outstanding jobs in combine task #{} is now {}", indexingJobCount, outstandingJobs.size());
            Iterator<IndexingState> iterator = outstandingJobs.iterator();
            if (timeOutTime < System.currentTimeMillis()) {
                log.warn("Max indexing time exceeded for one index ({}). Indexing stops here, "
                        + "although missing subindices for {} jobs",
                        TimeUtils.readableTimeInterval(combineTimeout), outstandingJobs.size());
                break;
            }
            while (iterator.hasNext() && subindices.size() < ACCUMULATED_SUBINDICES_BEFORE_MERGING) {
                Future<Boolean> nextResult;
                IndexingState next = iterator.next();
                if (next.getResultObject().isDone()) {
                    nextResult = next.getResultObject();
                    try {
                        // check, if the indexing failed
                        if (nextResult.get()) {
                            subindices.add(new SimpleFSDirectory(new File(next.getIndex())));
                        } else {
                            log.warn("Indexing of job {} failed.", next.getJobIdentifier());
                        }
                    } catch (InterruptedException e) {
                        log.warn("Unable to get Result back from indexing thread", e);
                    } catch (ExecutionException e) {
                        log.warn("Unable to get Result back from indexing thread", e);
                    }
                    // remove the done object from the set
                    iterator.remove();
                }
            }

            if (subindices.size() >= ACCUMULATED_SUBINDICES_BEFORE_MERGING) {
                log.info("Adding {} subindices to main index. Forcing index to contain max {} files "
                        + "(related to combine task #{})",
                        subindices.size(), maxSegments, indexingJobCount);
                totalIndex.addIndexes(subindices.toArray(new Directory[0]));
                totalIndex.forceMerge(maxSegments);
                totalIndex.commit();
                for (Directory luceneDir : subindices) {
                    luceneDir.close();
                }
                subindicesInTotalIndex += subindices.size();
                log.info("Completed adding {} subindices to main index, now containing {} subindices "
                        + "(related to combine task #{})",
                        subindices.size(), subindicesInTotalIndex, indexingJobCount);
                subindices.clear();
            } else {
                sleepAwhile();
            }
        }

        log.info("Adding the final {} subindices to main index. "
                + "Forcing index to contain max {} files (related to combine task #{})",
                subindices.size(), maxSegments, indexingJobCount);

        totalIndex.addIndexes(subindices.toArray(new Directory[0]));
        totalIndex.forceMerge(maxSegments);
        totalIndex.commit();
        for (Directory luceneDir : subindices) {
            luceneDir.close();
        }
        subindices.clear();

        log.info("Adding operation completed (combine task #{})!", indexingJobCount);
        long docsInIndex = totalIndex.numDocs();

        indexer.close();
        log.info("Closed index (related to combine task #{}", indexingJobCount);

        // Now the index is made, gzip it up.
        File totalIndexDir = new File(indexLocation);
        log.info("Gzip-compressing the individual {} index files of combine task # {}",
                totalIndexDir.list().length, indexingJobCount);
        ZipUtils.gzipFiles(totalIndexDir, resultDir);
        log.info("Completed combine task #{} that combined a dataset with {} crawl logs "
                + "(entries in combined index: {}) - compressed index has size {}",
                indexingJobCount, datasetSize, docsInIndex, FileUtils.getHumanReadableFileSize(resultDir));
    } catch (IOException e) {
        throw new IOFailure("Error setting up craw.log index framework for " + resultDir.getAbsolutePath(), e);
    } finally {
        // close down Threadpool-executor
        closeDownThreadpoolQuietly(executor);
        FileUtils.removeRecursively(new File(indexLocation));
        for (File temporaryFile : tmpfiles) {
            FileUtils.removeRecursively(temporaryFile);
        }
    }
}
From source file:edu.utsa.sifter.Indexer.java
License:Apache License
public static void main(String[] args) throws IOException, InterruptedException {
    if (args.length != 1 && args.length != 2) {
        System.err.println(
                "Wrong number of args supplied. Takes path to index directory and optional stop words file path.");
        return;
    }
    final long begin = System.currentTimeMillis();

    final File evPath = new File(args[0]);
    final File indexPath = new File(evPath, "primary-idx");
    if (!evPath.mkdir()) {
        System.err.println("Could not create directory " + evPath.toString());
        return;
    }

    final SifterConfig conf = new SifterConfig();
    conf.loadFromXMLFile("sifter_props.xml");

    FSRipReader ripper = new FSRipReader(conf.THREAD_POOL_SIZE, conf.LARGE_FILE_THRESHOLD, conf.TEMP_DIR,
            conf.FILETYPE_MODEL_FILE);
    try {
        final IndexWriter index = getIndexWriter(indexPath.toString(), args.length == 2 ? args[1] : null, conf);
        boolean ret = ripper.readData(System.in, index);
        if (ret) {
            System.out.println("Optimizing index");
            index.forceMerge(1);
            System.out.println("Successful finish");
        }
        index.close();
    } finally {
        System.out.println("FilesRead: " + ripper.FilesRead);
        System.out.println("BytesRead: " + ripper.BytesRead);
        System.out.println("FileBytesRead: " + ripper.FileBytesRead);
        System.out.println("Duration: " + ((System.currentTimeMillis() - begin) / 1000) + " seconds");
    }
}
From source file:edu.utsa.sifter.som.MainSOM.java
License:Apache License
public static void main(String[] args)
        throws IOException, InterruptedException, CorruptIndexException, NoSuchFieldException {
    final File evPath = new File(args[0]);
    final File idxPath = new File(evPath, "primary-idx");

    final long begin = System.currentTimeMillis();

    // createIndex(path);
    final Path outPath = new Path(new Path(evPath.toString()), "docVectors.seq");
    final Configuration hadoopConf = new Configuration();
    final LocalFileSystem fs = FileSystem.getLocal(hadoopConf);
    final SequenceFile.Writer file = SequenceFile.createWriter(fs, hadoopConf, outPath, LongWritable.class,
            IntArrayWritable.class);

    final DirectoryReader dirReader = DirectoryReader.open(FSDirectory.open(idxPath));

    final SifterConfig conf = new SifterConfig();
    InputStream xmlProps = null;
    try {
        xmlProps = new FileInputStream("sifter_props.xml");
    } catch (FileNotFoundException ex) {
        ; // swallow exeption
    }
    conf.loadFromXML(xmlProps); // safe with null

    final MainSOM builder = new MainSOM(dirReader, conf);

    IndexWriter writer = null;
    FileOutputStream somJSFile = null;
    try {
        builder.initTerms();
        builder.writeVectors(file);
        file.close();

        final SequenceFile.Reader seqRdr = new SequenceFile.Reader(fs, outPath, hadoopConf);
        writer = builder.createWriter(new File(evPath, "som-idx"), conf);

        somJSFile = new FileOutputStream(new File(evPath, "som.js"));
        final CharsetEncoder utf8 = Charset.forName("UTF-8").newEncoder();
        utf8.onMalformedInput(CodingErrorAction.IGNORE);
        final Writer somJS = new BufferedWriter(new OutputStreamWriter(somJSFile, utf8));
        builder.makeSOM(conf, seqRdr, writer, somJS);
        writer.forceMerge(1);
    } catch (Exception e) {
        e.printStackTrace(System.err);
    } finally {
        file.close();
        if (writer != null) {
            writer.close();
        }
        if (somJSFile != null) {
            somJSFile.close();
        }
        dirReader.close();

        System.out.println("Number of docs written: " + builder.getNumDocs());
        System.out.println("Number of outlier docs: " + builder.getNumOutliers());
        System.out.println("Total term dimensions: " + builder.getTermsMap().size());
        System.out.println("Max terms per doc: " + builder.getMaxDocTerms());
        System.out.println("Avg terms per doc: " + builder.getAvgDocTerms());
        System.out.println("Duration: " + ((System.currentTimeMillis() - begin) / 1000) + " seconds");

        conf.storeToXML(new FileOutputStream("sifter_props.xml"));
    }
}
From source file:io.anserini.embeddings.IndexW2V.java
License:Apache License
public void indexEmbeddings() throws IOException, InterruptedException {
    LOG.info("Starting indexer...");
    long startTime = System.currentTimeMillis();

    final WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer();
    final IndexWriterConfig config = new IndexWriterConfig(analyzer);
    final IndexWriter writer = new IndexWriter(directory, config);

    BufferedReader bRdr = new BufferedReader(new FileReader(args.input));
    String line = null;
    bRdr.readLine();
    Document document = new Document();
    ByteArrayOutputStream byteStream = new ByteArrayOutputStream();
    int cnt = 0;

    while ((line = bRdr.readLine()) != null) {
        String[] termEmbedding = line.trim().split("\t");
        document.add(new StringField(LuceneDocumentGenerator.FIELD_ID, termEmbedding[0], Field.Store.NO));
        String[] parts = termEmbedding[1].split(" ");
        for (int i = 0; i < parts.length; ++i) {
            byteStream.write(ByteBuffer.allocate(4).putFloat(Float.parseFloat(parts[i])).array());
        }
        document.add(new StoredField(FIELD_BODY, byteStream.toByteArray()));

        byteStream.flush();
        byteStream.reset();
        writer.addDocument(document);
        document.clear();
        cnt++;

        if (cnt % 100000 == 0) {
            LOG.info(cnt + " terms indexed");
        }
    }
    LOG.info(String.format("Total of %s terms added", cnt));

    try {
        writer.commit();
        writer.forceMerge(1);
    } finally {
        try {
            writer.close();
        } catch (IOException e) {
            LOG.error(e);
        }
    }

    LOG.info("Total elapsed time: " + (System.currentTimeMillis() - startTime) + "ms");
}
From source file:io.anserini.index.IndexClueWeb09b.java
License:Apache License
public int indexWithThreads(int numThreads) throws IOException, InterruptedException {
    System.out.println(
            "Indexing with " + numThreads + " threads to directory '" + indexPath.toAbsolutePath() + "'...");

    final Directory dir = FSDirectory.open(indexPath);

    final IndexWriterConfig iwc = new IndexWriterConfig(analyzer());

    iwc.setSimilarity(new BM25Similarity());
    iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    iwc.setRAMBufferSizeMB(256.0);
    iwc.setUseCompoundFile(false);
    iwc.setMergeScheduler(new ConcurrentMergeScheduler());

    final IndexWriter writer = new IndexWriter(dir, iwc);

    final ExecutorService executor = Executors.newFixedThreadPool(numThreads);

    List<Path> warcFiles = discoverWarcFiles(docDir);
    if (doclimit > 0 && warcFiles.size() < doclimit)
        warcFiles = warcFiles.subList(0, doclimit);

    for (Path f : warcFiles)
        executor.execute(new IndexerThread(writer, f));

    //add some delay to let some threads spawn by scheduler
    Thread.sleep(30000);
    executor.shutdown(); // Disable new tasks from being submitted

    try {
        // Wait for existing tasks to terminate
        while (!executor.awaitTermination(5, TimeUnit.MINUTES)) {
            Thread.sleep(1000);
        }
    } catch (InterruptedException ie) {
        // (Re-)Cancel if current thread also interrupted
        executor.shutdownNow();
        // Preserve interrupt status
        Thread.currentThread().interrupt();
    }

    int numIndexed = writer.maxDoc();

    try {
        writer.commit();
        if (optimize)
            writer.forceMerge(1);
    } finally {
        writer.close();
    }

    return numIndexed;
}