List of usage examples for org.apache.lucene.index.IndexWriter.addIndexes
public long addIndexes(CodecReader... readers) throws IOException
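Note that IndexWriter also exposes an overload taking whole directories, public long addIndexes(Directory... dirs) throws IOException, and most of the examples below use that form. As a quick orientation before the full examples, here is a minimal, self-contained sketch of both overloads; it assumes a recent Lucene release (8.x/9.x), and the class name and index paths are illustrative only.

import java.io.IOException;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.CodecReader;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.SlowCodecReaderWrapper;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class AddIndexesSketch {

    public static void main(String[] args) throws IOException {
        // Build a small source index in memory.
        Directory source = new ByteBuffersDirectory();
        try (IndexWriter sourceWriter = new IndexWriter(source, new IndexWriterConfig(new StandardAnalyzer()))) {
            Document doc = new Document();
            doc.add(new TextField("text", "some example text", Field.Store.YES));
            sourceWriter.addDocument(doc);
        }

        // Overload 1: addIndexes(Directory...) copies segment files verbatim into the target.
        // The source directories must not be open in another IndexWriter while this runs.
        Directory target1 = FSDirectory.open(Paths.get("merged-by-directory"));
        try (IndexWriter writer = new IndexWriter(target1, new IndexWriterConfig(new StandardAnalyzer()))) {
            writer.addIndexes(source);
            writer.commit();
        }

        // Overload 2: addIndexes(CodecReader...) re-imports documents through readers, which
        // allows wrapping or filtering the source (see the IndexerTest and FieldRemover
        // examples below).
        Directory target2 = FSDirectory.open(Paths.get("merged-by-reader"));
        try (DirectoryReader reader = DirectoryReader.open(source);
                IndexWriter writer = new IndexWriter(target2, new IndexWriterConfig(new StandardAnalyzer()))) {
            List<CodecReader> codecReaders = new ArrayList<>();
            for (LeafReaderContext ctx : reader.leaves()) {
                codecReaders.add(SlowCodecReaderWrapper.wrap(ctx.reader()));
            }
            writer.addIndexes(codecReaders.toArray(new CodecReader[0]));
            writer.commit();
        }
    }
}

The Directory form copies existing segments as-is and is the cheapest way to merge whole indexes; the CodecReader form re-imports through readers, which is what the filtering examples in this list rely on.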
From source file:dk.netarkivet.harvester.indexserver.CrawlLogIndexCache.java
License:Open Source License
/**
 * Combine a number of crawl.log files into one Lucene index. This index is placed as gzip files under the
 * directory returned by getCacheFile().
 *
 * @param rawfiles The map from job ID into crawl.log contents. No null values are allowed in this map.
 */
protected void combine(Map<Long, File> rawfiles) {
    ++indexingJobCount;
    long datasetSize = rawfiles.values().size();
    log.info("Starting combine task #{}. This combines a dataset with {} crawl logs (thread = {})",
            indexingJobCount, datasetSize, Thread.currentThread().getName());

    File resultDir = getCacheFile(rawfiles.keySet());
    Set<File> tmpfiles = new HashSet<File>();
    String indexLocation = resultDir.getAbsolutePath() + ".luceneDir";
    ThreadPoolExecutor executor = null;
    try {
        DigestIndexer indexer = createStandardIndexer(indexLocation);
        final boolean verboseIndexing = false;
        DigestOptions indexingOptions = new DigestOptions(this.useBlacklist, verboseIndexing, this.mimeFilter);
        long count = 0;
        Set<IndexingState> outstandingJobs = new HashSet<IndexingState>();
        final int maxThreads = Settings.getInt(HarvesterSettings.INDEXSERVER_INDEXING_MAXTHREADS);
        executor = new ThreadPoolExecutor(maxThreads, maxThreads, 0L, TimeUnit.MILLISECONDS,
                new LinkedBlockingQueue<Runnable>());
        executor.setRejectedExecutionHandler(new ThreadPoolExecutor.CallerRunsPolicy());

        for (Map.Entry<Long, File> entry : rawfiles.entrySet()) {
            Long jobId = entry.getKey();
            File crawlLog = entry.getValue();
            // Generate UUID to ensure a unique filedir for the index.
            File tmpFile = new File(FileUtils.getTempDir(), UUID.randomUUID().toString());
            tmpfiles.add(tmpFile);
            String localindexLocation = tmpFile.getAbsolutePath();
            Long cached = cdxcache.cache(jobId);
            if (cached == null) {
                log.warn("Skipping the ingest of logs for job {}. Unable to retrieve cdx-file for job.",
                        entry.getKey());
                continue;
            }
            File cachedCDXFile = cdxcache.getCacheFile(cached);

            // Dispatch this indexing task to a separate thread that
            // handles the sorting of the logfiles and the generation
            // of a lucene index for this crawllog and cdxfile.
            ++count;
            String taskID = count + " out of " + datasetSize;
            log.debug("Making subthread for indexing job " + jobId + " - task " + taskID);
            Callable<Boolean> task = new DigestIndexerWorker(localindexLocation, jobId, crawlLog, cachedCDXFile,
                    indexingOptions, taskID);
            Future<Boolean> result = executor.submit(task);
            outstandingJobs.add(new IndexingState(jobId, localindexLocation, result));
        }

        // wait for all the outstanding subtasks to complete.
        Set<Directory> subindices = new HashSet<Directory>();

        // Deadline for the combine-task
        long combineTimeout = Settings.getLong(HarvesterSettings.INDEXSERVER_INDEXING_TIMEOUT);
        long timeOutTime = System.currentTimeMillis() + combineTimeout;

        // The indexwriter for the totalindex.
        IndexWriter totalIndex = indexer.getIndex();
        int subindicesInTotalIndex = 0;

        // Max number of segments in totalindex.
        int maxSegments = Settings.getInt(HarvesterSettings.INDEXSERVER_INDEXING_MAX_SEGMENTS);
        final int ACCUMULATED_SUBINDICES_BEFORE_MERGING = 200;

        while (outstandingJobs.size() > 0) {
            log.info("Outstanding jobs in combine task #{} is now {}", indexingJobCount, outstandingJobs.size());
            Iterator<IndexingState> iterator = outstandingJobs.iterator();
            if (timeOutTime < System.currentTimeMillis()) {
                log.warn("Max indexing time exceeded for one index ({}). Indexing stops here, "
                        + "although missing subindices for {} jobs",
                        TimeUtils.readableTimeInterval(combineTimeout), outstandingJobs.size());
                break;
            }
            while (iterator.hasNext() && subindices.size() < ACCUMULATED_SUBINDICES_BEFORE_MERGING) {
                Future<Boolean> nextResult;
                IndexingState next = iterator.next();
                if (next.getResultObject().isDone()) {
                    nextResult = next.getResultObject();
                    try {
                        // check, if the indexing failed
                        if (nextResult.get()) {
                            subindices.add(new SimpleFSDirectory(new File(next.getIndex())));
                        } else {
                            log.warn("Indexing of job {} failed.", next.getJobIdentifier());
                        }
                    } catch (InterruptedException e) {
                        log.warn("Unable to get Result back from indexing thread", e);
                    } catch (ExecutionException e) {
                        log.warn("Unable to get Result back from indexing thread", e);
                    }
                    // remove the done object from the set
                    iterator.remove();
                }
            }

            if (subindices.size() >= ACCUMULATED_SUBINDICES_BEFORE_MERGING) {
                log.info("Adding {} subindices to main index. Forcing index to contain max {} files (related to combine task #{})",
                        subindices.size(), maxSegments, indexingJobCount);
                totalIndex.addIndexes(subindices.toArray(new Directory[0]));
                totalIndex.forceMerge(maxSegments);
                totalIndex.commit();
                for (Directory luceneDir : subindices) {
                    luceneDir.close();
                }
                subindicesInTotalIndex += subindices.size();
                log.info("Completed adding {} subindices to main index, now containing {} subindices(related to combine task #{})",
                        subindices.size(), subindicesInTotalIndex, indexingJobCount);
                subindices.clear();
            } else {
                sleepAwhile();
            }
        }

        log.info("Adding the final {} subindices to main index. "
                + "Forcing index to contain max {} files (related to combine task #{})",
                subindices.size(), maxSegments, indexingJobCount);
        totalIndex.addIndexes(subindices.toArray(new Directory[0]));
        totalIndex.forceMerge(maxSegments);
        totalIndex.commit();
        for (Directory luceneDir : subindices) {
            luceneDir.close();
        }
        subindices.clear();

        log.info("Adding operation completed (combine task #{})!", indexingJobCount);
        long docsInIndex = totalIndex.numDocs();

        indexer.close();
        log.info("Closed index (related to combine task #{}", indexingJobCount);

        // Now the index is made, gzip it up.
        File totalIndexDir = new File(indexLocation);
        log.info("Gzip-compressing the individual {} index files of combine task # {}",
                totalIndexDir.list().length, indexingJobCount);
        ZipUtils.gzipFiles(totalIndexDir, resultDir);
        log.info("Completed combine task #{} that combined a dataset with {} crawl logs (entries in combined index: {}) - compressed index has size {}",
                indexingJobCount, datasetSize, docsInIndex, FileUtils.getHumanReadableFileSize(resultDir));
    } catch (IOException e) {
        throw new IOFailure("Error setting up craw.log index framework for " + resultDir.getAbsolutePath(), e);
    } finally {
        // close down Threadpool-executor
        closeDownThreadpoolQuietly(executor);
        FileUtils.removeRecursively(new File(indexLocation));
        for (File temporaryFile : tmpfiles) {
            FileUtils.removeRecursively(temporaryFile);
        }
    }
}
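Stripped of the NetarchiveSuite plumbing, the Lucene-specific step in combine() is: collect a batch of subindex Directories, hand them to addIndexes in a single call, cap the segment count with forceMerge, commit, and close the sources. A minimal sketch of that step, using the same Lucene 4.x-era classes as the example (method and variable names are illustrative):

import java.io.IOException;
import java.util.Set;

import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;

void mergeBatch(IndexWriter totalIndex, Set<Directory> subindices, int maxSegments) throws IOException {
    // Copy the segments of every subindex in this batch into the main index.
    totalIndex.addIndexes(subindices.toArray(new Directory[0]));
    // Keep the merged index from accumulating an unbounded number of segments.
    totalIndex.forceMerge(maxSegments);
    totalIndex.commit();
    // The source directories are no longer needed once their segments have been copied.
    for (Directory subindex : subindices) {
        subindex.close();
    }
    subindices.clear();
}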
From source file:fi.passiba.services.bibledata.sword.index.lucene.LuceneIndex.java
License:Open Source License
/**
 * Generate an index to use, telling the job about progress as you go.
 *
 * @throws BookException If we fail to read the index files
 */
public LuceneIndex(Book book, URI storage, boolean create) throws BookException {
    assert create;

    this.book = book;
    File finalPath = null;
    try {
        finalPath = NetUtil.getAsFile(storage);
        this.path = finalPath.getCanonicalPath();
    } catch (IOException ex) {
        throw new BookException(UserMsg.LUCENE_INIT, ex);
    }
    System.out.println("index path " + finalPath.getAbsolutePath());

    // Indexing the book is a good way to police data errors.
    DataPolice.setBook(book.getBookMetaData());

    IndexStatus finalStatus = IndexStatus.UNDONE;
    Analyzer analyzer = new LuceneAnalyzer(book);
    List errors = new ArrayList();
    File tempPath = new File(path + '.' + IndexStatus.CREATING.toString());

    try {
        synchronized (CREATING) {
            book.setIndexStatus(IndexStatus.CREATING);

            // An index is created by opening an IndexWriter with the create argument set to true.
            //IndexWriter writer = new IndexWriter(tempPath.getCanonicalPath(), analyzer, true);

            // Create the index in core.
            RAMDirectory ramDir = new RAMDirectory();
            IndexWriter writer = new IndexWriter(ramDir, analyzer, true);

            generateSearchIndexImpl(errors, writer, book.getGlobalKeyList(), 0);

            // Consolidate the index into the minimum number of files.
            // writer.optimize(); /* Optimize is done by addIndexes */
            writer.close();

            // Write the core index to disk.
            IndexWriter fsWriter = new IndexWriter(tempPath.getCanonicalPath(), analyzer, true);
            fsWriter.addIndexes(new Directory[] { ramDir });
            fsWriter.close();

            // Free up the space used by the ram directory
            ramDir.close();

            tempPath.renameTo(finalPath);
            if (finalPath.exists()) {
                finalStatus = IndexStatus.DONE;
            }
            if (errors.size() > 0) {
                StringBuffer buf = new StringBuffer();
                Iterator iter = errors.iterator();
                while (iter.hasNext()) {
                    buf.append(iter.next());
                    buf.append('\n');
                }
                Reporter.informUser(this, UserMsg.BAD_VERSE, buf);
            }
        }
    } catch (IOException ex) {
        throw new BookException(UserMsg.LUCENE_INIT, ex);
    } finally {
        book.setIndexStatus(finalStatus);
    }
}
From source file:io.anserini.integration.IndexerTest.java
License:Apache License
@Test
public void testCloneIndex() throws Exception {
    System.out.println("Cloning index:");
    Directory dir1 = FSDirectory.open(tempDir1);
    IndexReader reader = DirectoryReader.open(dir1);

    Directory dir2 = FSDirectory.open(tempDir2);
    IndexWriterConfig config = new IndexWriterConfig(new EnglishAnalyzer());
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    IndexWriter writer = new IndexWriter(dir2, config);

    LeafReader leafReader = reader.leaves().get(0).reader();
    CodecReader codecReader = SlowCodecReaderWrapper.wrap(leafReader);
    writer.addIndexes(new MyFilterCodecReader(codecReader));
    writer.commit();
    writer.forceMerge(1);
    writer.close();
    reader.close();

    // Open up the cloned index and verify it.
    reader = DirectoryReader.open(dir2);
    assertEquals(3, reader.numDocs());
    assertEquals(1, reader.leaves().size());

    System.out.println("Dumping out postings...");
    dumpPostings(reader);

    assertEquals(2, reader.docFreq(new Term("text", "here")));
    assertEquals(2, reader.docFreq(new Term("text", "more")));
    assertEquals(1, reader.docFreq(new Term("text", "some")));
    assertEquals(1, reader.docFreq(new Term("text", "test")));
    assertEquals(2, reader.docFreq(new Term("text", "text")));

    reader.close();
}
From source file:io.datalayer.lucene.helper.AosUtil.java
License:Apache License
public void addIndexes() throws Exception {
    Directory otherDir = null;
    Directory ramDir = null;
    IndexWriter writer = new IndexWriter(otherDir,
            new IndexWriterConfig(Version.LUCENE_46, new SimpleAnalyzer(Version.LUCENE_46)));
    writer.addIndexes(new Directory[] { ramDir });
}
From source file:io.yucca.lucene.FieldRemover.java
License:Apache License
/**
 * Remove fields from an index. All readers and writer are closed on
 * completion or on an exception.
 *
 * @param reader
 *            IndexReader
 * @param writer
 *            IndexWriter for the destination index directory
 * @param fields
 *            String[] fields to remove
 */
public void removeFields(IndexReader reader, IndexWriter writer, String[] fields) {
    Set<String> removals = toTrimmedSet(fields);
    List<AtomicReaderContext> leaves = reader.leaves();
    AtomicReader wrappedLeaves[] = new AtomicReader[leaves.size()];
    for (int i = 0; i < leaves.size(); i++) {
        wrappedLeaves[i] = new FieldFilterAtomicReader(leaves.get(i).reader(), removals, true);
    }
    try {
        MultiReader mr = new MultiReader(wrappedLeaves, true);
        writer.addIndexes(mr);
        writer.commit();
        writer.close();
        mr.close();
    } catch (IOException e) {
        log.error("Writing new index failed.", e);
    } finally {
        IOUtils.closeWhileHandlingException(reader);
        IOUtils.closeWhileHandlingException(writer);
        IOUtils.closeWhileHandlingException(writer.getDirectory());
    }
}
From source file:lius.lucene.LuceneActions.java
License:Apache License
public void addIndexes(Directory[] directoriesToIndex, String indexDir, LiusConfig lc) {
    Analyzer analyzer = AnalyzerFactory.getAnalyzer(lc);
    IndexWriter writer = null;
    try {
        boolean createIndex = createIndexValue(lc.getCreateIndex(), indexDir);
        Directory fsDir = FSDirectory.getDirectory(indexDir, createIndex);
        writer = new IndexWriter(fsDir, analyzer, createIndex);
        setIndexWriterProps(writer, lc);
        writer.addIndexes(directoriesToIndex);
    } catch (Exception e) {
        logger.error(e.getMessage());
    } finally {
        try {
            if (writer != null) {
                writer.close();
            }
        } catch (IOException e) {
            logger.error(e.getMessage());
        }
    }
}
From source file:net.bible.service.sword.IndexMerger.java
License:Open Source License
/** Merge all indexes found under a directory into a single index. */
public static void main(String[] args) {
    if (args.length != 2) {
        System.out.println("Usage: java -jar IndexMerger.jar existing_indexes_dir merged_index_dir");
        System.out.println("  existing_indexes_dir: A directory where the indexes that have to merged exist");
        System.out.println("    e.g. indexes/");
        System.out.println("      e.g. index1");
        System.out.println("      e.g. index2");
        System.out.println("      e.g. index3");
        System.out.println("  merged_index_dir: A directory where the merged index will be stored");
        System.out.println("    e.g. merged_indexes");
        System.exit(1);
    }

    File INDEXES_DIR = new File(args[0]);
    File INDEX_DIR = new File(args[1]);
    INDEX_DIR.mkdir();

    Date start = new Date();

    try {
        IndexWriter writer = new IndexWriter(INDEX_DIR, new StandardAnalyzer(), true);
        writer.setMergeFactor(1000);
        writer.setRAMBufferSizeMB(50);

        Directory indexes[] = new Directory[INDEXES_DIR.list().length];
        for (int i = 0; i < INDEXES_DIR.list().length; i++) {
            System.out.println("Adding: " + INDEXES_DIR.list()[i]);
            indexes[i] = FSDirectory.getDirectory(INDEXES_DIR.getAbsolutePath() + "/" + INDEXES_DIR.list()[i]);
        }

        System.out.print("Merging added indexes...");
        writer.addIndexes(indexes);
        System.out.println("done");

        System.out.print("Optimizing index...");
        writer.optimize();
        writer.close();
        System.out.println("done");

        Date end = new Date();
        System.out.println("It took: " + ((end.getTime() - start.getTime()) / 1000) + "\"");
    } catch (IOException e) {
        e.printStackTrace();
    }
}
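This merger targets a pre-4.0 Lucene API (the IndexWriter(File, Analyzer, boolean) constructor, setMergeFactor, optimize), none of which exists in current releases. A rough modern equivalent of the same merge loop, assuming Lucene 8.x/9.x and with an illustrative class name, is sketched below; forceMerge(1) replaces the removed optimize() call, and merge tuning now lives on IndexWriterConfig.

import java.io.File;
import java.io.IOException;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class ModernIndexMerger {

    public static void main(String[] args) throws IOException {
        File indexesDir = new File(args[0]);   // directory containing the indexes to merge
        try (Directory mergedDir = FSDirectory.open(Paths.get(args[1]));
                IndexWriter writer = new IndexWriter(mergedDir,
                        new IndexWriterConfig(new StandardAnalyzer())
                                .setOpenMode(IndexWriterConfig.OpenMode.CREATE)
                                .setRAMBufferSizeMB(50))) {
            // Open every existing index under indexesDir as a source Directory.
            List<Directory> sources = new ArrayList<>();
            for (String name : indexesDir.list()) {
                sources.add(FSDirectory.open(new File(indexesDir, name).toPath()));
            }
            writer.addIndexes(sources.toArray(new Directory[0]));
            writer.forceMerge(1);   // replaces the pre-4.0 optimize()
            for (Directory source : sources) {
                source.close();
            }
        }
    }
}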
From source file:org.apache.blur.manager.writer.IndexImporter.java
License:Apache License
private IndexAction getIndexAction(final HdfsDirectory directory, final FileSystem fileSystem) {
    return new IndexAction() {

        @Override
        public void performMutate(IndexSearcherCloseable searcher, IndexWriter writer) throws IOException {
            LOG.info("About to import [{0}] into [{1}/{2}]", directory, _shard, _table);
            boolean emitDeletes = searcher.getIndexReader().numDocs() != 0;
            applyDeletes(directory, writer, _shard, emitDeletes);
            LOG.info("Add index [{0}] [{1}/{2}]", directory, _shard, _table);
            writer.addIndexes(directory);
            LOG.info("Removing delete markers [{0}] on [{1}/{2}]", directory, _shard, _table);
            writer.deleteDocuments(new Term(BlurConstants.DELETE_MARKER, BlurConstants.DELETE_MARKER_VALUE));
            LOG.info("Finishing import [{0}], commiting on [{1}/{2}]", directory, _shard, _table);
        }

        @Override
        public void doPreCommit(IndexSearcherCloseable indexSearcher, IndexWriter writer) throws IOException {
        }

        @Override
        public void doPostCommit(IndexWriter writer) throws IOException {
            Path path = directory.getPath();
            fileSystem.delete(new Path(path, INPROGRESS), false);
            LOG.info("Import complete on [{0}/{1}]", _shard, _table);
            writer.maybeMerge();
        }

        @Override
        public void doPreRollback(IndexWriter writer) throws IOException {
            LOG.info("Starting rollback on [{0}/{1}]", _shard, _table);
        }

        @Override
        public void doPostRollback(IndexWriter writer) throws IOException {
            LOG.info("Finished rollback on [{0}/{1}]", _shard, _table);
            Path path = directory.getPath();
            String name = path.getName();
            fileSystem.rename(path, new Path(path.getParent(), rename(name, BADROWIDS)));
        }
    };
}
From source file:org.apache.blur.mapreduce.lib.GenericBlurRecordWriter.java
License:Apache License
private void copyAndOptimizeInFlightDir() throws IOException {
    CopyRateDirectory copyRateDirectory = new CopyRateDirectory(_finalDir, _copyRateCounter);
    copyRateDirectory.setLockFactory(NoLockFactory.getNoLockFactory());
    DirectoryReader reader = DirectoryReader.open(_localDir);
    IndexWriter writer = new IndexWriter(copyRateDirectory, _conf.clone());
    writer.addIndexes(reader);
    writer.close();
    rm(_localPath);
}
From source file:org.apache.blur.store.hdfs.HdfsDirectorySymlinkTest.java
License:Apache License
@Test
public void testSymlinkWithIndexes() throws IOException {
    HdfsDirectory dir1 = new HdfsDirectory(_configuration, new Path(_base, "dir1"));
    IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_43, new KeywordAnalyzer());
    IndexWriter writer1 = new IndexWriter(dir1, conf.clone());
    writer1.addDocument(getDoc());
    writer1.close();

    HdfsDirectory dir2 = new HdfsDirectory(_configuration, new Path(_base, "dir2"));
    IndexWriter writer2 = new IndexWriter(dir2, conf.clone());
    writer2.addIndexes(dir1);
    writer2.close();

    DirectoryReader reader1 = DirectoryReader.open(dir1);
    DirectoryReader reader2 = DirectoryReader.open(dir2);

    assertEquals(1, reader1.maxDoc());
    assertEquals(1, reader2.maxDoc());
    assertEquals(1, reader1.numDocs());
    assertEquals(1, reader2.numDocs());

    Document document1 = reader1.document(0);
    Document document2 = reader2.document(0);

    assertEquals(document1.get("id"), document2.get("id"));
}