List of usage examples for org.apache.lucene.index.IndexWriterConfig#setUseCompoundFile
@Override public IndexWriterConfig setUseCompoundFile(boolean useCompoundFile)
From source file:io.anserini.index.IndexWebCollection.java
License:Apache License
public int indexWithThreads(int numThreads) throws IOException, InterruptedException { LOG.info("Indexing with " + numThreads + " threads to directory '" + indexPath.toAbsolutePath() + "'..."); final Directory dir = FSDirectory.open(indexPath); final IndexWriterConfig iwc = new IndexWriterConfig(new EnglishAnalyzer()); iwc.setSimilarity(new BM25Similarity()); iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE); iwc.setRAMBufferSizeMB(512);//w w w . j a v a 2 s. com iwc.setUseCompoundFile(false); iwc.setMergeScheduler(new ConcurrentMergeScheduler()); final IndexWriter writer = new IndexWriter(dir, iwc); final ThreadPoolExecutor executor = (ThreadPoolExecutor) Executors.newFixedThreadPool(numThreads); final String suffix = Collection.GOV2.equals(collection) ? ".gz" : ".warc.gz"; final Deque<Path> warcFiles = discoverWarcFiles(docDir, suffix); if (doclimit > 0 && warcFiles.size() < doclimit) for (int i = doclimit; i < warcFiles.size(); i++) warcFiles.removeFirst(); long totalWarcFiles = warcFiles.size(); LOG.info(totalWarcFiles + " many " + suffix + " files found under the docs path : " + docDir.toString()); for (int i = 0; i < 2000; i++) { if (!warcFiles.isEmpty()) executor.execute(new IndexerThread(writer, warcFiles.removeFirst())); else { if (!executor.isShutdown()) { Thread.sleep(30000); executor.shutdown(); } break; } } long first = 0; //add some delay to let some threads spawn by scheduler Thread.sleep(30000); try { // Wait for existing tasks to terminate while (!executor.awaitTermination(1, TimeUnit.MINUTES)) { final long completedTaskCount = executor.getCompletedTaskCount(); LOG.info(String.format("%.2f percentage completed", (double) completedTaskCount / totalWarcFiles * 100.0d)); if (!warcFiles.isEmpty()) for (long i = first; i < completedTaskCount; i++) { if (!warcFiles.isEmpty()) executor.execute(new IndexerThread(writer, warcFiles.removeFirst())); else { if (!executor.isShutdown()) executor.shutdown(); } } first = completedTaskCount; Thread.sleep(1000); } 
} catch (InterruptedException ie) { // (Re-)Cancel if current thread also interrupted executor.shutdownNow(); // Preserve interrupt status Thread.currentThread().interrupt(); } if (totalWarcFiles != executor.getCompletedTaskCount()) throw new RuntimeException("totalWarcFiles = " + totalWarcFiles + " is not equal to completedTaskCount = " + executor.getCompletedTaskCount()); int numIndexed = writer.maxDoc(); try { writer.commit(); if (optimize) writer.forceMerge(1); } finally { writer.close(); } return numIndexed; }
From source file:io.anserini.IndexerCW09B.java
License:Apache License
public int indexWithThreads(int numThreads) throws IOException, InterruptedException { System.out.println(//w ww . j av a2 s. c o m "Indexing with " + numThreads + " threads to directory '" + indexPath.toAbsolutePath() + "'..."); final Directory dir = FSDirectory.open(indexPath); final IndexWriterConfig iwc = new IndexWriterConfig(analyzer()); iwc.setSimilarity(new BM25Similarity()); iwc.setIndexDeletionPolicy(NoDeletionPolicy.INSTANCE); iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE); iwc.setRAMBufferSizeMB(256.0); iwc.setUseCompoundFile(false); iwc.setMergeScheduler(new ConcurrentMergeScheduler()); final IndexWriter writer = new IndexWriter(dir, iwc); final ExecutorService executor = Executors.newFixedThreadPool(numThreads); for (Path f : discoverWarcFiles(docDir)) executor.execute(new IndexerThread(writer, f)); //add some delay to let some threads spawn by scheduler Thread.sleep(30000); executor.shutdown(); // Disable new tasks from being submitted try { // Wait for existing tasks to terminate while (!executor.awaitTermination(5, TimeUnit.MINUTES)) { Thread.sleep(1000); } } catch (InterruptedException ie) { // (Re-)Cancel if current thread also interrupted executor.shutdownNow(); // Preserve interrupt status Thread.currentThread().interrupt(); } int numIndexed = writer.maxDoc(); try { writer.commit(); } finally { writer.close(); } return numIndexed; }
From source file:io.druid.extension.lucene.LuceneDruidSegment.java
License:Apache License
private static IndexWriter buildRamWriter(RAMDirectory dir, Analyzer analyzer, int maxDocsPerSegment) throws IOException { IndexWriterConfig writerConfig = new IndexWriterConfig(analyzer); writerConfig.setOpenMode(OpenMode.CREATE_OR_APPEND); // some arbitrary large numbers writerConfig.setMaxBufferedDocs(maxDocsPerSegment * 2); writerConfig.setRAMBufferSizeMB(5000); writerConfig.setUseCompoundFile(false); writerConfig.setCommitOnClose(true); writerConfig.setIndexDeletionPolicy(NoDeletionPolicy.INSTANCE); writerConfig.setMergePolicy(NoMergePolicy.INSTANCE); writerConfig.setMergeScheduler(NoMergeScheduler.INSTANCE); return new IndexWriter(dir, writerConfig); }
From source file:io.druid.extension.lucene.LuceneDruidSegment.java
License:Apache License
/**
 * Creates a writer over {@code dir} for persisting segments.
 *
 * <p>The configuration is built with a {@code null} analyzer, opens in
 * {@code CREATE_OR_APPEND} mode, and disables compound files and merging.
 *
 * @param dir target directory
 * @return a new writer bound to {@code dir}
 * @throws IOException if the writer cannot be created
 */
private static IndexWriter buildPersistWriter(Directory dir) throws IOException {
  final IndexWriterConfig config = new IndexWriterConfig(null)
      .setUseCompoundFile(false)
      .setOpenMode(OpenMode.CREATE_OR_APPEND)
      .setMergePolicy(NoMergePolicy.INSTANCE)
      .setMergeScheduler(NoMergeScheduler.INSTANCE);

  return new IndexWriter(dir, config);
}
From source file:io.puntanegra.fhir.index.lucene.LuceneService.java
License:Apache License
/** * Builds a new {@link FSIndex}.//from www. j a v a2 s .com * * @param name * the index name * @param mbeanName * the JMX MBean object name * @param path * the directory path * @param analyzer * the index writer analyzer * @param refresh * the index reader refresh frequency in seconds * @param ramBufferMB * the index writer RAM buffer size in MB * @param maxMergeMB * the directory max merge size in MB * @param maxCachedMB * the directory max cache size in MB * @param refreshTask * action to be done during refresh */ public void init(String name, String mbeanName, Path path, Analyzer analyzer, double refresh, int ramBufferMB, int maxMergeMB, int maxCachedMB, Runnable refreshTask) { try { this.path = path; this.name = name; // Open or create directory FSDirectory fsDirectory = FSDirectory.open(path); this.directory = new NRTCachingDirectory(fsDirectory, maxMergeMB, maxCachedMB); // Setup index writer IndexWriterConfig indexWriterConfig = new IndexWriterConfig(analyzer); indexWriterConfig.setRAMBufferSizeMB(ramBufferMB); indexWriterConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND); indexWriterConfig.setUseCompoundFile(true); indexWriterConfig.setMergePolicy(new TieredMergePolicy()); this.indexWriter = new IndexWriter(this.directory, indexWriterConfig); // Setup NRT search SearcherFactory searcherFactory = new SearcherFactory() { @Override public IndexSearcher newSearcher(IndexReader reader, IndexReader previousReader) { if (refreshTask != null) { refreshTask.run(); } IndexSearcher searcher = new IndexSearcher(reader); searcher.setSimilarity(new NoIDFSimilarity()); return searcher; } }; TrackingIndexWriter trackingWriter = new TrackingIndexWriter(this.indexWriter); this.searcherManager = new SearcherManager(this.indexWriter, true, searcherFactory); this.searcherReopener = new ControlledRealTimeReopenThread<>(trackingWriter, this.searcherManager, refresh, refresh); this.searcherReopener.start(); // Register JMX MBean // mbean = new 
ObjectName(mbeanName); // ManagementFactory.getPlatformMBeanServer().registerMBean(service, // this.mbean); } catch (Exception e) { throw new FhirIndexException(e, "Error while creating index %s", name); } }
From source file:nicta.com.au.patent.pac.index.CodeIndexer.java
public CodeIndexer(String indexDir) throws IOException { File indexDirFile = new File(indexDir); analyzer = new EnglishAnalyzer(Version.LUCENE_48, PatentsStopWords.ENGLISH_STOP_WORDS_SET); // analyzer = new StandardAnalyzer(Version.LUCENE_48); IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_48, analyzer); conf.setUseCompoundFile(false); // conf.setCodec(new SimpleTextCodec()); writer = new IndexWriter(FSDirectory.open(indexDirFile), conf); }
From source file:nicta.com.au.patent.pac.index.PACIndexer.java
/**
 * Opens an index writer on {@code indexDir} that analyzes the title, abstract, description and
 * claims fields with field-specific English stop-word sets and every other field with a
 * {@link StandardAnalyzer}. The index is written with {@link SimpleTextCodec} and without
 * compound files.
 *
 * @param indexDir path of the index directory
 * @throws IOException if the directory or the writer cannot be opened
 */
public PACIndexer(String indexDir) throws IOException {
  final File location = new File(indexDir);

  // Field name -> analyzer with that field's stop words.
  final Map<String, Analyzer> fieldAnalyzers = new HashMap<>();
  fieldAnalyzers.put(PatentDocument.Title,
      new EnglishAnalyzer(Version.LUCENE_48, PatentsStopWords.TITLE_ENGLISH_STOP_WORDS_SET));
  fieldAnalyzers.put(PatentDocument.Abstract,
      new EnglishAnalyzer(Version.LUCENE_48, PatentsStopWords.ABSTRACT_ENGLISH_STOP_WORDS_SET));
  fieldAnalyzers.put(PatentDocument.Description,
      new EnglishAnalyzer(Version.LUCENE_48, PatentsStopWords.DESCRIPTION_ENGLISH_STOP_WORDS_SET));
  fieldAnalyzers.put(PatentDocument.Claims,
      new EnglishAnalyzer(Version.LUCENE_48, PatentsStopWords.CLAIMS_ENGLISH_STOP_WORDS_SET));

  aWrapper = new PerFieldAnalyzerWrapper(new StandardAnalyzer(Version.LUCENE_48), fieldAnalyzers);
  analyzer = new EnglishAnalyzer(Version.LUCENE_48, PatentsStopWords.ENGLISH_STOP_WORDS_SET);

  // The writer uses the per-field wrapper, not the plain English analyzer above.
  final IndexWriterConfig writerConfig = new IndexWriterConfig(Version.LUCENE_48, aWrapper);
  writerConfig.setUseCompoundFile(false);
  writerConfig.setCodec(new SimpleTextCodec());

  writer = new IndexWriter(FSDirectory.open(location), writerConfig);
}
From source file:org.apache.nutch.indexwriter.lucene.LuceneWriter.java
License:Apache License
public void open(JobConf job, String name) throws IOException { this.fs = FileSystem.get(job); perm = new Path(FileOutputFormat.getOutputPath(job), name); temp = job.getLocalPath("index/_" + Integer.toString(new Random().nextInt())); fs.delete(perm, true); // delete old, if any analyzerFactory = new AnalyzerFactory(job); IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_4_10_2, new SmartChineseAnalyzer()); LogByteSizeMergePolicy mergePolicy = new LogByteSizeMergePolicy(); mergePolicy.setMergeFactor(job.getInt("indexer.mergeFactor", 10)); mergePolicy.setMaxMergeDocs(job.getInt("indexer.maxMergeDocs", Integer.MAX_VALUE)); indexWriterConfig.setMergePolicy(mergePolicy); indexWriterConfig.setUseCompoundFile(false); indexWriterConfig.setTermIndexInterval(job.getInt("indexer.termIndexInterval", 128)); indexWriterConfig.setMaxBufferedDocs(job.getInt("indexer.minMergeDocs", 100)); indexWriterConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND); writer = new org.apache.lucene.index.IndexWriter( FSDirectory.open(new File(fs.startLocalOutput(perm, temp).toString())), indexWriterConfig); /*/*from ww w.j a va 2 s. c o m*/ * addFieldOptions("title", STORE.YES, INDEX.TOKENIZED, VECTOR.NO, job); * addFieldOptions("url", STORE.YES, INDEX.TOKENIZED, VECTOR.NO, job); * addFieldOptions("content", STORE.YES, INDEX.TOKENIZED, VECTOR.NO, job); * addFieldOptions("lang", STORE.YES, INDEX.UNTOKENIZED, VECTOR.NO, job); */ processOptions(job); }
From source file:org.apache.solr.codecs.test.testDeleteDocs.java
License:Apache License
public static void main(String[] args) { try {/*w ww . j a v a 2s.c o m*/ plaintextDir = assureDirectoryExists(new File(INDEX_ROOT_FOLDER)); //----------- index documents ------- StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_4_10_0); IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_4_10_0, analyzer); // recreate the index on each execution config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND); //config.setCodec(new SimpleTextCodec()); Properties props = new Properties(); FileInputStream fstream = new FileInputStream( "C:\\work\\search_engine\\codec\\solr410\\solr_codectest\\collection1\\conf\\kvstore.properties"); props.load(fstream); fstream.close(); ONSQLKVstoreHandler.getInstance().setKVStore("omega", props); ONSQLCodec codec = new ONSQLCodec(); config.setCodec(codec); config.setUseCompoundFile(false); Directory luceneDir = new ONSQLWrapperDirectory(new File(INDEX_ROOT_FOLDER)); IndexWriter writer = new IndexWriter(luceneDir, config); QueryParser queryParser = new QueryParser(Version.LUCENE_4_10_0, "title", analyzer); String search_word = "fourth"; Query query = queryParser.parse(search_word); writer.deleteDocuments(query); writer.commit(); writer.close(); searchIndex("title", search_word); } catch (Throwable te) { te.printStackTrace(); } }
From source file:org.apache.solr.codecs.test.testMergeSegments.java
License:Apache License
/**
 * Opens the index under {@code INDEX_ROOT_FOLDER} with {@link ONSQLCodec} and force-merges it
 * down to a single segment. Any failure is printed to standard error.
 *
 * @param args ignored
 */
public static void main(String[] args) {
  try {
    testUtil.initPropsONSQL();

    StandardAnalyzer standardAnalyzer = new StandardAnalyzer(Version.LUCENE_4_10_1);

    IndexWriterConfig writerConfig = new IndexWriterConfig(Version.LUCENE_4_10_1, standardAnalyzer);
    writerConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
    ONSQLCodec onsqlCodec = new ONSQLCodec();
    writerConfig.setCodec(onsqlCodec);
    writerConfig.setUseCompoundFile(false);

    Directory indexDirectory = new ONSQLWrapperDirectory(new File(INDEX_ROOT_FOLDER));
    IndexWriter indexWriter = new IndexWriter(indexDirectory, writerConfig);

    // Collapse all segments into one, then release the writer.
    indexWriter.forceMerge(1);
    indexWriter.close();
  } catch (Throwable te) {
    te.printStackTrace();
  }
}