List of usage examples for org.apache.lucene.index IndexWriterConfig setOpenMode
public IndexWriterConfig setOpenMode(OpenMode openMode)
From source file:io.anserini.integration.IndexerTest.java
License:Apache License
@Test public void testCloneIndex() throws Exception { System.out.println("Cloning index:"); Directory dir1 = FSDirectory.open(tempDir1); IndexReader reader = DirectoryReader.open(dir1); Directory dir2 = FSDirectory.open(tempDir2); IndexWriterConfig config = new IndexWriterConfig(new EnglishAnalyzer()); config.setOpenMode(IndexWriterConfig.OpenMode.CREATE); IndexWriter writer = new IndexWriter(dir2, config); LeafReader leafReader = reader.leaves().get(0).reader(); CodecReader codecReader = SlowCodecReaderWrapper.wrap(leafReader); writer.addIndexes(new MyFilterCodecReader(codecReader)); writer.commit();/*from w ww . j a v a 2 s.com*/ writer.forceMerge(1); writer.close(); reader.close(); // Open up the cloned index and verify it. reader = DirectoryReader.open(dir2); assertEquals(3, reader.numDocs()); assertEquals(1, reader.leaves().size()); System.out.println("Dumping out postings..."); dumpPostings(reader); assertEquals(2, reader.docFreq(new Term("text", "here"))); assertEquals(2, reader.docFreq(new Term("text", "more"))); assertEquals(1, reader.docFreq(new Term("text", "some"))); assertEquals(1, reader.docFreq(new Term("text", "test"))); assertEquals(2, reader.docFreq(new Term("text", "text"))); reader.close(); }
From source file:io.bfscan.clueweb12.BuildWarcTrecIdMapping.java
License:Apache License
@SuppressWarnings("static-access") public static void main(String[] args) throws Exception { Options options = new Options(); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("bz2 Wikipedia XML dump file") .create(INPUT_OPTION));/*from w ww .j a v a 2s.co m*/ options.addOption( OptionBuilder.withArgName("dir").hasArg().withDescription("index location").create(INDEX_OPTION)); options.addOption(OptionBuilder.withArgName("num").hasArg() .withDescription("maximum number of documents to index").create(MAX_OPTION)); options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of indexing threads") .create(THREADS_OPTION)); options.addOption(new Option(OPTIMIZE_OPTION, "merge indexes into a single segment")); CommandLine cmdline = null; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); System.exit(-1); } if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(INDEX_OPTION)) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp(BuildWarcTrecIdMapping.class.getCanonicalName(), options); System.exit(-1); } String indexPath = cmdline.getOptionValue(INDEX_OPTION); int maxdocs = cmdline.hasOption(MAX_OPTION) ? Integer.parseInt(cmdline.getOptionValue(MAX_OPTION)) : Integer.MAX_VALUE; int threads = cmdline.hasOption(THREADS_OPTION) ? 
Integer.parseInt(cmdline.getOptionValue(THREADS_OPTION)) : DEFAULT_NUM_THREADS; long startTime = System.currentTimeMillis(); String path = cmdline.getOptionValue(INPUT_OPTION); PrintStream out = new PrintStream(System.out, true, "UTF-8"); Directory dir = FSDirectory.open(new File(indexPath)); IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_43, ANALYZER); config.setOpenMode(OpenMode.CREATE); IndexWriter writer = new IndexWriter(dir, config); LOG.info("Creating index at " + indexPath); LOG.info("Indexing with " + threads + " threads"); FileInputStream fis = null; BufferedReader br = null; try { fis = new FileInputStream(new File(path)); byte[] ignoreBytes = new byte[2]; fis.read(ignoreBytes); // "B", "Z" bytes from commandline tools br = new BufferedReader(new InputStreamReader(new CBZip2InputStream(fis), "UTF8")); ExecutorService executor = Executors.newFixedThreadPool(threads); int cnt = 0; String s; while ((s = br.readLine()) != null) { Runnable worker = new AddDocumentRunnable(writer, s); executor.execute(worker); cnt++; if (cnt % 1000000 == 0) { LOG.info(cnt + " articles added"); } if (cnt >= maxdocs) { break; } } executor.shutdown(); // Wait until all threads are finish while (!executor.isTerminated()) { } LOG.info("Total of " + cnt + " articles indexed."); if (cmdline.hasOption(OPTIMIZE_OPTION)) { LOG.info("Merging segments..."); writer.forceMerge(1); LOG.info("Done!"); } LOG.info("Total elapsed time: " + (System.currentTimeMillis() - startTime) + "ms"); } catch (Exception e) { e.printStackTrace(); } finally { writer.close(); dir.close(); out.close(); br.close(); fis.close(); } }
From source file:io.datalayer.lucene.index.IndexerMain.java
License:Apache License
/** * Index all text files under a directory. *///from w w w . j a v a2s . co m public static void main(String... args) { String usage = "java org.apache.lucene.demo.IndexFiles" + " [-index INDEX_PATH] [-docs DOCS_PATH] [-update]\n\n" + "This indexes the documents in DOCS_PATH, creating a Lucene index" + "in INDEX_PATH that can be searched with SearchFiles"; String indexPath = "index"; String docsPath = null; boolean create = true; for (int i = 0; i < args.length; i++) { if ("-index".equals(args[i])) { indexPath = args[i + 1]; i++; } else if ("-docs".equals(args[i])) { docsPath = args[i + 1]; i++; } else if ("-update".equals(args[i])) { create = false; } } if (docsPath == null) { System.err.println("Usage: " + usage); System.exit(1); } final File docDir = new File(docsPath); if (!docDir.exists() || !docDir.canRead()) { LOGGER.info("Document directory '" + docDir.getAbsolutePath() + "' does not exist or is not readable, please check the path"); System.exit(1); } Date start = new Date(); try { LOGGER.info("Indexing to directory '" + indexPath + "'..."); Directory dir = FSDirectory.open(new File(indexPath)); Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_46); IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_46, analyzer); if (create) { // Create a new index in the directory, removing any previously // indexed documents: iwc.setOpenMode(OpenMode.CREATE); } else { // Add new documents to an existing index: iwc.setOpenMode(OpenMode.CREATE_OR_APPEND); } // Optional: for better indexing performance, if you // are indexing many documents, increase the RAM // buffer. But if you do this, increase the max heap // size to the JVM (eg add -Xmx512m or -Xmx1g): // // iwc.setRAMBufferSizeMB(256.0); IndexWriter writer = new IndexWriter(dir, iwc); indexDocs(writer, docDir); // NOTE: if you want to maximize search performance, // you can optionally call forceMerge here. 
This can be // a terribly costly operation, so generally it's only // worth it when your index is relatively static (ie // you're done adding documents to it): // // writer.forceMerge(1); writer.close(); Date end = new Date(); LOGGER.info(end.getTime() - start.getTime() + " total milliseconds"); } catch (IOException e) { LOGGER.info(" caught a " + e.getClass() + "\n with message: " + e.getMessage()); } }
From source file:io.druid.extension.lucene.LuceneDruidSegment.java
License:Apache License
private static IndexWriter buildRamWriter(RAMDirectory dir, Analyzer analyzer, int maxDocsPerSegment) throws IOException { IndexWriterConfig writerConfig = new IndexWriterConfig(analyzer); writerConfig.setOpenMode(OpenMode.CREATE_OR_APPEND); // some arbitrary large numbers writerConfig.setMaxBufferedDocs(maxDocsPerSegment * 2); writerConfig.setRAMBufferSizeMB(5000); writerConfig.setUseCompoundFile(false); writerConfig.setCommitOnClose(true); writerConfig.setIndexDeletionPolicy(NoDeletionPolicy.INSTANCE); writerConfig.setMergePolicy(NoMergePolicy.INSTANCE); writerConfig.setMergeScheduler(NoMergeScheduler.INSTANCE); return new IndexWriter(dir, writerConfig); }
From source file:io.druid.extension.lucene.LuceneDruidSegment.java
License:Apache License
/**
 * Creates an {@link IndexWriter} on {@code dir} with no analyzer ({@code null}
 * is passed to the configuration), opened in {@code CREATE_OR_APPEND} mode with
 * the compound file format and all merging switched off.
 *
 * @param dir the directory to write into
 * @return the configured writer
 * @throws IOException if the writer cannot be opened
 */
private static IndexWriter buildPersistWriter(Directory dir) throws IOException {
    IndexWriterConfig persistConfig = new IndexWriterConfig(null);

    persistConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);
    persistConfig.setUseCompoundFile(false);

    // No merge policy and no merge scheduler.
    persistConfig.setMergeScheduler(NoMergeScheduler.INSTANCE);
    persistConfig.setMergePolicy(NoMergePolicy.INSTANCE);

    IndexWriter persistWriter = new IndexWriter(dir, persistConfig);
    return persistWriter;
}
From source file:io.github.infolis.algorithm.Indexer.java
License:Apache License
@Override public void execute() throws IOException { File indexDir;/* w w w. ja v a 2 s .co m*/ if (null != getExecution().getIndexDirectory() && !getExecution().getIndexDirectory().isEmpty()) { indexDir = new File(getExecution().getIndexDirectory()); } else { indexDir = new File( Files.createTempDirectory(InfolisConfig.getTmpFilePath().toAbsolutePath(), INDEX_DIR_PREFIX) .toString()); FileUtils.forceDeleteOnExit(indexDir); } log.debug("Indexing to: " + indexDir.getAbsolutePath()); getExecution().setOutputDirectory(indexDir.getAbsolutePath().toString()); IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_35, createAnalyzer()); indexWriterConfig.setOpenMode(OpenMode.CREATE); FSDirectory fsIndexDir = FSDirectory.open(indexDir); List<InfolisFile> files = new ArrayList<>(); for (String fileUri : getExecution().getInputFiles()) { try { files.add(this.getInputDataStoreClient().get(InfolisFile.class, fileUri)); } catch (Exception e) { error(log, "Could not retrieve file " + fileUri + ": " + e.getMessage()); getExecution().setStatus(ExecutionStatus.FAILED); persistExecution(); return; } } Date start = new Date(); log.debug("Starting to index"); IndexWriter writer = new IndexWriter(fsIndexDir, indexWriterConfig); try { int counter = 0; for (InfolisFile file : files) { counter++; log.trace("Indexing file " + file); writer.addDocument(toLuceneDocument(getInputFileResolver(), file)); updateProgress(counter, files.size()); } } catch (FileNotFoundException fnfe) { // NOTE: at least on windows, some temporary files raise this // exception with an "access denied" message checking if the // file can be read doesn't help throw new RuntimeException("Could not write index entry: " + fnfe); } finally { log.debug("Merging all Lucene segments ..."); writer.forceMerge(1); writer.close(); } getExecution().setStatus(ExecutionStatus.FINISHED); fsIndexDir.close(); log.debug(String.format("Indexing %s documents took %s ms", files.size(), new Date().getTime() - 
start.getTime())); }
From source file:io.jpress.searcher.LuceneSearcher.java
License:LGPL
public IndexWriter createIndexWriter() throws IOException { if (mIndexFilePath == null) { throw new NullPointerException("please invoke init() method first!"); }/*from ww w . j a va 2 s . c om*/ Analyzer analyzer = new JcsegAnalyzer5X(JcsegTaskConfig.COMPLEX_MODE); // ?(?): ??? JcsegAnalyzer5X jcseg = (JcsegAnalyzer5X) analyzer; // ???, ?jcseg.properties?jcseg.loadsyn=1 JcsegTaskConfig config = jcseg.getTaskConfig(); // ?, ?jcseg.properties?jcseg.loadpinyin=1 config.setAppendCJKSyn(true); // ?, com.webssky.jcseg.core.JcsegTaskConfig config.setAppendCJKPinyin(true); Directory fsDirectory = FSDirectory.open(Paths.get(mIndexFilePath)); IndexWriterConfig indexConfig = new IndexWriterConfig(analyzer); indexConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND); indexConfig.setMaxBufferedDocs(1000); IndexWriter indexWriter = new IndexWriter(fsDirectory, indexConfig); return indexWriter; }
From source file:io.puntanegra.fhir.index.lucene.LuceneService.java
License:Apache License
/** * Builds a new {@link FSIndex}./* w ww .ja v a2 s .co m*/ * * @param name * the index name * @param mbeanName * the JMX MBean object name * @param path * the directory path * @param analyzer * the index writer analyzer * @param refresh * the index reader refresh frequency in seconds * @param ramBufferMB * the index writer RAM buffer size in MB * @param maxMergeMB * the directory max merge size in MB * @param maxCachedMB * the directory max cache size in MB * @param refreshTask * action to be done during refresh */ public void init(String name, String mbeanName, Path path, Analyzer analyzer, double refresh, int ramBufferMB, int maxMergeMB, int maxCachedMB, Runnable refreshTask) { try { this.path = path; this.name = name; // Open or create directory FSDirectory fsDirectory = FSDirectory.open(path); this.directory = new NRTCachingDirectory(fsDirectory, maxMergeMB, maxCachedMB); // Setup index writer IndexWriterConfig indexWriterConfig = new IndexWriterConfig(analyzer); indexWriterConfig.setRAMBufferSizeMB(ramBufferMB); indexWriterConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND); indexWriterConfig.setUseCompoundFile(true); indexWriterConfig.setMergePolicy(new TieredMergePolicy()); this.indexWriter = new IndexWriter(this.directory, indexWriterConfig); // Setup NRT search SearcherFactory searcherFactory = new SearcherFactory() { @Override public IndexSearcher newSearcher(IndexReader reader, IndexReader previousReader) { if (refreshTask != null) { refreshTask.run(); } IndexSearcher searcher = new IndexSearcher(reader); searcher.setSimilarity(new NoIDFSimilarity()); return searcher; } }; TrackingIndexWriter trackingWriter = new TrackingIndexWriter(this.indexWriter); this.searcherManager = new SearcherManager(this.indexWriter, true, searcherFactory); this.searcherReopener = new ControlledRealTimeReopenThread<>(trackingWriter, this.searcherManager, refresh, refresh); this.searcherReopener.start(); // Register JMX MBean // mbean = new ObjectName(mbeanName); 
// ManagementFactory.getPlatformMBeanServer().registerMBean(service, // this.mbean); } catch (Exception e) { throw new FhirIndexException(e, "Error while creating index %s", name); } }
From source file:is.hi.bok.deduplicator.DigestIndexer.java
License:Open Source License
/** * Each instance of this class wraps one Lucene index for writing deduplication information to it. * * @param indexLocation The location of the index (path). * @param indexingMode Index {@link #MODE_URL}, {@link #MODE_HASH} or {@link #MODE_BOTH}. * @param includeNormalizedURL Should a normalized version of the URL be added to the index. See * {@link #stripURL(String)}./*from w w w .ja va2s . c o m*/ * @param includeTimestamp Should a timestamp be included in the index. * @param includeEtag Should an Etag be included in the index. * @param addToExistingIndex Are we opening up an existing index. Setting this to false will cause any index at * <code>indexLocation</code> to be overwritten. * @throws IOException If an error occurs opening the index. */ public DigestIndexer(String indexLocation, String indexingMode, boolean includeNormalizedURL, boolean includeTimestamp, boolean includeEtag, boolean addToExistingIndex) throws IOException { this.etag = includeEtag; this.equivalent = includeNormalizedURL; this.timestamp = includeTimestamp; if (indexingMode.equals(MODE_URL)) { indexDigest = false; } else if (indexingMode.equals(MODE_HASH)) { indexURL = false; } // Set up the index writer IndexWriterConfig config = new IndexWriterConfig(Constants.LUCENE_VERSION, new WhitespaceAnalyzer(Constants.LUCENE_VERSION)); // TODO Possibly change the default MergePolicy, see NAS-2119 if (!addToExistingIndex) { config.setOpenMode(OpenMode.CREATE); } else { config.setOpenMode(OpenMode.CREATE_OR_APPEND); } luceneDirectory = FSDirectory.open(new File(indexLocation)); index = new IndexWriter(luceneDirectory, config); }
From source file:is.landsbokasafn.deduplicator.indexer.IndexBuilder.java
License:Apache License
/** * Each instance of this class wraps one Lucene index for writing * deduplication information to it./*w w w .j av a 2 s .c o m*/ * * @param indexLocation The location of the index (path). * @param indexURL Index the URL field in the index. * @param includeCanonicalizedURL Should a normalized version of the URL be * added to the index. * See {@link #stripURL(String)}. * @param includeTimestamp Should a timestamp be included in the index. * @param includeEtag Should an Etag be included in the index. * @param addToExistingIndex Are we opening up an existing index. Setting * this to false will cause any index at * <code>indexLocation</code> to be overwritten. * @throws IOException If an error occurs opening the index. */ public IndexBuilder(String indexLocation, boolean indexURL, boolean includeCanonicalizedURL, boolean includeEtag, boolean addToExistingIndex) throws IOException { this.indexURL = indexURL; this.includeEtag = includeEtag; this.includeCanonicalizedURL = includeCanonicalizedURL; IndexWriterConfig indexWriterConfig = new IndexWriterConfig(LUCENE_VER, new WhitespaceAnalyzer(LUCENE_VER)); if (addToExistingIndex) { indexWriterConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND); } else { indexWriterConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE); } // Set up the index writer index = new IndexWriter(FSDirectory.open(new File(indexLocation)), indexWriterConfig); }