List of usage examples for org.apache.lucene.index IndexWriterConfig setOpenMode
public IndexWriterConfig setOpenMode(OpenMode openMode)
From source file:api.startup.PDFIndexer.java
License:Open Source License
/** * Updates the index//from www .jav a 2s. c om * * @throws IOException */ public void updateIndex() throws IOException { try { long startTime = System.nanoTime(); // Get the index directory Directory dir = FSDirectory.open(Paths.get(indexDirectory)); // Get the directory for resources String resourcesDir = resourceDirectory + "/" + Constants.CSV_LOCATION; // Get PDF Analyzer Analyzer pdf_analyzer = new PDFAnalyzer(resourceDirectory + "/" + Constants.STOPWORDS_FILE); // Create an index writer config with the analyzer IndexWriterConfig iwc = new IndexWriterConfig(pdf_analyzer); // Set the open mode to create or append iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE); // Set index to be created // Create an index writer IndexWriter writer = new IndexWriter(dir, iwc); // Index the documents indexDocs(writer, resourcesDir); long endTime = System.nanoTime(); LuceneIndexReader.getInstance().initializeIndexReader(writer); writer.close(); log.info("Took: " + (endTime - startTime) / Math.pow(10, 6) + " milliseconds to generate the index."); } catch (IOException e) { log.error("IO Exception Thrown while updating index " + e.getMessage() + "\n"); } }
From source file:Application.mediaIndexer.java
public static void IndexFiles(String index, String docsPath, TextArea results, boolean CreateOrUpdate, boolean removeFiles) throws IOException { String indexPath = index;//from ww w.j av a 2 s.c om boolean create = CreateOrUpdate; final Path docDir = Paths.get(docsPath); if (!Files.isReadable(docDir)) results.appendText("Document directory '" + docDir.toAbsolutePath() + "' does not exist or is not readable, please check the path" + "\n"); Date start = new Date(); try { if (removeFiles) results.appendText("Deleting '" + docsPath + "' from directory '" + indexPath + "'..." + "\n"); // else results.appendText("results '" + docsPath + "' to directory // '" + indexPath + "'..." + "\n"); Directory dir = FSDirectory.open(Paths.get(indexPath)); Analyzer analyzer = new StandardAnalyzer(); IndexWriterConfig iwc = new IndexWriterConfig(analyzer); if (create) iwc.setOpenMode(OpenMode.CREATE); else iwc.setOpenMode(OpenMode.CREATE_OR_APPEND); // Optional: for better results performance, if you are results many // documents, increase the RAM buffer. // But if you do this, increase the max heap size to the JVM (eg add // -Xmx512m or -Xmx1g): // iwc.setRAMBufferSizeMB(2048.0); IndexWriter writer = new IndexWriter(dir, iwc); indexDocs(writer, docDir, results, removeFiles); writer.close(); Date end = new Date(); long diffInSeconds = (end.getTime() - start.getTime()) / 1000; results.appendText(diffInSeconds + " total seconds" + "\n"); } catch (SAXException e) { e.printStackTrace(); } catch (TikaException e) { e.printStackTrace(); } }
From source file:apps.LuceneIndexer.java
License:Apache License
public static void main(String[] args) { Options options = new Options(); options.addOption("i", null, true, "input file"); options.addOption("o", null, true, "output directory"); options.addOption("r", null, true, "optional output TREC-format QREL file"); options.addOption("bm25_b", null, true, "BM25 parameter: b"); options.addOption("bm25_k1", null, true, "BM25 parameter: k1"); options.addOption("bm25fixed", null, false, "use the fixed BM25 similarity"); Joiner commaJoin = Joiner.on(','); Joiner spaceJoin = Joiner.on(' '); options.addOption("source_type", null, true, "document source type: " + commaJoin.join(SourceFactory.getDocSourceList())); // If you increase this value, you may need to modify the following line in *.sh file // export MAVEN_OPTS="-Xms8192m -server" double ramBufferSizeMB = 1024 * 8; // 8 GB CommandLineParser parser = new org.apache.commons.cli.GnuParser(); IndexWriter indexWriter = null;/*from w w w . j a va2s. com*/ BufferedWriter qrelWriter = null; int docNum = 0; try { CommandLine cmd = parser.parse(options, args); String inputFileName = null, outputDirName = null, qrelFileName = null; if (cmd.hasOption("i")) { inputFileName = cmd.getOptionValue("i"); } else { Usage("Specify 'input file'", options); } if (cmd.hasOption("o")) { outputDirName = cmd.getOptionValue("o"); } else { Usage("Specify 'index directory'", options); } if (cmd.hasOption("r")) { qrelFileName = cmd.getOptionValue("r"); } String sourceName = cmd.getOptionValue("source_type"); if (sourceName == null) Usage("Specify document source type", options); if (qrelFileName != null) qrelWriter = new BufferedWriter(new FileWriter(qrelFileName)); File outputDir = new File(outputDirName); if (!outputDir.exists()) { if (!outputDir.mkdirs()) { System.out.println("couldn't create " + outputDir.getAbsolutePath()); System.exit(1); } } if (!outputDir.isDirectory()) { System.out.println(outputDir.getAbsolutePath() + " is not a directory!"); System.exit(1); } if (!outputDir.canWrite()) { 
System.out.println("Can't write to " + outputDir.getAbsolutePath()); System.exit(1); } boolean useFixedBM25 = cmd.hasOption("bm25fixed"); float bm25_k1 = UtilConst.BM25_K1_DEFAULT, bm25_b = UtilConst.BM25_B_DEFAULT; if (cmd.hasOption("bm25_k1")) { try { bm25_k1 = Float.parseFloat(cmd.getOptionValue("bm25_k1")); } catch (NumberFormatException e) { Usage("Wrong format for 'bm25_k1'", options); } } if (cmd.hasOption("bm25_b")) { try { bm25_b = Float.parseFloat(cmd.getOptionValue("bm25_b")); } catch (NumberFormatException e) { Usage("Wrong format for 'bm25_b'", options); } } EnglishAnalyzer analyzer = new EnglishAnalyzer(); FSDirectory indexDir = FSDirectory.open(Paths.get(outputDirName)); IndexWriterConfig indexConf = new IndexWriterConfig(analyzer); /* OpenMode.CREATE creates a new index or overwrites an existing one. https://lucene.apache.org/core/6_0_0/core/org/apache/lucene/index/IndexWriterConfig.OpenMode.html#CREATE */ indexConf.setOpenMode(OpenMode.CREATE); indexConf.setRAMBufferSizeMB(ramBufferSizeMB); System.out.println(String.format("BM25 parameters k1=%f b=%f ", bm25_k1, bm25_b)); if (useFixedBM25) { System.out.println(String.format("Using fixed BM25Simlarity, k1=%f b=%f", bm25_k1, bm25_b)); indexConf.setSimilarity(new BM25SimilarityFix(bm25_k1, bm25_b)); } else { System.out.println(String.format("Using Lucene BM25Similarity, k1=%f b=%f", bm25_k1, bm25_b)); indexConf.setSimilarity(new BM25Similarity(bm25_k1, bm25_b)); } indexWriter = new IndexWriter(indexDir, indexConf); DocumentSource inpDocSource = SourceFactory.createDocumentSource(sourceName, inputFileName); DocumentEntry inpDoc = null; TextCleaner textCleaner = new TextCleaner(null); while ((inpDoc = inpDocSource.next()) != null) { ++docNum; Document luceneDoc = new Document(); ArrayList<String> cleanedToks = textCleaner.cleanUp(inpDoc.mDocText); String cleanText = spaceJoin.join(cleanedToks); // System.out.println(inpDoc.mDocId); // System.out.println(cleanText); // 
System.out.println("=============================="); luceneDoc.add(new StringField(UtilConst.FIELD_ID, inpDoc.mDocId, Field.Store.YES)); luceneDoc.add(new TextField(UtilConst.FIELD_TEXT, cleanText, Field.Store.YES)); indexWriter.addDocument(luceneDoc); if (inpDoc.mIsRel != null && qrelWriter != null) { saveQrelOneEntry(qrelWriter, inpDoc.mQueryId, inpDoc.mDocId, inpDoc.mIsRel ? MAX_GRADE : 0); } if (docNum % 1000 == 0) System.out.println(String.format("Indexed %d documents", docNum)); } } catch (ParseException e) { e.printStackTrace(); Usage("Cannot parse arguments" + e, options); } catch (Exception e) { System.err.println("Terminating due to an exception: " + e); System.exit(1); } finally { System.out.println(String.format("Indexed %d documents", docNum)); try { if (null != indexWriter) indexWriter.close(); if (null != qrelWriter) qrelWriter.close(); } catch (IOException e) { System.err.println("IO exception: " + e); e.printStackTrace(); } } }
From source file:au.org.ala.names.search.ALANameIndexer.java
License:Open Source License
/** * Creates an index writer in the specified directory. It will create/recreate * the target directory/* w ww . ja v a 2s. c om*/ * * @param directory * @param analyzer * @return * @throws Exception */ protected IndexWriter createIndexWriter(File directory, Analyzer analyzer, boolean replace) throws Exception { IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_34, analyzer); if (replace) conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE); else conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND); if (directory.exists() && replace) { FileUtils.forceDelete(directory); } FileUtils.forceMkdir(directory); IndexWriter iw = new IndexWriter(FSDirectory.open(directory), conf); return iw; }
From source file:back.Indexer.java
License:Apache License
/** Index all text files under a directory. */ public static void main(String[] args) { String usage = "java org.apache.lucene.demo.IndexFiles" + " [-index INDEX_PATH] [-docs DOCS_PATH] [-update]\n\n" + "This indexes the documents in DOCS_PATH, creating a Lucene index" + "in INDEX_PATH that can be searched with SearchFiles"; String indexPath = ".\\indexed"; String docsPath = ".//artigos"; boolean create = true; for (int i = 0; i < args.length; i++) { if ("-index".equals(args[i])) { indexPath = args[i + 1];/*from w ww . jav a 2 s .co m*/ i++; } else if ("-docs".equals(args[i])) { docsPath = args[i + 1]; i++; } else if ("-update".equals(args[i])) { create = false; } } if (docsPath == null) { System.err.println("Usage: " + usage); System.exit(1); } final File docDir = new File(docsPath); if (!docDir.exists() || !docDir.canRead()) { System.out.println("Document directory '" + docDir.getAbsolutePath() + "' does not exist or is not readable, please check the path"); System.exit(1); } Date start = new Date(); try { System.out.println("Indexing to directory '" + indexPath + "'..."); Directory dir = FSDirectory.open(new File(indexPath)); Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT, new CharArraySet(Version.LUCENE_CURRENT, 0, false)); IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_40, analyzer); if (create) { // Create a new index in the directory, removing any // previously indexed documents: iwc.setOpenMode(OpenMode.CREATE); } else { // Add new documents to an existing index: iwc.setOpenMode(OpenMode.CREATE_OR_APPEND); } // Optional: for better indexing performance, if you // are indexing many documents, increase the RAM // buffer. But if you do this, increase the max heap // size to the JVM (eg add -Xmx512m or -Xmx1g): // // iwc.setRAMBufferSizeMB(256.0); IndexWriter writer = new IndexWriter(dir, iwc); indexDocs(writer, docDir); // NOTE: if you want to maximize search performance, // you can optionally call forceMerge here. 
This can be // a terribly costly operation, so generally it's only // worth it when your index is relatively static (ie // you're done adding documents to it): // // writer.forceMerge(1); writer.close(); Date end = new Date(); System.out.println(end.getTime() - start.getTime() + " total milliseconds"); } catch (IOException e) { System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage()); } }
From source file:biospectra.index.Indexer.java
License:Apache License
/**
 * Prepares this indexer: sets up the index directory, opens the index writer, creates the
 * worker executor and fills the free queue with one blank document per worker thread.
 *
 * @param indexPath     directory of the index; created if missing, then passed to
 *                      {@code cleanUpDirectory}
 * @param kmerSize      k-mer size handed to the {@code KmerIndexAnalyzer}
 * @param minStrandKmer flag handed to the {@code KmerIndexAnalyzer}
 * @param similarity    similarity to install on the writer config; skipped when {@code null}
 * @param workerThreads number of worker threads (the executor's second argument is twice
 *                      this value)
 * @param ramBufferSize RAM buffer size in MB; applied only when greater than zero
 * @throws Exception if the directory or the index writer cannot be set up
 */
private void initialize(File indexPath, int kmerSize, boolean minStrandKmer, Similarity similarity,
        int workerThreads, int ramBufferSize) throws Exception {
    if (!indexPath.exists()) {
        indexPath.mkdirs();
    }
    if (indexPath.exists()) {
        cleanUpDirectory(indexPath);
    }

    this.indexPath = indexPath;
    this.minStrandKmer = minStrandKmer;
    this.analyzer = new KmerIndexAnalyzer(kmerSize, minStrandKmer);

    Directory indexDirectory = new MMapDirectory(this.indexPath.toPath());
    IndexWriterConfig writerConfig = new IndexWriterConfig(this.analyzer);
    if (similarity != null) {
        writerConfig.setSimilarity(similarity);
    }

    this.workerThreads = workerThreads;

    if (ramBufferSize > 0) {
        writerConfig.setRAMBufferSizeMB(ramBufferSize);
    }
    writerConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);

    this.indexWriter = new IndexWriter(indexDirectory, writerConfig);
    this.executor = new BlockingExecutor(this.workerThreads, this.workerThreads * 2);

    // One blank document per worker, each carrying the full set of fields with empty values.
    int remaining = this.workerThreads;
    while (remaining-- > 0) {
        Document blank = new Document();
        blank.add(new StringField(IndexConstants.FIELD_FILENAME, "", Field.Store.YES));
        blank.add(new StringField(IndexConstants.FIELD_HEADER, "", Field.Store.YES));
        blank.add(new StringField(IndexConstants.FIELD_SEQUENCE_DIRECTION, "", Field.Store.YES));
        blank.add(new StringField(IndexConstants.FIELD_TAXONOMY_TREE, "", Field.Store.YES));
        blank.add(new TextField(IndexConstants.FIELD_SEQUENCE, "", Field.Store.NO));
        this.freeQueue.offer(blank);
    }
}
From source file:br.andrew.lucene.testing.IndexFiles.java
License:Apache License
/** Index all text files under a directory. */ public static void main(final String[] args) { final String usage = "java org.apache.lucene.demo.IndexFiles" + " [-index INDEX_PATH] [-docs DOCS_PATH] [-update]\n\n" + "This indexes the documents in DOCS_PATH, creating a Lucene index" + "in INDEX_PATH that can be searched with SearchFiles"; final String indexPath = "index"; final File docDir = new File("data"); final Date start = new Date(); try {/*from w w w. j a va 2 s . co m*/ System.out.println("Indexing to directory '" + indexPath + "'..."); final Directory dir = FSDirectory.open(new File(indexPath)); final Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_40); final IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_40, analyzer); iwc.setOpenMode(OpenMode.CREATE); // Optional: for better indexing performance, if you // are indexing many documents, increase the RAM // buffer. But if you do this, increase the max heap // size to the JVM (eg add -Xmx512m or -Xmx1g): // // iwc.setRAMBufferSizeMB(256.0); final IndexWriter writer = new IndexWriter(dir, iwc); IndexFiles.indexDocs(writer, docDir); // NOTE: if you want to maximize search performance, // you can optionally call forceMerge here. This can be // a terribly costly operation, so generally it's only // worth it when your index is relatively static (ie // you're done adding documents to it): // // writer.forceMerge(1); writer.close(); final Date end = new Date(); System.out.println(end.getTime() - start.getTime() + " total milliseconds"); } catch (final IOException e) { System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage()); } }
From source file:br.bireme.ngrams.NGIndex.java
/**
 * Opens an index writer on {@code indexPath}.
 *
 * @param indexPath location of the index; must not be {@code null}
 * @param analyzer  analyzer for the writer configuration; must not be {@code null}
 * @param append    {@code true} to open in {@code APPEND} mode; {@code false} to delete
 *                  any {@code write.lock} file in the index directory and open in
 *                  {@code CREATE} mode
 * @return the opened writer
 * @throws IOException if the directory or the writer cannot be opened
 */
private IndexWriter getIndexWriter(final String indexPath, final Analyzer analyzer, final boolean append)
        throws IOException {
    assert indexPath != null;
    assert analyzer != null;

    final File indexFolder = new File(indexPath);
    final Directory luceneDir = FSDirectory.open(indexFolder.toPath());
    final IndexWriterConfig writerConfig = new IndexWriterConfig(analyzer);

    if (!append) {
        // NOTE(review): removes the lock file by hand before recreating the index -
        // confirm no other writer can be active on this directory.
        new File(indexFolder, "write.lock").delete();
    }
    writerConfig.setOpenMode(append
            ? IndexWriterConfig.OpenMode.APPEND
            : IndexWriterConfig.OpenMode.CREATE);

    return new IndexWriter(luceneDir, writerConfig);
}
From source file:buscador.IndexFiles.java
License:Apache License
/** * Index all text files under a directory. *//* w w w. j a va 2 s . c o m*/ public static void main(String[] args) { String usage = "java org.apache.lucene.demo.IndexFiles" + " [-index INDEX_PATH] [-docs DOCS_PATH] [-update]\n\n" + "This indexes the documents in DOCS_PATH, creating a Lucene index" + "in INDEX_PATH that can be searched with SearchFiles"; String indexPath = "Zaguan1"; String docsPath = null; boolean create = true; for (int i = 0; i < args.length; i++) { if ("-index".equals(args[i])) { indexPath = args[i + 1]; i++; } else if ("-docs".equals(args[i])) { docsPath = args[i + 1]; i++; } else if ("-update".equals(args[i])) { create = false; } } if (docsPath == null) { System.err.println("Usage: " + usage); System.exit(1); } final File docDir = new File(docsPath); if (!docDir.exists() || !docDir.canRead()) { System.out.println("Document directory '" + docDir.getAbsolutePath() + "' does not exist or is not readable, please check the path"); System.exit(1); } Date start = new Date(); try { System.out.println("Indexing to directory '" + indexPath + "'..."); Directory dir = FSDirectory.open(new File(indexPath)); Analyzer analyzer = new SpanishAnalyzer(Version.LUCENE_44); IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_44, analyzer); if (create) { // Create a new index in the directory, removing any // previously indexed documents: iwc.setOpenMode(OpenMode.CREATE); } else { // Add new documents to an existing index: iwc.setOpenMode(OpenMode.CREATE_OR_APPEND); } // Optional: for better indexing performance, if you // are indexing many documents, increase the RAM // buffer. But if you do this, increase the max heap // size to the JVM (eg add -Xmx512m or -Xmx1g): // // iwc.setRAMBufferSizeMB(256.0); IndexWriter writer = new IndexWriter(dir, iwc); indexDocs(writer, docDir); // NOTE: if you want to maximize search performance, // you can optionally call forceMerge here. 
This can be // a terribly costly operation, so generally it's only // worth it when your index is relatively static (ie // you're done adding documents to it): // // writer.forceMerge(1); writer.close(); Date end = new Date(); System.out.println(end.getTime() - start.getTime() + " total milliseconds"); } catch (IOException e) { System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage()); } }
From source file:ca.dracode.ais.indexer.FileIndexer.java
License:Open Source License
/**
 * Creates the indexer together with its {@code FileSearcher} and opens the index writer
 * on the root storage directory in {@code CREATE_OR_APPEND} mode, committing immediately.
 *
 * <p>An {@link IOException} while opening or committing is printed to standard error and
 * not propagated.
 */
public FileIndexer() {
    this.searcher = new FileSearcher();
    try {
        final Directory storage = FSDirectory.open(new File(FileIndexer.getRootStorageDir()));
        final IndexWriterConfig writerConfig =
                new IndexWriterConfig(Version.LUCENE_47, new SimpleAnalyzer(Version.LUCENE_47));
        writerConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);

        this.writer = new IndexWriter(storage, writerConfig);
        this.writer.commit();
    } catch (IOException e) {
        e.printStackTrace();
    }
}