List of usage examples for org.apache.lucene.index.IndexWriterConfig.setOpenMode
public IndexWriterConfig setOpenMode(OpenMode openMode)
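setOpenMode controls how the IndexWriter treats an index that may already exist in the target Directory: OpenMode.CREATE replaces any existing index, OpenMode.APPEND opens an existing index and fails if none is present, and OpenMode.CREATE_OR_APPEND (the default) appends when an index exists and creates one otherwise. The minimal sketch below is not taken from the examples that follow; it assumes a recent Lucene release (5.x or later, where IndexWriterConfig no longer takes a Version argument) and uses a placeholder index path.

import java.nio.file.Paths;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class OpenModeSketch {
    public static void main(String[] args) throws Exception {
        // Placeholder location; point this at your own index directory.
        Directory dir = FSDirectory.open(Paths.get("/tmp/example-index"));
        IndexWriterConfig iwc = new IndexWriterConfig(new StandardAnalyzer());
        // CREATE           - always start a fresh index, discarding an existing one
        // APPEND           - open an existing index, fail if there is none
        // CREATE_OR_APPEND - append if an index exists, otherwise create it (default)
        iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
        try (IndexWriter writer = new IndexWriter(dir, iwc)) {
            // add or update documents here
        }
    }
}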
From source file:de.maklerpoint.office.Lucene.Indexer.java
License:Open Source License
@Override
public void run() {
    try {
        Analyzer an = new StandardAnalyzer(Version.LUCENE_32);
        FSDirectory dir = FSDirectory.open(new File(indexDir));
        IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_32, an);
        LimitTokenCountAnalyzer lt = new LimitTokenCountAnalyzer(an, Integer.MAX_VALUE);
        conf.setOpenMode(OpenMode.CREATE); // always create a fresh index
        writer = new IndexWriter(dir, conf);

        int originalNumDocs = writer.numDocs();
        Log.logger.info("Anzahl indexierte Dateien: " + originalNumDocs);

        // indexFileorDir(Filesystem.getRootPath());
        indexDatabase();

        int newNumDocs = writer.numDocs();
        Log.logger.info((newNumDocs - originalNumDocs) + " neue Dokumente indexiert.");
        writer.close();
    } catch (Exception ex) {
        // Exceptions.printStackTrace(ex);
    } finally {
        if (writer != null) {
            try {
                writer.close();
            } catch (CorruptIndexException ex) {
                Exceptions.printStackTrace(ex);
            } catch (IOException ex) {
                Exceptions.printStackTrace(ex);
            }
        }
    }
}
From source file:de.mpii.docsimilarity.tasks.qe.wiki.IndexWikipediaDump.java
License:Apache License
public static void constructIndex(String indexPath, String inputPath) throws UnsupportedEncodingException,
        IOException, ClassNotFoundException, InstantiationException, IllegalAccessException {
    int threads = 24;
    WikiClean cleaner = new WikiCleanBuilder().withTitle(true).build();
    Directory dir = FSDirectory.open(Paths.get(indexPath));
    // the analyzer should be the same as the analyzer used at query time
    Analyzer analyzer = new StandardAnalyzer(new CharArraySet(Arrays.asList(StopWordsFilter.STOPWORDS), true));
    IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
    iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    iwc.setRAMBufferSizeMB(1024.0);
    IndexWriter writer = new IndexWriter(dir, iwc);
    logger.info("Creating index at " + indexPath);
    logger.info("Indexing with " + threads + " threads");
    long startTime = System.currentTimeMillis();
    try {
        WikipediaXMLDumpInputStream stream = new WikipediaXMLDumpInputStream(inputPath);
        ExecutorService executor = Executors.newFixedThreadPool(threads);
        int cnt = 0;
        String page;
        while ((page = stream.readNext()) != null) {
            String title = cleaner.getTitle(page);
            // Heuristics for filtering out non-articles in enwiki-20120104.
            if (title.startsWith("Wikipedia:") || title.startsWith("Portal:") || title.startsWith("File:")) {
                continue;
            }
            if (page.contains("#REDIRECT") || page.contains("#redirect") || page.contains("#Redirect")) {
                continue;
            }
            Runnable worker = new AddDocumentRunnable(writer, cleaner, page);
            executor.execute(worker);
            cnt++;
            if (cnt % 10000 == 0) {
                logger.info(cnt + " articles added");
            }
        }
        executor.shutdown();
        // Wait until all tasks have finished
        while (!executor.isTerminated()) {
        }
        logger.info("Total of " + cnt + " articles indexed.");
        logger.info("Total elapsed time: " + (System.currentTimeMillis() - startTime) + "ms");
    } catch (Exception ex) {
        logger.error("", ex);
    } finally {
        writer.close();
        dir.close();
    }
}
From source file:de.mpii.microblogtrack.component.thirdparty.IndexWikipediaDump.java
License:Apache License
public static void constructIndex(String indexPath, String inputPath) throws UnsupportedEncodingException,
        IOException, ClassNotFoundException, InstantiationException, IllegalAccessException {
    int threads = 16;
    WikiClean cleaner = new WikiCleanBuilder().withTitle(true).build();
    Directory dir = FSDirectory.open(Paths.get(indexPath));
    // the analyzer should be the same as the analyzer used at query time
    IndexWriterConfig iwc = new IndexWriterConfig(
            (Analyzer) Class.forName(Configuration.LUCENE_ANALYZER).newInstance());
    iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    iwc.setRAMBufferSizeMB(Configuration.LUCENE_MEM_SIZE);
    IndexWriter writer = new IndexWriter(dir, iwc);
    logger.info("Creating index at " + indexPath);
    logger.info("Indexing with " + threads + " threads");
    long startTime = System.currentTimeMillis();
    try {
        WikipediaXMLDumpInputStream stream = new WikipediaXMLDumpInputStream(inputPath);
        ExecutorService executor = Executors.newFixedThreadPool(threads);
        int cnt = 0;
        String page;
        while ((page = stream.readNext()) != null) {
            String title = cleaner.getTitle(page);
            // Heuristics for filtering out non-articles in enwiki-20120104.
            if (title.startsWith("Wikipedia:") || title.startsWith("Portal:") || title.startsWith("File:")) {
                continue;
            }
            if (page.contains("#REDIRECT") || page.contains("#redirect") || page.contains("#Redirect")) {
                continue;
            }
            Runnable worker = new AddDocumentRunnable(writer, cleaner, page);
            executor.execute(worker);
            cnt++;
            if (cnt % 10000 == 0) {
                logger.info(cnt + " articles added");
            }
        }
        executor.shutdown();
        // Wait until all tasks have finished
        while (!executor.isTerminated()) {
        }
        logger.info("Total of " + cnt + " articles indexed.");
        logger.info("Total elapsed time: " + (System.currentTimeMillis() - startTime) + "ms");
    } catch (Exception ex) {
        logger.error("", ex);
    } finally {
        writer.close();
        dir.close();
    }
}
From source file:de.tudarmstadt.lt.lm.app.GenerateNgramIndex.java
License:Apache License
public void create_ngram_index(File ngram_joined_counts_file) throws IOException {
    File index_dir = new File(_index_dir, "ngram");
    if (index_dir.exists()) {
        LOG.info("Ngram index already exists in directory '{}'.", index_dir.getAbsolutePath());
        if (_overwrite) {
            LOG.info("Overwriting index '{}',", index_dir);
            index_dir.delete();
        } else
            return;
    }
    index_dir.mkdirs();

    Analyzer analyzer = new KeywordAnalyzer();
    IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_4_9, analyzer);
    iwc.setOpenMode(OpenMode.CREATE);
    // use 80 percent of the available total memory
    double total_mem_mb = (double) Runtime.getRuntime().maxMemory() / 1e6;
    double percentage_ram_buffer = Properties.ramBufferPercentage();
    if (percentage_ram_buffer > 0) {
        double percentage_ram_buffer_mb = total_mem_mb * percentage_ram_buffer;
        LOG.info(String.format("Setting ram buffer size to %.2f MB (%.2f%% from %.2f MB)",
                percentage_ram_buffer_mb, percentage_ram_buffer * 100, total_mem_mb));
        iwc.setRAMBufferSizeMB(percentage_ram_buffer_mb);
    }
    Directory directory = new MMapDirectory(index_dir);
    IndexWriter writer_ngram = new IndexWriter(directory, iwc);

    InputStream in = new FileInputStream(ngram_joined_counts_file);
    if (ngram_joined_counts_file.getName().endsWith(".gz"))
        in = new GZIPInputStream(in);
    LineIterator iter = new LineIterator(new BufferedReader(new InputStreamReader(in, "UTF-8")));

    Document doc = new Document();
    Field f_ngram = new StringField("ngram", "", Store.YES);
    doc.add(f_ngram);
    Field f_n = new IntField("cardinality", 0, Store.YES);
    doc.add(f_n);
    Field f_word = new StringField("word", "", Store.YES);
    doc.add(f_word);
    Field f_hist = new StringField("history", "", Store.YES);
    doc.add(f_hist);
    Field f_lower = new StringField("lower", "", Store.YES);
    doc.add(f_lower);
    Field f_count = new StoredField("num", 0L);
    doc.add(f_count);
    Field[] f_follow = new Field[4];
    f_follow[0] = new StoredField("nf_s", 0L);
    doc.add(f_follow[0]);
    f_follow[1] = new StoredField("nf_N1", 0L);
    doc.add(f_follow[1]);
    f_follow[2] = new StoredField("nf_N2", 0L);
    doc.add(f_follow[2]);
    f_follow[3] = new StoredField("nf_N3", 0L);
    doc.add(f_follow[3]);
    Field[] f_precede = new Field[4];
    f_precede[0] = new StoredField("np_s", 0L);
    doc.add(f_precede[0]);
    f_precede[1] = new StoredField("np_N1", 0L);
    doc.add(f_precede[1]);
    f_precede[2] = new StoredField("np_N2", 0L);
    doc.add(f_precede[2]);
    f_precede[3] = new StoredField("np_N3", 0L);
    doc.add(f_precede[3]);
    Field[] f_followerprecede = new Field[4];
    f_followerprecede[0] = new StoredField("nfp_s", 0L);
    doc.add(f_followerprecede[0]);
    f_followerprecede[1] = new StoredField("nfp_N1", 0L);
    doc.add(f_followerprecede[1]);
    f_followerprecede[2] = new StoredField("nfp_N2", 0L);
    doc.add(f_followerprecede[2]);
    f_followerprecede[3] = new StoredField("nfp_N3", 0L);
    doc.add(f_followerprecede[3]);

    Long[][] N = new Long[][] { { 0L, 0L, 0L, 0L, 0L, 0L } };
    Long[] S = new Long[] { 0L };
    long c = 0;
    while (iter.hasNext()) {
        if (++c % 100000 == 0)
            LOG.info("Adding {}'th ngram.", c);
        String line = iter.next();
        try {
            String[] splits = de.tudarmstadt.lt.utilities.StringUtils.rtrim(line).split("\t");
            String ngram_str = splits[0];
            if (de.tudarmstadt.lt.utilities.StringUtils.trim(ngram_str).isEmpty()) {
                LOG.warn("Ngram is empty, skipping line {}: '{}' (file '{}').", c, line, ngram_joined_counts_file);
                continue;
            }
            List<String> ngram = Arrays.asList(ngram_str.split(" "));
            long num = Long.parseLong(splits[1]);
            int n = ngram.size();

            f_ngram.setStringValue(ngram_str);
            f_n.setIntValue(n);
            f_word.setStringValue(ngram.get(ngram.size() - 1));
            f_hist.setStringValue(StringUtils.join(ngram.subList(0, ngram.size() - 1), " "));
            f_lower.setStringValue(StringUtils.join(ngram.subList(1, ngram.size()), " "));
            f_count.setLongValue(num);
            for (int j = 0; j < f_follow.length; j++) {
                f_follow[j].setLongValue(0L);
                f_precede[j].setLongValue(0L);
                f_followerprecede[j].setLongValue(0L);
            }

            if (splits.length > 2 && !splits[2].isEmpty()) {
                // precede or follow or followerprecede
                String[] splits_ = splits[2].split(":");
                String type = splits_[0];
                String[] count_values = splits_[1].split(",");
                if (count_values.length > 0) {
                    if ("n_f".equals(type)) f_follow[0].setLongValue(Long.parseLong(count_values[0]));
                    else if ("n_p".equals(type)) f_precede[0].setLongValue(Long.parseLong(count_values[0]));
                    else if ("n_fp".equals(type)) f_followerprecede[0].setLongValue(Long.parseLong(count_values[0]));
                }
                for (int i = 1; i < count_values.length; i++) {
                    if ("n_f".equals(type)) f_follow[i].setLongValue(Long.parseLong(count_values[i]));
                    else if ("n_p".equals(type)) f_precede[i].setLongValue(Long.parseLong(count_values[i]));
                    else if ("n_fp".equals(type)) f_followerprecede[i].setLongValue(Long.parseLong(count_values[i]));
                }
            }
            if (splits.length > 3 && !splits[3].isEmpty()) {
                // should be follow or followerprecede
                String[] splits_ = splits[3].split(":");
                String type = splits_[0];
                String[] count_values = splits_[1].split(",");
                if (count_values.length > 0) {
                    if ("n_f".equals(type)) f_follow[0].setLongValue(Long.parseLong(count_values[0]));
                    else if ("n_p".equals(type)) f_precede[0].setLongValue(Long.parseLong(count_values[0]));
                    else if ("n_fp".equals(type)) f_followerprecede[0].setLongValue(Long.parseLong(count_values[0]));
                }
                for (int i = 1; i < count_values.length; i++) {
                    if ("n_f".equals(type)) f_follow[i].setLongValue(Long.parseLong(count_values[i]));
                    else if ("n_p".equals(type)) f_precede[i].setLongValue(Long.parseLong(count_values[i]));
                    else if ("n_fp".equals(type)) f_followerprecede[i].setLongValue(Long.parseLong(count_values[i]));
                }
            }
            if (splits.length > 4 && !splits[4].isEmpty()) {
                // should be followerprecede
                String[] splits_ = splits[4].split(":");
                String type = splits_[0];
                String[] count_values = splits_[1].split(",");
                if (count_values.length > 0) {
                    if ("n_f".equals(type)) f_follow[0].setLongValue(Long.parseLong(count_values[0]));
                    else if ("n_p".equals(type)) f_precede[0].setLongValue(Long.parseLong(count_values[0]));
                    else if ("n_fp".equals(type)) f_followerprecede[0].setLongValue(Long.parseLong(count_values[0]));
                }
                for (int i = 1; i < count_values.length; i++) {
                    if ("n_f".equals(type)) f_follow[i].setLongValue(Long.parseLong(count_values[i]));
                    else if ("n_p".equals(type)) f_precede[i].setLongValue(Long.parseLong(count_values[i]));
                    else if ("n_fp".equals(type)) f_followerprecede[i].setLongValue(Long.parseLong(count_values[i]));
                }
            }

            writer_ngram.addDocument(doc);

            while (N.length <= n) {
                N = ArrayUtils.getConcatinatedArray(N, new Long[][] { { 0L, 0L, 0L, 0L, 0L, 0L } });
                S = ArrayUtils.getConcatinatedArray(S, new Long[] { 0L });
            }
            if (num == 1L) N[n][1]++;
            else if (num == 2L) N[n][2]++;
            else if (num == 3L) N[n][3]++;
            else if (num == 4L) N[n][4]++;
            else N[n][5]++;
            N[n][0]++;
            S[n] += num;
        } catch (Exception e) {
            LOG.error("Could not process line '{}' in file '{}:{}', malformed line.", line, ngram_joined_counts_file, c, e);
        }
    }
    writer_ngram.forceMergeDeletes();
    writer_ngram.commit();
    writer_ngram.close();

    StringBuilder b = new StringBuilder(String.format(
            "#%n# Number of times where an ngram occurred: %n# at_least_once, exactly_once, exactly_twice, exactly_three_times, exactly_four_times, five_times_or_more.%n#%nmax_n=%d%nmax_c=6%n",
            N.length - 1));
    for (int n = 1; n < N.length; n++)
        b.append(String.format("n%d=%s%n", n, StringUtils.join(N[n], ',')));
    for (int n = 1; n < S.length; n++)
        b.append(String.format("s%d=%d%n", n, S[n]));
    FileUtils.writeStringToFile(new File(_index_dir, "__sum_ngrams__"), b.toString());
}
From source file:de.tudarmstadt.lt.lm.app.GenerateNgramIndex.java
License:Apache License
public void create_vocabulary_index(File vocabulary_file) throws IOException {
    File index_dir = new File(_index_dir, "vocab");
    if (index_dir.exists()) {
        LOG.info("Vocabulary index already exists in directory '{}'.", index_dir.getAbsolutePath());
        if (_overwrite) {
            LOG.info("Overwriting index '{}',", index_dir);
            index_dir.delete();
        } else
            return;
    }
    index_dir.mkdirs();

    Analyzer analyzer = new KeywordAnalyzer();
    IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_4_9, analyzer);
    iwc.setOpenMode(OpenMode.CREATE);
    iwc.setRAMBufferSizeMB(1024.0);
    Directory directory = new MMapDirectory(index_dir);
    IndexWriter writer_vocab = new IndexWriter(directory, iwc);

    InputStream in = new FileInputStream(vocabulary_file);
    if (vocabulary_file.getName().endsWith(".gz"))
        in = new GZIPInputStream(in);
    LineIterator iter = new LineIterator(new BufferedReader(new InputStreamReader(in, "UTF-8")));

    Document doc = new Document();
    Field f_word = new StringField("word", "", Field.Store.YES);
    doc.add(f_word);

    long c = 0;
    while (iter.hasNext()) {
        if (++c % 10000 == 0)
            LOG.info("Adding {}'th word.", c);
        String line = iter.next();
        try {
            String word = line.trim();
            f_word.setStringValue(word);
            writer_vocab.addDocument(doc);
        } catch (Exception e) {
            LOG.warn("Could not process line '{}' in file '{}', malformed line.", line, vocabulary_file, e);
        }
    }
    writer_vocab.forceMergeDeletes();
    writer_vocab.commit();
    writer_vocab.close();
}
From source file:de.tudarmstadt.ukp.experiments.argumentation.clustering.debatefiltering.LuceneIndexer.java
License:Apache License
@Override
public void initialize(UimaContext context) throws ResourceInitializationException {
    super.initialize(context);
    try {
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_44);
        directory = FSDirectory.open(luceneIndexDir);
        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_44, analyzer);
        // Create a new index, removing any previously indexed documents:
        iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
        iwriter = new IndexWriter(directory, iwc);
    } catch (IOException e) {
        e.printStackTrace();
    }
}
From source file:de.uni_koeln.spinfo.maalr.lucene.core.DictionaryCreator.java
License:Apache License
private IndexWriter initIndexWriter() throws IOException {
    IndexWriterConfig writerConfig = new IndexWriterConfig(LuceneHelper.CURRENT, analyzer);
    if (!indexAvailable()) {
        writerConfig.setOpenMode(OpenMode.CREATE);
    } else {
        writerConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);
    }
    writerConfig.setRAMBufferSizeMB(512.0);
    IndexWriter writer = new IndexWriter(indexDirectory, writerConfig);
    return writer;
}
From source file:de.uni_koeln.spinfo.maalr.lucene.core.DictionaryLoader.java
License:Apache License
private IndexWriter initIndexWriter() throws IOException {
    IndexWriterConfig writerConfig = new IndexWriterConfig(LuceneHelper.CURRENT, LuceneHelper.newAnalyzer());
    writerConfig.setOpenMode(OpenMode.APPEND);
    writerConfig.setRAMBufferSizeMB(512.0);
    IndexWriter writer = new IndexWriter(ram, writerConfig);
    return writer;
}
From source file:de.uni_marburg.splittr.indexing.CustomFileIndexer.java
License:Apache License
/** Index all text files under a directory. */
public static void main(String[] args) {
    String usage = "java org.apache.lucene.demo.IndexFiles"
            + " [-index INDEX_PATH] [-docs DOCS_PATH] [-update]\n\n"
            + "This indexes the documents in DOCS_PATH, creating a Lucene index "
            + "in INDEX_PATH that can be searched with SearchFiles";
    String indexPath = "index";
    String docsPath = null;
    boolean create = true;
    for (int i = 0; i < args.length; i++) {
        if ("-index".equals(args[i])) {
            indexPath = args[i + 1];
            i++;
        } else if ("-docs".equals(args[i])) {
            docsPath = args[i + 1];
            i++;
        } else if ("-update".equals(args[i])) {
            create = false;
        }
    }

    if (docsPath == null) {
        System.err.println("Usage: " + usage);
        System.exit(1);
    }

    final File docDir = new File(docsPath);
    if (!docDir.exists() || !docDir.canRead()) {
        System.out.println("Document directory '" + docDir.getAbsolutePath()
                + "' does not exist or is not readable, please check the path");
        System.exit(1);
    }

    Date start = new Date();
    try {
        System.out.println("Indexing to directory '" + indexPath + "'...");
        Directory dir = FSDirectory.open(new File(indexPath));
        Analyzer analyzer = new MinimalStemmingAnalyzer();
        // Analyzer analyzer = new EnglishAnalyzer(LUCENE_VERSION);
        IndexWriterConfig iwc = new IndexWriterConfig(LUCENE_VERSION, analyzer);

        if (create) {
            // Create a new index in the directory, removing any
            // previously indexed documents:
            iwc.setOpenMode(OpenMode.CREATE);
        } else {
            // Add new documents to an existing index:
            iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
        }

        // Optional: for better indexing performance, if you
        // are indexing many documents, increase the RAM
        // buffer. But if you do this, increase the max heap
        // size to the JVM (eg add -Xmx512m or -Xmx1g):
        //
        // iwc.setRAMBufferSizeMB(256.0);

        IndexWriter writer = new IndexWriter(dir, iwc);
        indexDocs(writer, docDir);

        // NOTE: if you want to maximize search performance,
        // you can optionally call forceMerge here. This can be
        // a terribly costly operation, so generally it's only
        // worth it when your index is relatively static (ie
        // you're done adding documents to it):
        //
        // writer.forceMerge(1);

        writer.close();

        Date end = new Date();
        System.out.println(end.getTime() - start.getTime() + " total milliseconds");
    } catch (IOException e) {
        System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage());
    }
}
From source file:de.walware.statet.r.internal.core.rhelp.index.REnvIndexWriter.java
License:Open Source License
public void beginBatch(final boolean reset) throws AbortIndexException {
    if (this.luceneWriter != null) {
        throw new IllegalStateException();
    }
    this.status = new MultiStatus(RCore.PLUGIN_ID, 0, "Indexing: '" + this.rEnvConfig.getName() + "'.", null); //$NON-NLS-1$ //$NON-NLS-2$
    this.status.add(new Status(IStatus.INFO, RCore.PLUGIN_ID,
            "Beginning batch (index directory= '" + this.indexDirectory.getAbsolutePath() + "')."));
    try {
        final RHelpManager rHelpManager = RCorePlugin.getDefault().getRHelpManager();
        this.indexLock = rHelpManager.getIndexLock(this.rEnvConfig.getReference());
        synchronized (this.indexLock) {
            this.reset = reset;
            this.luceneDirectory = new SimpleFSDirectory(this.indexDirectory);
            if (!reset) {
                final REnvHelp oldHelp = rHelpManager.getHelp(this.rEnvConfig.getReference());
                try (final IndexReader dirReader = DirectoryReader.open(this.luceneDirectory)) {
                    this.existingPackages = new HashMap<>(64);
                    TermsEnum termsEnum = null;
                    for (final AtomicReaderContext leave : dirReader.leaves()) {
                        final AtomicReader aReader = leave.reader();
                        final Terms terms = aReader.terms(PACKAGE_FIELD_NAME);
                        if (terms != null) {
                            termsEnum = terms.iterator(termsEnum);
                            BytesRef term;
                            while ((term = termsEnum.next()) != null) {
                                final String name = term.utf8ToString();
                                final IRPkgHelp pkgHelp = (oldHelp != null) ? oldHelp.getRPackage(name) : null;
                                this.existingPackages.put(name, pkgHelp);
                            }
                        }
                    }
                    final IndexWriterConfig config = createWriterConfig();
                    config.setOpenMode(OpenMode.CREATE_OR_APPEND);
                    this.luceneWriter = new IndexWriter(this.luceneDirectory, config);
                } catch (final IOException e) {
                    assert (this.luceneWriter == null); // fall through and create a new index below
                } finally {
                    if (oldHelp != null) {
                        oldHelp.unlock();
                    }
                }
            }
            if (this.luceneWriter == null) {
                this.reset = true;
                this.existingPackages = new HashMap<>(0);
                final IndexWriterConfig config = createWriterConfig();
                config.setOpenMode(OpenMode.CREATE);
                this.luceneWriter = new IndexWriter(this.luceneDirectory, config);
            }
        }
        this.packages = new LinkedHashMap<>();
        this.keywordGroups = new LinkedHashMap<>();
    } catch (final IOException e) {
        throw new AbortIndexException(e);
    } catch (final OutOfMemoryError e) {
        throw new AbortIndexException(e);
    }
}