List of usage examples for org.apache.lucene.index IndexWriter addDocument
public long addDocument(Iterable<? extends IndexableField> doc) throws IOException
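Before the project-specific examples below, a minimal sketch of the call sequence. All names and paths here are placeholders rather than code from any of the listed projects, and the IndexWriterConfig constructor without a Version argument assumes Lucene 5.x or newer:

    import java.nio.file.Paths;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.document.StringField;
    import org.apache.lucene.document.TextField;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.IndexWriterConfig;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;

    // Open a writer, add one document, commit; try-with-resources closes the writer.
    Directory dir = FSDirectory.open(Paths.get("/tmp/example-index")); // placeholder path
    IndexWriterConfig iwc = new IndexWriterConfig(new StandardAnalyzer());
    try (IndexWriter writer = new IndexWriter(dir, iwc)) {
        Document doc = new Document();
        doc.add(new StringField("id", "doc-1", Field.Store.YES));             // exact-match key, not tokenized
        doc.add(new TextField("body", "some analyzed text", Field.Store.NO)); // tokenized full text
        writer.addDocument(doc); // returns a sequence number in recent Lucene versions
        writer.commit();
    }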
From source file:de.tudarmstadt.lt.lm.app.GenerateNgramIndex.java
License:Apache License
public void create_ngram_index(File ngram_joined_counts_file) throws IOException {
    File index_dir = new File(_index_dir, "ngram");
    if (index_dir.exists()) {
        LOG.info("Ngram index already exists in directory '{}'.", index_dir.getAbsolutePath());
        if (_overwrite) {
            LOG.info("Overwriting index '{}',", index_dir);
            index_dir.delete();
        } else
            return;
    }
    index_dir.mkdirs();
    Analyzer analyzer = new KeywordAnalyzer();
    IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_4_9, analyzer);
    iwc.setOpenMode(OpenMode.CREATE);
    // use a configured percentage of the available total memory as ram buffer
    double total_mem_mb = (double) Runtime.getRuntime().maxMemory() / 1e6;
    double percentage_ram_buffer = Properties.ramBufferPercentage();
    if (percentage_ram_buffer > 0) {
        double percentage_ram_buffer_mb = total_mem_mb * percentage_ram_buffer;
        LOG.info(String.format("Setting ram buffer size to %.2f MB (%.2f%% from %.2f MB)",
                percentage_ram_buffer_mb, percentage_ram_buffer * 100, total_mem_mb));
        iwc.setRAMBufferSizeMB(percentage_ram_buffer_mb);
    }
    Directory directory = new MMapDirectory(index_dir);
    IndexWriter writer_ngram = new IndexWriter(directory, iwc);

    InputStream in = new FileInputStream(ngram_joined_counts_file);
    if (ngram_joined_counts_file.getName().endsWith(".gz"))
        in = new GZIPInputStream(in);
    LineIterator iter = new LineIterator(new BufferedReader(new InputStreamReader(in, "UTF-8")));

    Document doc = new Document();
    Field f_ngram = new StringField("ngram", "", Store.YES);    doc.add(f_ngram);
    Field f_n = new IntField("cardinality", 0, Store.YES);      doc.add(f_n);
    Field f_word = new StringField("word", "", Store.YES);      doc.add(f_word);
    Field f_hist = new StringField("history", "", Store.YES);   doc.add(f_hist);
    Field f_lower = new StringField("lower", "", Store.YES);    doc.add(f_lower);
    Field f_count = new StoredField("num", 0L);                 doc.add(f_count);
    Field[] f_follow = new Field[4];
    f_follow[0] = new StoredField("nf_s", 0L);   doc.add(f_follow[0]);
    f_follow[1] = new StoredField("nf_N1", 0L);  doc.add(f_follow[1]);
    f_follow[2] = new StoredField("nf_N2", 0L);  doc.add(f_follow[2]);
    f_follow[3] = new StoredField("nf_N3", 0L);  doc.add(f_follow[3]);
    Field[] f_precede = new Field[4];
    f_precede[0] = new StoredField("np_s", 0L);  doc.add(f_precede[0]);
    f_precede[1] = new StoredField("np_N1", 0L); doc.add(f_precede[1]);
    f_precede[2] = new StoredField("np_N2", 0L); doc.add(f_precede[2]);
    f_precede[3] = new StoredField("np_N3", 0L); doc.add(f_precede[3]);
    Field[] f_followerprecede = new Field[4];
    f_followerprecede[0] = new StoredField("nfp_s", 0L);  doc.add(f_followerprecede[0]);
    f_followerprecede[1] = new StoredField("nfp_N1", 0L); doc.add(f_followerprecede[1]);
    f_followerprecede[2] = new StoredField("nfp_N2", 0L); doc.add(f_followerprecede[2]);
    f_followerprecede[3] = new StoredField("nfp_N3", 0L); doc.add(f_followerprecede[3]);

    Long[][] N = new Long[][] { { 0L, 0L, 0L, 0L, 0L, 0L } };
    Long[] S = new Long[] { 0L };
    long c = 0;
    while (iter.hasNext()) {
        if (++c % 100000 == 0)
            LOG.info("Adding {}'th ngram.", c);
        String line = iter.next();
        try {
            String[] splits = de.tudarmstadt.lt.utilities.StringUtils.rtrim(line).split("\t");
            String ngram_str = splits[0];
            if (de.tudarmstadt.lt.utilities.StringUtils.trim(ngram_str).isEmpty()) {
                LOG.warn("Ngram is empty, skipping line {}: '{}' (file '{}').", c, line, ngram_joined_counts_file);
                continue;
            }
            List<String> ngram = Arrays.asList(ngram_str.split(" "));
            long num = Long.parseLong(splits[1]);
            int n = ngram.size();
            f_ngram.setStringValue(ngram_str);
            f_n.setIntValue(n);
            f_word.setStringValue(ngram.get(ngram.size() - 1));
            f_hist.setStringValue(StringUtils.join(ngram.subList(0, ngram.size() - 1), " "));
            f_lower.setStringValue(StringUtils.join(ngram.subList(1, ngram.size()), " "));
            f_count.setLongValue(num);
            for (int j = 0; j < f_follow.length; j++) {
                f_follow[j].setLongValue(0L);
                f_precede[j].setLongValue(0L);
                f_followerprecede[j].setLongValue(0L);
            }
            if (splits.length > 2 && !splits[2].isEmpty()) { // precede or follow or followerprecede
                String[] splits_ = splits[2].split(":");
                String type = splits_[0];
                String[] count_values = splits_[1].split(",");
                if (count_values.length > 0) {
                    if ("n_f".equals(type)) f_follow[0].setLongValue(Long.parseLong(count_values[0]));
                    else if ("n_p".equals(type)) f_precede[0].setLongValue(Long.parseLong(count_values[0]));
                    else if ("n_fp".equals(type)) f_followerprecede[0].setLongValue(Long.parseLong(count_values[0]));
                }
                for (int i = 1; i < count_values.length; i++) {
                    if ("n_f".equals(type)) f_follow[i].setLongValue(Long.parseLong(count_values[i]));
                    else if ("n_p".equals(type)) f_precede[i].setLongValue(Long.parseLong(count_values[i]));
                    else if ("n_fp".equals(type)) f_followerprecede[i].setLongValue(Long.parseLong(count_values[i]));
                }
            }
            if (splits.length > 3 && !splits[3].isEmpty()) { // should be follow or followerprecede
                String[] splits_ = splits[3].split(":");
                String type = splits_[0];
                String[] count_values = splits_[1].split(",");
                if (count_values.length > 0) {
                    if ("n_f".equals(type)) f_follow[0].setLongValue(Long.parseLong(count_values[0]));
                    else if ("n_p".equals(type)) f_precede[0].setLongValue(Long.parseLong(count_values[0]));
                    else if ("n_fp".equals(type)) f_followerprecede[0].setLongValue(Long.parseLong(count_values[0]));
                }
                for (int i = 1; i < count_values.length; i++) {
                    if ("n_f".equals(type)) f_follow[i].setLongValue(Long.parseLong(count_values[i]));
                    else if ("n_p".equals(type)) f_precede[i].setLongValue(Long.parseLong(count_values[i]));
                    else if ("n_fp".equals(type)) f_followerprecede[i].setLongValue(Long.parseLong(count_values[i]));
                }
            }
            if (splits.length > 4 && !splits[4].isEmpty()) { // should be followerprecede
                String[] splits_ = splits[4].split(":");
                String type = splits_[0];
                String[] count_values = splits_[1].split(",");
                if (count_values.length > 0) {
                    if ("n_f".equals(type)) f_follow[0].setLongValue(Long.parseLong(count_values[0]));
                    else if ("n_p".equals(type)) f_precede[0].setLongValue(Long.parseLong(count_values[0]));
                    else if ("n_fp".equals(type)) f_followerprecede[0].setLongValue(Long.parseLong(count_values[0]));
                }
                for (int i = 1; i < count_values.length; i++) {
                    if ("n_f".equals(type)) f_follow[i].setLongValue(Long.parseLong(count_values[i]));
                    else if ("n_p".equals(type)) f_precede[i].setLongValue(Long.parseLong(count_values[i]));
                    else if ("n_fp".equals(type)) f_followerprecede[i].setLongValue(Long.parseLong(count_values[i]));
                }
            }
            writer_ngram.addDocument(doc);
            while (N.length <= n) {
                N = ArrayUtils.getConcatinatedArray(N, new Long[][] { { 0L, 0L, 0L, 0L, 0L, 0L } });
                S = ArrayUtils.getConcatinatedArray(S, new Long[] { 0L });
            }
            if (num == 1L) N[n][1]++;
            else if (num == 2L) N[n][2]++;
            else if (num == 3L) N[n][3]++;
            else if (num == 4L) N[n][4]++;
            else N[n][5]++;
            N[n][0]++;
            S[n] += num;
        } catch (Exception e) {
            LOG.error("Could not process line '{}' in file '{}:{}', malformed line.", line,
                    ngram_joined_counts_file, c, e);
        }
    }
    writer_ngram.forceMergeDeletes();
    writer_ngram.commit();
    writer_ngram.close();

    StringBuilder b = new StringBuilder(String.format(
            "#%n# Number of times where an ngram occurred: %n# at_least_once, exactly_once, exactly_twice, exactly_three_times, exactly_four_times, five_times_or_more.%n#%nmax_n=%d%nmax_c=6%n",
            N.length - 1));
    for (int n = 1; n < N.length; n++)
        b.append(String.format("n%d=%s%n", n, StringUtils.join(N[n], ',')));
    for (int n = 1; n < S.length; n++)
        b.append(String.format("s%d=%d%n", n, S[n]));
    FileUtils.writeStringToFile(new File(_index_dir, "__sum_ngrams__"), b.toString());
}
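The example above creates one Document and its Field objects once, then only overwrites their values with setStringValue/setIntValue/setLongValue before each addDocument call, which avoids allocating new field objects for millions of n-grams. A stripped-down sketch of that reuse pattern, with illustrative field names and a hypothetical rows iterable and already-open writer:

    Document doc = new Document();
    StringField ngramField = new StringField("ngram", "", Field.Store.YES);
    StoredField countField = new StoredField("num", 0L);
    doc.add(ngramField);
    doc.add(countField);
    for (String[] row : rows) {            // row[0] = ngram, row[1] = count (hypothetical input)
        ngramField.setStringValue(row[0]); // overwrite the values in place ...
        countField.setLongValue(Long.parseLong(row[1]));
        writer.addDocument(doc);           // ... and re-add the same Document instance
    }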
From source file:de.tudarmstadt.lt.lm.app.GenerateNgramIndex.java
License:Apache License
public void create_vocabulary_index(File vocabulary_file) throws IOException {
    File index_dir = new File(_index_dir, "vocab");
    if (index_dir.exists()) {
        LOG.info("Vocabulary index already exists in directory '{}'.", index_dir.getAbsolutePath());
        if (_overwrite) {
            LOG.info("Overwriting index '{}',", index_dir);
            index_dir.delete();
        } else
            return;
    }
    index_dir.mkdirs();
    Analyzer analyzer = new KeywordAnalyzer();
    IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_4_9, analyzer);
    iwc.setOpenMode(OpenMode.CREATE);
    iwc.setRAMBufferSizeMB(1024.0);
    Directory directory = new MMapDirectory(index_dir);
    IndexWriter writer_vocab = new IndexWriter(directory, iwc);

    InputStream in = new FileInputStream(vocabulary_file);
    if (vocabulary_file.getName().endsWith(".gz"))
        in = new GZIPInputStream(in);
    LineIterator iter = new LineIterator(new BufferedReader(new InputStreamReader(in, "UTF-8")));

    Document doc = new Document();
    Field f_word = new StringField("word", "", Field.Store.YES);
    doc.add(f_word);
    long c = 0;
    while (iter.hasNext()) {
        if (++c % 10000 == 0)
            LOG.info("Adding {}'th word.", c);
        String line = iter.next();
        try {
            String word = line.trim();
            f_word.setStringValue(word);
            writer_vocab.addDocument(doc);
        } catch (Exception e) {
            LOG.warn("Could not process line '{}' in file '{}', malformed line.", line, vocabulary_file, e);
        }
    }
    writer_vocab.forceMergeDeletes();
    writer_vocab.commit();
    writer_vocab.close();
}
From source file:de.u808.simpleinquest.indexer.impl.IndexUpdater.java
License:Apache License
private void indexDocuments(List<File> files) throws CorruptIndexException, LockObtainFailedException, IOException {
    IndexWriter indexWriter = new IndexWriter(indexDirectory, new StandardAnalyzer());
    Iterator<File> iterator = files.iterator();
    while (iterator.hasNext()) {
        File file = (File) iterator.next();
        if (file.isDirectory()) {
            Document doc = DirectoryDocument.Document(file);
            indexWriter.addDocument(doc);
        } else {
            Indexer indexer = indexerFactory.getIndexer(file);
            if (indexer != null) {
                Document document = null;
                try {
                    log.debug("Memory before indexing in MB (M: "
                            + memoryFormater.format(Runtime.getRuntime().maxMemory() / (1024 * 1024)) + " T: "
                            + memoryFormater.format(Runtime.getRuntime().totalMemory() / (1024 * 1024)) + "F: "
                            + memoryFormater.format(Runtime.getRuntime().freeMemory() / (1024 * 1024)) + ")");
                    this.ensureEnoughHeapMemory();
                    String msg = "Indexing file: " + file.getPath();
                    document = indexer.indexFile(file);
                    this.setStatusMessage(msg);
                    log.info(msg);
                    log.debug("Memory after indexing in MB (M: "
                            + memoryFormater.format(Runtime.getRuntime().maxMemory() / (1024 * 1024)) + " T: "
                            + memoryFormater.format(Runtime.getRuntime().totalMemory() / (1024 * 1024)) + " F: "
                            + memoryFormater.format(Runtime.getRuntime().freeMemory() / (1024 * 1024)) + ")");
                    iterator.remove();
                } catch (IndexerException e) {
                    log.error("Error during indexing", e);
                } catch (OutOfMemoryError outOfMemoryError) {
                    log.warn("File seems to be to big for the actual free heap. Try to increase availible memory with vm option -Xmx if this is a recurring error message");
                    log.info("Try to free memory");
                    document = null;
                    System.gc();
                    this.refreschIndex();
                }
                if (document != null) {
                    indexWriter.addDocument(document);
                } else {
                    String msg = "Indexer " + indexer.getClass() + " returned no content to index";
                    this.setStatusMessage(msg);
                    log.warn(msg);
                }
            } else {
                log.debug("No indexer for file: " + file.getPath());
            }
        }
    }
    String msg = "Optimizing index";
    this.setStatusMessage(msg);
    log.info(msg);
    indexWriter.flush();
    indexWriter.optimize();
    msg = "Index optimized";
    this.setStatusMessage(msg);
    log.info(msg);
    indexWriter.close(true);
    indexWriter = null;
}
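This example targets the pre-4.0 IndexWriter API: optimize() was removed in Lucene 4.0, and close(boolean) followed in later releases. On newer versions the closest equivalent of the final optimize-and-close sequence would be roughly the following sketch (not part of the original source):

    indexWriter.forceMerge(1); // closest replacement for optimize(): merge down to one segment
    indexWriter.commit();      // make the pending changes durable
    indexWriter.close();       // recent versions commit on close unless configured otherwise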
From source file:de.unidue.inf.is.ezdl.dlservices.search.handlers.ranking.LuceneRanker.java
License:Open Source License
private void createIndex(ResultDocumentList toRank, IndexWriter indexWriter) throws CorruptIndexException, IOException {
    for (ResultDocument result : toRank) {
        Document document = result.getDocument();
        org.apache.lucene.document.Document d = new org.apache.lucene.document.Document();
        StringBuilder sb = new StringBuilder();
        String oid = document.getOid();
        Field.Store store = Field.Store.NO;
        Field field;
        if (!StringUtils.isEmpty(oid)) {
            field = new Field("oid", oid, Field.Store.YES, Field.Index.NO);
            d.add(field);
            String title = document.getTitle();
            if (!StringUtils.isEmpty(title)) {
                field = new Field(de.unidue.inf.is.ezdl.dlcore.data.fields.Field.TITLE.toString(), title, store,
                        Field.Index.ANALYZED);
                field.setOmitNorms(true);
                field.setBoost(2.0f);
                d.add(field);
                sb.append(title);
                sb.append(" ");
            }
            if (document instanceof TextDocument) {
                String docAbstract = ((TextDocument) document).getAbstract();
                if (!StringUtils.isEmpty(docAbstract)) {
                    field = new Field(de.unidue.inf.is.ezdl.dlcore.data.fields.Field.ABSTRACT.toString(),
                            docAbstract, store, Field.Index.ANALYZED);
                    d.add(field);
                    sb.append(docAbstract);
                    sb.append(" ");
                }
            }
            int year = document.getYear();
            if (year != 0) {
                field = new Field(de.unidue.inf.is.ezdl.dlcore.data.fields.Field.YEAR.toString(),
                        String.valueOf(year), store, Field.Index.NOT_ANALYZED);
                d.add(field);
                sb.append(" ");
                sb.append(year);
            }
            PersonList authorList = document.getAuthorList();
            if (authorList != null) {
                field = new Field(de.unidue.inf.is.ezdl.dlcore.data.fields.Field.AUTHOR.toString(),
                        authorList.toString(), store, Field.Index.ANALYZED);
                d.add(field);
                sb.append(authorList.toString());
            }
            field = new Field(de.unidue.inf.is.ezdl.dlcore.data.fields.Field.TEXT.toString(), sb.toString(),
                    store, Field.Index.ANALYZED);
            d.add(field);
            indexWriter.addDocument(d);
        }
    }
    indexWriter.commit();
}
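This ranker uses the pre-4.0 Field(String, String, Field.Store, Field.Index) constructor. In Lucene 4 and later the same intent is normally expressed with typed fields and a FieldType; a hedged sketch of how the boosted, norm-free title field above could be written there (the "title" field name and the title/d variables mirror the example, nothing here is from the original source):

    FieldType titleType = new FieldType(TextField.TYPE_NOT_STORED);
    titleType.setOmitNorms(true); // replaces field.setOmitNorms(true)
    titleType.freeze();
    Field titleField = new Field("title", title, titleType);
    titleField.setBoost(2.0f);    // index-time boosts still exist in 4.x-6.x, removed in 7.0
    d.add(titleField);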
From source file:de.uni_koeln.spinfo.maalr.lucene.core.DictionaryCreator.java
License:Apache License
private int indexDocs(final IndexWriter writer, final Iterator<LexEntry> iterator) throws IOException {
    int counter = 0;
    NumberFormat nf = NumberFormat.getNumberInstance();
    while (iterator.hasNext()) {
        LexEntry lexEntry = iterator.next();
        List<Document> docs = createDocument(lexEntry);
        if (tracing) {
            logger.trace("Indexing Documents: " + docs);
        }
        for (Document doc : docs) {
            writer.addDocument(doc);
        }
        counter++;
        if (counter % 10000 == 0) {
            logger.debug("Indexed " + nf.format(counter) + " documents.");
        }
    }
    logger.info("###########################################");
    logger.info("Indexing completed - " + nf.format(counter) + " entries have been indexed.");
    logger.info("###########################################");
    return counter;
}
From source file:de.uni_koeln.spinfo.maalr.lucene.core.DictionaryCreator.java
License:Apache License
void update(LexEntry entry) throws IOException {
    IndexWriter writer = initIndexWriter();
    Term queryTerm = new Term(LexEntry.ID, entry.getId());
    writer.deleteDocuments(queryTerm);
    if (entry.getCurrent() != null) {
        List<Document> docs = createDocument(entry);
        for (Document document : docs) {
            writer.addDocument(document);
        }
    }
    writer.close();
}
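The delete-then-add sequence above can also be written with IndexWriter.updateDocument(Term, doc), which deletes every document matching the term and adds the replacement in one atomic operation; updateDocuments(Term, docs) does the same for a block of documents. A sketch assuming a single replacement document called newDoc:

    Term idTerm = new Term(LexEntry.ID, entry.getId());
    writer.updateDocument(idTerm, newDoc); // atomic delete-and-add
    writer.commit();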
From source file:de.uni_koeln.spinfo.maalr.lucene.core.DictionaryLoader.java
License:Apache License
void update(LexEntry entry) throws IOException {
    IndexWriter writer = initIndexWriter();
    Term queryTerm = new Term(LexEntry.ID, entry.getId());
    writer.deleteDocuments(queryTerm);
    if (entry.getCurrent() != null) {
        List<Document> docs = createDocument(new HashSet<String>(), entry);
        for (Document document : docs) {
            writer.addDocument(document);
        }
    }
    writer.commit();
    writer.close();
    reader.close();
    reader = DirectoryReader.open(ram);
    searcher = new IndexSearcher(reader);
}
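Closing the reader and reopening it from scratch, as above, works but pays the full open cost on every update. DirectoryReader.openIfChanged can reuse unchanged segments and returns null when nothing changed; a hedged sketch of that alternative:

    DirectoryReader newReader = DirectoryReader.openIfChanged(reader);
    if (newReader != null) {           // null means the index did not change
        reader.close();
        reader = newReader;
        searcher = new IndexSearcher(reader);
    }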
From source file:demo.jaxrs.search.server.Catalog.java
License:Apache License
private void storeAndIndex(final LuceneDocumentMetadata metadata, final byte[] content) throws IOException {
    try (BufferedInputStream in = new BufferedInputStream(new ByteArrayInputStream(content))) {
        final Document document = extractor.extract(in, metadata);
        if (document != null) {
            final IndexWriter writer = getIndexWriter();
            try {
                storage.addDocument(metadata.getSource(), content);
                writer.addDocument(document);
                writer.commit();
            } finally {
                writer.close();
            }
        }
    }
}
From source file:demo.jaxrs.search.server.Indexer.java
License:Apache License
public void storeAndIndex(final LuceneDocumentMetadata metadata, final byte[] content) throws IOException {
    BufferedInputStream in = null;
    try {
        in = new BufferedInputStream(new ByteArrayInputStream(content));
        final Document document = extractor.extract(in, metadata);
        if (document != null) {
            final IndexWriter writer = getIndexWriter();
            try {
                storage.addDocument(metadata.getSource(), content);
                writer.addDocument(document);
                writer.commit();
            } finally {
                writer.close();
            }
        }
    } finally {
        if (in != null) {
            try {
                in.close();
            } catch (IOException ex) {
                /* do nothing */
            }
        }
    }
}
From source file:Demo1.MyServlet.java
private static void addDoc(IndexWriter w, String Name, String Price, String Area, String Purpose) throws IOException {
    Document doc = new Document();
    // StringFields are stored but not tokenized, so they support exact-match lookups only
    doc.add(new StringField("name", Name, Field.Store.YES));
    doc.add(new StringField("price", Price, Field.Store.YES));
    doc.add(new StringField("area", Area, Field.Store.YES));
    doc.add(new StringField("purpose", Purpose, Field.Store.YES));
    // The combined TextField is tokenized (but not stored) for full-text search
    String Searching = Name + " " + Price + " " + Area + " " + Purpose;
    doc.add(new TextField("Searching", Searching, Field.Store.NO));
    w.addDocument(doc);
}
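To show how documents added this way come back out, a minimal hedged search against the untokenized name field; the dir variable and the query value are placeholders, not part of the servlet:

    // Exact-match lookup on the untokenized StringField "name".
    DirectoryReader reader = DirectoryReader.open(dir); // dir: the Directory the writer wrote to (assumed)
    IndexSearcher searcher = new IndexSearcher(reader);
    TopDocs hits = searcher.search(new TermQuery(new Term("name", "Some Property")), 10);
    for (ScoreDoc sd : hits.scoreDocs) {
        Document hit = searcher.doc(sd.doc);
        System.out.println(hit.get("name") + " / " + hit.get("price"));
    }
    reader.close();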