Usage examples for org.apache.lucene.index.IndexWriter#updateDocument
private long updateDocument(final DocumentsWriterDeleteQueue.Node<?> delNode, Iterable<? extends IndexableField> doc) throws IOException
From source file:org.xcmis.search.lucene.index.PersistedIndex.java
License:Open Source License
/** * {@inheritDoc}//w w w .j av a 2s.co m */ public IndexTransactionModificationReport save(final IndexTransaction<Document> changes) throws IndexException { final Set<String> removedDocuments = new HashSet<String>(); final Set<String> updatedDocuments = new HashSet<String>(); try { // index already started synchronized (this.indexDirectiry) { final Set<String> removed = changes.getRemovedDocuments(); IndexWriter writer = null; IndexReader reader = null; Map<String, Document> updated = null; for (final String removedUuid : removed) { if (reader == null) { reader = this.getIndexReader(); } if (this.getDocument(removedUuid, reader) != null) { removedDocuments.add(removedUuid); } } if (removedDocuments.size() > 0 || changes.getAddedDocuments().size() > 0) { writer = new IndexWriter(this.indexDirectiry, new StandardAnalyzer(), MaxFieldLength.UNLIMITED); // removed for (final String uuid : removedDocuments) { writer.deleteDocuments(new Term(FieldNames.UUID, uuid)); } // updated for (final String uuid : updatedDocuments) { // TODO possible use only delete writer.updateDocument(new Term(FieldNames.UUID, uuid), updated.get(uuid)); } // added for (final Document document : changes.getAddedDocuments().values()) { writer.addDocument(document); } writer.commit(); writer.close(); this.lastModifedTime = System.currentTimeMillis(); } } } catch (final CorruptIndexException e) { throw new IndexException(e.getLocalizedMessage(), e); } catch (final IOException e) { throw new IndexException(e.getLocalizedMessage(), e); } return new IndexTransactionModificationReportImpl(changes.getAddedDocuments().keySet(), removedDocuments, updatedDocuments); }
From source file:org.xcmis.search.lucene.InMemoryLuceneQueryableIndexStorage.java
License:Open Source License
/** * @throws IndexException// www. j av a 2s . c o m * @see org.xcmis.search.lucene.AbstractLuceneQueryableIndexStorage#save(org.xcmis.search.lucene.index.LuceneIndexTransaction) */ @Override protected synchronized Object save(LuceneIndexTransaction indexTransaction) throws IndexException, IndexTransactionException { try { IndexWriter writer = new IndexWriter(ramDirectory, new StandardAnalyzer(), MaxFieldLength.UNLIMITED); // removed for (final String uuid : indexTransaction.getRemovedDocuments()) { writer.deleteDocuments(new Term(FieldNames.UUID, uuid)); } // added for (final Entry<String, Document> entry : indexTransaction.getAddedDocuments().entrySet()) { writer.updateDocument(new Term(FieldNames.UUID, entry.getKey()), entry.getValue()); } writer.commit(); writer.close(); } catch (CorruptIndexException e) { throw new IndexModificationException(e.getLocalizedMessage(), e); } catch (LockObtainFailedException e) { throw new IndexModificationException(e.getLocalizedMessage(), e); } catch (IOException e) { throw new IndexModificationException(e.getLocalizedMessage(), e); } return new Object(); }
From source file:part2.IndexFiles.java
License:Apache License
/** * Indexes the given file using the given writer, or if a directory is given, * recurses over files and directories found under the given directory. * // w w w . j av a2 s.c om * NOTE: This method indexes one document per input file. This is slow. For good * throughput, put multiple documents into your input file(s). An example of this is * in the benchmark module, which can create "line doc" files, one document per line, * using the * <a href="../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html" * >WriteLineDocTask</a>. * * @param writer Writer to the index where the given file/dir info will be stored * @param file The file to index, or the directory to recurse into to find files to index * @throws IOException If there is a low-level I/O error */ static void indexDocs(IndexWriter writer, File file) throws IOException { // do not try to index files that cannot be read if (file.canRead()) { if (file.isDirectory()) { String[] files = file.list(); // an IO error could occur if (files != null) { for (int i = 0; i < files.length; i++) { indexDocs(writer, new File(file, files[i])); } } } else { if (!file.toString().toLowerCase().contains(".txt")) { return; } FileInputStream fis; try { fis = new FileInputStream(file); } catch (FileNotFoundException fnfe) { // at least on windows, some temporary files raise this exception with an "access denied" message // checking if the file can be read doesn't help return; } try { // make a new, empty document Document doc = new Document(); // Make Book object which parses the file and finds Author and Title of the text file. 
Book book = new Book(); book.parse(fis); Field authorField = new TextField("author", book.getAuthor(), Field.Store.YES); Field titleField = new TextField("title", book.getTitle(), Field.Store.YES); Field releaseField = new TextField("release date", book.getReleaseDate(), Field.Store.YES); Field languageField = new TextField("language", book.getLanguage(), Field.Store.YES); authorField.setBoost(3.0f); titleField.setBoost(3.0f); doc.add(authorField); doc.add(titleField); doc.add(releaseField); doc.add(languageField); // Add the path of the file as a field named "path". Use a // field that is indexed (i.e. searchable), but don't tokenize // the field into separate words and don't index term frequency // or positional information: Field pathField = new StringField("path", file.getPath(), Field.Store.YES); doc.add(pathField); // Add the last modified date of the file a field named "modified". // Use a LongField that is indexed (i.e. efficiently filterable with // NumericRangeFilter). This indexes to milli-second resolution, which // is often too fine. You could instead create a number based on // year/month/day/hour/minutes/seconds, down the resolution you require. // For example the long value 2011021714 would mean // February 17, 2011, 2-3 PM. doc.add(new LongField("modified", file.lastModified(), Field.Store.NO)); // Add the contents of the file to a field named "contents". Specify a Reader, // so that the text of the file is tokenized and indexed, but not stored. // Note that FileReader expects the file to be in UTF-8 encoding. // If that's not the case searching for special characters will fail. 
doc.add(new TextField("contents", new BufferedReader(new InputStreamReader(fis, StandardCharsets.UTF_8)))); if (writer.getConfig().getOpenMode() == OpenMode.CREATE) { // New index, so we just add the document (no old document can be there): System.out.println("adding " + file); writer.addDocument(doc); } else { // Existing index (an old copy of this document may have been indexed) so // we use updateDocument instead to replace the old one matching the exact // path, if present: System.out.println("updating " + file); writer.updateDocument(new Term("path", file.getPath()), doc); } } finally { fis.close(); } } } }
From source file:perf.TestBenchNRTPKLookup.java
License:Apache License
/**
 * Benchmarks near-real-time primary-key lookups against an index that is being
 * concurrently updated through IndexWriter#updateDocument.
 *
 * The loop runs 2.5M iterations: each iteration looks up a random zero-padded
 * id across all index segments, then upserts a document carrying that id.
 * Per-50k-iteration timings are printed, and the SearcherManager is refreshed
 * every 250k documents.
 *
 * @param args args[0] = path of the index directory
 * @throws IOException on any low-level index I/O error
 */
public static void main(String[] args) throws IOException {
    Directory dir = new MMapDirectory(new File(args[0]));
    //Directory dir = new NIOFSDirectory(new File(args[0]));
    IndexWriterConfig iwc = new IndexWriterConfig(new StandardAnalyzer());
    iwc.setRAMBufferSizeMB(250);
    IndexWriter writer = new IndexWriter(dir, iwc);
    // NRT searcher manager over the writer; the boolean flag controls delete
    // visibility per the SearcherManager API.
    final SearcherManager manager = new SearcherManager(writer, true, new SearcherFactory() {
        @Override
        public IndexSearcher newSearcher(IndexReader r) {
            return new IndexSearcher(r);
        }
    });
    // "_id" field type: indexed as a single untokenized term, not stored.
    FieldType type = new FieldType();
    type.setIndexed(true);
    type.setTokenized(false);
    type.setStored(false);
    type.freeze();
    // Per-segment TermsEnum cache, keyed by the segment's core+deletes key.
    // Left unused by default -- see the commented put() below.
    HashMap<Object, TermsEnum> cachedTermsEnum = new HashMap<Object, TermsEnum>();
    long time = System.currentTimeMillis();
    long lastTime = time;
    int num = 2500000;
    // Fixed seed so successive benchmark runs are comparable.
    Random r = new Random(16);
    for (int i = 0; i < num; i++) {
        //Term t = new Term("_id", Integer.toString(i));
        String id = String.format("%010d", r.nextInt(Integer.MAX_VALUE));
        Term t = new Term("_id", id);
        IndexSearcher acquire = manager.acquire();
        try {
            // Probe each segment for the id, stopping at the first segment
            // where a live posting for the term exists.
            IndexReader indexReader = acquire.getIndexReader();
            List<AtomicReaderContext> leaves = indexReader.leaves();
            for (AtomicReaderContext atomicReaderContext : leaves) {
                AtomicReader reader = atomicReaderContext.reader();
                TermsEnum termsEnum = cachedTermsEnum.get(reader.getCombinedCoreAndDeletesKey());
                if (termsEnum == null) {
                    termsEnum = reader.fields().terms("_id").iterator(null);
                    //cachedTermsEnum.put(reader.getCombinedCoreAndDeletesKey(), termsEnum); // uncomment this line to see improvements
                }
                //System.out.println("\nlookup seg=: " + reader + " term=" + t);
                if (termsEnum.seekExact(t.bytes())) {
                    DocsEnum termDocsEnum = termsEnum.docs(reader.getLiveDocs(), null);
                    if (termDocsEnum != null) {
                        break;
                    }
                }
            }
        } finally {
            // Always hand the searcher back to the manager.
            manager.release(acquire);
        }
        // Upsert: replaces any previously indexed document with this id.
        Document d = new Document();
        d.add(new Field("_id", id, type));
        writer.updateDocument(t, d);
        //writer.addDocument(d);
        if (i % 50000 == 0) {
            long t1 = System.currentTimeMillis();
            System.out.println(i + " " + (t1 - lastTime) + " ms");
            lastTime = t1;
        }
        if ((i + 1) % 250000 == 0) {
            // Periodic NRT reopen so freshly written segments become searchable.
            System.out.println("Reopen...");
            manager.maybeRefresh();
            IndexSearcher s = manager.acquire();
            try {
                System.out.println(" got: " + s);
            } finally {
                manager.release(s);
            }
        }
    }
    System.out.println("\nTotal: " + (System.currentTimeMillis() - time) + " msec");
    //System.out.println("loadBlockCount: " + BlockTreeTermsReader.loadBlockCount);
    manager.close();
    writer.close();
    dir.close();
}
From source file:practica1_2.IndexFiles.java
License:Apache License
/**
 * Indexes the given file using the given writer, or if a directory is given,
 * recurses over files and directories found under the given directory.
 *
 * NOTE: This method indexes one document per input file. This is slow. For good
 * throughput, put multiple documents into your input file(s). An example of this
 * is in the benchmark module, which can create "line doc" files, one document
 * per line, using WriteLineDocTask.
 *
 * @param writer Writer to the index where the given file/dir info will be stored
 * @param file The file to index, or the directory to recurse into to find files to index
 * @throws IOException If there is a low-level I/O error
 */
static void indexDocs(IndexWriter writer, File file) throws IOException {
    // do not try to index files that cannot be read
    if (file.canRead()) {
        if (file.isDirectory()) {
            String[] files = file.list();
            // an IO error could occur (list() returns null in that case)
            if (files != null) {
                for (int i = 0; i < files.length; i++) {
                    indexDocs(writer, new File(file, files[i]));
                }
            }
        } else {
            FileInputStream fis;
            try {
                fis = new FileInputStream(file);
            } catch (FileNotFoundException fnfe) {
                // at least on windows, some temporary files raise this exception
                // with an "access denied" message; checking if the file can be
                // read doesn't help
                return;
            }
            try {
                // make a new, empty document
                Document doc = new Document();
                // Add the path of the file as a field named "path". Use a field
                // that is indexed (i.e. searchable), but don't tokenize the field
                // into separate words and don't index term frequency or
                // positional information:
                Field pathField = new StringField("path", file.getPath(), Field.Store.YES);
                doc.add(pathField);
                // Add the last modified date of the file as a field named
                // "modified". Use a LongField that is indexed (i.e. efficiently
                // filterable with NumericRangeFilter). This indexes to
                // milli-second resolution, which is often too fine. You could
                // instead create a number based on year/month/day/hour/minutes/
                // seconds, down to the resolution you require. For example the
                // long value 2011021714 would mean February 17, 2011, 2-3 PM.
                doc.add(new LongField("modified", file.lastModified(), Field.Store.NO));
                // The raw file contents are deliberately NOT indexed here
                // (the usual "contents" TextField is commented out); instead,
                // selected metadata tags are indexed below.
                // doc.add(new TextField("contents", new BufferedReader(
                //     new InputStreamReader(fis, "UTF-8"))));
                // Index individual metadata tags extracted from the file.
                // NOTE(review): insertIndexTag/readPosition are project helpers
                // not visible in this file -- presumably the boolean selects
                // tokenized vs. untokenized (or stored vs. not); confirm against
                // their definitions.
                insertIndexTag("title", file, doc, true);
                insertIndexTag("identifier", file, doc, false);
                insertIndexTag("subject", file, doc, true);
                insertIndexTag("type", file, doc, false);
                insertIndexTag("description", file, doc, true);
                insertIndexTag("creator", file, doc, true);
                insertIndexTag("publisher", file, doc, true);
                insertIndexTag("format", file, doc, false);
                insertIndexTag("language", file, doc, false);
                readPosition(doc, file, true);
                readPosition(doc, file, false);
                if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                    // New index, so we just add the document (no old document
                    // can be there):
                    System.out.println("adding " + file);
                    writer.addDocument(doc);
                } else {
                    // Existing index (an old copy of this document may have been
                    // indexed) so we use updateDocument instead to replace the
                    // old one matching the exact path, if present:
                    System.out.println("updating " + file);
                    writer.updateDocument(new Term("path", file.getPath()), doc);
                }
            } finally {
                fis.close();
            }
        }
    }
}
From source file:resource.IndexFiles.java
License:Apache License
/** * Indexes the given file using the given writer, or if a directory is * given, recurses over files and directories found under the given * directory.//w ww. j a v a 2 s. c o m * * NOTE: This method indexes one document per input file. This is slow. For * good throughput, put multiple documents into your input file(s). An * example of this is in the benchmark module, which can create "line doc" * files, one document per line, using the <a href= * "../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html" * >WriteLineDocTask</a>. * * @param writer * Writer to the index where the given file/dir info will be * stored * @param file * The file to index, or the directory to recurse into to find * files to index * @throws IOException * If there is a low-level I/O error * @throws TikaException * @throws SAXException */ static void indexDocs(IndexWriter writer, File file) throws IOException { // do not try to index files that cannot be read if (file.canRead()) { if (file.isDirectory()) { String[] files = file.list(); // an IO error could occur if (files != null) { for (int i = 0; i < files.length; i++) { indexDocs(writer, new File(file, files[i])); } } } else { FileInputStream fis; try { fis = new FileInputStream(file); } catch (FileNotFoundException fnfe) { // at least on windows, some temporary files raise this // exception with an "access denied" message // checking if the file can be read doesn't help return; } try { // make a new, empty document Document doc = new Document(); // Add the path of the file as a field named "path". Use a // field that is indexed (i.e. 
searchable), but don't // tokenize // the field into separate words and don't index term // frequency // or positional information: Field pathField = new StringField("path", file.getPath(), Field.Store.YES); doc.add(pathField); Field relativePathField = new StringField("shortPath", getRelativePath(file.getPath()), Field.Store.YES); doc.add(relativePathField); // Add the last modified date of the file a field named // "modified". // Use a LongField that is indexed (i.e. efficiently // filterable with // NumericRangeFilter). This indexes to milli-second // resolution, which // is often too fine. You could instead create a number // based on // year/month/day/hour/minutes/seconds, down the resolution // you require. // For example the long value 2011021714 would mean // February 17, 2011, 2-3 PM. doc.add(new LongField("modified", file.lastModified(), Field.Store.NO)); // Add the contents of the file to a field named "contents". // Specify a Reader, // so that the text of the file is tokenized and indexed, // but not stored. // Note that FileReader expects the file to be in UTF-8 // encoding. // If that's not the case searching for special characters // will fail. 
if (file.getName().endsWith(".txt")) doc.add(new TextField("contents", new BufferedReader(new InputStreamReader(fis, "UTF-8")))); if (file.getName().endsWith(".html")) { doc.add(new TextField("contents", html2String(fis), Field.Store.YES)); doc.add(new StringField("title", getHTMLTitle(file), Field.Store.YES)); } if (writer.getConfig().getOpenMode() == OpenMode.CREATE) { // New index, so we just add the document (no old // document can be there): System.out.println("adding " + file); writer.addDocument(doc); } else { // Existing index (an old copy of this document may have // been indexed) so // we use updateDocument instead to replace the old one // matching the exact // path, if present: System.out.println("updating " + file); try { writer.updateDocument(new Term("path", file.getPath()), doc); } catch (Exception e) { // TODO e.printStackTrace(); } } } finally { fis.close(); } } } }
From source file:ru.npopm.dep715.searchdocs.lucene.IndexFiles.java
License:Apache License
/** * Indexes a single document//from ww w .j a v a 2 s .c o m */ static void indexDoc(IndexWriter writer, File file, long lastModified) throws IOException { try (InputStream stream = new FileInputStream(file)) { // make a new, empty document Document doc = new Document(); // Add the path of the file as a field named "path". Use a // field that is indexed (i.e. searchable), but don't tokenize // the field into separate words and don't index term frequency // or positional information: Field pathField = new StringField("path", file.toString(), Field.Store.YES); doc.add(pathField); // Add the last modified date of the file a field named "modified". // Use a LongPoint that is indexed (i.e. efficiently filterable with // PointRangeQuery). This indexes to milli-second resolution, which // is often too fine. You could instead create a number based on // year/month/day/hour/minutes/seconds, down the resolution you require. // For example the long value 2011021714 would mean // February 17, 2011, 2-3 PM. doc.add(new LongPoint("modified", lastModified)); // Add the contents of the file to a field named "contents". Specify a Reader, // so that the text of the file is tokenized and indexed, but not stored. // Note that FileReader expects the file to be in UTF-8 encoding. // If that's not the case searching for special characters will fail. 
doc.add(new TextField("contents", new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8)))); if (writer.getConfig().getOpenMode() == OpenMode.CREATE) { // New index, so we just add the document (no old document can be there): System.out.println("adding " + file); writer.addDocument(doc); } else { // Existing index (an old copy of this document may have been indexed) so // we use updateDocument instead to replace the old one matching the exact // path, if present: System.out.println("updating " + file); // writer.deleteDocuments(new Term("path", file.toString())); writer.updateDocument(new Term("path", file.toString()), doc); } } }
From source file:ru.npopm.dep715.searchdocs.lucene._IndexFiles_.java
License:Apache License
/** * Indexes a single document//from w w w . j a va 2 s . co m */ static void indexDoc(IndexWriter writer, Path file, long lastModified) throws IOException { try (InputStream stream = Files.newInputStream(file)) { // make a new, empty document Document doc = new Document(); // Add the path of the file as a field named "path". Use a // field that is indexed (i.e. searchable), but don't tokenize // the field into separate words and don't index term frequency // or positional information: Field pathField = new StringField("path", file.toString(), Field.Store.YES); doc.add(pathField); // Add the last modified date of the file a field named "modified". // Use a LongPoint that is indexed (i.e. efficiently filterable with // PointRangeQuery). This indexes to milli-second resolution, which // is often too fine. You could instead create a number based on // year/month/day/hour/minutes/seconds, down the resolution you require. // For example the long value 2011021714 would mean // February 17, 2011, 2-3 PM. doc.add(new LongPoint("modified", lastModified)); // Add the contents of the file to a field named "contents". Specify a Reader, // so that the text of the file is tokenized and indexed, but not stored. // Note that FileReader expects the file to be in UTF-8 encoding. // If that's not the case searching for special characters will fail. 
doc.add(new TextField("contents", new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8)))); if (writer.getConfig().getOpenMode() == OpenMode.CREATE) { // New index, so we just add the document (no old document can be there): System.out.println("adding " + file); writer.addDocument(doc); } else { // Existing index (an old copy of this document may have been indexed) so // we use updateDocument instead to replace the old one matching the exact // path, if present: System.out.println("updating " + file); // writer.deleteDocuments(new Term("path", file.toString())); writer.updateDocument(new Term("path", file.toString()), doc); } } }
From source file:se.riddle.jekyll.IndexFiles.java
License:Apache License
/**
 * Indexes the given file using the given writer, or if a directory is given,
 * recurses over files and directories found under the given directory.
 *
 * NOTE: This method indexes one document per input file. This is slow. For good
 * throughput, put multiple documents into your input file(s). An example of this
 * is in the benchmark module, which can create "line doc" files, one document
 * per line, using WriteLineDocTask.
 *
 * Uses the pre-4.0 Lucene field API (Field.Index, NumericField).
 *
 * @param writer Writer to the index where the given file/dir info will be stored
 * @param file   The file to index, or the directory to recurse into to find files to index
 * @throws IOException If there is a low-level I/O error
 */
static void indexDocs(IndexWriter writer, File file) throws IOException {
    // do not try to index files that cannot be read
    if (file.canRead()) {
        if (file.isDirectory()) {
            String[] files = file.list();
            // an IO error could occur (list() returns null in that case)
            if (files != null) {
                for (int i = 0; i < files.length; i++) {
                    indexDocs(writer, new File(file, files[i]));
                }
            }
        } else {
            FileInputStream fis;
            try {
                fis = new FileInputStream(file);
            } catch (FileNotFoundException fnfe) {
                // at least on windows, some temporary files raise this exception
                // with an "access denied" message; checking if the file can be
                // read doesn't help
                return;
            }
            try {
                // make a new, empty document
                Document doc = new Document();
                // Add the path of the file as a field named "path". Indexed
                // (searchable) but not analyzed and without norms; DOCS_ONLY
                // skips term frequency and positional information. This field
                // is also the unique key for updateDocument below.
                Field pathField = new Field("path", file.getPath(), Field.Store.YES,
                    Field.Index.NOT_ANALYZED_NO_NORMS);
                pathField.setIndexOptions(IndexOptions.DOCS_ONLY);
                doc.add(pathField);
                // Add the last modified date of the file as a field named
                // "modified". A NumericField is indexed so it is efficiently
                // filterable with NumericRangeFilter. This indexes to
                // milli-second resolution, which is often too fine. You could
                // instead create a number based on year/month/day/hour/minutes/
                // seconds, down to the resolution you require. For example the
                // long value 2011021714 would mean February 17, 2011, 2-3 PM.
                NumericField modifiedField = new NumericField("modified");
                modifiedField.setLongValue(file.lastModified());
                doc.add(modifiedField);
                // Add the contents of the file to a field named "contents".
                // Specify a Reader so the text is tokenized and indexed but not
                // stored; term vectors are recorded. The file is expected to be
                // UTF-8, otherwise searches for special characters will fail.
                doc.add(new Field("contents", new BufferedReader(new InputStreamReader(fis, "UTF-8")),
                    Field.TermVector.YES));
                if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                    // New index, so we just add the document (no old document
                    // can be there):
                    System.out.println("adding " + file);
                    writer.addDocument(doc);
                } else {
                    // Existing index (an old copy of this document may have been
                    // indexed) so we use updateDocument instead to replace the
                    // old one matching the exact path, if present:
                    System.out.println("updating " + file);
                    writer.updateDocument(new Term("path", file.getPath()), doc);
                }
            } finally {
                fis.close();
            }
        }
    }
}
From source file:searchEngine.IndexFiles.java
License:Apache License
static void indexDoc(IndexWriter writer, Path file, long lastModified) throws IOException { try (InputStream stream = Files.newInputStream(file)) { File docDir = new File(file.toString()); org.jsoup.nodes.Document jsoupDoc = Jsoup.parse(docDir, "UTF8"); Document doc = new Document(); Field pathField = new StringField("path", file.toString(), Field.Store.YES); doc.add(pathField);//from w w w .j a va2s. c om doc.add(new LongPoint("modified", lastModified)); doc.add(new TextField("title", jsoupDoc.title(), Field.Store.YES)); doc.add(new TextField("contents", jsoupDoc.text(), Field.Store.YES)); if (writer.getConfig().getOpenMode() == OpenMode.CREATE) { System.out.println("adding " + file); writer.addDocument(doc); } else { System.out.println("updating " + file); writer.updateDocument(new Term("path", file.toString()), doc); } } }