List of usage examples for org.apache.lucene.index IndexWriter updateDocument
public long updateDocument(Term term, Iterable<? extends IndexableField> doc) throws IOException
Atomically deletes all documents containing the given term and then adds the new document. If no document matches the term, the call behaves like addDocument, which makes it a natural upsert primitive; every example below uses it that way, keyed on a unique field such as a path or an id.
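Before the harvested examples, a minimal self-contained sketch of the delete-then-add semantics. The setup (ByteBuffersDirectory, StandardAnalyzer) assumes Lucene 8 or newer, and the "id"/"body" field names are illustrative assumptions, not taken from any example on this page:

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;

public class UpdateDocumentSketch {
    public static void main(String[] args) throws Exception {
        Directory dir = new ByteBuffersDirectory();
        IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()));

        Document doc = new Document();
        doc.add(new StringField("id", "42", Field.Store.YES)); // exact-match key, not tokenized
        doc.add(new TextField("body", "first version", Field.Store.NO));
        writer.addDocument(doc);
        writer.commit();

        Document newDoc = new Document();
        newDoc.add(new StringField("id", "42", Field.Store.YES));
        newDoc.add(new TextField("body", "second version", Field.Store.NO));
        // Atomically deletes any document(s) containing the term id:42, then adds newDoc.
        // If no matching document exists, this degenerates to a plain addDocument (upsert).
        writer.updateDocument(new Term("id", "42"), newDoc);
        writer.commit();

        writer.close();
        dir.close();
    }
}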
From source file:lia.recent.IndexFiles.java
License:Apache License
/**
 * Indexes the given file using the given writer, or if a directory is given,
 * recurses over files and directories found under the given directory.
 *
 * NOTE: This method indexes one document per input file. This is slow. For good
 * throughput, put multiple documents into your input file(s). An example of this is
 * in the benchmark module, which can create "line doc" files, one document per line,
 * using the
 * <a href="../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html"
 * >WriteLineDocTask</a>.
 *
 * @param writer Writer to the index where the given file/dir info will be stored
 * @param file   The file to index, or the directory to recurse into to find files to index
 * @throws IOException If there is a low-level I/O error
 */
static void indexDocs(IndexWriter writer, File file) throws IOException {
    // do not try to index files that cannot be read
    if (file.canRead()) {
        if (file.isDirectory()) {
            String[] files = file.list();
            // an IO error could occur
            if (files != null) {
                for (int i = 0; i < files.length; i++) {
                    indexDocs(writer, new File(file, files[i]));
                }
            }
        } else {
            FileInputStream fis;
            try {
                fis = new FileInputStream(file);
                System.out.println("fis " + file);
            } catch (FileNotFoundException fnfe) {
                // at least on Windows, some temporary files raise this exception with an
                // "access denied" message; checking if the file can be read doesn't help
                return;
            }
            try {
                // make a new, empty document
                Document doc = new Document();

                // Add the path of the file as a field named "path". Use a
                // field that is indexed (i.e. searchable), but don't tokenize
                // the field into separate words and don't index term frequency
                // or positional information:
                Field pathField = new StringField("path", file.getPath(), Field.Store.YES);
                doc.add(pathField);

                // Add the last modified date of the file as a field named "modified".
                // Use a LongField that is indexed (i.e. efficiently filterable with
                // NumericRangeFilter). This indexes to millisecond resolution, which
                // is often too fine. You could instead create a number based on
                // year/month/day/hour/minutes/seconds, down to the resolution you require.
                // For example the long value 2011021714 would mean
                // February 17, 2011, 2-3 PM.
                doc.add(new LongField("modified", file.lastModified(), Field.Store.NO));

                // Add the contents of the file to a field named "contents". Specify a Reader,
                // so that the text of the file is tokenized and indexed, but not stored.
                // Note that FileReader expects the file to be in UTF-8 encoding.
                // If that's not the case searching for special characters will fail.
                doc.add(new TextField("contents",
                        new BufferedReader(new InputStreamReader(fis, "UTF-8"))));

                if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                    // New index, so we just add the document (no old document can be there):
                    System.out.println("adding " + file);
                    writer.addDocument(doc);
                } else {
                    // Existing index (an old copy of this document may have been indexed), so
                    // we use updateDocument instead to replace the old one matching the exact
                    // path, if present:
                    System.out.println("updating " + file);
                    writer.updateDocument(new Term("path", file.getPath()), doc);
                }
            } finally {
                fis.close();
            }
        }
    }
}
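The comments above suggest indexing the timestamp at a coarser resolution than raw milliseconds. A sketch of one way to do that with Lucene's own DateTools helper, assuming the doc and file variables from the method above; it would replace the LongField line:

import org.apache.lucene.document.DateTools;

// Round the last-modified time down to hour resolution before indexing.
// DateTools.timeToString produces a lexicographically sortable string such
// as "2011021714" for February 17, 2011, 2 PM (UTC).
String hourResolution = DateTools.timeToString(file.lastModified(),
        DateTools.Resolution.HOUR);
doc.add(new StringField("modified", hourResolution, Field.Store.NO));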
From source file:lucene.demo.search.FileIndexer.java
License:Apache License
private void addDoc(IndexWriter writer, File file, Document doc) throws IOException {
    if (writer.getConfig().getOpenMode() == IndexWriterConfig.OpenMode.CREATE) {
        try {
            writer.addDocument(doc);
        } catch (Exception e) {
            // any failure to add the document is silently swallowed here
        }
    } else {
        writer.updateDocument(new Term("path", file.getPath()), doc);
    }
}
From source file:luceneexamples.UpdateDocument.java
License:Apache License
@Test
public void index() throws Exception {
    RAMDirectory directory = new RAMDirectory();
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_31);
    IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_31, analyzer);
    IndexWriter writer = new IndexWriter(directory, iwc);
    Document doc = new Document();
    doc.add(new Field("id", "001", Field.Store.YES, Field.Index.NOT_ANALYZED));
    doc.add(new Field("str_field", "quick brown fox jumped over the lazy dog.",
            Field.Store.YES, Field.Index.ANALYZED));
    writer.addDocument(doc);
    writer.commit();
    IndexReader reader = IndexReader.open(writer, true);
    IndexSearcher searcher = new IndexSearcher(reader);
    QueryParser parser = new QueryParser(Version.LUCENE_31, "str_field", analyzer);
    TopDocs td = searcher.search(parser.parse("fox"), 1000);
    assertThat(td.totalHits, is(1));

    // the replacement document must carry the key field itself,
    // since updateDocument deletes the old document entirely
    Document doc2 = new Document();
    doc2.add(new Field("id", "001", Field.Store.YES, Field.Index.NOT_ANALYZED));
    doc2.add(new Field("str_field", "quick brown fox jumped over the lazy whale.",
            Field.Store.YES, Field.Index.ANALYZED));
    writer.updateDocument(new Term("id", "001"), doc2);
    writer.commit();

    searcher.close();
    reader = reader.reopen();
    searcher = new IndexSearcher(reader);
    td = searcher.search(parser.parse("dog"), 1000);
    assertThat(td.totalHits, is(0));
    td = searcher.search(parser.parse("whale"), 1000);
    assertThat(td.totalHits, is(1));
    writer.close();
    searcher.close();
    directory.close();
}
From source file:mm.IndexFiles.java
License:Apache License
/** Indexes a single document */
static void indexDoc(IndexWriter writer, Path file, long lastModified) throws IOException {
    try (InputStream stream = Files.newInputStream(file)) {
        // make a new, empty document
        Document doc = new Document();

        // Add the path of the file as a field named "path". Use a
        // field that is indexed (i.e. searchable), but don't tokenize
        // the field into separate words and don't index term frequency
        // or positional information:
        Field pathField = new StringField("path", file.toString(), Field.Store.YES);
        doc.add(pathField);

        // Add the last modified date of the file as a field named "modified".
        // Use a LongPoint that is indexed (i.e. efficiently filterable with
        // LongPoint.newRangeQuery). This indexes to millisecond resolution, which
        // is often too fine. You could instead create a number based on
        // year/month/day/hour/minutes/seconds, down to the resolution you require.
        // For example the long value 2011021714 would mean
        // February 17, 2011, 2-3 PM.
        doc.add(new LongPoint("modified", lastModified));

        // Add the contents of the file to a field named "contents". Specify a Reader,
        // so that the text of the file is tokenized and indexed, but not stored.
        // Note that FileReader expects the file to be in UTF-8 encoding.
        // If that's not the case searching for special characters will fail.
        doc.add(new TextField("contents",
                new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8))));

        if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
            // New index, so we just add the document (no old document can be there):
            System.out.println("adding " + file);
            writer.addDocument(doc);
        } else {
            // Existing index (an old copy of this document may have been indexed), so
            // we use updateDocument instead to replace the old one matching the exact
            // path, if present:
            System.out.println("updating " + file);
            writer.updateDocument(new Term("path", file.toString()), doc);
        }
    }
}
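This variant indexes the timestamp as a LongPoint, so range filtering at search time goes through LongPoint.newRangeQuery rather than the older NumericRangeFilter. A minimal sketch, assuming an IndexSearcher named searcher opened over the index built above:

import org.apache.lucene.document.LongPoint;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;

// Match documents whose "modified" point value falls in the last 24 hours.
long now = System.currentTimeMillis();
Query recentlyModified = LongPoint.newRangeQuery("modified",
        now - 24L * 60 * 60 * 1000, now);
TopDocs hits = searcher.search(recentlyModified, 10);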
From source file:model.IndexFiles.java
License:Apache License
/** Indexes a single document */
static void indexDoc(IndexWriter writer, Path file, long lastModified) throws IOException {
    HashMap<String, String> mappingPathToTitle = new HashMap<String, String>();
    try (InputStream stream = Files.newInputStream(file)) {
        // make a new, empty document
        Document doc = new Document();

        // strip tags in here as they're unrequired
        //doc.add(new TextField("fullContents", new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8))));

        // Add the path of the file as a field named "path". Use a
        // field that is indexed (i.e. searchable), but don't tokenize
        // the field into separate words and don't index term frequency
        // or positional information:
        Field pathField = new StringField("path", file.toString(), Field.Store.YES);
        //System.out.println(pathField);
        String path = file.toString();

        // Read the file into a string and index it as a stored "contents" field
        // (unlike the stock demo, which feeds a Reader and stores nothing):
        String pathContents = readFileToString(path, path, mappingPathToTitle);
        Field pathContents1 = new TextField("contents", pathContents, Field.Store.YES);
        if (!mappingPathToTitle.isEmpty()) {
            Field title = new TextField("title", mappingPathToTitle.get(path), Field.Store.YES);
            doc.add(title);
        }
        doc.add(pathField);
        doc.add(pathContents1);

        // Add the last modified date of the file as a field named "modified".
        // Use a LongField that is indexed (i.e. efficiently filterable with
        // NumericRangeFilter). This indexes to millisecond resolution, which
        // is often too fine. You could instead create a number based on
        // year/month/day/hour/minutes/seconds, down to the resolution you require.
        // For example the long value 2011021714 would mean
        // February 17, 2011, 2-3 PM.
        doc.add(new LongField("modified", lastModified, Field.Store.NO));

        if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
            // New index, so we just add the document (no old document can be there):
            writer.addDocument(doc);
        } else {
            // Existing index (an old copy of this document may have been indexed), so
            // we use updateDocument instead to replace the old one matching the exact
            // path, if present:
            System.out.println("updating " + file);
            writer.updateDocument(new Term("path", file.toString()), doc);
        }
    }
}
From source file:net.oneandone.pommes.model.Database.java
License:Apache License
public void index(Iterator<Document> iterator) throws IOException {
    IndexWriter writer;
    IndexWriterConfig config;
    Document doc;

    close();
    // no analyzer, I have String fields only
    config = new IndexWriterConfig(Version.LUCENE_4_9, null);
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
    writer = new IndexWriter(getIndexLuceneDirectory(), config);
    while (iterator.hasNext()) {
        doc = iterator.next();
        // keyed on the ORIGIN field: replaces any existing document with the
        // same origin, or simply adds the document if none exists (upsert)
        writer.updateDocument(new Term(ORIGIN, doc.get(ORIGIN)), doc);
    }
    writer.close();
}
From source file:net.riezebos.thoth.content.search.Indexer.java
License:Apache License
protected void addToIndex(IndexWriter writer, String resourcePath, String resourceType, String title,
        String contents, Map<String, String> metaTags) throws IOException {
    String extension = ThothUtil.getExtension(resourcePath);
    if (extension == null)
        extension = "";
    extension = extension.toLowerCase();

    Document document = new Document();
    document.add(new StringField(INDEX_PATH, resourcePath, Field.Store.YES));
    document.add(new TextField(INDEX_TYPE, resourceType, Store.YES));
    document.add(new TextField(INDEX_TITLE, title, Store.YES));
    document.add(new TextField(INDEX_CONTENTS, contents, Store.NO));
    document.add(new TextField(INDEX_USED, "true", Store.NO));
    document.add(new TextField(INDEX_EXTENSION, extension, Store.NO));
    metaTags.entrySet().stream().forEach(entry -> document
            .add(new TextField(entry.getKey().toLowerCase(), String.valueOf(entry.getValue()), Store.NO)));

    if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
        // New index, so we just add the document (no old document can be there):
        LOG.debug("Indexer for context " + contentManager.getContextName() + " added " + resourcePath);
        writer.addDocument(document);
    } else {
        // Existing index (an old copy of this document may have been indexed), so
        // we use updateDocument instead to replace the old one matching the exact
        // path, if present:
        LOG.debug("Indexer for context " + contentManager.getContextName() + " updated " + resourcePath);
        writer.updateDocument(new Term(INDEX_PATH, resourcePath), document);
    }
}
From source file:net.semanticmetadata.lire.imageanalysis.bovw.BOVWBuilder.java
License:Open Source License
public void indexMissing() throws IOException {
    init();
    // Reading clusters from disk:
    clusters = Cluster.readClusters(clusterFile);
    // create & store histograms:
    System.out.println("Creating histograms ...");
    LireFeature f = getFeatureInstance();
    // Needed for check whether the document is deleted.
    Bits liveDocs = MultiFields.getLiveDocs(reader);
    // based on bug report from Einav Itamar <einavitamar@gmail.com>
    IndexWriter iw = LuceneUtils.createIndexWriter(((DirectoryReader) reader).directory(), false,
            LuceneUtils.AnalyzerType.WhitespaceAnalyzer);
    int counter = 0;
    for (int i = 0; i < reader.maxDoc(); i++) {
        if (reader.hasDeletions() && !liveDocs.get(i))
            continue; // if it is deleted, just ignore it.
        Document d = reader.document(i);
        // Only if there are no values yet:
        if (d.getValues(visualWordsFieldName) == null || d.getValues(visualWordsFieldName).length == 0) {
            createVisualWords(d, f);
            // now write the new one. we use the identifier to update ;)
            iw.updateDocument(new Term(DocumentBuilder.FIELD_NAME_IDENTIFIER,
                    d.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0]), d);
            counter++;
        }
    }
    System.out.println(counter + " Documents were updated");
    iw.commit();
    // added to permanently remove the deleted docs.
    iw.forceMerge(1);
    iw.close();
    System.out.println("Finished.");
}
From source file:net.semanticmetadata.lire.imageanalysis.bovw.LocalFeatureHistogramBuilder.java
License:Open Source License
public void indexMissing() throws IOException {
    // Reading clusters from disk:
    clusters = Cluster.readClusters(clusterFile);
    // create & store histograms:
    System.out.println("Creating histograms ...");
    int[] tmpHist = new int[numClusters];
    LireFeature f = getFeatureInstance();
    IndexWriter iw = LuceneUtils.createIndexWriter(((DirectoryReader) reader).directory(), true,
            LuceneUtils.AnalyzerType.WhitespaceAnalyzer);
    for (int i = 0; i < reader.maxDoc(); i++) {
        // if (!reader.isDeleted(i)) {
        for (int j = 0; j < tmpHist.length; j++) {
            tmpHist[j] = 0;
        }
        Document d = reader.document(i);
        // Only if there are no values yet:
        if (d.getValues(visualWordsFieldName) == null || d.getValues(visualWordsFieldName).length == 0) {
            IndexableField[] fields = d.getFields(localFeatureFieldName);
            // find the appropriate cluster for each feature:
            for (int j = 0; j < fields.length; j++) {
                f.setByteArrayRepresentation(fields[j].binaryValue().bytes, fields[j].binaryValue().offset,
                        fields[j].binaryValue().length);
                tmpHist[clusterForFeature((Histogram) f)]++;
            }
            normalize(tmpHist);
            d.add(new TextField(visualWordsFieldName, arrayToVisualWordString(tmpHist), Field.Store.YES));
            d.add(new StringField(localFeatureHistFieldName, SerializationUtils.arrayToString(tmpHist),
                    Field.Store.YES));
            // now write the new one. we use the identifier to update ;)
            iw.updateDocument(new Term(DocumentBuilder.FIELD_NAME_IDENTIFIER,
                    d.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0]), d);
        }
        // }
    }
    iw.commit();
    iw.close();
    System.out.println("Finished.");
}
From source file:net.semanticmetadata.lire.imageanalysis.bovw.LocalFeatureHistogramBuilderKmeansPlusPlus.java
License:Open Source License
/**
 * Uses an existing index, where each and every document should have a set of local features. A number of
 * random images (numDocsForVocabulary) is selected and clustered to get a vocabulary of visual words
 * (the cluster means). For all images a histogram on the visual words is created and added to the documents.
 * Pre-existing histograms are deleted, so this method can be used for re-indexing.
 *
 * @throws java.io.IOException
 */
public void index() throws IOException {
    df.setMaximumFractionDigits(3);
    // find the documents for building the vocabulary:
    HashSet<Integer> docIDs = selectVocabularyDocs();
    System.out.println("Using " + docIDs.size() + " documents to build the vocabulary.");
    KMeansPlusPlusClusterer kpp = new KMeansPlusPlusClusterer(numClusters, 15);
    // fill the KMeans object:
    LinkedList<DoublePoint> features = new LinkedList<DoublePoint>();
    // Needed for check whether the document is deleted.
    Bits liveDocs = MultiFields.getLiveDocs(reader);
    for (Iterator<Integer> iterator = docIDs.iterator(); iterator.hasNext();) {
        int nextDoc = iterator.next();
        if (reader.hasDeletions() && !liveDocs.get(nextDoc))
            continue; // if it is deleted, just ignore it.
        Document d = reader.document(nextDoc);
        // features.clear();
        IndexableField[] fields = d.getFields(localFeatureFieldName);
        String file = d.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0];
        for (int j = 0; j < fields.length; j++) {
            LireFeature f = getFeatureInstance();
            f.setByteArrayRepresentation(fields[j].binaryValue().bytes, fields[j].binaryValue().offset,
                    fields[j].binaryValue().length);
            // copy the data over to new array ...
            double[] feat = new double[f.getDoubleHistogram().length];
            System.arraycopy(f.getDoubleHistogram(), 0, feat, 0, feat.length);
            features.add(new DoublePoint(f.getDoubleHistogram()));
        }
    }
    if (features.size() < numClusters) {
        // this cannot work. You need more data points than clusters.
        throw new UnsupportedOperationException("Only " + features.size() + " features found to cluster in "
                + numClusters + ". Try to use less clusters or more images.");
    }
    // do the clustering:
    System.out.println("Number of local features: " + df.format(features.size()));
    System.out.println("Starting clustering ...");
    List<CentroidCluster<DoublePoint>> clusterList = kpp.cluster(features);
    // TODO: Serializing clusters to a file on the disk ...
    System.out.println("Clustering finished, " + clusterList.size() + " clusters found");
    clusters = new LinkedList<double[]>();
    for (Iterator<CentroidCluster<DoublePoint>> iterator = clusterList.iterator(); iterator.hasNext();) {
        CentroidCluster<DoublePoint> centroidCluster = iterator.next();
        clusters.add(centroidCluster.getCenter().getPoint());
    }
    System.out.println("Creating histograms ...");
    int[] tmpHist = new int[numClusters];
    IndexWriter iw = LuceneUtils.createIndexWriter(((DirectoryReader) reader).directory(), true,
            LuceneUtils.AnalyzerType.WhitespaceAnalyzer, 256d);
    // careful: copy reader to RAM for faster access when reading ...
    // reader = IndexReader.open(new RAMDirectory(reader.directory()), true);
    LireFeature f = getFeatureInstance();
    for (int i = 0; i < reader.maxDoc(); i++) {
        try {
            if (reader.hasDeletions() && !liveDocs.get(i))
                continue;
            for (int j = 0; j < tmpHist.length; j++) {
                tmpHist[j] = 0;
            }
            Document d = reader.document(i);
            IndexableField[] fields = d.getFields(localFeatureFieldName);
            // remove the fields if they are already there ...
            d.removeField(visualWordsFieldName);
            d.removeField(localFeatureHistFieldName);
            // find the appropriate cluster for each feature:
            for (int j = 0; j < fields.length; j++) {
                f.setByteArrayRepresentation(fields[j].binaryValue().bytes, fields[j].binaryValue().offset,
                        fields[j].binaryValue().length);
                tmpHist[clusterForFeature(f, clusters)]++;
            }
            // System.out.println(Arrays.toString(tmpHist));
            d.add(new StoredField(localFeatureHistFieldName,
                    SerializationUtils.toByteArray(normalize(tmpHist))));
            quantize(tmpHist);
            d.add(new TextField(visualWordsFieldName, arrayToVisualWordString(tmpHist), Field.Store.YES));
            // remove local features to save some space if requested:
            if (DELETE_LOCAL_FEATURES) {
                d.removeFields(localFeatureFieldName);
            }
            // now write the new one. we use the identifier to update ;)
            iw.updateDocument(new Term(DocumentBuilder.FIELD_NAME_IDENTIFIER,
                    d.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0]), d);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    iw.commit(); // this one does the "old" commit(); it removes the deleted local features.
    iw.forceMerge(1);
    iw.close();
    System.out.println("Finished.");
}