List of usage examples for org.apache.lucene.index IndexWriter updateDocument
public long updateDocument(Term term, Iterable<? extends IndexableField> doc) throws IOException
Atomically deletes all documents containing the given term and then adds the new document. If no document matches the term, the call behaves like addDocument, which makes it a natural upsert primitive; every example below uses it that way, keyed on a unique field such as a path or an id.
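Before the harvested examples, a minimal self-contained sketch of the delete-then-add semantics. The setup (ByteBuffersDirectory, StandardAnalyzer) assumes Lucene 8 or newer, and the "id"/"body" field names are illustrative assumptions, not taken from any example on this page:

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;

public class UpdateDocumentSketch {
    public static void main(String[] args) throws Exception {
        Directory dir = new ByteBuffersDirectory();
        IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()));

        Document doc = new Document();
        doc.add(new StringField("id", "42", Field.Store.YES)); // exact-match key, not tokenized
        doc.add(new TextField("body", "first version", Field.Store.NO));
        writer.addDocument(doc);
        writer.commit();

        Document newDoc = new Document();
        newDoc.add(new StringField("id", "42", Field.Store.YES));
        newDoc.add(new TextField("body", "second version", Field.Store.NO));
        // Atomically deletes any document(s) containing the term id:42, then adds newDoc.
        // If no matching document exists, this degenerates to a plain addDocument (upsert).
        writer.updateDocument(new Term("id", "42"), newDoc);
        writer.commit();

        writer.close();
        dir.close();
    }
}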
From source file:lia.recent.IndexFiles.java
License:Apache License
/**
 * Indexes the given file using the given writer, or if a directory is given,
 * recurses over files and directories found under the given directory.
 *
 * NOTE: This method indexes one document per input file. This is slow. For good
 * throughput, put multiple documents into your input file(s). An example of this is
 * in the benchmark module, which can create "line doc" files, one document per line,
 * using the
 * <a href="../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html"
 * >WriteLineDocTask</a>.
 *
 * @param writer Writer to the index where the given file/dir info will be stored
 * @param file   The file to index, or the directory to recurse into to find files to index
 * @throws IOException If there is a low-level I/O error
 */
static void indexDocs(IndexWriter writer, File file) throws IOException {
    // do not try to index files that cannot be read
    if (file.canRead()) {
        if (file.isDirectory()) {
            String[] files = file.list();
            // an IO error could occur
            if (files != null) {
                for (int i = 0; i < files.length; i++) {
                    indexDocs(writer, new File(file, files[i]));
                }
            }
        } else {
            FileInputStream fis;
            try {
                fis = new FileInputStream(file);
                System.out.println("fis " + file);
            } catch (FileNotFoundException fnfe) {
                // at least on Windows, some temporary files raise this exception with an
                // "access denied" message; checking if the file can be read doesn't help
                return;
            }
            try {
                // make a new, empty document
                Document doc = new Document();

                // Add the path of the file as a field named "path". Use a
                // field that is indexed (i.e. searchable), but don't tokenize
                // the field into separate words and don't index term frequency
                // or positional information:
                Field pathField = new StringField("path", file.getPath(), Field.Store.YES);
                doc.add(pathField);

                // Add the last modified date of the file as a field named "modified".
                // Use a LongField that is indexed (i.e. efficiently filterable with
                // NumericRangeFilter). This indexes to millisecond resolution, which
                // is often too fine. You could instead create a number based on
                // year/month/day/hour/minutes/seconds, down to the resolution you require.
                // For example the long value 2011021714 would mean
                // February 17, 2011, 2-3 PM.
                doc.add(new LongField("modified", file.lastModified(), Field.Store.NO));

                // Add the contents of the file to a field named "contents". Specify a Reader,
                // so that the text of the file is tokenized and indexed, but not stored.
                // Note that FileReader expects the file to be in UTF-8 encoding.
                // If that's not the case searching for special characters will fail.
                doc.add(new TextField("contents",
                        new BufferedReader(new InputStreamReader(fis, "UTF-8"))));

                if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                    // New index, so we just add the document (no old document can be there):
                    System.out.println("adding " + file);
                    writer.addDocument(doc);
                } else {
                    // Existing index (an old copy of this document may have been indexed), so
                    // we use updateDocument instead to replace the old one matching the exact
                    // path, if present:
                    System.out.println("updating " + file);
                    writer.updateDocument(new Term("path", file.getPath()), doc);
                }
            } finally {
                fis.close();
            }
        }
    }
}
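The comments above suggest indexing the timestamp at a coarser resolution than raw milliseconds. A sketch of one way to do that with Lucene's own DateTools helper, assuming the doc and file variables from the method above; it would replace the LongField line:

import org.apache.lucene.document.DateTools;

// Round the last-modified time down to hour resolution before indexing.
// DateTools.timeToString produces a lexicographically sortable string such
// as "2011021714" for February 17, 2011, 2 PM (UTC).
String hourResolution = DateTools.timeToString(file.lastModified(),
        DateTools.Resolution.HOUR);
doc.add(new StringField("modified", hourResolution, Field.Store.NO));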
From source file:lucene.demo.search.FileIndexer.java
License:Apache License
private void addDoc(IndexWriter writer, File file, Document doc) throws IOException {
    if (writer.getConfig().getOpenMode() == IndexWriterConfig.OpenMode.CREATE) {
        try {
            writer.addDocument(doc);
        } catch (Exception e) {
            // any failure to add the document is silently swallowed here
        }
    } else {
        writer.updateDocument(new Term("path", file.getPath()), doc);
    }
}
From source file:luceneexamples.UpdateDocument.java
License:Apache License
@Test
public void index() throws Exception {
    RAMDirectory directory = new RAMDirectory();
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_31);
    IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_31, analyzer);
    IndexWriter writer = new IndexWriter(directory, iwc);
    Document doc = new Document();
    doc.add(new Field("id", "001", Field.Store.YES, Field.Index.NOT_ANALYZED));
    doc.add(new Field("str_field", "quick brown fox jumped over the lazy dog.",
            Field.Store.YES, Field.Index.ANALYZED));
    writer.addDocument(doc);
    writer.commit();
    IndexReader reader = IndexReader.open(writer, true);
    IndexSearcher searcher = new IndexSearcher(reader);
    QueryParser parser = new QueryParser(Version.LUCENE_31, "str_field", analyzer);
    TopDocs td = searcher.search(parser.parse("fox"), 1000);
    assertThat(td.totalHits, is(1));

    // the replacement document must carry the key field itself,
    // since updateDocument deletes the old document entirely
    Document doc2 = new Document();
    doc2.add(new Field("id", "001", Field.Store.YES, Field.Index.NOT_ANALYZED));
    doc2.add(new Field("str_field", "quick brown fox jumped over the lazy whale.",
            Field.Store.YES, Field.Index.ANALYZED));
    writer.updateDocument(new Term("id", "001"), doc2);
    writer.commit();

    searcher.close();
    reader = reader.reopen();
    searcher = new IndexSearcher(reader);
    td = searcher.search(parser.parse("dog"), 1000);
    assertThat(td.totalHits, is(0));
    td = searcher.search(parser.parse("whale"), 1000);
    assertThat(td.totalHits, is(1));
    writer.close();
    searcher.close();
    directory.close();
}
From source file:mm.IndexFiles.java
License:Apache License
/** Indexes a single document */
static void indexDoc(IndexWriter writer, Path file, long lastModified) throws IOException {
    try (InputStream stream = Files.newInputStream(file)) {
        // make a new, empty document
        Document doc = new Document();

        // Add the path of the file as a field named "path". Use a
        // field that is indexed (i.e. searchable), but don't tokenize
        // the field into separate words and don't index term frequency
        // or positional information:
        Field pathField = new StringField("path", file.toString(), Field.Store.YES);
        doc.add(pathField);

        // Add the last modified date of the file as a field named "modified".
        // Use a LongPoint that is indexed (i.e. efficiently filterable with
        // LongPoint.newRangeQuery). This indexes to millisecond resolution, which
        // is often too fine. You could instead create a number based on
        // year/month/day/hour/minutes/seconds, down to the resolution you require.
        // For example the long value 2011021714 would mean
        // February 17, 2011, 2-3 PM.
        doc.add(new LongPoint("modified", lastModified));

        // Add the contents of the file to a field named "contents". Specify a Reader,
        // so that the text of the file is tokenized and indexed, but not stored.
        // Note that FileReader expects the file to be in UTF-8 encoding.
        // If that's not the case searching for special characters will fail.
        doc.add(new TextField("contents",
                new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8))));

        if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
            // New index, so we just add the document (no old document can be there):
            System.out.println("adding " + file);
            writer.addDocument(doc);
        } else {
            // Existing index (an old copy of this document may have been indexed), so
            // we use updateDocument instead to replace the old one matching the exact
            // path, if present:
            System.out.println("updating " + file);
            writer.updateDocument(new Term("path", file.toString()), doc);
        }
    }
}
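This variant indexes the timestamp as a LongPoint, so range filtering at search time goes through LongPoint.newRangeQuery rather than the older NumericRangeFilter. A minimal sketch, assuming an IndexSearcher named searcher opened over the index built above:

import org.apache.lucene.document.LongPoint;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;

// Match documents whose "modified" point value falls in the last 24 hours.
long now = System.currentTimeMillis();
Query recentlyModified = LongPoint.newRangeQuery("modified",
        now - 24L * 60 * 60 * 1000, now);
TopDocs hits = searcher.search(recentlyModified, 10);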
From source file:model.IndexFiles.java
License:Apache License
/** Indexes a single document */
static void indexDoc(IndexWriter writer, Path file, long lastModified) throws IOException {
    HashMap<String, String> mappingPathToTitle = new HashMap<String, String>();
    try (InputStream stream = Files.newInputStream(file)) {
        // make a new, empty document
        Document doc = new Document();

        // strip tags in here as they're unrequired
        //doc.add(new TextField("fullContents", new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8))));

        // Add the path of the file as a field named "path". Use a
        // field that is indexed (i.e. searchable), but don't tokenize
        // the field into separate words and don't index term frequency
        // or positional information:
        Field pathField = new StringField("path", file.toString(), Field.Store.YES);
        //System.out.println(pathField);
        String path = file.toString();

        // Read the file into a string and index it as a stored "contents" field
        // (unlike the stock demo, which feeds a Reader and stores nothing):
        String pathContents = readFileToString(path, path, mappingPathToTitle);
        Field pathContents1 = new TextField("contents", pathContents, Field.Store.YES);
        if (!mappingPathToTitle.isEmpty()) {
            Field title = new TextField("title", mappingPathToTitle.get(path), Field.Store.YES);
            doc.add(title);
        }
        doc.add(pathField);
        doc.add(pathContents1);

        // Add the last modified date of the file as a field named "modified".
        // Use a LongField that is indexed (i.e. efficiently filterable with
        // NumericRangeFilter). This indexes to millisecond resolution, which
        // is often too fine. You could instead create a number based on
        // year/month/day/hour/minutes/seconds, down to the resolution you require.
        // For example the long value 2011021714 would mean
        // February 17, 2011, 2-3 PM.
        doc.add(new LongField("modified", lastModified, Field.Store.NO));

        if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
            // New index, so we just add the document (no old document can be there):
            writer.addDocument(doc);
        } else {
            // Existing index (an old copy of this document may have been indexed), so
            // we use updateDocument instead to replace the old one matching the exact
            // path, if present:
            System.out.println("updating " + file);
            writer.updateDocument(new Term("path", file.toString()), doc);
        }
    }
}
From source file:net.oneandone.pommes.model.Database.java
License:Apache License
public void index(Iterator<Document> iterator) throws IOException {
    IndexWriter writer;
    IndexWriterConfig config;
    Document doc;

    close();
    // no analyzer, I have String fields only
    config = new IndexWriterConfig(Version.LUCENE_4_9, null);
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
    writer = new IndexWriter(getIndexLuceneDirectory(), config);
    while (iterator.hasNext()) {
        doc = iterator.next();
        // keyed on the ORIGIN field: replaces any existing document with the
        // same origin, or simply adds the document if none exists (upsert)
        writer.updateDocument(new Term(ORIGIN, doc.get(ORIGIN)), doc);
    }
    writer.close();
}
From source file:net.riezebos.thoth.content.search.Indexer.java
License:Apache License
protected void addToIndex(IndexWriter writer, String resourcePath, String resourceType, String title,
        String contents, Map<String, String> metaTags) throws IOException {
    String extension = ThothUtil.getExtension(resourcePath);
    if (extension == null)
        extension = "";
    extension = extension.toLowerCase();

    Document document = new Document();
    document.add(new StringField(INDEX_PATH, resourcePath, Field.Store.YES));
    document.add(new TextField(INDEX_TYPE, resourceType, Store.YES));
    document.add(new TextField(INDEX_TITLE, title, Store.YES));
    document.add(new TextField(INDEX_CONTENTS, contents, Store.NO));
    document.add(new TextField(INDEX_USED, "true", Store.NO));
    document.add(new TextField(INDEX_EXTENSION, extension, Store.NO));
    metaTags.entrySet().stream().forEach(entry -> document
            .add(new TextField(entry.getKey().toLowerCase(), String.valueOf(entry.getValue()), Store.NO)));

    if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
        // New index, so we just add the document (no old document can be there):
        LOG.debug("Indexer for context " + contentManager.getContextName() + " added " + resourcePath);
        writer.addDocument(document);
    } else {
        // Existing index (an old copy of this document may have been indexed), so
        // we use updateDocument instead to replace the old one matching the exact
        // path, if present:
        LOG.debug("Indexer for context " + contentManager.getContextName() + " updated " + resourcePath);
        writer.updateDocument(new Term(INDEX_PATH, resourcePath), document);
    }
}
From source file:net.semanticmetadata.lire.imageanalysis.bovw.BOVWBuilder.java
License:Open Source License
public void indexMissing() throws IOException {
    init();
    // Reading clusters from disk:
    clusters = Cluster.readClusters(clusterFile);
    // create & store histograms:
    System.out.println("Creating histograms ...");
    LireFeature f = getFeatureInstance();
    // Needed for check whether the document is deleted.
    Bits liveDocs = MultiFields.getLiveDocs(reader);
    // based on bug report from Einav Itamar <einavitamar@gmail.com>
    IndexWriter iw = LuceneUtils.createIndexWriter(((DirectoryReader) reader).directory(), false,
            LuceneUtils.AnalyzerType.WhitespaceAnalyzer);
    int counter = 0;
    for (int i = 0; i < reader.maxDoc(); i++) {
        if (reader.hasDeletions() && !liveDocs.get(i))
            continue; // if it is deleted, just ignore it.
        Document d = reader.document(i);
        // Only if there are no values yet:
        if (d.getValues(visualWordsFieldName) == null || d.getValues(visualWordsFieldName).length == 0) {
            createVisualWords(d, f);
            // now write the new one. we use the identifier to update ;)
            iw.updateDocument(new Term(DocumentBuilder.FIELD_NAME_IDENTIFIER,
                    d.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0]), d);
            counter++;
        }
    }
    System.out.println(counter + " Documents were updated");
    iw.commit();
    // added to permanently remove the deleted docs.
    iw.forceMerge(1);
    iw.close();
    System.out.println("Finished.");
}
From source file:net.semanticmetadata.lire.imageanalysis.bovw.LocalFeatureHistogramBuilder.java
License:Open Source License
public void indexMissing() throws IOException {
    // Reading clusters from disk:
    clusters = Cluster.readClusters(clusterFile);
    // create & store histograms:
    System.out.println("Creating histograms ...");
    int[] tmpHist = new int[numClusters];
    LireFeature f = getFeatureInstance();
    IndexWriter iw = LuceneUtils.createIndexWriter(((DirectoryReader) reader).directory(), true,
            LuceneUtils.AnalyzerType.WhitespaceAnalyzer);
    for (int i = 0; i < reader.maxDoc(); i++) {
        // if (!reader.isDeleted(i)) {
        for (int j = 0; j < tmpHist.length; j++) {
            tmpHist[j] = 0;
        }
        Document d = reader.document(i);
        // Only if there are no values yet:
        if (d.getValues(visualWordsFieldName) == null || d.getValues(visualWordsFieldName).length == 0) {
            IndexableField[] fields = d.getFields(localFeatureFieldName);
            // find the appropriate cluster for each feature:
            for (int j = 0; j < fields.length; j++) {
                f.setByteArrayRepresentation(fields[j].binaryValue().bytes, fields[j].binaryValue().offset,
                        fields[j].binaryValue().length);
                tmpHist[clusterForFeature((Histogram) f)]++;
            }
            normalize(tmpHist);
            d.add(new TextField(visualWordsFieldName, arrayToVisualWordString(tmpHist), Field.Store.YES));
            d.add(new StringField(localFeatureHistFieldName, SerializationUtils.arrayToString(tmpHist),
                    Field.Store.YES));
            // now write the new one. we use the identifier to update ;)
            iw.updateDocument(new Term(DocumentBuilder.FIELD_NAME_IDENTIFIER,
                    d.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0]), d);
        }
        // }
    }
    iw.commit();
    iw.close();
    System.out.println("Finished.");
}
From source file:net.semanticmetadata.lire.imageanalysis.bovw.LocalFeatureHistogramBuilderKmeansPlusPlus.java
License:Open Source License
/**
 * Uses an existing index, where each and every document should have a set of local features. A number of
 * random images (numDocsForVocabulary) is selected and clustered to get a vocabulary of visual words
 * (the cluster means). For all images a histogram on the visual words is created and added to the documents.
 * Pre-existing histograms are deleted, so this method can be used for re-indexing.
 *
 * @throws java.io.IOException
 */
public void index() throws IOException {
    df.setMaximumFractionDigits(3);
    // find the documents for building the vocabulary:
    HashSet<Integer> docIDs = selectVocabularyDocs();
    System.out.println("Using " + docIDs.size() + " documents to build the vocabulary.");
    KMeansPlusPlusClusterer kpp = new KMeansPlusPlusClusterer(numClusters, 15);
    // fill the KMeans object:
    LinkedList<DoublePoint> features = new LinkedList<DoublePoint>();
    // Needed for check whether the document is deleted.
    Bits liveDocs = MultiFields.getLiveDocs(reader);
    for (Iterator<Integer> iterator = docIDs.iterator(); iterator.hasNext();) {
        int nextDoc = iterator.next();
        if (reader.hasDeletions() && !liveDocs.get(nextDoc))
            continue; // if it is deleted, just ignore it.
        Document d = reader.document(nextDoc);
        // features.clear();
        IndexableField[] fields = d.getFields(localFeatureFieldName);
        String file = d.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0];
        for (int j = 0; j < fields.length; j++) {
            LireFeature f = getFeatureInstance();
            f.setByteArrayRepresentation(fields[j].binaryValue().bytes, fields[j].binaryValue().offset,
                    fields[j].binaryValue().length);
            // copy the data over to new array ...
            double[] feat = new double[f.getDoubleHistogram().length];
            System.arraycopy(f.getDoubleHistogram(), 0, feat, 0, feat.length);
            features.add(new DoublePoint(f.getDoubleHistogram()));
        }
    }
    if (features.size() < numClusters) {
        // this cannot work. You need more data points than clusters.
        throw new UnsupportedOperationException("Only " + features.size() + " features found to cluster in "
                + numClusters + ". Try to use less clusters or more images.");
    }
    // do the clustering:
    System.out.println("Number of local features: " + df.format(features.size()));
    System.out.println("Starting clustering ...");
    List<CentroidCluster<DoublePoint>> clusterList = kpp.cluster(features);
    // TODO: Serializing clusters to a file on the disk ...
    System.out.println("Clustering finished, " + clusterList.size() + " clusters found");
    clusters = new LinkedList<double[]>();
    for (Iterator<CentroidCluster<DoublePoint>> iterator = clusterList.iterator(); iterator.hasNext();) {
        CentroidCluster<DoublePoint> centroidCluster = iterator.next();
        clusters.add(centroidCluster.getCenter().getPoint());
    }
    System.out.println("Creating histograms ...");
    int[] tmpHist = new int[numClusters];
    IndexWriter iw = LuceneUtils.createIndexWriter(((DirectoryReader) reader).directory(), true,
            LuceneUtils.AnalyzerType.WhitespaceAnalyzer, 256d);
    // careful: copy reader to RAM for faster access when reading ...
    // reader = IndexReader.open(new RAMDirectory(reader.directory()), true);
    LireFeature f = getFeatureInstance();
    for (int i = 0; i < reader.maxDoc(); i++) {
        try {
            if (reader.hasDeletions() && !liveDocs.get(i))
                continue;
            for (int j = 0; j < tmpHist.length; j++) {
                tmpHist[j] = 0;
            }
            Document d = reader.document(i);
            IndexableField[] fields = d.getFields(localFeatureFieldName);
            // remove the fields if they are already there ...
            d.removeField(visualWordsFieldName);
            d.removeField(localFeatureHistFieldName);
            // find the appropriate cluster for each feature:
            for (int j = 0; j < fields.length; j++) {
                f.setByteArrayRepresentation(fields[j].binaryValue().bytes, fields[j].binaryValue().offset,
                        fields[j].binaryValue().length);
                tmpHist[clusterForFeature(f, clusters)]++;
            }
            // System.out.println(Arrays.toString(tmpHist));
            d.add(new StoredField(localFeatureHistFieldName,
                    SerializationUtils.toByteArray(normalize(tmpHist))));
            quantize(tmpHist);
            d.add(new TextField(visualWordsFieldName, arrayToVisualWordString(tmpHist), Field.Store.YES));
            // remove local features to save some space if requested:
            if (DELETE_LOCAL_FEATURES) {
                d.removeFields(localFeatureFieldName);
            }
            // now write the new one. we use the identifier to update ;)
            iw.updateDocument(new Term(DocumentBuilder.FIELD_NAME_IDENTIFIER,
                    d.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0]), d);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    iw.commit(); // this one does the "old" commit(); it removes the deleted local features.
    iw.forceMerge(1);
    iw.close();
    System.out.println("Finished.");
}