Usage examples for org.apache.lucene.index.IndexWriter#addDocument
public long addDocument(Iterable<? extends IndexableField> doc) throws IOException
From source file:com.senseidb.search.node.inmemory.InMemorySenseiService.java
License:Apache License
/**
 * Rebuilds the in-memory index from the given JSON documents: the index is
 * wiped first, each non-null document is converted and added, and the result
 * is committed in one batch.
 *
 * @param directory the index directory (unused here; kept for the caller's signature)
 * @param writer    the writer backing the in-memory index
 * @param documents JSON documents to index; null entries are skipped
 * @throws RuntimeException wrapping any failure during deletion, indexing, or commit
 */
private void addDocuments(Directory directory, IndexWriter writer, List<JSONObject> documents) {
    try {
        // Start from an empty index so the batch fully replaces prior contents.
        writer.deleteAll();
        for (JSONObject document : documents) {
            if (document == null) {
                continue; // tolerate missing entries instead of failing the whole batch
            }
            writer.addDocument(buildDoc(document));
            pluggableSearchEngineManager.update(document, "");
        }
        writer.commit();
    } catch (Exception e) {
        // Surface any indexing failure to the caller as an unchecked exception.
        throw new RuntimeException(e);
    }
}
From source file:com.serendio.lingo3g.CreateLuceneIndex.java
License:Open Source License
public static void main(String[] args) throws Exception { if (args.length != 1) { System.out.println("Args: index-dir"); System.exit(-1);// ww w .j av a 2 s. c o m } File indexDir = new File(args[0]); if (indexDir.exists()) { System.out.println("Index directory already exists: " + indexDir.getAbsolutePath()); System.exit(-2); } Analyzer analyzer = new StandardAnalyzer(); IndexWriterConfig config = new IndexWriterConfig(analyzer); IndexWriter writer = new IndexWriter(FSDirectory.open(indexDir.toPath()), config); for (Document d : SampleDocumentData.DOCUMENTS_DATA_MINING) { final org.apache.lucene.document.Document doc = new org.apache.lucene.document.Document(); /* * We will create Lucene documents with searchable "fullContent" field and "title", * "url" and "snippet" fields for clustering. */ doc.add(new TextField("fullContent", d.getSummary(), Store.NO)); doc.add(new TextField("title", d.getTitle(), Store.YES)); doc.add(new TextField("snippet", d.getSummary(), Store.YES)); doc.add(new StringField("url", d.getContentUrl(), Store.YES)); writer.addDocument(doc); } writer.close(); }
From source file:com.sg.business.vault.index.demo.IndexFiles.java
License:Apache License
/** * Indexes the given file using the given writer, or if a directory is given, * recurses over files and directories found under the given directory. * // www. j av a 2 s.c o m * NOTE: This method indexes one document per input file. This is slow. For good * throughput, put multiple documents into your input file(s). An example of this is * in the benchmark module, which can create "line doc" files, one document per line, * using the * <a href="../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html" * >WriteLineDocTask</a>. * * @param writer Writer to the index where the given file/dir info will be stored * @param file The file to index, or the directory to recurse into to find files to index * @throws IOException If there is a low-level I/O error */ static void indexDocs(IndexWriter writer, File file) throws IOException { // do not try to index files that cannot be read if (file.canRead()) { if (file.isDirectory()) { String[] files = file.list(); // an IO error could occur if (files != null) { for (int i = 0; i < files.length; i++) { indexDocs(writer, new File(file, files[i])); } } } else { FileInputStream fis; try { fis = new FileInputStream(file); } catch (FileNotFoundException fnfe) { // at least on windows, some temporary files raise this exception with an "access denied" message // checking if the file can be read doesn't help return; } try { // make a new, empty document Document doc = new Document(); // Add the path of the file as a field named "path". Use a // field that is indexed (i.e. searchable), but don't tokenize // the field into separate words and don't index term frequency // or positional information: Field pathField = new StringField("path", file.getPath(), Field.Store.YES); //$NON-NLS-1$ doc.add(pathField); // Add the last modified date of the file a field named "modified". // Use a LongField that is indexed (i.e. efficiently filterable with // NumericRangeFilter). 
This indexes to milli-second resolution, which // is often too fine. You could instead create a number based on // year/month/day/hour/minutes/seconds, down the resolution you require. // For example the long value 2011021714 would mean // February 17, 2011, 2-3 PM. // doc.add(new LongField("modified", file.lastModified(), Field.Store.NO)); // Add the contents of the file to a field named "contents". Specify a Reader, // so that the text of the file is tokenized and indexed, but not stored. // Note that FileReader expects the file to be in UTF-8 encoding. // If that's not the case searching for special characters will fail. // doc.add(new TextField("contents", new BufferedReader(new InputStreamReader(fis, "UTF-8")))); try { doc.add(new TextField("contents", FileUtil.getContent(file.getName(), fis), //$NON-NLS-1$ Field.Store.NO)); } catch (Exception e) { e.printStackTrace(); } if (writer.getConfig().getOpenMode() == OpenMode.CREATE) { // New index, so we just add the document (no old document can be there): System.out.println("adding " + file); //$NON-NLS-1$ writer.addDocument(doc); } else { // Existing index (an old copy of this document may have been indexed) so // we use updateDocument instead to replace the old one matching the exact // path, if present: System.out.println("updating " + file); //$NON-NLS-1$ writer.updateDocument(new Term("path", file.getPath()), doc); //$NON-NLS-1$ } } finally { fis.close(); } } } }
From source file:com.shaie.annots.AnnotationSearchExample.java
License:Apache License
public static void main(String[] args) throws Exception { Directory dir = new RAMDirectory(); IndexWriterConfig conf = new IndexWriterConfig(new WhitespaceAnalyzer()); IndexWriter writer = new IndexWriter(dir, conf); // we need to add the annotation as a TokenStream field, therefore cannot use an Analyzer passed in the // IndexWriterConfig. Tokenizer tokenizer = new WhitespaceTokenizer(); tokenizer.setReader(new StringReader("quick brown fox ate the blue red chicken")); TeeSinkTokenFilter textStream = new TeeSinkTokenFilter(tokenizer); TokenStream colorAnnotationStream = new AnnotatingTokenFilter( textStream.newSinkTokenStream(new ColorsSinkFilter()), COLOR_ANNOT_TERM); Document doc = new Document(); doc.add(new TextField("text", textStream)); doc.add(new TextField("annot", colorAnnotationStream)); writer.addDocument(doc); writer.close();//from ww w . j av a 2 s .c om DirectoryReader reader = DirectoryReader.open(dir); LeafReader ar = reader.leaves().get(0).reader(); // we only have one segment printFieldTerms(ar, "text"); System.out.println(); final ByteArrayDataInput in = new ByteArrayDataInput(); PostingsEnum dape = ar.postings(new Term("annot", COLOR_ANNOT_TERM)); int docID = dape.nextDoc(); int freq = dape.freq(); System.out.println("Color annotation spans: doc=" + docID + ", freq=" + freq); for (int i = 0; i < freq; i++) { dape.nextPosition(); BytesRef payload = dape.getPayload(); in.reset(payload.bytes, payload.offset, payload.length); System.out.println(" start=" + in.readVInt() + ", length=" + in.readVInt()); } IndexSearcher searcher = new IndexSearcher(reader); System.out.println("\nsearching for 'red WITHIN color':"); Query q = new SpanWithinQuery(new SpanAnnotationTermQuery(new Term("annot", COLOR_ANNOT_TERM)), new SpanInclusivePositionTermQuery(new Term("text", "red"))); TopDocs td = searcher.search(q, 10); System.out.println(" num results: " + td.scoreDocs.length); System.out.println("\nsearching for 'ate WITHIN color':"); q = new SpanWithinQuery(new 
SpanAnnotationTermQuery(new Term("annot", COLOR_ANNOT_TERM)), new SpanInclusivePositionTermQuery(new Term("text", "ate"))); td = searcher.search(q, 10); System.out.println(" num results: " + td.scoreDocs.length); reader.close(); dir.close(); }
From source file:com.shaie.annots.example.AnnotatorAnyExample.java
License:Apache License
/**
 * Indexes one document whose tokens are teed into the text field and two
 * annotation fields (colors and animals), each wrapped so any annotated
 * token matches (AnyAnnotationTokenFilter).
 *
 * @param writer destination index writer
 * @param text   raw document text; also stored verbatim for retrieval
 * @throws IOException on any indexing error
 */
@SuppressWarnings("resource")
private static void addDocument(IndexWriter writer, String text) throws IOException {
    final Tokenizer source = new WhitespaceTokenizer();
    source.setReader(new StringReader(text));
    // Tee the single token stream so the same tokens feed all three indexed fields.
    final TeeSinkTokenFilter tee = new TeeSinkTokenFilter(source);
    final TokenStream colors = new AnyAnnotationTokenFilter(
            new AnnotatorTokenFilter(tee.newSinkTokenStream(), ColorAnnotator.withDefaultColors()));
    final TokenStream animals = new AnyAnnotationTokenFilter(
            new AnnotatorTokenFilter(tee.newSinkTokenStream(), AnimalAnnotator.withDefaultAnimals()));

    final Document document = new Document();
    document.add(new StoredField(TEXT_FIELD, text)); // raw text, stored only
    document.add(new TextField(TEXT_FIELD, tee));    // indexed tokens
    document.add(new TextField(COLOR_FIELD, colors));
    document.add(new TextField(ANIMAL_FIELD, animals));
    writer.addDocument(document);
}
From source file:com.shaie.annots.example.AnnotatorTeeSinkFilterExample.java
License:Apache License
/**
 * Indexes one document whose tokens are teed into the text field and two
 * annotator-filtered fields (colors and animals).
 *
 * @param writer destination index writer
 * @param text   raw document text; also stored verbatim for retrieval
 * @throws IOException on any indexing error
 */
@SuppressWarnings("resource")
private static void addDocument(IndexWriter writer, String text) throws IOException {
    final Tokenizer source = new WhitespaceTokenizer();
    source.setReader(new StringReader(text));
    // Tee the single token stream so the same tokens feed all three indexed fields.
    final TeeSinkTokenFilter tee = new TeeSinkTokenFilter(source);
    final TokenStream colors =
            new AnnotatorTokenFilter(tee.newSinkTokenStream(), ColorAnnotator.withDefaultColors());
    final TokenStream animals =
            new AnnotatorTokenFilter(tee.newSinkTokenStream(), AnimalAnnotator.withDefaultAnimals());

    final Document document = new Document();
    document.add(new StoredField(TEXT_FIELD, text)); // raw text, stored only
    document.add(new TextField(TEXT_FIELD, tee));    // indexed tokens
    document.add(new TextField(COLOR_FIELD, colors));
    document.add(new TextField(ANIMAL_FIELD, animals));
    writer.addDocument(document);
}
From source file:com.shaie.annots.example.AnnotatorTokenFilterExample.java
License:Apache License
/**
 * Indexes one document, feeding the same raw text into three fields: the
 * searchable stored text field and the two (unstored) annotation fields.
 *
 * @param writer destination index writer
 * @param text   raw document text
 * @throws IOException on any indexing error
 */
private static void addDocument(IndexWriter writer, String text) throws IOException {
    final Document document = new Document();
    document.add(new TextField(TEXT_FIELD, text, Store.YES));
    document.add(new TextField(COLOR_FIELD, text, Store.NO));
    document.add(new TextField(ANIMAL_FIELD, text, Store.NO));
    writer.addDocument(document);
}
From source file:com.shaie.annots.example.PreAnnotatedTokenFilterExample.java
License:Apache License
@SuppressWarnings("resource") private static void addDocument(IndexWriter writer, String text, int... colorAnnotations) throws IOException { final Tokenizer tokenizer = new WhitespaceTokenizer(); tokenizer.setReader(new StringReader(text)); final TeeSinkTokenFilter textStream = new TeeSinkTokenFilter(tokenizer); final TokenStream colorsStream = new PreAnnotatedTokenFilter(textStream.newSinkTokenStream(), colorAnnotations);//from w w w. j a v a 2 s . co m final Document doc = new Document(); doc.add(new StoredField(TEXT_FIELD, text)); doc.add(new TextField(TEXT_FIELD, textStream)); doc.add(new TextField(COLOR_FIELD, colorsStream)); writer.addDocument(doc); }
From source file:com.shaie.annots.example.SimplePreAnnotatedTokenFilterExample.java
License:Apache License
/**
 * Indexes one document with its tokens teed into the text field and a color
 * field driven by pre-computed annotation offsets, wrapped so any annotated
 * token matches (AnyAnnotationTokenFilter).
 *
 * @param writer           destination index writer
 * @param text             raw document text; also stored verbatim for retrieval
 * @param colorAnnotations pre-computed annotation positions for the color field
 * @throws IOException on any indexing error
 */
@SuppressWarnings("resource")
private static void addDocument(IndexWriter writer, String text, int... colorAnnotations)
        throws IOException {
    final Tokenizer source = new WhitespaceTokenizer();
    source.setReader(new StringReader(text));
    // Tee the single token stream so the same tokens feed both indexed fields.
    final TeeSinkTokenFilter tee = new TeeSinkTokenFilter(source);
    final TokenStream colors = new AnyAnnotationTokenFilter(
            new SimplePreAnnotatedTokenFilter(tee.newSinkTokenStream(), colorAnnotations));

    final Document document = new Document();
    document.add(new StoredField(TEXT_FIELD, text)); // raw text, stored only
    document.add(new TextField(TEXT_FIELD, tee));    // indexed tokens
    document.add(new TextField(COLOR_FIELD, colors));
    writer.addDocument(document);
}
From source file:com.shaie.PhraseVsSpanQuery.java
License:Apache License
@SuppressWarnings("resource") public static void main(String[] args) throws Exception { final Directory dir = new RAMDirectory(); final IndexWriterConfig conf = new IndexWriterConfig(new WhitespaceAnalyzer()); final IndexWriter writer = new IndexWriter(dir, conf); final Document doc = new Document(); doc.add(new TextField("f", new TokenStream() { final PositionIncrementAttribute pos = addAttribute(PositionIncrementAttribute.class); final CharTermAttribute term = addAttribute(CharTermAttribute.class); boolean first = true, done = false; @Override//w w w . jav a 2 s . co m public boolean incrementToken() throws IOException { if (done) { return false; } if (first) { term.setEmpty().append("a"); pos.setPositionIncrement(1); first = false; } else { term.setEmpty().append("b"); pos.setPositionIncrement(0); done = true; } return true; } })); writer.addDocument(doc); writer.close(); final DirectoryReader reader = DirectoryReader.open(dir); final IndexSearcher searcher = new IndexSearcher(reader); final LeafReader ar = reader.leaves().get(0).reader(); final TermsEnum te = ar.terms("f").iterator(); BytesRef scratch = new BytesRef(); while ((scratch = te.next()) != null) { System.out.println(scratch.utf8ToString()); final PostingsEnum dape = ar.postings(new Term("f", scratch.utf8ToString())); System.out.println(" doc=" + dape.nextDoc() + ", pos=" + dape.nextPosition()); } System.out.println(); // try a phrase query with a slop final PhraseQuery pqNoSlop = buildPhraseQuery(0); System.out.println("searching for \"a b\"; num results = " + searcher.search(pqNoSlop, 10).totalHits); final PhraseQuery pqSlop1 = buildPhraseQuery(1); System.out.println("searching for \"a b\"~1; num results = " + searcher.search(pqSlop1, 10).totalHits); final PhraseQuery pqSlop3 = buildPhraseQuery(3); System.out.println("searching for \"a b\"~3; num results = " + searcher.search(pqSlop3, 10).totalHits); final SpanNearQuery snqUnOrdered = new SpanNearQuery( new SpanQuery[] { new SpanTermQuery(new 
Term("f", "a")), new SpanTermQuery(new Term("f", "b")) }, 1, false); System.out.println("searching for SpanNearUnordered('a', 'b'), slop=1; num results = " + searcher.search(snqUnOrdered, 10).totalHits); final SpanNearQuery snqOrdered = new SpanNearQuery( new SpanQuery[] { new SpanTermQuery(new Term("f", "a")), new SpanTermQuery(new Term("f", "b")) }, 1, true); System.out.println("searching for SpanNearOrdered('a', 'b'), slop=1; num results = " + searcher.search(snqOrdered, 10).totalHits); reader.close(); }