Example usage for org.apache.lucene.index IndexWriter addDocument

List of usage examples for org.apache.lucene.index IndexWriter addDocument

Introduction

In this page you can find the example usage for org.apache.lucene.index IndexWriter addDocument.

Prototype

public long addDocument(Iterable<? extends IndexableField> doc) throws IOException 

Source Link

Document

Adds a document to this index.

Usage

From source file:com.senseidb.search.node.inmemory.InMemorySenseiService.java

License:Apache License

private void addDocuments(Directory directory, IndexWriter writer, List<JSONObject> documents) {
    try {/*w  ww  .j a va2s  . com*/
        writer.deleteAll();
        for (JSONObject doc : documents) {
            if (doc == null)
                continue;
            writer.addDocument(buildDoc(doc));
            pluggableSearchEngineManager.update(doc, "");
        }
        writer.commit();

    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}

From source file:com.serendio.lingo3g.CreateLuceneIndex.java

License:Open Source License

public static void main(String[] args) throws Exception {
    if (args.length != 1) {
        System.out.println("Args: index-dir");
        System.exit(-1);// ww w  .j  av  a  2 s.  c o  m
    }

    File indexDir = new File(args[0]);
    if (indexDir.exists()) {
        System.out.println("Index directory already exists: " + indexDir.getAbsolutePath());
        System.exit(-2);
    }

    Analyzer analyzer = new StandardAnalyzer();
    IndexWriterConfig config = new IndexWriterConfig(analyzer);
    IndexWriter writer = new IndexWriter(FSDirectory.open(indexDir.toPath()), config);

    for (Document d : SampleDocumentData.DOCUMENTS_DATA_MINING) {
        final org.apache.lucene.document.Document doc = new org.apache.lucene.document.Document();
        /*
         * We will create Lucene documents with searchable "fullContent" field and "title", 
         * "url" and "snippet" fields for clustering.
         */
        doc.add(new TextField("fullContent", d.getSummary(), Store.NO));

        doc.add(new TextField("title", d.getTitle(), Store.YES));
        doc.add(new TextField("snippet", d.getSummary(), Store.YES));
        doc.add(new StringField("url", d.getContentUrl(), Store.YES));
        writer.addDocument(doc);
    }

    writer.close();
}

From source file:com.sg.business.vault.index.demo.IndexFiles.java

License:Apache License

/**
 * Indexes the given file using the given writer, or if a directory is given,
 * recurses over files and directories found under the given directory.
 * // www.  j  av  a  2  s.c  o m
 * NOTE: This method indexes one document per input file.  This is slow.  For good
 * throughput, put multiple documents into your input file(s).  An example of this is
 * in the benchmark module, which can create "line doc" files, one document per line,
 * using the
 * <a href="../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html"
 * >WriteLineDocTask</a>.
 *  
 * @param writer Writer to the index where the given file/dir info will be stored
 * @param file The file to index, or the directory to recurse into to find files to index
 * @throws IOException If there is a low-level I/O error
 */
static void indexDocs(IndexWriter writer, File file) throws IOException {
    // do not try to index files that cannot be read
    if (file.canRead()) {
        if (file.isDirectory()) {
            String[] files = file.list();
            // an IO error could occur
            if (files != null) {
                for (int i = 0; i < files.length; i++) {
                    indexDocs(writer, new File(file, files[i]));
                }
            }
        } else {

            FileInputStream fis;
            try {
                fis = new FileInputStream(file);
            } catch (FileNotFoundException fnfe) {
                // at least on windows, some temporary files raise this exception with an "access denied" message
                // checking if the file can be read doesn't help
                return;
            }

            try {

                // make a new, empty document
                Document doc = new Document();

                // Add the path of the file as a field named "path".  Use a
                // field that is indexed (i.e. searchable), but don't tokenize 
                // the field into separate words and don't index term frequency
                // or positional information:
                Field pathField = new StringField("path", file.getPath(), Field.Store.YES); //$NON-NLS-1$
                doc.add(pathField);

                // Add the last modified date of the file a field named "modified".
                // Use a LongField that is indexed (i.e. efficiently filterable with
                // NumericRangeFilter).  This indexes to milli-second resolution, which
                // is often too fine.  You could instead create a number based on
                // year/month/day/hour/minutes/seconds, down the resolution you require.
                // For example the long value 2011021714 would mean
                // February 17, 2011, 2-3 PM.
                //          doc.add(new LongField("modified", file.lastModified(), Field.Store.NO));

                // Add the contents of the file to a field named "contents".  Specify a Reader,
                // so that the text of the file is tokenized and indexed, but not stored.
                // Note that FileReader expects the file to be in UTF-8 encoding.
                // If that's not the case searching for special characters will fail.
                //          doc.add(new TextField("contents", new BufferedReader(new InputStreamReader(fis, "UTF-8"))));

                try {
                    doc.add(new TextField("contents", FileUtil.getContent(file.getName(), fis), //$NON-NLS-1$
                            Field.Store.NO));
                } catch (Exception e) {
                    e.printStackTrace();
                }

                if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                    // New index, so we just add the document (no old document can be there):
                    System.out.println("adding " + file); //$NON-NLS-1$
                    writer.addDocument(doc);
                } else {
                    // Existing index (an old copy of this document may have been indexed) so 
                    // we use updateDocument instead to replace the old one matching the exact 
                    // path, if present:
                    System.out.println("updating " + file); //$NON-NLS-1$
                    writer.updateDocument(new Term("path", file.getPath()), doc); //$NON-NLS-1$
                }

            } finally {
                fis.close();
            }
        }
    }
}

From source file:com.shaie.annots.AnnotationSearchExample.java

License:Apache License

public static void main(String[] args) throws Exception {
    Directory dir = new RAMDirectory();
    IndexWriterConfig conf = new IndexWriterConfig(new WhitespaceAnalyzer());
    IndexWriter writer = new IndexWriter(dir, conf);

    // we need to add the annotation as a TokenStream field, therefore cannot use an Analyzer passed in the
    // IndexWriterConfig.
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("quick brown fox ate the blue red chicken"));
    TeeSinkTokenFilter textStream = new TeeSinkTokenFilter(tokenizer);
    TokenStream colorAnnotationStream = new AnnotatingTokenFilter(
            textStream.newSinkTokenStream(new ColorsSinkFilter()), COLOR_ANNOT_TERM);

    Document doc = new Document();
    doc.add(new TextField("text", textStream));
    doc.add(new TextField("annot", colorAnnotationStream));
    writer.addDocument(doc);

    writer.close();//from   ww w  . j av a 2 s .c om

    DirectoryReader reader = DirectoryReader.open(dir);
    LeafReader ar = reader.leaves().get(0).reader(); // we only have one segment
    printFieldTerms(ar, "text");
    System.out.println();

    final ByteArrayDataInput in = new ByteArrayDataInput();
    PostingsEnum dape = ar.postings(new Term("annot", COLOR_ANNOT_TERM));
    int docID = dape.nextDoc();
    int freq = dape.freq();
    System.out.println("Color annotation spans: doc=" + docID + ", freq=" + freq);
    for (int i = 0; i < freq; i++) {
        dape.nextPosition();
        BytesRef payload = dape.getPayload();
        in.reset(payload.bytes, payload.offset, payload.length);
        System.out.println("  start=" + in.readVInt() + ", length=" + in.readVInt());
    }

    IndexSearcher searcher = new IndexSearcher(reader);

    System.out.println("\nsearching for 'red WITHIN color':");
    Query q = new SpanWithinQuery(new SpanAnnotationTermQuery(new Term("annot", COLOR_ANNOT_TERM)),
            new SpanInclusivePositionTermQuery(new Term("text", "red")));
    TopDocs td = searcher.search(q, 10);
    System.out.println("  num results: " + td.scoreDocs.length);

    System.out.println("\nsearching for 'ate WITHIN color':");
    q = new SpanWithinQuery(new SpanAnnotationTermQuery(new Term("annot", COLOR_ANNOT_TERM)),
            new SpanInclusivePositionTermQuery(new Term("text", "ate")));
    td = searcher.search(q, 10);
    System.out.println("  num results: " + td.scoreDocs.length);

    reader.close();
    dir.close();
}

From source file:com.shaie.annots.example.AnnotatorAnyExample.java

License:Apache License

@SuppressWarnings("resource")
private static void addDocument(IndexWriter writer, String text) throws IOException {
    final Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(text));
    final TeeSinkTokenFilter textStream = new TeeSinkTokenFilter(tokenizer);
    final TokenStream colorsStream = new AnyAnnotationTokenFilter(
            new AnnotatorTokenFilter(textStream.newSinkTokenStream(), ColorAnnotator.withDefaultColors()));
    final TokenStream animalsStream = new AnyAnnotationTokenFilter(
            new AnnotatorTokenFilter(textStream.newSinkTokenStream(), AnimalAnnotator.withDefaultAnimals()));

    final Document doc = new Document();
    doc.add(new StoredField(TEXT_FIELD, text));
    doc.add(new TextField(TEXT_FIELD, textStream));
    doc.add(new TextField(COLOR_FIELD, colorsStream));
    doc.add(new TextField(ANIMAL_FIELD, animalsStream));
    writer.addDocument(doc);
}

From source file:com.shaie.annots.example.AnnotatorTeeSinkFilterExample.java

License:Apache License

@SuppressWarnings("resource")
private static void addDocument(IndexWriter writer, String text) throws IOException {
    final Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(text));
    final TeeSinkTokenFilter textStream = new TeeSinkTokenFilter(tokenizer);
    final TokenStream colorsStream = new AnnotatorTokenFilter(textStream.newSinkTokenStream(),
            ColorAnnotator.withDefaultColors());
    final TokenStream animalsStream = new AnnotatorTokenFilter(textStream.newSinkTokenStream(),
            AnimalAnnotator.withDefaultAnimals());

    final Document doc = new Document();
    doc.add(new StoredField(TEXT_FIELD, text));
    doc.add(new TextField(TEXT_FIELD, textStream));
    doc.add(new TextField(COLOR_FIELD, colorsStream));
    doc.add(new TextField(ANIMAL_FIELD, animalsStream));
    writer.addDocument(doc);
}

From source file:com.shaie.annots.example.AnnotatorTokenFilterExample.java

License:Apache License

private static void addDocument(IndexWriter writer, String text) throws IOException {
    final Document doc = new Document();
    doc.add(new TextField(TEXT_FIELD, text, Store.YES));
    doc.add(new TextField(COLOR_FIELD, text, Store.NO));
    doc.add(new TextField(ANIMAL_FIELD, text, Store.NO));
    writer.addDocument(doc);
}

From source file:com.shaie.annots.example.PreAnnotatedTokenFilterExample.java

License:Apache License

@SuppressWarnings("resource")
private static void addDocument(IndexWriter writer, String text, int... colorAnnotations) throws IOException {
    final Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(text));
    final TeeSinkTokenFilter textStream = new TeeSinkTokenFilter(tokenizer);
    final TokenStream colorsStream = new PreAnnotatedTokenFilter(textStream.newSinkTokenStream(),
            colorAnnotations);//from  w w w. j  a v a 2 s . co  m

    final Document doc = new Document();
    doc.add(new StoredField(TEXT_FIELD, text));
    doc.add(new TextField(TEXT_FIELD, textStream));
    doc.add(new TextField(COLOR_FIELD, colorsStream));
    writer.addDocument(doc);
}

From source file:com.shaie.annots.example.SimplePreAnnotatedTokenFilterExample.java

License:Apache License

@SuppressWarnings("resource")
private static void addDocument(IndexWriter writer, String text, int... colorAnnotations) throws IOException {
    final Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(text));
    final TeeSinkTokenFilter textStream = new TeeSinkTokenFilter(tokenizer);
    final TokenStream colorsStream = new AnyAnnotationTokenFilter(
            new SimplePreAnnotatedTokenFilter(textStream.newSinkTokenStream(), colorAnnotations));

    final Document doc = new Document();
    doc.add(new StoredField(TEXT_FIELD, text));
    doc.add(new TextField(TEXT_FIELD, textStream));
    doc.add(new TextField(COLOR_FIELD, colorsStream));
    writer.addDocument(doc);
}

From source file:com.shaie.PhraseVsSpanQuery.java

License:Apache License

@SuppressWarnings("resource")
public static void main(String[] args) throws Exception {
    final Directory dir = new RAMDirectory();
    final IndexWriterConfig conf = new IndexWriterConfig(new WhitespaceAnalyzer());
    final IndexWriter writer = new IndexWriter(dir, conf);

    final Document doc = new Document();
    doc.add(new TextField("f", new TokenStream() {
        final PositionIncrementAttribute pos = addAttribute(PositionIncrementAttribute.class);
        final CharTermAttribute term = addAttribute(CharTermAttribute.class);
        boolean first = true, done = false;

        @Override//w  w  w  . jav  a 2 s . co m
        public boolean incrementToken() throws IOException {
            if (done) {
                return false;
            }
            if (first) {
                term.setEmpty().append("a");
                pos.setPositionIncrement(1);
                first = false;
            } else {
                term.setEmpty().append("b");
                pos.setPositionIncrement(0);
                done = true;
            }
            return true;
        }
    }));
    writer.addDocument(doc);
    writer.close();

    final DirectoryReader reader = DirectoryReader.open(dir);
    final IndexSearcher searcher = new IndexSearcher(reader);
    final LeafReader ar = reader.leaves().get(0).reader();
    final TermsEnum te = ar.terms("f").iterator();
    BytesRef scratch = new BytesRef();
    while ((scratch = te.next()) != null) {
        System.out.println(scratch.utf8ToString());
        final PostingsEnum dape = ar.postings(new Term("f", scratch.utf8ToString()));
        System.out.println("  doc=" + dape.nextDoc() + ", pos=" + dape.nextPosition());
    }

    System.out.println();

    // try a phrase query with a slop
    final PhraseQuery pqNoSlop = buildPhraseQuery(0);
    System.out.println("searching for \"a b\"; num results = " + searcher.search(pqNoSlop, 10).totalHits);

    final PhraseQuery pqSlop1 = buildPhraseQuery(1);
    System.out.println("searching for \"a b\"~1; num results = " + searcher.search(pqSlop1, 10).totalHits);

    final PhraseQuery pqSlop3 = buildPhraseQuery(3);
    System.out.println("searching for \"a b\"~3; num results = " + searcher.search(pqSlop3, 10).totalHits);

    final SpanNearQuery snqUnOrdered = new SpanNearQuery(
            new SpanQuery[] { new SpanTermQuery(new Term("f", "a")), new SpanTermQuery(new Term("f", "b")) }, 1,
            false);
    System.out.println("searching for SpanNearUnordered('a', 'b'), slop=1; num results = "
            + searcher.search(snqUnOrdered, 10).totalHits);

    final SpanNearQuery snqOrdered = new SpanNearQuery(
            new SpanQuery[] { new SpanTermQuery(new Term("f", "a")), new SpanTermQuery(new Term("f", "b")) }, 1,
            true);
    System.out.println("searching for SpanNearOrdered('a', 'b'), slop=1; num results = "
            + searcher.search(snqOrdered, 10).totalHits);

    reader.close();
}