Example usage for org.apache.lucene.index IndexWriter addDocument

List of usage examples for org.apache.lucene.index IndexWriter addDocument

Introduction

On this page you can find example usages of org.apache.lucene.index IndexWriter addDocument.

Prototype

public long addDocument(Iterable<? extends IndexableField> doc) throws IOException 

Document

Adds a document to this index.
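For orientation, here is a minimal, self-contained sketch of the call against a recent Lucene API (7.x or later, where addDocument returns the sequence number assigned to the operation). The class name, index path, and field names are illustrative only and are not taken from any of the examples below.

import java.io.IOException;
import java.nio.file.Paths;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class AddDocumentSketch {
    public static void main(String[] args) throws IOException {
        // Open (or create) an index directory; try-with-resources closes the writer and directory.
        try (Directory dir = FSDirectory.open(Paths.get("example-index"));
                IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()))) {
            Document doc = new Document();
            doc.add(new StringField("id", "42", Field.Store.YES));          // indexed verbatim, not tokenized
            doc.add(new TextField("body", "hello lucene", Field.Store.NO)); // tokenized and indexed, not stored
            long seqNo = writer.addDocument(doc); // sequence number of this operation
            writer.commit();
            System.out.println("added document, sequence number " + seqNo);
        }
    }
}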

Usage

From source file:com.google.gerrit.server.change.ReviewerSuggestionCache.java

License:Apache License

private void addAccount(IndexWriter writer, Account a) throws IOException, OrmException {
    Document doc = new Document();
    doc.add(new IntField(ID, a.getId().get(), Store.YES));
    if (a.getFullName() != null) {
        doc.add(new TextField(NAME, a.getFullName(), Store.YES));
    }
    if (a.getPreferredEmail() != null) {
        doc.add(new StringField(EMAIL, a.getPreferredEmail().toLowerCase(), Store.YES));
        doc.add(new TextField(EMAIL, a.getPreferredEmail(), Store.YES));
    }
    AccountExternalIdAccess extIdAccess = db.get().accountExternalIds();
    String username = AccountState.getUserName(extIdAccess.byAccount(a.getId()).toList());
    if (username != null) {
        doc.add(new StringField(USERNAME, username, Store.YES));
    }
    writer.addDocument(doc);
}

From source file:com.greplin.lucene.filter.PhraseFilterBenchmark.java

License:Apache License

public static void main(String[] argv) {
    Directory directory = new RAMDirectory();
    try {
        IndexWriter writer = new IndexWriter(directory,
                new IndexWriterConfig(Version.LUCENE_32, new WhitespaceAnalyzer(Version.LUCENE_32)));
        int done = 0;
        for (int i = 0; i < NUMBER_OF_SEGMENTS; i++) {
            int remaining = NUMBER_OF_SEGMENTS - i;
            int numberOfDocs;
            if (remaining == 1) {
                numberOfDocs = TOTAL_DOCS - done;
            } else {
                numberOfDocs = RANDOM.nextInt(TOTAL_DOCS - done - remaining) + 1;
            }
            done += numberOfDocs;
            System.out.println("Segment #" + i + " has " + numberOfDocs + " docs");

            for (int d = 0; d < numberOfDocs; d++) {
                int wordCount = RANDOM.nextInt(WORDS_PER_DOC_DEVIATION * 2) + AVERAGE_WORDS_PER_DOC
                        - WORDS_PER_DOC_DEVIATION;
                Document doc = new Document();
                doc.add(new Field("f", Joiner.on(' ').join(words(wordCount)), Field.Store.YES,
                        Field.Index.ANALYZED));
                doc.add(new Field("second", RANDOM.nextInt(100) < SECOND_FIELD_MATCH_PERCENTAGE ? "yes" : "no",
                        Field.Store.NO, Field.Index.ANALYZED));
                writer.addDocument(doc);
            }
            writer.commit();
        }
        writer.close();

        IndexReader reader = IndexReader.open(directory);
        IndexSearcher searcher = new IndexSearcher(reader);

        String[][] queries = new String[TOTAL_QUERIES][];
        Term[][] terms = new Term[TOTAL_QUERIES][];

        for (int q = 0; q < TOTAL_QUERIES; q++) {
            queries[q] = words(WORDS_PER_QUERY[RANDOM.nextInt(WORDS_PER_QUERY.length)]);
            terms[q] = new Term[queries[q].length];
            for (int qw = 0; qw < queries[q].length; qw++) {
                terms[q][qw] = new Term(FIELD, queries[q][qw]);
            }
        }

        // Warm up.
        new PhraseFilter(FIELD, queries[0]).getDocIdSet(reader);

        for (int round = 0; round < ROUNDS; round++) {
            System.out.println();
            String name1 = "filter";
            String name2 = "query";

            long ms1 = 0, ms2 = 0;
            for (int step = 0; step < 2; step++) {
                System.gc();
                System.gc();
                System.gc();

                if (step == (round & 1)) {
                    long millis = System.currentTimeMillis();
                    long hits = 0;
                    for (String[] queryWords : queries) {
                        PhraseFilter pf = new PhraseFilter(
                                new FilterIntersectionProvider(TermsFilter.from(new Term("second", "yes"))),
                                FIELD, queryWords);
                        hits += searcher.search(new FilteredQuery(new MatchAllDocsQuery(), pf), 1).totalHits;
                    }
                    ms1 = System.currentTimeMillis() - millis;
                    System.out.println("Finished " + name1 + " in " + ms1 + "ms with " + hits + " hits");
                } else {
                    long millis = System.currentTimeMillis();
                    long hits = 0;
                    for (Term[] queryTerms : terms) {
                        PhraseQuery pq = new PhraseQuery();
                        for (Term term : queryTerms) {
                            pq.add(term);
                        }
                        Query query = BooleanQueryBuilder.builder()
                                .must(new TermQuery(new Term("second", "yes"))).must(pq).build();
                        hits += searcher.search(query, 1).totalHits;
                    }
                    ms2 = System.currentTimeMillis() - millis;
                    System.out.println("Finished " + name2 + " in " + ms2 + "ms with " + hits + " hits");
                }
            }
            System.out.println(name1 + " took " + (int) ((100.0 * ms1) / ms2) + "% as much time as " + name2);
        }

    } catch (IOException e) {
        e.printStackTrace();
    }
}

From source file:com.greplin.lucene.query.PredicateBonusQueryTest.java

License:Apache License

@Test
public void testBasics() throws Exception {
    IndexWriter writer = new IndexWriter(this.directory,
            new IndexWriterConfig(Version.LUCENE_35, new WhitespaceAnalyzer(Version.LUCENE_35)));
    writer.addDocument(new DocumentBuilder().add("value", "5").build());
    writer.close();

    IndexReader reader = IndexReader.open(this.directory);
    IndexSearcher searcher = new IndexSearcher(reader);

    Query query = new ConstantScoreQuery(new TermQuery(new Term("value", "5")));
    Assert.assertEquals(1.0, searcher.search(query, 1).getMaxScore(), 0.00001);

    Query noBonus = new PredicateBonusQuery(query, Predicates.NONE, 10.0f);
    Assert.assertEquals(1.0, searcher.search(noBonus, 1).getMaxScore(), 0.00001);

    Query bonus = new PredicateBonusQuery(query, Predicates.ALL, 100.0f);
    Assert.assertEquals(101.0, searcher.search(bonus, 1).getMaxScore(), 0.00001);

    Query noMatch = new TermQuery(new Term("value", "not5"));
    Assert.assertEquals(Double.NaN, searcher.search(noMatch, 1).getMaxScore(), 0.00001);

    Query noMatchNoBonus = new PredicateBonusQuery(noMatch, Predicates.NONE, 10.0f);
    Assert.assertEquals(Double.NaN, searcher.search(noMatchNoBonus, 1).getMaxScore(), 0.00001);

    Query noMatchIgnoresBonus = new PredicateBonusQuery(noMatch, Predicates.ALL, 100.0f);
    Assert.assertEquals(Double.NaN, searcher.search(noMatchIgnoresBonus, 1).getMaxScore(), 0.00001);
}

From source file:com.heejong.lucene.IndexFiles.java

License:Apache License

/** Indexes a single document */
static void indexDoc(IndexWriter writer, Path file, long lastModified) throws IOException {
    try (InputStream stream = Files.newInputStream(file)) {
        // make a new, empty document
        Document doc = new Document();

        // Add the path of the file as a field named "path".  Use a
        // field that is indexed (i.e. searchable), but don't tokenize 
        // the field into separate words and don't index term frequency
        // or positional information:
        Field pathField = new StringField("path", file.toString(), Field.Store.YES);
        doc.add(pathField);

        // Add the last modified date of the file as a field named "modified".
        // Use a LongPoint that is indexed (i.e. efficiently filterable with
        // PointRangeQuery).  This indexes to millisecond resolution, which
        // is often too fine.  You could instead create a number based on
        // year/month/day/hour/minutes/seconds, down to the resolution you require.
        // For example the long value 2011021714 would mean
        // February 17, 2011, 2-3 PM.
        doc.add(new LongPoint("modified", lastModified));

        // Add the contents of the file to a field named "contents".  Specify a Reader,
        // so that the text of the file is tokenized and indexed, but not stored.
        // Note that FileReader expects the file to be in UTF-8 encoding.
        // If that's not the case searching for special characters will fail.
        doc.add(new TextField("contents",
                new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8))));

        if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
            // New index, so we just add the document (no old document can be there):
            System.out.println("adding " + file);
            writer.addDocument(doc);
        } else {
            // Existing index (an old copy of this document may have been indexed) so 
            // we use updateDocument instead to replace the old one matching the exact 
            // path, if present:
            System.out.println("updating " + file);
            writer.updateDocument(new Term("path", file.toString()), doc);
        }
    }
}

From source file:com.ibm.jaql.lang.expr.index.BuildLuceneFn.java

License:Apache License

@Override
public JsonValue eval(Context context) throws Exception {
    JsonRecord fd = (JsonRecord) exprs[1].eval(context);
    if (fd == null) {
        return null;
    }
    JsonString loc = (JsonString) fd.get(new JsonString("location"));
    if (loc == null) {
        return null;
    }
    Function keyFn = (Function) exprs[2].eval(context);
    if (keyFn == null) {
        return null;
    }
    Function valFn = (Function) exprs[3].eval(context);
    JsonIterator iter = exprs[0].iter(context);
    JsonValue[] fnArgs = new JsonValue[1];
    Analyzer analyzer = new StandardAnalyzer();
    IndexWriter writer = new IndexWriter(loc.toString(), analyzer, true);
    ByteArrayOutputStream buf = null;
    DataOutputStream out = null;
    if (valFn != null) {
        buf = new ByteArrayOutputStream();
        out = new DataOutputStream(buf);
    }

    for (JsonValue value : iter) {
        fnArgs[0] = value;
        keyFn.setArguments(fnArgs);
        JsonIterator keyIter = keyFn.iter(context);
        Document doc = null;
        for (JsonValue key : keyIter) {
            JsonString jkey = (JsonString) key;
            if (doc == null) {
                doc = new Document();
            }
            doc.add(new Field("key", jkey.toString(), Store.NO, Index.UN_TOKENIZED)); // TODO: typed keys, store binary value
        }

        if (doc != null) {
            if (valFn != null) {
                valFn.setArguments(fnArgs);
                JsonIterator valIter = valFn.iter(context);
                for (JsonValue val : valIter) {
                    JsonRecord jrec = (JsonRecord) val;
                    for (Entry<JsonString, JsonValue> e : jrec) {
                        JsonString name = e.getKey();
                        JsonValue fval = e.getValue();
                        buf.reset();
                        serializer.write(out, fval);
                        out.flush();
                        byte[] bytes = buf.toByteArray();
                        doc.add(new Field(name.toString(), bytes, Store.COMPRESS));
                    }
                }
            }
            writer.addDocument(doc);
        }
    }

    writer.optimize();
    writer.close();
    return fd;
}

From source file:com.ibm.watson.developer_cloud.professor_languo.ingestion.indexing.LuceneIndexer.java

License:Open Source License

/**
 * Given a built corpus (a set of StackExchangeThreads without duplicates), an index writer and a
 * document mapper, write the indexing file with documents and record the statistics during the
 * indexing period.
 * 
 * @param uniqueThreadDirPath - the path of the folder which stores the unique threads
 * @param writer - an index writer which can write document unit to the index file
 * @param docMapper - document mapper which maps the StackExchange instance to the document unit
 * @return the statistics during the indexing period.
 * @throws IngestionException
 */
private IndexingStats indexCorpus(String uniqueThreadDirPath, IndexWriter writer, DocumentMapper docMapper)
        throws IngestionException {
    List<Integer> indexThreadIds = new ArrayList<Integer>();
    long startTime, endTime;
    int indexDocNum;
    StackExchangeThread thread = null;
    File[] serFiles = new File(uniqueThreadDirPath).listFiles();

    try {
        startTime = System.currentTimeMillis();
        // restore the unique StackExchangeThreads from the .ser files and
        // index them
        for (File serFile : serFiles) {
            thread = StackExchangeThreadSerializer.deserializeThreadFromBinFile(serFile.getPath());
            Document doc = docMapper.createDocument(thread);
            writer.addDocument(doc);
            indexThreadIds.add(thread.getId());
        }

        endTime = System.currentTimeMillis();
        indexDocNum = writer.numDocs();
        closeIndexWriter();
    } catch (IOException e) {
        throw new IngestionException(e);
    }

    return createIndexingStats(indexDocNum, indexThreadIds, endTime - startTime);
}

From source file:com.icdd.lucence.IndexFiles.java

License:Apache License

static void indexDoc(IndexWriter writer, Path file, long lastModified) throws IOException {
    try (InputStream stream = Files.newInputStream(file)) {
        // make a new,empty document
        Document doc = new Document();

        Field pathField = new StringField("path", file.toString(), Field.Store.YES);

        doc.add(pathField);
        doc.add(new SortedNumericDocValuesField("modified", lastModified));
        doc.add(new TextField("contents",
                new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8))));

        if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
            // New index, so we just add the document (no old document can
            // be there):
            System.out.println("adding " + file);
            writer.addDocument(doc);
        } else {
            // Existing index (an old copy of this document may have been
            // indexed), so we use updateDocument instead to replace the old
            // one matching the exact path, if present:
            System.out.println("updating " + file);
            writer.updateDocument(new Term("path", file.toString()), doc);
        }
    }
}

From source file:com.icdd.lucene.CreateIndex.java

License:Apache License

static void indexDoc(IndexWriter writer, Path file, long lastModified) throws IOException {
    // filter non-xml files
    if (filter.accept(file.toFile())) {

        System.out.println("num: " + num);
        num++;
        if (num < endset && num >= offset) {

            try (InputStream stream = Files.newInputStream(file)) {
                // make a new,empty document
                Document doc = new Document();

                Field pathField = new StringField("path", file.toString(), Field.Store.YES);
                String filename = file.getFileName().toString();
                int post = filename.indexOf('_');
                if (post > 0) {
                    filename = filename.substring(post + 1, filename.length() - 4);
                }

                doc.add(pathField);
                doc.add(new StringField("title", filename, Field.Store.YES));
                doc.add(new SortedNumericDocValuesField("modified", lastModified));
                doc.add(new TextField("contents",
                        new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8))));

                if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                    // New index, so we just add the document (no old document
                    // can be there):
                    logger.info("adding " + file);
                    writer.addDocument(doc);
                } else {
                    // Existing index (an old copy of this document may have
                    // been indexed), so we use updateDocument instead to
                    // replace the old one matching the exact path, if present:
                    logger.info("updating " + file);
                    writer.updateDocument(new Term("path", file.toString()), doc);
                }
            }
        }
    }
}

From source file:com.ikon.analysis.SearchDemo.java

License:Open Source License

/**
 * Add documents
 */
private static void add(Directory index, Analyzer analyzer, String str) throws IOException, ParseException {
    IndexWriterConfig config = new IndexWriterConfig(Config.LUCENE_VERSION, analyzer);
    IndexWriter w = new IndexWriter(index, config);
    Document doc = new Document();
    doc.add(new Field(DOC_FIELD, str, Field.Store.YES, Field.Index.ANALYZED));
    w.addDocument(doc);
    w.close();
}

From source file:com.impetus.kundera.index.LuceneIndexer.java

License:Apache License

/**
 * Indexes a document in the file system using Lucene.
 *
 * @param metadata
 *            the metadata
 * @param document
 *            the document
 */
public void indexDocument(EntityMetadata metadata, Document document) {
    if (log.isDebugEnabled()) {
        log.debug("Indexing document: {} in file system using Lucene", document);
    }

    IndexWriter w = getIndexWriter();
    try {
        w.addDocument(document);
    } catch (Exception e) {
        log.error("Error while indexing document {} into Lucene, caused by: {}", document, e);
        throw new LuceneIndexingException("Error while indexing document " + document + " into Lucene.", e);
    }
}