Example usage for org.apache.lucene.index IndexWriter addDocument

List of usage examples for org.apache.lucene.index IndexWriter addDocument

Introduction

On this page you can find example usages of org.apache.lucene.index IndexWriter addDocument.

Prototype

public long addDocument(Iterable<? extends IndexableField> doc) throws IOException 

Document

Adds a document to this index.
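For orientation, here is a minimal, self-contained sketch of the call against a recent Lucene API (7.x or later, where addDocument returns the sequence number assigned to the operation). The class name, index path, and field names are illustrative only and are not taken from any of the examples below.

import java.io.IOException;
import java.nio.file.Paths;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class AddDocumentSketch {
    public static void main(String[] args) throws IOException {
        // Open (or create) an index directory; try-with-resources closes the writer and directory.
        try (Directory dir = FSDirectory.open(Paths.get("example-index"));
                IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()))) {
            Document doc = new Document();
            doc.add(new StringField("id", "42", Field.Store.YES));          // indexed verbatim, not tokenized
            doc.add(new TextField("body", "hello lucene", Field.Store.NO)); // tokenized and indexed, not stored
            long seqNo = writer.addDocument(doc); // sequence number of this operation
            writer.commit();
            System.out.println("added document, sequence number " + seqNo);
        }
    }
}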

Usage

From source file:com.google.gerrit.server.change.ReviewerSuggestionCache.java

License:Apache License

private void addAccount(IndexWriter writer, Account a) throws IOException, OrmException {
    Document doc = new Document();
    doc.add(new IntField(ID, a.getId().get(), Store.YES));
    if (a.getFullName() != null) {
        doc.add(new TextField(NAME, a.getFullName(), Store.YES));
    }
    if (a.getPreferredEmail() != null) {
        doc.add(new StringField(EMAIL, a.getPreferredEmail().toLowerCase(), Store.YES));
        doc.add(new TextField(EMAIL, a.getPreferredEmail(), Store.YES));
    }
    AccountExternalIdAccess extIdAccess = db.get().accountExternalIds();
    String username = AccountState.getUserName(extIdAccess.byAccount(a.getId()).toList());
    if (username != null) {
        doc.add(new StringField(USERNAME, username, Store.YES));
    }
    writer.addDocument(doc);
}

From source file:com.greplin.lucene.filter.PhraseFilterBenchmark.java

License:Apache License

public static void main(String[] argv) {
    Directory directory = new RAMDirectory();
    try {
        IndexWriter writer = new IndexWriter(directory,
                new IndexWriterConfig(Version.LUCENE_32, new WhitespaceAnalyzer(Version.LUCENE_32)));
        int done = 0;
        for (int i = 0; i < NUMBER_OF_SEGMENTS; i++) {
            int remaining = NUMBER_OF_SEGMENTS - i;
            int numberOfDocs;
            if (remaining == 1) {
                numberOfDocs = TOTAL_DOCS - done;
            } else {
                numberOfDocs = RANDOM.nextInt(TOTAL_DOCS - done - remaining) + 1;
            }
            done += numberOfDocs;
            System.out.println("Segment #" + i + " has " + numberOfDocs + " docs");

            for (int d = 0; d < numberOfDocs; d++) {
                int wordCount = RANDOM.nextInt(WORDS_PER_DOC_DEVIATION * 2) + AVERAGE_WORDS_PER_DOC
                        - WORDS_PER_DOC_DEVIATION;
                Document doc = new Document();
                doc.add(new Field("f", Joiner.on(' ').join(words(wordCount)), Field.Store.YES,
                        Field.Index.ANALYZED));
                doc.add(new Field("second", RANDOM.nextInt(100) < SECOND_FIELD_MATCH_PERCENTAGE ? "yes" : "no",
                        Field.Store.NO, Field.Index.ANALYZED));
                writer.addDocument(doc);
            }
            writer.commit();
        }
        writer.close();

        IndexReader reader = IndexReader.open(directory);
        IndexSearcher searcher = new IndexSearcher(reader);

        String[][] queries = new String[TOTAL_QUERIES][];
        Term[][] terms = new Term[TOTAL_QUERIES][];

        for (int q = 0; q < TOTAL_QUERIES; q++) {
            queries[q] = words(WORDS_PER_QUERY[RANDOM.nextInt(WORDS_PER_QUERY.length)]);
            terms[q] = new Term[queries[q].length];
            for (int qw = 0; qw < queries[q].length; qw++) {
                terms[q][qw] = new Term(FIELD, queries[q][qw]);
            }
        }

        // Warm up.
        new PhraseFilter(FIELD, queries[0]).getDocIdSet(reader);

        for (int round = 0; round < ROUNDS; round++) {
            System.out.println();
            String name1 = "filter";
            String name2 = "query";

            long ms1 = 0, ms2 = 0;
            for (int step = 0; step < 2; step++) {
                System.gc();
                System.gc();
                System.gc();

                if (step == (round & 1)) {
                    long millis = System.currentTimeMillis();
                    long hits = 0;
                    for (String[] queryWords : queries) {
                        PhraseFilter pf = new PhraseFilter(
                                new FilterIntersectionProvider(TermsFilter.from(new Term("second", "yes"))),
                                FIELD, queryWords);
                        hits += searcher.search(new FilteredQuery(new MatchAllDocsQuery(), pf), 1).totalHits;
                    }
                    ms1 = System.currentTimeMillis() - millis;
                    System.out.println("Finished " + name1 + " in " + ms1 + "ms with " + hits + " hits");
                } else {
                    long millis = System.currentTimeMillis();
                    long hits = 0;
                    for (Term[] queryTerms : terms) {
                        PhraseQuery pq = new PhraseQuery();
                        for (Term term : queryTerms) {
                            pq.add(term);
                        }
                        Query query = BooleanQueryBuilder.builder()
                                .must(new TermQuery(new Term("second", "yes"))).must(pq).build();
                        hits += searcher.search(query, 1).totalHits;
                    }
                    ms2 = System.currentTimeMillis() - millis;
                    System.out.println("Finished " + name2 + " in " + ms2 + "ms with " + hits + " hits");
                }
            }
            System.out.println(name1 + " took " + (int) ((100.0 * ms1) / ms2) + "% as much time as " + name2);
        }

    } catch (IOException e) {
        e.printStackTrace();
    }
}

From source file:com.greplin.lucene.query.PredicateBonusQueryTest.java

License:Apache License

@Test
public void testBasics() throws Exception {
    IndexWriter writer = new IndexWriter(this.directory,
            new IndexWriterConfig(Version.LUCENE_35, new WhitespaceAnalyzer(Version.LUCENE_35)));
    writer.addDocument(new DocumentBuilder().add("value", "5").build());
    writer.close();

    IndexReader reader = IndexReader.open(this.directory);
    IndexSearcher searcher = new IndexSearcher(reader);

    Query query = new ConstantScoreQuery(new TermQuery(new Term("value", "5")));
    Assert.assertEquals(1.0, searcher.search(query, 1).getMaxScore(), 0.00001);

    Query noBonus = new PredicateBonusQuery(query, Predicates.NONE, 10.0f);
    Assert.assertEquals(1.0, searcher.search(noBonus, 1).getMaxScore(), 0.00001);

    Query bonus = new PredicateBonusQuery(query, Predicates.ALL, 100.0f);
    Assert.assertEquals(101.0, searcher.search(bonus, 1).getMaxScore(), 0.00001);

    Query noMatch = new TermQuery(new Term("value", "not5"));
    Assert.assertEquals(Double.NaN, searcher.search(noMatch, 1).getMaxScore(), 0.00001);

    Query noMatchNoBonus = new PredicateBonusQuery(noMatch, Predicates.NONE, 10.0f);
    Assert.assertEquals(Double.NaN, searcher.search(noMatchNoBonus, 1).getMaxScore(), 0.00001);

    Query noMatchIgnoresBonus = new PredicateBonusQuery(noMatch, Predicates.ALL, 100.0f);
    Assert.assertEquals(Double.NaN, searcher.search(noMatchIgnoresBonus, 1).getMaxScore(), 0.00001);
}

From source file:com.heejong.lucene.IndexFiles.java

License:Apache License

/** Indexes a single document */
static void indexDoc(IndexWriter writer, Path file, long lastModified) throws IOException {
    try (InputStream stream = Files.newInputStream(file)) {
        // make a new, empty document
        Document doc = new Document();

        // Add the path of the file as a field named "path".  Use a
        // field that is indexed (i.e. searchable), but don't tokenize 
        // the field into separate words and don't index term frequency
        // or positional information:
        Field pathField = new StringField("path", file.toString(), Field.Store.YES);
        doc.add(pathField);

        // Add the last modified date of the file as a field named "modified".
        // Use a LongPoint that is indexed (i.e. efficiently filterable with
        // PointRangeQuery).  This indexes to millisecond resolution, which
        // is often too fine.  You could instead create a number based on
        // year/month/day/hour/minutes/seconds, down to the resolution you require.
        // For example the long value 2011021714 would mean
        // February 17, 2011, 2-3 PM.
        doc.add(new LongPoint("modified", lastModified));

        // Add the contents of the file to a field named "contents".  Specify a Reader,
        // so that the text of the file is tokenized and indexed, but not stored.
        // Note that FileReader expects the file to be in UTF-8 encoding.
        // If that's not the case searching for special characters will fail.
        doc.add(new TextField("contents",
                new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8))));

        if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
            // New index, so we just add the document (no old document can be there):
            System.out.println("adding " + file);
            writer.addDocument(doc);
        } else {
            // Existing index (an old copy of this document may have been indexed) so 
            // we use updateDocument instead to replace the old one matching the exact 
            // path, if present:
            System.out.println("updating " + file);
            writer.updateDocument(new Term("path", file.toString()), doc);
        }
    }
}

From source file:com.ibm.jaql.lang.expr.index.BuildLuceneFn.java

License:Apache License

@Override
public JsonValue eval(Context context) throws Exception {
    JsonRecord fd = (JsonRecord) exprs[1].eval(context);
    if (fd == null) {
        return null;
    }
    JsonString loc = (JsonString) fd.get(new JsonString("location"));
    if (loc == null) {
        return null;
    }
    Function keyFn = (Function) exprs[2].eval(context);
    if (keyFn == null) {
        return null;
    }
    Function valFn = (Function) exprs[3].eval(context);
    JsonIterator iter = exprs[0].iter(context);
    JsonValue[] fnArgs = new JsonValue[1];
    Analyzer analyzer = new StandardAnalyzer();
    IndexWriter writer = new IndexWriter(loc.toString(), analyzer, true);
    ByteArrayOutputStream buf = null;
    DataOutputStream out = null;
    if (valFn != null) {
        buf = new ByteArrayOutputStream();
        out = new DataOutputStream(buf);
    }

    for (JsonValue value : iter) {
        fnArgs[0] = value;
        keyFn.setArguments(fnArgs);
        JsonIterator keyIter = keyFn.iter(context);
        Document doc = null;
        for (JsonValue key : keyIter) {
            JsonString jkey = (JsonString) key;
            if (doc == null) {
                doc = new Document();
            }
            doc.add(new Field("key", jkey.toString(), Store.NO, Index.UN_TOKENIZED)); // TODO: typed keys, store binary value
        }

        if (doc != null) {
            if (valFn != null) {
                valFn.setArguments(fnArgs);
                JsonIterator valIter = valFn.iter(context);
                for (JsonValue val : valIter) {
                    JsonRecord jrec = (JsonRecord) val;
                    for (Entry<JsonString, JsonValue> e : jrec) {
                        JsonString name = e.getKey();
                        JsonValue fval = e.getValue();
                        buf.reset();
                        serializer.write(out, fval);
                        out.flush();
                        byte[] bytes = buf.toByteArray();
                        doc.add(new Field(name.toString(), bytes, Store.COMPRESS));
                    }
                }
            }
            writer.addDocument(doc);
        }
    }

    writer.optimize();
    writer.close();
    return fd;
}

From source file:com.ibm.watson.developer_cloud.professor_languo.ingestion.indexing.LuceneIndexer.java

License:Open Source License

/**
 * Given a built corpus (a set of StackExchangeThreads without duplicates), an index writer and a
 * document mapper, write the indexing file with documents and record the statistics during the
 * indexing period.
 * 
 * @param uniqueThreadDirPath - the path of the folder which stores the unique threads
 * @param writer - an index writer which can write document unit to the index file
 * @param docMapper - document mapper which maps the StackExchange instance to the document unit
 * @return the statistics during the indexing period.
 * @throws IngestionException
 */
private IndexingStats indexCorpus(String uniqueThreadDirPath, IndexWriter writer, DocumentMapper docMapper)
        throws IngestionException {
    List<Integer> indexThreadIds = new ArrayList<Integer>();
    long startTime, endTime;
    int indexDocNum;
    StackExchangeThread thread = null;
    File[] serFiles = new File(uniqueThreadDirPath).listFiles();

    try {
        startTime = System.currentTimeMillis();
        // restore the unique StackExchangeThreads from the .ser files and
        // index them
        for (File serFile : serFiles) {
            thread = StackExchangeThreadSerializer.deserializeThreadFromBinFile(serFile.getPath());
            Document doc = docMapper.createDocument(thread);
            writer.addDocument(doc);
            indexThreadIds.add(thread.getId());
        }

        endTime = System.currentTimeMillis();
        indexDocNum = writer.numDocs();
        closeIndexWriter();
    } catch (IOException e) {
        throw new IngestionException(e);
    }

    return createIndexingStats(indexDocNum, indexThreadIds, endTime - startTime);
}

From source file:com.icdd.lucence.IndexFiles.java

License:Apache License

static void indexDoc(IndexWriter writer, Path file, long lastModified) throws IOException {
    try (InputStream stream = Files.newInputStream(file)) {
        // make a new,empty document
        Document doc = new Document();

        Field pathField = new StringField("path", file.toString(), Field.Store.YES);

        doc.add(pathField);
        doc.add(new SortedNumericDocValuesField("modified", lastModified));
        doc.add(new TextField("contents",
                new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8))));

        if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
            // New index, so we just add the document (no old document can
            // be there):
            System.out.println("adding " + file);
            writer.addDocument(doc);
        } else {
            // Existing index (an old copy of this document may have been
            // indexed), so we use updateDocument instead to replace the old
            // one matching the exact path, if present:
            System.out.println("updating " + file);
            writer.updateDocument(new Term("path", file.toString()), doc);
        }
    }
}

From source file:com.icdd.lucene.CreateIndex.java

License:Apache License

static void indexDoc(IndexWriter writer, Path file, long lastModified) throws IOException {
    // filter non-xml files
    if (filter.accept(file.toFile())) {

        System.out.println("num: " + num);
        num++;
        if (num < endset && num >= offset) {

            try (InputStream stream = Files.newInputStream(file)) {
                // make a new,empty document
                Document doc = new Document();

                Field pathField = new StringField("path", file.toString(), Field.Store.YES);
                String filename = file.getFileName().toString();
                int post = filename.indexOf('_');
                if (post > 0) {
                    filename = filename.substring(post + 1, filename.length() - 4);
                }

                doc.add(pathField);
                doc.add(new StringField("title", filename, Field.Store.YES));
                doc.add(new SortedNumericDocValuesField("modified", lastModified));
                doc.add(new TextField("contents",
                        new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8))));

                if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                    // New index, so we just add the document (no old document
                    // can be there):
                    logger.info("adding " + file);
                    writer.addDocument(doc);
                } else {
                    // Existing index (an old copy of this document may have
                    // been indexed), so we use updateDocument instead to
                    // replace the old one matching the exact path, if present:
                    logger.info("updating " + file);
                    writer.updateDocument(new Term("path", file.toString()), doc);
                }
            }
        }
    }
}

From source file:com.ikon.analysis.SearchDemo.java

License:Open Source License

/**
 * Add documents
 */
private static void add(Directory index, Analyzer analyzer, String str) throws IOException, ParseException {
    IndexWriterConfig config = new IndexWriterConfig(Config.LUCENE_VERSION, analyzer);
    IndexWriter w = new IndexWriter(index, config);
    Document doc = new Document();
    doc.add(new Field(DOC_FIELD, str, Field.Store.YES, Field.Index.ANALYZED));
    w.addDocument(doc);
    w.close();
}

From source file:com.impetus.kundera.index.LuceneIndexer.java

License:Apache License

/**
 * Indexes a document in the file system using Lucene.
 *
 * @param metadata
 *            the metadata
 * @param document
 *            the document
 */
public void indexDocument(EntityMetadata metadata, Document document) {
    if (log.isDebugEnabled()) {
        log.debug("Indexing document: {} in file system using Lucene", document);
    }

    IndexWriter w = getIndexWriter();
    try {
        w.addDocument(document);
    } catch (Exception e) {
        log.error("Error while indexing document {} into Lucene, caused by: {}", document, e);
        throw new LuceneIndexingException("Error while indexing document " + document + " into Lucene.", e);
    }
}