Example usage for org.apache.lucene.document Document clear

List of usage examples for org.apache.lucene.document Document clear

Introduction

In this page you can find the example usage for org.apache.lucene.document Document clear.

Prototype

public void clear() 

Source Link

Document

Removes all the fields from the document, allowing the Document instance to be reused.

Usage

From source file:io.anserini.embeddings.IndexW2V.java

License:Apache License

/**
 * Indexes word embeddings read from {@code args.input} into the Lucene index at
 * {@code directory}: one document per term, with the term id as an unstored
 * {@link StringField} and the vector serialized as big-endian IEEE-754 floats
 * in a {@link StoredField}.
 *
 * <p>Fixes two resource leaks in the original: the {@link BufferedReader} was
 * never closed, and the {@link IndexWriter} leaked if an exception was thrown
 * mid-loop (it was only closed in the finally block guarding commit). Both are
 * now managed by try-with-resources. The redundant {@code flush()} on
 * {@link ByteArrayOutputStream} (a documented no-op) was removed.
 *
 * @throws IOException on read or index failure
 * @throws InterruptedException declared for interface compatibility
 */
public void indexEmbeddings() throws IOException, InterruptedException {
    LOG.info("Starting indexer...");
    long startTime = System.currentTimeMillis();
    final WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer();
    final IndexWriterConfig config = new IndexWriterConfig(analyzer);

    int cnt = 0;
    try (BufferedReader bRdr = new BufferedReader(new FileReader(args.input));
            IndexWriter writer = new IndexWriter(directory, config)) {
        // Skip the first line (presumably the word2vec "<vocab> <dim>" header
        // — TODO confirm against the input format).
        bRdr.readLine();

        // Reuse one Document and one byte buffer across all terms.
        Document document = new Document();
        ByteArrayOutputStream byteStream = new ByteArrayOutputStream();

        String line;
        while ((line = bRdr.readLine()) != null) {
            // Each line: "<term>\t<v1> <v2> ... <vn>"
            String[] termEmbedding = line.trim().split("\t");
            document.add(new StringField(LuceneDocumentGenerator.FIELD_ID, termEmbedding[0], Field.Store.NO));
            String[] parts = termEmbedding[1].split(" ");

            // Serialize each component as a 4-byte big-endian float.
            for (int i = 0; i < parts.length; ++i) {
                byteStream.write(ByteBuffer.allocate(4).putFloat(Float.parseFloat(parts[i])).array());
            }
            document.add(new StoredField(FIELD_BODY, byteStream.toByteArray()));

            byteStream.reset();           // reuse the buffer for the next term
            writer.addDocument(document);
            document.clear();             // reuse the Document for the next term
            cnt++;

            if (cnt % 100000 == 0) {
                LOG.info(cnt + " terms indexed");
            }
        }

        LOG.info(String.format("Total of %s terms added", cnt));

        writer.commit();
        writer.forceMerge(1);
    }

    LOG.info("Total elapsed time: " + (System.currentTimeMillis() - startTime) + "ms");
}

From source file:org.elasticsearch.search.aggregations.bucket.composite.CompositeAggregatorTests.java

License:Apache License

/**
 * Indexes the given dataset, runs the composite aggregation built by
 * {@code create} (optionally through the reduce phase), and hands the result
 * to {@code verify}.
 */
private void executeTestCase(boolean reduced, Query query, List<Map<String, List<Object>>> dataset,
        Supplier<CompositeAggregationBuilder> create, Consumer<InternalComposite> verify) throws IOException {
    try (Directory directory = newDirectory()) {
        // Index every entry, reusing a single Document instance.
        try (RandomIndexWriter indexWriter = new RandomIndexWriter(random(), directory)) {
            Document doc = new Document();
            for (Map<String, List<Object>> fields : dataset) {
                addToDocument(doc, fields);
                indexWriter.addDocument(doc);
                doc.clear();
            }
        }
        // Search, optionally reducing, then verify.
        try (IndexReader indexReader = DirectoryReader.open(directory)) {
            IndexSearcher searcher = new IndexSearcher(indexReader);
            CompositeAggregationBuilder aggBuilder = create.get();
            final InternalComposite composite = reduced
                    ? searchAndReduce(searcher, query, aggBuilder, FIELD_TYPES)
                    : search(searcher, query, aggBuilder, FIELD_TYPES);
            verify.accept(composite);
        }
    }
}

From source file:org.elasticsearch.search.aggregations.bucket.filter.FilterAggregatorTests.java

License:Apache License

/**
 * Randomized test for the {@code filter} aggregation: indexes random terms,
 * tracks the expected per-term count, and checks the filter bucket's doc count
 * with and without the reduce phase.
 *
 * <p>Fixes a resource leak in the original: the reader, writer, and directory
 * were only closed on the success path, so any failing assertion leaked them.
 * All three are now closed via try-with-resources.
 */
public void testRandom() throws Exception {
    int numDocs = randomIntBetween(100, 200);
    int maxTerm = randomIntBetween(10, 50);
    int[] expectedBucketCount = new int[maxTerm];
    try (Directory directory = newDirectory()) {
        try (RandomIndexWriter indexWriter = new RandomIndexWriter(random(), directory)) {
            Document document = new Document();
            for (int i = 0; i < numDocs; i++) {
                if (frequently()) {
                    // make sure we have more than one segment to test the merge
                    indexWriter.getReader().close();
                }
                int value = randomInt(maxTerm - 1);
                expectedBucketCount[value] += 1;
                document.add(new Field("field", Integer.toString(value), fieldType));
                indexWriter.addDocument(document);
                document.clear(); // reuse the Document for the next doc
            }
        }

        try (IndexReader indexReader = DirectoryReader.open(directory)) {
            IndexSearcher indexSearcher = newSearcher(indexReader, true, true);
            int value = randomInt(maxTerm - 1);
            QueryBuilder filter = QueryBuilders.termQuery("field", Integer.toString(value));
            FilterAggregationBuilder builder = new FilterAggregationBuilder("test", filter);

            // Exercise both the reduced and unreduced paths.
            for (boolean doReduce : new boolean[] { true, false }) {
                final InternalFilter response;
                if (doReduce) {
                    response = searchAndReduce(indexSearcher, new MatchAllDocsQuery(), builder, fieldType);
                } else {
                    response = search(indexSearcher, new MatchAllDocsQuery(), builder, fieldType);
                }
                assertEquals(response.getDocCount(), (long) expectedBucketCount[value]);
            }
        }
    }
}

From source file:org.elasticsearch.search.aggregations.bucket.filter.FiltersAggregatorTests.java

License:Apache License

/**
 * Verifies keyed {@code filters} buckets: per-key doc counts, duplicate filter
 * names, and the "other" bucket, with and without the reduce phase.
 */
public void testKeyedFilter() throws Exception {
    Directory directory = newDirectory();
    RandomIndexWriter indexWriter = new RandomIndexWriter(random(), directory);
    Document document = new Document();
    document.add(new Field("field", "foo", fieldType));
    indexWriter.addDocument(document); // doc 1: "foo"
    document.clear();
    document.add(new Field("field", "else", fieldType));
    indexWriter.addDocument(document); // doc 2: "else"
    // make sure we have more than one segment to test the merge
    indexWriter.commit();
    // NOTE(review): document is NOT cleared here, so the next doc carries both
    // "else" and "foo". The assertions below (e.g. "foo" docCount == 2) depend
    // on that — confirm this is intentional rather than a missing clear().
    document.add(new Field("field", "foo", fieldType));
    indexWriter.addDocument(document); // doc 3: "else" + "foo"
    document.clear();
    document.add(new Field("field", "bar", fieldType));
    indexWriter.addDocument(document); // doc 4: "bar"
    document.clear();
    document.add(new Field("field", "foobar", fieldType));
    indexWriter.addDocument(document); // doc 5: "foobar"
    indexWriter.commit();
    document.clear();
    document.add(new Field("field", "something", fieldType));
    indexWriter.addDocument(document); // doc 6: "something" (matches no key -> "other")
    indexWriter.commit();
    document.clear();
    document.add(new Field("field", "foobar", fieldType));
    indexWriter.addDocument(document); // doc 7: "foobar"
    indexWriter.close();

    IndexReader indexReader = DirectoryReader.open(directory);
    IndexSearcher indexSearcher = newSearcher(indexReader, true, true);

    // Six keyed filters, including two distinct keys over the same term
    // ("foo"/"foo2") and a duplicated key name ("same").
    FiltersAggregator.KeyedFilter[] keys = new FiltersAggregator.KeyedFilter[6];
    keys[0] = new FiltersAggregator.KeyedFilter("foobar", QueryBuilders.termQuery("field", "foobar"));
    keys[1] = new FiltersAggregator.KeyedFilter("bar", QueryBuilders.termQuery("field", "bar"));
    keys[2] = new FiltersAggregator.KeyedFilter("foo", QueryBuilders.termQuery("field", "foo"));
    keys[3] = new FiltersAggregator.KeyedFilter("foo2", QueryBuilders.termQuery("field", "foo"));
    keys[4] = new FiltersAggregator.KeyedFilter("same", QueryBuilders.termQuery("field", "foo"));
    // filter name already present so it should be merge with the previous one ?
    keys[5] = new FiltersAggregator.KeyedFilter("same", QueryBuilders.termQuery("field", "bar"));
    FiltersAggregationBuilder builder = new FiltersAggregationBuilder("test", keys);
    builder.otherBucket(true);
    builder.otherBucketKey("other");
    // Exercise both the reduced and unreduced paths with identical expectations.
    for (boolean doReduce : new boolean[] { true, false }) {
        final InternalFilters filters;
        if (doReduce) {
            filters = searchAndReduce(indexSearcher, new MatchAllDocsQuery(), builder, fieldType);
        } else {
            filters = search(indexSearcher, new MatchAllDocsQuery(), builder, fieldType);
        }
        // 6 keyed buckets + the "other" bucket.
        assertEquals(filters.getBuckets().size(), 7);
        assertEquals(filters.getBucketByKey("foobar").getDocCount(), 2);
        assertEquals(filters.getBucketByKey("foo").getDocCount(), 2);
        assertEquals(filters.getBucketByKey("foo2").getDocCount(), 2);
        assertEquals(filters.getBucketByKey("bar").getDocCount(), 1);
        assertEquals(filters.getBucketByKey("same").getDocCount(), 1);
        assertEquals(filters.getBucketByKey("other").getDocCount(), 2);
    }

    indexReader.close();
    directory.close();
}

From source file:org.elasticsearch.search.aggregations.bucket.filter.FiltersAggregatorTests.java

License:Apache License

/**
 * Randomized test for the {@code filters} aggregation: indexes random terms,
 * builds a random set of term filters, and checks every bucket's doc count —
 * including the "other" bucket — with and without the reduce phase.
 *
 * <p>Fixes a resource leak in the original: the reader, writer, and directory
 * were only closed on the success path, so any failing assertion leaked them.
 * All three are now closed via try-with-resources. The contains-then-add
 * pattern on the term set was replaced with the {@link Set#add} return-value
 * idiom (identical behavior, single lookup).
 */
public void testRandom() throws Exception {
    int numDocs = randomIntBetween(100, 200);
    int maxTerm = randomIntBetween(10, 50);
    int[] expectedBucketCount = new int[maxTerm];
    try (Directory directory = newDirectory()) {
        try (RandomIndexWriter indexWriter = new RandomIndexWriter(random(), directory)) {
            Document document = new Document();
            for (int i = 0; i < numDocs; i++) {
                if (frequently()) {
                    // make sure we have more than one segment to test the merge
                    indexWriter.commit();
                }
                int value = randomInt(maxTerm - 1);
                expectedBucketCount[value] += 1;
                document.add(new Field("field", Integer.toString(value), fieldType));
                indexWriter.addDocument(document);
                document.clear(); // reuse the Document for the next doc
            }
        }

        try (IndexReader indexReader = DirectoryReader.open(directory)) {
            IndexSearcher indexSearcher = newSearcher(indexReader, true, true);
            int numFilters = randomIntBetween(1, 10);
            QueryBuilder[] filters = new QueryBuilder[numFilters];
            int[] filterTerms = new int[numFilters];
            int expectedOtherCount = numDocs;
            Set<Integer> filterSet = new HashSet<>();
            for (int i = 0; i < filters.length; i++) {
                int value = randomInt(maxTerm - 1);
                filters[i] = QueryBuilders.termQuery("field", Integer.toString(value));
                filterTerms[i] = value;
                // Subtract a term's docs from "other" only once, even if the
                // same term was drawn for several filters.
                if (filterSet.add(value)) {
                    expectedOtherCount -= expectedBucketCount[value];
                }
            }
            FiltersAggregationBuilder builder = new FiltersAggregationBuilder("test", filters);
            builder.otherBucket(true);
            builder.otherBucketKey("other");

            for (boolean doReduce : new boolean[] { true, false }) {
                final InternalFilters response;
                if (doReduce) {
                    response = searchAndReduce(indexSearcher, new MatchAllDocsQuery(), builder, fieldType);
                } else {
                    response = search(indexSearcher, new MatchAllDocsQuery(), builder, fieldType);
                }
                List<InternalFilters.InternalBucket> buckets = response.getBuckets();
                // One bucket per filter plus the "other" bucket.
                assertEquals(buckets.size(), filters.length + 1);

                for (InternalFilters.InternalBucket bucket : buckets) {
                    if ("other".equals(bucket.getKey())) {
                        assertEquals(bucket.getDocCount(), expectedOtherCount);
                    } else {
                        int index = Integer.parseInt(bucket.getKey());
                        assertEquals(bucket.getDocCount(), (long) expectedBucketCount[filterTerms[index]]);
                    }
                }
            }
        }
    }
}

From source file:org.elasticsearch.search.aggregations.bucket.histogram.AutoDateHistogramAggregatorTests.java

License:Apache License

/**
 * Indexes one document per date in {@code dataset}, runs an auto date
 * histogram (optionally through the reduce phase), and passes the result to
 * {@code verify}.
 */
private void executeTestCase(boolean reduced, Query query, List<String> dataset,
        Consumer<AutoDateHistogramAggregationBuilder> configure, Consumer<Histogram> verify)
        throws IOException {

    try (Directory directory = newDirectory()) {
        try (RandomIndexWriter indexWriter = new RandomIndexWriter(random(), directory)) {
            Document doc = new Document();
            for (String date : dataset) {
                // Occasional commits spread documents across several segments.
                if (frequently()) {
                    indexWriter.commit();
                }

                long instant = asLong(date);
                doc.add(new SortedNumericDocValuesField(DATE_FIELD, instant));
                doc.add(new LongPoint(INSTANT_FIELD, instant));
                indexWriter.addDocument(doc);
                doc.clear();
            }
        }

        try (IndexReader indexReader = DirectoryReader.open(directory)) {
            IndexSearcher searcher = newSearcher(indexReader, true, true);

            AutoDateHistogramAggregationBuilder aggBuilder = new AutoDateHistogramAggregationBuilder(
                    "_name");
            if (configure != null) {
                configure.accept(aggBuilder);
            }

            // Synthesize the mapped field type backing the aggregated field.
            DateFieldMapper.Builder mapperBuilder = new DateFieldMapper.Builder("_name");
            DateFieldMapper.DateFieldType fieldType = mapperBuilder.fieldType();
            fieldType.setHasDocValues(true);
            fieldType.setName(aggBuilder.field());

            final InternalAutoDateHistogram histogram = reduced
                    ? searchAndReduce(searcher, query, aggBuilder, fieldType)
                    : search(searcher, query, aggBuilder, fieldType);
            verify.accept(histogram);
        }
    }
}

From source file:org.elasticsearch.search.aggregations.bucket.histogram.DateHistogramAggregatorTests.java

License:Apache License

/**
 * Indexes one document per date in {@code dataset}, runs a date histogram
 * aggregation (optionally reduced), and hands the histogram to {@code verify}.
 */
private void executeTestCase(boolean reduced, Query query, List<String> dataset,
        Consumer<DateHistogramAggregationBuilder> configure, Consumer<Histogram> verify) throws IOException {

    try (Directory directory = newDirectory()) {
        try (RandomIndexWriter writer = new RandomIndexWriter(random(), directory)) {
            Document doc = new Document();
            for (String date : dataset) {
                // Commit now and then so the index has multiple segments.
                if (frequently()) {
                    writer.commit();
                }

                long instant = asLong(date);
                doc.add(new SortedNumericDocValuesField(DATE_FIELD, instant));
                doc.add(new LongPoint(INSTANT_FIELD, instant));
                writer.addDocument(doc);
                doc.clear();
            }
        }

        try (IndexReader reader = DirectoryReader.open(directory)) {
            IndexSearcher searcher = newSearcher(reader, true, true);

            DateHistogramAggregationBuilder aggBuilder = new DateHistogramAggregationBuilder("_name");
            if (configure != null) {
                configure.accept(aggBuilder);
            }

            // Build the date field type the aggregation will resolve against.
            DateFieldMapper.Builder mapperBuilder = new DateFieldMapper.Builder("_name");
            DateFieldMapper.DateFieldType fieldType = mapperBuilder.fieldType();
            fieldType.setHasDocValues(true);
            fieldType.setName(aggBuilder.field());

            final InternalDateHistogram histogram = reduced
                    ? searchAndReduce(searcher, query, aggBuilder, fieldType)
                    : search(searcher, query, aggBuilder, fieldType);
            verify.accept(histogram);
        }
    }
}

From source file:org.elasticsearch.search.aggregations.bucket.missing.MissingAggregatorTests.java

License:Apache License

/**
 * Indexes {@code numDocs} documents populated by {@code consumer}, runs a
 * {@code missing} aggregation on {@code fieldName} (optionally reduced), and
 * passes the result to {@code verify}.
 */
private void executeTestCase(int numDocs, String fieldName, Query query, Consumer<Document> consumer,
        Consumer<InternalMissing> verify, boolean reduced) throws IOException {
    try (Directory directory = newDirectory()) {
        try (RandomIndexWriter writer = new RandomIndexWriter(random(), directory)) {
            // The caller fills in each document; one instance is reused.
            Document doc = new Document();
            for (int docNum = 0; docNum < numDocs; docNum++) {
                // Commit occasionally to get a multi-segment index.
                if (frequently()) {
                    writer.commit();
                }
                consumer.accept(doc);
                writer.addDocument(doc);
                doc.clear();
            }
        }

        try (IndexReader reader = DirectoryReader.open(directory)) {
            IndexSearcher searcher = newSearcher(reader, true, true);
            MissingAggregationBuilder missingBuilder = new MissingAggregationBuilder("_name", ValueType.LONG);
            missingBuilder.field(fieldName);

            // Long-valued field type with doc values, matching the builder's field.
            NumberFieldMapper.Builder mapperBuilder = new NumberFieldMapper.Builder("_name",
                    NumberFieldMapper.NumberType.LONG);
            MappedFieldType fieldType = mapperBuilder.fieldType();
            fieldType.setHasDocValues(true);
            fieldType.setName(missingBuilder.field());

            final InternalMissing missing = reduced
                    ? searchAndReduce(searcher, query, missingBuilder, fieldType)
                    : search(searcher, query, missingBuilder, fieldType);
            verify.accept(missing);
        }
    }
}

From source file:org.elasticsearch.search.aggregations.pipeline.bucketmetrics.avg.AvgBucketAggregatorTests.java

License:Apache License

/**
 * Test for issue #30608. Under the following circumstances:
 *
 * A. Multi-bucket agg in the first entry of our internal list
 * B. Regular agg as the immediate child of the multi-bucket in A
 * C. Regular agg with the same name as B at the top level, listed as the second entry in our internal list
 * D. Finally, a pipeline agg with the path down to B
 *
 * BucketMetrics reduction would throw a class cast exception due to bad subpathing. This test ensures
 * it is fixed.
 *
 * Note: we have this test inside of the `avg_bucket` package so that we can get access to the package-private
 * `doReduce()` needed for testing this
 */
public void testSameAggNames() throws IOException {
    Query query = new MatchAllDocsQuery();

    // "foo" exists both at the top level (C) and as a child of the histogram (B).
    AvgAggregationBuilder avgBuilder = new AvgAggregationBuilder("foo").field(VALUE_FIELD);
    DateHistogramAggregationBuilder histo = new DateHistogramAggregationBuilder("histo")
            .dateHistogramInterval(DateHistogramInterval.YEAR).field(DATE_FIELD)
            .subAggregation(new AvgAggregationBuilder("foo").field(VALUE_FIELD));

    // Pipeline agg (D) whose path targets the nested "foo" (B).
    AvgBucketPipelineAggregationBuilder avgBucketBuilder = new AvgBucketPipelineAggregationBuilder(
            "the_avg_bucket", "histo>foo");

    try (Directory directory = newDirectory()) {
        try (RandomIndexWriter indexWriter = new RandomIndexWriter(random(), directory)) {
            Document document = new Document();
            for (String date : dataset) {
                // Occasional commits produce a multi-segment index.
                if (frequently()) {
                    indexWriter.commit();
                }

                document.add(new SortedNumericDocValuesField(DATE_FIELD, asLong(date)));
                document.add(new SortedNumericDocValuesField(VALUE_FIELD, randomInt()));
                indexWriter.addDocument(document);
                document.clear();
            }
        }

        InternalAvg avgResult;
        InternalDateHistogram histogramResult;
        try (IndexReader indexReader = DirectoryReader.open(directory)) {
            IndexSearcher indexSearcher = newSearcher(indexReader, true, true);

            // Field types for the date and value fields used above.
            DateFieldMapper.Builder builder = new DateFieldMapper.Builder("histo");
            DateFieldMapper.DateFieldType fieldType = builder.fieldType();
            fieldType.setHasDocValues(true);
            fieldType.setName(DATE_FIELD);

            MappedFieldType valueFieldType = new NumberFieldMapper.NumberFieldType(
                    NumberFieldMapper.NumberType.LONG);
            valueFieldType.setName(VALUE_FIELD);
            valueFieldType.setHasDocValues(true);

            // Produce the two already-reduced aggs the pipeline will consume.
            avgResult = searchAndReduce(indexSearcher, query, avgBuilder, 10000, null,
                    new MappedFieldType[] { fieldType, valueFieldType });
            histogramResult = searchAndReduce(indexSearcher, query, histo, 10000, null,
                    new MappedFieldType[] { fieldType, valueFieldType });
        }

        // Finally, reduce the pipeline agg
        PipelineAggregator avgBucketAgg = avgBucketBuilder.createInternal(Collections.emptyMap());
        List<Aggregation> reducedAggs = new ArrayList<>(2);

        // Histo has to go first to exercise the bug
        reducedAggs.add(histogramResult);
        reducedAggs.add(avgResult);
        Aggregations aggregations = new Aggregations(reducedAggs);
        InternalAggregation pipelineResult = ((AvgBucketPipelineAggregator) avgBucketAgg).doReduce(aggregations,
                null);
        assertNotNull(pipelineResult);
    }
}

From source file:org.elasticsearch.search.aggregations.pipeline.CumulativeSumAggregatorTests.java

License:Apache License

/**
 * Verifies the {@code cumulative_sum} pipeline over histogram {@code _count}:
 * each bucket's cumulative value must equal the running total of doc counts.
 */
public void testDocCount() throws IOException {
    Query matchAll = new MatchAllDocsQuery();

    int docCount = randomIntBetween(6, 20);
    int bucketInterval = randomIntBetween(2, 5);

    int minValue = 0;
    int maxValue = 20;

    // Extended bounds guarantee this many buckets regardless of data.
    int bucketCount = ((maxValue - minValue) / bucketInterval) + 1;
    long[] countsPerBucket = new long[bucketCount];

    HistogramAggregationBuilder histoBuilder = new HistogramAggregationBuilder("histo").field(VALUE_FIELD)
            .interval(bucketInterval).extendedBounds(minValue, maxValue);
    histoBuilder.subAggregation(new CumulativeSumPipelineAggregationBuilder("cusum", "_count"));

    executeTestCase(matchAll, histoBuilder, histogram -> {
        List<? extends Histogram.Bucket> buckets = ((Histogram) histogram).getBuckets();

        assertThat(buckets.size(), equalTo(bucketCount));

        // Walk the buckets in order, tracking the running doc-count total.
        double runningSum = 0;
        for (int b = 0; b < bucketCount; ++b) {
            Histogram.Bucket bucket = buckets.get(b);
            assertThat(bucket, notNullValue());
            assertThat(((Number) bucket.getKey()).longValue(), equalTo((long) b * bucketInterval));
            assertThat(bucket.getDocCount(), equalTo(countsPerBucket[b]));
            runningSum += bucket.getDocCount();
            InternalSimpleValue cumulativeSumValue = bucket.getAggregations().get("cusum");
            assertThat(cumulativeSumValue, notNullValue());
            assertThat(cumulativeSumValue.getName(), equalTo("cusum"));
            assertThat(cumulativeSumValue.value(), equalTo(runningSum));
        }
    }, indexWriter -> {
        // Index random values, recording the expected count per bucket.
        Document document = new Document();
        for (int d = 0; d < docCount; d++) {
            int fieldValue = randomIntBetween(minValue, maxValue);
            document.add(new NumericDocValuesField(VALUE_FIELD, fieldValue));
            countsPerBucket[fieldValue / bucketInterval]++;

            indexWriter.addDocument(document);
            document.clear();
        }
    });
}