List of usage examples for org.apache.lucene.document.Document.clear()
public void clear()
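Document.clear() removes all fields from the document. This makes it cheap to reuse a single Document instance across many IndexWriter.addDocument() calls rather than allocating a fresh Document per record, which is the pattern every example below follows. Here is a minimal, self-contained sketch of that pattern; the class name, field values, and the in-memory ByteBuffersDirectory are illustrative assumptions (ByteBuffersDirectory needs Lucene 8+), not taken from the examples on this page.

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;

public class DocumentClearExample {
    public static void main(String[] args) throws Exception {
        try (Directory directory = new ByteBuffersDirectory();
                IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(new StandardAnalyzer()))) {
            // One Document instance is reused for every record; clear() empties it between records.
            Document document = new Document();
            for (String id : new String[] { "doc-1", "doc-2", "doc-3" }) { // hypothetical ids
                document.add(new StringField("id", id, Field.Store.YES));
                writer.addDocument(document); // the writer consumes the current fields
                document.clear(); // remove all fields so stale ones don't leak into the next record
            }
            writer.commit();
        }
    }
}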
From source file:io.anserini.embeddings.IndexW2V.java
License:Apache License
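This Anserini indexer reads one term embedding per line, writes the term id and the packed float vector into a reused Document, and calls clear() after each addDocument() so the next term starts from an empty document.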
public void indexEmbeddings() throws IOException, InterruptedException {
    LOG.info("Starting indexer...");
    long startTime = System.currentTimeMillis();
    final WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer();
    final IndexWriterConfig config = new IndexWriterConfig(analyzer);
    final IndexWriter writer = new IndexWriter(directory, config);

    BufferedReader bRdr = new BufferedReader(new FileReader(args.input));
    String line = null;
    bRdr.readLine(); // skip the word2vec header line
    Document document = new Document();
    ByteArrayOutputStream byteStream = new ByteArrayOutputStream();
    int cnt = 0;
    while ((line = bRdr.readLine()) != null) {
        String[] termEmbedding = line.trim().split("\t");
        document.add(new StringField(LuceneDocumentGenerator.FIELD_ID, termEmbedding[0], Field.Store.NO));
        String[] parts = termEmbedding[1].split(" ");
        for (int i = 0; i < parts.length; ++i) {
            byteStream.write(ByteBuffer.allocate(4).putFloat(Float.parseFloat(parts[i])).array());
        }
        document.add(new StoredField(FIELD_BODY, byteStream.toByteArray()));
        byteStream.flush();
        byteStream.reset();
        writer.addDocument(document);
        document.clear(); // reuse the same Document instance for the next term
        cnt++;
        if (cnt % 100000 == 0) {
            LOG.info(cnt + " terms indexed");
        }
    }
    LOG.info(String.format("Total of %s terms added", cnt));

    try {
        writer.commit();
        writer.forceMerge(1);
    } finally {
        try {
            writer.close();
        } catch (IOException e) {
            LOG.error(e);
        }
    }
    LOG.info("Total elapsed time: " + (System.currentTimeMillis() - startTime) + "ms");
}
From source file:org.elasticsearch.search.aggregations.bucket.composite.CompositeAggregatorTests.java
License:Apache License
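This test harness indexes each dataset entry through a single reused Document, clearing it between entries, before running a composite aggregation against the resulting index.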
private void executeTestCase(boolean reduced, Query query, List<Map<String, List<Object>>> dataset,
        Supplier<CompositeAggregationBuilder> create, Consumer<InternalComposite> verify) throws IOException {
    try (Directory directory = newDirectory()) {
        try (RandomIndexWriter indexWriter = new RandomIndexWriter(random(), directory)) {
            Document document = new Document();
            for (Map<String, List<Object>> fields : dataset) {
                addToDocument(document, fields);
                indexWriter.addDocument(document);
                document.clear();
            }
        }
        try (IndexReader indexReader = DirectoryReader.open(directory)) {
            IndexSearcher indexSearcher = new IndexSearcher(indexReader);
            CompositeAggregationBuilder aggregationBuilder = create.get();
            final InternalComposite composite;
            if (reduced) {
                composite = searchAndReduce(indexSearcher, query, aggregationBuilder, FIELD_TYPES);
            } else {
                composite = search(indexSearcher, query, aggregationBuilder, FIELD_TYPES);
            }
            verify.accept(composite);
        }
    }
}
From source file:org.elasticsearch.search.aggregations.bucket.filter.FilterAggregatorTests.java
License:Apache License
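Indexes 100-200 documents with random term values through a reused Document, then checks that a filter aggregation on a randomly chosen term matches the expected count.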
public void testRandom() throws Exception {
    Directory directory = newDirectory();
    RandomIndexWriter indexWriter = new RandomIndexWriter(random(), directory);
    int numDocs = randomIntBetween(100, 200);
    int maxTerm = randomIntBetween(10, 50);
    int[] expectedBucketCount = new int[maxTerm];
    Document document = new Document();
    for (int i = 0; i < numDocs; i++) {
        if (frequently()) {
            // make sure we have more than one segment to test the merge
            indexWriter.getReader().close();
        }
        int value = randomInt(maxTerm - 1);
        expectedBucketCount[value] += 1;
        document.add(new Field("field", Integer.toString(value), fieldType));
        indexWriter.addDocument(document);
        document.clear();
    }
    indexWriter.close();

    IndexReader indexReader = DirectoryReader.open(directory);
    IndexSearcher indexSearcher = newSearcher(indexReader, true, true);
    int value = randomInt(maxTerm - 1);
    QueryBuilder filter = QueryBuilders.termQuery("field", Integer.toString(value));
    FilterAggregationBuilder builder = new FilterAggregationBuilder("test", filter);
    for (boolean doReduce : new boolean[] { true, false }) {
        final InternalFilter response;
        if (doReduce) {
            response = searchAndReduce(indexSearcher, new MatchAllDocsQuery(), builder, fieldType);
        } else {
            response = search(indexSearcher, new MatchAllDocsQuery(), builder, fieldType);
        }
        assertEquals(response.getDocCount(), (long) expectedBucketCount[value]);
    }
    indexReader.close();
    directory.close();
}
From source file:org.elasticsearch.search.aggregations.bucket.filter.FiltersAggregatorTests.java
License:Apache License
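Builds a small index one document at a time, clearing the shared Document between additions, and verifies the bucket counts of a keyed filters aggregation, including the 'other' bucket.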
public void testKeyedFilter() throws Exception {
    Directory directory = newDirectory();
    RandomIndexWriter indexWriter = new RandomIndexWriter(random(), directory);
    Document document = new Document();
    document.add(new Field("field", "foo", fieldType));
    indexWriter.addDocument(document);
    document.clear();
    document.add(new Field("field", "else", fieldType));
    indexWriter.addDocument(document);
    // make sure we have more than one segment to test the merge
    indexWriter.commit();
    document.add(new Field("field", "foo", fieldType));
    indexWriter.addDocument(document);
    document.clear();
    document.add(new Field("field", "bar", fieldType));
    indexWriter.addDocument(document);
    document.clear();
    document.add(new Field("field", "foobar", fieldType));
    indexWriter.addDocument(document);
    indexWriter.commit();
    document.clear();
    document.add(new Field("field", "something", fieldType));
    indexWriter.addDocument(document);
    indexWriter.commit();
    document.clear();
    document.add(new Field("field", "foobar", fieldType));
    indexWriter.addDocument(document);
    indexWriter.close();

    IndexReader indexReader = DirectoryReader.open(directory);
    IndexSearcher indexSearcher = newSearcher(indexReader, true, true);

    FiltersAggregator.KeyedFilter[] keys = new FiltersAggregator.KeyedFilter[6];
    keys[0] = new FiltersAggregator.KeyedFilter("foobar", QueryBuilders.termQuery("field", "foobar"));
    keys[1] = new FiltersAggregator.KeyedFilter("bar", QueryBuilders.termQuery("field", "bar"));
    keys[2] = new FiltersAggregator.KeyedFilter("foo", QueryBuilders.termQuery("field", "foo"));
    keys[3] = new FiltersAggregator.KeyedFilter("foo2", QueryBuilders.termQuery("field", "foo"));
    keys[4] = new FiltersAggregator.KeyedFilter("same", QueryBuilders.termQuery("field", "foo"));
    // filter name already present, so should it be merged with the previous one?
    keys[5] = new FiltersAggregator.KeyedFilter("same", QueryBuilders.termQuery("field", "bar"));
    FiltersAggregationBuilder builder = new FiltersAggregationBuilder("test", keys);
    builder.otherBucket(true);
    builder.otherBucketKey("other");
    for (boolean doReduce : new boolean[] { true, false }) {
        final InternalFilters filters;
        if (doReduce) {
            filters = searchAndReduce(indexSearcher, new MatchAllDocsQuery(), builder, fieldType);
        } else {
            filters = search(indexSearcher, new MatchAllDocsQuery(), builder, fieldType);
        }
        assertEquals(filters.getBuckets().size(), 7);
        assertEquals(filters.getBucketByKey("foobar").getDocCount(), 2);
        assertEquals(filters.getBucketByKey("foo").getDocCount(), 2);
        assertEquals(filters.getBucketByKey("foo2").getDocCount(), 2);
        assertEquals(filters.getBucketByKey("bar").getDocCount(), 1);
        assertEquals(filters.getBucketByKey("same").getDocCount(), 1);
        assertEquals(filters.getBucketByKey("other").getDocCount(), 2);
    }
    indexReader.close();
    directory.close();
}
From source file:org.elasticsearch.search.aggregations.bucket.filter.FiltersAggregatorTests.java
License:Apache License
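Indexes random terms through a reused Document and verifies each filter bucket's count as well as the expected count of the 'other' bucket.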
public void testRandom() throws Exception {
    Directory directory = newDirectory();
    RandomIndexWriter indexWriter = new RandomIndexWriter(random(), directory);
    int numDocs = randomIntBetween(100, 200);
    int maxTerm = randomIntBetween(10, 50);
    int[] expectedBucketCount = new int[maxTerm];
    Document document = new Document();
    for (int i = 0; i < numDocs; i++) {
        if (frequently()) {
            // make sure we have more than one segment to test the merge
            indexWriter.commit();
        }
        int value = randomInt(maxTerm - 1);
        expectedBucketCount[value] += 1;
        document.add(new Field("field", Integer.toString(value), fieldType));
        indexWriter.addDocument(document);
        document.clear();
    }
    indexWriter.close();

    IndexReader indexReader = DirectoryReader.open(directory);
    IndexSearcher indexSearcher = newSearcher(indexReader, true, true);
    int numFilters = randomIntBetween(1, 10);
    QueryBuilder[] filters = new QueryBuilder[numFilters];
    int[] filterTerms = new int[numFilters];
    int expectedOtherCount = numDocs;
    Set<Integer> filterSet = new HashSet<>();
    for (int i = 0; i < filters.length; i++) {
        int value = randomInt(maxTerm - 1);
        filters[i] = QueryBuilders.termQuery("field", Integer.toString(value));
        filterTerms[i] = value;
        if (filterSet.contains(value) == false) {
            expectedOtherCount -= expectedBucketCount[value];
            filterSet.add(value);
        }
    }
    FiltersAggregationBuilder builder = new FiltersAggregationBuilder("test", filters);
    builder.otherBucket(true);
    builder.otherBucketKey("other");
    for (boolean doReduce : new boolean[] { true, false }) {
        final InternalFilters response;
        if (doReduce) {
            response = searchAndReduce(indexSearcher, new MatchAllDocsQuery(), builder, fieldType);
        } else {
            response = search(indexSearcher, new MatchAllDocsQuery(), builder, fieldType);
        }
        List<InternalFilters.InternalBucket> buckets = response.getBuckets();
        assertEquals(buckets.size(), filters.length + 1);
        for (InternalFilters.InternalBucket bucket : buckets) {
            if ("other".equals(bucket.getKey())) {
                assertEquals(bucket.getDocCount(), expectedOtherCount);
            } else {
                int index = Integer.parseInt(bucket.getKey());
                assertEquals(bucket.getDocCount(), (long) expectedBucketCount[filterTerms[index]]);
            }
        }
    }
    indexReader.close();
    directory.close();
}
From source file:org.elasticsearch.search.aggregations.bucket.histogram.AutoDateHistogramAggregatorTests.java
License:Apache License
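Indexes each date from the dataset as both doc values and a point field, clearing the reused Document per entry, then runs an auto date histogram aggregation and hands the result to the verifier.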
private void executeTestCase(boolean reduced, Query query, List<String> dataset,
        Consumer<AutoDateHistogramAggregationBuilder> configure, Consumer<Histogram> verify) throws IOException {
    try (Directory directory = newDirectory()) {
        try (RandomIndexWriter indexWriter = new RandomIndexWriter(random(), directory)) {
            Document document = new Document();
            for (String date : dataset) {
                if (frequently()) {
                    indexWriter.commit();
                }
                long instant = asLong(date);
                document.add(new SortedNumericDocValuesField(DATE_FIELD, instant));
                document.add(new LongPoint(INSTANT_FIELD, instant));
                indexWriter.addDocument(document);
                document.clear();
            }
        }
        try (IndexReader indexReader = DirectoryReader.open(directory)) {
            IndexSearcher indexSearcher = newSearcher(indexReader, true, true);

            AutoDateHistogramAggregationBuilder aggregationBuilder = new AutoDateHistogramAggregationBuilder("_name");
            if (configure != null) {
                configure.accept(aggregationBuilder);
            }

            DateFieldMapper.Builder builder = new DateFieldMapper.Builder("_name");
            DateFieldMapper.DateFieldType fieldType = builder.fieldType();
            fieldType.setHasDocValues(true);
            fieldType.setName(aggregationBuilder.field());

            InternalAutoDateHistogram histogram;
            if (reduced) {
                histogram = searchAndReduce(indexSearcher, query, aggregationBuilder, fieldType);
            } else {
                histogram = search(indexSearcher, query, aggregationBuilder, fieldType);
            }
            verify.accept(histogram);
        }
    }
}
From source file:org.elasticsearch.search.aggregations.bucket.histogram.DateHistogramAggregatorTests.java
License:Apache License
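The same indexing pattern for a date histogram: one reused Document per dataset entry, cleared after every addDocument().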
private void executeTestCase(boolean reduced, Query query, List<String> dataset,
        Consumer<DateHistogramAggregationBuilder> configure, Consumer<Histogram> verify) throws IOException {
    try (Directory directory = newDirectory()) {
        try (RandomIndexWriter indexWriter = new RandomIndexWriter(random(), directory)) {
            Document document = new Document();
            for (String date : dataset) {
                if (frequently()) {
                    indexWriter.commit();
                }
                long instant = asLong(date);
                document.add(new SortedNumericDocValuesField(DATE_FIELD, instant));
                document.add(new LongPoint(INSTANT_FIELD, instant));
                indexWriter.addDocument(document);
                document.clear();
            }
        }
        try (IndexReader indexReader = DirectoryReader.open(directory)) {
            IndexSearcher indexSearcher = newSearcher(indexReader, true, true);

            DateHistogramAggregationBuilder aggregationBuilder = new DateHistogramAggregationBuilder("_name");
            if (configure != null) {
                configure.accept(aggregationBuilder);
            }

            DateFieldMapper.Builder builder = new DateFieldMapper.Builder("_name");
            DateFieldMapper.DateFieldType fieldType = builder.fieldType();
            fieldType.setHasDocValues(true);
            fieldType.setName(aggregationBuilder.field());

            InternalDateHistogram histogram;
            if (reduced) {
                histogram = searchAndReduce(indexSearcher, query, aggregationBuilder, fieldType);
            } else {
                histogram = search(indexSearcher, query, aggregationBuilder, fieldType);
            }
            verify.accept(histogram);
        }
    }
}
From source file:org.elasticsearch.search.aggregations.bucket.missing.MissingAggregatorTests.java
License:Apache License
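Delegates field population to a Consumer&lt;Document&gt;, clears the reused Document after each addDocument(), and verifies a missing aggregation over the result.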
private void executeTestCase(int numDocs, String fieldName, Query query, Consumer<Document> consumer,
        Consumer<InternalMissing> verify, boolean reduced) throws IOException {
    try (Directory directory = newDirectory()) {
        try (RandomIndexWriter indexWriter = new RandomIndexWriter(random(), directory)) {
            Document document = new Document();
            for (int i = 0; i < numDocs; i++) {
                if (frequently()) {
                    indexWriter.commit();
                }
                consumer.accept(document);
                indexWriter.addDocument(document);
                document.clear();
            }
        }
        try (IndexReader indexReader = DirectoryReader.open(directory)) {
            IndexSearcher indexSearcher = newSearcher(indexReader, true, true);
            MissingAggregationBuilder builder = new MissingAggregationBuilder("_name", ValueType.LONG);
            builder.field(fieldName);

            NumberFieldMapper.Builder mapperBuilder = new NumberFieldMapper.Builder("_name",
                    NumberFieldMapper.NumberType.LONG);
            MappedFieldType fieldType = mapperBuilder.fieldType();
            fieldType.setHasDocValues(true);
            fieldType.setName(builder.field());

            InternalMissing missing;
            if (reduced) {
                missing = searchAndReduce(indexSearcher, query, builder, fieldType);
            } else {
                missing = search(indexSearcher, query, builder, fieldType);
            }
            verify.accept(missing);
        }
    }
}
From source file:org.elasticsearch.search.aggregations.pipeline.bucketmetrics.avg.AvgBucketAggregatorTests.java
License:Apache License
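A regression test for issue #30608 (see the comment below); it indexes date/value pairs through a reused, cleared Document before reducing an avg_bucket pipeline aggregation.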
/**
 * Test for issue #30608. Under the following circumstances:
 *
 * A. Multi-bucket agg in the first entry of our internal list
 * B. Regular agg as the immediate child of the multi-bucket in A
 * C. Regular agg with the same name as B at the top level, listed as the second entry in our internal list
 * D. Finally, a pipeline agg with the path down to B
 *
 * BucketMetrics reduction would throw a class cast exception due to bad subpathing. This test ensures
 * it is fixed.
 *
 * Note: we have this test inside of the `avg_bucket` package so that we can get access to the package-private
 * `doReduce()` needed for testing this
 */
public void testSameAggNames() throws IOException {
    Query query = new MatchAllDocsQuery();

    AvgAggregationBuilder avgBuilder = new AvgAggregationBuilder("foo").field(VALUE_FIELD);
    DateHistogramAggregationBuilder histo = new DateHistogramAggregationBuilder("histo")
            .dateHistogramInterval(DateHistogramInterval.YEAR).field(DATE_FIELD)
            .subAggregation(new AvgAggregationBuilder("foo").field(VALUE_FIELD));

    AvgBucketPipelineAggregationBuilder avgBucketBuilder = new AvgBucketPipelineAggregationBuilder(
            "the_avg_bucket", "histo>foo");

    try (Directory directory = newDirectory()) {
        try (RandomIndexWriter indexWriter = new RandomIndexWriter(random(), directory)) {
            Document document = new Document();
            for (String date : dataset) {
                if (frequently()) {
                    indexWriter.commit();
                }
                document.add(new SortedNumericDocValuesField(DATE_FIELD, asLong(date)));
                document.add(new SortedNumericDocValuesField(VALUE_FIELD, randomInt()));
                indexWriter.addDocument(document);
                document.clear();
            }
        }

        InternalAvg avgResult;
        InternalDateHistogram histogramResult;
        try (IndexReader indexReader = DirectoryReader.open(directory)) {
            IndexSearcher indexSearcher = newSearcher(indexReader, true, true);

            DateFieldMapper.Builder builder = new DateFieldMapper.Builder("histo");
            DateFieldMapper.DateFieldType fieldType = builder.fieldType();
            fieldType.setHasDocValues(true);
            fieldType.setName(DATE_FIELD);

            MappedFieldType valueFieldType = new NumberFieldMapper.NumberFieldType(
                    NumberFieldMapper.NumberType.LONG);
            valueFieldType.setName(VALUE_FIELD);
            valueFieldType.setHasDocValues(true);

            avgResult = searchAndReduce(indexSearcher, query, avgBuilder, 10000, null,
                    new MappedFieldType[] { fieldType, valueFieldType });
            histogramResult = searchAndReduce(indexSearcher, query, histo, 10000, null,
                    new MappedFieldType[] { fieldType, valueFieldType });
        }

        // Finally, reduce the pipeline agg
        PipelineAggregator avgBucketAgg = avgBucketBuilder.createInternal(Collections.emptyMap());
        List<Aggregation> reducedAggs = new ArrayList<>(2);

        // Histo has to go first to exercise the bug
        reducedAggs.add(histogramResult);
        reducedAggs.add(avgResult);
        Aggregations aggregations = new Aggregations(reducedAggs);
        InternalAggregation pipelineResult = ((AvgBucketPipelineAggregator) avgBucketAgg).doReduce(aggregations,
                null);
        assertNotNull(pipelineResult);
    }
}
From source file:org.elasticsearch.search.aggregations.pipeline.CumulativeSumAggregatorTests.java
License:Apache License
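Indexes random numeric values through a reused Document and asserts that a cumulative-sum pipeline aggregation accumulates the per-bucket doc counts.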
public void testDocCount() throws IOException {
    Query query = new MatchAllDocsQuery();

    int numDocs = randomIntBetween(6, 20);
    int interval = randomIntBetween(2, 5);

    int minRandomValue = 0;
    int maxRandomValue = 20;

    int numValueBuckets = ((maxRandomValue - minRandomValue) / interval) + 1;
    long[] valueCounts = new long[numValueBuckets];

    HistogramAggregationBuilder aggBuilder = new HistogramAggregationBuilder("histo").field(VALUE_FIELD)
            .interval(interval).extendedBounds(minRandomValue, maxRandomValue);
    aggBuilder.subAggregation(new CumulativeSumPipelineAggregationBuilder("cusum", "_count"));

    executeTestCase(query, aggBuilder, histogram -> {
        List<? extends Histogram.Bucket> buckets = ((Histogram) histogram).getBuckets();

        assertThat(buckets.size(), equalTo(numValueBuckets));

        double sum = 0;
        for (int i = 0; i < numValueBuckets; ++i) {
            Histogram.Bucket bucket = buckets.get(i);
            assertThat(bucket, notNullValue());
            assertThat(((Number) bucket.getKey()).longValue(), equalTo((long) i * interval));
            assertThat(bucket.getDocCount(), equalTo(valueCounts[i]));
            sum += bucket.getDocCount();
            InternalSimpleValue cumulativeSumValue = bucket.getAggregations().get("cusum");
            assertThat(cumulativeSumValue, notNullValue());
            assertThat(cumulativeSumValue.getName(), equalTo("cusum"));
            assertThat(cumulativeSumValue.value(), equalTo(sum));
        }
    }, indexWriter -> {
        Document document = new Document();
        for (int i = 0; i < numDocs; i++) {
            int fieldValue = randomIntBetween(minRandomValue, maxRandomValue);
            document.add(new NumericDocValuesField(VALUE_FIELD, fieldValue));
            final int bucket = (fieldValue / interval);
            valueCounts[bucket]++;
            indexWriter.addDocument(document);
            document.clear();
        }
    });
}