Usage examples for org.apache.lucene.index.Fields#terms(String)
public abstract Terms terms(String field) throws IOException;
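Per Lucene's javadoc, terms(field) returns the Terms for the given field, or null if the field does not exist. Before the project-specific examples below, here is a minimal, self-contained sketch of the typical call pattern against the Lucene 5.x-era API that most of these examples use (Fields obtained from LeafReader.fields()); the index path "index" and field name "body" are illustrative placeholders, not taken from any example on this page.

    import java.io.IOException;
    import java.nio.file.Paths;

    import org.apache.lucene.index.DirectoryReader;
    import org.apache.lucene.index.Fields;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.LeafReaderContext;
    import org.apache.lucene.index.Terms;
    import org.apache.lucene.index.TermsEnum;
    import org.apache.lucene.store.FSDirectory;
    import org.apache.lucene.util.BytesRef;

    public class FieldsTermsExample {
        public static void main(String[] args) throws IOException {
            // "index" and "body" are placeholder values; substitute your own index path and field name.
            try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("index")))) {
                for (LeafReaderContext leaf : reader.leaves()) {
                    Fields fields = leaf.reader().fields(); // per-segment view of all indexed fields
                    Terms terms = fields.terms("body");     // may return null if the field is absent
                    if (terms == null) {
                        continue;
                    }
                    TermsEnum termsEnum = terms.iterator();
                    BytesRef term;
                    while ((term = termsEnum.next()) != null) {
                        System.out.println(term.utf8ToString() + " docFreq=" + termsEnum.docFreq());
                    }
                }
            }
        }
    }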
From source file: org.elasticsearch.search.suggest.completion.CompletionFieldStats.java
License: Apache License

    /**
     * Returns total in-heap bytes used by all suggesters. This method has CPU cost
     * <code>O(numIndexedFields)</code>.
     *
     * @param fieldNamePatterns if non-null, any completion field name matching any of these patterns
     *                          will break out its in-heap bytes separately in the returned {@link CompletionStats}
     */
    public static CompletionStats completionStats(IndexReader indexReader, String... fieldNamePatterns) {
        long sizeInBytes = 0;
        ObjectLongHashMap<String> completionFields = null;
        if (fieldNamePatterns != null && fieldNamePatterns.length > 0) {
            completionFields = new ObjectLongHashMap<>(fieldNamePatterns.length);
        }
        for (LeafReaderContext atomicReaderContext : indexReader.leaves()) {
            LeafReader atomicReader = atomicReaderContext.reader();
            try {
                Fields fields = atomicReader.fields();
                for (String fieldName : fields) {
                    Terms terms = fields.terms(fieldName);
                    if (terms instanceof CompletionTerms) {
                        // TODO: currently we load up the suggester for reporting its size
                        long fstSize = ((CompletionTerms) terms).suggester().ramBytesUsed();
                        if (fieldNamePatterns != null && fieldNamePatterns.length > 0
                                && Regex.simpleMatch(fieldNamePatterns, fieldName)) {
                            completionFields.addTo(fieldName, fstSize);
                        }
                        sizeInBytes += fstSize;
                    }
                }
            } catch (IOException ioe) {
                throw new ElasticsearchException(ioe);
            }
        }
        return new CompletionStats(sizeInBytes, completionFields);
    }
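Usage note: the instanceof check above also covers the case where terms(fieldName) returns null for an absent field, since null is never an instance of CompletionTerms. The FST sizes are accumulated per leaf, so each field's total is the sum of its per-segment suggester sizes.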
From source file: org.elasticsearch.search.suggest.completion.old.AnalyzingCompletionLookupProvider.java
License: Apache License

    @Override
    public FieldsConsumer consumer(final IndexOutput output) throws IOException {
        CodecUtil.writeHeader(output, CODEC_NAME, CODEC_VERSION_LATEST);
        return new FieldsConsumer() {
            private Map<String, Long> fieldOffsets = new HashMap<>();

            @Override
            public void close() throws IOException {
                try {
                    /*
                     * write the offsets per field such that we know where
                     * we need to load the FSTs from
                     */
                    long pointer = output.getFilePointer();
                    output.writeVInt(fieldOffsets.size());
                    for (Map.Entry<String, Long> entry : fieldOffsets.entrySet()) {
                        output.writeString(entry.getKey());
                        output.writeVLong(entry.getValue());
                    }
                    output.writeLong(pointer);
                    CodecUtil.writeFooter(output);
                } finally {
                    IOUtils.close(output);
                }
            }

            @Override
            public void write(Fields fields) throws IOException {
                for (String field : fields) {
                    Terms terms = fields.terms(field);
                    if (terms == null) {
                        continue;
                    }
                    TermsEnum termsEnum = terms.iterator();
                    PostingsEnum docsEnum = null;
                    final SuggestPayload spare = new SuggestPayload();
                    int maxAnalyzedPathsForOneInput = 0;
                    final XAnalyzingSuggester.XBuilder builder = new XAnalyzingSuggester.XBuilder(
                            maxSurfaceFormsPerAnalyzedForm, hasPayloads, XAnalyzingSuggester.PAYLOAD_SEP);
                    int docCount = 0;
                    while (true) {
                        BytesRef term = termsEnum.next();
                        if (term == null) {
                            break;
                        }
                        docsEnum = termsEnum.postings(null, docsEnum, PostingsEnum.PAYLOADS);
                        builder.startTerm(term);
                        int docFreq = 0;
                        while (docsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
                            for (int i = 0; i < docsEnum.freq(); i++) {
                                final int position = docsEnum.nextPosition();
                                AnalyzingCompletionLookupProvider.this.parsePayload(docsEnum.getPayload(), spare);
                                builder.addSurface(spare.surfaceForm.get(), spare.payload.get(), spare.weight);
                                // multi fields have the same surface form so we sum up here
                                maxAnalyzedPathsForOneInput = Math.max(maxAnalyzedPathsForOneInput, position + 1);
                            }
                            docFreq++;
                            docCount = Math.max(docCount, docsEnum.docID() + 1);
                        }
                        builder.finishTerm(docFreq);
                    }
                    /*
                     * Here we are done processing the field and we can
                     * build the FST and write it to disk.
                     */
                    FST<Pair<Long, BytesRef>> build = builder.build();
                    assert build != null || docCount == 0 : "the FST is null but docCount is != 0 actual value: ["
                            + docCount + "]";
                    /*
                     * It's possible that the FST is null if we have 2 segments that get merged
                     * and all docs that have a value in this field are deleted. This will cause
                     * a consumer to be created but it doesn't consume any values, causing the FST builder
                     * to return null.
                     */
                    if (build != null) {
                        fieldOffsets.put(field, output.getFilePointer());
                        build.save(output);
                        /* write some more meta-info */
                        output.writeVInt(maxAnalyzedPathsForOneInput);
                        output.writeVInt(maxSurfaceFormsPerAnalyzedForm);
                        output.writeInt(maxGraphExpansions); // can be negative
                        int options = 0;
                        options |= preserveSep ? SERIALIZE_PRESERVE_SEPARATORS : 0;
                        options |= hasPayloads ? SERIALIZE_HAS_PAYLOADS : 0;
                        options |= preservePositionIncrements ? SERIALIZE_PRESERVE_POSITION_INCREMENTS : 0;
                        output.writeVInt(options);
                        output.writeVInt(XAnalyzingSuggester.SEP_LABEL);
                        output.writeVInt(XAnalyzingSuggester.END_BYTE);
                        output.writeVInt(XAnalyzingSuggester.PAYLOAD_SEP);
                        output.writeVInt(XAnalyzingSuggester.HOLE_CHARACTER);
                    }
                }
            }
        };
    }
From source file: org.elasticsearch.search.suggest.completion.old.AnalyzingCompletionLookupProviderV1.java
License: Apache License

    @Override
    public FieldsConsumer consumer(final IndexOutput output) throws IOException {
        // TODO write index header?
        CodecUtil.writeHeader(output, CODEC_NAME, CODEC_VERSION);
        return new FieldsConsumer() {
            private Map<String, Long> fieldOffsets = new HashMap<>();

            @Override
            public void close() throws IOException {
                try {
                    /*
                     * write the offsets per field such that we know where
                     * we need to load the FSTs from
                     */
                    long pointer = output.getFilePointer();
                    output.writeVInt(fieldOffsets.size());
                    for (Map.Entry<String, Long> entry : fieldOffsets.entrySet()) {
                        output.writeString(entry.getKey());
                        output.writeVLong(entry.getValue());
                    }
                    output.writeLong(pointer);
                } finally {
                    IOUtils.close(output);
                }
            }

            @Override
            public void write(Fields fields) throws IOException {
                for (String field : fields) {
                    Terms terms = fields.terms(field);
                    if (terms == null) {
                        continue;
                    }
                    TermsEnum termsEnum = terms.iterator();
                    PostingsEnum docsEnum = null;
                    final SuggestPayload spare = new SuggestPayload();
                    int maxAnalyzedPathsForOneInput = 0;
                    final XAnalyzingSuggester.XBuilder builder = new XAnalyzingSuggester.XBuilder(
                            maxSurfaceFormsPerAnalyzedForm, hasPayloads, XAnalyzingSuggester.PAYLOAD_SEP);
                    int docCount = 0;
                    while (true) {
                        BytesRef term = termsEnum.next();
                        if (term == null) {
                            break;
                        }
                        docsEnum = termsEnum.postings(null, docsEnum, PostingsEnum.PAYLOADS);
                        builder.startTerm(term);
                        int docFreq = 0;
                        while (docsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
                            for (int i = 0; i < docsEnum.freq(); i++) {
                                final int position = docsEnum.nextPosition();
                                AnalyzingCompletionLookupProviderV1.this.parsePayload(docsEnum.getPayload(), spare);
                                builder.addSurface(spare.surfaceForm.get(), spare.payload.get(), spare.weight);
                                // multi fields have the same surface form so we sum up here
                                maxAnalyzedPathsForOneInput = Math.max(maxAnalyzedPathsForOneInput, position + 1);
                            }
                            docFreq++;
                            docCount = Math.max(docCount, docsEnum.docID() + 1);
                        }
                        builder.finishTerm(docFreq);
                    }
                    /*
                     * Here we are done processing the field and we can
                     * build the FST and write it to disk.
                     */
                    FST<Pair<Long, BytesRef>> build = builder.build();
                    assert build != null || docCount == 0 : "the FST is null but docCount is != 0 actual value: ["
                            + docCount + "]";
                    /*
                     * It's possible that the FST is null if we have 2 segments that get merged
                     * and all docs that have a value in this field are deleted. This will cause
                     * a consumer to be created but it doesn't consume any values, causing the FST builder
                     * to return null.
                     */
                    if (build != null) {
                        fieldOffsets.put(field, output.getFilePointer());
                        build.save(output);
                        /* write some more meta-info */
                        output.writeVInt(maxAnalyzedPathsForOneInput);
                        output.writeVInt(maxSurfaceFormsPerAnalyzedForm);
                        output.writeInt(maxGraphExpansions); // can be negative
                        int options = 0;
                        options |= preserveSep ? SERIALIZE_PRESERVE_SEPARATORS : 0;
                        options |= hasPayloads ? SERIALIZE_HAS_PAYLOADS : 0;
                        options |= preservePositionIncrements ? SERIALIZE_PRESERVE_POSITION_INCREMENTS : 0;
                        output.writeVInt(options);
                    }
                }
            }
        };
    }
From source file: org.elasticsearch.search.suggest.completion2x.AnalyzingCompletionLookupProvider.java
License: Apache License

    @Override
    public FieldsConsumer consumer(final IndexOutput output) throws IOException {
        CodecUtil.writeHeader(output, CODEC_NAME, CODEC_VERSION_LATEST);
        return new FieldsConsumer() {
            private Map<String, Long> fieldOffsets = new HashMap<>();

            @Override
            public void close() throws IOException {
                try {
                    /*
                     * write the offsets per field such that we know where
                     * we need to load the FSTs from
                     */
                    long pointer = output.getFilePointer();
                    output.writeVInt(fieldOffsets.size());
                    for (Map.Entry<String, Long> entry : fieldOffsets.entrySet()) {
                        output.writeString(entry.getKey());
                        output.writeVLong(entry.getValue());
                    }
                    output.writeLong(pointer);
                    CodecUtil.writeFooter(output);
                } finally {
                    IOUtils.close(output);
                }
            }

            @Override
            public void write(Fields fields) throws IOException {
                for (String field : fields) {
                    Terms terms = fields.terms(field);
                    if (terms == null) {
                        continue;
                    }
                    TermsEnum termsEnum = terms.iterator();
                    PostingsEnum docsEnum = null;
                    final SuggestPayload spare = new SuggestPayload();
                    int maxAnalyzedPathsForOneInput = 0;
                    final XAnalyzingSuggester.XBuilder builder = new XAnalyzingSuggester.XBuilder(
                            maxSurfaceFormsPerAnalyzedForm, hasPayloads, XAnalyzingSuggester.PAYLOAD_SEP);
                    int docCount = 0;
                    while (true) {
                        BytesRef term = termsEnum.next();
                        if (term == null) {
                            break;
                        }
                        docsEnum = termsEnum.postings(docsEnum, PostingsEnum.PAYLOADS);
                        builder.startTerm(term);
                        int docFreq = 0;
                        while (docsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
                            for (int i = 0; i < docsEnum.freq(); i++) {
                                final int position = docsEnum.nextPosition();
                                AnalyzingCompletionLookupProvider.this.parsePayload(docsEnum.getPayload(), spare);
                                builder.addSurface(spare.surfaceForm.get(), spare.payload.get(), spare.weight);
                                // multi fields have the same surface form so we sum up here
                                maxAnalyzedPathsForOneInput = Math.max(maxAnalyzedPathsForOneInput, position + 1);
                            }
                            docFreq++;
                            docCount = Math.max(docCount, docsEnum.docID() + 1);
                        }
                        builder.finishTerm(docFreq);
                    }
                    /*
                     * Here we are done processing the field and we can
                     * build the FST and write it to disk.
                     */
                    FST<Pair<Long, BytesRef>> build = builder.build();
                    assert build != null || docCount == 0 : "the FST is null but docCount is != 0 actual value: ["
                            + docCount + "]";
                    /*
                     * It's possible that the FST is null if we have 2 segments that get merged
                     * and all docs that have a value in this field are deleted. This will cause
                     * a consumer to be created but it doesn't consume any values, causing the FST builder
                     * to return null.
                     */
                    if (build != null) {
                        fieldOffsets.put(field, output.getFilePointer());
                        build.save(output);
                        /* write some more meta-info */
                        output.writeVInt(maxAnalyzedPathsForOneInput);
                        output.writeVInt(maxSurfaceFormsPerAnalyzedForm);
                        output.writeInt(maxGraphExpansions); // can be negative
                        int options = 0;
                        options |= preserveSep ? SERIALIZE_PRESERVE_SEPARATORS : 0;
                        options |= hasPayloads ? SERIALIZE_HAS_PAYLOADS : 0;
                        options |= preservePositionIncrements ? SERIALIZE_PRESERVE_POSITION_INCREMENTS : 0;
                        output.writeVInt(options);
                        output.writeVInt(XAnalyzingSuggester.SEP_LABEL);
                        output.writeVInt(XAnalyzingSuggester.END_BYTE);
                        output.writeVInt(XAnalyzingSuggester.PAYLOAD_SEP);
                        output.writeVInt(XAnalyzingSuggester.HOLE_CHARACTER);
                    }
                }
            }
        };
    }
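Usage note: this completion2x variant differs from the two older providers above mainly in how it reads postings. It uses the newer two-argument TermsEnum.postings(reuse, flags) overload, while the older code calls the three-argument form whose first argument (null here) is a live-docs filter.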
From source file: org.elasticsearch.termvectors.GetTermVectorCheckDocFreqTests.java
License: Apache License

    private void checkWithoutFieldStatistics(int numDocs, String[] values, int[] freq, int[][] pos,
            int[][] startOffset, int[][] endOffset, int i) throws IOException {
        TermVectorRequestBuilder resp = client().prepareTermVector("test", "type1", Integer.toString(i))
                .setPayloads(true).setOffsets(true).setPositions(true).setTermStatistics(true)
                .setFieldStatistics(false).setSelectedFields();
        TermVectorResponse response = resp.execute().actionGet();
        assertThat("doc id: " + i + " doesn't exists but should", response.isExists(), equalTo(true));
        Fields fields = response.getFields();
        assertThat(fields.size(), equalTo(1));
        Terms terms = fields.terms("field");
        assertThat(terms.size(), equalTo(8L));
        assertThat(terms.getSumTotalTermFreq(), Matchers.equalTo((long) -1));
        assertThat(terms.getDocCount(), Matchers.equalTo(-1));
        assertThat(terms.getSumDocFreq(), equalTo((long) -1));
        TermsEnum iterator = terms.iterator(null);
        for (int j = 0; j < values.length; j++) {
            String string = values[j];
            BytesRef next = iterator.next();
            assertThat(next, Matchers.notNullValue());
            assertThat("expected " + string, string, equalTo(next.utf8ToString()));
            assertThat(next, Matchers.notNullValue());
            if (string.equals("the")) {
                assertThat("expected ttf of " + string, numDocs * 2, equalTo((int) iterator.totalTermFreq()));
            } else {
                assertThat("expected ttf of " + string, numDocs, equalTo((int) iterator.totalTermFreq()));
            }
            DocsAndPositionsEnum docsAndPositions = iterator.docsAndPositions(null, null);
            assertThat(docsAndPositions.nextDoc(), equalTo(0));
            assertThat(freq[j], equalTo(docsAndPositions.freq()));
            assertThat(iterator.docFreq(), equalTo(numDocs));
            int[] termPos = pos[j];
            int[] termStartOffset = startOffset[j];
            int[] termEndOffset = endOffset[j];
            assertThat(termPos.length, equalTo(freq[j]));
            assertThat(termStartOffset.length, equalTo(freq[j]));
            assertThat(termEndOffset.length, equalTo(freq[j]));
            for (int k = 0; k < freq[j]; k++) {
                int nextPosition = docsAndPositions.nextPosition();
                assertThat("term: " + string, nextPosition, equalTo(termPos[k]));
                assertThat("term: " + string, docsAndPositions.startOffset(), equalTo(termStartOffset[k]));
                assertThat("term: " + string, docsAndPositions.endOffset(), equalTo(termEndOffset[k]));
                assertThat("term: " + string, docsAndPositions.getPayload(), equalTo(new BytesRef("word")));
            }
        }
        assertThat(iterator.next(), Matchers.nullValue());
        XContentBuilder xBuilder = new XContentFactory().jsonBuilder();
        response.toXContent(xBuilder, null);
        BytesStream bytesStream = xBuilder.bytesStream();
        String utf8 = bytesStream.bytes().toUtf8();
        String expectedString = "{\"_index\":\"test\",\"_type\":\"type1\",\"_id\":\"" + i
                + "\",\"_version\":1,\"exists\":true,\"term_vectors\":{\"field\":{\"terms\":{\"brown\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":2,\"start_offset\":10,\"end_offset\":15,\"payload\":\"d29yZA==\"}]},\"dog\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":8,\"start_offset\":40,\"end_offset\":43,\"payload\":\"d29yZA==\"}]},\"fox\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":3,\"start_offset\":16,\"end_offset\":19,\"payload\":\"d29yZA==\"}]},\"jumps\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":4,\"start_offset\":20,\"end_offset\":25,\"payload\":\"d29yZA==\"}]},\"lazy\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":7,\"start_offset\":35,\"end_offset\":39,\"payload\":\"d29yZA==\"}]},\"over\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":5,\"start_offset\":26,\"end_offset\":30,\"payload\":\"d29yZA==\"}]},\"quick\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":1,\"start_offset\":4,\"end_offset\":9,\"payload\":\"d29yZA==\"}]},\"the\":{\"doc_freq\":15,\"ttf\":30,\"term_freq\":2,\"tokens\":[{\"position\":0,\"start_offset\":0,\"end_offset\":3,\"payload\":\"d29yZA==\"},{\"position\":6,\"start_offset\":31,\"end_offset\":34,\"payload\":\"d29yZA==\"}]}}}}}";
        assertThat(utf8, equalTo(expectedString));
    }
From source file: org.elasticsearch.termvectors.GetTermVectorCheckDocFreqTests.java
License: Apache License

    private void checkWithoutTermStatistics(int numDocs, String[] values, int[] freq, int[][] pos,
            int[][] startOffset, int[][] endOffset, int i) throws IOException {
        TermVectorRequestBuilder resp = client().prepareTermVector("test", "type1", Integer.toString(i))
                .setPayloads(true).setOffsets(true).setPositions(true).setTermStatistics(false)
                .setFieldStatistics(true).setSelectedFields();
        assertThat(resp.request().termStatistics(), equalTo(false));
        TermVectorResponse response = resp.execute().actionGet();
        assertThat("doc id: " + i + " doesn't exists but should", response.isExists(), equalTo(true));
        Fields fields = response.getFields();
        assertThat(fields.size(), equalTo(1));
        Terms terms = fields.terms("field");
        assertThat(terms.size(), equalTo(8L));
        assertThat(terms.getSumTotalTermFreq(), Matchers.equalTo((long) (9 * numDocs)));
        assertThat(terms.getDocCount(), Matchers.equalTo(numDocs));
        assertThat(terms.getSumDocFreq(), equalTo((long) numDocs * values.length));
        TermsEnum iterator = terms.iterator(null);
        for (int j = 0; j < values.length; j++) {
            String string = values[j];
            BytesRef next = iterator.next();
            assertThat(next, Matchers.notNullValue());
            assertThat("expected " + string, string, equalTo(next.utf8ToString()));
            assertThat(next, Matchers.notNullValue());
            assertThat("expected ttf of " + string, -1, equalTo((int) iterator.totalTermFreq()));
            DocsAndPositionsEnum docsAndPositions = iterator.docsAndPositions(null, null);
            assertThat(docsAndPositions.nextDoc(), equalTo(0));
            assertThat(freq[j], equalTo(docsAndPositions.freq()));
            assertThat(iterator.docFreq(), equalTo(-1));
            int[] termPos = pos[j];
            int[] termStartOffset = startOffset[j];
            int[] termEndOffset = endOffset[j];
            assertThat(termPos.length, equalTo(freq[j]));
            assertThat(termStartOffset.length, equalTo(freq[j]));
            assertThat(termEndOffset.length, equalTo(freq[j]));
            for (int k = 0; k < freq[j]; k++) {
                int nextPosition = docsAndPositions.nextPosition();
                assertThat("term: " + string, nextPosition, equalTo(termPos[k]));
                assertThat("term: " + string, docsAndPositions.startOffset(), equalTo(termStartOffset[k]));
                assertThat("term: " + string, docsAndPositions.endOffset(), equalTo(termEndOffset[k]));
                assertThat("term: " + string, docsAndPositions.getPayload(), equalTo(new BytesRef("word")));
            }
        }
        assertThat(iterator.next(), Matchers.nullValue());
        XContentBuilder xBuilder = new XContentFactory().jsonBuilder();
        response.toXContent(xBuilder, null);
        BytesStream bytesStream = xBuilder.bytesStream();
        String utf8 = bytesStream.bytes().toUtf8();
        String expectedString = "{\"_index\":\"test\",\"_type\":\"type1\",\"_id\":\"" + i
                + "\",\"_version\":1,\"exists\":true,\"term_vectors\":{\"field\":{\"field_statistics\":{\"sum_doc_freq\":120,\"doc_count\":15,\"sum_ttf\":135},\"terms\":{\"brown\":{\"term_freq\":1,\"tokens\":[{\"position\":2,\"start_offset\":10,\"end_offset\":15,\"payload\":\"d29yZA==\"}]},\"dog\":{\"term_freq\":1,\"tokens\":[{\"position\":8,\"start_offset\":40,\"end_offset\":43,\"payload\":\"d29yZA==\"}]},\"fox\":{\"term_freq\":1,\"tokens\":[{\"position\":3,\"start_offset\":16,\"end_offset\":19,\"payload\":\"d29yZA==\"}]},\"jumps\":{\"term_freq\":1,\"tokens\":[{\"position\":4,\"start_offset\":20,\"end_offset\":25,\"payload\":\"d29yZA==\"}]},\"lazy\":{\"term_freq\":1,\"tokens\":[{\"position\":7,\"start_offset\":35,\"end_offset\":39,\"payload\":\"d29yZA==\"}]},\"over\":{\"term_freq\":1,\"tokens\":[{\"position\":5,\"start_offset\":26,\"end_offset\":30,\"payload\":\"d29yZA==\"}]},\"quick\":{\"term_freq\":1,\"tokens\":[{\"position\":1,\"start_offset\":4,\"end_offset\":9,\"payload\":\"d29yZA==\"}]},\"the\":{\"term_freq\":2,\"tokens\":[{\"position\":0,\"start_offset\":0,\"end_offset\":3,\"payload\":\"d29yZA==\"},{\"position\":6,\"start_offset\":31,\"end_offset\":34,\"payload\":\"d29yZA==\"}]}}}}}";
        assertThat(utf8, equalTo(expectedString));
    }
From source file: org.elasticsearch.termvectors.GetTermVectorCheckDocFreqTests.java
License: Apache License

    private void checkAllInfo(int numDocs, String[] values, int[] freq, int[][] pos, int[][] startOffset,
            int[][] endOffset, int i) throws IOException {
        TermVectorRequestBuilder resp = client().prepareTermVector("test", "type1", Integer.toString(i))
                .setPayloads(true).setOffsets(true).setPositions(true).setFieldStatistics(true)
                .setTermStatistics(true).setSelectedFields();
        assertThat(resp.request().fieldStatistics(), equalTo(true));
        TermVectorResponse response = resp.execute().actionGet();
        assertThat("doc id: " + i + " doesn't exists but should", response.isExists(), equalTo(true));
        Fields fields = response.getFields();
        assertThat(fields.size(), equalTo(1));
        Terms terms = fields.terms("field");
        assertThat(terms.size(), equalTo(8L));
        assertThat(terms.getSumTotalTermFreq(), Matchers.equalTo((long) (9 * numDocs)));
        assertThat(terms.getDocCount(), Matchers.equalTo(numDocs));
        assertThat(terms.getSumDocFreq(), equalTo((long) numDocs * values.length));
        TermsEnum iterator = terms.iterator(null);
        for (int j = 0; j < values.length; j++) {
            String string = values[j];
            BytesRef next = iterator.next();
            assertThat(next, Matchers.notNullValue());
            assertThat("expected " + string, string, equalTo(next.utf8ToString()));
            assertThat(next, Matchers.notNullValue());
            if (string.equals("the")) {
                assertThat("expected ttf of " + string, numDocs * 2, equalTo((int) iterator.totalTermFreq()));
            } else {
                assertThat("expected ttf of " + string, numDocs, equalTo((int) iterator.totalTermFreq()));
            }
            DocsAndPositionsEnum docsAndPositions = iterator.docsAndPositions(null, null);
            assertThat(docsAndPositions.nextDoc(), equalTo(0));
            assertThat(freq[j], equalTo(docsAndPositions.freq()));
            assertThat(iterator.docFreq(), equalTo(numDocs));
            int[] termPos = pos[j];
            int[] termStartOffset = startOffset[j];
            int[] termEndOffset = endOffset[j];
            assertThat(termPos.length, equalTo(freq[j]));
            assertThat(termStartOffset.length, equalTo(freq[j]));
            assertThat(termEndOffset.length, equalTo(freq[j]));
            for (int k = 0; k < freq[j]; k++) {
                int nextPosition = docsAndPositions.nextPosition();
                assertThat("term: " + string, nextPosition, equalTo(termPos[k]));
                assertThat("term: " + string, docsAndPositions.startOffset(), equalTo(termStartOffset[k]));
                assertThat("term: " + string, docsAndPositions.endOffset(), equalTo(termEndOffset[k]));
                assertThat("term: " + string, docsAndPositions.getPayload(), equalTo(new BytesRef("word")));
            }
        }
        assertThat(iterator.next(), Matchers.nullValue());
        XContentBuilder xBuilder = new XContentFactory().jsonBuilder();
        response.toXContent(xBuilder, null);
        BytesStream bytesStream = xBuilder.bytesStream();
        String utf8 = bytesStream.bytes().toUtf8();
        String expectedString = "{\"_index\":\"test\",\"_type\":\"type1\",\"_id\":\"" + i
                + "\",\"_version\":1,\"exists\":true,\"term_vectors\":{\"field\":{\"field_statistics\":{\"sum_doc_freq\":120,\"doc_count\":15,\"sum_ttf\":135},\"terms\":{\"brown\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":2,\"start_offset\":10,\"end_offset\":15,\"payload\":\"d29yZA==\"}]},\"dog\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":8,\"start_offset\":40,\"end_offset\":43,\"payload\":\"d29yZA==\"}]},\"fox\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":3,\"start_offset\":16,\"end_offset\":19,\"payload\":\"d29yZA==\"}]},\"jumps\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":4,\"start_offset\":20,\"end_offset\":25,\"payload\":\"d29yZA==\"}]},\"lazy\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":7,\"start_offset\":35,\"end_offset\":39,\"payload\":\"d29yZA==\"}]},\"over\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":5,\"start_offset\":26,\"end_offset\":30,\"payload\":\"d29yZA==\"}]},\"quick\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":1,\"start_offset\":4,\"end_offset\":9,\"payload\":\"d29yZA==\"}]},\"the\":{\"doc_freq\":15,\"ttf\":30,\"term_freq\":2,\"tokens\":[{\"position\":0,\"start_offset\":0,\"end_offset\":3,\"payload\":\"d29yZA==\"},{\"position\":6,\"start_offset\":31,\"end_offset\":34,\"payload\":\"d29yZA==\"}]}}}}}";
        assertThat(utf8, equalTo(expectedString));
    }
From source file: org.elasticsearch.termvectors.GetTermVectorTests.java
License: Apache License

    @Test
    public void testSimpleTermVectors() throws ElasticSearchException, IOException {
        XContentBuilder mapping = XContentFactory.jsonBuilder().startObject().startObject("type1")
                .startObject("properties").startObject("field").field("type", "string")
                .field("term_vector", "with_positions_offsets_payloads").field("analyzer", "tv_test").endObject()
                .endObject().endObject().endObject();
        ElasticSearchAssertions.assertAcked(prepareCreate("test").addMapping("type1", mapping)
                .setSettings(ImmutableSettings.settingsBuilder()
                        .put("index.analysis.analyzer.tv_test.tokenizer", "whitespace")
                        .putArray("index.analysis.analyzer.tv_test.filter", "type_as_payload", "lowercase")));
        ensureYellow();
        for (int i = 0; i < 10; i++) {
            client().prepareIndex("test", "type1", Integer.toString(i))
                    .setSource(XContentFactory.jsonBuilder().startObject()
                            .field("field", "the quick brown fox jumps over the lazy dog")
                            // 0the3 4quick9 10brown15 16fox19 20jumps25 26over30
                            // 31the34 35lazy39 40dog43
                            .endObject())
                    .execute().actionGet();
            refresh();
        }
        String[] values = { "brown", "dog", "fox", "jumps", "lazy", "over", "quick", "the" };
        int[] freq = { 1, 1, 1, 1, 1, 1, 1, 2 };
        int[][] pos = { { 2 }, { 8 }, { 3 }, { 4 }, { 7 }, { 5 }, { 1 }, { 0, 6 } };
        int[][] startOffset = { { 10 }, { 40 }, { 16 }, { 20 }, { 35 }, { 26 }, { 4 }, { 0, 31 } };
        int[][] endOffset = { { 15 }, { 43 }, { 19 }, { 25 }, { 39 }, { 30 }, { 9 }, { 3, 34 } };
        for (int i = 0; i < 10; i++) {
            TermVectorRequestBuilder resp = client().prepareTermVector("test", "type1", Integer.toString(i))
                    .setPayloads(true).setOffsets(true).setPositions(true).setSelectedFields();
            TermVectorResponse response = resp.execute().actionGet();
            assertThat("doc id: " + i + " doesn't exists but should", response.isExists(), equalTo(true));
            Fields fields = response.getFields();
            assertThat(fields.size(), equalTo(1));
            Terms terms = fields.terms("field");
            assertThat(terms.size(), equalTo(8L));
            TermsEnum iterator = terms.iterator(null);
            for (int j = 0; j < values.length; j++) {
                String string = values[j];
                BytesRef next = iterator.next();
                assertThat(next, Matchers.notNullValue());
                assertThat("expected " + string, string, equalTo(next.utf8ToString()));
                assertThat(next, Matchers.notNullValue());
                // do not test ttf or doc frequency, because here we have many
                // shards and do not know how documents are distributed
                DocsAndPositionsEnum docsAndPositions = iterator.docsAndPositions(null, null);
                assertThat(docsAndPositions.nextDoc(), equalTo(0));
                assertThat(freq[j], equalTo(docsAndPositions.freq()));
                int[] termPos = pos[j];
                int[] termStartOffset = startOffset[j];
                int[] termEndOffset = endOffset[j];
                assertThat(termPos.length, equalTo(freq[j]));
                assertThat(termStartOffset.length, equalTo(freq[j]));
                assertThat(termEndOffset.length, equalTo(freq[j]));
                for (int k = 0; k < freq[j]; k++) {
                    int nextPosition = docsAndPositions.nextPosition();
                    assertThat("term: " + string, nextPosition, equalTo(termPos[k]));
                    assertThat("term: " + string, docsAndPositions.startOffset(), equalTo(termStartOffset[k]));
                    assertThat("term: " + string, docsAndPositions.endOffset(), equalTo(termEndOffset[k]));
                    assertThat("term: " + string, docsAndPositions.getPayload(), equalTo(new BytesRef("word")));
                }
            }
            assertThat(iterator.next(), Matchers.nullValue());
        }
    }
From source file: org.elasticsearch.termvectors.GetTermVectorTests.java
License: Apache License

    @Test
    public void testRandomSingleTermVectors() throws ElasticSearchException, IOException {
        FieldType ft = new FieldType();
        int config = randomInt(6);
        boolean storePositions = false;
        boolean storeOffsets = false;
        boolean storePayloads = false;
        boolean storeTermVectors = false;
        switch (config) {
        case 0: {
            // do nothing
            break;
        }
        case 1: {
            storeTermVectors = true;
            break;
        }
        case 2: {
            storeTermVectors = true;
            storePositions = true;
            break;
        }
        case 3: {
            storeTermVectors = true;
            storeOffsets = true;
            break;
        }
        case 4: {
            storeTermVectors = true;
            storePositions = true;
            storeOffsets = true;
            break;
        }
        case 5: {
            storeTermVectors = true;
            storePositions = true;
            storePayloads = true;
            break;
        }
        case 6: {
            storeTermVectors = true;
            storePositions = true;
            storeOffsets = true;
            storePayloads = true;
            break;
        }
        }
        ft.setStoreTermVectors(storeTermVectors);
        ft.setStoreTermVectorOffsets(storeOffsets);
        ft.setStoreTermVectorPayloads(storePayloads);
        ft.setStoreTermVectorPositions(storePositions);
        String optionString = AbstractFieldMapper.termVectorOptionsToString(ft);
        XContentBuilder mapping = XContentFactory.jsonBuilder().startObject().startObject("type1")
                .startObject("properties").startObject("field").field("type", "string")
                .field("term_vector", optionString).field("analyzer", "tv_test").endObject().endObject()
                .endObject().endObject();
        ElasticSearchAssertions.assertAcked(prepareCreate("test").addMapping("type1", mapping)
                .setSettings(ImmutableSettings.settingsBuilder()
                        .put("index.analysis.analyzer.tv_test.tokenizer", "whitespace")
                        .putArray("index.analysis.analyzer.tv_test.filter", "type_as_payload", "lowercase")));
        ensureYellow();
        for (int i = 0; i < 10; i++) {
            client().prepareIndex("test", "type1", Integer.toString(i))
                    .setSource(XContentFactory.jsonBuilder().startObject()
                            .field("field", "the quick brown fox jumps over the lazy dog")
                            // 0the3 4quick9 10brown15 16fox19 20jumps25 26over30
                            // 31the34 35lazy39 40dog43
                            .endObject())
                    .execute().actionGet();
            refresh();
        }
        String[] values = { "brown", "dog", "fox", "jumps", "lazy", "over", "quick", "the" };
        int[] freq = { 1, 1, 1, 1, 1, 1, 1, 2 };
        int[][] pos = { { 2 }, { 8 }, { 3 }, { 4 }, { 7 }, { 5 }, { 1 }, { 0, 6 } };
        int[][] startOffset = { { 10 }, { 40 }, { 16 }, { 20 }, { 35 }, { 26 }, { 4 }, { 0, 31 } };
        int[][] endOffset = { { 15 }, { 43 }, { 19 }, { 25 }, { 39 }, { 30 }, { 9 }, { 3, 34 } };
        boolean isPayloadRequested = randomBoolean();
        boolean isOffsetRequested = randomBoolean();
        boolean isPositionsRequested = randomBoolean();
        String infoString = createInfoString(isPositionsRequested, isOffsetRequested, isPayloadRequested,
                optionString);
        for (int i = 0; i < 10; i++) {
            TermVectorRequestBuilder resp = client().prepareTermVector("test", "type1", Integer.toString(i))
                    .setPayloads(isPayloadRequested).setOffsets(isOffsetRequested)
                    .setPositions(isPositionsRequested).setSelectedFields();
            TermVectorResponse response = resp.execute().actionGet();
            assertThat(infoString + "doc id: " + i + " doesn't exists but should", response.isExists(),
                    equalTo(true));
            Fields fields = response.getFields();
            assertThat(fields.size(), equalTo(ft.storeTermVectors() ? 1 : 0));
            if (ft.storeTermVectors()) {
                Terms terms = fields.terms("field");
                assertThat(terms.size(), equalTo(8L));
                TermsEnum iterator = terms.iterator(null);
                for (int j = 0; j < values.length; j++) {
                    String string = values[j];
                    BytesRef next = iterator.next();
                    assertThat(infoString, next, Matchers.notNullValue());
                    assertThat(infoString + "expected " + string, string, equalTo(next.utf8ToString()));
                    assertThat(infoString, next, Matchers.notNullValue());
                    // do not test ttf or doc frequency, because here we have
                    // many shards and do not know how documents are distributed
                    DocsAndPositionsEnum docsAndPositions = iterator.docsAndPositions(null, null);
                    // docsAndPositions only returns something if positions, payloads
                    // or offsets are stored / requested; otherwise use DocsEnum?
                    assertThat(infoString, docsAndPositions.nextDoc(), equalTo(0));
                    assertThat(infoString, freq[j], equalTo(docsAndPositions.freq()));
                    int[] termPos = pos[j];
                    int[] termStartOffset = startOffset[j];
                    int[] termEndOffset = endOffset[j];
                    if (isPositionsRequested && storePositions) {
                        assertThat(infoString, termPos.length, equalTo(freq[j]));
                    }
                    if (isOffsetRequested && storeOffsets) {
                        assertThat(termStartOffset.length, equalTo(freq[j]));
                        assertThat(termEndOffset.length, equalTo(freq[j]));
                    }
                    for (int k = 0; k < freq[j]; k++) {
                        int nextPosition = docsAndPositions.nextPosition();
                        // only return something useful if requested and stored
                        if (isPositionsRequested && storePositions) {
                            assertThat(infoString + "positions for term: " + string, nextPosition,
                                    equalTo(termPos[k]));
                        } else {
                            assertThat(infoString + "positions for term: ", nextPosition, equalTo(-1));
                        }
                        // only return something useful if requested and stored
                        if (isPayloadRequested && storePayloads) {
                            assertThat(infoString + "payloads for term: " + string,
                                    docsAndPositions.getPayload(), equalTo(new BytesRef("word")));
                        } else {
                            assertThat(infoString + "payloads for term: " + string,
                                    docsAndPositions.getPayload(), equalTo(null));
                        }
                        // only return something useful if requested and stored
                        if (isOffsetRequested && storeOffsets) {
                            assertThat(infoString + "startOffsets term: " + string,
                                    docsAndPositions.startOffset(), equalTo(termStartOffset[k]));
                            assertThat(infoString + "endOffsets term: " + string,
                                    docsAndPositions.endOffset(), equalTo(termEndOffset[k]));
                        } else {
                            assertThat(infoString + "startOffsets term: " + string,
                                    docsAndPositions.startOffset(), equalTo(-1));
                            assertThat(infoString + "endOffsets term: " + string,
                                    docsAndPositions.endOffset(), equalTo(-1));
                        }
                    }
                }
                assertThat(iterator.next(), Matchers.nullValue());
            }
        }
    }
From source file: org.elasticsearch.test.integration.termvectors.GetTermVectorTests.java
License: Apache License

    @Test
    public void testSimpleTermVectors() throws ElasticSearchException, IOException {
        run(addMapping(prepareCreate("test"), "type1",
                new Object[] { "field", "type", "string", "term_vector", "with_positions_offsets_payloads",
                        "analyzer", "tv_test" })
                .setSettings(ImmutableSettings.settingsBuilder()
                        .put("index.analysis.analyzer.tv_test.tokenizer", "whitespace")
                        .putArray("index.analysis.analyzer.tv_test.filter", "type_as_payload", "lowercase")));
        ensureYellow();
        for (int i = 0; i < 10; i++) {
            client().prepareIndex("test", "type1", Integer.toString(i))
                    .setSource(XContentFactory.jsonBuilder().startObject()
                            .field("field", "the quick brown fox jumps over the lazy dog")
                            // 0the3 4quick9 10brown15 16fox19 20jumps25 26over30
                            // 31the34 35lazy39 40dog43
                            .endObject())
                    .execute().actionGet();
            refresh();
        }
        String[] values = { "brown", "dog", "fox", "jumps", "lazy", "over", "quick", "the" };
        int[] freq = { 1, 1, 1, 1, 1, 1, 1, 2 };
        int[][] pos = { { 2 }, { 8 }, { 3 }, { 4 }, { 7 }, { 5 }, { 1 }, { 0, 6 } };
        int[][] startOffset = { { 10 }, { 40 }, { 16 }, { 20 }, { 35 }, { 26 }, { 4 }, { 0, 31 } };
        int[][] endOffset = { { 15 }, { 43 }, { 19 }, { 25 }, { 39 }, { 30 }, { 9 }, { 3, 34 } };
        for (int i = 0; i < 10; i++) {
            TermVectorRequestBuilder resp = client().prepareTermVector("test", "type1", Integer.toString(i))
                    .setPayloads(true).setOffsets(true).setPositions(true).setSelectedFields();
            TermVectorResponse response = resp.execute().actionGet();
            assertThat("doc id: " + i + " doesn't exists but should", response.documentExists(), equalTo(true));
            Fields fields = response.getFields();
            assertThat(fields.size(), equalTo(1));
            Terms terms = fields.terms("field");
            assertThat(terms.size(), equalTo(8L));
            TermsEnum iterator = terms.iterator(null);
            for (int j = 0; j < values.length; j++) {
                String string = values[j];
                BytesRef next = iterator.next();
                assertThat(next, Matchers.notNullValue());
                assertThat("expected " + string, string, equalTo(next.utf8ToString()));
                assertThat(next, Matchers.notNullValue());
                // do not test ttf or doc frequency, because here we have many
                // shards and do not know how documents are distributed
                DocsAndPositionsEnum docsAndPositions = iterator.docsAndPositions(null, null);
                assertThat(docsAndPositions.nextDoc(), equalTo(0));
                assertThat(freq[j], equalTo(docsAndPositions.freq()));
                int[] termPos = pos[j];
                int[] termStartOffset = startOffset[j];
                int[] termEndOffset = endOffset[j];
                assertThat(termPos.length, equalTo(freq[j]));
                assertThat(termStartOffset.length, equalTo(freq[j]));
                assertThat(termEndOffset.length, equalTo(freq[j]));
                for (int k = 0; k < freq[j]; k++) {
                    int nextPosition = docsAndPositions.nextPosition();
                    assertThat("term: " + string, nextPosition, equalTo(termPos[k]));
                    assertThat("term: " + string, docsAndPositions.startOffset(), equalTo(termStartOffset[k]));
                    assertThat("term: " + string, docsAndPositions.endOffset(), equalTo(termEndOffset[k]));
                    assertThat("term: " + string, docsAndPositions.getPayload(), equalTo(new BytesRef("word")));
                }
            }
            assertThat(iterator.next(), Matchers.nullValue());
        }
    }