List of usage examples for org.apache.lucene.index.Fields.terms
public abstract Terms terms(String field) throws IOException;
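Before the Elasticsearch-specific examples below, here is a minimal sketch of calling Fields.terms(String) directly against a Lucene index. It assumes Lucene 5.x (the version the examples below target, where MultiFields.getFields is still available); the index path "/tmp/index" and field name "content" are placeholders, not taken from the examples.

import java.io.IOException;
import java.nio.file.Paths;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;

public class TermsDump {
    public static void main(String[] args) throws IOException {
        // placeholder index path; point this at a real index
        try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/tmp/index")))) {
            // merged view over all segments of the reader
            Fields fields = MultiFields.getFields(reader);
            // terms(...) returns null if the field does not exist or has no terms
            Terms terms = fields.terms("content");
            if (terms != null) {
                TermsEnum it = terms.iterator();
                BytesRef term;
                while ((term = it.next()) != null) {
                    System.out.println(term.utf8ToString() + " docFreq=" + it.docFreq());
                }
            }
        }
    }
}

The same pattern (null-check the Terms, then walk the TermsEnum until next() returns null) is what the test code below exercises against term vectors returned by Elasticsearch.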
From source file:org.elasticsearch.action.termvectors.GetTermVectorsCheckDocFreqTests.java
License:Apache License
private void checkWithoutFieldStatistics(int numDocs, String[] values, int[] freq, int[][] pos,
        int[][] startOffset, int[][] endOffset, int i) throws IOException {
    TermVectorsRequestBuilder resp = client().prepareTermVectors("test", "type1", Integer.toString(i))
            .setPayloads(true).setOffsets(true).setPositions(true).setTermStatistics(true)
            .setFieldStatistics(false).setSelectedFields();
    TermVectorsResponse response = resp.execute().actionGet();
    assertThat("doc id: " + i + " doesn't exist but should", response.isExists(), equalTo(true));
    Fields fields = response.getFields();
    assertThat(fields.size(), equalTo(1));
    Terms terms = fields.terms("field");
    assertThat(terms.size(), equalTo(8L));
    assertThat(terms.getSumTotalTermFreq(), Matchers.equalTo((long) -1));
    assertThat(terms.getDocCount(), Matchers.equalTo(-1));
    assertThat(terms.getSumDocFreq(), equalTo((long) -1));
    TermsEnum iterator = terms.iterator();
    for (int j = 0; j < values.length; j++) {
        String string = values[j];
        BytesRef next = iterator.next();
        assertThat(next, Matchers.notNullValue());
        assertThat("expected " + string, string, equalTo(next.utf8ToString()));
        assertThat(next, Matchers.notNullValue());
        if (string.equals("the")) {
            assertThat("expected ttf of " + string, numDocs * 2, equalTo((int) iterator.totalTermFreq()));
        } else {
            assertThat("expected ttf of " + string, numDocs, equalTo((int) iterator.totalTermFreq()));
        }
        PostingsEnum docsAndPositions = iterator.postings(null, null, PostingsEnum.ALL);
        assertThat(docsAndPositions.nextDoc(), equalTo(0));
        assertThat(freq[j], equalTo(docsAndPositions.freq()));
        assertThat(iterator.docFreq(), equalTo(numDocs));
        int[] termPos = pos[j];
        int[] termStartOffset = startOffset[j];
        int[] termEndOffset = endOffset[j];
        assertThat(termPos.length, equalTo(freq[j]));
        assertThat(termStartOffset.length, equalTo(freq[j]));
        assertThat(termEndOffset.length, equalTo(freq[j]));
        for (int k = 0; k < freq[j]; k++) {
            int nextPosition = docsAndPositions.nextPosition();
            assertThat("term: " + string, nextPosition, equalTo(termPos[k]));
            assertThat("term: " + string, docsAndPositions.startOffset(), equalTo(termStartOffset[k]));
            assertThat("term: " + string, docsAndPositions.endOffset(), equalTo(termEndOffset[k]));
            assertThat("term: " + string, docsAndPositions.getPayload(), equalTo(new BytesRef("word")));
        }
    }
    assertThat(iterator.next(), Matchers.nullValue());
    XContentBuilder xBuilder = XContentFactory.jsonBuilder();
    xBuilder.startObject();
    response.toXContent(xBuilder, null);
    xBuilder.endObject();
    BytesStream bytesStream = xBuilder.bytesStream();
    String utf8 = bytesStream.bytes().toUtf8().replaceFirst("\"took\":\\d+,", "");
    String expectedString = "{\"_index\":\"test\",\"_type\":\"type1\",\"_id\":\"" + i
            + "\",\"_version\":1,\"found\":true,\"term_vectors\":{\"field\":{\"terms\":{\"brown\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":2,\"start_offset\":10,\"end_offset\":15,\"payload\":\"d29yZA==\"}]},\"dog\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":8,\"start_offset\":40,\"end_offset\":43,\"payload\":\"d29yZA==\"}]},\"fox\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":3,\"start_offset\":16,\"end_offset\":19,\"payload\":\"d29yZA==\"}]},\"jumps\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":4,\"start_offset\":20,\"end_offset\":25,\"payload\":\"d29yZA==\"}]},\"lazy\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":7,\"start_offset\":35,\"end_offset\":39,\"payload\":\"d29yZA==\"}]},\"over\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":5,\"start_offset\":26,\"end_offset\":30,\"payload\":\"d29yZA==\"}]},\"quick\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":1,\"start_offset\":4,\"end_offset\":9,\"payload\":\"d29yZA==\"}]},\"the\":{\"doc_freq\":15,\"ttf\":30,\"term_freq\":2,\"tokens\":[{\"position\":0,\"start_offset\":0,\"end_offset\":3,\"payload\":\"d29yZA==\"},{\"position\":6,\"start_offset\":31,\"end_offset\":34,\"payload\":\"d29yZA==\"}]}}}}}";
    assertThat(utf8, equalTo(expectedString));
}
From source file:org.elasticsearch.action.termvectors.GetTermVectorsCheckDocFreqTests.java
License:Apache License
private void checkWithoutTermStatistics(int numDocs, String[] values, int[] freq, int[][] pos,
        int[][] startOffset, int[][] endOffset, int i) throws IOException {
    TermVectorsRequestBuilder resp = client().prepareTermVectors("test", "type1", Integer.toString(i))
            .setPayloads(true).setOffsets(true).setPositions(true).setTermStatistics(false)
            .setFieldStatistics(true).setSelectedFields();
    assertThat(resp.request().termStatistics(), equalTo(false));
    TermVectorsResponse response = resp.execute().actionGet();
    assertThat("doc id: " + i + " doesn't exist but should", response.isExists(), equalTo(true));
    Fields fields = response.getFields();
    assertThat(fields.size(), equalTo(1));
    Terms terms = fields.terms("field");
    assertThat(terms.size(), equalTo(8L));
    assertThat(terms.getSumTotalTermFreq(), Matchers.equalTo((long) (9 * numDocs)));
    assertThat(terms.getDocCount(), Matchers.equalTo(numDocs));
    assertThat(terms.getSumDocFreq(), equalTo((long) numDocs * values.length));
    TermsEnum iterator = terms.iterator();
    for (int j = 0; j < values.length; j++) {
        String string = values[j];
        BytesRef next = iterator.next();
        assertThat(next, Matchers.notNullValue());
        assertThat("expected " + string, string, equalTo(next.utf8ToString()));
        assertThat(next, Matchers.notNullValue());
        assertThat("expected ttf of " + string, -1, equalTo((int) iterator.totalTermFreq()));
        PostingsEnum docsAndPositions = iterator.postings(null, null, PostingsEnum.ALL);
        assertThat(docsAndPositions.nextDoc(), equalTo(0));
        assertThat(freq[j], equalTo(docsAndPositions.freq()));
        assertThat(iterator.docFreq(), equalTo(-1));
        int[] termPos = pos[j];
        int[] termStartOffset = startOffset[j];
        int[] termEndOffset = endOffset[j];
        assertThat(termPos.length, equalTo(freq[j]));
        assertThat(termStartOffset.length, equalTo(freq[j]));
        assertThat(termEndOffset.length, equalTo(freq[j]));
        for (int k = 0; k < freq[j]; k++) {
            int nextPosition = docsAndPositions.nextPosition();
            assertThat("term: " + string, nextPosition, equalTo(termPos[k]));
            assertThat("term: " + string, docsAndPositions.startOffset(), equalTo(termStartOffset[k]));
            assertThat("term: " + string, docsAndPositions.endOffset(), equalTo(termEndOffset[k]));
            assertThat("term: " + string, docsAndPositions.getPayload(), equalTo(new BytesRef("word")));
        }
    }
    assertThat(iterator.next(), Matchers.nullValue());
    XContentBuilder xBuilder = XContentFactory.jsonBuilder();
    xBuilder.startObject();
    response.toXContent(xBuilder, null);
    xBuilder.endObject();
    BytesStream bytesStream = xBuilder.bytesStream();
    String utf8 = bytesStream.bytes().toUtf8().replaceFirst("\"took\":\\d+,", "");
    String expectedString = "{\"_index\":\"test\",\"_type\":\"type1\",\"_id\":\"" + i
            + "\",\"_version\":1,\"found\":true,\"term_vectors\":{\"field\":{\"field_statistics\":{\"sum_doc_freq\":120,\"doc_count\":15,\"sum_ttf\":135},\"terms\":{\"brown\":{\"term_freq\":1,\"tokens\":[{\"position\":2,\"start_offset\":10,\"end_offset\":15,\"payload\":\"d29yZA==\"}]},\"dog\":{\"term_freq\":1,\"tokens\":[{\"position\":8,\"start_offset\":40,\"end_offset\":43,\"payload\":\"d29yZA==\"}]},\"fox\":{\"term_freq\":1,\"tokens\":[{\"position\":3,\"start_offset\":16,\"end_offset\":19,\"payload\":\"d29yZA==\"}]},\"jumps\":{\"term_freq\":1,\"tokens\":[{\"position\":4,\"start_offset\":20,\"end_offset\":25,\"payload\":\"d29yZA==\"}]},\"lazy\":{\"term_freq\":1,\"tokens\":[{\"position\":7,\"start_offset\":35,\"end_offset\":39,\"payload\":\"d29yZA==\"}]},\"over\":{\"term_freq\":1,\"tokens\":[{\"position\":5,\"start_offset\":26,\"end_offset\":30,\"payload\":\"d29yZA==\"}]},\"quick\":{\"term_freq\":1,\"tokens\":[{\"position\":1,\"start_offset\":4,\"end_offset\":9,\"payload\":\"d29yZA==\"}]},\"the\":{\"term_freq\":2,\"tokens\":[{\"position\":0,\"start_offset\":0,\"end_offset\":3,\"payload\":\"d29yZA==\"},{\"position\":6,\"start_offset\":31,\"end_offset\":34,\"payload\":\"d29yZA==\"}]}}}}}";
    assertThat(utf8, equalTo(expectedString));
}
From source file:org.elasticsearch.action.termvectors.GetTermVectorsCheckDocFreqTests.java
License:Apache License
private void checkAllInfo(int numDocs, String[] values, int[] freq, int[][] pos, int[][] startOffset,
        int[][] endOffset, int i) throws IOException {
    TermVectorsRequestBuilder resp = client().prepareTermVectors("test", "type1", Integer.toString(i))
            .setPayloads(true).setOffsets(true).setPositions(true).setFieldStatistics(true)
            .setTermStatistics(true).setSelectedFields();
    assertThat(resp.request().fieldStatistics(), equalTo(true));
    TermVectorsResponse response = resp.execute().actionGet();
    assertThat("doc id: " + i + " doesn't exist but should", response.isExists(), equalTo(true));
    Fields fields = response.getFields();
    assertThat(fields.size(), equalTo(1));
    Terms terms = fields.terms("field");
    assertThat(terms.size(), equalTo(8L));
    assertThat(terms.getSumTotalTermFreq(), Matchers.equalTo((long) (9 * numDocs)));
    assertThat(terms.getDocCount(), Matchers.equalTo(numDocs));
    assertThat(terms.getSumDocFreq(), equalTo((long) numDocs * values.length));
    TermsEnum iterator = terms.iterator();
    for (int j = 0; j < values.length; j++) {
        String string = values[j];
        BytesRef next = iterator.next();
        assertThat(next, Matchers.notNullValue());
        assertThat("expected " + string, string, equalTo(next.utf8ToString()));
        assertThat(next, Matchers.notNullValue());
        if (string.equals("the")) {
            assertThat("expected ttf of " + string, numDocs * 2, equalTo((int) iterator.totalTermFreq()));
        } else {
            assertThat("expected ttf of " + string, numDocs, equalTo((int) iterator.totalTermFreq()));
        }
        PostingsEnum docsAndPositions = iterator.postings(null, null, PostingsEnum.ALL);
        assertThat(docsAndPositions.nextDoc(), equalTo(0));
        assertThat(freq[j], equalTo(docsAndPositions.freq()));
        assertThat(iterator.docFreq(), equalTo(numDocs));
        int[] termPos = pos[j];
        int[] termStartOffset = startOffset[j];
        int[] termEndOffset = endOffset[j];
        assertThat(termPos.length, equalTo(freq[j]));
        assertThat(termStartOffset.length, equalTo(freq[j]));
        assertThat(termEndOffset.length, equalTo(freq[j]));
        for (int k = 0; k < freq[j]; k++) {
            int nextPosition = docsAndPositions.nextPosition();
            assertThat("term: " + string, nextPosition, equalTo(termPos[k]));
            assertThat("term: " + string, docsAndPositions.startOffset(), equalTo(termStartOffset[k]));
            assertThat("term: " + string, docsAndPositions.endOffset(), equalTo(termEndOffset[k]));
            assertThat("term: " + string, docsAndPositions.getPayload(), equalTo(new BytesRef("word")));
        }
    }
    assertThat(iterator.next(), Matchers.nullValue());
    XContentBuilder xBuilder = XContentFactory.jsonBuilder();
    xBuilder.startObject();
    response.toXContent(xBuilder, ToXContent.EMPTY_PARAMS);
    xBuilder.endObject();
    BytesStream bytesStream = xBuilder.bytesStream();
    String utf8 = bytesStream.bytes().toUtf8().replaceFirst("\"took\":\\d+,", "");
    String expectedString = "{\"_index\":\"test\",\"_type\":\"type1\",\"_id\":\"" + i
            + "\",\"_version\":1,\"found\":true,\"term_vectors\":{\"field\":{\"field_statistics\":{\"sum_doc_freq\":120,\"doc_count\":15,\"sum_ttf\":135},\"terms\":{\"brown\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":2,\"start_offset\":10,\"end_offset\":15,\"payload\":\"d29yZA==\"}]},\"dog\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":8,\"start_offset\":40,\"end_offset\":43,\"payload\":\"d29yZA==\"}]},\"fox\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":3,\"start_offset\":16,\"end_offset\":19,\"payload\":\"d29yZA==\"}]},\"jumps\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":4,\"start_offset\":20,\"end_offset\":25,\"payload\":\"d29yZA==\"}]},\"lazy\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":7,\"start_offset\":35,\"end_offset\":39,\"payload\":\"d29yZA==\"}]},\"over\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":5,\"start_offset\":26,\"end_offset\":30,\"payload\":\"d29yZA==\"}]},\"quick\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":1,\"start_offset\":4,\"end_offset\":9,\"payload\":\"d29yZA==\"}]},\"the\":{\"doc_freq\":15,\"ttf\":30,\"term_freq\":2,\"tokens\":[{\"position\":0,\"start_offset\":0,\"end_offset\":3,\"payload\":\"d29yZA==\"},{\"position\":6,\"start_offset\":31,\"end_offset\":34,\"payload\":\"d29yZA==\"}]}}}}}";
    assertThat(utf8, equalTo(expectedString));
}
From source file:org.elasticsearch.action.termvectors.GetTermVectorsIT.java
License:Apache License
@Test
public void testRandomSingleTermVectors() throws IOException {
    FieldType ft = new FieldType();
    int config = randomInt(6);
    boolean storePositions = false;
    boolean storeOffsets = false;
    boolean storePayloads = false;
    boolean storeTermVectors = false;
    switch (config) {
    case 0: {
        // do nothing
        break;
    }
    case 1: {
        storeTermVectors = true;
        break;
    }
    case 2: {
        storeTermVectors = true;
        storePositions = true;
        break;
    }
    case 3: {
        storeTermVectors = true;
        storeOffsets = true;
        break;
    }
    case 4: {
        storeTermVectors = true;
        storePositions = true;
        storeOffsets = true;
        break;
    }
    case 5: {
        storeTermVectors = true;
        storePositions = true;
        storePayloads = true;
        break;
    }
    case 6: {
        storeTermVectors = true;
        storePositions = true;
        storeOffsets = true;
        storePayloads = true;
        break;
    }
    }
    ft.setStoreTermVectors(storeTermVectors);
    ft.setStoreTermVectorOffsets(storeOffsets);
    ft.setStoreTermVectorPayloads(storePayloads);
    ft.setStoreTermVectorPositions(storePositions);
    String optionString = FieldMapper.termVectorOptionsToString(ft);
    XContentBuilder mapping = jsonBuilder().startObject().startObject("type1").startObject("properties")
            .startObject("field").field("type", "string").field("term_vector", optionString)
            .field("analyzer", "tv_test").endObject().endObject().endObject().endObject();
    assertAcked(prepareCreate("test").addMapping("type1", mapping)
            .setSettings(settingsBuilder().put("index.analysis.analyzer.tv_test.tokenizer", "whitespace")
                    .putArray("index.analysis.analyzer.tv_test.filter", "type_as_payload", "lowercase")));
    ensureYellow();
    for (int i = 0; i < 10; i++) {
        client().prepareIndex("test", "type1", Integer.toString(i))
                .setSource(jsonBuilder().startObject()
                        .field("field", "the quick brown fox jumps over the lazy dog")
                        // 0the3 4quick9 10brown15 16fox19 20jumps25 26over30
                        // 31the34 35lazy39 40dog43
                        .endObject())
                .execute().actionGet();
        refresh();
    }
    String[] values = { "brown", "dog", "fox", "jumps", "lazy", "over", "quick", "the" };
    int[] freq = { 1, 1, 1, 1, 1, 1, 1, 2 };
    int[][] pos = { { 2 }, { 8 }, { 3 }, { 4 }, { 7 }, { 5 }, { 1 }, { 0, 6 } };
    int[][] startOffset = { { 10 }, { 40 }, { 16 }, { 20 }, { 35 }, { 26 }, { 4 }, { 0, 31 } };
    int[][] endOffset = { { 15 }, { 43 }, { 19 }, { 25 }, { 39 }, { 30 }, { 9 }, { 3, 34 } };
    boolean isPayloadRequested = randomBoolean();
    boolean isOffsetRequested = randomBoolean();
    boolean isPositionsRequested = randomBoolean();
    String infoString = createInfoString(isPositionsRequested, isOffsetRequested, isPayloadRequested,
            optionString);
    for (int i = 0; i < 10; i++) {
        TermVectorsRequestBuilder resp = client().prepareTermVectors("test", "type1", Integer.toString(i))
                .setPayloads(isPayloadRequested).setOffsets(isOffsetRequested)
                .setPositions(isPositionsRequested).setSelectedFields();
        TermVectorsResponse response = resp.execute().actionGet();
        assertThat(infoString + "doc id: " + i + " doesn't exist but should", response.isExists(),
                equalTo(true));
        Fields fields = response.getFields();
        assertThat(fields.size(), equalTo(ft.storeTermVectors() ? 1 : 0));
        if (ft.storeTermVectors()) {
            Terms terms = fields.terms("field");
            assertThat(terms.size(), equalTo(8L));
            TermsEnum iterator = terms.iterator();
            for (int j = 0; j < values.length; j++) {
                String string = values[j];
                BytesRef next = iterator.next();
                assertThat(infoString, next, notNullValue());
                assertThat(infoString + "expected " + string, string, equalTo(next.utf8ToString()));
                assertThat(infoString, next, notNullValue());
                // do not test ttf or doc frequency, because here we have
                // many shards and do not know how documents are distributed
                PostingsEnum docsAndPositions = iterator.postings(null, PostingsEnum.ALL);
                // the postings enum only returns something if positions,
                // payloads or offsets are stored/requested; otherwise use DocsEnum
                assertThat(infoString, docsAndPositions.nextDoc(), equalTo(0));
                assertThat(infoString, freq[j], equalTo(docsAndPositions.freq()));
                int[] termPos = pos[j];
                int[] termStartOffset = startOffset[j];
                int[] termEndOffset = endOffset[j];
                if (isPositionsRequested && storePositions) {
                    assertThat(infoString, termPos.length, equalTo(freq[j]));
                }
                if (isOffsetRequested && storeOffsets) {
                    assertThat(termStartOffset.length, equalTo(freq[j]));
                    assertThat(termEndOffset.length, equalTo(freq[j]));
                }
                for (int k = 0; k < freq[j]; k++) {
                    int nextPosition = docsAndPositions.nextPosition();
                    // only returns something useful if requested and stored
                    if (isPositionsRequested && storePositions) {
                        assertThat(infoString + "positions for term: " + string, nextPosition,
                                equalTo(termPos[k]));
                    } else {
                        assertThat(infoString + "positions for term: ", nextPosition, equalTo(-1));
                    }
                    // only returns something useful if requested and stored
                    if (isPayloadRequested && storePayloads) {
                        assertThat(infoString + "payloads for term: " + string,
                                docsAndPositions.getPayload(), equalTo(new BytesRef("word")));
                    } else {
                        assertThat(infoString + "payloads for term: " + string,
                                docsAndPositions.getPayload(), equalTo(null));
                    }
                    // only returns something useful if requested and stored
                    if (isOffsetRequested && storeOffsets) {
                        assertThat(infoString + "startOffsets term: " + string,
                                docsAndPositions.startOffset(), equalTo(termStartOffset[k]));
                        assertThat(infoString + "endOffsets term: " + string,
                                docsAndPositions.endOffset(), equalTo(termEndOffset[k]));
                    } else {
                        assertThat(infoString + "startOffsets term: " + string,
                                docsAndPositions.startOffset(), equalTo(-1));
                        assertThat(infoString + "endOffsets term: " + string,
                                docsAndPositions.endOffset(), equalTo(-1));
                    }
                }
            }
            assertThat(iterator.next(), nullValue());
        }
    }
}
From source file:org.elasticsearch.action.termvectors.GetTermVectorsIT.java
License:Apache License
@Test
public void testRandomPayloadWithDelimitedPayloadTokenFilter() throws IOException {
    // create the test document
    int encoding = randomIntBetween(0, 2);
    String encodingString = "";
    if (encoding == 0) {
        encodingString = "float";
    }
    if (encoding == 1) {
        encodingString = "int";
    }
    if (encoding == 2) {
        encodingString = "identity";
    }
    String[] tokens = crateRandomTokens();
    Map<String, List<BytesRef>> payloads = createPayloads(tokens, encoding);
    String delimiter = createRandomDelimiter(tokens);
    String queryString = createString(tokens, payloads, encoding, delimiter.charAt(0));
    // create the mapping
    XContentBuilder mapping = jsonBuilder().startObject().startObject("type1").startObject("properties")
            .startObject("field").field("type", "string")
            .field("term_vector", "with_positions_offsets_payloads").field("analyzer", "payload_test")
            .endObject().endObject().endObject().endObject();
    assertAcked(prepareCreate("test").addMapping("type1", mapping).setSettings(settingsBuilder()
            .put(indexSettings()).put("index.analysis.analyzer.payload_test.tokenizer", "whitespace")
            .putArray("index.analysis.analyzer.payload_test.filter", "my_delimited_payload_filter")
            .put("index.analysis.filter.my_delimited_payload_filter.delimiter", delimiter)
            .put("index.analysis.filter.my_delimited_payload_filter.encoding", encodingString)
            .put("index.analysis.filter.my_delimited_payload_filter.type", "delimited_payload_filter")));
    ensureYellow();
    client().prepareIndex("test", "type1", Integer.toString(1))
            .setSource(jsonBuilder().startObject().field("field", queryString).endObject()).execute()
            .actionGet();
    refresh();
    TermVectorsRequestBuilder resp = client().prepareTermVectors("test", "type1", Integer.toString(1))
            .setPayloads(true).setOffsets(true).setPositions(true).setSelectedFields();
    TermVectorsResponse response = resp.execute().actionGet();
    assertThat("doc id 1 doesn't exist but should", response.isExists(), equalTo(true));
    Fields fields = response.getFields();
    assertThat(fields.size(), equalTo(1));
    Terms terms = fields.terms("field");
    TermsEnum iterator = terms.iterator();
    while (iterator.next() != null) {
        String term = iterator.term().utf8ToString();
        PostingsEnum docsAndPositions = iterator.postings(null, PostingsEnum.ALL);
        assertThat(docsAndPositions.nextDoc(), equalTo(0));
        List<BytesRef> curPayloads = payloads.get(term);
        assertThat(term, curPayloads, notNullValue());
        assertNotNull(docsAndPositions);
        for (int k = 0; k < docsAndPositions.freq(); k++) {
            docsAndPositions.nextPosition();
            if (docsAndPositions.getPayload() != null) {
                String infoString = "\nterm: " + term + " has payload \n"
                        + docsAndPositions.getPayload().toString() + "\n but should have payload \n"
                        + curPayloads.get(k).toString();
                assertThat(infoString, docsAndPositions.getPayload(), equalTo(curPayloads.get(k)));
            } else {
                String infoString = "\nterm: " + term + " has no payload but should have payload \n"
                        + curPayloads.get(k).toString();
                assertThat(infoString, curPayloads.get(k).length, equalTo(0));
            }
        }
    }
    assertThat(iterator.next(), nullValue());
}
From source file:org.elasticsearch.action.termvectors.GetTermVectorsIT.java
License:Apache License
private void checkBrownFoxTermVector(Fields fields, String fieldName, boolean withPayloads)
        throws IOException {
    String[] values = { "brown", "dog", "fox", "jumps", "lazy", "over", "quick", "the" };
    int[] freq = { 1, 1, 1, 1, 1, 1, 1, 2 };
    int[][] pos = { { 2 }, { 8 }, { 3 }, { 4 }, { 7 }, { 5 }, { 1 }, { 0, 6 } };
    int[][] startOffset = { { 10 }, { 40 }, { 16 }, { 20 }, { 35 }, { 26 }, { 4 }, { 0, 31 } };
    int[][] endOffset = { { 15 }, { 43 }, { 19 }, { 25 }, { 39 }, { 30 }, { 9 }, { 3, 34 } };
    Terms terms = fields.terms(fieldName);
    assertThat(terms.size(), equalTo(8L));
    TermsEnum iterator = terms.iterator();
    for (int j = 0; j < values.length; j++) {
        String string = values[j];
        BytesRef next = iterator.next();
        assertThat(next, notNullValue());
        assertThat("expected " + string, string, equalTo(next.utf8ToString()));
        assertThat(next, notNullValue());
        // do not test ttf or doc frequency, because here we have many
        // shards and do not know how documents are distributed
        PostingsEnum docsAndPositions = iterator.postings(null, PostingsEnum.ALL);
        assertThat(docsAndPositions.nextDoc(), equalTo(0));
        assertThat(freq[j], equalTo(docsAndPositions.freq()));
        int[] termPos = pos[j];
        int[] termStartOffset = startOffset[j];
        int[] termEndOffset = endOffset[j];
        assertThat(termPos.length, equalTo(freq[j]));
        assertThat(termStartOffset.length, equalTo(freq[j]));
        assertThat(termEndOffset.length, equalTo(freq[j]));
        for (int k = 0; k < freq[j]; k++) {
            int nextPosition = docsAndPositions.nextPosition();
            assertThat("term: " + string, nextPosition, equalTo(termPos[k]));
            assertThat("term: " + string, docsAndPositions.startOffset(), equalTo(termStartOffset[k]));
            assertThat("term: " + string, docsAndPositions.endOffset(), equalTo(termEndOffset[k]));
            if (withPayloads) {
                assertThat("term: " + string, docsAndPositions.getPayload(), equalTo(new BytesRef("word")));
            }
        }
    }
    assertThat(iterator.next(), nullValue());
}
From source file:org.elasticsearch.action.termvectors.GetTermVectorsIT.java
License:Apache License
private void compareTermVectors(String fieldName, Fields fields0, Fields fields1) throws IOException {
    Terms terms0 = fields0.terms(fieldName);
    Terms terms1 = fields1.terms(fieldName);
    assertThat(terms0, notNullValue());
    assertThat(terms1, notNullValue());
    assertThat(terms0.size(), equalTo(terms1.size()));
    TermsEnum iter0 = terms0.iterator();
    TermsEnum iter1 = terms1.iterator();
    for (int i = 0; i < terms0.size(); i++) {
        BytesRef next0 = iter0.next();
        assertThat(next0, notNullValue());
        BytesRef next1 = iter1.next();
        assertThat(next1, notNullValue());
        // compare the term text
        String string0 = next0.utf8ToString();
        String string1 = next1.utf8ToString();
        assertThat("expected: " + string0, string0, equalTo(string1));
        // compare df and ttf
        assertThat("term: " + string0, iter0.docFreq(), equalTo(iter1.docFreq()));
        assertThat("term: " + string0, iter0.totalTermFreq(), equalTo(iter1.totalTermFreq()));
        // compare freq and docs
        PostingsEnum docsAndPositions0 = iter0.postings(null, PostingsEnum.ALL);
        PostingsEnum docsAndPositions1 = iter1.postings(null, PostingsEnum.ALL);
        assertThat("term: " + string0, docsAndPositions0.nextDoc(), equalTo(docsAndPositions1.nextDoc()));
        assertThat("term: " + string0, docsAndPositions0.freq(), equalTo(docsAndPositions1.freq()));
        // compare position, start offsets and end offsets
        for (int j = 0; j < docsAndPositions0.freq(); j++) {
            assertThat("term: " + string0, docsAndPositions0.nextPosition(),
                    equalTo(docsAndPositions1.nextPosition()));
            assertThat("term: " + string0, docsAndPositions0.startOffset(),
                    equalTo(docsAndPositions1.startOffset()));
            assertThat("term: " + string0, docsAndPositions0.endOffset(),
                    equalTo(docsAndPositions1.endOffset()));
        }
    }
    assertThat(iter0.next(), nullValue());
    assertThat(iter1.next(), nullValue());
}
From source file:org.elasticsearch.action.termvectors.GetTermVectorsIT.java
License:Apache License
private void checkAnalyzedFields(Fields fieldsObject, Set<String> fieldNames,
        Map<String, String> perFieldAnalyzer) throws IOException {
    Set<String> validFields = new HashSet<>();
    for (String fieldName : fieldNames) {
        if (fieldName.startsWith("non_existing")) {
            assertThat("Non-existing field \"" + fieldName + "\" should not be returned!",
                    fieldsObject.terms(fieldName), nullValue());
            continue;
        }
        Terms terms = fieldsObject.terms(fieldName);
        assertThat("Existing field " + fieldName + " should have been returned", terms, notNullValue());
        // check overridden by keyword analyzer ...
        if (perFieldAnalyzer.containsKey(fieldName)) {
            TermsEnum iterator = terms.iterator();
            assertThat("Analyzer for " + fieldName + " should have been overridden!",
                    iterator.next().utf8ToString(), equalTo("some text here"));
            assertThat(iterator.next(), nullValue());
        }
        validFields.add(fieldName);
    }
    // ensure no other fields are returned
    assertThat("More fields than expected are returned!", fieldsObject.size(), equalTo(validFields.size()));
}
From source file:org.elasticsearch.action.termvectors.GetTermVectorsIT.java
License:Apache License
private void checkStats(Fields fields, XContentBuilder xContentBuilder, boolean isEqual) throws IOException {
    Map<String, Object> stats = JsonXContent.jsonXContent.createParser(xContentBuilder.bytes()).map();
    assertThat("number of fields expected:", fields.size(), equalTo(stats.size()));
    for (String fieldName : fields) {
        logger.info("Checking field statistics for field: {}", fieldName);
        Terms terms = fields.terms(fieldName);
        Map<String, Integer> fieldStatistics = getFieldStatistics(stats, fieldName);
        String msg = "field: " + fieldName + " ";
        assertThat(msg + "sum_doc_freq:", (int) terms.getSumDocFreq(),
                equalOrLessThanTo(fieldStatistics.get("sum_doc_freq"), isEqual));
        assertThat(msg + "doc_count:", terms.getDocCount(),
                equalOrLessThanTo(fieldStatistics.get("doc_count"), isEqual));
        assertThat(msg + "sum_ttf:", (int) terms.getSumTotalTermFreq(),
                equalOrLessThanTo(fieldStatistics.get("sum_ttf"), isEqual));
        final TermsEnum termsEnum = terms.iterator();
        BytesRef text;
        while ((text = termsEnum.next()) != null) {
            String term = text.utf8ToString();
            logger.info("Checking term statistics for term: ({}, {})", fieldName, term);
            Map<String, Integer> termStatistics = getTermStatistics(stats, fieldName, term);
            msg = "term: (" + fieldName + "," + term + ") ";
            assertThat(msg + "doc_freq:", termsEnum.docFreq(),
                    equalOrLessThanTo(termStatistics.get("doc_freq"), isEqual));
            assertThat(msg + "ttf:", (int) termsEnum.totalTermFreq(),
                    equalOrLessThanTo(termStatistics.get("ttf"), isEqual));
        }
    }
}
From source file:org.elasticsearch.action.termvectors.GetTermVectorsTests.java
License:Apache License
@Test
public void testRandomSingleTermVectors() throws ElasticsearchException, IOException {
    FieldType ft = new FieldType();
    int config = randomInt(6);
    boolean storePositions = false;
    boolean storeOffsets = false;
    boolean storePayloads = false;
    boolean storeTermVectors = false;
    switch (config) {
    case 0: {
        // do nothing
        break;
    }
    case 1: {
        storeTermVectors = true;
        break;
    }
    case 2: {
        storeTermVectors = true;
        storePositions = true;
        break;
    }
    case 3: {
        storeTermVectors = true;
        storeOffsets = true;
        break;
    }
    case 4: {
        storeTermVectors = true;
        storePositions = true;
        storeOffsets = true;
        break;
    }
    case 5: {
        storeTermVectors = true;
        storePositions = true;
        storePayloads = true;
        break;
    }
    case 6: {
        storeTermVectors = true;
        storePositions = true;
        storeOffsets = true;
        storePayloads = true;
        break;
    }
    }
    ft.setStoreTermVectors(storeTermVectors);
    ft.setStoreTermVectorOffsets(storeOffsets);
    ft.setStoreTermVectorPayloads(storePayloads);
    ft.setStoreTermVectorPositions(storePositions);
    String optionString = AbstractFieldMapper.termVectorOptionsToString(ft);
    XContentBuilder mapping = jsonBuilder().startObject().startObject("type1").startObject("properties")
            .startObject("field").field("type", "string").field("term_vector", optionString)
            .field("analyzer", "tv_test").endObject().endObject().endObject().endObject();
    assertAcked(prepareCreate("test").addMapping("type1", mapping)
            .setSettings(settingsBuilder().put("index.analysis.analyzer.tv_test.tokenizer", "whitespace")
                    .putArray("index.analysis.analyzer.tv_test.filter", "type_as_payload", "lowercase")));
    ensureYellow();
    for (int i = 0; i < 10; i++) {
        client().prepareIndex("test", "type1", Integer.toString(i))
                .setSource(jsonBuilder().startObject()
                        .field("field", "the quick brown fox jumps over the lazy dog")
                        // 0the3 4quick9 10brown15 16fox19 20jumps25 26over30
                        // 31the34 35lazy39 40dog43
                        .endObject())
                .execute().actionGet();
        refresh();
    }
    String[] values = { "brown", "dog", "fox", "jumps", "lazy", "over", "quick", "the" };
    int[] freq = { 1, 1, 1, 1, 1, 1, 1, 2 };
    int[][] pos = { { 2 }, { 8 }, { 3 }, { 4 }, { 7 }, { 5 }, { 1 }, { 0, 6 } };
    int[][] startOffset = { { 10 }, { 40 }, { 16 }, { 20 }, { 35 }, { 26 }, { 4 }, { 0, 31 } };
    int[][] endOffset = { { 15 }, { 43 }, { 19 }, { 25 }, { 39 }, { 30 }, { 9 }, { 3, 34 } };
    boolean isPayloadRequested = randomBoolean();
    boolean isOffsetRequested = randomBoolean();
    boolean isPositionsRequested = randomBoolean();
    String infoString = createInfoString(isPositionsRequested, isOffsetRequested, isPayloadRequested,
            optionString);
    for (int i = 0; i < 10; i++) {
        TermVectorsRequestBuilder resp = client().prepareTermVectors("test", "type1", Integer.toString(i))
                .setPayloads(isPayloadRequested).setOffsets(isOffsetRequested)
                .setPositions(isPositionsRequested).setSelectedFields();
        TermVectorsResponse response = resp.execute().actionGet();
        assertThat(infoString + "doc id: " + i + " doesn't exist but should", response.isExists(),
                equalTo(true));
        Fields fields = response.getFields();
        assertThat(fields.size(), equalTo(ft.storeTermVectors() ? 1 : 0));
        if (ft.storeTermVectors()) {
            Terms terms = fields.terms("field");
            assertThat(terms.size(), equalTo(8L));
            TermsEnum iterator = terms.iterator();
            for (int j = 0; j < values.length; j++) {
                String string = values[j];
                BytesRef next = iterator.next();
                assertThat(infoString, next, notNullValue());
                assertThat(infoString + "expected " + string, string, equalTo(next.utf8ToString()));
                assertThat(infoString, next, notNullValue());
                // do not test ttf or doc frequency, because here we have
                // many shards and do not know how documents are distributed
                PostingsEnum docsAndPositions = iterator.postings(null, null, PostingsEnum.ALL);
                // the postings enum only returns something if positions,
                // payloads or offsets are stored/requested; otherwise use DocsEnum
                assertThat(infoString, docsAndPositions.nextDoc(), equalTo(0));
                assertThat(infoString, freq[j], equalTo(docsAndPositions.freq()));
                int[] termPos = pos[j];
                int[] termStartOffset = startOffset[j];
                int[] termEndOffset = endOffset[j];
                if (isPositionsRequested && storePositions) {
                    assertThat(infoString, termPos.length, equalTo(freq[j]));
                }
                if (isOffsetRequested && storeOffsets) {
                    assertThat(termStartOffset.length, equalTo(freq[j]));
                    assertThat(termEndOffset.length, equalTo(freq[j]));
                }
                for (int k = 0; k < freq[j]; k++) {
                    int nextPosition = docsAndPositions.nextPosition();
                    // only returns something useful if requested and stored
                    if (isPositionsRequested && storePositions) {
                        assertThat(infoString + "positions for term: " + string, nextPosition,
                                equalTo(termPos[k]));
                    } else {
                        assertThat(infoString + "positions for term: ", nextPosition, equalTo(-1));
                    }
                    // only returns something useful if requested and stored
                    if (isPayloadRequested && storePayloads) {
                        assertThat(infoString + "payloads for term: " + string,
                                docsAndPositions.getPayload(), equalTo(new BytesRef("word")));
                    } else {
                        assertThat(infoString + "payloads for term: " + string,
                                docsAndPositions.getPayload(), equalTo(null));
                    }
                    // only returns something useful if requested and stored
                    if (isOffsetRequested && storeOffsets) {
                        assertThat(infoString + "startOffsets term: " + string,
                                docsAndPositions.startOffset(), equalTo(termStartOffset[k]));
                        assertThat(infoString + "endOffsets term: " + string,
                                docsAndPositions.endOffset(), equalTo(termEndOffset[k]));
                    } else {
                        assertThat(infoString + "startOffsets term: " + string,
                                docsAndPositions.startOffset(), equalTo(-1));
                        assertThat(infoString + "endOffsets term: " + string,
                                docsAndPositions.endOffset(), equalTo(-1));
                    }
                }
            }
            assertThat(iterator.next(), nullValue());
        }
    }
}