Usage examples for org.apache.lucene.index.Fields#terms(String)
public abstract Terms terms(String field) throws IOException;
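Per Lucene's javadoc, terms(field) returns the Terms for the given field, or null if the field does not exist. Before the project-specific examples below, here is a minimal, self-contained sketch of the typical call pattern against the Lucene 5.x-era API that most of these examples use (Fields obtained from LeafReader.fields()); the index path "index" and field name "body" are illustrative placeholders, not taken from any example on this page.

    import java.io.IOException;
    import java.nio.file.Paths;

    import org.apache.lucene.index.DirectoryReader;
    import org.apache.lucene.index.Fields;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.LeafReaderContext;
    import org.apache.lucene.index.Terms;
    import org.apache.lucene.index.TermsEnum;
    import org.apache.lucene.store.FSDirectory;
    import org.apache.lucene.util.BytesRef;

    public class FieldsTermsExample {
        public static void main(String[] args) throws IOException {
            // "index" and "body" are placeholder values; substitute your own index path and field name.
            try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("index")))) {
                for (LeafReaderContext leaf : reader.leaves()) {
                    Fields fields = leaf.reader().fields(); // per-segment view of all indexed fields
                    Terms terms = fields.terms("body");     // may return null if the field is absent
                    if (terms == null) {
                        continue;
                    }
                    TermsEnum termsEnum = terms.iterator();
                    BytesRef term;
                    while ((term = termsEnum.next()) != null) {
                        System.out.println(term.utf8ToString() + " docFreq=" + termsEnum.docFreq());
                    }
                }
            }
        }
    }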
From source file: org.elasticsearch.search.suggest.completion.CompletionFieldStats.java
License: Apache License

    /**
     * Returns total in-heap bytes used by all suggesters. This method has CPU cost
     * <code>O(numIndexedFields)</code>.
     *
     * @param fieldNamePatterns if non-null, any completion field name matching any of these patterns
     *                          will break out its in-heap bytes separately in the returned {@link CompletionStats}
     */
    public static CompletionStats completionStats(IndexReader indexReader, String... fieldNamePatterns) {
        long sizeInBytes = 0;
        ObjectLongHashMap<String> completionFields = null;
        if (fieldNamePatterns != null && fieldNamePatterns.length > 0) {
            completionFields = new ObjectLongHashMap<>(fieldNamePatterns.length);
        }
        for (LeafReaderContext atomicReaderContext : indexReader.leaves()) {
            LeafReader atomicReader = atomicReaderContext.reader();
            try {
                Fields fields = atomicReader.fields();
                for (String fieldName : fields) {
                    Terms terms = fields.terms(fieldName);
                    if (terms instanceof CompletionTerms) {
                        // TODO: currently we load up the suggester for reporting its size
                        long fstSize = ((CompletionTerms) terms).suggester().ramBytesUsed();
                        if (fieldNamePatterns != null && fieldNamePatterns.length > 0
                                && Regex.simpleMatch(fieldNamePatterns, fieldName)) {
                            completionFields.addTo(fieldName, fstSize);
                        }
                        sizeInBytes += fstSize;
                    }
                }
            } catch (IOException ioe) {
                throw new ElasticsearchException(ioe);
            }
        }
        return new CompletionStats(sizeInBytes, completionFields);
    }
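Usage note: the instanceof check above also covers the case where terms(fieldName) returns null for an absent field, since null is never an instance of CompletionTerms. The FST sizes are accumulated per leaf, so each field's total is the sum of its per-segment suggester sizes.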
From source file: org.elasticsearch.search.suggest.completion.old.AnalyzingCompletionLookupProvider.java
License: Apache License

    @Override
    public FieldsConsumer consumer(final IndexOutput output) throws IOException {
        CodecUtil.writeHeader(output, CODEC_NAME, CODEC_VERSION_LATEST);
        return new FieldsConsumer() {
            private Map<String, Long> fieldOffsets = new HashMap<>();

            @Override
            public void close() throws IOException {
                try {
                    /*
                     * write the offsets per field such that we know where
                     * we need to load the FSTs from
                     */
                    long pointer = output.getFilePointer();
                    output.writeVInt(fieldOffsets.size());
                    for (Map.Entry<String, Long> entry : fieldOffsets.entrySet()) {
                        output.writeString(entry.getKey());
                        output.writeVLong(entry.getValue());
                    }
                    output.writeLong(pointer);
                    CodecUtil.writeFooter(output);
                } finally {
                    IOUtils.close(output);
                }
            }

            @Override
            public void write(Fields fields) throws IOException {
                for (String field : fields) {
                    Terms terms = fields.terms(field);
                    if (terms == null) {
                        continue;
                    }
                    TermsEnum termsEnum = terms.iterator();
                    PostingsEnum docsEnum = null;
                    final SuggestPayload spare = new SuggestPayload();
                    int maxAnalyzedPathsForOneInput = 0;
                    final XAnalyzingSuggester.XBuilder builder = new XAnalyzingSuggester.XBuilder(
                            maxSurfaceFormsPerAnalyzedForm, hasPayloads, XAnalyzingSuggester.PAYLOAD_SEP);
                    int docCount = 0;
                    while (true) {
                        BytesRef term = termsEnum.next();
                        if (term == null) {
                            break;
                        }
                        docsEnum = termsEnum.postings(null, docsEnum, PostingsEnum.PAYLOADS);
                        builder.startTerm(term);
                        int docFreq = 0;
                        while (docsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
                            for (int i = 0; i < docsEnum.freq(); i++) {
                                final int position = docsEnum.nextPosition();
                                AnalyzingCompletionLookupProvider.this.parsePayload(docsEnum.getPayload(), spare);
                                builder.addSurface(spare.surfaceForm.get(), spare.payload.get(), spare.weight);
                                // multi fields have the same surface form so we sum up here
                                maxAnalyzedPathsForOneInput = Math.max(maxAnalyzedPathsForOneInput, position + 1);
                            }
                            docFreq++;
                            docCount = Math.max(docCount, docsEnum.docID() + 1);
                        }
                        builder.finishTerm(docFreq);
                    }
                    /*
                     * Here we are done processing the field and we can
                     * build the FST and write it to disk.
                     */
                    FST<Pair<Long, BytesRef>> build = builder.build();
                    assert build != null || docCount == 0 : "the FST is null but docCount is != 0 actual value: ["
                            + docCount + "]";
                    /*
                     * It's possible that the FST is null if we have 2 segments that get merged
                     * and all docs that have a value in this field are deleted. This will cause
                     * a consumer to be created but it doesn't consume any values, causing the FST builder
                     * to return null.
                     */
                    if (build != null) {
                        fieldOffsets.put(field, output.getFilePointer());
                        build.save(output);
                        /* write some more meta-info */
                        output.writeVInt(maxAnalyzedPathsForOneInput);
                        output.writeVInt(maxSurfaceFormsPerAnalyzedForm);
                        output.writeInt(maxGraphExpansions); // can be negative
                        int options = 0;
                        options |= preserveSep ? SERIALIZE_PRESERVE_SEPARATORS : 0;
                        options |= hasPayloads ? SERIALIZE_HAS_PAYLOADS : 0;
                        options |= preservePositionIncrements ? SERIALIZE_PRESERVE_POSITION_INCREMENTS : 0;
                        output.writeVInt(options);
                        output.writeVInt(XAnalyzingSuggester.SEP_LABEL);
                        output.writeVInt(XAnalyzingSuggester.END_BYTE);
                        output.writeVInt(XAnalyzingSuggester.PAYLOAD_SEP);
                        output.writeVInt(XAnalyzingSuggester.HOLE_CHARACTER);
                    }
                }
            }
        };
    }
From source file: org.elasticsearch.search.suggest.completion.old.AnalyzingCompletionLookupProviderV1.java
License: Apache License

    @Override
    public FieldsConsumer consumer(final IndexOutput output) throws IOException {
        // TODO write index header?
        CodecUtil.writeHeader(output, CODEC_NAME, CODEC_VERSION);
        return new FieldsConsumer() {
            private Map<String, Long> fieldOffsets = new HashMap<>();

            @Override
            public void close() throws IOException {
                try {
                    /*
                     * write the offsets per field such that we know where
                     * we need to load the FSTs from
                     */
                    long pointer = output.getFilePointer();
                    output.writeVInt(fieldOffsets.size());
                    for (Map.Entry<String, Long> entry : fieldOffsets.entrySet()) {
                        output.writeString(entry.getKey());
                        output.writeVLong(entry.getValue());
                    }
                    output.writeLong(pointer);
                } finally {
                    IOUtils.close(output);
                }
            }

            @Override
            public void write(Fields fields) throws IOException {
                for (String field : fields) {
                    Terms terms = fields.terms(field);
                    if (terms == null) {
                        continue;
                    }
                    TermsEnum termsEnum = terms.iterator();
                    PostingsEnum docsEnum = null;
                    final SuggestPayload spare = new SuggestPayload();
                    int maxAnalyzedPathsForOneInput = 0;
                    final XAnalyzingSuggester.XBuilder builder = new XAnalyzingSuggester.XBuilder(
                            maxSurfaceFormsPerAnalyzedForm, hasPayloads, XAnalyzingSuggester.PAYLOAD_SEP);
                    int docCount = 0;
                    while (true) {
                        BytesRef term = termsEnum.next();
                        if (term == null) {
                            break;
                        }
                        docsEnum = termsEnum.postings(null, docsEnum, PostingsEnum.PAYLOADS);
                        builder.startTerm(term);
                        int docFreq = 0;
                        while (docsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
                            for (int i = 0; i < docsEnum.freq(); i++) {
                                final int position = docsEnum.nextPosition();
                                AnalyzingCompletionLookupProviderV1.this.parsePayload(docsEnum.getPayload(), spare);
                                builder.addSurface(spare.surfaceForm.get(), spare.payload.get(), spare.weight);
                                // multi fields have the same surface form so we sum up here
                                maxAnalyzedPathsForOneInput = Math.max(maxAnalyzedPathsForOneInput, position + 1);
                            }
                            docFreq++;
                            docCount = Math.max(docCount, docsEnum.docID() + 1);
                        }
                        builder.finishTerm(docFreq);
                    }
                    /*
                     * Here we are done processing the field and we can
                     * build the FST and write it to disk.
                     */
                    FST<Pair<Long, BytesRef>> build = builder.build();
                    assert build != null || docCount == 0 : "the FST is null but docCount is != 0 actual value: ["
                            + docCount + "]";
                    /*
                     * It's possible that the FST is null if we have 2 segments that get merged
                     * and all docs that have a value in this field are deleted. This will cause
                     * a consumer to be created but it doesn't consume any values, causing the FST builder
                     * to return null.
                     */
                    if (build != null) {
                        fieldOffsets.put(field, output.getFilePointer());
                        build.save(output);
                        /* write some more meta-info */
                        output.writeVInt(maxAnalyzedPathsForOneInput);
                        output.writeVInt(maxSurfaceFormsPerAnalyzedForm);
                        output.writeInt(maxGraphExpansions); // can be negative
                        int options = 0;
                        options |= preserveSep ? SERIALIZE_PRESERVE_SEPARATORS : 0;
                        options |= hasPayloads ? SERIALIZE_HAS_PAYLOADS : 0;
                        options |= preservePositionIncrements ? SERIALIZE_PRESERVE_POSITION_INCREMENTS : 0;
                        output.writeVInt(options);
                    }
                }
            }
        };
    }
From source file: org.elasticsearch.search.suggest.completion2x.AnalyzingCompletionLookupProvider.java
License: Apache License

    @Override
    public FieldsConsumer consumer(final IndexOutput output) throws IOException {
        CodecUtil.writeHeader(output, CODEC_NAME, CODEC_VERSION_LATEST);
        return new FieldsConsumer() {
            private Map<String, Long> fieldOffsets = new HashMap<>();

            @Override
            public void close() throws IOException {
                try {
                    /*
                     * write the offsets per field such that we know where
                     * we need to load the FSTs from
                     */
                    long pointer = output.getFilePointer();
                    output.writeVInt(fieldOffsets.size());
                    for (Map.Entry<String, Long> entry : fieldOffsets.entrySet()) {
                        output.writeString(entry.getKey());
                        output.writeVLong(entry.getValue());
                    }
                    output.writeLong(pointer);
                    CodecUtil.writeFooter(output);
                } finally {
                    IOUtils.close(output);
                }
            }

            @Override
            public void write(Fields fields) throws IOException {
                for (String field : fields) {
                    Terms terms = fields.terms(field);
                    if (terms == null) {
                        continue;
                    }
                    TermsEnum termsEnum = terms.iterator();
                    PostingsEnum docsEnum = null;
                    final SuggestPayload spare = new SuggestPayload();
                    int maxAnalyzedPathsForOneInput = 0;
                    final XAnalyzingSuggester.XBuilder builder = new XAnalyzingSuggester.XBuilder(
                            maxSurfaceFormsPerAnalyzedForm, hasPayloads, XAnalyzingSuggester.PAYLOAD_SEP);
                    int docCount = 0;
                    while (true) {
                        BytesRef term = termsEnum.next();
                        if (term == null) {
                            break;
                        }
                        docsEnum = termsEnum.postings(docsEnum, PostingsEnum.PAYLOADS);
                        builder.startTerm(term);
                        int docFreq = 0;
                        while (docsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
                            for (int i = 0; i < docsEnum.freq(); i++) {
                                final int position = docsEnum.nextPosition();
                                AnalyzingCompletionLookupProvider.this.parsePayload(docsEnum.getPayload(), spare);
                                builder.addSurface(spare.surfaceForm.get(), spare.payload.get(), spare.weight);
                                // multi fields have the same surface form so we sum up here
                                maxAnalyzedPathsForOneInput = Math.max(maxAnalyzedPathsForOneInput, position + 1);
                            }
                            docFreq++;
                            docCount = Math.max(docCount, docsEnum.docID() + 1);
                        }
                        builder.finishTerm(docFreq);
                    }
                    /*
                     * Here we are done processing the field and we can
                     * build the FST and write it to disk.
                     */
                    FST<Pair<Long, BytesRef>> build = builder.build();
                    assert build != null || docCount == 0 : "the FST is null but docCount is != 0 actual value: ["
                            + docCount + "]";
                    /*
                     * It's possible that the FST is null if we have 2 segments that get merged
                     * and all docs that have a value in this field are deleted. This will cause
                     * a consumer to be created but it doesn't consume any values, causing the FST builder
                     * to return null.
                     */
                    if (build != null) {
                        fieldOffsets.put(field, output.getFilePointer());
                        build.save(output);
                        /* write some more meta-info */
                        output.writeVInt(maxAnalyzedPathsForOneInput);
                        output.writeVInt(maxSurfaceFormsPerAnalyzedForm);
                        output.writeInt(maxGraphExpansions); // can be negative
                        int options = 0;
                        options |= preserveSep ? SERIALIZE_PRESERVE_SEPARATORS : 0;
                        options |= hasPayloads ? SERIALIZE_HAS_PAYLOADS : 0;
                        options |= preservePositionIncrements ? SERIALIZE_PRESERVE_POSITION_INCREMENTS : 0;
                        output.writeVInt(options);
                        output.writeVInt(XAnalyzingSuggester.SEP_LABEL);
                        output.writeVInt(XAnalyzingSuggester.END_BYTE);
                        output.writeVInt(XAnalyzingSuggester.PAYLOAD_SEP);
                        output.writeVInt(XAnalyzingSuggester.HOLE_CHARACTER);
                    }
                }
            }
        };
    }
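Usage note: this completion2x variant differs from the two older providers above mainly in how it reads postings. It uses the newer two-argument TermsEnum.postings(reuse, flags) overload, while the older code calls the three-argument form whose first argument (null here) is a live-docs filter.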
From source file: org.elasticsearch.termvectors.GetTermVectorCheckDocFreqTests.java
License: Apache License

    private void checkWithoutFieldStatistics(int numDocs, String[] values, int[] freq, int[][] pos,
            int[][] startOffset, int[][] endOffset, int i) throws IOException {
        TermVectorRequestBuilder resp = client().prepareTermVector("test", "type1", Integer.toString(i))
                .setPayloads(true).setOffsets(true).setPositions(true).setTermStatistics(true)
                .setFieldStatistics(false).setSelectedFields();
        TermVectorResponse response = resp.execute().actionGet();
        assertThat("doc id: " + i + " doesn't exists but should", response.isExists(), equalTo(true));
        Fields fields = response.getFields();
        assertThat(fields.size(), equalTo(1));
        Terms terms = fields.terms("field");
        assertThat(terms.size(), equalTo(8L));
        assertThat(terms.getSumTotalTermFreq(), Matchers.equalTo((long) -1));
        assertThat(terms.getDocCount(), Matchers.equalTo(-1));
        assertThat(terms.getSumDocFreq(), equalTo((long) -1));
        TermsEnum iterator = terms.iterator(null);
        for (int j = 0; j < values.length; j++) {
            String string = values[j];
            BytesRef next = iterator.next();
            assertThat(next, Matchers.notNullValue());
            assertThat("expected " + string, string, equalTo(next.utf8ToString()));
            assertThat(next, Matchers.notNullValue());
            if (string.equals("the")) {
                assertThat("expected ttf of " + string, numDocs * 2, equalTo((int) iterator.totalTermFreq()));
            } else {
                assertThat("expected ttf of " + string, numDocs, equalTo((int) iterator.totalTermFreq()));
            }
            DocsAndPositionsEnum docsAndPositions = iterator.docsAndPositions(null, null);
            assertThat(docsAndPositions.nextDoc(), equalTo(0));
            assertThat(freq[j], equalTo(docsAndPositions.freq()));
            assertThat(iterator.docFreq(), equalTo(numDocs));
            int[] termPos = pos[j];
            int[] termStartOffset = startOffset[j];
            int[] termEndOffset = endOffset[j];
            assertThat(termPos.length, equalTo(freq[j]));
            assertThat(termStartOffset.length, equalTo(freq[j]));
            assertThat(termEndOffset.length, equalTo(freq[j]));
            for (int k = 0; k < freq[j]; k++) {
                int nextPosition = docsAndPositions.nextPosition();
                assertThat("term: " + string, nextPosition, equalTo(termPos[k]));
                assertThat("term: " + string, docsAndPositions.startOffset(), equalTo(termStartOffset[k]));
                assertThat("term: " + string, docsAndPositions.endOffset(), equalTo(termEndOffset[k]));
                assertThat("term: " + string, docsAndPositions.getPayload(), equalTo(new BytesRef("word")));
            }
        }
        assertThat(iterator.next(), Matchers.nullValue());
        XContentBuilder xBuilder = new XContentFactory().jsonBuilder();
        response.toXContent(xBuilder, null);
        BytesStream bytesStream = xBuilder.bytesStream();
        String utf8 = bytesStream.bytes().toUtf8();
        String expectedString = "{\"_index\":\"test\",\"_type\":\"type1\",\"_id\":\"" + i
                + "\",\"_version\":1,\"exists\":true,\"term_vectors\":{\"field\":{\"terms\":{\"brown\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":2,\"start_offset\":10,\"end_offset\":15,\"payload\":\"d29yZA==\"}]},\"dog\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":8,\"start_offset\":40,\"end_offset\":43,\"payload\":\"d29yZA==\"}]},\"fox\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":3,\"start_offset\":16,\"end_offset\":19,\"payload\":\"d29yZA==\"}]},\"jumps\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":4,\"start_offset\":20,\"end_offset\":25,\"payload\":\"d29yZA==\"}]},\"lazy\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":7,\"start_offset\":35,\"end_offset\":39,\"payload\":\"d29yZA==\"}]},\"over\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":5,\"start_offset\":26,\"end_offset\":30,\"payload\":\"d29yZA==\"}]},\"quick\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":1,\"start_offset\":4,\"end_offset\":9,\"payload\":\"d29yZA==\"}]},\"the\":{\"doc_freq\":15,\"ttf\":30,\"term_freq\":2,\"tokens\":[{\"position\":0,\"start_offset\":0,\"end_offset\":3,\"payload\":\"d29yZA==\"},{\"position\":6,\"start_offset\":31,\"end_offset\":34,\"payload\":\"d29yZA==\"}]}}}}}";
        assertThat(utf8, equalTo(expectedString));
    }
From source file: org.elasticsearch.termvectors.GetTermVectorCheckDocFreqTests.java
License: Apache License

    private void checkWithoutTermStatistics(int numDocs, String[] values, int[] freq, int[][] pos,
            int[][] startOffset, int[][] endOffset, int i) throws IOException {
        TermVectorRequestBuilder resp = client().prepareTermVector("test", "type1", Integer.toString(i))
                .setPayloads(true).setOffsets(true).setPositions(true).setTermStatistics(false)
                .setFieldStatistics(true).setSelectedFields();
        assertThat(resp.request().termStatistics(), equalTo(false));
        TermVectorResponse response = resp.execute().actionGet();
        assertThat("doc id: " + i + " doesn't exists but should", response.isExists(), equalTo(true));
        Fields fields = response.getFields();
        assertThat(fields.size(), equalTo(1));
        Terms terms = fields.terms("field");
        assertThat(terms.size(), equalTo(8L));
        assertThat(terms.getSumTotalTermFreq(), Matchers.equalTo((long) (9 * numDocs)));
        assertThat(terms.getDocCount(), Matchers.equalTo(numDocs));
        assertThat(terms.getSumDocFreq(), equalTo((long) numDocs * values.length));
        TermsEnum iterator = terms.iterator(null);
        for (int j = 0; j < values.length; j++) {
            String string = values[j];
            BytesRef next = iterator.next();
            assertThat(next, Matchers.notNullValue());
            assertThat("expected " + string, string, equalTo(next.utf8ToString()));
            assertThat(next, Matchers.notNullValue());
            assertThat("expected ttf of " + string, -1, equalTo((int) iterator.totalTermFreq()));
            DocsAndPositionsEnum docsAndPositions = iterator.docsAndPositions(null, null);
            assertThat(docsAndPositions.nextDoc(), equalTo(0));
            assertThat(freq[j], equalTo(docsAndPositions.freq()));
            assertThat(iterator.docFreq(), equalTo(-1));
            int[] termPos = pos[j];
            int[] termStartOffset = startOffset[j];
            int[] termEndOffset = endOffset[j];
            assertThat(termPos.length, equalTo(freq[j]));
            assertThat(termStartOffset.length, equalTo(freq[j]));
            assertThat(termEndOffset.length, equalTo(freq[j]));
            for (int k = 0; k < freq[j]; k++) {
                int nextPosition = docsAndPositions.nextPosition();
                assertThat("term: " + string, nextPosition, equalTo(termPos[k]));
                assertThat("term: " + string, docsAndPositions.startOffset(), equalTo(termStartOffset[k]));
                assertThat("term: " + string, docsAndPositions.endOffset(), equalTo(termEndOffset[k]));
                assertThat("term: " + string, docsAndPositions.getPayload(), equalTo(new BytesRef("word")));
            }
        }
        assertThat(iterator.next(), Matchers.nullValue());
        XContentBuilder xBuilder = new XContentFactory().jsonBuilder();
        response.toXContent(xBuilder, null);
        BytesStream bytesStream = xBuilder.bytesStream();
        String utf8 = bytesStream.bytes().toUtf8();
        String expectedString = "{\"_index\":\"test\",\"_type\":\"type1\",\"_id\":\"" + i
                + "\",\"_version\":1,\"exists\":true,\"term_vectors\":{\"field\":{\"field_statistics\":{\"sum_doc_freq\":120,\"doc_count\":15,\"sum_ttf\":135},\"terms\":{\"brown\":{\"term_freq\":1,\"tokens\":[{\"position\":2,\"start_offset\":10,\"end_offset\":15,\"payload\":\"d29yZA==\"}]},\"dog\":{\"term_freq\":1,\"tokens\":[{\"position\":8,\"start_offset\":40,\"end_offset\":43,\"payload\":\"d29yZA==\"}]},\"fox\":{\"term_freq\":1,\"tokens\":[{\"position\":3,\"start_offset\":16,\"end_offset\":19,\"payload\":\"d29yZA==\"}]},\"jumps\":{\"term_freq\":1,\"tokens\":[{\"position\":4,\"start_offset\":20,\"end_offset\":25,\"payload\":\"d29yZA==\"}]},\"lazy\":{\"term_freq\":1,\"tokens\":[{\"position\":7,\"start_offset\":35,\"end_offset\":39,\"payload\":\"d29yZA==\"}]},\"over\":{\"term_freq\":1,\"tokens\":[{\"position\":5,\"start_offset\":26,\"end_offset\":30,\"payload\":\"d29yZA==\"}]},\"quick\":{\"term_freq\":1,\"tokens\":[{\"position\":1,\"start_offset\":4,\"end_offset\":9,\"payload\":\"d29yZA==\"}]},\"the\":{\"term_freq\":2,\"tokens\":[{\"position\":0,\"start_offset\":0,\"end_offset\":3,\"payload\":\"d29yZA==\"},{\"position\":6,\"start_offset\":31,\"end_offset\":34,\"payload\":\"d29yZA==\"}]}}}}}";
        assertThat(utf8, equalTo(expectedString));
    }
From source file: org.elasticsearch.termvectors.GetTermVectorCheckDocFreqTests.java
License: Apache License

    private void checkAllInfo(int numDocs, String[] values, int[] freq, int[][] pos, int[][] startOffset,
            int[][] endOffset, int i) throws IOException {
        TermVectorRequestBuilder resp = client().prepareTermVector("test", "type1", Integer.toString(i))
                .setPayloads(true).setOffsets(true).setPositions(true).setFieldStatistics(true)
                .setTermStatistics(true).setSelectedFields();
        assertThat(resp.request().fieldStatistics(), equalTo(true));
        TermVectorResponse response = resp.execute().actionGet();
        assertThat("doc id: " + i + " doesn't exists but should", response.isExists(), equalTo(true));
        Fields fields = response.getFields();
        assertThat(fields.size(), equalTo(1));
        Terms terms = fields.terms("field");
        assertThat(terms.size(), equalTo(8L));
        assertThat(terms.getSumTotalTermFreq(), Matchers.equalTo((long) (9 * numDocs)));
        assertThat(terms.getDocCount(), Matchers.equalTo(numDocs));
        assertThat(terms.getSumDocFreq(), equalTo((long) numDocs * values.length));
        TermsEnum iterator = terms.iterator(null);
        for (int j = 0; j < values.length; j++) {
            String string = values[j];
            BytesRef next = iterator.next();
            assertThat(next, Matchers.notNullValue());
            assertThat("expected " + string, string, equalTo(next.utf8ToString()));
            assertThat(next, Matchers.notNullValue());
            if (string.equals("the")) {
                assertThat("expected ttf of " + string, numDocs * 2, equalTo((int) iterator.totalTermFreq()));
            } else {
                assertThat("expected ttf of " + string, numDocs, equalTo((int) iterator.totalTermFreq()));
            }
            DocsAndPositionsEnum docsAndPositions = iterator.docsAndPositions(null, null);
            assertThat(docsAndPositions.nextDoc(), equalTo(0));
            assertThat(freq[j], equalTo(docsAndPositions.freq()));
            assertThat(iterator.docFreq(), equalTo(numDocs));
            int[] termPos = pos[j];
            int[] termStartOffset = startOffset[j];
            int[] termEndOffset = endOffset[j];
            assertThat(termPos.length, equalTo(freq[j]));
            assertThat(termStartOffset.length, equalTo(freq[j]));
            assertThat(termEndOffset.length, equalTo(freq[j]));
            for (int k = 0; k < freq[j]; k++) {
                int nextPosition = docsAndPositions.nextPosition();
                assertThat("term: " + string, nextPosition, equalTo(termPos[k]));
                assertThat("term: " + string, docsAndPositions.startOffset(), equalTo(termStartOffset[k]));
                assertThat("term: " + string, docsAndPositions.endOffset(), equalTo(termEndOffset[k]));
                assertThat("term: " + string, docsAndPositions.getPayload(), equalTo(new BytesRef("word")));
            }
        }
        assertThat(iterator.next(), Matchers.nullValue());
        XContentBuilder xBuilder = new XContentFactory().jsonBuilder();
        response.toXContent(xBuilder, null);
        BytesStream bytesStream = xBuilder.bytesStream();
        String utf8 = bytesStream.bytes().toUtf8();
        String expectedString = "{\"_index\":\"test\",\"_type\":\"type1\",\"_id\":\"" + i
                + "\",\"_version\":1,\"exists\":true,\"term_vectors\":{\"field\":{\"field_statistics\":{\"sum_doc_freq\":120,\"doc_count\":15,\"sum_ttf\":135},\"terms\":{\"brown\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":2,\"start_offset\":10,\"end_offset\":15,\"payload\":\"d29yZA==\"}]},\"dog\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":8,\"start_offset\":40,\"end_offset\":43,\"payload\":\"d29yZA==\"}]},\"fox\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":3,\"start_offset\":16,\"end_offset\":19,\"payload\":\"d29yZA==\"}]},\"jumps\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":4,\"start_offset\":20,\"end_offset\":25,\"payload\":\"d29yZA==\"}]},\"lazy\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":7,\"start_offset\":35,\"end_offset\":39,\"payload\":\"d29yZA==\"}]},\"over\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":5,\"start_offset\":26,\"end_offset\":30,\"payload\":\"d29yZA==\"}]},\"quick\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":1,\"start_offset\":4,\"end_offset\":9,\"payload\":\"d29yZA==\"}]},\"the\":{\"doc_freq\":15,\"ttf\":30,\"term_freq\":2,\"tokens\":[{\"position\":0,\"start_offset\":0,\"end_offset\":3,\"payload\":\"d29yZA==\"},{\"position\":6,\"start_offset\":31,\"end_offset\":34,\"payload\":\"d29yZA==\"}]}}}}}";
        assertThat(utf8, equalTo(expectedString));
    }
From source file: org.elasticsearch.termvectors.GetTermVectorTests.java
License: Apache License

    @Test
    public void testSimpleTermVectors() throws ElasticSearchException, IOException {
        XContentBuilder mapping = XContentFactory.jsonBuilder().startObject().startObject("type1")
                .startObject("properties").startObject("field").field("type", "string")
                .field("term_vector", "with_positions_offsets_payloads").field("analyzer", "tv_test").endObject()
                .endObject().endObject().endObject();
        ElasticSearchAssertions.assertAcked(prepareCreate("test").addMapping("type1", mapping)
                .setSettings(ImmutableSettings.settingsBuilder()
                        .put("index.analysis.analyzer.tv_test.tokenizer", "whitespace")
                        .putArray("index.analysis.analyzer.tv_test.filter", "type_as_payload", "lowercase")));
        ensureYellow();
        for (int i = 0; i < 10; i++) {
            client().prepareIndex("test", "type1", Integer.toString(i))
                    .setSource(XContentFactory.jsonBuilder().startObject()
                            .field("field", "the quick brown fox jumps over the lazy dog")
                            // 0the3 4quick9 10brown15 16fox19 20jumps25 26over30
                            // 31the34 35lazy39 40dog43
                            .endObject())
                    .execute().actionGet();
            refresh();
        }
        String[] values = { "brown", "dog", "fox", "jumps", "lazy", "over", "quick", "the" };
        int[] freq = { 1, 1, 1, 1, 1, 1, 1, 2 };
        int[][] pos = { { 2 }, { 8 }, { 3 }, { 4 }, { 7 }, { 5 }, { 1 }, { 0, 6 } };
        int[][] startOffset = { { 10 }, { 40 }, { 16 }, { 20 }, { 35 }, { 26 }, { 4 }, { 0, 31 } };
        int[][] endOffset = { { 15 }, { 43 }, { 19 }, { 25 }, { 39 }, { 30 }, { 9 }, { 3, 34 } };
        for (int i = 0; i < 10; i++) {
            TermVectorRequestBuilder resp = client().prepareTermVector("test", "type1", Integer.toString(i))
                    .setPayloads(true).setOffsets(true).setPositions(true).setSelectedFields();
            TermVectorResponse response = resp.execute().actionGet();
            assertThat("doc id: " + i + " doesn't exists but should", response.isExists(), equalTo(true));
            Fields fields = response.getFields();
            assertThat(fields.size(), equalTo(1));
            Terms terms = fields.terms("field");
            assertThat(terms.size(), equalTo(8L));
            TermsEnum iterator = terms.iterator(null);
            for (int j = 0; j < values.length; j++) {
                String string = values[j];
                BytesRef next = iterator.next();
                assertThat(next, Matchers.notNullValue());
                assertThat("expected " + string, string, equalTo(next.utf8ToString()));
                assertThat(next, Matchers.notNullValue());
                // do not test ttf or doc frequency, because here we have many
                // shards and do not know how documents are distributed
                DocsAndPositionsEnum docsAndPositions = iterator.docsAndPositions(null, null);
                assertThat(docsAndPositions.nextDoc(), equalTo(0));
                assertThat(freq[j], equalTo(docsAndPositions.freq()));
                int[] termPos = pos[j];
                int[] termStartOffset = startOffset[j];
                int[] termEndOffset = endOffset[j];
                assertThat(termPos.length, equalTo(freq[j]));
                assertThat(termStartOffset.length, equalTo(freq[j]));
                assertThat(termEndOffset.length, equalTo(freq[j]));
                for (int k = 0; k < freq[j]; k++) {
                    int nextPosition = docsAndPositions.nextPosition();
                    assertThat("term: " + string, nextPosition, equalTo(termPos[k]));
                    assertThat("term: " + string, docsAndPositions.startOffset(), equalTo(termStartOffset[k]));
                    assertThat("term: " + string, docsAndPositions.endOffset(), equalTo(termEndOffset[k]));
                    assertThat("term: " + string, docsAndPositions.getPayload(), equalTo(new BytesRef("word")));
                }
            }
            assertThat(iterator.next(), Matchers.nullValue());
        }
    }
From source file: org.elasticsearch.termvectors.GetTermVectorTests.java
License: Apache License

    @Test
    public void testRandomSingleTermVectors() throws ElasticSearchException, IOException {
        FieldType ft = new FieldType();
        int config = randomInt(6);
        boolean storePositions = false;
        boolean storeOffsets = false;
        boolean storePayloads = false;
        boolean storeTermVectors = false;
        switch (config) {
        case 0: {
            // do nothing
            break;
        }
        case 1: {
            storeTermVectors = true;
            break;
        }
        case 2: {
            storeTermVectors = true;
            storePositions = true;
            break;
        }
        case 3: {
            storeTermVectors = true;
            storeOffsets = true;
            break;
        }
        case 4: {
            storeTermVectors = true;
            storePositions = true;
            storeOffsets = true;
            break;
        }
        case 5: {
            storeTermVectors = true;
            storePositions = true;
            storePayloads = true;
            break;
        }
        case 6: {
            storeTermVectors = true;
            storePositions = true;
            storeOffsets = true;
            storePayloads = true;
            break;
        }
        }
        ft.setStoreTermVectors(storeTermVectors);
        ft.setStoreTermVectorOffsets(storeOffsets);
        ft.setStoreTermVectorPayloads(storePayloads);
        ft.setStoreTermVectorPositions(storePositions);
        String optionString = AbstractFieldMapper.termVectorOptionsToString(ft);
        XContentBuilder mapping = XContentFactory.jsonBuilder().startObject().startObject("type1")
                .startObject("properties").startObject("field").field("type", "string")
                .field("term_vector", optionString).field("analyzer", "tv_test").endObject().endObject()
                .endObject().endObject();
        ElasticSearchAssertions.assertAcked(prepareCreate("test").addMapping("type1", mapping)
                .setSettings(ImmutableSettings.settingsBuilder()
                        .put("index.analysis.analyzer.tv_test.tokenizer", "whitespace")
                        .putArray("index.analysis.analyzer.tv_test.filter", "type_as_payload", "lowercase")));
        ensureYellow();
        for (int i = 0; i < 10; i++) {
            client().prepareIndex("test", "type1", Integer.toString(i))
                    .setSource(XContentFactory.jsonBuilder().startObject()
                            .field("field", "the quick brown fox jumps over the lazy dog")
                            // 0the3 4quick9 10brown15 16fox19 20jumps25 26over30
                            // 31the34 35lazy39 40dog43
                            .endObject())
                    .execute().actionGet();
            refresh();
        }
        String[] values = { "brown", "dog", "fox", "jumps", "lazy", "over", "quick", "the" };
        int[] freq = { 1, 1, 1, 1, 1, 1, 1, 2 };
        int[][] pos = { { 2 }, { 8 }, { 3 }, { 4 }, { 7 }, { 5 }, { 1 }, { 0, 6 } };
        int[][] startOffset = { { 10 }, { 40 }, { 16 }, { 20 }, { 35 }, { 26 }, { 4 }, { 0, 31 } };
        int[][] endOffset = { { 15 }, { 43 }, { 19 }, { 25 }, { 39 }, { 30 }, { 9 }, { 3, 34 } };
        boolean isPayloadRequested = randomBoolean();
        boolean isOffsetRequested = randomBoolean();
        boolean isPositionsRequested = randomBoolean();
        String infoString = createInfoString(isPositionsRequested, isOffsetRequested, isPayloadRequested,
                optionString);
        for (int i = 0; i < 10; i++) {
            TermVectorRequestBuilder resp = client().prepareTermVector("test", "type1", Integer.toString(i))
                    .setPayloads(isPayloadRequested).setOffsets(isOffsetRequested)
                    .setPositions(isPositionsRequested).setSelectedFields();
            TermVectorResponse response = resp.execute().actionGet();
            assertThat(infoString + "doc id: " + i + " doesn't exists but should", response.isExists(),
                    equalTo(true));
            Fields fields = response.getFields();
            assertThat(fields.size(), equalTo(ft.storeTermVectors() ? 1 : 0));
            if (ft.storeTermVectors()) {
                Terms terms = fields.terms("field");
                assertThat(terms.size(), equalTo(8L));
                TermsEnum iterator = terms.iterator(null);
                for (int j = 0; j < values.length; j++) {
                    String string = values[j];
                    BytesRef next = iterator.next();
                    assertThat(infoString, next, Matchers.notNullValue());
                    assertThat(infoString + "expected " + string, string, equalTo(next.utf8ToString()));
                    assertThat(infoString, next, Matchers.notNullValue());
                    // do not test ttf or doc frequency, because here we have
                    // many shards and do not know how documents are distributed
                    DocsAndPositionsEnum docsAndPositions = iterator.docsAndPositions(null, null);
                    // docsAndPositions only returns something if positions, payloads
                    // or offsets are stored / requested; otherwise use DocsEnum?
                    assertThat(infoString, docsAndPositions.nextDoc(), equalTo(0));
                    assertThat(infoString, freq[j], equalTo(docsAndPositions.freq()));
                    int[] termPos = pos[j];
                    int[] termStartOffset = startOffset[j];
                    int[] termEndOffset = endOffset[j];
                    if (isPositionsRequested && storePositions) {
                        assertThat(infoString, termPos.length, equalTo(freq[j]));
                    }
                    if (isOffsetRequested && storeOffsets) {
                        assertThat(termStartOffset.length, equalTo(freq[j]));
                        assertThat(termEndOffset.length, equalTo(freq[j]));
                    }
                    for (int k = 0; k < freq[j]; k++) {
                        int nextPosition = docsAndPositions.nextPosition();
                        // only return something useful if requested and stored
                        if (isPositionsRequested && storePositions) {
                            assertThat(infoString + "positions for term: " + string, nextPosition,
                                    equalTo(termPos[k]));
                        } else {
                            assertThat(infoString + "positions for term: ", nextPosition, equalTo(-1));
                        }
                        // only return something useful if requested and stored
                        if (isPayloadRequested && storePayloads) {
                            assertThat(infoString + "payloads for term: " + string,
                                    docsAndPositions.getPayload(), equalTo(new BytesRef("word")));
                        } else {
                            assertThat(infoString + "payloads for term: " + string,
                                    docsAndPositions.getPayload(), equalTo(null));
                        }
                        // only return something useful if requested and stored
                        if (isOffsetRequested && storeOffsets) {
                            assertThat(infoString + "startOffsets term: " + string,
                                    docsAndPositions.startOffset(), equalTo(termStartOffset[k]));
                            assertThat(infoString + "endOffsets term: " + string,
                                    docsAndPositions.endOffset(), equalTo(termEndOffset[k]));
                        } else {
                            assertThat(infoString + "startOffsets term: " + string,
                                    docsAndPositions.startOffset(), equalTo(-1));
                            assertThat(infoString + "endOffsets term: " + string,
                                    docsAndPositions.endOffset(), equalTo(-1));
                        }
                    }
                }
                assertThat(iterator.next(), Matchers.nullValue());
            }
        }
    }
From source file: org.elasticsearch.test.integration.termvectors.GetTermVectorTests.java
License: Apache License

    @Test
    public void testSimpleTermVectors() throws ElasticSearchException, IOException {
        run(addMapping(prepareCreate("test"), "type1",
                new Object[] { "field", "type", "string", "term_vector", "with_positions_offsets_payloads",
                        "analyzer", "tv_test" })
                .setSettings(ImmutableSettings.settingsBuilder()
                        .put("index.analysis.analyzer.tv_test.tokenizer", "whitespace")
                        .putArray("index.analysis.analyzer.tv_test.filter", "type_as_payload", "lowercase")));
        ensureYellow();
        for (int i = 0; i < 10; i++) {
            client().prepareIndex("test", "type1", Integer.toString(i))
                    .setSource(XContentFactory.jsonBuilder().startObject()
                            .field("field", "the quick brown fox jumps over the lazy dog")
                            // 0the3 4quick9 10brown15 16fox19 20jumps25 26over30
                            // 31the34 35lazy39 40dog43
                            .endObject())
                    .execute().actionGet();
            refresh();
        }
        String[] values = { "brown", "dog", "fox", "jumps", "lazy", "over", "quick", "the" };
        int[] freq = { 1, 1, 1, 1, 1, 1, 1, 2 };
        int[][] pos = { { 2 }, { 8 }, { 3 }, { 4 }, { 7 }, { 5 }, { 1 }, { 0, 6 } };
        int[][] startOffset = { { 10 }, { 40 }, { 16 }, { 20 }, { 35 }, { 26 }, { 4 }, { 0, 31 } };
        int[][] endOffset = { { 15 }, { 43 }, { 19 }, { 25 }, { 39 }, { 30 }, { 9 }, { 3, 34 } };
        for (int i = 0; i < 10; i++) {
            TermVectorRequestBuilder resp = client().prepareTermVector("test", "type1", Integer.toString(i))
                    .setPayloads(true).setOffsets(true).setPositions(true).setSelectedFields();
            TermVectorResponse response = resp.execute().actionGet();
            assertThat("doc id: " + i + " doesn't exists but should", response.documentExists(), equalTo(true));
            Fields fields = response.getFields();
            assertThat(fields.size(), equalTo(1));
            Terms terms = fields.terms("field");
            assertThat(terms.size(), equalTo(8L));
            TermsEnum iterator = terms.iterator(null);
            for (int j = 0; j < values.length; j++) {
                String string = values[j];
                BytesRef next = iterator.next();
                assertThat(next, Matchers.notNullValue());
                assertThat("expected " + string, string, equalTo(next.utf8ToString()));
                assertThat(next, Matchers.notNullValue());
                // do not test ttf or doc frequency, because here we have many
                // shards and do not know how documents are distributed
                DocsAndPositionsEnum docsAndPositions = iterator.docsAndPositions(null, null);
                assertThat(docsAndPositions.nextDoc(), equalTo(0));
                assertThat(freq[j], equalTo(docsAndPositions.freq()));
                int[] termPos = pos[j];
                int[] termStartOffset = startOffset[j];
                int[] termEndOffset = endOffset[j];
                assertThat(termPos.length, equalTo(freq[j]));
                assertThat(termStartOffset.length, equalTo(freq[j]));
                assertThat(termEndOffset.length, equalTo(freq[j]));
                for (int k = 0; k < freq[j]; k++) {
                    int nextPosition = docsAndPositions.nextPosition();
                    assertThat("term: " + string, nextPosition, equalTo(termPos[k]));
                    assertThat("term: " + string, docsAndPositions.startOffset(), equalTo(termStartOffset[k]));
                    assertThat("term: " + string, docsAndPositions.endOffset(), equalTo(termEndOffset[k]));
                    assertThat("term: " + string, docsAndPositions.getPayload(), equalTo(new BytesRef("word")));
                }
            }
            assertThat(iterator.next(), Matchers.nullValue());
        }
    }