Example usage for org.apache.lucene.index Fields terms

Introduction

This page collects usage examples for the org.apache.lucene.index Fields.terms method.

Prototype

public abstract Terms terms(String field) throws IOException;

Document

Get the Terms for this field; returns null if the field does not exist.
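
A minimal sketch of the typical call pattern, assuming the Lucene 5.x-era API used by the examples below; the TermsExample class and printTerms method are illustrative, not part of Lucene:

import java.io.IOException;

import org.apache.lucene.index.Fields;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;

public class TermsExample {
    /**
     * Prints every term of the given field in one segment, together with
     * its document frequency.
     */
    static void printTerms(LeafReader reader, String field) throws IOException {
        // LeafReader.fields() exposes the per-segment Fields (Lucene 5.x-era
        // API, as used in the examples below; removed in later versions).
        Fields fields = reader.fields();
        Terms terms = fields.terms(field);
        if (terms == null) {
            return; // the field does not exist in this segment
        }
        TermsEnum termsEnum = terms.iterator();
        BytesRef term;
        while ((term = termsEnum.next()) != null) {
            System.out.println(term.utf8ToString() + " docFreq=" + termsEnum.docFreq());
        }
    }
}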

Usage

From source file:org.elasticsearch.search.suggest.completion.CompletionFieldStats.java

License:Apache License

/**
 * Returns total in-heap bytes used by all suggesters.  This method has CPU cost <code>O(numIndexedFields)</code>.
 *
 * @param fieldNamePatterns if non-null, any completion field name matching any of these patterns will break out its in-heap bytes
 * separately in the returned {@link CompletionStats}
 */
public static CompletionStats completionStats(IndexReader indexReader, String... fieldNamePatterns) {
    long sizeInBytes = 0;
    ObjectLongHashMap<String> completionFields = null;
    if (fieldNamePatterns != null && fieldNamePatterns.length > 0) {
        completionFields = new ObjectLongHashMap<>(fieldNamePatterns.length);
    }
    for (LeafReaderContext atomicReaderContext : indexReader.leaves()) {
        LeafReader atomicReader = atomicReaderContext.reader();
        try {
            Fields fields = atomicReader.fields();
            for (String fieldName : fields) {
                Terms terms = fields.terms(fieldName);
                if (terms instanceof CompletionTerms) {
                    // TODO: currently we load up the suggester for reporting its size
                    long fstSize = ((CompletionTerms) terms).suggester().ramBytesUsed();
                    if (fieldNamePatterns != null && fieldNamePatterns.length > 0
                            && Regex.simpleMatch(fieldNamePatterns, fieldName)) {
                        completionFields.addTo(fieldName, fstSize);
                    }
                    sizeInBytes += fstSize;
                }
            }
        } catch (IOException ioe) {
            throw new ElasticsearchException(ioe);
        }
    }
    return new CompletionStats(sizeInBytes, completionFields);
}
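
A hypothetical invocation of the method above (the pattern string is illustrative): aggregate suggester memory across the whole index, breaking out any completion field whose name matches "suggest*".

CompletionStats stats = CompletionFieldStats.completionStats(indexReader, "suggest*");
long totalBytes = stats.getSizeInBytes(); // assuming CompletionStats exposes its total via this getter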

From source file:org.elasticsearch.search.suggest.completion.old.AnalyzingCompletionLookupProvider.java

License:Apache License

@Override
public FieldsConsumer consumer(final IndexOutput output) throws IOException {
    CodecUtil.writeHeader(output, CODEC_NAME, CODEC_VERSION_LATEST);
    return new FieldsConsumer() {
        private Map<String, Long> fieldOffsets = new HashMap<>();

        @Override
        public void close() throws IOException {
            try {
                /*
                 * write the offsets per field such that we know where
                 * we need to load the FSTs from
                 */
                long pointer = output.getFilePointer();
                output.writeVInt(fieldOffsets.size());
                for (Map.Entry<String, Long> entry : fieldOffsets.entrySet()) {
                    output.writeString(entry.getKey());
                    output.writeVLong(entry.getValue());
                }
                output.writeLong(pointer);
                CodecUtil.writeFooter(output);
            } finally {
                IOUtils.close(output);
            }
        }

        @Override
        public void write(Fields fields) throws IOException {
            for (String field : fields) {
                Terms terms = fields.terms(field);
                if (terms == null) {
                    continue;
                }
                TermsEnum termsEnum = terms.iterator();
                PostingsEnum docsEnum = null;
                final SuggestPayload spare = new SuggestPayload();
                int maxAnalyzedPathsForOneInput = 0;
                final XAnalyzingSuggester.XBuilder builder = new XAnalyzingSuggester.XBuilder(
                        maxSurfaceFormsPerAnalyzedForm, hasPayloads, XAnalyzingSuggester.PAYLOAD_SEP);
                int docCount = 0;
                while (true) {
                    BytesRef term = termsEnum.next();
                    if (term == null) {
                        break;
                    }
                    docsEnum = termsEnum.postings(null, docsEnum, PostingsEnum.PAYLOADS);
                    builder.startTerm(term);
                    int docFreq = 0;
                    while (docsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
                        for (int i = 0; i < docsEnum.freq(); i++) {
                            final int position = docsEnum.nextPosition();
                            AnalyzingCompletionLookupProvider.this.parsePayload(docsEnum.getPayload(), spare);
                            builder.addSurface(spare.surfaceForm.get(), spare.payload.get(), spare.weight);
                            // multi fields have the same surface form so we sum up here
                            maxAnalyzedPathsForOneInput = Math.max(maxAnalyzedPathsForOneInput, position + 1);
                        }
                        docFreq++;
                        docCount = Math.max(docCount, docsEnum.docID() + 1);
                    }
                    builder.finishTerm(docFreq);
                }
                /*
                 * Here we are done processing the field and we can
                 * build the FST and write it to disk.
                 */
                FST<Pair<Long, BytesRef>> build = builder.build();
                assert build != null || docCount == 0 : "the FST is null but docCount is != 0 actual value: ["
                        + docCount + "]";
                /*
                 * It's possible for the FST to be null if two segments are
                 * merged and all docs that had a value in this field were
                 * deleted: a consumer is still created, but it consumes no
                 * values, so the FST builder returns null.
                 */
                if (build != null) {
                    fieldOffsets.put(field, output.getFilePointer());
                    build.save(output);
                    /* write some more meta-info */
                    output.writeVInt(maxAnalyzedPathsForOneInput);
                    output.writeVInt(maxSurfaceFormsPerAnalyzedForm);
                    output.writeInt(maxGraphExpansions); // can be negative
                    int options = 0;
                    options |= preserveSep ? SERIALIZE_PRESERVE_SEPARATORS : 0;
                    options |= hasPayloads ? SERIALIZE_HAS_PAYLOADS : 0;
                    options |= preservePositionIncrements ? SERIALIZE_PRESERVE_POSITION_INCREMENTS : 0;
                    output.writeVInt(options);
                    output.writeVInt(XAnalyzingSuggester.SEP_LABEL);
                    output.writeVInt(XAnalyzingSuggester.END_BYTE);
                    output.writeVInt(XAnalyzingSuggester.PAYLOAD_SEP);
                    output.writeVInt(XAnalyzingSuggester.HOLE_CHARACTER);
                }
            }
        }
    };
}

From source file:org.elasticsearch.search.suggest.completion.old.AnalyzingCompletionLookupProviderV1.java

License:Apache License

@Override
public FieldsConsumer consumer(final IndexOutput output) throws IOException {
    // TODO write index header?
    CodecUtil.writeHeader(output, CODEC_NAME, CODEC_VERSION);
    return new FieldsConsumer() {
        private Map<String, Long> fieldOffsets = new HashMap<>();

        @Override
        public void close() throws IOException {
            try {
                /*
                 * write the offsets per field such that we know where
                 * we need to load the FSTs from
                 */
                long pointer = output.getFilePointer();
                output.writeVInt(fieldOffsets.size());
                for (Map.Entry<String, Long> entry : fieldOffsets.entrySet()) {
                    output.writeString(entry.getKey());
                    output.writeVLong(entry.getValue());
                }
                output.writeLong(pointer);
            } finally {
                IOUtils.close(output);
            }
        }

        @Override
        public void write(Fields fields) throws IOException {
            for (String field : fields) {
                Terms terms = fields.terms(field);
                if (terms == null) {
                    continue;
                }
                TermsEnum termsEnum = terms.iterator();
                PostingsEnum docsEnum = null;
                final SuggestPayload spare = new SuggestPayload();
                int maxAnalyzedPathsForOneInput = 0;
                final XAnalyzingSuggester.XBuilder builder = new XAnalyzingSuggester.XBuilder(
                        maxSurfaceFormsPerAnalyzedForm, hasPayloads, XAnalyzingSuggester.PAYLOAD_SEP);
                int docCount = 0;
                while (true) {
                    BytesRef term = termsEnum.next();
                    if (term == null) {
                        break;
                    }
                    docsEnum = termsEnum.postings(null, docsEnum, PostingsEnum.PAYLOADS);
                    builder.startTerm(term);
                    int docFreq = 0;
                    while (docsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
                        for (int i = 0; i < docsEnum.freq(); i++) {
                            final int position = docsEnum.nextPosition();
                            AnalyzingCompletionLookupProviderV1.this.parsePayload(docsEnum.getPayload(), spare);
                            builder.addSurface(spare.surfaceForm.get(), spare.payload.get(), spare.weight);
                            // multi fields have the same surface form so we sum up here
                            maxAnalyzedPathsForOneInput = Math.max(maxAnalyzedPathsForOneInput, position + 1);
                        }
                        docFreq++;
                        docCount = Math.max(docCount, docsEnum.docID() + 1);
                    }
                    builder.finishTerm(docFreq);
                }
                /*
                 * Here we are done processing the field and we can
                 * build the FST and write it to disk.
                 */
                FST<Pair<Long, BytesRef>> build = builder.build();
                assert build != null || docCount == 0 : "the FST is null but docCount is != 0 actual value: ["
                        + docCount + "]";
                /*
                 * It's possible for the FST to be null if two segments are
                 * merged and all docs that had a value in this field were
                 * deleted: a consumer is still created, but it consumes no
                 * values, so the FST builder returns null.
                 */
                if (build != null) {
                    fieldOffsets.put(field, output.getFilePointer());
                    build.save(output);
                    /* write some more meta-info */
                    output.writeVInt(maxAnalyzedPathsForOneInput);
                    output.writeVInt(maxSurfaceFormsPerAnalyzedForm);
                    output.writeInt(maxGraphExpansions); // can be negative
                    int options = 0;
                    options |= preserveSep ? SERIALIZE_PRESERVE_SEPARATORS : 0;
                    options |= hasPayloads ? SERIALIZE_HAS_PAYLOADS : 0;
                    options |= preservePositionIncrements ? SERIALIZE_PRESERVE_POSITION_INCREMENTS : 0;
                    output.writeVInt(options);
                }
            }
        }
    };
}

From source file:org.elasticsearch.search.suggest.completion2x.AnalyzingCompletionLookupProvider.java

License:Apache License

@Override
public FieldsConsumer consumer(final IndexOutput output) throws IOException {
    CodecUtil.writeHeader(output, CODEC_NAME, CODEC_VERSION_LATEST);
    return new FieldsConsumer() {
        private Map<String, Long> fieldOffsets = new HashMap<>();

        @Override
        public void close() throws IOException {
            try {
                /*
                 * write the offsets per field such that we know where
                 * we need to load the FSTs from
                 */
                long pointer = output.getFilePointer();
                output.writeVInt(fieldOffsets.size());
                for (Map.Entry<String, Long> entry : fieldOffsets.entrySet()) {
                    output.writeString(entry.getKey());
                    output.writeVLong(entry.getValue());
                }
                output.writeLong(pointer);
                CodecUtil.writeFooter(output);
            } finally {
                IOUtils.close(output);
            }
        }

        @Override
        public void write(Fields fields) throws IOException {
            for (String field : fields) {
                Terms terms = fields.terms(field);
                if (terms == null) {
                    continue;
                }
                TermsEnum termsEnum = terms.iterator();
                PostingsEnum docsEnum = null;
                final SuggestPayload spare = new SuggestPayload();
                int maxAnalyzedPathsForOneInput = 0;
                final XAnalyzingSuggester.XBuilder builder = new XAnalyzingSuggester.XBuilder(
                        maxSurfaceFormsPerAnalyzedForm, hasPayloads, XAnalyzingSuggester.PAYLOAD_SEP);
                int docCount = 0;
                while (true) {
                    BytesRef term = termsEnum.next();
                    if (term == null) {
                        break;
                    }
                    docsEnum = termsEnum.postings(docsEnum, PostingsEnum.PAYLOADS);
                    builder.startTerm(term);
                    int docFreq = 0;
                    while (docsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
                        for (int i = 0; i < docsEnum.freq(); i++) {
                            final int position = docsEnum.nextPosition();
                            AnalyzingCompletionLookupProvider.this.parsePayload(docsEnum.getPayload(), spare);
                            builder.addSurface(spare.surfaceForm.get(), spare.payload.get(), spare.weight);
                            // multi fields have the same surface form so we sum up here
                            maxAnalyzedPathsForOneInput = Math.max(maxAnalyzedPathsForOneInput, position + 1);
                        }
                        docFreq++;
                        docCount = Math.max(docCount, docsEnum.docID() + 1);
                    }
                    builder.finishTerm(docFreq);
                }
                /*
                 * Here we are done processing the field and we can
                 * build the FST and write it to disk.
                 */
                FST<Pair<Long, BytesRef>> build = builder.build();
                assert build != null || docCount == 0 : "the FST is null but docCount is != 0 actual value: ["
                        + docCount + "]";
                /*
                 * It's possible for the FST to be null if two segments are
                 * merged and all docs that had a value in this field were
                 * deleted: a consumer is still created, but it consumes no
                 * values, so the FST builder returns null.
                 */
                if (build != null) {
                    fieldOffsets.put(field, output.getFilePointer());
                    build.save(output);
                    /* write some more meta-info */
                    output.writeVInt(maxAnalyzedPathsForOneInput);
                    output.writeVInt(maxSurfaceFormsPerAnalyzedForm);
                    output.writeInt(maxGraphExpansions); // can be negative
                    int options = 0;
                    options |= preserveSep ? SERIALIZE_PRESERVE_SEPARATORS : 0;
                    options |= hasPayloads ? SERIALIZE_HAS_PAYLOADS : 0;
                    options |= preservePositionIncrements ? SERIALIZE_PRESERVE_POSITION_INCREMENTS : 0;
                    output.writeVInt(options);
                    output.writeVInt(XAnalyzingSuggester.SEP_LABEL);
                    output.writeVInt(XAnalyzingSuggester.END_BYTE);
                    output.writeVInt(XAnalyzingSuggester.PAYLOAD_SEP);
                    output.writeVInt(XAnalyzingSuggester.HOLE_CHARACTER);
                }
            }
        }
    };
}

From source file:org.elasticsearch.termvectors.GetTermVectorCheckDocFreqTests.java

License:Apache License

private void checkWithoutFieldStatistics(int numDocs, String[] values, int[] freq, int[][] pos,
        int[][] startOffset, int[][] endOffset, int i) throws IOException {
    TermVectorRequestBuilder resp = client().prepareTermVector("test", "type1", Integer.toString(i))
            .setPayloads(true).setOffsets(true).setPositions(true).setTermStatistics(true)
            .setFieldStatistics(false).setSelectedFields();
    TermVectorResponse response = resp.execute().actionGet();
    assertThat("doc id: " + i + " doesn't exists but should", response.isExists(), equalTo(true));
    Fields fields = response.getFields();
    assertThat(fields.size(), equalTo(1));
    Terms terms = fields.terms("field");
    assertThat(terms.size(), equalTo(8L));
    assertThat(terms.getSumTotalTermFreq(), Matchers.equalTo((long) -1));
    assertThat(terms.getDocCount(), Matchers.equalTo(-1));
    assertThat(terms.getSumDocFreq(), equalTo((long) -1));
    TermsEnum iterator = terms.iterator(null);
    for (int j = 0; j < values.length; j++) {
        String string = values[j];
        BytesRef next = iterator.next();
        assertThat(next, Matchers.notNullValue());
        assertThat("expected " + string, string, equalTo(next.utf8ToString()));
        assertThat(next, Matchers.notNullValue());
        if (string.equals("the")) {
            assertThat("expected ttf of " + string, numDocs * 2, equalTo((int) iterator.totalTermFreq()));
        } else {
            assertThat("expected ttf of " + string, numDocs, equalTo((int) iterator.totalTermFreq()));
        }

        DocsAndPositionsEnum docsAndPositions = iterator.docsAndPositions(null, null);
        assertThat(docsAndPositions.nextDoc(), equalTo(0));
        assertThat(freq[j], equalTo(docsAndPositions.freq()));
        assertThat(iterator.docFreq(), equalTo(numDocs));
        int[] termPos = pos[j];
        int[] termStartOffset = startOffset[j];
        int[] termEndOffset = endOffset[j];
        assertThat(termPos.length, equalTo(freq[j]));
        assertThat(termStartOffset.length, equalTo(freq[j]));
        assertThat(termEndOffset.length, equalTo(freq[j]));
        for (int k = 0; k < freq[j]; k++) {
            int nextPosition = docsAndPositions.nextPosition();
            assertThat("term: " + string, nextPosition, equalTo(termPos[k]));
            assertThat("term: " + string, docsAndPositions.startOffset(), equalTo(termStartOffset[k]));
            assertThat("term: " + string, docsAndPositions.endOffset(), equalTo(termEndOffset[k]));
            assertThat("term: " + string, docsAndPositions.getPayload(), equalTo(new BytesRef("word")));
        }
    }
    assertThat(iterator.next(), Matchers.nullValue());

    XContentBuilder xBuilder = XContentFactory.jsonBuilder();

    response.toXContent(xBuilder, null);
    BytesStream bytesStream = xBuilder.bytesStream();
    String utf8 = bytesStream.bytes().toUtf8();
    String expectedString = "{\"_index\":\"test\",\"_type\":\"type1\",\"_id\":\"" + i
            + "\",\"_version\":1,\"exists\":true,\"term_vectors\":{\"field\":{\"terms\":{\"brown\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":2,\"start_offset\":10,\"end_offset\":15,\"payload\":\"d29yZA==\"}]},\"dog\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":8,\"start_offset\":40,\"end_offset\":43,\"payload\":\"d29yZA==\"}]},\"fox\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":3,\"start_offset\":16,\"end_offset\":19,\"payload\":\"d29yZA==\"}]},\"jumps\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":4,\"start_offset\":20,\"end_offset\":25,\"payload\":\"d29yZA==\"}]},\"lazy\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":7,\"start_offset\":35,\"end_offset\":39,\"payload\":\"d29yZA==\"}]},\"over\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":5,\"start_offset\":26,\"end_offset\":30,\"payload\":\"d29yZA==\"}]},\"quick\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":1,\"start_offset\":4,\"end_offset\":9,\"payload\":\"d29yZA==\"}]},\"the\":{\"doc_freq\":15,\"ttf\":30,\"term_freq\":2,\"tokens\":[{\"position\":0,\"start_offset\":0,\"end_offset\":3,\"payload\":\"d29yZA==\"},{\"position\":6,\"start_offset\":31,\"end_offset\":34,\"payload\":\"d29yZA==\"}]}}}}}";
    assertThat(utf8, equalTo(expectedString));

}

From source file:org.elasticsearch.termvectors.GetTermVectorCheckDocFreqTests.java

License:Apache License

private void checkWithoutTermStatistics(int numDocs, String[] values, int[] freq, int[][] pos,
        int[][] startOffset, int[][] endOffset, int i) throws IOException {
    TermVectorRequestBuilder resp = client().prepareTermVector("test", "type1", Integer.toString(i))
            .setPayloads(true).setOffsets(true).setPositions(true).setTermStatistics(false)
            .setFieldStatistics(true).setSelectedFields();
    assertThat(resp.request().termStatistics(), equalTo(false));
    TermVectorResponse response = resp.execute().actionGet();
    assertThat("doc id: " + i + " doesn't exists but should", response.isExists(), equalTo(true));
    Fields fields = response.getFields();
    assertThat(fields.size(), equalTo(1));
    Terms terms = fields.terms("field");
    assertThat(terms.size(), equalTo(8L));
    assertThat(terms.getSumTotalTermFreq(), Matchers.equalTo((long) (9 * numDocs)));
    assertThat(terms.getDocCount(), Matchers.equalTo(numDocs));
    assertThat(terms.getSumDocFreq(), equalTo((long) numDocs * values.length));
    TermsEnum iterator = terms.iterator(null);
    for (int j = 0; j < values.length; j++) {
        String string = values[j];
        BytesRef next = iterator.next();
        assertThat(next, Matchers.notNullValue());
        assertThat("expected " + string, string, equalTo(next.utf8ToString()));
        assertThat(next, Matchers.notNullValue());

        assertThat("expected ttf of " + string, -1, equalTo((int) iterator.totalTermFreq()));

        DocsAndPositionsEnum docsAndPositions = iterator.docsAndPositions(null, null);
        assertThat(docsAndPositions.nextDoc(), equalTo(0));
        assertThat(freq[j], equalTo(docsAndPositions.freq()));
        assertThat(iterator.docFreq(), equalTo(-1));
        int[] termPos = pos[j];
        int[] termStartOffset = startOffset[j];
        int[] termEndOffset = endOffset[j];
        assertThat(termPos.length, equalTo(freq[j]));
        assertThat(termStartOffset.length, equalTo(freq[j]));
        assertThat(termEndOffset.length, equalTo(freq[j]));
        for (int k = 0; k < freq[j]; k++) {
            int nextPosition = docsAndPositions.nextPosition();
            assertThat("term: " + string, nextPosition, equalTo(termPos[k]));
            assertThat("term: " + string, docsAndPositions.startOffset(), equalTo(termStartOffset[k]));
            assertThat("term: " + string, docsAndPositions.endOffset(), equalTo(termEndOffset[k]));
            assertThat("term: " + string, docsAndPositions.getPayload(), equalTo(new BytesRef("word")));
        }
    }
    assertThat(iterator.next(), Matchers.nullValue());

    XContentBuilder xBuilder = XContentFactory.jsonBuilder();

    response.toXContent(xBuilder, null);
    BytesStream bytesStream = xBuilder.bytesStream();
    String utf8 = bytesStream.bytes().toUtf8();
    String expectedString = "{\"_index\":\"test\",\"_type\":\"type1\",\"_id\":\"" + i
            + "\",\"_version\":1,\"exists\":true,\"term_vectors\":{\"field\":{\"field_statistics\":{\"sum_doc_freq\":120,\"doc_count\":15,\"sum_ttf\":135},\"terms\":{\"brown\":{\"term_freq\":1,\"tokens\":[{\"position\":2,\"start_offset\":10,\"end_offset\":15,\"payload\":\"d29yZA==\"}]},\"dog\":{\"term_freq\":1,\"tokens\":[{\"position\":8,\"start_offset\":40,\"end_offset\":43,\"payload\":\"d29yZA==\"}]},\"fox\":{\"term_freq\":1,\"tokens\":[{\"position\":3,\"start_offset\":16,\"end_offset\":19,\"payload\":\"d29yZA==\"}]},\"jumps\":{\"term_freq\":1,\"tokens\":[{\"position\":4,\"start_offset\":20,\"end_offset\":25,\"payload\":\"d29yZA==\"}]},\"lazy\":{\"term_freq\":1,\"tokens\":[{\"position\":7,\"start_offset\":35,\"end_offset\":39,\"payload\":\"d29yZA==\"}]},\"over\":{\"term_freq\":1,\"tokens\":[{\"position\":5,\"start_offset\":26,\"end_offset\":30,\"payload\":\"d29yZA==\"}]},\"quick\":{\"term_freq\":1,\"tokens\":[{\"position\":1,\"start_offset\":4,\"end_offset\":9,\"payload\":\"d29yZA==\"}]},\"the\":{\"term_freq\":2,\"tokens\":[{\"position\":0,\"start_offset\":0,\"end_offset\":3,\"payload\":\"d29yZA==\"},{\"position\":6,\"start_offset\":31,\"end_offset\":34,\"payload\":\"d29yZA==\"}]}}}}}";
    assertThat(utf8, equalTo(expectedString));

}

From source file:org.elasticsearch.termvectors.GetTermVectorCheckDocFreqTests.java

License:Apache License

private void checkAllInfo(int numDocs, String[] values, int[] freq, int[][] pos, int[][] startOffset,
        int[][] endOffset, int i) throws IOException {
    TermVectorRequestBuilder resp = client().prepareTermVector("test", "type1", Integer.toString(i))
            .setPayloads(true).setOffsets(true).setPositions(true).setFieldStatistics(true)
            .setTermStatistics(true).setSelectedFields();
    assertThat(resp.request().fieldStatistics(), equalTo(true));
    TermVectorResponse response = resp.execute().actionGet();
    assertThat("doc id: " + i + " doesn't exists but should", response.isExists(), equalTo(true));
    Fields fields = response.getFields();
    assertThat(fields.size(), equalTo(1));
    Terms terms = fields.terms("field");
    assertThat(terms.size(), equalTo(8L));
    assertThat(terms.getSumTotalTermFreq(), Matchers.equalTo((long) (9 * numDocs)));
    assertThat(terms.getDocCount(), Matchers.equalTo(numDocs));
    assertThat(terms.getSumDocFreq(), equalTo((long) numDocs * values.length));
    TermsEnum iterator = terms.iterator(null);
    for (int j = 0; j < values.length; j++) {
        String string = values[j];
        BytesRef next = iterator.next();
        assertThat(next, Matchers.notNullValue());
        assertThat("expected " + string, string, equalTo(next.utf8ToString()));
        assertThat(next, Matchers.notNullValue());
        if (string.equals("the")) {
            assertThat("expected ttf of " + string, numDocs * 2, equalTo((int) iterator.totalTermFreq()));
        } else {
            assertThat("expected ttf of " + string, numDocs, equalTo((int) iterator.totalTermFreq()));
        }

        DocsAndPositionsEnum docsAndPositions = iterator.docsAndPositions(null, null);
        assertThat(docsAndPositions.nextDoc(), equalTo(0));
        assertThat(freq[j], equalTo(docsAndPositions.freq()));
        assertThat(iterator.docFreq(), equalTo(numDocs));
        int[] termPos = pos[j];
        int[] termStartOffset = startOffset[j];
        int[] termEndOffset = endOffset[j];
        assertThat(termPos.length, equalTo(freq[j]));
        assertThat(termStartOffset.length, equalTo(freq[j]));
        assertThat(termEndOffset.length, equalTo(freq[j]));
        for (int k = 0; k < freq[j]; k++) {
            int nextPosition = docsAndPositions.nextPosition();
            assertThat("term: " + string, nextPosition, equalTo(termPos[k]));
            assertThat("term: " + string, docsAndPositions.startOffset(), equalTo(termStartOffset[k]));
            assertThat("term: " + string, docsAndPositions.endOffset(), equalTo(termEndOffset[k]));
            assertThat("term: " + string, docsAndPositions.getPayload(), equalTo(new BytesRef("word")));
        }
    }
    assertThat(iterator.next(), Matchers.nullValue());

    XContentBuilder xBuilder = XContentFactory.jsonBuilder();

    response.toXContent(xBuilder, null);
    BytesStream bytesStream = xBuilder.bytesStream();
    String utf8 = bytesStream.bytes().toUtf8();
    String expectedString = "{\"_index\":\"test\",\"_type\":\"type1\",\"_id\":\"" + i
            + "\",\"_version\":1,\"exists\":true,\"term_vectors\":{\"field\":{\"field_statistics\":{\"sum_doc_freq\":120,\"doc_count\":15,\"sum_ttf\":135},\"terms\":{\"brown\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":2,\"start_offset\":10,\"end_offset\":15,\"payload\":\"d29yZA==\"}]},\"dog\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":8,\"start_offset\":40,\"end_offset\":43,\"payload\":\"d29yZA==\"}]},\"fox\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":3,\"start_offset\":16,\"end_offset\":19,\"payload\":\"d29yZA==\"}]},\"jumps\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":4,\"start_offset\":20,\"end_offset\":25,\"payload\":\"d29yZA==\"}]},\"lazy\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":7,\"start_offset\":35,\"end_offset\":39,\"payload\":\"d29yZA==\"}]},\"over\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":5,\"start_offset\":26,\"end_offset\":30,\"payload\":\"d29yZA==\"}]},\"quick\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":1,\"start_offset\":4,\"end_offset\":9,\"payload\":\"d29yZA==\"}]},\"the\":{\"doc_freq\":15,\"ttf\":30,\"term_freq\":2,\"tokens\":[{\"position\":0,\"start_offset\":0,\"end_offset\":3,\"payload\":\"d29yZA==\"},{\"position\":6,\"start_offset\":31,\"end_offset\":34,\"payload\":\"d29yZA==\"}]}}}}}";
    assertThat(utf8, equalTo(expectedString));
}

From source file:org.elasticsearch.termvectors.GetTermVectorTests.java

License:Apache License

@Test
public void testSimpleTermVectors() throws ElasticSearchException, IOException {
    XContentBuilder mapping = XContentFactory.jsonBuilder().startObject().startObject("type1")
            .startObject("properties").startObject("field").field("type", "string")
            .field("term_vector", "with_positions_offsets_payloads").field("analyzer", "tv_test").endObject()
            .endObject().endObject().endObject();
    ElasticSearchAssertions.assertAcked(prepareCreate("test").addMapping("type1", mapping)
            .setSettings(ImmutableSettings.settingsBuilder()
                    .put("index.analysis.analyzer.tv_test.tokenizer", "whitespace")
                    .putArray("index.analysis.analyzer.tv_test.filter", "type_as_payload", "lowercase")));
    ensureYellow();
    for (int i = 0; i < 10; i++) {
        client().prepareIndex("test", "type1", Integer.toString(i))
                .setSource(XContentFactory.jsonBuilder().startObject()
                        .field("field", "the quick brown fox jumps over the lazy dog")
                        // 0the3 4quick9 10brown15 16fox19 20jumps25 26over30
                        // 31the34 35lazy39 40dog43
                        .endObject())
                .execute().actionGet();
        refresh();
    }
    String[] values = { "brown", "dog", "fox", "jumps", "lazy", "over", "quick", "the" };
    int[] freq = { 1, 1, 1, 1, 1, 1, 1, 2 };
    int[][] pos = { { 2 }, { 8 }, { 3 }, { 4 }, { 7 }, { 5 }, { 1 }, { 0, 6 } };
    int[][] startOffset = { { 10 }, { 40 }, { 16 }, { 20 }, { 35 }, { 26 }, { 4 }, { 0, 31 } };
    int[][] endOffset = { { 15 }, { 43 }, { 19 }, { 25 }, { 39 }, { 30 }, { 9 }, { 3, 34 } };
    for (int i = 0; i < 10; i++) {
        TermVectorRequestBuilder resp = client().prepareTermVector("test", "type1", Integer.toString(i))
                .setPayloads(true).setOffsets(true).setPositions(true).setSelectedFields();
        TermVectorResponse response = resp.execute().actionGet();
        assertThat("doc id: " + i + " doesn't exists but should", response.isExists(), equalTo(true));
        Fields fields = response.getFields();
        assertThat(fields.size(), equalTo(1));
        Terms terms = fields.terms("field");
        assertThat(terms.size(), equalTo(8L));
        TermsEnum iterator = terms.iterator(null);
        for (int j = 0; j < values.length; j++) {
            String string = values[j];
            BytesRef next = iterator.next();
            assertThat(next, Matchers.notNullValue());
            assertThat("expected " + string, string, equalTo(next.utf8ToString()));
            assertThat(next, Matchers.notNullValue());
            // do not test ttf or doc frequency, because here we have many
            // shards and do not know how documents are distributed
            DocsAndPositionsEnum docsAndPositions = iterator.docsAndPositions(null, null);
            assertThat(docsAndPositions.nextDoc(), equalTo(0));
            assertThat(freq[j], equalTo(docsAndPositions.freq()));
            int[] termPos = pos[j];
            int[] termStartOffset = startOffset[j];
            int[] termEndOffset = endOffset[j];
            assertThat(termPos.length, equalTo(freq[j]));
            assertThat(termStartOffset.length, equalTo(freq[j]));
            assertThat(termEndOffset.length, equalTo(freq[j]));
            for (int k = 0; k < freq[j]; k++) {
                int nextPosition = docsAndPositions.nextPosition();
                assertThat("term: " + string, nextPosition, equalTo(termPos[k]));
                assertThat("term: " + string, docsAndPositions.startOffset(), equalTo(termStartOffset[k]));
                assertThat("term: " + string, docsAndPositions.endOffset(), equalTo(termEndOffset[k]));
                assertThat("term: " + string, docsAndPositions.getPayload(), equalTo(new BytesRef("word")));
            }
        }
        assertThat(iterator.next(), Matchers.nullValue());
    }
}

From source file:org.elasticsearch.termvectors.GetTermVectorTests.java

License:Apache License

@Test
public void testRandomSingleTermVectors() throws ElasticSearchException, IOException {
    FieldType ft = new FieldType();
    int config = randomInt(6);
    boolean storePositions = false;
    boolean storeOffsets = false;
    boolean storePayloads = false;
    boolean storeTermVectors = false;
    switch (config) {
    case 0: {
        // do nothing
        break;
    }
    case 1: {
        storeTermVectors = true;
        break;
    }
    case 2: {
        storeTermVectors = true;
        storePositions = true;
        break;
    }
    case 3: {
        storeTermVectors = true;
        storeOffsets = true;
        break;
    }
    case 4: {
        storeTermVectors = true;
        storePositions = true;
        storeOffsets = true;
        break;
    }
    case 5: {
        storeTermVectors = true;
        storePositions = true;
        storePayloads = true;
        break;
    }
    case 6: {
        storeTermVectors = true;
        storePositions = true;
        storeOffsets = true;
        storePayloads = true;
        break;
    }
    }
    ft.setStoreTermVectors(storeTermVectors);
    ft.setStoreTermVectorOffsets(storeOffsets);
    ft.setStoreTermVectorPayloads(storePayloads);
    ft.setStoreTermVectorPositions(storePositions);

    String optionString = AbstractFieldMapper.termVectorOptionsToString(ft);
    XContentBuilder mapping = XContentFactory.jsonBuilder().startObject().startObject("type1")
            .startObject("properties").startObject("field").field("type", "string")
            .field("term_vector", optionString).field("analyzer", "tv_test").endObject().endObject().endObject()
            .endObject();
    ElasticSearchAssertions.assertAcked(prepareCreate("test").addMapping("type1", mapping)
            .setSettings(ImmutableSettings.settingsBuilder()
                    .put("index.analysis.analyzer.tv_test.tokenizer", "whitespace")
                    .putArray("index.analysis.analyzer.tv_test.filter", "type_as_payload", "lowercase")));
    ensureYellow();
    for (int i = 0; i < 10; i++) {
        client().prepareIndex("test", "type1", Integer.toString(i))
                .setSource(XContentFactory.jsonBuilder().startObject()
                        .field("field", "the quick brown fox jumps over the lazy dog")
                        // 0the3 4quick9 10brown15 16fox19 20jumps25 26over30
                        // 31the34 35lazy39 40dog43
                        .endObject())
                .execute().actionGet();
        refresh();
    }
    String[] values = { "brown", "dog", "fox", "jumps", "lazy", "over", "quick", "the" };
    int[] freq = { 1, 1, 1, 1, 1, 1, 1, 2 };
    int[][] pos = { { 2 }, { 8 }, { 3 }, { 4 }, { 7 }, { 5 }, { 1 }, { 0, 6 } };
    int[][] startOffset = { { 10 }, { 40 }, { 16 }, { 20 }, { 35 }, { 26 }, { 4 }, { 0, 31 } };
    int[][] endOffset = { { 15 }, { 43 }, { 19 }, { 25 }, { 39 }, { 30 }, { 9 }, { 3, 34 } };

    boolean isPayloadRequested = randomBoolean();
    boolean isOffsetRequested = randomBoolean();
    boolean isPositionsRequested = randomBoolean();
    String infoString = createInfoString(isPositionsRequested, isOffsetRequested, isPayloadRequested,
            optionString);
    for (int i = 0; i < 10; i++) {
        TermVectorRequestBuilder resp = client().prepareTermVector("test", "type1", Integer.toString(i))
                .setPayloads(isPayloadRequested).setOffsets(isOffsetRequested)
                .setPositions(isPositionsRequested).setSelectedFields();
        TermVectorResponse response = resp.execute().actionGet();
        assertThat(infoString + "doc id: " + i + " doesn't exist but should", response.isExists(),
                equalTo(true));
        Fields fields = response.getFields();
        assertThat(fields.size(), equalTo(ft.storeTermVectors() ? 1 : 0));
        if (ft.storeTermVectors()) {
            Terms terms = fields.terms("field");
            assertThat(terms.size(), equalTo(8L));
            TermsEnum iterator = terms.iterator(null);
            for (int j = 0; j < values.length; j++) {
                String string = values[j];
                BytesRef next = iterator.next();
                assertThat(infoString, next, Matchers.notNullValue());
                assertThat(infoString + "expected " + string, string, equalTo(next.utf8ToString()));
                assertThat(infoString, next, Matchers.notNullValue());
                // do not test ttf or doc frequency, because here we have
                // many shards and do not know how documents are distributed
                DocsAndPositionsEnum docsAndPositions = iterator.docsAndPositions(null, null);
                // docsAndPositions only returns something if positions,
                // payloads, or offsets are stored / requested; otherwise
                // use DocsEnum?
                assertThat(infoString, docsAndPositions.nextDoc(), equalTo(0));
                assertThat(infoString, freq[j], equalTo(docsAndPositions.freq()));
                int[] termPos = pos[j];
                int[] termStartOffset = startOffset[j];
                int[] termEndOffset = endOffset[j];
                if (isPositionsRequested && storePositions) {
                    assertThat(infoString, termPos.length, equalTo(freq[j]));
                }
                if (isOffsetRequested && storeOffsets) {
                    assertThat(termStartOffset.length, equalTo(freq[j]));
                    assertThat(termEndOffset.length, equalTo(freq[j]));
                }
                for (int k = 0; k < freq[j]; k++) {
                    int nextPosition = docsAndPositions.nextPosition();
                    // only return something useful if requested and stored
                    if (isPositionsRequested && storePositions) {
                        assertThat(infoString + "positions for term: " + string, nextPosition,
                                equalTo(termPos[k]));
                    } else {
                        assertThat(infoString + "positions for term: ", nextPosition, equalTo(-1));
                    }

                    // only return something useful if requested and stored
                    if (isPayloadRequested && storePayloads) {
                        assertThat(infoString + "payloads for term: " + string, docsAndPositions.getPayload(),
                                equalTo(new BytesRef("word")));
                    } else {
                        assertThat(infoString + "payloads for term: " + string, docsAndPositions.getPayload(),
                                equalTo(null));
                    }
                    // only return something useful if requested and stored
                    if (isOffsetRequested && storeOffsets) {

                        assertThat(infoString + "startOffsets term: " + string, docsAndPositions.startOffset(),
                                equalTo(termStartOffset[k]));
                        assertThat(infoString + "endOffsets term: " + string, docsAndPositions.endOffset(),
                                equalTo(termEndOffset[k]));
                    } else {
                        assertThat(infoString + "startOffsets term: " + string, docsAndPositions.startOffset(),
                                equalTo(-1));
                        assertThat(infoString + "endOffsets term: " + string, docsAndPositions.endOffset(),
                                equalTo(-1));
                    }

                }
            }
            assertThat(iterator.next(), Matchers.nullValue());
        }

    }
}

From source file:org.elasticsearch.test.integration.termvectors.GetTermVectorTests.java

License:Apache License

@Test
public void testSimpleTermVectors() throws ElasticSearchException, IOException {

    run(addMapping(prepareCreate("test"), "type1",
            new Object[] { "field", "type", "string", "term_vector", "with_positions_offsets_payloads",
                    "analyzer", "tv_test" })
            .setSettings(ImmutableSettings.settingsBuilder()
                    .put("index.analysis.analyzer.tv_test.tokenizer", "whitespace")
                    .putArray("index.analysis.analyzer.tv_test.filter", "type_as_payload", "lowercase")));
    ensureYellow();
    for (int i = 0; i < 10; i++) {
        client().prepareIndex("test", "type1", Integer.toString(i))
                .setSource(XContentFactory.jsonBuilder().startObject()
                        .field("field", "the quick brown fox jumps over the lazy dog")
                        // 0the3 4quick9 10brown15 16fox19 20jumps25 26over30
                        // 31the34 35lazy39 40dog43
                        .endObject())
                .execute().actionGet();
        refresh();
    }
    String[] values = { "brown", "dog", "fox", "jumps", "lazy", "over", "quick", "the" };
    int[] freq = { 1, 1, 1, 1, 1, 1, 1, 2 };
    int[][] pos = { { 2 }, { 8 }, { 3 }, { 4 }, { 7 }, { 5 }, { 1 }, { 0, 6 } };
    int[][] startOffset = { { 10 }, { 40 }, { 16 }, { 20 }, { 35 }, { 26 }, { 4 }, { 0, 31 } };
    int[][] endOffset = { { 15 }, { 43 }, { 19 }, { 25 }, { 39 }, { 30 }, { 9 }, { 3, 34 } };
    for (int i = 0; i < 10; i++) {
        TermVectorRequestBuilder resp = client().prepareTermVector("test", "type1", Integer.toString(i))
                .setPayloads(true).setOffsets(true).setPositions(true).setSelectedFields();
        TermVectorResponse response = resp.execute().actionGet();
        assertThat("doc id: " + i + " doesn't exists but should", response.documentExists(), equalTo(true));
        Fields fields = response.getFields();
        assertThat(fields.size(), equalTo(1));
        Terms terms = fields.terms("field");
        assertThat(terms.size(), equalTo(8L));
        TermsEnum iterator = terms.iterator(null);
        for (int j = 0; j < values.length; j++) {
            String string = values[j];
            BytesRef next = iterator.next();
            assertThat(next, Matchers.notNullValue());
            assertThat("expected " + string, string, equalTo(next.utf8ToString()));
            assertThat(next, Matchers.notNullValue());
            // do not test ttf or doc frequency, because here we have many
            // shards and do not know how documents are distributed
            DocsAndPositionsEnum docsAndPositions = iterator.docsAndPositions(null, null);
            assertThat(docsAndPositions.nextDoc(), equalTo(0));
            assertThat(freq[j], equalTo(docsAndPositions.freq()));
            int[] termPos = pos[j];
            int[] termStartOffset = startOffset[j];
            int[] termEndOffset = endOffset[j];
            assertThat(termPos.length, equalTo(freq[j]));
            assertThat(termStartOffset.length, equalTo(freq[j]));
            assertThat(termEndOffset.length, equalTo(freq[j]));
            for (int k = 0; k < freq[j]; k++) {
                int nextPosition = docsAndPositions.nextPosition();
                assertThat("term: " + string, nextPosition, equalTo(termPos[k]));
                assertThat("term: " + string, docsAndPositions.startOffset(), equalTo(termStartOffset[k]));
                assertThat("term: " + string, docsAndPositions.endOffset(), equalTo(termEndOffset[k]));
                assertThat("term: " + string, docsAndPositions.getPayload(), equalTo(new BytesRef("word")));
            }
        }
        assertThat(iterator.next(), Matchers.nullValue());
    }
}