Example usage for org.apache.lucene.index Fields terms

Introduction

On this page you can find usage examples for org.apache.lucene.index Fields.terms.

Prototype

public abstract Terms terms(String field) throws IOException;

Document

Get the Terms for this field. This will return null if the field does not exist.
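
A minimal sketch of the typical call path before the full examples below: an IndexReader exposes a Fields view (here via MultiFields.getFields), and Fields.terms(field) returns the Terms for one field, or null if the field is absent. The index path "/path/to/index" and the field name "body" are placeholders, not taken from the examples, and the sketch assumes a Lucene 5.x-era API.

import java.io.IOException;
import java.nio.file.Paths;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;

public class FieldsTermsSketch {
    public static void main(String[] args) throws IOException {
        try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/path/to/index")))) {
            // merged Fields view over all segments; null for an index without fields
            Fields fields = MultiFields.getFields(reader);
            if (fields == null) {
                return;
            }
            // null if the field does not exist
            Terms terms = fields.terms("body");
            if (terms != null) {
                // Lucene 5+; Lucene 4.x uses terms.iterator(null), as several examples below do
                TermsEnum iterator = terms.iterator();
                BytesRef term;
                while ((term = iterator.next()) != null) {
                    System.out.println(term.utf8ToString() + " docFreq=" + iterator.docFreq());
                }
            }
        }
    }
}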

Usage

From source file:org.elasticsearch.test.integration.termvectors.GetTermVectorTests.java

License:Apache License

@Test
public void testRandomSingleTermVectors() throws ElasticSearchException, IOException {
    Random random = getRandom();
    FieldType ft = new FieldType();
    int config = random.nextInt(7); // pick one of the seven storage configurations (cases 0-6) below
    boolean storePositions = false;
    boolean storeOffsets = false;
    boolean storePayloads = false;
    boolean storeTermVectors = false;
    switch (config) {
    case 0: {
        // do nothing
        break;
    }
    case 1: {
        storeTermVectors = true;
        break;
    }
    case 2: {
        storeTermVectors = true;
        storePositions = true;
        break;
    }
    case 3: {
        storeTermVectors = true;
        storeOffsets = true;
        break;
    }
    case 4: {
        storeTermVectors = true;
        storePositions = true;
        storeOffsets = true;
        break;
    }
    case 5: {
        storeTermVectors = true;
        storePositions = true;
        storePayloads = true;
        break;
    }
    case 6: {
        storeTermVectors = true;
        storePositions = true;
        storeOffsets = true;
        storePayloads = true;
        break;
    }
    }
    ft.setStoreTermVectors(storeTermVectors);
    ft.setStoreTermVectorOffsets(storeOffsets);
    ft.setStoreTermVectorPayloads(storePayloads);
    ft.setStoreTermVectorPositions(storePositions);

    String optionString = AbstractFieldMapper.termVectorOptionsToString(ft);
    run(addMapping(prepareCreate("test"), "type1",
            new Object[] { "field", "type", "string", "term_vector", optionString, "analyzer", "tv_test" })
                    .setSettings(ImmutableSettings.settingsBuilder()
                            .put("index.analysis.analyzer.tv_test.tokenizer", "whitespace").putArray(
                                    "index.analysis.analyzer.tv_test.filter", "type_as_payload", "lowercase")));
    ensureYellow();
    for (int i = 0; i < 10; i++) {
        client().prepareIndex("test", "type1", Integer.toString(i))
                .setSource(XContentFactory.jsonBuilder().startObject()
                        .field("field", "the quick brown fox jumps over the lazy dog")
                        // 0the3 4quick9 10brown15 16fox19 20jumps25 26over30
                        // 31the34 35lazy39 40dog43
                        .endObject())
                .execute().actionGet();
        refresh();
    }
    String[] values = { "brown", "dog", "fox", "jumps", "lazy", "over", "quick", "the" };
    int[] freq = { 1, 1, 1, 1, 1, 1, 1, 2 };
    int[][] pos = { { 2 }, { 8 }, { 3 }, { 4 }, { 7 }, { 5 }, { 1 }, { 0, 6 } };
    int[][] startOffset = { { 10 }, { 40 }, { 16 }, { 20 }, { 35 }, { 26 }, { 4 }, { 0, 31 } };
    int[][] endOffset = { { 15 }, { 43 }, { 19 }, { 25 }, { 39 }, { 30 }, { 9 }, { 3, 34 } };

    boolean isPayloadRequested = random.nextBoolean();
    boolean isOffsetRequested = random.nextBoolean();
    boolean isPositionsRequested = random.nextBoolean();
    String infoString = createInfoString(isPositionsRequested, isOffsetRequested, isPayloadRequested,
            optionString);
    for (int i = 0; i < 10; i++) {
        TermVectorRequestBuilder resp = client().prepareTermVector("test", "type1", Integer.toString(i))
                .setPayloads(isPayloadRequested).setOffsets(isOffsetRequested)
                .setPositions(isPositionsRequested).setSelectedFields();
        TermVectorResponse response = resp.execute().actionGet();
        assertThat(infoString + "doc id: " + i + " doesn't exists but should", response.documentExists(),
                equalTo(true));
        Fields fields = response.getFields();
        assertThat(fields.size(), equalTo(ft.storeTermVectors() ? 1 : 0));
        if (ft.storeTermVectors()) {
            Terms terms = fields.terms("field");
            assertThat(terms.size(), equalTo(8L));
            TermsEnum iterator = terms.iterator(null);
            for (int j = 0; j < values.length; j++) {
                String string = values[j];
                BytesRef next = iterator.next();
                assertThat(infoString, next, Matchers.notNullValue());
                assertThat(infoString + "expected " + string, string, equalTo(next.utf8ToString()));
                assertThat(infoString, next, Matchers.notNullValue());
                // do not test ttf or doc frequency, because here we have
                // many shards and do not know how documents are distributed
                DocsAndPositionsEnum docsAndPositions = iterator.docsAndPositions(null, null);
                // docsAndPositions only returns something if positions,
                // payloads, or offsets are stored/requested; otherwise a
                // plain DocsEnum would be used instead
                assertThat(infoString, docsAndPositions.nextDoc(), equalTo(0));
                assertThat(infoString, freq[j], equalTo(docsAndPositions.freq()));
                int[] termPos = pos[j];
                int[] termStartOffset = startOffset[j];
                int[] termEndOffset = endOffset[j];
                if (isPositionsRequested && storePositions) {
                    assertThat(infoString, termPos.length, equalTo(freq[j]));
                }
                if (isOffsetRequested && storeOffsets) {
                    assertThat(termStartOffset.length, equalTo(freq[j]));
                    assertThat(termEndOffset.length, equalTo(freq[j]));
                }
                for (int k = 0; k < freq[j]; k++) {
                    int nextPosition = docsAndPositions.nextPosition();
                    // only return something useful if requested and stored
                    if (isPositionsRequested && storePositions) {
                        assertThat(infoString + "positions for term: " + string, nextPosition,
                                equalTo(termPos[k]));
                    } else {
                        assertThat(infoString + "positions for term: " + string, nextPosition, equalTo(-1));
                    }

                    // only return something useful if requested and stored
                    if (isPayloadRequested && storePayloads) {
                        assertThat(infoString + "payloads for term: " + string, docsAndPositions.getPayload(),
                                equalTo(new BytesRef("word")));
                    } else {
                        assertThat(infoString + "payloads for term: " + string, docsAndPositions.getPayload(),
                                equalTo(null));
                    }
                    // only return something useful if requested and stored
                    if (isOffsetRequested && storeOffsets) {

                        assertThat(infoString + "startOffsets term: " + string, docsAndPositions.startOffset(),
                                equalTo(termStartOffset[k]));
                        assertThat(infoString + "endOffsets term: " + string, docsAndPositions.endOffset(),
                                equalTo(termEndOffset[k]));
                    } else {
                        assertThat(infoString + "startOffsets term: " + string, docsAndPositions.startOffset(),
                                equalTo(-1));
                        assertThat(infoString + "endOffsets term: " + string, docsAndPositions.endOffset(),
                                equalTo(-1));
                    }

                }
            }
            assertThat(iterator.next(), Matchers.nullValue());
        }

    }
}

From source file:org.elasticsearch.test.integration.termvectors.GetTermVectorTests.java

License:Apache License

private void compareLuceneESTermVectorResults(Fields fields, Fields luceneFields,
        HashMap<String, Boolean> storePositionsMap, HashMap<String, Boolean> storeOffsetsMap,
        HashMap<String, Boolean> storePayloadsMap, boolean getPositions, boolean getOffsets,
        boolean getPayloads, String[] selectedFields) throws IOException {
    HashSet<String> selectedFieldsMap = new HashSet<String>(Arrays.asList(selectedFields));

    Iterator<String> luceneFieldNames = luceneFields.iterator();
    assertThat(luceneFields.size(), equalTo(storeOffsetsMap.size()));
    assertThat(fields.size(), equalTo(selectedFields.length));

    while (luceneFieldNames.hasNext()) {
        String luceneFieldName = luceneFieldNames.next();
        if (!selectedFieldsMap.contains(luceneFieldName))
            continue;
        Terms esTerms = fields.terms(luceneFieldName);
        Terms luceneTerms = luceneFields.terms(luceneFieldName);
        TermsEnum esTermEnum = esTerms.iterator(null);
        TermsEnum luceneTermEnum = luceneTerms.iterator(null);

        int numTerms = 0;

        while (esTermEnum.next() != null) {
            luceneTermEnum.next();
            assertThat(esTermEnum.totalTermFreq(), equalTo(luceneTermEnum.totalTermFreq()));
            DocsAndPositionsEnum esDocsPosEnum = esTermEnum.docsAndPositions(null, null, 0);
            DocsAndPositionsEnum luceneDocsPosEnum = luceneTermEnum.docsAndPositions(null, null, 0);
            if (luceneDocsPosEnum == null) {
                assertThat(storeOffsetsMap.get(luceneFieldName), equalTo(false));
                assertThat(storePayloadsMap.get(luceneFieldName), equalTo(false));
                assertThat(storePositionsMap.get(luceneFieldName), equalTo(false));
                continue;

            }
            numTerms++;

            assertThat("failed for field: " + luceneFieldName, esTermEnum.term().utf8ToString(),
                    equalTo(luceneTermEnum.term().utf8ToString()));
            esDocsPosEnum.nextDoc();
            luceneDocsPosEnum.nextDoc();

            int freq = esDocsPosEnum.freq();
            assertThat(freq, equalTo(luceneDocsPosEnum.freq()));
            for (int i = 0; i < freq; i++) {

                int lucenePos = luceneDocsPosEnum.nextPosition();
                int esPos = esDocsPosEnum.nextPosition();
                if (storePositionsMap.get(luceneFieldName) && getPositions) {
                    assertThat(luceneFieldName, lucenePos, equalTo(esPos));
                } else {
                    assertThat(esPos, equalTo(-1));
                }
                if (storeOffsetsMap.get(luceneFieldName) && getOffsets) {
                    assertThat(luceneDocsPosEnum.startOffset(), equalTo(esDocsPosEnum.startOffset()));
                    assertThat(luceneDocsPosEnum.endOffset(), equalTo(esDocsPosEnum.endOffset()));
                } else {
                    assertThat(esDocsPosEnum.startOffset(), equalTo(-1));
                    assertThat(esDocsPosEnum.endOffset(), equalTo(-1));
                }
                if (storePayloadsMap.get(luceneFieldName) && getPayloads) {
                    assertThat(luceneFieldName, luceneDocsPosEnum.getPayload(),
                            equalTo(esDocsPosEnum.getPayload()));
                } else {
                    assertThat(esDocsPosEnum.getPayload(), equalTo(null));
                }

            }
        }

    }

}

From source file:org.elasticsearch.test.integration.termvectors.GetTermVectorTestsCheckDocFreq.java

License:Apache License

private void checkWithoutFieldStatistics(int numDocs, String[] values, int[] freq, int[][] pos,
        int[][] startOffset, int[][] endOffset, int i) throws IOException {
    TermVectorRequestBuilder resp = client().prepareTermVector("test", "type1", Integer.toString(i))
            .setPayloads(true).setOffsets(true).setPositions(true).setTermStatistics(true)
            .setFieldStatistics(false).setSelectedFields();
    TermVectorResponse response = resp.execute().actionGet();
    assertThat("doc id: " + i + " doesn't exists but should", response.documentExists(), equalTo(true));
    Fields fields = response.getFields();
    assertThat(fields.size(), equalTo(1));
    Terms terms = fields.terms("field");
    assertThat(terms.size(), equalTo(8L));
    assertThat(terms.getSumTotalTermFreq(), Matchers.equalTo((long) -1));
    assertThat(terms.getDocCount(), Matchers.equalTo(-1));
    assertThat(terms.getSumDocFreq(), equalTo((long) -1));
    TermsEnum iterator = terms.iterator(null);
    for (int j = 0; j < values.length; j++) {
        String string = values[j];
        BytesRef next = iterator.next();
        assertThat(next, Matchers.notNullValue());
        assertThat("expected " + string, string, equalTo(next.utf8ToString()));
        assertThat(next, Matchers.notNullValue());
        if (string.equals("the")) {
            assertThat("expected ttf of " + string, numDocs * 2, equalTo((int) iterator.totalTermFreq()));
        } else {
            assertThat("expected ttf of " + string, numDocs, equalTo((int) iterator.totalTermFreq()));
        }

        DocsAndPositionsEnum docsAndPositions = iterator.docsAndPositions(null, null);
        assertThat(docsAndPositions.nextDoc(), equalTo(0));
        assertThat(freq[j], equalTo(docsAndPositions.freq()));
        assertThat(iterator.docFreq(), equalTo(numDocs));
        int[] termPos = pos[j];
        int[] termStartOffset = startOffset[j];
        int[] termEndOffset = endOffset[j];
        assertThat(termPos.length, equalTo(freq[j]));
        assertThat(termStartOffset.length, equalTo(freq[j]));
        assertThat(termEndOffset.length, equalTo(freq[j]));
        for (int k = 0; k < freq[j]; k++) {
            int nextPosition = docsAndPositions.nextPosition();
            assertThat("term: " + string, nextPosition, equalTo(termPos[k]));
            assertThat("term: " + string, docsAndPositions.startOffset(), equalTo(termStartOffset[k]));
            assertThat("term: " + string, docsAndPositions.endOffset(), equalTo(termEndOffset[k]));
            assertThat("term: " + string, docsAndPositions.getPayload(), equalTo(new BytesRef("word")));
        }
    }
    assertThat(iterator.next(), Matchers.nullValue());

    XContentBuilder xBuilder = XContentFactory.jsonBuilder();

    response.toXContent(xBuilder, null);
    BytesStream bytesStream = xBuilder.bytesStream();
    String utf8 = bytesStream.bytes().toUtf8();
    String expectedString = "{\"_index\":\"test\",\"_type\":\"type1\",\"_id\":\"" + i
            + "\",\"_version\":1,\"exists\":true,\"term_vectors\":{\"field\":{\"terms\":{\"brown\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"pos\":[2],\"start\":[10],\"end\":[15],\"payload\":[\"d29yZA==\"]},\"dog\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"pos\":[8],\"start\":[40],\"end\":[43],\"payload\":[\"d29yZA==\"]},\"fox\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"pos\":[3],\"start\":[16],\"end\":[19],\"payload\":[\"d29yZA==\"]},\"jumps\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"pos\":[4],\"start\":[20],\"end\":[25],\"payload\":[\"d29yZA==\"]},\"lazy\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"pos\":[7],\"start\":[35],\"end\":[39],\"payload\":[\"d29yZA==\"]},\"over\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"pos\":[5],\"start\":[26],\"end\":[30],\"payload\":[\"d29yZA==\"]},\"quick\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"pos\":[1],\"start\":[4],\"end\":[9],\"payload\":[\"d29yZA==\"]},\"the\":{\"doc_freq\":15,\"ttf\":30,\"term_freq\":2,\"pos\":[0,6],\"start\":[0,31],\"end\":[3,34],\"payload\":[\"d29yZA==\",\"d29yZA==\"]}}}}}";
    assertThat(utf8, equalTo(expectedString));

}

From source file:org.elasticsearch.test.integration.termvectors.GetTermVectorTestsCheckDocFreq.java

License:Apache License

private void checkWithoutTermStatistics(int numDocs, String[] values, int[] freq, int[][] pos,
        int[][] startOffset, int[][] endOffset, int i) throws IOException {
    TermVectorRequestBuilder resp = client().prepareTermVector("test", "type1", Integer.toString(i))
            .setPayloads(true).setOffsets(true).setPositions(true).setTermStatistics(false)
            .setFieldStatistics(true).setSelectedFields();
    assertThat(resp.request().termStatistics(), equalTo(false));
    TermVectorResponse response = resp.execute().actionGet();
    assertThat("doc id: " + i + " doesn't exists but should", response.documentExists(), equalTo(true));
    Fields fields = response.getFields();
    assertThat(fields.size(), equalTo(1));
    Terms terms = fields.terms("field");
    assertThat(terms.size(), equalTo(8L));
    assertThat(terms.getSumTotalTermFreq(), Matchers.equalTo((long) (9 * numDocs)));
    assertThat(terms.getDocCount(), Matchers.equalTo(numDocs));
    assertThat(terms.getSumDocFreq(), equalTo((long) numDocs * values.length));
    TermsEnum iterator = terms.iterator(null);
    for (int j = 0; j < values.length; j++) {
        String string = values[j];
        BytesRef next = iterator.next();
        assertThat(next, Matchers.notNullValue());
        assertThat("expected " + string, string, equalTo(next.utf8ToString()));
        assertThat(next, Matchers.notNullValue());

        assertThat("expected ttf of " + string, -1, equalTo((int) iterator.totalTermFreq()));

        DocsAndPositionsEnum docsAndPositions = iterator.docsAndPositions(null, null);
        assertThat(docsAndPositions.nextDoc(), equalTo(0));
        assertThat(freq[j], equalTo(docsAndPositions.freq()));
        assertThat(iterator.docFreq(), equalTo(-1));
        int[] termPos = pos[j];
        int[] termStartOffset = startOffset[j];
        int[] termEndOffset = endOffset[j];
        assertThat(termPos.length, equalTo(freq[j]));
        assertThat(termStartOffset.length, equalTo(freq[j]));
        assertThat(termEndOffset.length, equalTo(freq[j]));
        for (int k = 0; k < freq[j]; k++) {
            int nextPosition = docsAndPositions.nextPosition();
            assertThat("term: " + string, nextPosition, equalTo(termPos[k]));
            assertThat("term: " + string, docsAndPositions.startOffset(), equalTo(termStartOffset[k]));
            assertThat("term: " + string, docsAndPositions.endOffset(), equalTo(termEndOffset[k]));
            assertThat("term: " + string, docsAndPositions.getPayload(), equalTo(new BytesRef("word")));
        }
    }
    assertThat(iterator.next(), Matchers.nullValue());

    XContentBuilder xBuilder = XContentFactory.jsonBuilder();

    response.toXContent(xBuilder, null);
    BytesStream bytesStream = xBuilder.bytesStream();
    String utf8 = bytesStream.bytes().toUtf8();
    String expectedString = "{\"_index\":\"test\",\"_type\":\"type1\",\"_id\":\"" + i
            + "\",\"_version\":1,\"exists\":true,\"term_vectors\":{\"field\":{\"field_statistics\":{\"sum_doc_freq\":120,\"doc_count\":15,\"sum_ttf\":135},\"terms\":{\"brown\":{\"term_freq\":1,\"pos\":[2],\"start\":[10],\"end\":[15],\"payload\":[\"d29yZA==\"]},\"dog\":{\"term_freq\":1,\"pos\":[8],\"start\":[40],\"end\":[43],\"payload\":[\"d29yZA==\"]},\"fox\":{\"term_freq\":1,\"pos\":[3],\"start\":[16],\"end\":[19],\"payload\":[\"d29yZA==\"]},\"jumps\":{\"term_freq\":1,\"pos\":[4],\"start\":[20],\"end\":[25],\"payload\":[\"d29yZA==\"]},\"lazy\":{\"term_freq\":1,\"pos\":[7],\"start\":[35],\"end\":[39],\"payload\":[\"d29yZA==\"]},\"over\":{\"term_freq\":1,\"pos\":[5],\"start\":[26],\"end\":[30],\"payload\":[\"d29yZA==\"]},\"quick\":{\"term_freq\":1,\"pos\":[1],\"start\":[4],\"end\":[9],\"payload\":[\"d29yZA==\"]},\"the\":{\"term_freq\":2,\"pos\":[0,6],\"start\":[0,31],\"end\":[3,34],\"payload\":[\"d29yZA==\",\"d29yZA==\"]}}}}}";

    assertThat(utf8, equalTo(expectedString));

}

From source file:org.elasticsearch.test.integration.termvectors.GetTermVectorTestsCheckDocFreq.java

License:Apache License

private void checkAllInfo(int numDocs, String[] values, int[] freq, int[][] pos, int[][] startOffset,
        int[][] endOffset, int i) throws IOException {
    TermVectorRequestBuilder resp = client().prepareTermVector("test", "type1", Integer.toString(i))
            .setPayloads(true).setOffsets(true).setPositions(true).setFieldStatistics(true)
            .setTermStatistics(true).setSelectedFields();
    assertThat(resp.request().fieldStatistics(), equalTo(true));
    TermVectorResponse response = resp.execute().actionGet();
    assertThat("doc id: " + i + " doesn't exists but should", response.documentExists(), equalTo(true));
    Fields fields = response.getFields();
    assertThat(fields.size(), equalTo(1));
    Terms terms = fields.terms("field");
    assertThat(terms.size(), equalTo(8L));
    assertThat(terms.getSumTotalTermFreq(), Matchers.equalTo((long) (9 * numDocs)));
    assertThat(terms.getDocCount(), Matchers.equalTo(numDocs));
    assertThat(terms.getSumDocFreq(), equalTo((long) numDocs * values.length));
    TermsEnum iterator = terms.iterator(null);
    for (int j = 0; j < values.length; j++) {
        String string = values[j];
        BytesRef next = iterator.next();
        assertThat(next, Matchers.notNullValue());
        assertThat("expected " + string, string, equalTo(next.utf8ToString()));
        assertThat(next, Matchers.notNullValue());
        if (string.equals("the")) {
            assertThat("expected ttf of " + string, numDocs * 2, equalTo((int) iterator.totalTermFreq()));
        } else {
            assertThat("expected ttf of " + string, numDocs, equalTo((int) iterator.totalTermFreq()));
        }

        DocsAndPositionsEnum docsAndPositions = iterator.docsAndPositions(null, null);
        assertThat(docsAndPositions.nextDoc(), equalTo(0));
        assertThat(freq[j], equalTo(docsAndPositions.freq()));
        assertThat(iterator.docFreq(), equalTo(numDocs));
        int[] termPos = pos[j];
        int[] termStartOffset = startOffset[j];
        int[] termEndOffset = endOffset[j];
        assertThat(termPos.length, equalTo(freq[j]));
        assertThat(termStartOffset.length, equalTo(freq[j]));
        assertThat(termEndOffset.length, equalTo(freq[j]));
        for (int k = 0; k < freq[j]; k++) {
            int nextPosition = docsAndPositions.nextPosition();
            assertThat("term: " + string, nextPosition, equalTo(termPos[k]));
            assertThat("term: " + string, docsAndPositions.startOffset(), equalTo(termStartOffset[k]));
            assertThat("term: " + string, docsAndPositions.endOffset(), equalTo(termEndOffset[k]));
            assertThat("term: " + string, docsAndPositions.getPayload(), equalTo(new BytesRef("word")));
        }
    }
    assertThat(iterator.next(), Matchers.nullValue());

    XContentBuilder xBuilder = XContentFactory.jsonBuilder();

    response.toXContent(xBuilder, null);
    BytesStream bytesStream = xBuilder.bytesStream();
    String utf8 = bytesStream.bytes().toUtf8();
    String expectedString = "{\"_index\":\"test\",\"_type\":\"type1\",\"_id\":\"" + i
            + "\",\"_version\":1,\"exists\":true,\"term_vectors\":{\"field\":{\"field_statistics\":{\"sum_doc_freq\":120,\"doc_count\":15,\"sum_ttf\":135},\"terms\":{\"brown\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"pos\":[2],\"start\":[10],\"end\":[15],\"payload\":[\"d29yZA==\"]},\"dog\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"pos\":[8],\"start\":[40],\"end\":[43],\"payload\":[\"d29yZA==\"]},\"fox\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"pos\":[3],\"start\":[16],\"end\":[19],\"payload\":[\"d29yZA==\"]},\"jumps\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"pos\":[4],\"start\":[20],\"end\":[25],\"payload\":[\"d29yZA==\"]},\"lazy\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"pos\":[7],\"start\":[35],\"end\":[39],\"payload\":[\"d29yZA==\"]},\"over\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"pos\":[5],\"start\":[26],\"end\":[30],\"payload\":[\"d29yZA==\"]},\"quick\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"pos\":[1],\"start\":[4],\"end\":[9],\"payload\":[\"d29yZA==\"]},\"the\":{\"doc_freq\":15,\"ttf\":30,\"term_freq\":2,\"pos\":[0,6],\"start\":[0,31],\"end\":[3,34],\"payload\":[\"d29yZA==\",\"d29yZA==\"]}}}}}";
    assertThat(utf8, equalTo(expectedString));

}

From source file:org.elasticsearch.vectorize.VectorizeService.java

License:Apache License

private void processTermVectorsFields(Vectorizer vectorizer, Fields termVectorsFields) throws IOException {
    for (String fieldName : termVectorsFields) {
        TermsEnum termsEnum = termVectorsFields.terms(fieldName).iterator();
        while (termsEnum.next() != null) {
            Term term = new Term(fieldName, termsEnum.term());
            TermStatistics termStatistics = new TermStatistics(termsEnum.term(), termsEnum.docFreq(),
                    termsEnum.totalTermFreq());
            int freq = termsEnum.postings(null, null, PostingsEnum.ALL).freq();
            vectorizer.add(term, termStatistics, freq);
        }
    }
}

From source file:org.getopt.luke.HighFreqTerms.java

License:Apache License

/**
 * Collect the terms with the highest document frequency from the given fields.
 *
 * @param reader the index reader to collect terms from
 * @param numTerms the maximum number of terms to keep
 * @param fieldNames the fields to inspect, or null to inspect all fields
 * @return TermStats[] ordered by terms with highest docFreq first.
 * @throws Exception
 */
public static TermStats[] getHighFreqTerms(IndexReader reader, int numTerms, String[] fieldNames)
        throws Exception {
    TermStatsQueue tiq = null;
    TermsEnum te = null;

    Fields fields = MultiFields.getFields(reader);
    if (fields == null) {
        LOG.info("Index with no fields - probably empty or corrupted");
        return EMPTY_STATS;
    }
    tiq = new TermStatsQueue(numTerms);
    if (fieldNames != null) {
        for (String field : fieldNames) {
            Terms terms = fields.terms(field);
            if (terms != null) {
                te = terms.iterator(te);
                fillQueue(te, tiq, field);
            }
        }
    } else {
        Iterator<String> fieldIterator = fields.iterator();
        while (fieldIterator.hasNext()) {
            String field = fieldIterator.next();
            Terms terms = fields.terms(field);
            if (terms != null) {
                te = terms.iterator(te);
                fillQueue(te, tiq, field);
            }
        }
    }

    TermStats[] result = new TermStats[tiq.size()];
    // we want highest first so we read the queue and populate the array
    // starting at the end and work backwards
    int count = tiq.size() - 1;
    while (tiq.size() != 0) {
        result[count] = tiq.pop();
        count--;
    }
    return result;
}
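
A hypothetical invocation of the helper above, for illustration only: the index path and field names are placeholders, and the public field/termtext/docFreq members are assumed to match Lucene's org.apache.lucene.misc.TermStats (Luke's own TermStats may differ).

// Lucene 4.x-style directory opening, matching the iterator(te) usage above
IndexReader reader = DirectoryReader.open(FSDirectory.open(new File("/path/to/index")));
TermStats[] topTerms = getHighFreqTerms(reader, 25, new String[] { "title", "body" });
for (TermStats stats : topTerms) {
    // assumed members, as in Lucene's misc TermStats
    System.out.println(stats.field + ":" + stats.termtext.utf8ToString() + " docFreq=" + stats.docFreq);
}
reader.close();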

From source file:org.hibernate.search.query.dsl.impl.MoreLikeThisBuilder.java

License:LGPL

/**
 * Find words for a more-like-this query former.
 * Store them per field name according to the order of fieldnames defined in {@link #fieldsContext}.
 * If the field name is not compatible with term retrieval, the queue will be empty for that index.
 */
private List<PriorityQueue<Object[]>> retrieveTerms() throws IOException {
    int size = fieldsContext.size();
    Map<String, Map<String, Int>> termFreqMapPerFieldname = new HashMap<String, Map<String, Int>>(size);
    final Fields vectors;
    Document maybeDocument = null;
    if (documentNumber == null && size > 0) {
        //build the document from the entity instance

        //first build the list of fields we are interested in
        String[] fieldNames = new String[size];
        Iterator<FieldContext> fieldsContextIterator = fieldsContext.iterator();
        for (int index = 0; index < size; index++) {
            fieldNames[index] = fieldsContextIterator.next().getField();
        }
        //TODO should we keep the fieldToAnalyzerMap around to pass to the analyzer?
        Map<String, String> fieldToAnalyzerMap = new HashMap<String, String>();
        //FIXME by calling documentBuilder we don't honor .comparingField("foo").ignoreFieldBridge(): probably not a problem in practice though
        maybeDocument = documentBuilder.getDocument(null, input, null, fieldToAnalyzerMap, null,
                new ContextualExceptionBridgeHelper(), fieldNames);
        vectors = null;
    } else {
        vectors = indexReader.getTermVectors(documentNumber);
    }
    for (FieldContext fieldContext : fieldsContext) {
        String fieldName = fieldContext.getField();
        if (isCompatibleField(fieldName)) {
            Map<String, Int> termFreqMap = new HashMap<String, Int>();
            termFreqMapPerFieldname.put(fieldName, termFreqMap);
            final Terms vector;
            if (vectors != null) {
                vector = vectors.terms(fieldName);
            } else {
                vector = null;
            }

            // field does not store term vector info
            if (vector == null) {
                if (maybeDocument == null) {
                    maybeDocument = indexReader.document(documentNumber);
                }
                IndexableField[] fields = maybeDocument.getFields(fieldName);
                for (IndexableField field : fields) {
                    //TODO numbers
                    final String stringValue = DocumentBuilderHelper.extractStringFromFieldable(field);
                    if (stringValue != null) {
                        addTermFrequencies(new StringReader(stringValue), termFreqMap, fieldContext);
                    }
                }
            } else {
                addTermFrequencies(termFreqMap, vector);
            }
        } else {
            //place null as the field is not compatible
            termFreqMapPerFieldname.put(fieldName, null);
        }
    }
    List<PriorityQueue<Object[]>> results = new ArrayList<PriorityQueue<Object[]>>(size);
    for (Map.Entry<String, Map<String, Int>> entry : termFreqMapPerFieldname.entrySet()) {
        results.add(createQueue(entry.getKey(), entry.getValue()));
    }
    return results;
}

From source file:org.languagetool.dev.archive.StartTokenCounter.java

License:Open Source License

public static void main(String[] args) throws IOException {
    long totalCount = 0;
    File dir = new File("/data/google-ngram-index/en/2grams");
    try (FSDirectory directory = FSDirectory.open(dir.toPath());
            IndexReader reader = DirectoryReader.open(directory)) {
        IndexSearcher searcher = new IndexSearcher(reader);
        Fields fields = MultiFields.getFields(reader);
        Terms ngrams = fields.terms("ngram");
        TermsEnum iterator = ngrams.iterator();
        BytesRef next;
        int i = 0;
        while ((next = iterator.next()) != null) {
            String term = next.utf8ToString();
            if (term.startsWith(LanguageModel.GOOGLE_SENTENCE_START)) {
                if (term.matches(".*_(ADJ|ADV|NUM|VERB|ADP|NOUN|PRON|CONJ|DET|PRT)$")) {
                    //System.out.println("ignore: " + term);
                    continue;
                }
                TopDocs topDocs = searcher.search(new TermQuery(new Term("ngram", term)), 3);
                if (topDocs.totalHits == 0) {
                    throw new RuntimeException("No hits for " + term + ": " + topDocs.totalHits);
                } else if (topDocs.totalHits == 1) {
                    int docId = topDocs.scoreDocs[0].doc;
                    Document document = reader.document(docId);
                    long count = Long.parseLong(document.get("count"));
                    //System.out.println(term + " -> " + count);
                    totalCount += count;
                    if (++i % 10_000 == 0) {
                        System.out.println(i + " ... " + totalCount);
                    }
                } else {
                    throw new RuntimeException(
                            "More hits than expected for " + term + ": " + topDocs.totalHits);
                }
            }
        }
    }
    System.out.println("==> " + totalCount);
}

From source file:org.languagetool.dev.bigdata.GermanUppercasePhraseFinder.java

License:Open Source License

public static void main(String[] args) throws IOException {
    if (args.length != 1) {
        System.out.println("Usage: " + GermanUppercasePhraseFinder.class.getSimpleName() + " <ngramIndexDir>");
        System.exit(1);
    }
    JLanguageTool lt = new JLanguageTool(Languages.getLanguageForShortCode("de"));
    FSDirectory fsDir = FSDirectory.open(new File(args[0]).toPath());
    IndexReader reader = DirectoryReader.open(fsDir);
    IndexSearcher searcher = new IndexSearcher(reader);
    Fields fields = MultiFields.getFields(reader);
    Terms terms = fields.terms("ngram");
    TermsEnum termsEnum = terms.iterator();
    int count = 0;
    BytesRef next;
    while ((next = termsEnum.next()) != null) {
        String term = next.utf8ToString();
        count++;
        //term = "persischer Golf";  // for testing
        String[] parts = term.split(" ");
        boolean useful = true;
        int lcCount = 0;
        List<String> ucParts = new ArrayList<>();
        for (String part : parts) {
            if (part.length() < MIN_TERM_LEN) {
                useful = false;
                break;
            }
            String uc = StringTools.uppercaseFirstChar(part);
            if (!part.equals(uc)) {
                lcCount++;
            }
            ucParts.add(uc);
        }
        if (!useful || lcCount == 0 || lcCount == 2) {
            continue;
        }
        String uppercase = Strings.join(ucParts, " ");
        if (term.equals(uppercase)) {
            continue;
        }
        long thisCount = getOccurrenceCount(reader, searcher, term);
        long thisUpperCount = getOccurrenceCount(reader, searcher, uppercase);
        if (count % 10_000 == 0) {
            System.err.println(count + " @ " + term);
        }
        if (thisCount > LIMIT || thisUpperCount > LIMIT) {
            if (thisUpperCount > thisCount) {
                if (isRelevant(lt, term)) {
                    float factor = (float) thisUpperCount / thisCount;
                    System.out.printf("%.2f %d %s %d %s\n", factor, thisUpperCount, uppercase, thisCount, term);
                }
            }
        }
    }
}