Usage examples for org.apache.lucene.index.Fields#terms(String)
public abstract Terms terms(String field) throws IOException;
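Before the longer examples below, here is a minimal sketch of the typical call pattern: obtain a Fields view (for example from MultiFields.getFields on Lucene 4.x, or from a term-vector response), look up one field's Terms, and iterate. This is an illustrative sketch, not taken from any project below; "reader" and the field name are placeholders.

    import java.io.IOException;
    import org.apache.lucene.index.*;
    import org.apache.lucene.util.BytesRef;

    public final class TermsWalk {
        public static void dump(IndexReader reader, String field) throws IOException {
            Fields fields = MultiFields.getFields(reader); // null for an empty index
            if (fields == null) {
                return;
            }
            Terms terms = fields.terms(field); // null if the field has no terms
            if (terms == null) {
                return;
            }
            TermsEnum te = terms.iterator(null); // Lucene 4.x signature with a reuse argument
            BytesRef term;
            while ((term = te.next()) != null) {
                System.out.println(term.utf8ToString() + " docFreq=" + te.docFreq());
            }
        }
    }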
From source file: org.elasticsearch.test.integration.termvectors.GetTermVectorTests.java
License: Apache License
@Test
public void testRandomSingleTermVectors() throws ElasticSearchException, IOException {
    Random random = getRandom();
    FieldType ft = new FieldType();
    // was nextInt(6) with a fall-through switch (no breaks), which effectively enabled
    // every option on each run; breaks added and the bound fixed so all 7 configs are reachable
    int config = random.nextInt(7);
    boolean storePositions = false;
    boolean storeOffsets = false;
    boolean storePayloads = false;
    boolean storeTermVectors = false;
    switch (config) {
    case 0:
        // do nothing
        break;
    case 1:
        storeTermVectors = true;
        break;
    case 2:
        storeTermVectors = true;
        storePositions = true;
        break;
    case 3:
        storeTermVectors = true;
        storeOffsets = true;
        break;
    case 4:
        storeTermVectors = true;
        storePositions = true;
        storeOffsets = true;
        break;
    case 5:
        storeTermVectors = true;
        storePositions = true;
        storePayloads = true;
        break;
    case 6:
        storeTermVectors = true;
        storePositions = true;
        storeOffsets = true;
        storePayloads = true;
        break;
    }
    ft.setStoreTermVectors(storeTermVectors);
    ft.setStoreTermVectorOffsets(storeOffsets);
    ft.setStoreTermVectorPayloads(storePayloads);
    ft.setStoreTermVectorPositions(storePositions);
    String optionString = AbstractFieldMapper.termVectorOptionsToString(ft);
    run(addMapping(prepareCreate("test"), "type1",
            new Object[] { "field", "type", "string", "term_vector", optionString, "analyzer", "tv_test" })
            .setSettings(ImmutableSettings.settingsBuilder()
                    .put("index.analysis.analyzer.tv_test.tokenizer", "whitespace")
                    .putArray("index.analysis.analyzer.tv_test.filter", "type_as_payload", "lowercase")));
    ensureYellow();
    for (int i = 0; i < 10; i++) {
        client().prepareIndex("test", "type1", Integer.toString(i))
                .setSource(XContentFactory.jsonBuilder().startObject()
                        .field("field", "the quick brown fox jumps over the lazy dog")
                        // 0the3 4quick9 10brown15 16fox19 20jumps25 26over30 31the34 35lazy39 40dog43
                        .endObject())
                .execute().actionGet();
        refresh();
    }
    String[] values = { "brown", "dog", "fox", "jumps", "lazy", "over", "quick", "the" };
    int[] freq = { 1, 1, 1, 1, 1, 1, 1, 2 };
    int[][] pos = { { 2 }, { 8 }, { 3 }, { 4 }, { 7 }, { 5 }, { 1 }, { 0, 6 } };
    int[][] startOffset = { { 10 }, { 40 }, { 16 }, { 20 }, { 35 }, { 26 }, { 4 }, { 0, 31 } };
    int[][] endOffset = { { 15 }, { 43 }, { 19 }, { 25 }, { 39 }, { 30 }, { 9 }, { 3, 34 } };
    boolean isPayloadRequested = random.nextBoolean();
    boolean isOffsetRequested = random.nextBoolean();
    boolean isPositionsRequested = random.nextBoolean();
    String infoString = createInfoString(isPositionsRequested, isOffsetRequested, isPayloadRequested, optionString);
    for (int i = 0; i < 10; i++) {
        TermVectorRequestBuilder resp = client().prepareTermVector("test", "type1", Integer.toString(i))
                .setPayloads(isPayloadRequested).setOffsets(isOffsetRequested)
                .setPositions(isPositionsRequested).setSelectedFields();
        TermVectorResponse response = resp.execute().actionGet();
        assertThat(infoString + "doc id: " + i + " doesn't exist but should", response.documentExists(),
                equalTo(true));
        Fields fields = response.getFields();
        assertThat(fields.size(), equalTo(ft.storeTermVectors() ? 1 : 0));
        if (ft.storeTermVectors()) {
            Terms terms = fields.terms("field");
            assertThat(terms.size(), equalTo(8L));
            TermsEnum iterator = terms.iterator(null);
            for (int j = 0; j < values.length; j++) {
                String string = values[j];
                BytesRef next = iterator.next();
                assertThat(infoString, next, Matchers.notNullValue());
                assertThat(infoString + "expected " + string, string, equalTo(next.utf8ToString()));
                // do not test ttf or doc frequency, because here we have
                // many shards and do not know how documents are distributed
                DocsAndPositionsEnum docsAndPositions = iterator.docsAndPositions(null, null);
                // docsAndPositions only returns something if positions, payloads
                // or offsets are stored / requested; otherwise use DocsEnum?
                assertThat(infoString, docsAndPositions.nextDoc(), equalTo(0));
                assertThat(infoString, freq[j], equalTo(docsAndPositions.freq()));
                int[] termPos = pos[j];
                int[] termStartOffset = startOffset[j];
                int[] termEndOffset = endOffset[j];
                if (isPositionsRequested && storePositions) {
                    assertThat(infoString, termPos.length, equalTo(freq[j]));
                }
                if (isOffsetRequested && storeOffsets) {
                    assertThat(termStartOffset.length, equalTo(freq[j]));
                    assertThat(termEndOffset.length, equalTo(freq[j]));
                }
                for (int k = 0; k < freq[j]; k++) {
                    int nextPosition = docsAndPositions.nextPosition();
                    // positions, payloads and offsets only return something useful
                    // if they were both requested and stored
                    if (isPositionsRequested && storePositions) {
                        assertThat(infoString + "positions for term: " + string, nextPosition, equalTo(termPos[k]));
                    } else {
                        assertThat(infoString + "positions for term: ", nextPosition, equalTo(-1));
                    }
                    if (isPayloadRequested && storePayloads) {
                        assertThat(infoString + "payloads for term: " + string, docsAndPositions.getPayload(),
                                equalTo(new BytesRef("word")));
                    } else {
                        assertThat(infoString + "payloads for term: " + string, docsAndPositions.getPayload(),
                                equalTo(null));
                    }
                    if (isOffsetRequested && storeOffsets) {
                        assertThat(infoString + "startOffsets term: " + string, docsAndPositions.startOffset(),
                                equalTo(termStartOffset[k]));
                        assertThat(infoString + "endOffsets term: " + string, docsAndPositions.endOffset(),
                                equalTo(termEndOffset[k]));
                    } else {
                        assertThat(infoString + "startOffsets term: " + string, docsAndPositions.startOffset(),
                                equalTo(-1));
                        assertThat(infoString + "endOffsets term: " + string, docsAndPositions.endOffset(),
                                equalTo(-1));
                    }
                }
            }
            assertThat(iterator.next(), Matchers.nullValue());
        }
    }
}
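A caveat worth flagging for the test snippets on this page: Fields.terms(field) is documented to return null when the field has no terms, and several examples call it without a guard (safe here only because the tests control the index). A hedged defensive pattern, with placeholder names:

    // "fields" is any org.apache.lucene.index.Fields instance.
    Terms terms = fields.terms("field");
    if (terms == null) {
        return; // field absent, or no indexed terms / term vectors for it
    }
    TermsEnum it = terms.iterator(null); // Lucene 4.x signature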
From source file: org.elasticsearch.test.integration.termvectors.GetTermVectorTests.java
License: Apache License
private void compareLuceneESTermVectorResults(Fields fields, Fields luceneFields,
        HashMap<String, Boolean> storePositionsMap, HashMap<String, Boolean> storeOffsetsMap,
        HashMap<String, Boolean> storePayloadsMap, boolean getPositions, boolean getOffsets,
        boolean getPayloads, String[] selectedFields) throws IOException {
    HashSet<String> selectedFieldsMap = new HashSet<String>(Arrays.asList(selectedFields));
    Iterator<String> luceneFieldNames = luceneFields.iterator();
    assertThat(luceneFields.size(), equalTo(storeOffsetsMap.size()));
    assertThat(fields.size(), equalTo(selectedFields.length));
    while (luceneFieldNames.hasNext()) {
        String luceneFieldName = luceneFieldNames.next();
        if (!selectedFieldsMap.contains(luceneFieldName)) {
            continue;
        }
        Terms esTerms = fields.terms(luceneFieldName);
        Terms luceneTerms = luceneFields.terms(luceneFieldName);
        TermsEnum esTermEnum = esTerms.iterator(null);
        TermsEnum luceneTermEnum = luceneTerms.iterator(null);
        int numTerms = 0;
        while (esTermEnum.next() != null) {
            luceneTermEnum.next();
            assertThat(esTermEnum.totalTermFreq(), equalTo(luceneTermEnum.totalTermFreq()));
            DocsAndPositionsEnum esDocsPosEnum = esTermEnum.docsAndPositions(null, null, 0);
            DocsAndPositionsEnum luceneDocsPosEnum = luceneTermEnum.docsAndPositions(null, null, 0);
            if (luceneDocsPosEnum == null) {
                // no positions, offsets or payloads were stored for this field
                assertThat(storeOffsetsMap.get(luceneFieldName), equalTo(false));
                assertThat(storePayloadsMap.get(luceneFieldName), equalTo(false));
                assertThat(storePositionsMap.get(luceneFieldName), equalTo(false));
                continue;
            }
            numTerms++;
            assertThat("failed for field: " + luceneFieldName, esTermEnum.term().utf8ToString(),
                    equalTo(luceneTermEnum.term().utf8ToString()));
            esDocsPosEnum.nextDoc();
            luceneDocsPosEnum.nextDoc();
            int freq = esDocsPosEnum.freq();
            assertThat(freq, equalTo(luceneDocsPosEnum.freq()));
            for (int i = 0; i < freq; i++) {
                int lucenePos = luceneDocsPosEnum.nextPosition();
                int esPos = esDocsPosEnum.nextPosition();
                if (storePositionsMap.get(luceneFieldName) && getPositions) {
                    assertThat(luceneFieldName, lucenePos, equalTo(esPos));
                } else {
                    assertThat(esPos, equalTo(-1));
                }
                if (storeOffsetsMap.get(luceneFieldName) && getOffsets) {
                    assertThat(luceneDocsPosEnum.startOffset(), equalTo(esDocsPosEnum.startOffset()));
                    assertThat(luceneDocsPosEnum.endOffset(), equalTo(esDocsPosEnum.endOffset()));
                } else {
                    assertThat(esDocsPosEnum.startOffset(), equalTo(-1));
                    assertThat(esDocsPosEnum.endOffset(), equalTo(-1));
                }
                if (storePayloadsMap.get(luceneFieldName) && getPayloads) {
                    assertThat(luceneFieldName, luceneDocsPosEnum.getPayload(),
                            equalTo(esDocsPosEnum.getPayload()));
                } else {
                    assertThat(esDocsPosEnum.getPayload(), equalTo(null));
                }
            }
        }
    }
}
From source file: org.elasticsearch.test.integration.termvectors.GetTermVectorTestsCheckDocFreq.java
License: Apache License
private void checkWithoutFieldStatistics(int numDocs, String[] values, int[] freq, int[][] pos,
        int[][] startOffset, int[][] endOffset, int i) throws IOException {
    TermVectorRequestBuilder resp = client().prepareTermVector("test", "type1", Integer.toString(i))
            .setPayloads(true).setOffsets(true).setPositions(true).setTermStatistics(true)
            .setFieldStatistics(false).setSelectedFields();
    TermVectorResponse response = resp.execute().actionGet();
    assertThat("doc id: " + i + " doesn't exist but should", response.documentExists(), equalTo(true));
    Fields fields = response.getFields();
    assertThat(fields.size(), equalTo(1));
    Terms terms = fields.terms("field");
    assertThat(terms.size(), equalTo(8L));
    // field statistics were not requested, so the Terms-level aggregates carry -1 sentinels
    assertThat(terms.getSumTotalTermFreq(), Matchers.equalTo((long) -1));
    assertThat(terms.getDocCount(), Matchers.equalTo(-1));
    assertThat(terms.getSumDocFreq(), equalTo((long) -1));
    TermsEnum iterator = terms.iterator(null);
    for (int j = 0; j < values.length; j++) {
        String string = values[j];
        BytesRef next = iterator.next();
        assertThat(next, Matchers.notNullValue());
        assertThat("expected " + string, string, equalTo(next.utf8ToString()));
        if (string.equals("the")) {
            assertThat("expected ttf of " + string, numDocs * 2, equalTo((int) iterator.totalTermFreq()));
        } else {
            assertThat("expected ttf of " + string, numDocs, equalTo((int) iterator.totalTermFreq()));
        }
        DocsAndPositionsEnum docsAndPositions = iterator.docsAndPositions(null, null);
        assertThat(docsAndPositions.nextDoc(), equalTo(0));
        assertThat(freq[j], equalTo(docsAndPositions.freq()));
        assertThat(iterator.docFreq(), equalTo(numDocs));
        int[] termPos = pos[j];
        int[] termStartOffset = startOffset[j];
        int[] termEndOffset = endOffset[j];
        assertThat(termPos.length, equalTo(freq[j]));
        assertThat(termStartOffset.length, equalTo(freq[j]));
        assertThat(termEndOffset.length, equalTo(freq[j]));
        for (int k = 0; k < freq[j]; k++) {
            int nextPosition = docsAndPositions.nextPosition();
            assertThat("term: " + string, nextPosition, equalTo(termPos[k]));
            assertThat("term: " + string, docsAndPositions.startOffset(), equalTo(termStartOffset[k]));
            assertThat("term: " + string, docsAndPositions.endOffset(), equalTo(termEndOffset[k]));
            assertThat("term: " + string, docsAndPositions.getPayload(), equalTo(new BytesRef("word")));
        }
    }
    assertThat(iterator.next(), Matchers.nullValue());
    XContentBuilder xBuilder = XContentFactory.jsonBuilder();
    response.toXContent(xBuilder, null);
    BytesStream bytesStream = xBuilder.bytesStream();
    String utf8 = bytesStream.bytes().toUtf8();
    String expectedString = "{\"_index\":\"test\",\"_type\":\"type1\",\"_id\":\"" + i
            + "\",\"_version\":1,\"exists\":true,\"term_vectors\":{\"field\":{\"terms\":{\"brown\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"pos\":[2],\"start\":[10],\"end\":[15],\"payload\":[\"d29yZA==\"]},\"dog\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"pos\":[8],\"start\":[40],\"end\":[43],\"payload\":[\"d29yZA==\"]},\"fox\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"pos\":[3],\"start\":[16],\"end\":[19],\"payload\":[\"d29yZA==\"]},\"jumps\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"pos\":[4],\"start\":[20],\"end\":[25],\"payload\":[\"d29yZA==\"]},\"lazy\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"pos\":[7],\"start\":[35],\"end\":[39],\"payload\":[\"d29yZA==\"]},\"over\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"pos\":[5],\"start\":[26],\"end\":[30],\"payload\":[\"d29yZA==\"]},\"quick\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"pos\":[1],\"start\":[4],\"end\":[9],\"payload\":[\"d29yZA==\"]},\"the\":{\"doc_freq\":15,\"ttf\":30,\"term_freq\":2,\"pos\":[0,6],\"start\":[0,31],\"end\":[3,34],\"payload\":[\"d29yZA==\",\"d29yZA==\"]}}}}}";
    assertThat(utf8, equalTo(expectedString));
}
From source file: org.elasticsearch.test.integration.termvectors.GetTermVectorTestsCheckDocFreq.java
License: Apache License
private void checkWithoutTermStatistics(int numDocs, String[] values, int[] freq, int[][] pos,
        int[][] startOffset, int[][] endOffset, int i) throws IOException {
    TermVectorRequestBuilder resp = client().prepareTermVector("test", "type1", Integer.toString(i))
            .setPayloads(true).setOffsets(true).setPositions(true).setTermStatistics(false)
            .setFieldStatistics(true).setSelectedFields();
    assertThat(resp.request().termStatistics(), equalTo(false));
    TermVectorResponse response = resp.execute().actionGet();
    assertThat("doc id: " + i + " doesn't exist but should", response.documentExists(), equalTo(true));
    Fields fields = response.getFields();
    assertThat(fields.size(), equalTo(1));
    Terms terms = fields.terms("field");
    assertThat(terms.size(), equalTo(8L));
    assertThat(terms.getSumTotalTermFreq(), Matchers.equalTo((long) (9 * numDocs)));
    assertThat(terms.getDocCount(), Matchers.equalTo(numDocs));
    assertThat(terms.getSumDocFreq(), equalTo((long) numDocs * values.length));
    TermsEnum iterator = terms.iterator(null);
    for (int j = 0; j < values.length; j++) {
        String string = values[j];
        BytesRef next = iterator.next();
        assertThat(next, Matchers.notNullValue());
        assertThat("expected " + string, string, equalTo(next.utf8ToString()));
        // term statistics were not requested, so ttf and docFreq carry -1 sentinels
        assertThat("expected ttf of " + string, -1, equalTo((int) iterator.totalTermFreq()));
        DocsAndPositionsEnum docsAndPositions = iterator.docsAndPositions(null, null);
        assertThat(docsAndPositions.nextDoc(), equalTo(0));
        assertThat(freq[j], equalTo(docsAndPositions.freq()));
        assertThat(iterator.docFreq(), equalTo(-1));
        int[] termPos = pos[j];
        int[] termStartOffset = startOffset[j];
        int[] termEndOffset = endOffset[j];
        assertThat(termPos.length, equalTo(freq[j]));
        assertThat(termStartOffset.length, equalTo(freq[j]));
        assertThat(termEndOffset.length, equalTo(freq[j]));
        for (int k = 0; k < freq[j]; k++) {
            int nextPosition = docsAndPositions.nextPosition();
            assertThat("term: " + string, nextPosition, equalTo(termPos[k]));
            assertThat("term: " + string, docsAndPositions.startOffset(), equalTo(termStartOffset[k]));
            assertThat("term: " + string, docsAndPositions.endOffset(), equalTo(termEndOffset[k]));
            assertThat("term: " + string, docsAndPositions.getPayload(), equalTo(new BytesRef("word")));
        }
    }
    assertThat(iterator.next(), Matchers.nullValue());
    XContentBuilder xBuilder = XContentFactory.jsonBuilder();
    response.toXContent(xBuilder, null);
    BytesStream bytesStream = xBuilder.bytesStream();
    String utf8 = bytesStream.bytes().toUtf8();
    String expectedString = "{\"_index\":\"test\",\"_type\":\"type1\",\"_id\":\"" + i
            + "\",\"_version\":1,\"exists\":true,\"term_vectors\":{\"field\":{\"field_statistics\":{\"sum_doc_freq\":120,\"doc_count\":15,\"sum_ttf\":135},\"terms\":{\"brown\":{\"term_freq\":1,\"pos\":[2],\"start\":[10],\"end\":[15],\"payload\":[\"d29yZA==\"]},\"dog\":{\"term_freq\":1,\"pos\":[8],\"start\":[40],\"end\":[43],\"payload\":[\"d29yZA==\"]},\"fox\":{\"term_freq\":1,\"pos\":[3],\"start\":[16],\"end\":[19],\"payload\":[\"d29yZA==\"]},\"jumps\":{\"term_freq\":1,\"pos\":[4],\"start\":[20],\"end\":[25],\"payload\":[\"d29yZA==\"]},\"lazy\":{\"term_freq\":1,\"pos\":[7],\"start\":[35],\"end\":[39],\"payload\":[\"d29yZA==\"]},\"over\":{\"term_freq\":1,\"pos\":[5],\"start\":[26],\"end\":[30],\"payload\":[\"d29yZA==\"]},\"quick\":{\"term_freq\":1,\"pos\":[1],\"start\":[4],\"end\":[9],\"payload\":[\"d29yZA==\"]},\"the\":{\"term_freq\":2,\"pos\":[0,6],\"start\":[0,31],\"end\":[3,34],\"payload\":[\"d29yZA==\",\"d29yZA==\"]}}}}}";
    assertThat(utf8, equalTo(expectedString));
}
From source file: org.elasticsearch.test.integration.termvectors.GetTermVectorTestsCheckDocFreq.java
License: Apache License
private void checkAllInfo(int numDocs, String[] values, int[] freq, int[][] pos,
        int[][] startOffset, int[][] endOffset, int i) throws IOException {
    TermVectorRequestBuilder resp = client().prepareTermVector("test", "type1", Integer.toString(i))
            .setPayloads(true).setOffsets(true).setPositions(true).setFieldStatistics(true)
            .setTermStatistics(true).setSelectedFields();
    assertThat(resp.request().fieldStatistics(), equalTo(true));
    TermVectorResponse response = resp.execute().actionGet();
    assertThat("doc id: " + i + " doesn't exist but should", response.documentExists(), equalTo(true));
    Fields fields = response.getFields();
    assertThat(fields.size(), equalTo(1));
    Terms terms = fields.terms("field");
    assertThat(terms.size(), equalTo(8L));
    assertThat(terms.getSumTotalTermFreq(), Matchers.equalTo((long) (9 * numDocs)));
    assertThat(terms.getDocCount(), Matchers.equalTo(numDocs));
    assertThat(terms.getSumDocFreq(), equalTo((long) numDocs * values.length));
    TermsEnum iterator = terms.iterator(null);
    for (int j = 0; j < values.length; j++) {
        String string = values[j];
        BytesRef next = iterator.next();
        assertThat(next, Matchers.notNullValue());
        assertThat("expected " + string, string, equalTo(next.utf8ToString()));
        if (string.equals("the")) {
            assertThat("expected ttf of " + string, numDocs * 2, equalTo((int) iterator.totalTermFreq()));
        } else {
            assertThat("expected ttf of " + string, numDocs, equalTo((int) iterator.totalTermFreq()));
        }
        DocsAndPositionsEnum docsAndPositions = iterator.docsAndPositions(null, null);
        assertThat(docsAndPositions.nextDoc(), equalTo(0));
        assertThat(freq[j], equalTo(docsAndPositions.freq()));
        assertThat(iterator.docFreq(), equalTo(numDocs));
        int[] termPos = pos[j];
        int[] termStartOffset = startOffset[j];
        int[] termEndOffset = endOffset[j];
        assertThat(termPos.length, equalTo(freq[j]));
        assertThat(termStartOffset.length, equalTo(freq[j]));
        assertThat(termEndOffset.length, equalTo(freq[j]));
        for (int k = 0; k < freq[j]; k++) {
            int nextPosition = docsAndPositions.nextPosition();
            assertThat("term: " + string, nextPosition, equalTo(termPos[k]));
            assertThat("term: " + string, docsAndPositions.startOffset(), equalTo(termStartOffset[k]));
            assertThat("term: " + string, docsAndPositions.endOffset(), equalTo(termEndOffset[k]));
            assertThat("term: " + string, docsAndPositions.getPayload(), equalTo(new BytesRef("word")));
        }
    }
    assertThat(iterator.next(), Matchers.nullValue());
    XContentBuilder xBuilder = XContentFactory.jsonBuilder();
    response.toXContent(xBuilder, null);
    BytesStream bytesStream = xBuilder.bytesStream();
    String utf8 = bytesStream.bytes().toUtf8();
    String expectedString = "{\"_index\":\"test\",\"_type\":\"type1\",\"_id\":\"" + i
            + "\",\"_version\":1,\"exists\":true,\"term_vectors\":{\"field\":{\"field_statistics\":{\"sum_doc_freq\":120,\"doc_count\":15,\"sum_ttf\":135},\"terms\":{\"brown\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"pos\":[2],\"start\":[10],\"end\":[15],\"payload\":[\"d29yZA==\"]},\"dog\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"pos\":[8],\"start\":[40],\"end\":[43],\"payload\":[\"d29yZA==\"]},\"fox\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"pos\":[3],\"start\":[16],\"end\":[19],\"payload\":[\"d29yZA==\"]},\"jumps\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"pos\":[4],\"start\":[20],\"end\":[25],\"payload\":[\"d29yZA==\"]},\"lazy\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"pos\":[7],\"start\":[35],\"end\":[39],\"payload\":[\"d29yZA==\"]},\"over\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"pos\":[5],\"start\":[26],\"end\":[30],\"payload\":[\"d29yZA==\"]},\"quick\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"pos\":[1],\"start\":[4],\"end\":[9],\"payload\":[\"d29yZA==\"]},\"the\":{\"doc_freq\":15,\"ttf\":30,\"term_freq\":2,\"pos\":[0,6],\"start\":[0,31],\"end\":[3,34],\"payload\":[\"d29yZA==\",\"d29yZA==\"]}}}}}";
    assertThat(utf8, equalTo(expectedString));
}
From source file: org.elasticsearch.vectorize.VectorizeService.java
License: Apache License
private void processTermVectorsFields(Vectorizer vectorizer, Fields termVectorsFields) throws IOException {
    for (String fieldName : termVectorsFields) {
        TermsEnum termsEnum = termVectorsFields.terms(fieldName).iterator();
        while (termsEnum.next() != null) {
            Term term = new Term(fieldName, termsEnum.term());
            TermStatistics termStatistics = new TermStatistics(termsEnum.term(), termsEnum.docFreq(),
                    termsEnum.totalTermFreq());
            // note: a PostingsEnum is normally positioned with nextDoc() before freq() is read;
            // kept as in the source
            int freq = termsEnum.postings(null, null, PostingsEnum.ALL).freq();
            vectorizer.add(term, termStatistics, freq);
        }
    }
}
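Note the API drift visible across these examples: the Elasticsearch test snippets above use the Lucene 4.x calls Terms.iterator(TermsEnum reuse) and TermsEnum.docsAndPositions(...), while this snippet uses the Lucene 5.x style, where the iterator takes no reuse argument and postings access goes through PostingsEnum flags. A hedged side-by-side sketch (variable names are placeholders; the three-argument postings form matches this snippet, and later 5.x releases dropped the liveDocs argument):

    // Lucene 4.x style (as in the Elasticsearch test snippets above):
    TermsEnum te4 = terms.iterator(null);                        // takes a reuse argument
    DocsAndPositionsEnum dpe = te4.docsAndPositions(null, null); // null when positions/offsets/payloads absent

    // Lucene 5.x style (as in this VectorizeService snippet):
    TermsEnum te5 = terms.iterator();                             // no reuse argument
    PostingsEnum pe = te5.postings(null, null, PostingsEnum.ALL); // flags select positions/offsets/payloads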
From source file: org.getopt.luke.HighFreqTerms.java
License: Apache License
/**
 * @param reader
 * @param numTerms
 * @param fieldNames
 * @return TermStats[] ordered by terms with highest docFreq first.
 * @throws Exception
 */
public static TermStats[] getHighFreqTerms(IndexReader reader, int numTerms, String[] fieldNames)
        throws Exception {
    Fields fields = MultiFields.getFields(reader);
    if (fields == null) {
        LOG.info("Index with no fields - probably empty or corrupted");
        return EMPTY_STATS;
    }
    TermStatsQueue tiq = new TermStatsQueue(numTerms);
    TermsEnum te = null;
    // if no field names were given, walk every field in the index
    Iterable<String> fieldsToWalk = (fieldNames != null) ? Arrays.asList(fieldNames) : fields;
    for (String field : fieldsToWalk) {
        Terms terms = fields.terms(field);
        if (terms != null) {
            te = terms.iterator(te); // reuse the enum across fields
            fillQueue(te, tiq, field);
        }
    }
    // we want highest first, so we pop the queue and fill the array back to front
    TermStats[] result = new TermStats[tiq.size()];
    int count = tiq.size() - 1;
    while (tiq.size() != 0) {
        result[count] = tiq.pop();
        count--;
    }
    return result;
}
From source file: org.hibernate.search.query.dsl.impl.MoreLikeThisBuilder.java
License: LGPL
/**
 * Find words for a more-like-this query former.
 * Store them per field name according to the order of field names defined in {@link #fieldsContext}.
 * If the field name is not compatible with term retrieval, the queue will be empty for that index.
 */
private List<PriorityQueue<Object[]>> retrieveTerms() throws IOException {
    int size = fieldsContext.size();
    Map<String, Map<String, Int>> termFreqMapPerFieldname = new HashMap<String, Map<String, Int>>(size);
    final Fields vectors;
    Document maybeDocument = null;
    if (documentNumber == null && size > 0) {
        // build the document from the entity instance;
        // first build the list of fields we are interested in
        String[] fieldNames = new String[size];
        Iterator<FieldContext> fieldsContextIterator = fieldsContext.iterator();
        for (int index = 0; index < size; index++) {
            fieldNames[index] = fieldsContextIterator.next().getField();
        }
        //TODO should we keep the fieldToAnalyzerMap around to pass to the analyzer?
        Map<String, String> fieldToAnalyzerMap = new HashMap<String, String>();
        //FIXME by calling documentBuilder we don't honor .comparingField("foo").ignoreFieldBridge():
        //      probably not a problem in practice though
        maybeDocument = documentBuilder.getDocument(null, input, null, fieldToAnalyzerMap, null,
                new ContextualExceptionBridgeHelper(), fieldNames);
        vectors = null;
    } else {
        vectors = indexReader.getTermVectors(documentNumber);
    }
    for (FieldContext fieldContext : fieldsContext) {
        String fieldName = fieldContext.getField();
        if (isCompatibleField(fieldName)) {
            Map<String, Int> termFreqMap = new HashMap<String, Int>();
            termFreqMapPerFieldname.put(fieldName, termFreqMap);
            final Terms vector;
            if (vectors != null) {
                vector = vectors.terms(fieldName);
            } else {
                vector = null;
            }
            if (vector == null) {
                // field does not store term vector info: re-analyze the stored value instead
                if (maybeDocument == null) {
                    maybeDocument = indexReader.document(documentNumber);
                }
                IndexableField[] fields = maybeDocument.getFields(fieldName);
                for (IndexableField field : fields) {
                    //TODO numbers
                    final String stringValue = DocumentBuilderHelper.extractStringFromFieldable(field);
                    if (stringValue != null) {
                        addTermFrequencies(new StringReader(stringValue), termFreqMap, fieldContext);
                    }
                }
            } else {
                addTermFrequencies(termFreqMap, vector);
            }
        } else {
            // place null as the field is not compatible
            termFreqMapPerFieldname.put(fieldName, null);
        }
    }
    List<PriorityQueue<Object[]>> results = new ArrayList<PriorityQueue<Object[]>>(size);
    // note: iterating a HashMap entrySet does not guarantee the fieldsContext order
    // that the javadoc above promises
    for (Map.Entry<String, Map<String, Int>> entry : termFreqMapPerFieldname.entrySet()) {
        results.add(createQueue(entry.getKey(), entry.getValue()));
    }
    return results;
}
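This snippet illustrates the usual more-like-this fallback: prefer stored term vectors (cheap and exact) and re-analyze the stored field text only when vectors.terms(fieldName) comes back null. A condensed, hedged sketch of just that decision; "reader", "docId", "fieldName" and the two addTermFrequencies overloads are placeholders standing in for the snippet's own helpers:

    Fields vectors = reader.getTermVectors(docId);            // null if nothing was stored with vectors
    Terms vector = (vectors == null) ? null : vectors.terms(fieldName);
    if (vector != null) {
        addTermFrequencies(termFreqMap, vector);              // fast path: read the stored vector
    } else {
        String text = reader.document(docId).get(fieldName);  // slow path: fetch the stored value...
        if (text != null) {
            addTermFrequencies(new StringReader(text), termFreqMap, fieldContext); // ...and re-analyze it
        }
    }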
From source file: org.languagetool.dev.archive.StartTokenCounter.java
License: Open Source License
public static void main(String[] args) throws IOException {
    long totalCount = 0;
    File dir = new File("/data/google-ngram-index/en/2grams");
    try (FSDirectory directory = FSDirectory.open(dir.toPath());
         IndexReader reader = DirectoryReader.open(directory)) {
        IndexSearcher searcher = new IndexSearcher(reader);
        Fields fields = MultiFields.getFields(reader);
        Terms ngrams = fields.terms("ngram");
        TermsEnum iterator = ngrams.iterator();
        BytesRef next;
        int i = 0;
        while ((next = iterator.next()) != null) {
            String term = next.utf8ToString();
            if (term.startsWith(LanguageModel.GOOGLE_SENTENCE_START)) {
                if (term.matches(".*_(ADJ|ADV|NUM|VERB|ADP|NOUN|PRON|CONJ|DET|PRT)$")) {
                    //System.out.println("ignore: " + term);
                    continue;
                }
                TopDocs topDocs = searcher.search(new TermQuery(new Term("ngram", term)), 3);
                if (topDocs.totalHits == 0) {
                    throw new RuntimeException("No hits for " + term + ": " + topDocs.totalHits);
                } else if (topDocs.totalHits == 1) {
                    int docId = topDocs.scoreDocs[0].doc;
                    Document document = reader.document(docId);
                    long count = Long.parseLong(document.get("count"));
                    //System.out.println(term + " -> " + count);
                    totalCount += count;
                    if (++i % 10_000 == 0) {
                        System.out.println(i + " ... " + totalCount);
                    }
                } else {
                    throw new RuntimeException("More hits than expected for " + term + ": " + topDocs.totalHits);
                }
            }
        }
    }
    System.out.println("==> " + totalCount);
}
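One more version caveat: MultiFields.getFields(reader), used by several examples on this page, was deprecated and later removed in newer Lucene releases in favor of per-field access. A hedged sketch of the newer equivalent, assuming the Lucene 8.x-style MultiTerms helper (verify against your Lucene version):

    import org.apache.lucene.index.MultiTerms;
    import org.apache.lucene.index.Terms;
    import org.apache.lucene.index.TermsEnum;

    // Fetch a merged Terms view for one field directly.
    Terms ngrams = MultiTerms.getTerms(reader, "ngram"); // may be null if the field is absent
    if (ngrams != null) {
        TermsEnum iterator = ngrams.iterator();
        // ... iterate as before
    }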
From source file: org.languagetool.dev.bigdata.GermanUppercasePhraseFinder.java
License: Open Source License
public static void main(String[] args) throws IOException {
    if (args.length != 1) {
        System.out.println("Usage: " + GermanUppercasePhraseFinder.class.getSimpleName() + " <ngramIndexDir>");
        System.exit(1);
    }
    JLanguageTool lt = new JLanguageTool(Languages.getLanguageForShortCode("de"));
    FSDirectory fsDir = FSDirectory.open(new File(args[0]).toPath());
    IndexReader reader = DirectoryReader.open(fsDir);
    IndexSearcher searcher = new IndexSearcher(reader);
    Fields fields = MultiFields.getFields(reader);
    Terms terms = fields.terms("ngram");
    TermsEnum termsEnum = terms.iterator();
    int count = 0;
    BytesRef next;
    while ((next = termsEnum.next()) != null) {
        String term = next.utf8ToString();
        count++;
        //term = "persischer Golf";  // for testing
        String[] parts = term.split(" ");
        boolean useful = true;
        int lcCount = 0;
        List<String> ucParts = new ArrayList<>();
        for (String part : parts) {
            if (part.length() < MIN_TERM_LEN) {
                useful = false;
                break;
            }
            String uc = StringTools.uppercaseFirstChar(part);
            if (!part.equals(uc)) {
                lcCount++;
            }
            ucParts.add(uc);
        }
        if (!useful || lcCount == 0 || lcCount == 2) {
            continue;
        }
        String uppercase = Strings.join(ucParts, " ");
        if (term.equals(uppercase)) {
            continue;
        }
        long thisCount = getOccurrenceCount(reader, searcher, term);
        long thisUpperCount = getOccurrenceCount(reader, searcher, uppercase);
        if (count % 10_000 == 0) {
            System.err.println(count + " @ " + term);
        }
        if (thisCount > LIMIT || thisUpperCount > LIMIT) {
            if (thisUpperCount > thisCount) {
                if (isRelevant(lt, term)) {
                    float factor = (float) thisUpperCount / thisCount;
                    System.out.printf(
                            "%.2f " + thisUpperCount + " " + uppercase + " " + thisCount + " " + term + "\n",
                            factor);
                }
            }
        }
    }
}