List of usage examples for org.apache.lucene.util BytesRef utf8ToString
public String utf8ToString()
From source file:org.elasticsearch.action.termvectors.GetTermVectorsTests.java
License:Apache License
private void checkBrownFoxTermVector(Fields fields, String fieldName, boolean withPayloads) throws ElasticsearchException, IOException { String[] values = { "brown", "dog", "fox", "jumps", "lazy", "over", "quick", "the" }; int[] freq = { 1, 1, 1, 1, 1, 1, 1, 2 }; int[][] pos = { { 2 }, { 8 }, { 3 }, { 4 }, { 7 }, { 5 }, { 1 }, { 0, 6 } }; int[][] startOffset = { { 10 }, { 40 }, { 16 }, { 20 }, { 35 }, { 26 }, { 4 }, { 0, 31 } }; int[][] endOffset = { { 15 }, { 43 }, { 19 }, { 25 }, { 39 }, { 30 }, { 9 }, { 3, 34 } }; Terms terms = fields.terms(fieldName); assertThat(terms.size(), equalTo(8l)); TermsEnum iterator = terms.iterator(); for (int j = 0; j < values.length; j++) { String string = values[j]; BytesRef next = iterator.next(); assertThat(next, notNullValue()); assertThat("expected " + string, string, equalTo(next.utf8ToString())); assertThat(next, notNullValue()); // do not test ttf or doc frequency, because here we have many // shards and do not know how documents are distributed PostingsEnum docsAndPositions = iterator.postings(null, null, PostingsEnum.ALL); assertThat(docsAndPositions.nextDoc(), equalTo(0)); assertThat(freq[j], equalTo(docsAndPositions.freq())); int[] termPos = pos[j]; int[] termStartOffset = startOffset[j]; int[] termEndOffset = endOffset[j]; assertThat(termPos.length, equalTo(freq[j])); assertThat(termStartOffset.length, equalTo(freq[j])); assertThat(termEndOffset.length, equalTo(freq[j])); for (int k = 0; k < freq[j]; k++) { int nextPosition = docsAndPositions.nextPosition(); assertThat("term: " + string, nextPosition, equalTo(termPos[k])); assertThat("term: " + string, docsAndPositions.startOffset(), equalTo(termStartOffset[k])); assertThat("term: " + string, docsAndPositions.endOffset(), equalTo(termEndOffset[k])); if (withPayloads) { assertThat("term: " + string, docsAndPositions.getPayload(), equalTo(new BytesRef("word"))); }// www. j a v a2 s .c o m } } assertThat(iterator.next(), nullValue()); }
From source file:org.elasticsearch.action.termvectors.GetTermVectorsTests.java
License:Apache License
private void compareTermVectors(String fieldName, Fields fields0, Fields fields1) throws IOException { Terms terms0 = fields0.terms(fieldName); Terms terms1 = fields1.terms(fieldName); assertThat(terms0, notNullValue());/* w ww. ja v a 2 s. co m*/ assertThat(terms1, notNullValue()); assertThat(terms0.size(), equalTo(terms1.size())); TermsEnum iter0 = terms0.iterator(); TermsEnum iter1 = terms1.iterator(); for (int i = 0; i < terms0.size(); i++) { BytesRef next0 = iter0.next(); assertThat(next0, notNullValue()); BytesRef next1 = iter1.next(); assertThat(next1, notNullValue()); // compare field value String string0 = next0.utf8ToString(); String string1 = next1.utf8ToString(); assertThat("expected: " + string0, string0, equalTo(string1)); // compare df and ttf assertThat("term: " + string0, iter0.docFreq(), equalTo(iter1.docFreq())); assertThat("term: " + string0, iter0.totalTermFreq(), equalTo(iter1.totalTermFreq())); // compare freq and docs PostingsEnum docsAndPositions0 = iter0.postings(null, null, PostingsEnum.ALL); PostingsEnum docsAndPositions1 = iter1.postings(null, null, PostingsEnum.ALL); assertThat("term: " + string0, docsAndPositions0.nextDoc(), equalTo(docsAndPositions1.nextDoc())); assertThat("term: " + string0, docsAndPositions0.freq(), equalTo(docsAndPositions1.freq())); // compare position, start offsets and end offsets for (int j = 0; j < docsAndPositions0.freq(); j++) { assertThat("term: " + string0, docsAndPositions0.nextPosition(), equalTo(docsAndPositions1.nextPosition())); assertThat("term: " + string0, docsAndPositions0.startOffset(), equalTo(docsAndPositions1.startOffset())); assertThat("term: " + string0, docsAndPositions0.endOffset(), equalTo(docsAndPositions1.endOffset())); } } assertThat(iter0.next(), nullValue()); assertThat(iter1.next(), nullValue()); }
From source file:org.elasticsearch.action.termvectors.TermVectorsFilter.java
License:Apache License
/**
 * Walks every requested field and retains only its highest-scoring terms
 * (a tf-idf style score), bounded by {@code maxNumTerms} per field. Terms
 * rejected as noise or failing the doc-frequency filter are skipped; the
 * survivors are stored in {@code scoreTerms} for quick lookup.
 */
public void selectBestTerms() throws IOException {
    PostingsEnum docsEnum = null; // reused across getTermFreq calls for all terms
    for (String fieldName : fields) {
        // honor the user-supplied field filter, if any
        if ((selectedFields != null) && (!selectedFields.contains(fieldName))) {
            continue;
        }
        Terms terms = fields.terms(fieldName);
        Terms topLevelTerms = topLevelFields.terms(fieldName);
        // if no terms found, take the retrieved term vector fields for stats
        if (topLevelTerms == null) {
            topLevelTerms = terms;
        }
        long numDocs = getDocCount(fieldName, topLevelTerms);
        // one queue per field name
        ScoreTermsQueue queue = new ScoreTermsQueue(Math.min(maxNumTerms, (int) terms.size()));
        // select terms with highest tf-idf
        TermsEnum termsEnum = terms.iterator();
        TermsEnum topLevelTermsEnum = topLevelTerms.iterator();
        while (termsEnum.next() != null) {
            BytesRef termBytesRef = termsEnum.term();
            // every term of this doc's vector must exist in the top-level stats
            boolean foundTerm = topLevelTermsEnum.seekExact(termBytesRef);
            assert foundTerm : "Term: " + termBytesRef.utf8ToString() + " not found!";
            Term term = new Term(fieldName, termBytesRef);
            // remove noise words
            int freq = getTermFreq(termsEnum, docsEnum);
            if (isNoise(term.bytes().utf8ToString(), freq)) {
                continue;
            }
            // now call on docFreq
            long docFreq = getTermStatistics(topLevelTermsEnum, term).docFreq();
            if (!isAccepted(docFreq)) {
                continue;
            }
            // filter based on score
            float score = computeScore(docFreq, freq, numDocs);
            queue.addOrUpdate(new ScoreTerm(term.field(), term.bytes().utf8ToString(), score));
        }
        // retain the best terms for quick lookups
        ScoreTerm scoreTerm;
        while ((scoreTerm = queue.pop()) != null) {
            scoreTerms.put(new Term(scoreTerm.field, scoreTerm.word), scoreTerm);
            sizes.incrementAndGet(scoreTerm.field);
        }
    }
}
From source file:org.elasticsearch.common.lucene.search.XTermsFilter.java
License:Apache License
@Override public String toString() { StringBuilder builder = new StringBuilder(); BytesRef spare = new BytesRef(termsBytes); boolean first = true; for (int i = 0; i < termsAndFields.length; i++) { TermsAndField current = termsAndFields[i]; for (int j = current.start; j < current.end; j++) { spare.offset = offsets[j];//w ww.ja v a2 s . c o m spare.length = offsets[j + 1] - offsets[j]; if (!first) { builder.append(' '); } first = false; builder.append(current.field).append(':'); builder.append(spare.utf8ToString()); } } return builder.toString(); }
From source file:org.elasticsearch.common.xcontent.BaseXContentTestCase.java
License:Apache License
/**
 * Checks that raw UTF-8 bytes written through either {@code utf8Field} or
 * {@code utf8Value} round-trip through the builder/parser pair unchanged,
 * and that a {@code null} value is rendered as JSON null.
 */
public void testBinaryUTF8() throws Exception {
    // a null BytesRef must serialize as an explicit null
    assertResult("{'utf8':null}", () -> builder().startObject().utf8Field("utf8", null).endObject());

    final BytesRef utf8Bytes = new BytesRef(randomBytes());
    XContentBuilder objectBuilder = builder().startObject();
    // randomly pick one of the two equivalent ways of writing a UTF-8 value
    if (randomBoolean()) {
        objectBuilder.utf8Field("utf8", utf8Bytes);
    } else {
        objectBuilder.field("utf8").utf8Value(utf8Bytes);
    }
    objectBuilder.endObject();

    // parse what was written and verify token stream plus decoded value
    XContentParser parser = createParser(xcontentType().xContent(), objectBuilder.bytes());
    assertSame(parser.nextToken(), Token.START_OBJECT);
    assertSame(parser.nextToken(), Token.FIELD_NAME);
    assertEquals(parser.currentName(), "utf8");
    assertTrue(parser.nextToken().isValue());
    assertThat(parser.utf8Bytes().utf8ToString(), equalTo(utf8Bytes.utf8ToString()));
    assertSame(parser.nextToken(), Token.END_OBJECT);
    assertNull(parser.nextToken());
}
From source file:org.elasticsearch.index.field.data.DocFieldData.java
License:Apache License
/**
 * Returns the current document's field value decoded as a UTF-8 string,
 * or {@code null} when the document has no value for this field.
 */
public String stringValue() {
    final BytesRef raw = fieldData.stringValue(docId);
    return raw == null ? null : raw.utf8ToString();
}
From source file:org.elasticsearch.index.field.data.strings.StringDocFieldData.java
License:Apache License
public String getValue() { BytesRef value = fieldData.value(docId); if (value == null) { return null; }//from ww w. j a v a 2 s.c o m return value.utf8ToString(); }
From source file:org.elasticsearch.index.fielddata.AbstractStringFieldDataTestCase.java
License:Apache License
/**
 * Indexes random parent/child document blocks, sorts the parents by the
 * min/max of their children's "text" values through a block-join sort, and
 * verifies the resulting order against a brute-force recomputation from the
 * children's stored fields.
 *
 * @param sortMode how multiple child values reduce to one sort key
 *                 (only MIN and MAX are meaningful here)
 */
public void testNestedSorting(MultiValueMode sortMode) throws IOException {
    final String[] values = new String[randomIntBetween(2, 20)];
    for (int i = 0; i < values.length; ++i) {
        values[i] = TestUtil.randomSimpleString(getRandom());
    }
    final int numParents = scaledRandomIntBetween(10, 3072);
    List<Document> docs = new ArrayList<>();
    FixedBitSet parents = new FixedBitSet(64);
    for (int i = 0; i < numParents; ++i) {
        docs.clear();
        final int numChildren = randomInt(4);
        for (int j = 0; j < numChildren; ++j) {
            final Document child = new Document();
            final int numValues = randomInt(3);
            for (int k = 0; k < numValues; ++k) {
                final String value = RandomPicks.randomFrom(getRandom(), values);
                addField(child, "text", value);
            }
            docs.add(child);
        }
        final Document parent = new Document();
        parent.add(new StringField("type", "parent", Store.YES));
        final String value = RandomPicks.randomFrom(getRandom(), values);
        if (value != null) {
            addField(parent, "text", value);
        }
        docs.add(parent);
        // the parent is the last doc of its block; record its docID in the bitset
        int bit = parents.prevSetBit(parents.length() - 1) + docs.size();
        parents = FixedBitSet.ensureCapacity(parents, bit);
        parents.set(bit);
        writer.addDocuments(docs);
        if (randomInt(10) == 0) {
            writer.commit(); // occasionally commit to get multiple segments
        }
    }
    DirectoryReader directoryReader = DirectoryReader.open(writer, true);
    directoryReader = ElasticsearchDirectoryReader.wrap(directoryReader, new ShardId(new Index("test"), 0));
    IndexSearcher searcher = new IndexSearcher(directoryReader);
    IndexFieldData<?> fieldData = getForField("text");
    // randomly exercise the supported missing-value policies
    final Object missingValue;
    switch (randomInt(4)) {
    case 0:
        missingValue = "_first";
        break;
    case 1:
        missingValue = "_last";
        break;
    case 2:
        missingValue = new BytesRef(RandomPicks.randomFrom(getRandom(), values));
        break;
    default:
        missingValue = new BytesRef(TestUtil.randomSimpleString(getRandom()));
        break;
    }
    Query parentFilter = new TermQuery(new Term("type", "parent"));
    Query childFilter = Queries.not(parentFilter);
    Nested nested = createNested(searcher, parentFilter, childFilter);
    BytesRefFieldComparatorSource nestedComparatorSource = new BytesRefFieldComparatorSource(fieldData,
            missingValue, sortMode, nested);
    ToParentBlockJoinQuery query = new ToParentBlockJoinQuery(new ConstantScoreQuery(childFilter),
            new QueryBitSetProducer(parentFilter), ScoreMode.None);
    Sort sort = new Sort(new SortField("text", nestedComparatorSource));
    TopFieldDocs topDocs = searcher.search(query, randomIntBetween(1, numParents), sort);
    assertTrue(topDocs.scoreDocs.length > 0);
    BytesRef previous = null;
    for (int i = 0; i < topDocs.scoreDocs.length; ++i) {
        final int docID = topDocs.scoreDocs[i].doc;
        assertTrue("expected " + docID + " to be a parent", parents.get(docID));
        // recompute the expected sort key for this parent from its children's stored values
        BytesRef cmpValue = null;
        for (int child = parents.prevSetBit(docID - 1) + 1; child < docID; ++child) {
            String[] sVals = searcher.doc(child).getValues("text");
            final BytesRef[] vals;
            if (sVals.length == 0) {
                vals = new BytesRef[0];
            } else {
                vals = new BytesRef[sVals.length];
                for (int j = 0; j < vals.length; ++j) {
                    vals[j] = new BytesRef(sVals[j]);
                }
            }
            for (BytesRef value : vals) {
                if (cmpValue == null) {
                    cmpValue = value;
                } else if (sortMode == MultiValueMode.MIN && value.compareTo(cmpValue) < 0) {
                    cmpValue = value;
                } else if (sortMode == MultiValueMode.MAX && value.compareTo(cmpValue) > 0) {
                    cmpValue = value;
                }
            }
        }
        // a block without values sorts according to the configured missing-value policy
        if (cmpValue == null) {
            if ("_first".equals(missingValue)) {
                cmpValue = new BytesRef();
            } else if ("_last".equals(missingValue) == false) {
                cmpValue = (BytesRef) missingValue;
            }
            // "_last" leaves cmpValue null: its position cannot be checked by comparison
        }
        if (previous != null && cmpValue != null) {
            assertTrue(previous.utf8ToString() + " / " + cmpValue.utf8ToString(),
                    previous.compareTo(cmpValue) <= 0);
        }
        previous = cmpValue;
    }
    searcher.getIndexReader().close();
}
From source file:org.elasticsearch.index.fielddata.AbstractStringFieldDataTests.java
License:Apache License
/**
 * Older variant of the nested-sorting test (pre-5.x APIs: SortMode, OpenBitSet,
 * Filter-based joins): indexes random parent/child blocks, sorts the parents
 * via a NestedFieldComparatorSource and checks the resulting order against a
 * brute-force recomputation from the children's stored fields.
 *
 * @param sortMode how multiple child values reduce to one sort key
 */
public void testNestedSorting(SortMode sortMode) throws IOException {
    final String[] values = new String[randomIntBetween(2, 20)];
    for (int i = 0; i < values.length; ++i) {
        values[i] = _TestUtil.randomSimpleString(getRandom());
    }
    final int numParents = atLeast(100);
    List<Document> docs = new ArrayList<Document>();
    final OpenBitSet parents = new OpenBitSet();
    for (int i = 0; i < numParents; ++i) {
        docs.clear();
        final int numChildren = randomInt(4);
        for (int j = 0; j < numChildren; ++j) {
            final Document child = new Document();
            final int numValues = randomInt(3);
            for (int k = 0; k < numValues; ++k) {
                final String value = RandomPicks.randomFrom(getRandom(), values);
                child.add(new StringField("text", value, Store.YES));
            }
            docs.add(child);
        }
        final Document parent = new Document();
        parent.add(new StringField("type", "parent", Store.YES));
        final String value = RandomPicks.randomFrom(getRandom(), values);
        if (value != null) {
            parent.add(new StringField("text", value, Store.YES));
        }
        docs.add(parent);
        // the parent is the last doc of its block; record its docID in the bitset
        parents.set(parents.prevSetBit(parents.length() - 1) + docs.size());
        writer.addDocuments(docs);
        if (randomInt(10) == 0) {
            writer.commit(); // occasionally commit to get multiple segments
        }
    }
    IndexSearcher searcher = new IndexSearcher(DirectoryReader.open(writer, true));
    IndexFieldData<?> fieldData = getForField("text");
    // randomly exercise the supported missing-value choices
    final BytesRef missingValue;
    switch (randomInt(4)) {
    case 0:
        missingValue = new BytesRef();
        break;
    case 1:
        missingValue = BytesRefFieldComparatorSource.MAX_TERM;
        break;
    case 2:
        missingValue = new BytesRef(RandomPicks.randomFrom(getRandom(), values));
        break;
    default:
        missingValue = new BytesRef(_TestUtil.randomSimpleString(getRandom()));
        break;
    }
    BytesRefFieldComparatorSource innerSource = new BytesRefFieldComparatorSource(fieldData, missingValue,
            sortMode);
    Filter parentFilter = new TermFilter(new Term("type", "parent"));
    Filter childFilter = new NotFilter(parentFilter);
    NestedFieldComparatorSource nestedComparatorSource = new NestedFieldComparatorSource(sortMode, innerSource,
            parentFilter, childFilter);
    ToParentBlockJoinQuery query = new ToParentBlockJoinQuery(
            new XFilteredQuery(new MatchAllDocsQuery(), childFilter),
            new FixedBitSetCachingWrapperFilter(parentFilter), ScoreMode.None);
    Sort sort = new Sort(new SortField("text", nestedComparatorSource));
    TopFieldDocs topDocs = searcher.search(query, randomIntBetween(1, numParents), sort);
    assertTrue(topDocs.scoreDocs.length > 0);
    BytesRef previous = null;
    for (int i = 0; i < topDocs.scoreDocs.length; ++i) {
        final int docID = topDocs.scoreDocs[i].doc;
        assertTrue("expected " + docID + " to be a parent", parents.get(docID));
        // recompute the expected sort key for this parent from its children's stored values
        BytesRef cmpValue = null;
        for (int child = parents.prevSetBit(docID - 1) + 1; child < docID; ++child) {
            String[] vals = searcher.doc(child).getValues("text");
            // a child with no values contributes the missing value
            if (vals.length == 0) {
                vals = new String[] { missingValue.utf8ToString() };
            }
            for (String value : vals) {
                final BytesRef bytesValue = new BytesRef(value);
                if (cmpValue == null) {
                    cmpValue = bytesValue;
                } else if (sortMode == SortMode.MIN && bytesValue.compareTo(cmpValue) < 0) {
                    cmpValue = bytesValue;
                } else if (sortMode == SortMode.MAX && bytesValue.compareTo(cmpValue) > 0) {
                    cmpValue = bytesValue;
                }
            }
        }
        // a block without children sorts by the missing value
        if (cmpValue == null) {
            cmpValue = missingValue;
        }
        if (previous != null) {
            assertNotNull(cmpValue);
            assertTrue(previous.utf8ToString() + " / " + cmpValue.utf8ToString(),
                    previous.compareTo(cmpValue) <= 0);
        }
        previous = cmpValue;
    }
    searcher.getIndexReader().close();
}
From source file:org.elasticsearch.index.percolator.QueriesLoaderCollector.java
License:Apache License
/**
 * Collects one percolator document: loads its id and stored _source, parses
 * the _source into a {@link Query} and registers it in {@code queries} keyed
 * by the id's hashed bytes. Parse failures are logged and skipped rather than
 * propagated, so one bad query does not abort collection.
 */
@Override
public void collect(int doc) throws IOException {
    // the _source is the query
    if (idValues.setDocument(doc) > 0) {
        BytesRef id = idValues.nextValue();
        fieldsVisitor.reset();
        reader.document(doc, fieldsVisitor);
        try {
            // id is only used for logging, if we fail we log the id in the catch statement
            final Query parseQuery = percolator.parsePercolatorDocument(null, fieldsVisitor.source());
            if (parseQuery != null) {
                queries.put(new HashedBytesRef(idValues.copyShared(), idValues.currentValueHash()), parseQuery);
            } else {
                logger.warn("failed to add query [{}] - parser returned null", id);
            }
        } catch (Exception e) {
            logger.warn("failed to add query [{}]", e, id.utf8ToString());
        }
    }
}