List of usage examples for org.apache.lucene.util BytesRef utf8ToString
public String utf8ToString()
From source file:org.elasticsearch.action.termvectors.GetTermVectorsTests.java
License:Apache License
private void checkBrownFoxTermVector(Fields fields, String fieldName, boolean withPayloads) throws ElasticsearchException, IOException { String[] values = { "brown", "dog", "fox", "jumps", "lazy", "over", "quick", "the" }; int[] freq = { 1, 1, 1, 1, 1, 1, 1, 2 }; int[][] pos = { { 2 }, { 8 }, { 3 }, { 4 }, { 7 }, { 5 }, { 1 }, { 0, 6 } }; int[][] startOffset = { { 10 }, { 40 }, { 16 }, { 20 }, { 35 }, { 26 }, { 4 }, { 0, 31 } }; int[][] endOffset = { { 15 }, { 43 }, { 19 }, { 25 }, { 39 }, { 30 }, { 9 }, { 3, 34 } }; Terms terms = fields.terms(fieldName); assertThat(terms.size(), equalTo(8l)); TermsEnum iterator = terms.iterator(); for (int j = 0; j < values.length; j++) { String string = values[j]; BytesRef next = iterator.next(); assertThat(next, notNullValue()); assertThat("expected " + string, string, equalTo(next.utf8ToString())); assertThat(next, notNullValue()); // do not test ttf or doc frequency, because here we have many // shards and do not know how documents are distributed PostingsEnum docsAndPositions = iterator.postings(null, null, PostingsEnum.ALL); assertThat(docsAndPositions.nextDoc(), equalTo(0)); assertThat(freq[j], equalTo(docsAndPositions.freq())); int[] termPos = pos[j]; int[] termStartOffset = startOffset[j]; int[] termEndOffset = endOffset[j]; assertThat(termPos.length, equalTo(freq[j])); assertThat(termStartOffset.length, equalTo(freq[j])); assertThat(termEndOffset.length, equalTo(freq[j])); for (int k = 0; k < freq[j]; k++) { int nextPosition = docsAndPositions.nextPosition(); assertThat("term: " + string, nextPosition, equalTo(termPos[k])); assertThat("term: " + string, docsAndPositions.startOffset(), equalTo(termStartOffset[k])); assertThat("term: " + string, docsAndPositions.endOffset(), equalTo(termEndOffset[k])); if (withPayloads) { assertThat("term: " + string, docsAndPositions.getPayload(), equalTo(new BytesRef("word"))); }// www. j a v a2 s .c o m } } assertThat(iterator.next(), nullValue()); }
From source file:org.elasticsearch.action.termvectors.GetTermVectorsTests.java
License:Apache License
private void compareTermVectors(String fieldName, Fields fields0, Fields fields1) throws IOException { Terms terms0 = fields0.terms(fieldName); Terms terms1 = fields1.terms(fieldName); assertThat(terms0, notNullValue());/* w ww. ja v a 2 s. co m*/ assertThat(terms1, notNullValue()); assertThat(terms0.size(), equalTo(terms1.size())); TermsEnum iter0 = terms0.iterator(); TermsEnum iter1 = terms1.iterator(); for (int i = 0; i < terms0.size(); i++) { BytesRef next0 = iter0.next(); assertThat(next0, notNullValue()); BytesRef next1 = iter1.next(); assertThat(next1, notNullValue()); // compare field value String string0 = next0.utf8ToString(); String string1 = next1.utf8ToString(); assertThat("expected: " + string0, string0, equalTo(string1)); // compare df and ttf assertThat("term: " + string0, iter0.docFreq(), equalTo(iter1.docFreq())); assertThat("term: " + string0, iter0.totalTermFreq(), equalTo(iter1.totalTermFreq())); // compare freq and docs PostingsEnum docsAndPositions0 = iter0.postings(null, null, PostingsEnum.ALL); PostingsEnum docsAndPositions1 = iter1.postings(null, null, PostingsEnum.ALL); assertThat("term: " + string0, docsAndPositions0.nextDoc(), equalTo(docsAndPositions1.nextDoc())); assertThat("term: " + string0, docsAndPositions0.freq(), equalTo(docsAndPositions1.freq())); // compare position, start offsets and end offsets for (int j = 0; j < docsAndPositions0.freq(); j++) { assertThat("term: " + string0, docsAndPositions0.nextPosition(), equalTo(docsAndPositions1.nextPosition())); assertThat("term: " + string0, docsAndPositions0.startOffset(), equalTo(docsAndPositions1.startOffset())); assertThat("term: " + string0, docsAndPositions0.endOffset(), equalTo(docsAndPositions1.endOffset())); } } assertThat(iter0.next(), nullValue()); assertThat(iter1.next(), nullValue()); }
From source file:org.elasticsearch.action.termvectors.TermVectorsFilter.java
License:Apache License
/**
 * Walks every requested field and retains only its highest-scoring terms
 * (a tf-idf style score), bounded by {@code maxNumTerms} per field. Terms
 * rejected as noise or failing the doc-frequency filter are skipped; the
 * survivors are stored in {@code scoreTerms} for quick lookup.
 */
public void selectBestTerms() throws IOException {
    PostingsEnum docsEnum = null; // reused across getTermFreq calls for all terms
    for (String fieldName : fields) {
        // honor the user-supplied field filter, if any
        if ((selectedFields != null) && (!selectedFields.contains(fieldName))) {
            continue;
        }
        Terms terms = fields.terms(fieldName);
        Terms topLevelTerms = topLevelFields.terms(fieldName);
        // if no terms found, take the retrieved term vector fields for stats
        if (topLevelTerms == null) {
            topLevelTerms = terms;
        }
        long numDocs = getDocCount(fieldName, topLevelTerms);
        // one queue per field name
        ScoreTermsQueue queue = new ScoreTermsQueue(Math.min(maxNumTerms, (int) terms.size()));
        // select terms with highest tf-idf
        TermsEnum termsEnum = terms.iterator();
        TermsEnum topLevelTermsEnum = topLevelTerms.iterator();
        while (termsEnum.next() != null) {
            BytesRef termBytesRef = termsEnum.term();
            // every term of this doc's vector must exist in the top-level stats
            boolean foundTerm = topLevelTermsEnum.seekExact(termBytesRef);
            assert foundTerm : "Term: " + termBytesRef.utf8ToString() + " not found!";
            Term term = new Term(fieldName, termBytesRef);
            // remove noise words
            int freq = getTermFreq(termsEnum, docsEnum);
            if (isNoise(term.bytes().utf8ToString(), freq)) {
                continue;
            }
            // now call on docFreq
            long docFreq = getTermStatistics(topLevelTermsEnum, term).docFreq();
            if (!isAccepted(docFreq)) {
                continue;
            }
            // filter based on score
            float score = computeScore(docFreq, freq, numDocs);
            queue.addOrUpdate(new ScoreTerm(term.field(), term.bytes().utf8ToString(), score));
        }
        // retain the best terms for quick lookups
        ScoreTerm scoreTerm;
        while ((scoreTerm = queue.pop()) != null) {
            scoreTerms.put(new Term(scoreTerm.field, scoreTerm.word), scoreTerm);
            sizes.incrementAndGet(scoreTerm.field);
        }
    }
}
From source file:org.elasticsearch.common.lucene.search.XTermsFilter.java
License:Apache License
@Override public String toString() { StringBuilder builder = new StringBuilder(); BytesRef spare = new BytesRef(termsBytes); boolean first = true; for (int i = 0; i < termsAndFields.length; i++) { TermsAndField current = termsAndFields[i]; for (int j = current.start; j < current.end; j++) { spare.offset = offsets[j];//w ww.ja v a2 s . c o m spare.length = offsets[j + 1] - offsets[j]; if (!first) { builder.append(' '); } first = false; builder.append(current.field).append(':'); builder.append(spare.utf8ToString()); } } return builder.toString(); }
From source file:org.elasticsearch.common.xcontent.BaseXContentTestCase.java
License:Apache License
/**
 * Checks that raw UTF-8 bytes written through either {@code utf8Field} or
 * {@code utf8Value} round-trip through the builder/parser pair unchanged,
 * and that a {@code null} value is rendered as JSON null.
 */
public void testBinaryUTF8() throws Exception {
    // a null BytesRef must serialize as an explicit null
    assertResult("{'utf8':null}", () -> builder().startObject().utf8Field("utf8", null).endObject());

    final BytesRef utf8Bytes = new BytesRef(randomBytes());
    XContentBuilder objectBuilder = builder().startObject();
    // randomly pick one of the two equivalent ways of writing a UTF-8 value
    if (randomBoolean()) {
        objectBuilder.utf8Field("utf8", utf8Bytes);
    } else {
        objectBuilder.field("utf8").utf8Value(utf8Bytes);
    }
    objectBuilder.endObject();

    // parse what was written and verify token stream plus decoded value
    XContentParser parser = createParser(xcontentType().xContent(), objectBuilder.bytes());
    assertSame(parser.nextToken(), Token.START_OBJECT);
    assertSame(parser.nextToken(), Token.FIELD_NAME);
    assertEquals(parser.currentName(), "utf8");
    assertTrue(parser.nextToken().isValue());
    assertThat(parser.utf8Bytes().utf8ToString(), equalTo(utf8Bytes.utf8ToString()));
    assertSame(parser.nextToken(), Token.END_OBJECT);
    assertNull(parser.nextToken());
}
From source file:org.elasticsearch.index.field.data.DocFieldData.java
License:Apache License
/**
 * Returns the current document's field value decoded as a UTF-8 string,
 * or {@code null} when the document has no value for this field.
 */
public String stringValue() {
    final BytesRef raw = fieldData.stringValue(docId);
    return raw == null ? null : raw.utf8ToString();
}
From source file:org.elasticsearch.index.field.data.strings.StringDocFieldData.java
License:Apache License
public String getValue() { BytesRef value = fieldData.value(docId); if (value == null) { return null; }//from ww w. j a v a 2 s.c o m return value.utf8ToString(); }
From source file:org.elasticsearch.index.fielddata.AbstractStringFieldDataTestCase.java
License:Apache License
/**
 * Indexes random parent/child document blocks, sorts the parents by the
 * min/max of their children's "text" values through a block-join sort, and
 * verifies the resulting order against a brute-force recomputation from the
 * children's stored fields.
 *
 * @param sortMode how multiple child values reduce to one sort key
 *                 (only MIN and MAX are meaningful here)
 */
public void testNestedSorting(MultiValueMode sortMode) throws IOException {
    final String[] values = new String[randomIntBetween(2, 20)];
    for (int i = 0; i < values.length; ++i) {
        values[i] = TestUtil.randomSimpleString(getRandom());
    }
    final int numParents = scaledRandomIntBetween(10, 3072);
    List<Document> docs = new ArrayList<>();
    FixedBitSet parents = new FixedBitSet(64);
    for (int i = 0; i < numParents; ++i) {
        docs.clear();
        final int numChildren = randomInt(4);
        for (int j = 0; j < numChildren; ++j) {
            final Document child = new Document();
            final int numValues = randomInt(3);
            for (int k = 0; k < numValues; ++k) {
                final String value = RandomPicks.randomFrom(getRandom(), values);
                addField(child, "text", value);
            }
            docs.add(child);
        }
        final Document parent = new Document();
        parent.add(new StringField("type", "parent", Store.YES));
        final String value = RandomPicks.randomFrom(getRandom(), values);
        if (value != null) {
            addField(parent, "text", value);
        }
        docs.add(parent);
        // the parent is the last doc of its block; record its docID in the bitset
        int bit = parents.prevSetBit(parents.length() - 1) + docs.size();
        parents = FixedBitSet.ensureCapacity(parents, bit);
        parents.set(bit);
        writer.addDocuments(docs);
        if (randomInt(10) == 0) {
            writer.commit(); // occasionally commit to get multiple segments
        }
    }
    DirectoryReader directoryReader = DirectoryReader.open(writer, true);
    directoryReader = ElasticsearchDirectoryReader.wrap(directoryReader, new ShardId(new Index("test"), 0));
    IndexSearcher searcher = new IndexSearcher(directoryReader);
    IndexFieldData<?> fieldData = getForField("text");
    // randomly exercise the supported missing-value policies
    final Object missingValue;
    switch (randomInt(4)) {
    case 0:
        missingValue = "_first";
        break;
    case 1:
        missingValue = "_last";
        break;
    case 2:
        missingValue = new BytesRef(RandomPicks.randomFrom(getRandom(), values));
        break;
    default:
        missingValue = new BytesRef(TestUtil.randomSimpleString(getRandom()));
        break;
    }
    Query parentFilter = new TermQuery(new Term("type", "parent"));
    Query childFilter = Queries.not(parentFilter);
    Nested nested = createNested(searcher, parentFilter, childFilter);
    BytesRefFieldComparatorSource nestedComparatorSource = new BytesRefFieldComparatorSource(fieldData,
            missingValue, sortMode, nested);
    ToParentBlockJoinQuery query = new ToParentBlockJoinQuery(new ConstantScoreQuery(childFilter),
            new QueryBitSetProducer(parentFilter), ScoreMode.None);
    Sort sort = new Sort(new SortField("text", nestedComparatorSource));
    TopFieldDocs topDocs = searcher.search(query, randomIntBetween(1, numParents), sort);
    assertTrue(topDocs.scoreDocs.length > 0);
    BytesRef previous = null;
    for (int i = 0; i < topDocs.scoreDocs.length; ++i) {
        final int docID = topDocs.scoreDocs[i].doc;
        assertTrue("expected " + docID + " to be a parent", parents.get(docID));
        // recompute the expected sort key for this parent from its children's stored values
        BytesRef cmpValue = null;
        for (int child = parents.prevSetBit(docID - 1) + 1; child < docID; ++child) {
            String[] sVals = searcher.doc(child).getValues("text");
            final BytesRef[] vals;
            if (sVals.length == 0) {
                vals = new BytesRef[0];
            } else {
                vals = new BytesRef[sVals.length];
                for (int j = 0; j < vals.length; ++j) {
                    vals[j] = new BytesRef(sVals[j]);
                }
            }
            for (BytesRef value : vals) {
                if (cmpValue == null) {
                    cmpValue = value;
                } else if (sortMode == MultiValueMode.MIN && value.compareTo(cmpValue) < 0) {
                    cmpValue = value;
                } else if (sortMode == MultiValueMode.MAX && value.compareTo(cmpValue) > 0) {
                    cmpValue = value;
                }
            }
        }
        // a block without values sorts according to the configured missing-value policy
        if (cmpValue == null) {
            if ("_first".equals(missingValue)) {
                cmpValue = new BytesRef();
            } else if ("_last".equals(missingValue) == false) {
                cmpValue = (BytesRef) missingValue;
            }
            // "_last" leaves cmpValue null: its position cannot be checked by comparison
        }
        if (previous != null && cmpValue != null) {
            assertTrue(previous.utf8ToString() + " / " + cmpValue.utf8ToString(),
                    previous.compareTo(cmpValue) <= 0);
        }
        previous = cmpValue;
    }
    searcher.getIndexReader().close();
}
From source file:org.elasticsearch.index.fielddata.AbstractStringFieldDataTests.java
License:Apache License
/**
 * Older variant of the nested-sorting test (pre-5.x APIs: SortMode, OpenBitSet,
 * Filter-based joins): indexes random parent/child blocks, sorts the parents
 * via a NestedFieldComparatorSource and checks the resulting order against a
 * brute-force recomputation from the children's stored fields.
 *
 * @param sortMode how multiple child values reduce to one sort key
 */
public void testNestedSorting(SortMode sortMode) throws IOException {
    final String[] values = new String[randomIntBetween(2, 20)];
    for (int i = 0; i < values.length; ++i) {
        values[i] = _TestUtil.randomSimpleString(getRandom());
    }
    final int numParents = atLeast(100);
    List<Document> docs = new ArrayList<Document>();
    final OpenBitSet parents = new OpenBitSet();
    for (int i = 0; i < numParents; ++i) {
        docs.clear();
        final int numChildren = randomInt(4);
        for (int j = 0; j < numChildren; ++j) {
            final Document child = new Document();
            final int numValues = randomInt(3);
            for (int k = 0; k < numValues; ++k) {
                final String value = RandomPicks.randomFrom(getRandom(), values);
                child.add(new StringField("text", value, Store.YES));
            }
            docs.add(child);
        }
        final Document parent = new Document();
        parent.add(new StringField("type", "parent", Store.YES));
        final String value = RandomPicks.randomFrom(getRandom(), values);
        if (value != null) {
            parent.add(new StringField("text", value, Store.YES));
        }
        docs.add(parent);
        // the parent is the last doc of its block; record its docID in the bitset
        parents.set(parents.prevSetBit(parents.length() - 1) + docs.size());
        writer.addDocuments(docs);
        if (randomInt(10) == 0) {
            writer.commit(); // occasionally commit to get multiple segments
        }
    }
    IndexSearcher searcher = new IndexSearcher(DirectoryReader.open(writer, true));
    IndexFieldData<?> fieldData = getForField("text");
    // randomly exercise the supported missing-value choices
    final BytesRef missingValue;
    switch (randomInt(4)) {
    case 0:
        missingValue = new BytesRef();
        break;
    case 1:
        missingValue = BytesRefFieldComparatorSource.MAX_TERM;
        break;
    case 2:
        missingValue = new BytesRef(RandomPicks.randomFrom(getRandom(), values));
        break;
    default:
        missingValue = new BytesRef(_TestUtil.randomSimpleString(getRandom()));
        break;
    }
    BytesRefFieldComparatorSource innerSource = new BytesRefFieldComparatorSource(fieldData, missingValue,
            sortMode);
    Filter parentFilter = new TermFilter(new Term("type", "parent"));
    Filter childFilter = new NotFilter(parentFilter);
    NestedFieldComparatorSource nestedComparatorSource = new NestedFieldComparatorSource(sortMode, innerSource,
            parentFilter, childFilter);
    ToParentBlockJoinQuery query = new ToParentBlockJoinQuery(
            new XFilteredQuery(new MatchAllDocsQuery(), childFilter),
            new FixedBitSetCachingWrapperFilter(parentFilter), ScoreMode.None);
    Sort sort = new Sort(new SortField("text", nestedComparatorSource));
    TopFieldDocs topDocs = searcher.search(query, randomIntBetween(1, numParents), sort);
    assertTrue(topDocs.scoreDocs.length > 0);
    BytesRef previous = null;
    for (int i = 0; i < topDocs.scoreDocs.length; ++i) {
        final int docID = topDocs.scoreDocs[i].doc;
        assertTrue("expected " + docID + " to be a parent", parents.get(docID));
        // recompute the expected sort key for this parent from its children's stored values
        BytesRef cmpValue = null;
        for (int child = parents.prevSetBit(docID - 1) + 1; child < docID; ++child) {
            String[] vals = searcher.doc(child).getValues("text");
            // a child with no values contributes the missing value
            if (vals.length == 0) {
                vals = new String[] { missingValue.utf8ToString() };
            }
            for (String value : vals) {
                final BytesRef bytesValue = new BytesRef(value);
                if (cmpValue == null) {
                    cmpValue = bytesValue;
                } else if (sortMode == SortMode.MIN && bytesValue.compareTo(cmpValue) < 0) {
                    cmpValue = bytesValue;
                } else if (sortMode == SortMode.MAX && bytesValue.compareTo(cmpValue) > 0) {
                    cmpValue = bytesValue;
                }
            }
        }
        // a block without children sorts by the missing value
        if (cmpValue == null) {
            cmpValue = missingValue;
        }
        if (previous != null) {
            assertNotNull(cmpValue);
            assertTrue(previous.utf8ToString() + " / " + cmpValue.utf8ToString(),
                    previous.compareTo(cmpValue) <= 0);
        }
        previous = cmpValue;
    }
    searcher.getIndexReader().close();
}
From source file:org.elasticsearch.index.percolator.QueriesLoaderCollector.java
License:Apache License
/**
 * Collects one percolator document: loads its id and stored _source, parses
 * the _source into a {@link Query} and registers it in {@code queries} keyed
 * by the id's hashed bytes. Parse failures are logged and skipped rather than
 * propagated, so one bad query does not abort collection.
 */
@Override
public void collect(int doc) throws IOException {
    // the _source is the query
    if (idValues.setDocument(doc) > 0) {
        BytesRef id = idValues.nextValue();
        fieldsVisitor.reset();
        reader.document(doc, fieldsVisitor);
        try {
            // id is only used for logging, if we fail we log the id in the catch statement
            final Query parseQuery = percolator.parsePercolatorDocument(null, fieldsVisitor.source());
            if (parseQuery != null) {
                queries.put(new HashedBytesRef(idValues.copyShared(), idValues.currentValueHash()), parseQuery);
            } else {
                logger.warn("failed to add query [{}] - parser returned null", id);
            }
        } catch (Exception e) {
            logger.warn("failed to add query [{}]", e, id.utf8ToString());
        }
    }
}