List of usage examples for org.apache.lucene.util.BytesRef.deepCopyOf

public static BytesRef deepCopyOf(BytesRef other)

Parameter: other - the BytesRef to copy.
Returns a new BytesRef containing a copy of the bytes from other. The returned BytesRef will have a length of other.length and an offset of zero.
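Before the per-project examples, here is a minimal, self-contained sketch of the typical reason for calling deepCopyOf: enumeration APIs such as TermsEnum.next() may reuse a shared BytesRef across calls, so a deep copy is required before a value is kept beyond the current iteration. The class and method names in this sketch are illustrative and not taken from the examples below.

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;

// Illustrative helper: collect all terms from a TermsEnum into a list.
public class DeepCopyExample {
    static List<BytesRef> collectTerms(TermsEnum termsEnum) throws IOException {
        List<BytesRef> collected = new ArrayList<>();
        BytesRef term;
        while ((term = termsEnum.next()) != null) {
            // deepCopyOf allocates a fresh byte[] with offset 0 and length term.length,
            // so the stored value is not clobbered by the next call to next().
            collected.add(BytesRef.deepCopyOf(term));
        }
        return collected;
    }
}

Without the deep copy, every element of the list would end up pointing at the enum's reused buffer; the examples below apply the same pattern to lookupOrd(), BytesRefHash-owned terms, and analysis token streams.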
From source file:org.apache.solr.uninverting.TestFieldCache.java
License:Apache License
public void test() throws IOException {
    FieldCache cache = FieldCache.DEFAULT;
    NumericDocValues doubles = cache.getNumerics(reader, "theDouble", FieldCache.DOUBLE_POINT_PARSER);
    for (int i = 0; i < NUM_DOCS; i++) {
        assertEquals(i, doubles.nextDoc());
        assertEquals(Double.doubleToLongBits(Double.MAX_VALUE - i), doubles.longValue());
    }

    NumericDocValues longs = cache.getNumerics(reader, "theLong", FieldCache.LONG_POINT_PARSER);
    for (int i = 0; i < NUM_DOCS; i++) {
        assertEquals(i, longs.nextDoc());
        assertEquals(Long.MAX_VALUE - i, longs.longValue());
    }

    NumericDocValues ints = cache.getNumerics(reader, "theInt", FieldCache.INT_POINT_PARSER);
    for (int i = 0; i < NUM_DOCS; i++) {
        assertEquals(i, ints.nextDoc());
        assertEquals(Integer.MAX_VALUE - i, ints.longValue());
    }

    NumericDocValues floats = cache.getNumerics(reader, "theFloat", FieldCache.FLOAT_POINT_PARSER);
    for (int i = 0; i < NUM_DOCS; i++) {
        assertEquals(i, floats.nextDoc());
        assertEquals(Float.floatToIntBits(Float.MAX_VALUE - i), floats.longValue());
    }

    Bits docsWithField = cache.getDocsWithField(reader, "theLong", FieldCache.LONG_POINT_PARSER);
    assertSame("Second request to cache return same array", docsWithField,
            cache.getDocsWithField(reader, "theLong", FieldCache.LONG_POINT_PARSER));
    assertTrue("docsWithField(theLong) must be class Bits.MatchAllBits",
            docsWithField instanceof Bits.MatchAllBits);
    assertTrue("docsWithField(theLong) Size: " + docsWithField.length() + " is not: " + NUM_DOCS,
            docsWithField.length() == NUM_DOCS);
    for (int i = 0; i < docsWithField.length(); i++) {
        assertTrue(docsWithField.get(i));
    }

    docsWithField = cache.getDocsWithField(reader, "sparse", FieldCache.INT_POINT_PARSER);
    assertSame("Second request to cache return same array", docsWithField,
            cache.getDocsWithField(reader, "sparse", FieldCache.INT_POINT_PARSER));
    assertFalse("docsWithField(sparse) must not be class Bits.MatchAllBits",
            docsWithField instanceof Bits.MatchAllBits);
    assertTrue("docsWithField(sparse) Size: " + docsWithField.length() + " is not: " + NUM_DOCS,
            docsWithField.length() == NUM_DOCS);
    for (int i = 0; i < docsWithField.length(); i++) {
        assertEquals(i % 2 == 0, docsWithField.get(i));
    }

    // getTermsIndex
    SortedDocValues termsIndex = cache.getTermsIndex(reader, "theRandomUnicodeString");
    for (int i = 0; i < NUM_DOCS; i++) {
        final String s;
        if (i > termsIndex.docID()) {
            termsIndex.advance(i);
        }
        if (i == termsIndex.docID()) {
            s = termsIndex.binaryValue().utf8ToString();
        } else {
            s = null;
        }
        assertTrue("for doc " + i + ": " + s + " does not equal: " + unicodeStrings[i],
                unicodeStrings[i] == null || unicodeStrings[i].equals(s));
    }

    int nTerms = termsIndex.getValueCount();

    TermsEnum tenum = termsIndex.termsEnum();
    for (int i = 0; i < nTerms; i++) {
        BytesRef val1 = BytesRef.deepCopyOf(tenum.next());
        final BytesRef val = termsIndex.lookupOrd(i);
        // System.out.println("i="+i);
        assertEquals(val, val1);
    }

    // seek the enum around (note this isn't a great test here)
    int num = atLeast(100);
    for (int i = 0; i < num; i++) {
        int k = random().nextInt(nTerms);
        final BytesRef val = BytesRef.deepCopyOf(termsIndex.lookupOrd(k));
        assertEquals(TermsEnum.SeekStatus.FOUND, tenum.seekCeil(val));
        assertEquals(val, tenum.term());
    }

    for (int i = 0; i < nTerms; i++) {
        final BytesRef val = BytesRef.deepCopyOf(termsIndex.lookupOrd(i));
        assertEquals(TermsEnum.SeekStatus.FOUND, tenum.seekCeil(val));
        assertEquals(val, tenum.term());
    }

    // test bad field
    termsIndex = cache.getTermsIndex(reader, "bogusfield");

    // getTerms
    BinaryDocValues terms = cache.getTerms(reader, "theRandomUnicodeString");
    for (int i = 0; i < NUM_DOCS; i++) {
        if (terms.docID() < i) {
            terms.nextDoc();
        }
        if (terms.docID() == i) {
            assertEquals(unicodeStrings[i], terms.binaryValue().utf8ToString());
        } else {
            assertNull(unicodeStrings[i]);
        }
    }

    // test bad field
    terms = cache.getTerms(reader, "bogusfield");

    // getDocTermOrds
    SortedSetDocValues termOrds = cache.getDocTermOrds(reader, "theRandomUnicodeMultiValuedField", null);
    int numEntries = cache.getCacheEntries().length;
    // ask for it again, and check that we didnt create any additional entries:
    termOrds = cache.getDocTermOrds(reader, "theRandomUnicodeMultiValuedField", null);
    assertEquals(numEntries, cache.getCacheEntries().length);

    for (int i = 0; i < NUM_DOCS; i++) {
        // This will remove identical terms. A DocTermOrds doesn't return duplicate ords for a docId
        List<BytesRef> values = new ArrayList<>(new LinkedHashSet<>(Arrays.asList(multiValued[i])));
        for (BytesRef v : values) {
            if (v == null) {
                // why does this test use null values... instead of an empty list: confusing
                break;
            }
            if (i > termOrds.docID()) {
                assertEquals(i, termOrds.nextDoc());
            }
            long ord = termOrds.nextOrd();
            assert ord != SortedSetDocValues.NO_MORE_ORDS;
            BytesRef scratch = termOrds.lookupOrd(ord);
            assertEquals(v, scratch);
        }
        if (i == termOrds.docID()) {
            assertEquals(SortedSetDocValues.NO_MORE_ORDS, termOrds.nextOrd());
        }
    }

    // test bad field
    termOrds = cache.getDocTermOrds(reader, "bogusfield", null);
    assertTrue(termOrds.getValueCount() == 0);

    FieldCache.DEFAULT.purgeByCacheKey(reader.getCoreCacheKey());
}
From source file:org.apache.solr.uninverting.TestFieldCacheVsDocValues.java
License:Apache License
private void assertEquals(int maxDoc, SortedDocValues expected, SortedDocValues actual) throws Exception {
    // can be null for the segment if no docs actually had any SortedDocValues
    // in this case FC.getDocTermsOrds returns EMPTY
    if (actual == null) {
        assertEquals(expected.getValueCount(), 0);
        return;
    }
    assertEquals(expected.getValueCount(), actual.getValueCount());

    // compare ord lists
    while (true) {
        int docID = expected.nextDoc();
        if (docID == NO_MORE_DOCS) {
            assertEquals(NO_MORE_DOCS, actual.nextDoc());
            break;
        }
        assertEquals(docID, actual.nextDoc());
        assertEquals(expected.ordValue(), actual.ordValue());
        assertEquals(expected.binaryValue(), actual.binaryValue());
    }

    // compare ord dictionary
    for (long i = 0; i < expected.getValueCount(); i++) {
        final BytesRef expectedBytes = BytesRef.deepCopyOf(expected.lookupOrd((int) i));
        final BytesRef actualBytes = actual.lookupOrd((int) i);
        assertEquals(expectedBytes, actualBytes);
    }

    // compare termsenum
    assertEquals(expected.getValueCount(), expected.termsEnum(), actual.termsEnum());
}
From source file:org.apache.solr.uninverting.TestFieldCacheVsDocValues.java
License:Apache License
private void assertEquals(int maxDoc, SortedSetDocValues expected, SortedSetDocValues actual) throws Exception {
    // can be null for the segment if no docs actually had any SortedDocValues
    // in this case FC.getDocTermsOrds returns EMPTY
    if (actual == null) {
        assertEquals(expected.getValueCount(), 0);
        return;
    }
    assertEquals(expected.getValueCount(), actual.getValueCount());

    while (true) {
        int docID = expected.nextDoc();
        assertEquals(docID, actual.nextDoc());
        if (docID == NO_MORE_DOCS) {
            break;
        }
        long expectedOrd;
        while ((expectedOrd = expected.nextOrd()) != NO_MORE_ORDS) {
            assertEquals(expectedOrd, actual.nextOrd());
        }
        assertEquals(NO_MORE_ORDS, actual.nextOrd());
    }

    // compare ord dictionary
    for (long i = 0; i < expected.getValueCount(); i++) {
        final BytesRef expectedBytes = BytesRef.deepCopyOf(expected.lookupOrd(i));
        final BytesRef actualBytes = actual.lookupOrd(i);
        assertEquals(expectedBytes, actualBytes);
    }

    // compare termsenum
    assertEquals(expected.getValueCount(), expected.termsEnum(), actual.termsEnum());
}
From source file:org.codelibs.elasticsearch.common.bytes.BytesArray.java
License:Apache License
public BytesArray(BytesRef bytesRef, boolean deepCopy) {
    if (deepCopy) {
        bytesRef = BytesRef.deepCopyOf(bytesRef);
    }
    bytes = bytesRef.bytes;
    offset = bytesRef.offset;
    length = bytesRef.length;
}
From source file:org.codelibs.elasticsearch.common.bytes.BytesReference.java
License:Apache License
/**
 * Returns a compact array from the given BytesReference. The returned array won't be copied unless necessary. If you need
 * to modify the returned array use <tt>BytesRef.deepCopyOf(reference.toBytesRef())</tt> instead
 */
public static byte[] toBytes(BytesReference reference) {
    final BytesRef bytesRef = reference.toBytesRef();
    if (bytesRef.offset == 0 && bytesRef.length == bytesRef.bytes.length) {
        return bytesRef.bytes;
    }
    return BytesRef.deepCopyOf(bytesRef).bytes;
}
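As a follow-up to the toBytes() example above, here is a short, hypothetical caller-side sketch of the idiom its Javadoc recommends: because toBytes() may return the backing array without copying, deep-copy through BytesRef.deepCopyOf before mutating the bytes. The class name and sample bytes are made up for illustration.

import org.apache.lucene.util.BytesRef;
import org.codelibs.elasticsearch.common.bytes.BytesArray;
import org.codelibs.elasticsearch.common.bytes.BytesReference;

// Hypothetical caller: read-only view vs. writable copy of a BytesReference's bytes.
public class ToBytesCallerSketch {
    public static void main(String[] args) {
        BytesReference reference = new BytesArray(new byte[] { 1, 2, 3 });
        byte[] view = BytesReference.toBytes(reference);                  // may alias the internal buffer
        byte[] copy = BytesRef.deepCopyOf(reference.toBytesRef()).bytes;  // always a fresh array
        copy[0] = 42;                                                     // safe: does not affect 'view'
    }
}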
From source file:org.codelibs.elasticsearch.index.query.WrapperQueryBuilder.java
License:Apache License
/**
 * Creates a query builder given a query provided as a {BytesReference}
 */
public WrapperQueryBuilder(BytesReference source) {
    if (source == null || source.length() == 0) {
        throw new IllegalArgumentException("query source text cannot be null or empty");
    }
    this.source = BytesRef.deepCopyOf(source.toBytesRef()).bytes;
}
From source file:org.codelibs.elasticsearch.search.aggregations.bucket.significant.GlobalOrdinalsSignificantTermsAggregator.java
License:Apache License
@Override
public SignificantStringTerms buildAggregation(long owningBucketOrdinal) throws IOException {
    assert owningBucketOrdinal == 0;
    if (globalOrds == null) { // no context in this reader
        return buildEmptyAggregation();
    }

    final int size;
    if (bucketCountThresholds.getMinDocCount() == 0) {
        // if minDocCount == 0 then we can end up with more buckets then maxBucketOrd() returns
        size = (int) Math.min(globalOrds.getValueCount(), bucketCountThresholds.getShardSize());
    } else {
        size = (int) Math.min(maxBucketOrd(), bucketCountThresholds.getShardSize());
    }
    long supersetSize = termsAggFactory.getSupersetNumDocs();
    long subsetSize = numCollectedDocs;

    BucketSignificancePriorityQueue<SignificantStringTerms.Bucket> ordered = new BucketSignificancePriorityQueue<>(size);
    SignificantStringTerms.Bucket spare = null;
    for (long globalTermOrd = 0; globalTermOrd < globalOrds.getValueCount(); ++globalTermOrd) {
        if (includeExclude != null && !acceptedGlobalOrdinals.get(globalTermOrd)) {
            continue;
        }
        final long bucketOrd = getBucketOrd(globalTermOrd);
        final int bucketDocCount = bucketOrd < 0 ? 0 : bucketDocCount(bucketOrd);
        if (bucketCountThresholds.getMinDocCount() > 0 && bucketDocCount == 0) {
            continue;
        }
        if (bucketDocCount < bucketCountThresholds.getShardMinDocCount()) {
            continue;
        }
        if (spare == null) {
            spare = new SignificantStringTerms.Bucket(new BytesRef(), 0, 0, 0, 0, null, format);
        }
        spare.bucketOrd = bucketOrd;
        copy(globalOrds.lookupOrd(globalTermOrd), spare.termBytes);
        spare.subsetDf = bucketDocCount;
        spare.subsetSize = subsetSize;
        spare.supersetDf = termsAggFactory.getBackgroundFrequency(spare.termBytes);
        spare.supersetSize = supersetSize;
        // During shard-local down-selection we use subset/superset stats
        // that are for this shard only
        // Back at the central reducer these properties will be updated with
        // global stats
        spare.updateScore(significanceHeuristic);
        spare = ordered.insertWithOverflow(spare);
    }

    final SignificantStringTerms.Bucket[] list = new SignificantStringTerms.Bucket[ordered.size()];
    for (int i = ordered.size() - 1; i >= 0; i--) {
        final SignificantStringTerms.Bucket bucket = ordered.pop();
        // the terms are owned by the BytesRefHash, we need to pull a copy since the BytesRef hash data may be recycled at some point
        bucket.termBytes = BytesRef.deepCopyOf(bucket.termBytes);
        bucket.aggregations = bucketAggregations(bucket.bucketOrd);
        list[i] = bucket;
    }

    return new SignificantStringTerms(name, bucketCountThresholds.getRequiredSize(),
            bucketCountThresholds.getMinDocCount(), pipelineAggregators(), metaData(), format, subsetSize,
            supersetSize, significanceHeuristic, Arrays.asList(list));
}
From source file:org.codelibs.elasticsearch.search.aggregations.bucket.significant.SignificantStringTermsAggregator.java
License:Apache License
@Override
public SignificantStringTerms buildAggregation(long owningBucketOrdinal) throws IOException {
    assert owningBucketOrdinal == 0;

    final int size = (int) Math.min(bucketOrds.size(), bucketCountThresholds.getShardSize());
    long supersetSize = termsAggFactory.getSupersetNumDocs();
    long subsetSize = numCollectedDocs;

    BucketSignificancePriorityQueue<SignificantStringTerms.Bucket> ordered = new BucketSignificancePriorityQueue<>(size);
    SignificantStringTerms.Bucket spare = null;
    for (int i = 0; i < bucketOrds.size(); i++) {
        final int docCount = bucketDocCount(i);
        if (docCount < bucketCountThresholds.getShardMinDocCount()) {
            continue;
        }
        if (spare == null) {
            spare = new SignificantStringTerms.Bucket(new BytesRef(), 0, 0, 0, 0, null, format);
        }

        bucketOrds.get(i, spare.termBytes);
        spare.subsetDf = docCount;
        spare.subsetSize = subsetSize;
        spare.supersetDf = termsAggFactory.getBackgroundFrequency(spare.termBytes);
        spare.supersetSize = supersetSize;
        // During shard-local down-selection we use subset/superset stats
        // that are for this shard only
        // Back at the central reducer these properties will be updated with
        // global stats
        spare.updateScore(significanceHeuristic);
        spare.bucketOrd = i;
        spare = ordered.insertWithOverflow(spare);
    }

    final SignificantStringTerms.Bucket[] list = new SignificantStringTerms.Bucket[ordered.size()];
    for (int i = ordered.size() - 1; i >= 0; i--) {
        final SignificantStringTerms.Bucket bucket = ordered.pop();
        // the terms are owned by the BytesRefHash, we need to pull a copy since the BytesRef hash data may be recycled at some point
        bucket.termBytes = BytesRef.deepCopyOf(bucket.termBytes);
        bucket.aggregations = bucketAggregations(bucket.bucketOrd);
        list[i] = bucket;
    }

    return new SignificantStringTerms(name, bucketCountThresholds.getRequiredSize(),
            bucketCountThresholds.getMinDocCount(), pipelineAggregators(), metaData(), format, subsetSize,
            supersetSize, significanceHeuristic, Arrays.asList(list));
}
From source file:org.codelibs.elasticsearch.search.suggest.phrase.NoisyChannelSpellChecker.java
License:Apache License
public Result getCorrections(TokenStream stream, final CandidateGenerator generator, float maxErrors,
        int numCorrections, WordScorer wordScorer, float confidence, int gramSize) throws IOException {

    final List<CandidateSet> candidateSetsList = new ArrayList<>();
    DirectCandidateGenerator.analyze(stream, new DirectCandidateGenerator.TokenConsumer() {
        CandidateSet currentSet = null;
        private TypeAttribute typeAttribute;
        private final BytesRefBuilder termsRef = new BytesRefBuilder();
        private boolean anyUnigram = false;
        private boolean anyTokens = false;

        @Override
        public void reset(TokenStream stream) {
            super.reset(stream);
            typeAttribute = stream.addAttribute(TypeAttribute.class);
        }

        @Override
        public void nextToken() throws IOException {
            anyTokens = true;
            BytesRef term = fillBytesRef(termsRef);
            if (requireUnigram && typeAttribute.type() == ShingleFilter.DEFAULT_TOKEN_TYPE) {
                return;
            }
            anyUnigram = true;
            if (posIncAttr.getPositionIncrement() == 0 && typeAttribute.type() == SynonymFilter.TYPE_SYNONYM) {
                assert currentSet != null;
                long freq = 0;
                if ((freq = generator.frequency(term)) > 0) {
                    currentSet.addOneCandidate(
                            generator.createCandidate(BytesRef.deepCopyOf(term), freq, realWordLikelihood));
                }
            } else {
                if (currentSet != null) {
                    candidateSetsList.add(currentSet);
                }
                currentSet = new CandidateSet(Candidate.EMPTY,
                        generator.createCandidate(BytesRef.deepCopyOf(term), true));
            }
        }

        @Override
        public void end() {
            if (currentSet != null) {
                candidateSetsList.add(currentSet);
            }
            if (requireUnigram && !anyUnigram && anyTokens) {
                throw new IllegalStateException("At least one unigram is required but all tokens were ngrams");
            }
        }
    });

    if (candidateSetsList.isEmpty() || candidateSetsList.size() >= tokenLimit) {
        return Result.EMPTY;
    }

    for (CandidateSet candidateSet : candidateSetsList) {
        generator.drawCandidates(candidateSet);
    }
    double cutoffScore = Double.MIN_VALUE;
    CandidateScorer scorer = new CandidateScorer(wordScorer, numCorrections, gramSize);
    CandidateSet[] candidateSets = candidateSetsList.toArray(new CandidateSet[candidateSetsList.size()]);
    if (confidence > 0.0) {
        Candidate[] candidates = new Candidate[candidateSets.length];
        for (int i = 0; i < candidates.length; i++) {
            candidates[i] = candidateSets[i].originalTerm;
        }
        double inputPhraseScore = scorer.score(candidates, candidateSets);
        cutoffScore = inputPhraseScore * confidence;
    }
    Correction[] bestCandidates = scorer.findBestCandiates(candidateSets, maxErrors, cutoffScore);
    return new Result(bestCandidates, cutoffScore);
}
From source file:org.codelibs.elasticsearch.search.suggest.term.TermSuggester.java
License:Apache License
private static List<Token> queryTerms(SuggestionContext suggestion, CharsRefBuilder spare) throws IOException {
    final List<Token> result = new ArrayList<>();
    final String field = suggestion.getField();
    DirectCandidateGenerator.analyze(suggestion.getAnalyzer(), suggestion.getText(), field,
            new DirectCandidateGenerator.TokenConsumer() {
                @Override
                public void nextToken() {
                    Term term = new Term(field, BytesRef.deepCopyOf(fillBytesRef(new BytesRefBuilder())));
                    result.add(new Token(term, offsetAttr.startOffset(), offsetAttr.endOffset()));
                }
            }, spare);
    return result;
}