List of usage examples for org.apache.lucene.util BytesRef deepCopyOf
public static BytesRef deepCopyOf(BytesRef other)
Parameter: other - the BytesRef to copy.
The returned BytesRef will have a length of other.length and an offset of zero.
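A minimal sketch of what that guarantee means in practice (class name and buffer contents are illustrative, not taken from any of the examples below): deepCopyOf allocates a fresh byte[], so the copy is detached from whatever buffer the original BytesRef was viewing.

import java.nio.charset.StandardCharsets;
import org.apache.lucene.util.BytesRef;

public class DeepCopyOfDemo {
  public static void main(String[] args) {
    byte[] buffer = "xxhelloxx".getBytes(StandardCharsets.UTF_8);
    // A BytesRef is only a view onto an existing array: bytes + offset + length.
    BytesRef view = new BytesRef(buffer, 2, 5);           // points at "hello"
    BytesRef copy = BytesRef.deepCopyOf(view);            // owns its own byte[]

    buffer[2] = 'H';                                      // mutate the shared buffer
    System.out.println(view.utf8ToString());              // Hello  (view follows the buffer)
    System.out.println(copy.utf8ToString());              // hello  (copy is unaffected)
    System.out.println(copy.offset + "/" + copy.length);  // 0/5, as documented above
  }
}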
From source file:org.alfresco.solr.AlfrescoFieldType.java
License:Open Source License
public static BytesRef analyzeMultiTerm(String field, String part, Analyzer analyzerIn) {
  if (part == null || analyzerIn == null)
    return null;
  TokenStream source = null;
  try {
    source = analyzerIn.tokenStream(field, part);
    source.reset();
    TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
    BytesRef bytes = termAtt.getBytesRef();
    if (!source.incrementToken())
      throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
          "analyzer returned no terms for multiTerm term: " + part);
    if (source.incrementToken())
      throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
          "analyzer returned too many terms for multiTerm term: " + part);
    source.end();
    return BytesRef.deepCopyOf(bytes);
  } catch (IOException e) {
    throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
        "error analyzing range part: " + part, e);
  } finally {
    IOUtils.closeWhileHandlingException(source);
  }
}
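The reason the copy happens on the return line: termAtt.getBytesRef() hands back the attribute's single internal buffer, which stops being safe to use once the finally block closes the stream. A hypothetical caller might look like the sketch below (the field name, analyzer choice, and the range query are assumptions, not part of the Alfresco source):

// Sketch of a caller; assumes imports for org.apache.lucene.analysis.core.KeywordAnalyzer
// and org.apache.lucene.search.TermRangeQuery.
Analyzer analyzer = new KeywordAnalyzer();
BytesRef lower = AlfrescoFieldType.analyzeMultiTerm("content", "apple", analyzer);
BytesRef upper = AlfrescoFieldType.analyzeMultiTerm("content", "orange", analyzer);
// Both refs stay valid after their TokenStreams are closed because each was deep-copied.
Query range = new TermRangeQuery("content", lower, upper, true, true);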
From source file:org.alfresco.solr.query.Solr4QueryParser.java
License:Open Source License
@SuppressWarnings("RCN_REDUNDANT_NULLCHECK_WOULD_HAVE_BEEN_A_NPE") protected BytesRef analyzeMultitermTerm(String field, String part, Analyzer analyzerIn) { if (analyzerIn == null) analyzerIn = getAnalyzer();/*from w w w . j av a2 s. c o m*/ try (TokenStream source = analyzerIn.tokenStream(field, part)) { source.reset(); TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class); if (!source.incrementToken()) throw new IllegalArgumentException("analyzer returned no terms for multiTerm term: " + part); BytesRef bytes = BytesRef.deepCopyOf(termAtt.getBytesRef()); if (source.incrementToken()) throw new IllegalArgumentException("analyzer returned too many terms for multiTerm term: " + part); source.end(); return bytes; } catch (IOException e) { throw new RuntimeException("Error analyzing multiTerm term: " + part, e); } }
From source file:org.allenai.blacklab.queryParser.lucene.QueryParserBase.java
License:Apache License
/**
 * @exception org.apache.lucene.queryparser.classic.ParseException throw in overridden method to disallow
 */
protected TextPattern newFieldQuery(Analyzer analyzer, String field, String queryText, boolean quoted)
    throws ParseException {
  // Use the analyzer to get all the tokens, and then build a TermQuery,
  // PhraseQuery, or nothing based on the term count
  TokenStream source;
  try {
    source = analyzer.tokenStream(field, new StringReader(queryText));
    source.reset();
  } catch (IOException e) {
    ParseException p = new ParseException("Unable to initialize TokenStream to analyze query text");
    p.initCause(e);
    throw p;
  }
  CachingTokenFilter buffer = new CachingTokenFilter(source);
  TermToBytesRefAttribute termAtt = null;
  PositionIncrementAttribute posIncrAtt = null;
  int numTokens = 0;
  buffer.reset();
  if (buffer.hasAttribute(TermToBytesRefAttribute.class)) {
    termAtt = buffer.getAttribute(TermToBytesRefAttribute.class);
  }
  if (buffer.hasAttribute(PositionIncrementAttribute.class)) {
    posIncrAtt = buffer.getAttribute(PositionIncrementAttribute.class);
  }
  int positionCount = 0;
  boolean severalTokensAtSamePosition = false;
  boolean hasMoreTokens = false;
  if (termAtt != null) {
    try {
      hasMoreTokens = buffer.incrementToken();
      while (hasMoreTokens) {
        numTokens++;
        int positionIncrement = (posIncrAtt != null) ? posIncrAtt.getPositionIncrement() : 1;
        if (positionIncrement != 0) {
          positionCount += positionIncrement;
        } else {
          severalTokensAtSamePosition = true;
        }
        hasMoreTokens = buffer.incrementToken();
      }
    } catch (IOException e) {
      // ignore
    }
  }
  try {
    // rewind the buffer stream
    buffer.reset();
    // close original stream - all tokens buffered
    source.close();
  } catch (IOException e) {
    ParseException p = new ParseException("Cannot close TokenStream analyzing query text");
    p.initCause(e);
    throw p;
  }
  BytesRef bytes = termAtt == null ? null : termAtt.getBytesRef();
  if (numTokens == 0)
    return null;
  else if (numTokens == 1) {
    try {
      boolean hasNext = buffer.incrementToken();
      assert hasNext == true;
      termAtt.fillBytesRef();
    } catch (IOException e) {
      // safe to ignore, because we know the number of tokens
    }
    return newTermQuery(new Term(field, BytesRef.deepCopyOf(bytes)));
  } else {
    if (severalTokensAtSamePosition || (!quoted && !autoGeneratePhraseQueries)) {
      if (positionCount == 1 || (!quoted && !autoGeneratePhraseQueries)) {
        // no phrase query:
        TextPatternBoolean q = newBooleanQuery(positionCount == 1); // BL: BooleanQuery -> TextPatternBoolean
        BooleanClause.Occur occur = positionCount > 1 && operator == AND_OPERATOR
            ? BooleanClause.Occur.MUST
            : BooleanClause.Occur.SHOULD;
        for (int i = 0; i < numTokens; i++) {
          try {
            boolean hasNext = buffer.incrementToken();
            assert hasNext == true;
            termAtt.fillBytesRef();
          } catch (IOException e) {
            // safe to ignore, because we know the number of tokens
          }
          TextPattern currentQuery = newTermQuery(new Term(field, BytesRef.deepCopyOf(bytes)));
          q.add(currentQuery, occur);
        }
        return q;
      } else {
        // phrase query:
        TPMultiPhrase mpq = newMultiPhraseQuery(); // BL: MultiPhraseQuery -> TPMultiPhrase
        mpq.setSlop(phraseSlop);
        List<Term> multiTerms = new ArrayList<Term>();
        int position = -1;
        for (int i = 0; i < numTokens; i++) {
          int positionIncrement = 1;
          try {
            boolean hasNext = buffer.incrementToken();
            assert hasNext == true;
            termAtt.fillBytesRef();
            if (posIncrAtt != null) {
              positionIncrement = posIncrAtt.getPositionIncrement();
            }
          } catch (IOException e) {
            // safe to ignore, because we know the number of tokens
          }
          if (positionIncrement > 0 && multiTerms.size() > 0) {
            if (enablePositionIncrements) {
              mpq.add(multiTerms.toArray(new Term[0]), position);
            } else {
              mpq.add(multiTerms.toArray(new Term[0]));
            }
            multiTerms.clear();
          }
          position += positionIncrement;
          multiTerms.add(new Term(field, BytesRef.deepCopyOf(bytes)));
        }
        if (enablePositionIncrements) {
          mpq.add(multiTerms.toArray(new Term[0]), position);
        } else {
          mpq.add(multiTerms.toArray(new Term[0]));
        }
        return mpq;
      }
    } else {
      TPPhrase pq = newPhraseQuery(); // BL: PhraseQuery -> TPPhrase
      pq.setSlop(phraseSlop);
      int position = -1;
      for (int i = 0; i < numTokens; i++) {
        int positionIncrement = 1;
        try {
          boolean hasNext = buffer.incrementToken();
          assert hasNext == true;
          termAtt.fillBytesRef();
          if (posIncrAtt != null) {
            positionIncrement = posIncrAtt.getPositionIncrement();
          }
        } catch (IOException e) {
          // safe to ignore, because we know the number of tokens
        }
        if (enablePositionIncrements) {
          position += positionIncrement;
          pq.add(new Term(field, BytesRef.deepCopyOf(bytes)), position);
        } else {
          pq.add(new Term(field, BytesRef.deepCopyOf(bytes)));
        }
      }
      return pq;
    }
  }
}
From source file:org.allenai.blacklab.queryParser.lucene.QueryParserBase.java
License:Apache License
protected BytesRef analyzeMultitermTerm(String field, String part, Analyzer analyzerIn) {
  TokenStream source;
  if (analyzerIn == null)
    analyzerIn = analyzer;
  try {
    source = analyzerIn.tokenStream(field, new StringReader(part));
    source.reset();
  } catch (IOException e) {
    throw new RuntimeException("Unable to initialize TokenStream to analyze multiTerm term: " + part, e);
  }
  TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
  BytesRef bytes = termAtt.getBytesRef();
  try {
    if (!source.incrementToken())
      throw new IllegalArgumentException("analyzer returned no terms for multiTerm term: " + part);
    termAtt.fillBytesRef();
    if (source.incrementToken())
      throw new IllegalArgumentException("analyzer returned too many terms for multiTerm term: " + part);
  } catch (IOException e) {
    throw new RuntimeException("error analyzing range part: " + part, e);
  }
  try {
    source.end();
    source.close();
  } catch (IOException e) {
    throw new RuntimeException("Unable to end & close TokenStream after analyzing multiTerm term: " + part, e);
  }
  return BytesRef.deepCopyOf(bytes);
}
From source file:org.apache.blur.manager.writer.IndexImporter.java
License:Apache License
private void applyDeletes(Directory directory, IndexWriter indexWriter, String shard, boolean emitDeletes)
    throws IOException {
  DirectoryReader reader = DirectoryReader.open(directory);
  try {
    LOG.info("Applying deletes in reader [{0}]", reader);
    CompositeReaderContext compositeReaderContext = reader.getContext();
    List<AtomicReaderContext> leaves = compositeReaderContext.leaves();
    BlurPartitioner blurPartitioner = new BlurPartitioner();
    Text key = new Text();
    int numberOfShards = _shardContext.getTableContext().getDescriptor().getShardCount();
    int shardId = ShardUtil.getShardIndex(shard);
    for (AtomicReaderContext context : leaves) {
      AtomicReader atomicReader = context.reader();
      Fields fields = atomicReader.fields();
      Terms terms = fields.terms(BlurConstants.ROW_ID);
      if (terms != null) {
        TermsEnum termsEnum = terms.iterator(null);
        BytesRef ref = null;
        while ((ref = termsEnum.next()) != null) {
          key.set(ref.bytes, ref.offset, ref.length);
          int partition = blurPartitioner.getPartition(key, null, numberOfShards);
          if (shardId != partition) {
            throw new IOException("Index is corrupted, RowIds are found in wrong shard, partition ["
                + partition + "] does not shard [" + shardId
                + "], this can happen when rows are not hashed correctly.");
          }
          if (emitDeletes) {
            indexWriter.deleteDocuments(new Term(BlurConstants.ROW_ID, BytesRef.deepCopyOf(ref)));
          }
        }
      }
    }
  } finally {
    reader.close();
  }
}
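The deepCopyOf inside the loop is the important detail: TermsEnum.next() keeps returning the same reused BytesRef instance, and both Term and the writer's delete buffering hold onto the bytes they are given rather than copying them. A stripped-down sketch of the same rule (the reader, writer, and field name are illustrative, not Blur internals):

// Illustrative only; assumes an open AtomicReader `atomicReader` and IndexWriter `writer`.
List<Term> pendingDeletes = new ArrayList<Term>();
Terms rowIds = atomicReader.terms("rowId");
if (rowIds != null) {
  TermsEnum termsEnum = rowIds.iterator(null);
  BytesRef ref;
  while ((ref = termsEnum.next()) != null) {
    // Copy before buffering: the enum will overwrite `ref` on the next call to next().
    pendingDeletes.add(new Term("rowId", BytesRef.deepCopyOf(ref)));
  }
}
writer.deleteDocuments(pendingDeletes.toArray(new Term[pendingDeletes.size()]));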
From source file:org.apache.blur.store.hdfs_v2.HdfsKeyValueStore.java
License:Apache License
@Override
public void put(BytesRef key, BytesRef value) throws IOException {
  ensureOpen();
  if (value == null) {
    delete(key);
    return;
  }
  _writeLock.lock();
  ensureOpenForWriting();
  try {
    Operation op = getPutOperation(OperationType.PUT, key, value);
    Path path = write(op);
    BytesRef deepCopyOf = BytesRef.deepCopyOf(value);
    _size.addAndGet(deepCopyOf.bytes.length);
    Value old = _pointers.put(BytesRef.deepCopyOf(key), new Value(deepCopyOf, path));
    if (old != null) {
      _size.addAndGet(-old._bytesRef.bytes.length);
    }
  } catch (RemoteException e) {
    throw new IOException("Another HDFS KeyStore has likely taken ownership of this key value store.", e);
  } catch (LeaseExpiredException e) {
    throw new IOException("Another HDFS KeyStore has likely taken ownership of this key value store.", e);
  } finally {
    _writeLock.unlock();
  }
}
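Both copies in the put above are deliberate: the key and value handed in by the caller may be backed by buffers the caller recycles, while _pointers keeps them for the lifetime of the store. A small sketch of the failure the copies prevent (the map and its contents are illustrative, not Blur internals):

// Illustrative only; BytesRef implements Comparable, so it can key a sorted map directly.
ConcurrentNavigableMap<BytesRef, BytesRef> pointers = new ConcurrentSkipListMap<BytesRef, BytesRef>();
BytesRef reusableKey = new BytesRef("row-1");
pointers.put(BytesRef.deepCopyOf(reusableKey), BytesRef.deepCopyOf(new BytesRef("value-1")));

reusableKey.bytes[4] = '2'; // caller scribbles over its own buffer (it now reads "row-2")
// The stored entry still maps "row-1": the map holds an independent copy of the key bytes.
System.out.println(pointers.containsKey(new BytesRef("row-1"))); // true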
From source file:org.apache.blur.store.hdfs_v2.HdfsKeyValueStore.java
License:Apache License
private void loadIndex(Path path, Operation operation) {
  Value old;
  switch (operation.type) {
  case PUT:
    BytesRef deepCopyOf = BytesRef.deepCopyOf(getKey(operation.value));
    _size.addAndGet(deepCopyOf.bytes.length);
    old = _pointers.put(BytesRef.deepCopyOf(getKey(operation.key)), new Value(deepCopyOf, path));
    break;
  case DELETE:
    old = _pointers.remove(getKey(operation.key));
    break;
  default:
    throw new RuntimeException("Not supported [" + operation.type + "]");
  }
  if (old != null) {
    _size.addAndGet(-old._bytesRef.bytes.length);
  }
}
From source file:org.apache.solr.handler.AnalysisRequestHandlerBase.java
License:Apache License
/**
 * Analyzes the given text using the given analyzer and returns the produced tokens.
 *
 * @param query    The query to analyze.
 * @param analyzer The analyzer to use.
 */
protected Set<BytesRef> getQueryTokenSet(String query, Analyzer analyzer) {
  TokenStream tokenStream = null;
  try {
    tokenStream = analyzer.tokenStream("", query);
    final Set<BytesRef> tokens = new HashSet<BytesRef>();
    final TermToBytesRefAttribute bytesAtt = tokenStream.getAttribute(TermToBytesRefAttribute.class);
    final BytesRef bytes = bytesAtt.getBytesRef();
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
      bytesAtt.fillBytesRef();
      tokens.add(BytesRef.deepCopyOf(bytes));
    }
    tokenStream.end();
    return tokens;
  } catch (IOException ioe) {
    throw new RuntimeException("Error occured while iterating over tokenstream", ioe);
  } finally {
    IOUtils.closeWhileHandlingException(tokenStream);
  }
}
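Each call to fillBytesRef() rewrites the one buffer returned by getBytesRef(), so every loop iteration sees the same BytesRef instance with new contents; without the deepCopyOf the returned Set would be left with entries that all alias that single reused buffer. A hypothetical caller from a handler subclass (the analyzer and query text are assumptions):

// Illustrative only; assumes Lucene/Solr 4.x where WhitespaceAnalyzer takes a Version.
Set<BytesRef> tokens = getQueryTokenSet("quick brown fox", new WhitespaceAnalyzer(Version.LUCENE_40));
for (BytesRef token : tokens) {
  System.out.println(token.utf8ToString()); // quick, brown, fox (order not guaranteed by HashSet)
}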
From source file:org.apache.solr.handler.component.TermsComponent.java
License:Apache License
@Override
public void process(ResponseBuilder rb) throws IOException {
  SolrParams params = rb.req.getParams();
  if (!params.getBool(TermsParams.TERMS, false))
    return;
  String[] fields = params.getParams(TermsParams.TERMS_FIELD);
  NamedList<Object> termsResult = new SimpleOrderedMap<Object>();
  rb.rsp.add("terms", termsResult);
  if (fields == null || fields.length == 0)
    return;
  int limit = params.getInt(TermsParams.TERMS_LIMIT, 10);
  if (limit < 0) {
    limit = Integer.MAX_VALUE;
  }
  String lowerStr = params.get(TermsParams.TERMS_LOWER);
  String upperStr = params.get(TermsParams.TERMS_UPPER);
  boolean upperIncl = params.getBool(TermsParams.TERMS_UPPER_INCLUSIVE, false);
  boolean lowerIncl = params.getBool(TermsParams.TERMS_LOWER_INCLUSIVE, true);
  boolean sort = !TermsParams.TERMS_SORT_INDEX
      .equals(params.get(TermsParams.TERMS_SORT, TermsParams.TERMS_SORT_COUNT));
  int freqmin = params.getInt(TermsParams.TERMS_MINCOUNT, 1);
  int freqmax = params.getInt(TermsParams.TERMS_MAXCOUNT, UNLIMITED_MAX_COUNT);
  if (freqmax < 0) {
    freqmax = Integer.MAX_VALUE;
  }
  String prefix = params.get(TermsParams.TERMS_PREFIX_STR);
  String regexp = params.get(TermsParams.TERMS_REGEXP_STR);
  Pattern pattern = regexp != null ? Pattern.compile(regexp, resolveRegexpFlags(params)) : null;
  boolean raw = params.getBool(TermsParams.TERMS_RAW, false);
  final AtomicReader indexReader = rb.req.getSearcher().getAtomicReader();
  Fields lfields = indexReader.fields();
  for (String field : fields) {
    NamedList<Integer> fieldTerms = new NamedList<Integer>();
    termsResult.add(field, fieldTerms);
    Terms terms = lfields == null ? null : lfields.terms(field);
    if (terms == null) {
      // no terms for this field
      continue;
    }
    FieldType ft = raw ? null : rb.req.getSchema().getFieldTypeNoEx(field);
    if (ft == null)
      ft = new StrField();
    // prefix must currently be text
    BytesRef prefixBytes = prefix == null ? null : new BytesRef(prefix);
    BytesRef upperBytes = null;
    if (upperStr != null) {
      upperBytes = new BytesRef();
      ft.readableToIndexed(upperStr, upperBytes);
    }
    BytesRef lowerBytes;
    if (lowerStr == null) {
      // If no lower bound was specified, use the prefix
      lowerBytes = prefixBytes;
    } else {
      lowerBytes = new BytesRef();
      if (raw) {
        // TODO: how to handle binary? perhaps we don't for "raw"... or if the field exists
        // perhaps we detect if the FieldType is non-character and expect hex if so?
        lowerBytes = new BytesRef(lowerStr);
      } else {
        lowerBytes = new BytesRef();
        ft.readableToIndexed(lowerStr, lowerBytes);
      }
    }
    TermsEnum termsEnum = terms.iterator(null);
    BytesRef term = null;
    if (lowerBytes != null) {
      if (termsEnum.seekCeil(lowerBytes) == TermsEnum.SeekStatus.END) {
        termsEnum = null;
      } else {
        term = termsEnum.term();
        // Only advance the enum if we are excluding the lower bound and the lower Term actually matches
        if (lowerIncl == false && term.equals(lowerBytes)) {
          term = termsEnum.next();
        }
      }
    } else {
      // position termsEnum on first term
      term = termsEnum.next();
    }
    int i = 0;
    BoundedTreeSet<CountPair<BytesRef, Integer>> queue = (sort
        ? new BoundedTreeSet<CountPair<BytesRef, Integer>>(limit)
        : null);
    CharsRef external = new CharsRef();
    while (term != null && (i < limit || sort)) {
      boolean externalized = false; // did we fill in "external" yet for this term?
      // stop if the prefix doesn't match
      if (prefixBytes != null && !StringHelper.startsWith(term, prefixBytes))
        break;
      if (pattern != null) {
        // indexed text or external text?
        // TODO: support "raw" mode?
        ft.indexedToReadable(term, external);
        externalized = true;
        if (!pattern.matcher(external).matches()) {
          term = termsEnum.next();
          continue;
        }
      }
      if (upperBytes != null) {
        int upperCmp = term.compareTo(upperBytes);
        // if we are past the upper term, or equal to it (when don't include upper) then stop.
        if (upperCmp > 0 || (upperCmp == 0 && !upperIncl))
          break;
      }
      // This is a good term in the range. Check if mincount/maxcount conditions are satisfied.
      int docFreq = termsEnum.docFreq();
      if (docFreq >= freqmin && docFreq <= freqmax) {
        // add the term to the list
        if (sort) {
          queue.add(new CountPair<BytesRef, Integer>(BytesRef.deepCopyOf(term), docFreq));
        } else {
          // TODO: handle raw somehow
          if (!externalized) {
            ft.indexedToReadable(term, external);
          }
          fieldTerms.add(external.toString(), docFreq);
          i++;
        }
      }
      term = termsEnum.next();
    }
    if (sort) {
      for (CountPair<BytesRef, Integer> item : queue) {
        if (i >= limit)
          break;
        ft.indexedToReadable(item.key, external);
        fieldTerms.add(external.toString(), item.val);
        i++;
      }
    }
  }
}
From source file:org.apache.solr.request.SimpleFacets.java
License:Apache License
/**
 * Returns a list of terms in the specified field along with the
 * corresponding count of documents in the set that match that constraint.
 * This method uses the FilterCache to get the intersection count between <code>docs</code>
 * and the DocSet for each term in the filter.
 *
 * @see FacetParams#FACET_LIMIT
 * @see FacetParams#FACET_ZEROS
 * @see FacetParams#FACET_MISSING
 */
public NamedList<Integer> getFacetTermEnumCounts(SolrIndexSearcher searcher, DocSet docs, String field,
    int offset, int limit, int mincount, boolean missing, String sort, String prefix) throws IOException {
  /* :TODO: potential optimization...
   * cache the Terms with the highest docFreq and try them first
   * don't enum if we get our max from them
   */

  // Minimum term docFreq in order to use the filterCache for that term.
  int minDfFilterCache = params.getFieldInt(field, FacetParams.FACET_ENUM_CACHE_MINDF, 0);

  // make sure we have a set that is fast for random access, if we will use it for that
  DocSet fastForRandomSet = docs;
  if (minDfFilterCache > 0 && docs instanceof SortedIntDocSet) {
    SortedIntDocSet sset = (SortedIntDocSet) docs;
    fastForRandomSet = new HashDocSet(sset.getDocs(), 0, sset.size());
  }

  IndexSchema schema = searcher.getSchema();
  AtomicReader r = searcher.getAtomicReader();
  FieldType ft = schema.getFieldType(field);

  boolean sortByCount = sort.equals("count") || sort.equals("true");
  final int maxsize = limit >= 0 ? offset + limit : Integer.MAX_VALUE - 1;
  final BoundedTreeSet<CountPair<BytesRef, Integer>> queue = sortByCount
      ? new BoundedTreeSet<CountPair<BytesRef, Integer>>(maxsize)
      : null;
  final NamedList<Integer> res = new NamedList<Integer>();

  int min = mincount - 1; // the smallest value in the top 'N' values
  int off = offset;
  int lim = limit >= 0 ? limit : Integer.MAX_VALUE;

  BytesRef startTermBytes = null;
  if (prefix != null) {
    String indexedPrefix = ft.toInternal(prefix);
    startTermBytes = new BytesRef(indexedPrefix);
  }

  Fields fields = r.fields();
  Terms terms = fields == null ? null : fields.terms(field);
  TermsEnum termsEnum = null;
  SolrIndexSearcher.DocsEnumState deState = null;
  BytesRef term = null;
  if (terms != null) {
    termsEnum = terms.iterator(null);

    // TODO: OPT: if seek(ord) is supported for this termsEnum, then we could use it for
    // facet.offset when sorting by index order.

    if (startTermBytes != null) {
      if (termsEnum.seekCeil(startTermBytes) == TermsEnum.SeekStatus.END) {
        termsEnum = null;
      } else {
        term = termsEnum.term();
      }
    } else {
      // position termsEnum on first term
      term = termsEnum.next();
    }
  }

  DocsEnum docsEnum = null;
  CharsRef charsRef = new CharsRef(10);

  if (docs.size() >= mincount) {
    while (term != null) {

      if (startTermBytes != null && !StringHelper.startsWith(term, startTermBytes))
        break;

      int df = termsEnum.docFreq();

      // If we are sorting, we can use df>min (rather than >=) since we
      // are going in index order. For certain term distributions this can
      // make a large difference (for example, many terms with df=1).
      if (df > 0 && df > min) {
        int c;

        if (df >= minDfFilterCache) {
          // use the filter cache
          if (deState == null) {
            deState = new SolrIndexSearcher.DocsEnumState();
            deState.fieldName = field;
            deState.liveDocs = r.getLiveDocs();
            deState.termsEnum = termsEnum;
            deState.docsEnum = docsEnum;
          }
          c = searcher.numDocs(docs, deState);
          docsEnum = deState.docsEnum;
        } else {
          // iterate over TermDocs to calculate the intersection
          // TODO: specialize when base docset is a bitset or hash set (skipDocs)? or does it matter for this?
          // TODO: do this per-segment for better efficiency (MultiDocsEnum just uses base class impl)
          // TODO: would passing deleted docs lead to better efficiency over checking the fastForRandomSet?
          docsEnum = termsEnum.docs(null, docsEnum, DocsEnum.FLAG_NONE);
          c = 0;

          if (docsEnum instanceof MultiDocsEnum) {
            MultiDocsEnum.EnumWithSlice[] subs = ((MultiDocsEnum) docsEnum).getSubs();
            int numSubs = ((MultiDocsEnum) docsEnum).getNumSubs();
            for (int subindex = 0; subindex < numSubs; subindex++) {
              MultiDocsEnum.EnumWithSlice sub = subs[subindex];
              if (sub.docsEnum == null)
                continue;
              int base = sub.slice.start;
              int docid;
              while ((docid = sub.docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                if (fastForRandomSet.exists(docid + base))
                  c++;
              }
            }
          } else {
            int docid;
            while ((docid = docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
              if (fastForRandomSet.exists(docid))
                c++;
            }
          }
        }

        if (sortByCount) {
          if (c > min) {
            BytesRef termCopy = BytesRef.deepCopyOf(term);
            queue.add(new CountPair<BytesRef, Integer>(termCopy, c));
            if (queue.size() >= maxsize)
              min = queue.last().val;
          }
        } else {
          if (c >= mincount && --off < 0) {
            if (--lim < 0)
              break;
            ft.indexedToReadable(term, charsRef);
            res.add(charsRef.toString(), c);
          }
        }
      }

      term = termsEnum.next();
    }
  }

  if (sortByCount) {
    for (CountPair<BytesRef, Integer> p : queue) {
      if (--off >= 0)
        continue;
      if (--lim < 0)
        break;
      ft.indexedToReadable(p.key, charsRef);
      res.add(charsRef.toString(), p.val);
    }
  }

  if (missing) {
    res.add(null, getFieldMissingCount(searcher, docs, field));
  }

  return res;
}