List of usage examples for org.apache.lucene.analysis.TokenStream#incrementToken()
public abstract boolean incrementToken() throws IOException;
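Every example on this page follows the same consumption contract: call reset() once before the first call to incrementToken(), read each token through attributes registered on the stream (the stream reuses a single attribute instance per type instead of allocating Token objects), stop when incrementToken() returns false, then call end() and finally close(). Below is a minimal, self-contained sketch of that loop; it assumes a recent Lucene (5.x or later, where StandardAnalyzer has a no-argument constructor), and the field name and input text are arbitrary placeholders.

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class IncrementTokenExample {
    public static void main(String[] args) throws IOException {
        try (Analyzer analyzer = new StandardAnalyzer();
                TokenStream ts = analyzer.tokenStream("field", "Hello, token streams!")) {
            // Register (or retrieve) the term attribute before consuming the stream.
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset(); // mandatory before the first incrementToken()
            while (ts.incrementToken()) { // false means the stream is exhausted
                System.out.println(termAtt.toString());
            }
            ts.end(); // finalizes offsets and other end-of-stream state
        } // try-with-resources closes the stream, then the analyzer
    }
}

One API difference worth noting across the examples: the HashSplitterFieldMapper snippets obtain their stream through Analyzer.reusableTokenStream(...), which was removed in Lucene 4.0; the more recent snippets use its replacement, Analyzer.tokenStream(...).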
From source file:org.elasticsearch.index.mapper.hashsplitter.HashSplitterFieldMapper.java
License:Apache License
@Override
public Filter prefixFilter(String value, @Nullable QueryParseContext context) {
    // Use HashSplitterSearch* analysis and post-process it to create the real filter
    TokenStream tok = null;
    try {
        tok = indexAnalyzer.reusableTokenStream(names().indexNameClean(), new FastStringReader(value));
        tok.reset();
    } catch (IOException e) {
        return null;
    }
    CharTermAttribute termAtt = tok.getAttribute(CharTermAttribute.class);
    BooleanFilter f = new BooleanFilter();
    try {
        int remainingSize = sizeIsVariable ? 0 : sizeValue; // note: prefixes are not included
        while (tok.incrementToken()) {
            Term term = names().createIndexNameTerm(termAtt.toString());
            if (termAtt.length() < 1 + chunkLength) {
                if (remainingSize > 0) { // implies size is fixed
                    if (remainingSize < chunkLength)
                        f.add(new PrefixLengthFilter(term, 1 + remainingSize, 1 + remainingSize),
                                BooleanClause.Occur.MUST);
                    else
                        f.add(new PrefixLengthFilter(term, 1 + chunkLength, 1 + chunkLength),
                                BooleanClause.Occur.MUST);
                } else {
                    // varying size: only limit to the chunkLength
                    f.add(new PrefixLengthFilter(term, 0, 1 + chunkLength), BooleanClause.Occur.MUST);
                }
            } else {
                f.add(new TermFilter(term), BooleanClause.Occur.MUST);
            }
            remainingSize -= termAtt.length() - 1; // termAtt contains the prefix, remainingSize doesn't take it into account
        }
        tok.end();
        tok.close();
    } catch (IOException e) {
        e.printStackTrace();
        f = null;
    }
    return f;
}
From source file:org.elasticsearch.index.mapper.hashsplitter.HashSplitterFieldMapper.java
License:Apache License
@Override
public Filter rangeFilter(String lowerTerm, String upperTerm, boolean includeLower, boolean includeUpper,
        @Nullable QueryParseContext context) {
    // Special case: -infinity to +infinity
    if (lowerTerm == null && upperTerm == null) {
        if (sizeIsVariable)
            return null;
        StringBuilder sbWildcardPart = new StringBuilder();
        for (int i = 0; i < chunkLength; i++)
            sbWildcardPart.append(wildcardOne);
        String wildcardPart = sbWildcardPart.toString();
        BooleanFilter filter = new BooleanFilter();
        for (int i = sizeValue / chunkLength - 1; i >= 0; i--) {
            filter.add(new WildcardFilter(names().createIndexNameTerm(prefixes.charAt(i) + wildcardPart)),
                    BooleanClause.Occur.MUST);
        }
        if (sizeValue % chunkLength != 0) {
            // If the size is not divisible by chunkLength,
            // we still have a last chunk, but that has a shorter length
            filter.add(new WildcardFilter(names().createIndexNameTerm(prefixes.charAt(sizeValue / chunkLength + 1)
                    + wildcardPart.substring(0, sizeValue % chunkLength))), BooleanClause.Occur.MUST);
        }
        return filter;
    }
    // Check for emptiness
    if (lowerTerm != null && upperTerm != null) {
        int cmp = lowerTerm.compareTo(upperTerm);
        // Bound inversion
        if (cmp > 0)
            return MatchNoDocsFilter.INSTANCE;
        // Equal bounds
        if (cmp == 0) {
            // and both inclusive bounds: singleton
            if (includeLower && includeUpper) {
                // Special case: equal terms
                return fieldFilter(lowerTerm, context);
            }
            // otherwise, empty range
            return MatchNoDocsFilter.INSTANCE;
        }
    }
    // Analyze lower and upper terms
    List<String> lowerTerms = new LinkedList<String>();
    List<String> upperTerms = new LinkedList<String>();
    if (lowerTerm != null) {
        TokenStream tok = null;
        try {
            tok = indexAnalyzer.reusableTokenStream(names().indexNameClean(), new FastStringReader(lowerTerm));
            tok.reset();
        } catch (IOException e) {
            return null;
        }
        CharTermAttribute termAtt = tok.getAttribute(CharTermAttribute.class);
        try {
            while (tok.incrementToken())
                lowerTerms.add(termAtt.toString());
            tok.end();
            tok.close();
        } catch (IOException e) {
            return null;
        }
    }
    if (upperTerm != null) {
        TokenStream tok = null;
        try {
            tok = indexAnalyzer.reusableTokenStream(names().indexNameClean(), new FastStringReader(upperTerm));
            tok.reset();
        } catch (IOException e) {
            return null;
        }
        CharTermAttribute termAtt = tok.getAttribute(CharTermAttribute.class);
        try {
            while (tok.incrementToken())
                upperTerms.add(termAtt.toString());
            tok.end();
            tok.close();
        } catch (IOException e) {
            return null;
        }
    }
    // Generate the filter
    BooleanFilter topLevelAndFilter = new BooleanFilter();
    Iterator<String> lowers = lowerTerms.iterator();
    Iterator<String> uppers = upperTerms.iterator();
    String currLower = null;
    String currUpper = null;
    int remainingLowerSize = sizeIsVariable ? 0 : sizeValue;
    int remainingUpperSize = sizeIsVariable ? 0 : sizeValue;
    // First, the common prefix
    while (lowers.hasNext() && uppers.hasNext()) {
        currLower = lowers.next();
        currUpper = uppers.next();
        // The last part cannot be part of the prefix
        // because that special case has already been handled
        if (!lowers.hasNext() || !uppers.hasNext())
            break;
        if (!currLower.equals(currUpper))
            break;
        topLevelAndFilter.add(new TermFilter(names().createIndexNameTerm(currLower)), BooleanClause.Occur.MUST);
        remainingLowerSize -= currLower.length() - 1;
        remainingUpperSize -= currUpper.length() - 1;
    }
    String subPrefixLower = currLower;
    BooleanFilter secondLevelOrFilter = new BooleanFilter();
    BooleanFilter lastFilter;
    // Add the range part of the query (secondLevelOrFilter) to the prefix part that is already in topLevelAndFilter
    topLevelAndFilter.add(secondLevelOrFilter, BooleanClause.Occur.MUST);
    // We still have secondLevelOrFilter to populate
    lastFilter = new BooleanFilter();
    // Handle the first diverging token of the lowerTerm (if it's not also the last available!)
    if (lowers.hasNext()) {
        lastFilter.add(new TermFilter(names().createIndexNameTerm(currLower)), BooleanClause.Occur.MUST);
        remainingLowerSize -= currLower.length() - 1;
        currLower = lowers.next();
    }
    secondLevelOrFilter.add(lastFilter, BooleanClause.Occur.SHOULD);
    // Then get to the last token of the lowerTerm
    while (lowers.hasNext()) {
        BooleanFilter orFilter = new BooleanFilter();
        lastFilter.add(orFilter, BooleanClause.Occur.MUST);
        orFilter.add(new TermRangeLengthFilter(names().indexName(), currLower, luceneTermUpperBound(currLower),
                false, false, 1 + chunkLength, 1 + chunkLength), BooleanClause.Occur.SHOULD);
        BooleanFilter nextFilter = new BooleanFilter();
        nextFilter.add(new TermFilter(names().createIndexNameTerm(currLower)), BooleanClause.Occur.MUST);
        orFilter.add(nextFilter, BooleanClause.Occur.SHOULD);
        lastFilter = nextFilter;
        remainingLowerSize -= currLower.length() - 1;
        currLower = lowers.next();
    }
    // Handle the last token of the lowerTerm
    if (remainingLowerSize < 0)
        lastFilter.add(new TermRangeLengthFilter(names().indexName(), currLower, luceneTermUpperBound(currLower),
                includeLower, false, 0, 1 + chunkLength), BooleanClause.Occur.MUST);
    else if (remainingLowerSize < chunkLength)
        lastFilter.add(new TermRangeLengthFilter(names().indexName(), currLower, luceneTermUpperBound(currLower),
                includeLower, false, 1 + remainingLowerSize, 1 + remainingLowerSize), BooleanClause.Occur.MUST);
    else
        lastFilter.add(new TermRangeLengthFilter(names().indexName(), currLower, luceneTermUpperBound(currLower),
                includeLower, false, 1 + chunkLength, 1 + chunkLength), BooleanClause.Occur.MUST);
    // Range from the non-prefix part of the lowerTerm to the non-prefix part of the upperTerm
    if (remainingUpperSize < 0)
        secondLevelOrFilter.add(new TermRangeLengthFilter(names().indexName(), subPrefixLower, currUpper,
                false, false, 0, 1 + chunkLength), BooleanClause.Occur.SHOULD);
    else if (remainingUpperSize < chunkLength)
        secondLevelOrFilter.add(new TermRangeLengthFilter(names().indexName(), subPrefixLower, currUpper,
                false, false, 1 + remainingUpperSize, 1 + remainingUpperSize), BooleanClause.Occur.SHOULD);
    else
        secondLevelOrFilter.add(new TermRangeLengthFilter(names().indexName(), subPrefixLower, currUpper,
                false, false, 1 + chunkLength, 1 + chunkLength), BooleanClause.Occur.SHOULD);
    lastFilter = new BooleanFilter();
    // Handle the first diverging token of the upperTerm (if it's not also the last available!)
    if (uppers.hasNext()) {
        lastFilter.add(new TermFilter(names().createIndexNameTerm(currUpper)), BooleanClause.Occur.MUST);
        remainingUpperSize -= currUpper.length() - 1;
        currUpper = uppers.next();
    }
    secondLevelOrFilter.add(lastFilter, BooleanClause.Occur.SHOULD);
    // Then get to the last token of the upperTerm
    while (uppers.hasNext()) {
        BooleanFilter orFilter = new BooleanFilter();
        lastFilter.add(orFilter, BooleanClause.Occur.MUST);
        orFilter.add(new TermRangeLengthFilter(names().indexName(), luceneTermLowerBound(currUpper), currUpper,
                false, false, 1 + chunkLength, 1 + chunkLength), BooleanClause.Occur.SHOULD);
        BooleanFilter nextFilter = new BooleanFilter();
        nextFilter.add(new TermFilter(names().createIndexNameTerm(currUpper)), BooleanClause.Occur.MUST);
        orFilter.add(nextFilter, BooleanClause.Occur.SHOULD);
        lastFilter = nextFilter;
        remainingUpperSize -= currUpper.length() - 1;
        currUpper = uppers.next();
    }
    // Handle the last token of the upperTerm
    if (remainingUpperSize < 0)
        lastFilter.add(new TermRangeLengthFilter(names().indexName(), luceneTermLowerBound(currUpper), currUpper,
                false, includeUpper, 0, 1 + chunkLength), BooleanClause.Occur.MUST);
    else if (remainingUpperSize < chunkLength)
        lastFilter.add(new TermRangeLengthFilter(names().indexName(), luceneTermLowerBound(currUpper), currUpper,
                false, includeUpper, 1 + remainingUpperSize, 1 + remainingUpperSize), BooleanClause.Occur.MUST);
    else
        lastFilter.add(new TermRangeLengthFilter(names().indexName(), luceneTermLowerBound(currUpper), currUpper,
                false, includeUpper, 1 + chunkLength, 1 + chunkLength), BooleanClause.Occur.MUST);
    return topLevelAndFilter;
}
From source file:org.elasticsearch.index.mapper.hashsplitter.HashSplitterFieldMapper.java
License:Apache License
@Override
public Query wildcardQuery(String value, @Nullable MultiTermQuery.RewriteMethod method,
        @Nullable QueryParseContext context) {
    // Use HashSplitterSearch* analysis and post-process it to create the real query
    TokenStream tok = null;
    try {
        tok = searchAnalyzer.reusableTokenStream(names().indexNameClean(), new FastStringReader(value));
        tok.reset();
    } catch (IOException e) {
        return null;
    }
    CharTermAttribute termAtt = tok.getAttribute(CharTermAttribute.class);
    BooleanQuery q = new BooleanQuery();
    try {
        while (tok.incrementToken()) {
            q.add(new WildcardQuery(names().createIndexNameTerm(termAtt.toString()), wildcardOne, wildcardAny),
                    BooleanClause.Occur.MUST);
        }
        tok.end();
        tok.close();
    } catch (IOException e) {
        e.printStackTrace();
        q = null;
    }
    return q;
}
From source file:org.elasticsearch.index.mapper.hashsplitter.HashSplitterFieldMapper.java
License:Apache License
@Override
public Filter wildcardFilter(String value, @Nullable QueryParseContext context) {
    // Use HashSplitterSearch* analysis and post-process it to create the real query
    TokenStream tok = null;
    try {
        tok = searchAnalyzer.reusableTokenStream(names().indexNameClean(), new FastStringReader(value));
        tok.reset();
    } catch (IOException e) {
        return null;
    }
    CharTermAttribute termAtt = tok.getAttribute(CharTermAttribute.class);
    BooleanFilter f = new BooleanFilter();
    try {
        while (tok.incrementToken()) {
            f.add(new WildcardFilter(names().createIndexNameTerm(termAtt.toString()), wildcardOne, wildcardAny),
                    BooleanClause.Occur.MUST);
        }
        tok.end();
        tok.close();
    } catch (IOException e) {
        e.printStackTrace();
        f = null;
    }
    return f;
}
From source file:org.elasticsearch.index.mapper.token.AnalyzedTextFieldMapper.java
License:Apache License
static List<String> getAnalyzedText(TokenStream tokenStream) throws IOException {
    try {
        List<String> analyzedText = new ArrayList<>();
        CharTermAttribute terms = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            analyzedText.add(new String(terms.toString()));
        }
        tokenStream.end();
        return analyzedText;
    } finally {
        tokenStream.close();
    }
}
From source file:org.elasticsearch.index.query.CommonTermsQueryParser.java
License:Apache License
private final Query parseQueryString(ExtendedCommonTermsQuery query, String queryString, String fieldName,
        QueryParseContext parseContext, String queryAnalyzer, String lowFreqMinimumShouldMatch,
        String highFreqMinimumShouldMatch) throws IOException {
    FieldMapper<?> mapper = null;
    String field;
    MapperService.SmartNameFieldMappers smartNameFieldMappers = parseContext.smartFieldMappers(fieldName);
    if (smartNameFieldMappers != null && smartNameFieldMappers.hasMapper()) {
        mapper = smartNameFieldMappers.mapper();
        field = mapper.names().indexName();
    } else {
        field = fieldName;
    }

    Analyzer analyzer = null;
    if (queryAnalyzer == null) {
        if (mapper != null) {
            analyzer = mapper.searchAnalyzer();
        }
        if (analyzer == null && smartNameFieldMappers != null) {
            analyzer = smartNameFieldMappers.searchAnalyzer();
        }
        if (analyzer == null) {
            analyzer = parseContext.mapperService().searchAnalyzer();
        }
    } else {
        analyzer = parseContext.mapperService().analysisService().analyzer(queryAnalyzer);
        if (analyzer == null) {
            throw new ElasticsearchIllegalArgumentException("No analyzer found for [" + queryAnalyzer + "]");
        }
    }

    // Logic similar to QueryParser#getFieldQuery
    TokenStream source = analyzer.tokenStream(field, queryString.toString());
    int count = 0;
    try {
        source.reset();
        CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);
        while (source.incrementToken()) {
            BytesRef ref = new BytesRef(termAtt.length() * 4); // oversize for UTF-8
            UnicodeUtil.UTF16toUTF8(termAtt.buffer(), 0, termAtt.length(), ref);
            query.add(new Term(field, ref));
            count++;
        }
    } finally {
        source.close();
    }

    if (count == 0) {
        return null;
    }
    query.setLowFreqMinimumNumberShouldMatch(lowFreqMinimumShouldMatch);
    query.setHighFreqMinimumNumberShouldMatch(highFreqMinimumShouldMatch);
    return wrapSmartNameQuery(query, smartNameFieldMappers, parseContext);
}
From source file:org.elasticsearch.index.search.QueryStringQueryParser.java
License:Apache License
private Query getPossiblyAnalyzedPrefixQuery(String field, String termStr) throws ParseException {
    if (analyzeWildcard == false) {
        return super.getPrefixQuery(field, termStr);
    }
    List<List<String>> tlist;
    // get Analyzer from superclass and tokenize the term
    TokenStream source = null;
    try {
        try {
            source = getAnalyzer().tokenStream(field, termStr);
            source.reset();
        } catch (IOException e) {
            return super.getPrefixQuery(field, termStr);
        }
        tlist = new ArrayList<>();
        List<String> currentPos = new ArrayList<>();
        CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posAtt = source.addAttribute(PositionIncrementAttribute.class);

        while (true) {
            try {
                if (!source.incrementToken())
                    break;
            } catch (IOException e) {
                break;
            }
            if (currentPos.isEmpty() == false && posAtt.getPositionIncrement() > 0) {
                tlist.add(currentPos);
                currentPos = new ArrayList<>();
            }
            currentPos.add(termAtt.toString());
        }
        if (currentPos.isEmpty() == false) {
            tlist.add(currentPos);
        }
    } finally {
        if (source != null) {
            IOUtils.closeWhileHandlingException(source);
        }
    }

    if (tlist.size() == 0) {
        return new MatchNoDocsQuery("analysis was empty for " + field + ":" + termStr);
    }

    if (tlist.size() == 1 && tlist.get(0).size() == 1) {
        return super.getPrefixQuery(field, tlist.get(0).get(0));
    }

    // build a boolean query with prefix on the last position only.
    List<BooleanClause> clauses = new ArrayList<>();
    for (int pos = 0; pos < tlist.size(); pos++) {
        List<String> plist = tlist.get(pos);
        boolean isLastPos = (pos == tlist.size() - 1);
        Query posQuery;
        if (plist.size() == 1) {
            if (isLastPos) {
                posQuery = super.getPrefixQuery(field, plist.get(0));
            } else {
                posQuery = newTermQuery(new Term(field, plist.get(0)));
            }
        } else if (isLastPos == false) {
            // build a synonym query for terms in the same position.
            Term[] terms = new Term[plist.size()];
            for (int i = 0; i < plist.size(); i++) {
                terms[i] = new Term(field, plist.get(i));
            }
            posQuery = new SynonymQuery(terms);
        } else {
            List<BooleanClause> innerClauses = new ArrayList<>();
            for (String token : plist) {
                innerClauses.add(new BooleanClause(super.getPrefixQuery(field, token),
                        BooleanClause.Occur.SHOULD));
            }
            posQuery = getBooleanQuery(innerClauses);
        }
        clauses.add(new BooleanClause(posQuery,
                getDefaultOperator() == Operator.AND ? BooleanClause.Occur.MUST : BooleanClause.Occur.SHOULD));
    }
    return getBooleanQuery(clauses);
}
From source file:org.elasticsearch.search.aggregations.bucket.significant.SignificantTextAggregator.java
License:Apache License
@Override
public LeafBucketCollector getLeafCollector(LeafReaderContext ctx, final LeafBucketCollector sub)
        throws IOException {
    final BytesRefBuilder previous = new BytesRefBuilder();
    return new LeafBucketCollectorBase(sub, null) {

        @Override
        public void collect(int doc, long bucket) throws IOException {
            collectFromSource(doc, bucket, fieldName, sourceFieldNames);
            numCollectedDocs++;
            if (dupSequenceSpotter != null) {
                dupSequenceSpotter.startNewSequence();
            }
        }

        private void processTokenStream(int doc, long bucket, TokenStream ts, BytesRefHash inDocTerms,
                String fieldText) throws IOException {
            if (dupSequenceSpotter != null) {
                ts = new DeDuplicatingTokenFilter(ts, dupSequenceSpotter);
            }
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            try {
                while (ts.incrementToken()) {
                    if (dupSequenceSpotter != null) {
                        long newTrieSize = dupSequenceSpotter.getEstimatedSizeInBytes();
                        long growth = newTrieSize - lastTrieSize;
                        // Only update the circuitbreaker after
                        if (growth > MEMORY_GROWTH_REPORTING_INTERVAL_BYTES) {
                            addRequestCircuitBreakerBytes(growth);
                            lastTrieSize = newTrieSize;
                        }
                    }
                    previous.clear();
                    previous.copyChars(termAtt);
                    BytesRef bytes = previous.get();
                    if (inDocTerms.add(bytes) >= 0) {
                        if (includeExclude == null || includeExclude.accept(bytes)) {
                            long bucketOrdinal = bucketOrds.add(bytes);
                            if (bucketOrdinal < 0) { // already seen
                                bucketOrdinal = -1 - bucketOrdinal;
                                collectExistingBucket(sub, doc, bucketOrdinal);
                            } else {
                                collectBucket(sub, doc, bucketOrdinal);
                            }
                        }
                    }
                }
            } finally {
                ts.close();
            }
        }

        private void collectFromSource(int doc, long bucket, String indexedFieldName, String[] sourceFieldNames)
                throws IOException {
            MappedFieldType fieldType = context.getQueryShardContext().fieldMapper(indexedFieldName);
            if (fieldType == null) {
                throw new IllegalArgumentException("Aggregation [" + name + "] cannot process field ["
                        + indexedFieldName + "] since it is not present");
            }

            SourceLookup sourceLookup = context.lookup().source();
            sourceLookup.setSegmentAndDocument(ctx, doc);
            BytesRefHash inDocTerms = new BytesRefHash(256, context.bigArrays());

            try {
                for (String sourceField : sourceFieldNames) {
                    List<Object> textsToHighlight = sourceLookup.extractRawValues(sourceField);
                    textsToHighlight = textsToHighlight.stream().map(obj -> {
                        if (obj instanceof BytesRef) {
                            return fieldType.valueForDisplay(obj).toString();
                        } else {
                            return obj;
                        }
                    }).collect(Collectors.toList());

                    Analyzer analyzer = fieldType.indexAnalyzer();
                    for (Object fieldValue : textsToHighlight) {
                        String fieldText = fieldValue.toString();
                        TokenStream ts = analyzer.tokenStream(indexedFieldName, fieldText);
                        processTokenStream(doc, bucket, ts, inDocTerms, fieldText);
                    }
                }
            } finally {
                Releasables.close(inDocTerms);
            }
        }
    };
}
From source file:org.elasticsearch.search.highlight.PlainHighlighter.java
License:Apache License
private static int findGoodEndForNoHighlightExcerpt(int noMatchSize, TokenStream tokenStream)
        throws IOException {
    try {
        if (!tokenStream.hasAttribute(OffsetAttribute.class)) {
            // Can't split on term boundaries without offsets
            return -1;
        }
        int end = -1;
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            OffsetAttribute attr = tokenStream.getAttribute(OffsetAttribute.class);
            if (attr.endOffset() >= noMatchSize) {
                // Jump to the end of this token if it wouldn't put us past the boundary
                if (attr.endOffset() == noMatchSize) {
                    end = noMatchSize;
                }
                return end;
            }
            end = attr.endOffset();
        }
        // We've exhausted the token stream so we should just highlight everything.
        return end;
    } finally {
        tokenStream.end();
        tokenStream.close();
    }
}
From source file:org.elasticsearch.search.suggest.CompletionTokenStreamTest.java
License:Apache License
@Test
public void testValidNumberOfExpansions() throws IOException {
    Builder builder = new SynonymMap.Builder(true);
    for (int i = 0; i < 256; i++) {
        builder.add(new CharsRef("" + (i + 1)), new CharsRef("" + (1000 + (i + 1))), true);
    }
    StringBuilder valueBuilder = new StringBuilder();
    for (int i = 0; i < 8; i++) {
        valueBuilder.append(i + 1);
        valueBuilder.append(" ");
    }
    MockTokenizer tokenizer = new MockTokenizer(new StringReader(valueBuilder.toString()),
            MockTokenizer.WHITESPACE, true);
    SynonymFilter filter = new SynonymFilter(tokenizer, builder.build(), true);

    TokenStream suggestTokenStream = new CompletionTokenStream(filter,
            new BytesRef("Surface keyword|friggin payload|10"), new CompletionTokenStream.ToFiniteStrings() {
                @Override
                public Set<IntsRef> toFiniteStrings(TokenStream stream) throws IOException {
                    Set<IntsRef> finiteStrings = suggester.toFiniteStrings(suggester.getTokenStreamToAutomaton(),
                            stream);
                    return finiteStrings;
                }
            });

    suggestTokenStream.reset();
    ByteTermAttribute attr = suggestTokenStream.addAttribute(ByteTermAttribute.class);
    PositionIncrementAttribute posAttr = suggestTokenStream.addAttribute(PositionIncrementAttribute.class);
    int maxPos = 0;
    int count = 0;
    while (suggestTokenStream.incrementToken()) {
        count++;
        assertNotNull(attr.getBytesRef());
        assertTrue(attr.getBytesRef().length > 0);
        maxPos += posAttr.getPositionIncrement();
    }
    suggestTokenStream.close();
    assertEquals(count, 256);
    assertEquals(count, maxPos);
}