List of usage examples for org.apache.lucene.analysis.TokenStream#addAttribute

public final <T extends Attribute> T addAttribute(Class<T> attClass)
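addAttribute returns the stream's instance of the requested attribute class, creating and registering one first if the stream does not have it yet, so the same call works for both declaring and retrieving attributes. Every example below follows the same consume loop: add the attribute, reset(), incrementToken() until it returns false, end(), close(). For reference, a minimal self-contained sketch of that pattern (the analyzer choice, the "body" field name, and the sample text are illustrative, not taken from any example below):

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class AddAttributeSketch {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        // tokenStream() hands back an unconsumed stream; try-with-resources closes it.
        try (TokenStream stream = analyzer.tokenStream("body", "Hello addAttribute world")) {
            // The same instance is returned on every call with the same attribute class.
            CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
            stream.reset();                  // mandatory before the first incrementToken()
            while (stream.incrementToken()) {
                System.out.println(termAtt.toString());
            }
            stream.end();                    // records the final offset state
        }
    }
}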
From source file:org.elasticsearch.index.mapper.date.SimpleDateMappingTests.java
License:Apache License
private void assertNumericTokensEqual(ParsedDocument doc, DocumentMapper defaultMapper, String fieldA,
        String fieldB) throws IOException {
    assertThat(doc.rootDoc().getField(fieldA).tokenStream(defaultMapper.indexAnalyzer()), notNullValue());
    assertThat(doc.rootDoc().getField(fieldB).tokenStream(defaultMapper.indexAnalyzer()), notNullValue());

    TokenStream tokenStream = doc.rootDoc().getField(fieldA).tokenStream(defaultMapper.indexAnalyzer());
    tokenStream.reset();
    NumericTermAttribute nta = tokenStream.addAttribute(NumericTermAttribute.class);
    List<Long> values = new ArrayList<Long>();
    while (tokenStream.incrementToken()) {
        values.add(nta.getRawValue());
    }

    tokenStream = doc.rootDoc().getField(fieldB).tokenStream(defaultMapper.indexAnalyzer());
    tokenStream.reset();
    nta = tokenStream.addAttribute(NumericTermAttribute.class);
    int pos = 0;
    while (tokenStream.incrementToken()) {
        assertThat(values.get(pos++), equalTo(nta.getRawValue()));
    }
    assertThat(pos, equalTo(values.size()));
}
From source file:org.elasticsearch.index.mapper.FeatureFieldMapperTests.java
License:Apache License
static int getFrequency(TokenStream tk) throws IOException {
    TermFrequencyAttribute freqAttribute = tk.addAttribute(TermFrequencyAttribute.class);
    tk.reset();
    assertTrue(tk.incrementToken());
    int freq = freqAttribute.getTermFrequency();
    assertFalse(tk.incrementToken());
    return freq;
}
From source file:org.elasticsearch.index.mapper.token.AnalyzedTextFieldMapper.java
License:Apache License
static List<String> getAnalyzedText(TokenStream tokenStream) throws IOException {
    try {
        List<String> analyzedText = new ArrayList<>();
        CharTermAttribute terms = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            analyzedText.add(terms.toString());
        }
        tokenStream.end();
        return analyzedText;
    } finally {
        tokenStream.close();
    }
}
From source file:org.elasticsearch.index.query.CommonTermsQueryParser.java
License:Apache License
private final Query parseQueryString(ExtendedCommonTermsQuery query, String queryString, String fieldName,
        QueryParseContext parseContext, String queryAnalyzer, String lowFreqMinimumShouldMatch,
        String highFreqMinimumShouldMatch) throws IOException {
    FieldMapper<?> mapper = null;
    String field;
    MapperService.SmartNameFieldMappers smartNameFieldMappers = parseContext.smartFieldMappers(fieldName);
    if (smartNameFieldMappers != null && smartNameFieldMappers.hasMapper()) {
        mapper = smartNameFieldMappers.mapper();
        field = mapper.names().indexName();
    } else {
        field = fieldName;
    }

    Analyzer analyzer = null;
    if (queryAnalyzer == null) {
        if (mapper != null) {
            analyzer = mapper.searchAnalyzer();
        }
        if (analyzer == null && smartNameFieldMappers != null) {
            analyzer = smartNameFieldMappers.searchAnalyzer();
        }
        if (analyzer == null) {
            analyzer = parseContext.mapperService().searchAnalyzer();
        }
    } else {
        analyzer = parseContext.mapperService().analysisService().analyzer(queryAnalyzer);
        if (analyzer == null) {
            throw new ElasticsearchIllegalArgumentException("No analyzer found for [" + queryAnalyzer + "]");
        }
    }

    // Logic similar to QueryParser#getFieldQuery
    TokenStream source = analyzer.tokenStream(field, queryString);
    int count = 0;
    try {
        source.reset();
        CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);
        while (source.incrementToken()) {
            BytesRef ref = new BytesRef(termAtt.length() * 4); // oversize for UTF-8
            UnicodeUtil.UTF16toUTF8(termAtt.buffer(), 0, termAtt.length(), ref);
            query.add(new Term(field, ref));
            count++;
        }
    } finally {
        source.close();
    }

    if (count == 0) {
        return null;
    }
    query.setLowFreqMinimumNumberShouldMatch(lowFreqMinimumShouldMatch);
    query.setHighFreqMinimumNumberShouldMatch(highFreqMinimumShouldMatch);
    return wrapSmartNameQuery(query, smartNameFieldMappers, parseContext);
}
From source file:org.elasticsearch.index.search.QueryStringQueryParser.java
License:Apache License
private Query getPossiblyAnalyzedPrefixQuery(String field, String termStr) throws ParseException {
    if (analyzeWildcard == false) {
        return super.getPrefixQuery(field, termStr);
    }
    List<List<String>> tlist;
    // get the Analyzer from the superclass and tokenize the term
    TokenStream source = null;
    try {
        try {
            source = getAnalyzer().tokenStream(field, termStr);
            source.reset();
        } catch (IOException e) {
            return super.getPrefixQuery(field, termStr);
        }
        tlist = new ArrayList<>();
        List<String> currentPos = new ArrayList<>();
        CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posAtt = source.addAttribute(PositionIncrementAttribute.class);
        while (true) {
            try {
                if (!source.incrementToken()) {
                    break;
                }
            } catch (IOException e) {
                break;
            }
            if (currentPos.isEmpty() == false && posAtt.getPositionIncrement() > 0) {
                tlist.add(currentPos);
                currentPos = new ArrayList<>();
            }
            currentPos.add(termAtt.toString());
        }
        if (currentPos.isEmpty() == false) {
            tlist.add(currentPos);
        }
    } finally {
        if (source != null) {
            IOUtils.closeWhileHandlingException(source);
        }
    }

    if (tlist.size() == 0) {
        return new MatchNoDocsQuery("analysis was empty for " + field + ":" + termStr);
    }

    if (tlist.size() == 1 && tlist.get(0).size() == 1) {
        return super.getPrefixQuery(field, tlist.get(0).get(0));
    }

    // build a boolean query with prefix on the last position only.
    List<BooleanClause> clauses = new ArrayList<>();
    for (int pos = 0; pos < tlist.size(); pos++) {
        List<String> plist = tlist.get(pos);
        boolean isLastPos = (pos == tlist.size() - 1);
        Query posQuery;
        if (plist.size() == 1) {
            if (isLastPos) {
                posQuery = super.getPrefixQuery(field, plist.get(0));
            } else {
                posQuery = newTermQuery(new Term(field, plist.get(0)));
            }
        } else if (isLastPos == false) {
            // build a synonym query for terms in the same position.
            Term[] terms = new Term[plist.size()];
            for (int i = 0; i < plist.size(); i++) {
                terms[i] = new Term(field, plist.get(i));
            }
            posQuery = new SynonymQuery(terms);
        } else {
            List<BooleanClause> innerClauses = new ArrayList<>();
            for (String token : plist) {
                innerClauses.add(new BooleanClause(super.getPrefixQuery(field, token),
                        BooleanClause.Occur.SHOULD));
            }
            posQuery = getBooleanQuery(innerClauses);
        }
        clauses.add(new BooleanClause(posQuery,
                getDefaultOperator() == Operator.AND ? BooleanClause.Occur.MUST : BooleanClause.Occur.SHOULD));
    }
    return getBooleanQuery(clauses);
}
From source file:org.elasticsearch.search.aggregations.bucket.significant.SignificantTextAggregator.java
License:Apache License
@Override
public LeafBucketCollector getLeafCollector(LeafReaderContext ctx, final LeafBucketCollector sub)
        throws IOException {
    final BytesRefBuilder previous = new BytesRefBuilder();
    return new LeafBucketCollectorBase(sub, null) {

        @Override
        public void collect(int doc, long bucket) throws IOException {
            collectFromSource(doc, bucket, fieldName, sourceFieldNames);
            numCollectedDocs++;
            if (dupSequenceSpotter != null) {
                dupSequenceSpotter.startNewSequence();
            }
        }

        private void processTokenStream(int doc, long bucket, TokenStream ts, BytesRefHash inDocTerms,
                String fieldText) throws IOException {
            if (dupSequenceSpotter != null) {
                ts = new DeDuplicatingTokenFilter(ts, dupSequenceSpotter);
            }
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            try {
                while (ts.incrementToken()) {
                    if (dupSequenceSpotter != null) {
                        long newTrieSize = dupSequenceSpotter.getEstimatedSizeInBytes();
                        long growth = newTrieSize - lastTrieSize;
                        // Only update the circuit breaker after significant growth
                        if (growth > MEMORY_GROWTH_REPORTING_INTERVAL_BYTES) {
                            addRequestCircuitBreakerBytes(growth);
                            lastTrieSize = newTrieSize;
                        }
                    }
                    previous.clear();
                    previous.copyChars(termAtt);
                    BytesRef bytes = previous.get();
                    if (inDocTerms.add(bytes) >= 0) {
                        if (includeExclude == null || includeExclude.accept(bytes)) {
                            long bucketOrdinal = bucketOrds.add(bytes);
                            if (bucketOrdinal < 0) { // already seen
                                bucketOrdinal = -1 - bucketOrdinal;
                                collectExistingBucket(sub, doc, bucketOrdinal);
                            } else {
                                collectBucket(sub, doc, bucketOrdinal);
                            }
                        }
                    }
                }
            } finally {
                ts.close();
            }
        }

        private void collectFromSource(int doc, long bucket, String indexedFieldName,
                String[] sourceFieldNames) throws IOException {
            MappedFieldType fieldType = context.getQueryShardContext().fieldMapper(indexedFieldName);
            if (fieldType == null) {
                throw new IllegalArgumentException("Aggregation [" + name + "] cannot process field ["
                        + indexedFieldName + "] since it is not present");
            }
            SourceLookup sourceLookup = context.lookup().source();
            sourceLookup.setSegmentAndDocument(ctx, doc);
            BytesRefHash inDocTerms = new BytesRefHash(256, context.bigArrays());
            try {
                for (String sourceField : sourceFieldNames) {
                    List<Object> textsToHighlight = sourceLookup.extractRawValues(sourceField);
                    textsToHighlight = textsToHighlight.stream().map(obj -> {
                        if (obj instanceof BytesRef) {
                            return fieldType.valueForDisplay(obj).toString();
                        } else {
                            return obj;
                        }
                    }).collect(Collectors.toList());

                    Analyzer analyzer = fieldType.indexAnalyzer();
                    for (Object fieldValue : textsToHighlight) {
                        String fieldText = fieldValue.toString();
                        TokenStream ts = analyzer.tokenStream(indexedFieldName, fieldText);
                        processTokenStream(doc, bucket, ts, inDocTerms, fieldText);
                    }
                }
            } finally {
                Releasables.close(inDocTerms);
            }
        }
    };
}
From source file:org.elasticsearch.search.suggest.CompletionTokenStreamTest.java
License:Apache License
@Test
public void testValidNumberOfExpansions() throws IOException {
    Builder builder = new SynonymMap.Builder(true);
    for (int i = 0; i < 256; i++) {
        builder.add(new CharsRef("" + (i + 1)), new CharsRef("" + (1000 + (i + 1))), true);
    }
    StringBuilder valueBuilder = new StringBuilder();
    for (int i = 0; i < 8; i++) {
        valueBuilder.append(i + 1);
        valueBuilder.append(" ");
    }
    MockTokenizer tokenizer = new MockTokenizer(new StringReader(valueBuilder.toString()),
            MockTokenizer.WHITESPACE, true);
    SynonymFilter filter = new SynonymFilter(tokenizer, builder.build(), true);
    TokenStream suggestTokenStream = new CompletionTokenStream(filter,
            new BytesRef("Surface keyword|friggin payload|10"),
            new CompletionTokenStream.ToFiniteStrings() {
                @Override
                public Set<IntsRef> toFiniteStrings(TokenStream stream) throws IOException {
                    Set<IntsRef> finiteStrings = suggester
                            .toFiniteStrings(suggester.getTokenStreamToAutomaton(), stream);
                    return finiteStrings;
                }
            });
    suggestTokenStream.reset();
    ByteTermAttribute attr = suggestTokenStream.addAttribute(ByteTermAttribute.class);
    PositionIncrementAttribute posAttr = suggestTokenStream.addAttribute(PositionIncrementAttribute.class);
    int maxPos = 0;
    int count = 0;
    while (suggestTokenStream.incrementToken()) {
        count++;
        assertNotNull(attr.getBytesRef());
        assertTrue(attr.getBytesRef().length > 0);
        maxPos += posAttr.getPositionIncrement();
    }
    suggestTokenStream.close();
    assertEquals(count, 256);
    assertEquals(count, maxPos);
}
From source file:org.elasticsearch.search.suggest.CompletionTokenStreamTests.java
License:Apache License
@Test
public void testValidNumberOfExpansions() throws IOException {
    Builder builder = new SynonymMap.Builder(true);
    for (int i = 0; i < 256; i++) {
        builder.add(new CharsRef("" + (i + 1)), new CharsRef("" + (1000 + (i + 1))), true);
    }
    StringBuilder valueBuilder = new StringBuilder();
    for (int i = 0; i < 8; i++) {
        valueBuilder.append(i + 1);
        valueBuilder.append(" ");
    }
    MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true);
    tokenizer.setReader(new StringReader(valueBuilder.toString()));
    SynonymFilter filter = new SynonymFilter(tokenizer, builder.build(), true);
    TokenStream suggestTokenStream = new CompletionTokenStream(filter,
            new BytesRef("Surface keyword|friggin payload|10"),
            new CompletionTokenStream.ToFiniteStrings() {
                @Override
                public Set<IntsRef> toFiniteStrings(TokenStream stream) throws IOException {
                    Set<IntsRef> finiteStrings = suggester.toFiniteStrings(stream);
                    return finiteStrings;
                }
            });
    suggestTokenStream.reset();
    ByteTermAttribute attr = suggestTokenStream.addAttribute(ByteTermAttribute.class);
    PositionIncrementAttribute posAttr = suggestTokenStream.addAttribute(PositionIncrementAttribute.class);
    int maxPos = 0;
    int count = 0;
    while (suggestTokenStream.incrementToken()) {
        count++;
        assertNotNull(attr.getBytesRef());
        assertTrue(attr.getBytesRef().length > 0);
        maxPos += posAttr.getPositionIncrement();
    }
    suggestTokenStream.close();
    assertEquals(count, 256);
    assertEquals(count, maxPos);
}
From source file:org.elasticsearch.search.suggest.phrase.NoisyChannelSpellChecker.java
License:Apache License
public Result getCorrections(TokenStream stream, final CandidateGenerator generator, float maxErrors,
        int numCorrections, IndexReader reader, WordScorer wordScorer, BytesRef separator, float confidence,
        int gramSize) throws IOException {
    final List<CandidateSet> candidateSetsList = new ArrayList<DirectCandidateGenerator.CandidateSet>();
    SuggestUtils.analyze(stream, new SuggestUtils.TokenConsumer() {
        CandidateSet currentSet = null;
        private TypeAttribute typeAttribute;
        private final BytesRef termsRef = new BytesRef();
        private boolean anyUnigram = false;
        private boolean anyTokens = false;

        @Override
        public void reset(TokenStream stream) {
            super.reset(stream);
            typeAttribute = stream.addAttribute(TypeAttribute.class);
        }

        @Override
        public void nextToken() throws IOException {
            anyTokens = true;
            BytesRef term = fillBytesRef(termsRef);
            if (requireUnigram && typeAttribute.type() == ShingleFilter.DEFAULT_TOKEN_TYPE) {
                return;
            }
            anyUnigram = true;
            if (posIncAttr.getPositionIncrement() == 0 && typeAttribute.type() == SynonymFilter.TYPE_SYNONYM) {
                assert currentSet != null;
                long freq = 0;
                if ((freq = generator.frequency(term)) > 0) {
                    currentSet.addOneCandidate(
                            generator.createCandidate(BytesRef.deepCopyOf(term), freq, realWordLikelihood));
                }
            } else {
                if (currentSet != null) {
                    candidateSetsList.add(currentSet);
                }
                currentSet = new CandidateSet(Candidate.EMPTY,
                        generator.createCandidate(BytesRef.deepCopyOf(term), true));
            }
        }

        @Override
        public void end() {
            if (currentSet != null) {
                candidateSetsList.add(currentSet);
            }
            if (requireUnigram && !anyUnigram && anyTokens) {
                throw new IllegalStateException("At least one unigram is required but all tokens were ngrams");
            }
        }
    });

    if (candidateSetsList.isEmpty() || candidateSetsList.size() >= tokenLimit) {
        return Result.EMPTY;
    }

    for (CandidateSet candidateSet : candidateSetsList) {
        generator.drawCandidates(candidateSet);
    }
    double cutoffScore = Double.MIN_VALUE;
    CandidateScorer scorer = new CandidateScorer(wordScorer, numCorrections, gramSize);
    CandidateSet[] candidateSets = candidateSetsList.toArray(new CandidateSet[candidateSetsList.size()]);
    if (confidence > 0.0) {
        Candidate[] candidates = new Candidate[candidateSets.length];
        for (int i = 0; i < candidates.length; i++) {
            candidates[i] = candidateSets[i].originalTerm;
        }
        double inputPhraseScore = scorer.score(candidates, candidateSets);
        cutoffScore = inputPhraseScore * confidence;
    }
    Correction[] bestCandidates = scorer.findBestCandiates(candidateSets, maxErrors, cutoffScore);
    return new Result(bestCandidates, cutoffScore);
}
From source file:org.exist.indexing.lucene.XMLToQuery.java
License:Open Source License
private Query phraseQuery(String field, Element node, Analyzer analyzer) throws XPathException {
    NodeList termList = node.getElementsByTagName("term");
    if (termList.getLength() == 0) {
        PhraseQuery query = new PhraseQuery();
        String qstr = getText(node);
        try {
            TokenStream stream = analyzer.tokenStream(field, new StringReader(qstr));
            CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                query.add(new Term(field, termAttr.toString()));
            }
            stream.end();
            stream.close();
        } catch (IOException e) {
            throw new XPathException("Error while parsing phrase query: " + qstr);
        }
        int slop = getSlop(node);
        if (slop > -1) {
            query.setSlop(slop);
        }
        return query;
    }
    MultiPhraseQuery query = new MultiPhraseQuery();
    for (int i = 0; i < termList.getLength(); i++) {
        Element elem = (Element) termList.item(i);
        String text = getText(elem);
        if (text.indexOf('?') > -1 || text.indexOf('*') > 0) {
            Term[] expanded = expandTerms(field, text);
            if (expanded.length > 0) {
                query.add(expanded);
            }
        } else {
            String termStr = getTerm(field, text, analyzer);
            if (termStr != null) {
                query.add(new Term(field, termStr)); // use the analyzed term, not the raw text
            }
        }
    }
    int slop = getSlop(node);
    if (slop > -1) {
        query.setSlop(slop);
    }
    return query;
}