Example usage for org.apache.lucene.analysis TokenStream addAttribute

Introduction

This page collects example usages of org.apache.lucene.analysis.TokenStream#addAttribute from open source projects.

Prototype

public final <T extends Attribute> T addAttribute(Class<T> attClass) 

Document

The caller must pass in a Class<? extends Attribute> value.
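
Before the listings, here is a minimal, self-contained sketch of the canonical call pattern (assuming a recent Lucene version; the StandardAnalyzer, field name "body", and helper name tokenize are illustrative choices, not taken from the examples below): acquire the attribute before consuming the stream, then reset(), loop on incrementToken(), and finish with end().

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

static List<String> tokenize(String text) throws IOException {
    List<String> tokens = new ArrayList<>();
    Analyzer analyzer = new StandardAnalyzer();
    // try-with-resources closes the stream even if consumption fails
    try (TokenStream stream = analyzer.tokenStream("body", text)) {
        // addAttribute returns the stream's CharTermAttribute, creating it
        // if absent; it must be acquired before the consume loop
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            tokens.add(termAtt.toString());
        }
        stream.end();
    }
    return tokens;
}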

Usage

From source file:org.elasticsearch.index.mapper.date.SimpleDateMappingTests.java

License:Apache License
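
This test helper asserts that two fields analyze to the same numeric tokens: it collects the raw long values of one stream via NumericTermAttribute, then replays the second stream and compares position by position.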

private void assertNumericTokensEqual(ParsedDocument doc, DocumentMapper defaultMapper, String fieldA,
        String fieldB) throws IOException {
    assertThat(doc.rootDoc().getField(fieldA).tokenStream(defaultMapper.indexAnalyzer()), notNullValue());
    assertThat(doc.rootDoc().getField(fieldB).tokenStream(defaultMapper.indexAnalyzer()), notNullValue());

    TokenStream tokenStream = doc.rootDoc().getField(fieldA).tokenStream(defaultMapper.indexAnalyzer());
    tokenStream.reset();
    NumericTermAttribute nta = tokenStream.addAttribute(NumericTermAttribute.class);
    List<Long> values = new ArrayList<Long>();
    while (tokenStream.incrementToken()) {
        values.add(nta.getRawValue());
    }

    tokenStream = doc.rootDoc().getField(fieldB).tokenStream(defaultMapper.indexAnalyzer());
    tokenStream.reset();
    nta = tokenStream.addAttribute(NumericTermAttribute.class);
    int pos = 0;
    while (tokenStream.incrementToken()) {
        assertThat(values.get(pos++), equalTo(nta.getRawValue()));
    }
    assertThat(pos, equalTo(values.size()));
}

From source file:org.elasticsearch.index.mapper.FeatureFieldMapperTests.java

License:Apache License
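
A compact helper that reads the term frequency of a single-token stream: the TermFrequencyAttribute is registered before reset(), and the assertions check that the stream yields exactly one token.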

static int getFrequency(TokenStream tk) throws IOException {
    TermFrequencyAttribute freqAttribute = tk.addAttribute(TermFrequencyAttribute.class);
    tk.reset();
    assertTrue(tk.incrementToken());
    int freq = freqAttribute.getTermFrequency();
    assertFalse(tk.incrementToken());
    return freq;
}

From source file:org.elasticsearch.index.mapper.token.AnalyzedTextFieldMapper.java

License:Apache License
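
This helper drains a token stream into a list of strings using CharTermAttribute, calling end() after the loop and closing the stream in a finally block.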

static List<String> getAnalyzedText(TokenStream tokenStream) throws IOException {
    try {
        List<String> analyzedText = new ArrayList<>();
        CharTermAttribute terms = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();

        while (tokenStream.incrementToken()) {
            analyzedText.add(terms.toString());
        }
        tokenStream.end();
        return analyzedText;
    } finally {
        tokenStream.close();
    }
}

From source file:org.elasticsearch.index.query.CommonTermsQueryParser.java

License:Apache License
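
Here the query string is analyzed and each term is UTF-8-encoded into a BytesRef and added to an ExtendedCommonTermsQuery; the method returns null when analysis produces no tokens.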

private final Query parseQueryString(ExtendedCommonTermsQuery query, String queryString, String fieldName,
        QueryParseContext parseContext, String queryAnalyzer, String lowFreqMinimumShouldMatch,
        String highFreqMinimumShouldMatch) throws IOException {

    FieldMapper<?> mapper = null;
    String field;
    MapperService.SmartNameFieldMappers smartNameFieldMappers = parseContext.smartFieldMappers(fieldName);
    if (smartNameFieldMappers != null && smartNameFieldMappers.hasMapper()) {
        mapper = smartNameFieldMappers.mapper();
        field = mapper.names().indexName();
    } else {
        field = fieldName;
    }

    Analyzer analyzer = null;
    if (queryAnalyzer == null) {
        if (mapper != null) {
            analyzer = mapper.searchAnalyzer();
        }
        if (analyzer == null && smartNameFieldMappers != null) {
            analyzer = smartNameFieldMappers.searchAnalyzer();
        }
        if (analyzer == null) {
            analyzer = parseContext.mapperService().searchAnalyzer();
        }
    } else {
        analyzer = parseContext.mapperService().analysisService().analyzer(queryAnalyzer);
        if (analyzer == null) {
            throw new ElasticsearchIllegalArgumentException("No analyzer found for [" + queryAnalyzer + "]");
        }
    }

    // Logic similar to QueryParser#getFieldQuery
    TokenStream source = analyzer.tokenStream(field, queryString);
    int count = 0;
    try {
        source.reset();
        CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);
        while (source.incrementToken()) {
            BytesRef ref = new BytesRef(termAtt.length() * 4); // oversize for UTF-8
            UnicodeUtil.UTF16toUTF8(termAtt.buffer(), 0, termAtt.length(), ref);
            query.add(new Term(field, ref));
            count++;
        }
    } finally {
        source.close();
    }

    if (count == 0) {
        return null;
    }
    query.setLowFreqMinimumNumberShouldMatch(lowFreqMinimumShouldMatch);
    query.setHighFreqMinimumNumberShouldMatch(highFreqMinimumShouldMatch);
    return wrapSmartNameQuery(query, smartNameFieldMappers, parseContext);
}

From source file:org.elasticsearch.index.search.QueryStringQueryParser.java

License:Apache License
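
This method registers both CharTermAttribute and PositionIncrementAttribute so it can group analyzed tokens by position, then builds a prefix query on the last position and term or synonym queries on the earlier ones.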

private Query getPossiblyAnalyzedPrefixQuery(String field, String termStr) throws ParseException {
    if (analyzeWildcard == false) {
        return super.getPrefixQuery(field, termStr);
    }
    List<List<String>> tlist;
    // get Analyzer from superclass and tokenize the term
    TokenStream source = null;
    try {
        try {
            source = getAnalyzer().tokenStream(field, termStr);
            source.reset();
        } catch (IOException e) {
            return super.getPrefixQuery(field, termStr);
        }
        tlist = new ArrayList<>();
        List<String> currentPos = new ArrayList<>();
        CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posAtt = source.addAttribute(PositionIncrementAttribute.class);

        while (true) {
            try {
                if (!source.incrementToken())
                    break;
            } catch (IOException e) {
                break;
            }
            if (currentPos.isEmpty() == false && posAtt.getPositionIncrement() > 0) {
                tlist.add(currentPos);
                currentPos = new ArrayList<>();
            }
            currentPos.add(termAtt.toString());
        }
        if (currentPos.isEmpty() == false) {
            tlist.add(currentPos);
        }
    } finally {
        if (source != null) {
            IOUtils.closeWhileHandlingException(source);
        }
    }

    if (tlist.size() == 0) {
        return new MatchNoDocsQuery("analysis was empty for " + field + ":" + termStr);
    }

    if (tlist.size() == 1 && tlist.get(0).size() == 1) {
        return super.getPrefixQuery(field, tlist.get(0).get(0));
    }

    // build a boolean query with prefix on the last position only.
    List<BooleanClause> clauses = new ArrayList<>();
    for (int pos = 0; pos < tlist.size(); pos++) {
        List<String> plist = tlist.get(pos);
        boolean isLastPos = (pos == tlist.size() - 1);
        Query posQuery;
        if (plist.size() == 1) {
            if (isLastPos) {
                posQuery = super.getPrefixQuery(field, plist.get(0));
            } else {
                posQuery = newTermQuery(new Term(field, plist.get(0)));
            }
        } else if (isLastPos == false) {
            // build a synonym query for terms in the same position.
            Term[] terms = new Term[plist.size()];
            for (int i = 0; i < plist.size(); i++) {
                terms[i] = new Term(field, plist.get(i));
            }
            posQuery = new SynonymQuery(terms);
        } else {
            List<BooleanClause> innerClauses = new ArrayList<>();
            for (String token : plist) {
                innerClauses
                        .add(new BooleanClause(super.getPrefixQuery(field, token), BooleanClause.Occur.SHOULD));
            }
            posQuery = getBooleanQuery(innerClauses);
        }
        clauses.add(new BooleanClause(posQuery,
                getDefaultOperator() == Operator.AND ? BooleanClause.Occur.MUST : BooleanClause.Occur.SHOULD));
    }
    return getBooleanQuery(clauses);
}

From source file:org.elasticsearch.search.aggregations.bucket.significant.SignificantTextAggregator.java

License:Apache License
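
In this aggregator, processTokenStream registers a CharTermAttribute before consuming each field value's stream, deduplicates terms per document with a BytesRefHash, and accounts for duplicate-spotter trie growth against the request circuit breaker.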

@Override
public LeafBucketCollector getLeafCollector(LeafReaderContext ctx, final LeafBucketCollector sub)
        throws IOException {
    final BytesRefBuilder previous = new BytesRefBuilder();
    return new LeafBucketCollectorBase(sub, null) {

        @Override
        public void collect(int doc, long bucket) throws IOException {
            collectFromSource(doc, bucket, fieldName, sourceFieldNames);
            numCollectedDocs++;
            if (dupSequenceSpotter != null) {
                dupSequenceSpotter.startNewSequence();
            }
        }

        private void processTokenStream(int doc, long bucket, TokenStream ts, BytesRefHash inDocTerms,
                String fieldText) throws IOException {
            if (dupSequenceSpotter != null) {
                ts = new DeDuplicatingTokenFilter(ts, dupSequenceSpotter);
            }
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            try {
                while (ts.incrementToken()) {
                    if (dupSequenceSpotter != null) {
                        long newTrieSize = dupSequenceSpotter.getEstimatedSizeInBytes();
                        long growth = newTrieSize - lastTrieSize;
                        // Only update the circuit breaker after growth exceeds the reporting interval
                        if (growth > MEMORY_GROWTH_REPORTING_INTERVAL_BYTES) {
                            addRequestCircuitBreakerBytes(growth);
                            lastTrieSize = newTrieSize;
                        }
                    }
                    previous.clear();
                    previous.copyChars(termAtt);
                    BytesRef bytes = previous.get();
                    if (inDocTerms.add(bytes) >= 0) {
                        if (includeExclude == null || includeExclude.accept(bytes)) {
                            long bucketOrdinal = bucketOrds.add(bytes);
                            if (bucketOrdinal < 0) { // already seen
                                bucketOrdinal = -1 - bucketOrdinal;
                                collectExistingBucket(sub, doc, bucketOrdinal);
                            } else {
                                collectBucket(sub, doc, bucketOrdinal);
                            }
                        }
                    }
                }

            } finally {
                ts.close();
            }
        }

        private void collectFromSource(int doc, long bucket, String indexedFieldName, String[] sourceFieldNames)
                throws IOException {
            MappedFieldType fieldType = context.getQueryShardContext().fieldMapper(indexedFieldName);
            if (fieldType == null) {
                throw new IllegalArgumentException("Aggregation [" + name + "] cannot process field ["
                        + indexedFieldName + "] since it is not present");
            }

            SourceLookup sourceLookup = context.lookup().source();
            sourceLookup.setSegmentAndDocument(ctx, doc);
            BytesRefHash inDocTerms = new BytesRefHash(256, context.bigArrays());

            try {
                for (String sourceField : sourceFieldNames) {
                    List<Object> textsToHighlight = sourceLookup.extractRawValues(sourceField);
                    textsToHighlight = textsToHighlight.stream().map(obj -> {
                        if (obj instanceof BytesRef) {
                            return fieldType.valueForDisplay(obj).toString();
                        } else {
                            return obj;
                        }
                    }).collect(Collectors.toList());

                    Analyzer analyzer = fieldType.indexAnalyzer();
                    for (Object fieldValue : textsToHighlight) {
                        String fieldText = fieldValue.toString();
                        TokenStream ts = analyzer.tokenStream(indexedFieldName, fieldText);
                        processTokenStream(doc, bucket, ts, inDocTerms, fieldText);
                    }
                }
            } finally {
                Releasables.close(inDocTerms);
            }
        }
    };
}

From source file:org.elasticsearch.search.suggest.CompletionTokenStreamTest.java

License:Apache License
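
This test registers ByteTermAttribute and PositionIncrementAttribute on a CompletionTokenStream wrapping a SynonymFilter, and asserts that an 8-token input with one synonym per token expands to exactly 256 (2^8) paths, one per position.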

@Test
public void testValidNumberOfExpansions() throws IOException {
    Builder builder = new SynonymMap.Builder(true);
    for (int i = 0; i < 256; i++) {
        builder.add(new CharsRef("" + (i + 1)), new CharsRef("" + (1000 + (i + 1))), true);
    }
    StringBuilder valueBuilder = new StringBuilder();
    for (int i = 0; i < 8; i++) {
        valueBuilder.append(i + 1);
        valueBuilder.append(" ");
    }
    MockTokenizer tokenizer = new MockTokenizer(new StringReader(valueBuilder.toString()),
            MockTokenizer.WHITESPACE, true);
    SynonymFilter filter = new SynonymFilter(tokenizer, builder.build(), true);

    TokenStream suggestTokenStream = new CompletionTokenStream(filter,
            new BytesRef("Surface keyword|friggin payload|10"), new CompletionTokenStream.ToFiniteStrings() {
                @Override
                public Set<IntsRef> toFiniteStrings(TokenStream stream) throws IOException {
                    Set<IntsRef> finiteStrings = suggester
                            .toFiniteStrings(suggester.getTokenStreamToAutomaton(), stream);
                    return finiteStrings;
                }
            });

    suggestTokenStream.reset();
    ByteTermAttribute attr = suggestTokenStream.addAttribute(ByteTermAttribute.class);
    PositionIncrementAttribute posAttr = suggestTokenStream.addAttribute(PositionIncrementAttribute.class);
    int maxPos = 0;
    int count = 0;
    while (suggestTokenStream.incrementToken()) {
        count++;
        assertNotNull(attr.getBytesRef());
        assertTrue(attr.getBytesRef().length > 0);
        maxPos += posAttr.getPositionIncrement();
    }
    suggestTokenStream.close();
    assertEquals(count, 256);
    assertEquals(count, maxPos);

}

From source file:org.elasticsearch.search.suggest.CompletionTokenStreamTests.java

License:Apache License
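
A later variant of the previous test; note that the tokenizer's input is now supplied through setReader rather than the MockTokenizer constructor, and toFiniteStrings takes the stream directly.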

@Test
public void testValidNumberOfExpansions() throws IOException {
    Builder builder = new SynonymMap.Builder(true);
    for (int i = 0; i < 256; i++) {
        builder.add(new CharsRef("" + (i + 1)), new CharsRef("" + (1000 + (i + 1))), true);
    }
    StringBuilder valueBuilder = new StringBuilder();
    for (int i = 0; i < 8; i++) {
        valueBuilder.append(i + 1);
        valueBuilder.append(" ");
    }
    MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true);
    tokenizer.setReader(new StringReader(valueBuilder.toString()));
    SynonymFilter filter = new SynonymFilter(tokenizer, builder.build(), true);

    TokenStream suggestTokenStream = new CompletionTokenStream(filter,
            new BytesRef("Surface keyword|friggin payload|10"), new CompletionTokenStream.ToFiniteStrings() {
                @Override
                public Set<IntsRef> toFiniteStrings(TokenStream stream) throws IOException {
                    Set<IntsRef> finiteStrings = suggester.toFiniteStrings(stream);
                    return finiteStrings;
                }
            });

    suggestTokenStream.reset();
    ByteTermAttribute attr = suggestTokenStream.addAttribute(ByteTermAttribute.class);
    PositionIncrementAttribute posAttr = suggestTokenStream.addAttribute(PositionIncrementAttribute.class);
    int maxPos = 0;
    int count = 0;
    while (suggestTokenStream.incrementToken()) {
        count++;
        assertNotNull(attr.getBytesRef());
        assertTrue(attr.getBytesRef().length > 0);
        maxPos += posAttr.getPositionIncrement();
    }
    suggestTokenStream.close();
    assertEquals(count, 256);
    assertEquals(count, maxPos);

}

From source file:org.elasticsearch.search.suggest.phrase.NoisyChannelSpellChecker.java

License:Apache License
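
Here addAttribute is called inside the TokenConsumer's reset callback, so the TypeAttribute is re-acquired whenever the consumer is bound to a new stream; the token type distinguishes unigrams from shingles and synonyms when building candidate sets.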

public Result getCorrections(TokenStream stream, final CandidateGenerator generator, float maxErrors,
        int numCorrections, IndexReader reader, WordScorer wordScorer, BytesRef separator, float confidence,
        int gramSize) throws IOException {

    final List<CandidateSet> candidateSetsList = new ArrayList<DirectCandidateGenerator.CandidateSet>();
    SuggestUtils.analyze(stream, new SuggestUtils.TokenConsumer() {
        CandidateSet currentSet = null;
        private TypeAttribute typeAttribute;
        private final BytesRef termsRef = new BytesRef();
        private boolean anyUnigram = false;
        private boolean anyTokens = false;

        @Override
        public void reset(TokenStream stream) {
            super.reset(stream);
            typeAttribute = stream.addAttribute(TypeAttribute.class);
        }

        @Override
        public void nextToken() throws IOException {
            anyTokens = true;
            BytesRef term = fillBytesRef(termsRef);
            if (requireUnigram && typeAttribute.type() == ShingleFilter.DEFAULT_TOKEN_TYPE) {
                return;
            }
            anyUnigram = true;
            if (posIncAttr.getPositionIncrement() == 0 && typeAttribute.type() == SynonymFilter.TYPE_SYNONYM) {
                assert currentSet != null;
                long freq = 0;
                if ((freq = generator.frequency(term)) > 0) {
                    currentSet.addOneCandidate(
                            generator.createCandidate(BytesRef.deepCopyOf(term), freq, realWordLikelihood));
                }
            } else {
                if (currentSet != null) {
                    candidateSetsList.add(currentSet);
                }
                currentSet = new CandidateSet(Candidate.EMPTY,
                        generator.createCandidate(BytesRef.deepCopyOf(term), true));
            }
        }

        @Override
        public void end() {
            if (currentSet != null) {
                candidateSetsList.add(currentSet);
            }
            if (requireUnigram && !anyUnigram && anyTokens) {
                throw new IllegalStateException("At least one unigram is required but all tokens were ngrams");
            }
        }
    });

    if (candidateSetsList.isEmpty() || candidateSetsList.size() >= tokenLimit) {
        return Result.EMPTY;
    }

    for (CandidateSet candidateSet : candidateSetsList) {
        generator.drawCandidates(candidateSet);
    }
    double cutoffScore = Double.MIN_VALUE;
    CandidateScorer scorer = new CandidateScorer(wordScorer, numCorrections, gramSize);
    CandidateSet[] candidateSets = candidateSetsList.toArray(new CandidateSet[candidateSetsList.size()]);
    if (confidence > 0.0) {
        Candidate[] candidates = new Candidate[candidateSets.length];
        for (int i = 0; i < candidates.length; i++) {
            candidates[i] = candidateSets[i].originalTerm;
        }
        double inputPhraseScore = scorer.score(candidates, candidateSets);
        cutoffScore = inputPhraseScore * confidence;
    }
    Correction[] bestCandidates = scorer.findBestCandiates(candidateSets, maxErrors, cutoffScore);

    return new Result(bestCandidates, cutoffScore);
}

From source file:org.exist.indexing.lucene.XMLToQuery.java

License:Open Source License
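
This method builds a plain PhraseQuery from the analyzed text when the element has no <term> children, and otherwise builds a MultiPhraseQuery, expanding wildcard terms and analyzing the rest.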

private Query phraseQuery(String field, Element node, Analyzer analyzer) throws XPathException {
    NodeList termList = node.getElementsByTagName("term");
    if (termList.getLength() == 0) {
        PhraseQuery query = new PhraseQuery();
        String qstr = getText(node);
        try {
            TokenStream stream = analyzer.tokenStream(field, new StringReader(qstr));
            CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                query.add(new Term(field, termAttr.toString()));
            }
            stream.end();
            stream.close();
        } catch (IOException e) {
            throw new XPathException("Error while parsing phrase query: " + qstr);
        }
        int slop = getSlop(node);
        if (slop > -1)
            query.setSlop(slop);
        return query;
    }
    MultiPhraseQuery query = new MultiPhraseQuery();
    for (int i = 0; i < termList.getLength(); i++) {
        Element elem = (Element) termList.item(i);
        String text = getText(elem);
        if (text.indexOf('?') > -1 || text.indexOf('*') > 0) {
            Term[] expanded = expandTerms(field, text);
            if (expanded.length > 0)
                query.add(expanded);
        } else {
            String termStr = getTerm(field, text, analyzer);
            if (termStr != null)
                query.add(new Term(field, termStr));
        }
    }
    int slop = getSlop(node);
    if (slop > -1)
        query.setSlop(slop);
    return query;
}