Example usage for org.apache.lucene.analysis.shingle ShingleFilter DEFAULT_TOKEN_TYPE

List of usage examples for org.apache.lucene.analysis.shingle ShingleFilter DEFAULT_TOKEN_TYPE

Introduction

In this page you can find the example usage for org.apache.lucene.analysis.shingle ShingleFilter DEFAULT_TOKEN_TYPE.

Prototype

String DEFAULT_TOKEN_TYPE

To view the source code for org.apache.lucene.analysis.shingle ShingleFilter DEFAULT_TOKEN_TYPE, use the Source Link below.

Click Source Link

Document

The default token type attribute value is "shingle".

Usage

From source file:org.codelibs.elasticsearch.search.suggest.phrase.NoisyChannelSpellChecker.java

License:Apache License

/**
 * Analyzes the token stream and builds spelling-correction candidates for each
 * emitted unigram position, then ranks candidate phrases with the supplied scorer.
 *
 * @param stream         analyzed tokens of the user input
 * @param generator      source of per-term correction candidates
 * @param maxErrors      maximum number/fraction of corrected terms allowed per result
 * @param numCorrections maximum number of corrections to return
 * @param wordScorer     scorer used to rank candidate phrases
 * @param confidence     if &gt; 0, only corrections scoring above
 *                       (score of the unmodified input) * confidence are returned
 * @param gramSize       n-gram size used by the scorer
 * @return the best corrections found, or {@code Result.EMPTY} when there is nothing to correct
 * @throws IOException if the token stream or the candidate generator fails
 */
public Result getCorrections(TokenStream stream, final CandidateGenerator generator, float maxErrors,
        int numCorrections, WordScorer wordScorer, float confidence, int gramSize) throws IOException {

    final List<CandidateSet> candidateSetsList = new ArrayList<>();
    DirectCandidateGenerator.analyze(stream, new DirectCandidateGenerator.TokenConsumer() {
        CandidateSet currentSet = null;
        private TypeAttribute typeAttribute;
        private final BytesRefBuilder termsRef = new BytesRefBuilder();
        private boolean anyUnigram = false;
        private boolean anyTokens = false;

        @Override
        public void reset(TokenStream stream) {
            super.reset(stream);
            typeAttribute = stream.addAttribute(TypeAttribute.class);
        }

        @Override
        public void nextToken() throws IOException {
            anyTokens = true;
            BytesRef term = fillBytesRef(termsRef);
            // Skip shingle tokens when unigrams are required. Compare with equals()
            // rather than reference identity (==): identity only works when the type
            // string is the exact interned Lucene constant.
            if (requireUnigram && ShingleFilter.DEFAULT_TOKEN_TYPE.equals(typeAttribute.type())) {
                return;
            }
            anyUnigram = true;
            if (posIncAttr.getPositionIncrement() == 0
                    && SynonymFilter.TYPE_SYNONYM.equals(typeAttribute.type())) {
                // A synonym at the same position extends the current candidate set.
                assert currentSet != null;
                long freq = 0;
                if ((freq = generator.frequency(term)) > 0) {
                    currentSet.addOneCandidate(
                            generator.createCandidate(BytesRef.deepCopyOf(term), freq, realWordLikelihood));
                }
            } else {
                // New token position: flush the previous candidate set and start a new one.
                if (currentSet != null) {
                    candidateSetsList.add(currentSet);
                }
                currentSet = new CandidateSet(Candidate.EMPTY,
                        generator.createCandidate(BytesRef.deepCopyOf(term), true));
            }
        }

        @Override
        public void end() {
            if (currentSet != null) {
                candidateSetsList.add(currentSet);
            }
            if (requireUnigram && !anyUnigram && anyTokens) {
                throw new IllegalStateException("At least one unigram is required but all tokens were ngrams");
            }
        }
    });

    if (candidateSetsList.isEmpty() || candidateSetsList.size() >= tokenLimit) {
        return Result.EMPTY;
    }

    for (CandidateSet candidateSet : candidateSetsList) {
        generator.drawCandidates(candidateSet);
    }
    // NOTE(review): Double.MIN_VALUE is the smallest POSITIVE double, not negative
    // infinity, so negative phrase scores would still pass this "no cutoff" sentinel.
    // Kept as-is to preserve behavior -- confirm the intent before changing.
    double cutoffScore = Double.MIN_VALUE;
    CandidateScorer scorer = new CandidateScorer(wordScorer, numCorrections, gramSize);
    CandidateSet[] candidateSets = candidateSetsList.toArray(new CandidateSet[candidateSetsList.size()]);
    if (confidence > 0.0) {
        // Score the unmodified input phrase; corrections must beat it scaled by
        // the confidence factor.
        Candidate[] candidates = new Candidate[candidateSets.length];
        for (int i = 0; i < candidates.length; i++) {
            candidates[i] = candidateSets[i].originalTerm;
        }
        double inputPhraseScore = scorer.score(candidates, candidateSets);
        cutoffScore = inputPhraseScore * confidence;
    }
    Correction[] bestCandidates = scorer.findBestCandiates(candidateSets, maxErrors, cutoffScore);

    return new Result(bestCandidates, cutoffScore);
}

From source file:org.elasticsearch.search.suggest.phrase.NoisyChannelSpellChecker.java

License:Apache License

/**
 * Analyzes the token stream and builds spelling-correction candidates for each
 * emitted unigram position, then ranks candidate phrases with the supplied scorer.
 *
 * @param stream         analyzed tokens of the user input
 * @param generator      source of per-term correction candidates
 * @param maxErrors      maximum number/fraction of corrected terms allowed per result
 * @param numCorrections maximum number of corrections to return
 * @param reader         index reader backing the scorer
 * @param wordScorer     scorer used to rank candidate phrases
 * @param separator      byte separator used between grams
 * @param confidence     if &gt; 0, only corrections scoring above
 *                       (score of the unmodified input) * confidence are returned
 * @param gramSize       n-gram size used by the scorer
 * @return the best corrections found, or {@code Result.EMPTY} when there is nothing to correct
 * @throws IOException if the token stream or the candidate generator fails
 */
public Result getCorrections(TokenStream stream, final CandidateGenerator generator, float maxErrors,
        int numCorrections, IndexReader reader, WordScorer wordScorer, BytesRef separator, float confidence,
        int gramSize) throws IOException {

    final List<CandidateSet> candidateSetsList = new ArrayList<DirectCandidateGenerator.CandidateSet>();
    SuggestUtils.analyze(stream, new SuggestUtils.TokenConsumer() {
        CandidateSet currentSet = null;
        private TypeAttribute typeAttribute;
        private final BytesRef termsRef = new BytesRef();
        private boolean anyUnigram = false;
        private boolean anyTokens = false;

        @Override
        public void reset(TokenStream stream) {
            super.reset(stream);
            typeAttribute = stream.addAttribute(TypeAttribute.class);
        }

        @Override
        public void nextToken() throws IOException {
            anyTokens = true;
            BytesRef term = fillBytesRef(termsRef);
            // Skip shingle tokens when unigrams are required. Compare with equals()
            // rather than reference identity (==): identity only works when the type
            // string is the exact interned Lucene constant.
            if (requireUnigram && ShingleFilter.DEFAULT_TOKEN_TYPE.equals(typeAttribute.type())) {
                return;
            }
            anyUnigram = true;
            if (posIncAttr.getPositionIncrement() == 0
                    && SynonymFilter.TYPE_SYNONYM.equals(typeAttribute.type())) {
                // A synonym at the same position extends the current candidate set.
                assert currentSet != null;
                long freq = 0;
                if ((freq = generator.frequency(term)) > 0) {
                    currentSet.addOneCandidate(
                            generator.createCandidate(BytesRef.deepCopyOf(term), freq, realWordLikelihood));
                }
            } else {
                // New token position: flush the previous candidate set and start a new one.
                if (currentSet != null) {
                    candidateSetsList.add(currentSet);
                }
                currentSet = new CandidateSet(Candidate.EMPTY,
                        generator.createCandidate(BytesRef.deepCopyOf(term), true));
            }
        }

        @Override
        public void end() {
            if (currentSet != null) {
                candidateSetsList.add(currentSet);
            }
            if (requireUnigram && !anyUnigram && anyTokens) {
                throw new IllegalStateException("At least one unigram is required but all tokens were ngrams");
            }
        }
    });

    if (candidateSetsList.isEmpty() || candidateSetsList.size() >= tokenLimit) {
        return Result.EMPTY;
    }

    for (CandidateSet candidateSet : candidateSetsList) {
        generator.drawCandidates(candidateSet);
    }
    // NOTE(review): Double.MIN_VALUE is the smallest POSITIVE double, not negative
    // infinity, so negative phrase scores would still pass this "no cutoff" sentinel.
    // Kept as-is to preserve behavior -- confirm the intent before changing.
    double cutoffScore = Double.MIN_VALUE;
    CandidateScorer scorer = new CandidateScorer(wordScorer, numCorrections, gramSize);
    CandidateSet[] candidateSets = candidateSetsList.toArray(new CandidateSet[candidateSetsList.size()]);
    if (confidence > 0.0) {
        // Score the unmodified input phrase; corrections must beat it scaled by
        // the confidence factor.
        Candidate[] candidates = new Candidate[candidateSets.length];
        for (int i = 0; i < candidates.length; i++) {
            candidates[i] = candidateSets[i].originalTerm;
        }
        double inputPhraseScore = scorer.score(candidates, candidateSets);
        cutoffScore = inputPhraseScore * confidence;
    }
    // Renamed from "findBestCandiates" (which shadowed the misspelled API method
    // name); the call itself must keep the misspelling -- it is the real method name.
    Correction[] bestCandidates = scorer.findBestCandiates(candidateSets, maxErrors, cutoffScore);

    return new Result(bestCandidates, cutoffScore);
}

From source file:org.opensextant.solrtexttagger.ConcatenateFilter.java

License:Open Source License

/**
 * Concatenates every token from the wrapped stream into a single token, joined
 * by the configured separator. Emits exactly one token per stream, then reports
 * end-of-stream on all subsequent calls.
 */
@Override
public final boolean incrementToken() throws IOException {
    if (done) {
        return false;
    }
    done = true;

    buf.setLength(0);
    // Join all upstream tokens, inserting the separator between consecutive terms.
    for (boolean first = true; input.incrementToken(); first = false) {
        if (!first) {
            buf.append(separator);
        }
        //TODO consider indexing special chars when posInc > 1 (stop words). We ignore for now. #13
        buf.append(termAtt);
    }
    // Consume end() now so the end-of-stream offset is available below.
    input.end();

    termAtt.setEmpty().append(buf);
    // The remaining attributes matter little for a single-token stream, but set
    // them to sensible values for thoroughness.
    offsetAtt.setOffset(0, offsetAtt.endOffset());
    posIncrAtt.setPositionIncrement(1);
    posLenAtt.setPositionLength(1); // or sum the positions? Probably unused anyway.
    typeAtt.setType(ShingleFilter.DEFAULT_TOKEN_TYPE); // "shingle"

    return true;
}