List of usage examples for org.apache.lucene.analysis.shingle.ShingleFilter.DEFAULT_TOKEN_TYPE
String DEFAULT_TOKEN_TYPE
To view the source code for org.apache.lucene.analysis.shingle.ShingleFilter.DEFAULT_TOKEN_TYPE, click the Source link.
From source file:org.codelibs.elasticsearch.search.suggest.phrase.NoisyChannelSpellChecker.java
License:Apache License
/**
 * Analyzes the token stream, builds one candidate set per phrase position via the
 * generator, and returns the best-scoring candidate corrections.
 *
 * @param stream         analyzed tokens of the input phrase
 * @param generator      source of spelling candidates and term frequencies
 * @param maxErrors      maximum number/fraction of corrected terms allowed per phrase
 * @param numCorrections maximum number of corrections to return
 * @param wordScorer     scorer used to rank candidate phrases
 * @param confidence     factor applied to the input phrase's own score to form a
 *                       cutoff; {@code 0} disables the cutoff
 * @param gramSize       n-gram size used by the scorer
 * @return best corrections plus the cutoff score, or {@code Result.EMPTY} when no
 *         candidate sets were produced or the token limit was reached
 * @throws IOException if reading the token stream fails
 */
public Result getCorrections(TokenStream stream, final CandidateGenerator generator, float maxErrors,
        int numCorrections, WordScorer wordScorer, float confidence, int gramSize) throws IOException {
    final List<CandidateSet> candidateSetsList = new ArrayList<>();
    DirectCandidateGenerator.analyze(stream, new DirectCandidateGenerator.TokenConsumer() {
        CandidateSet currentSet = null;
        private TypeAttribute typeAttribute;
        private final BytesRefBuilder termsRef = new BytesRefBuilder();
        private boolean anyUnigram = false;
        private boolean anyTokens = false;

        @Override
        public void reset(TokenStream stream) {
            super.reset(stream);
            typeAttribute = stream.addAttribute(TypeAttribute.class);
        }

        @Override
        public void nextToken() throws IOException {
            anyTokens = true;
            BytesRef term = fillBytesRef(termsRef);
            // Skip shingle tokens when unigrams are required. Use equals() rather
            // than ==: reference comparison only works while the attribute's type
            // string stays interned/identical to the constant.
            if (requireUnigram && ShingleFilter.DEFAULT_TOKEN_TYPE.equals(typeAttribute.type())) {
                return;
            }
            anyUnigram = true;
            if (posIncAttr.getPositionIncrement() == 0
                    && SynonymFilter.TYPE_SYNONYM.equals(typeAttribute.type())) {
                // A synonym occupies the same position as the previous token:
                // add it as an extra candidate to the current set instead of
                // starting a new one.
                assert currentSet != null;
                long freq = 0;
                if ((freq = generator.frequency(term)) > 0) {
                    currentSet.addOneCandidate(
                            generator.createCandidate(BytesRef.deepCopyOf(term), freq, realWordLikelihood));
                }
            } else {
                if (currentSet != null) {
                    candidateSetsList.add(currentSet);
                }
                currentSet = new CandidateSet(Candidate.EMPTY,
                        generator.createCandidate(BytesRef.deepCopyOf(term), true));
            }
        }

        @Override
        public void end() {
            // Flush the trailing candidate set that nextToken() never closed.
            if (currentSet != null) {
                candidateSetsList.add(currentSet);
            }
            if (requireUnigram && !anyUnigram && anyTokens) {
                throw new IllegalStateException("At least one unigram is required but all tokens were ngrams");
            }
        }
    });

    if (candidateSetsList.isEmpty() || candidateSetsList.size() >= tokenLimit) {
        return Result.EMPTY;
    }
    for (CandidateSet candidateSet : candidateSetsList) {
        generator.drawCandidates(candidateSet);
    }
    // NOTE(review): Double.MIN_VALUE is the smallest POSITIVE double, not the most
    // negative value. It acts as "effectively no cutoff" only as long as phrase
    // scores exceed it — confirm before relying on it as a true lower bound.
    double cutoffScore = Double.MIN_VALUE;
    CandidateScorer scorer = new CandidateScorer(wordScorer, numCorrections, gramSize);
    CandidateSet[] candidateSets = candidateSetsList.toArray(new CandidateSet[candidateSetsList.size()]);
    if (confidence > 0.0) {
        // Score the uncorrected input phrase; only corrections scoring above
        // inputPhraseScore * confidence will be returned.
        Candidate[] candidates = new Candidate[candidateSets.length];
        for (int i = 0; i < candidates.length; i++) {
            candidates[i] = candidateSets[i].originalTerm;
        }
        double inputPhraseScore = scorer.score(candidates, candidateSets);
        cutoffScore = inputPhraseScore * confidence;
    }
    Correction[] bestCandidates = scorer.findBestCandiates(candidateSets, maxErrors, cutoffScore);
    return new Result(bestCandidates, cutoffScore);
}
From source file:org.elasticsearch.search.suggest.phrase.NoisyChannelSpellChecker.java
License:Apache License
/**
 * Analyzes the token stream, builds one candidate set per phrase position via the
 * generator, and returns the best-scoring candidate corrections.
 *
 * @param stream         analyzed tokens of the input phrase
 * @param generator      source of spelling candidates and term frequencies
 * @param maxErrors      maximum number/fraction of corrected terms allowed per phrase
 * @param numCorrections maximum number of corrections to return
 * @param reader         index reader (passed through for candidate generation/scoring)
 * @param wordScorer     scorer used to rank candidate phrases
 * @param separator      byte separator used between terms of an n-gram
 * @param confidence     factor applied to the input phrase's own score to form a
 *                       cutoff; {@code 0} disables the cutoff
 * @param gramSize       n-gram size used by the scorer
 * @return best corrections plus the cutoff score, or {@code Result.EMPTY} when no
 *         candidate sets were produced or the token limit was reached
 * @throws IOException if reading the token stream fails
 */
public Result getCorrections(TokenStream stream, final CandidateGenerator generator, float maxErrors,
        int numCorrections, IndexReader reader, WordScorer wordScorer, BytesRef separator, float confidence,
        int gramSize) throws IOException {
    final List<CandidateSet> candidateSetsList = new ArrayList<>();
    SuggestUtils.analyze(stream, new SuggestUtils.TokenConsumer() {
        CandidateSet currentSet = null;
        private TypeAttribute typeAttribute;
        private final BytesRef termsRef = new BytesRef();
        private boolean anyUnigram = false;
        private boolean anyTokens = false;

        @Override
        public void reset(TokenStream stream) {
            super.reset(stream);
            typeAttribute = stream.addAttribute(TypeAttribute.class);
        }

        @Override
        public void nextToken() throws IOException {
            anyTokens = true;
            BytesRef term = fillBytesRef(termsRef);
            // Skip shingle tokens when unigrams are required. Use equals() rather
            // than ==: reference comparison only works while the attribute's type
            // string stays interned/identical to the constant.
            if (requireUnigram && ShingleFilter.DEFAULT_TOKEN_TYPE.equals(typeAttribute.type())) {
                return;
            }
            anyUnigram = true;
            if (posIncAttr.getPositionIncrement() == 0
                    && SynonymFilter.TYPE_SYNONYM.equals(typeAttribute.type())) {
                // A synonym occupies the same position as the previous token:
                // add it as an extra candidate to the current set instead of
                // starting a new one.
                assert currentSet != null;
                long freq = 0;
                if ((freq = generator.frequency(term)) > 0) {
                    currentSet.addOneCandidate(
                            generator.createCandidate(BytesRef.deepCopyOf(term), freq, realWordLikelihood));
                }
            } else {
                if (currentSet != null) {
                    candidateSetsList.add(currentSet);
                }
                currentSet = new CandidateSet(Candidate.EMPTY,
                        generator.createCandidate(BytesRef.deepCopyOf(term), true));
            }
        }

        @Override
        public void end() {
            // Flush the trailing candidate set that nextToken() never closed.
            if (currentSet != null) {
                candidateSetsList.add(currentSet);
            }
            if (requireUnigram && !anyUnigram && anyTokens) {
                throw new IllegalStateException("At least one unigram is required but all tokens were ngrams");
            }
        }
    });

    if (candidateSetsList.isEmpty() || candidateSetsList.size() >= tokenLimit) {
        return Result.EMPTY;
    }
    for (CandidateSet candidateSet : candidateSetsList) {
        generator.drawCandidates(candidateSet);
    }
    // NOTE(review): Double.MIN_VALUE is the smallest POSITIVE double, not the most
    // negative value. It acts as "effectively no cutoff" only as long as phrase
    // scores exceed it — confirm before relying on it as a true lower bound.
    double cutoffScore = Double.MIN_VALUE;
    CandidateScorer scorer = new CandidateScorer(wordScorer, numCorrections, gramSize);
    CandidateSet[] candidateSets = candidateSetsList.toArray(new CandidateSet[candidateSetsList.size()]);
    if (confidence > 0.0) {
        // Score the uncorrected input phrase; only corrections scoring above
        // inputPhraseScore * confidence will be returned.
        Candidate[] candidates = new Candidate[candidateSets.length];
        for (int i = 0; i < candidates.length; i++) {
            candidates[i] = candidateSets[i].originalTerm;
        }
        double inputPhraseScore = scorer.score(candidates, candidateSets);
        cutoffScore = inputPhraseScore * confidence;
    }
    // (Scorer method name keeps its historical misspelling; it is an external API.)
    Correction[] bestCorrections = scorer.findBestCandiates(candidateSets, maxErrors, cutoffScore);
    return new Result(bestCorrections, cutoffScore);
}
From source file:org.opensextant.solrtexttagger.ConcatenateFilter.java
License:Open Source License
@Override public final boolean incrementToken() throws IOException { if (done)//from w ww .j a va 2s . c om return false; done = true; buf.setLength(0); boolean firstTerm = true; while (input.incrementToken()) { if (!firstTerm) { buf.append(separator); } //TODO consider indexing special chars when posInc > 1 (stop words). We ignore for now. #13 buf.append(termAtt); firstTerm = false; } input.end();//call here so we can see end of stream offsets termAtt.setEmpty().append(buf); //Setting the other attributes ultimately won't have much effect but lets be thorough offsetAtt.setOffset(0, offsetAtt.endOffset()); posIncrAtt.setPositionIncrement(1); posLenAtt.setPositionLength(1);//or do we add up the positions? Probably not used any way. typeAtt.setType(ShingleFilter.DEFAULT_TOKEN_TYPE);//"shingle" return true; }