List of usage examples for org.apache.lucene.analysis Token length
@Override
public final int length()
From source file:com.mhs.qsol.proximity.ProximityVisitor.java
License:Apache License
/** * Converts a token, as defined in the qsol.jtb JavaCC file, into an * appropriate query.// w w w .j av a 2 s .co m * * @param token * @return */ protected Query tokenToQuery(String token) { if (logger.isLoggable(Level.FINE)) { // logger.fine("Query tokenToQuery(String token) : token:" + token); } if (logger.isLoggable(Level.FINE)) { logger.fine("Query tokenToQuery(String token) : token:" + token); } token = removeEscapeChars(token); TokenStream source = analyzer.tokenStream(field, new StringReader(token)); CharTermAttribute charTermAtrib = source.getAttribute(CharTermAttribute.class); OffsetAttribute offsetAtrib = source.getAttribute(OffsetAttribute.class); PositionIncrementAttribute posIncAtt = source.addAttribute(PositionIncrementAttribute.class); ArrayList<Token> v = new ArrayList<Token>(); Token t; int positionCount = 0; boolean severalTokensAtSamePosition = false; while (true) { try { if (!source.incrementToken()) { break; } t = new Token(charTermAtrib.buffer(), 0, charTermAtrib.length(), offsetAtrib.startOffset(), offsetAtrib.endOffset()); t.setPositionIncrement(posIncAtt.getPositionIncrement()); } catch (IOException e) { t = null; } if (t == null) { break; } v.add(t); if (t.getPositionIncrement() != 0) { positionCount += t.getPositionIncrement(); } else { severalTokensAtSamePosition = true; } } try { source.close(); } catch (IOException e) { // ignore } if (v.size() == 0) { return null; } else if (v.size() == 1) { t = v.get(0); SpanTermQuery stq = new SpanTermQuery(new Term(field, new String(t.buffer(), 0, t.length()))); stq.setBoost(this.boost); return stq; } else { if (severalTokensAtSamePosition) { if (positionCount == 1) { // no phrase query: SpanQuery[] spanQueries = new SpanQuery[v.size()]; StringBuilder regex = new StringBuilder(); for (int i = 0; i < v.size(); i++) { spanQueries[i] = new SpanTermQuery(new Term(field, regex.toString())); } return new SpanOrQuery(spanQueries); } else { // All the Tokens in each sub-list are positioned at the the 
same location. ArrayList<ArrayList<Token>> identicallyPositionedTokenLists = new ArrayList<ArrayList<Token>>(); for (int i = 0; i < v.size(); i++) { if ((i == 0) || (v.get(i).getPositionIncrement() > 0)) { identicallyPositionedTokenLists.add(new ArrayList<Token>()); } ArrayList<Token> curList = identicallyPositionedTokenLists .get(identicallyPositionedTokenLists.size() - 1); curList.add(v.get(i)); } ArrayList<SpanQuery> spanNearSubclauses = new ArrayList<SpanQuery>(); for (int listNum = 0; listNum < identicallyPositionedTokenLists.size(); listNum++) { ArrayList<Token> curTokens = identicallyPositionedTokenLists.get(listNum); ArrayList<SpanTermQuery> curTermQueries = new ArrayList<SpanTermQuery>(); for (int tokenNum = 0; tokenNum < curTokens.size(); tokenNum++) { SpanTermQuery termQuery = new SpanTermQuery( new Term(field, curTokens.get(tokenNum).term())); termQuery.setBoost(this.boost); curTermQueries.add(termQuery); } int size = curTermQueries.size(); if (size <= 0) continue; else if (size == 1) spanNearSubclauses.add(curTermQueries.get(0)); else spanNearSubclauses.add(new SpanOrQuery(curTermQueries.toArray(new SpanQuery[0]))); } SpanNearQuery query = new SpanNearQuery( (SpanQuery[]) spanNearSubclauses.toArray(new SpanQuery[0]), slop, true); return query; } } else { SpanTermQuery[] clauses = new SpanTermQuery[v.size()]; for (int i = 0; i < v.size(); i++) { Token t2 = v.get(i); clauses[i] = new SpanTermQuery(new Term(field, new String(t2.buffer(), 0, t2.length()))); } SpanNearQuery query = new SpanNearQuery(clauses, slop, true); return query; } } }
From source file:com.mhs.qsol.QsolToQueryVisitor.java
License:Apache License
/**
 * Converts a token, as defined in the qsol.jtb JavaCC file, into an
 * appropriate query.
 *
 * The raw text is run through the configured analyzer and the result is:
 * a TermQuery for a single analyzed token; a BooleanQuery of SHOULD
 * clauses when all tokens share one position (e.g. pure synonyms); or an
 * ordered SpanNearQuery otherwise.
 *
 * @param token raw token text from the parser (may contain escape characters)
 * @return the built query, or null if analysis produced no tokens
 */
protected Query tokenToQuery(String token) {
    token = removeEscapeChars(token);

    // Run the text through the analyzer and snapshot every produced token.
    TokenStream source = analyzer.tokenStream(field, new StringReader(token));
    ArrayList<Token> v = new ArrayList<Token>();
    Token t;
    int positionCount = 0;
    boolean severalTokensAtSamePosition = false;
    CharTermAttribute charTermAtrib = source.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtrib = source.getAttribute(OffsetAttribute.class);
    PositionIncrementAttribute posIncAtt = source.addAttribute(PositionIncrementAttribute.class);
    while (true) {
        try {
            if (!source.incrementToken()) {
                break;
            }
            // Copy the current attribute state into a standalone Token.
            t = new Token(charTermAtrib.buffer(), 0, charTermAtrib.length(), offsetAtrib.startOffset(),
                    offsetAtrib.endOffset());
            t.setPositionIncrement(posIncAtt.getPositionIncrement());
        } catch (IOException e) {
            t = null;
        }
        if (t == null) {
            break;
        }
        v.add(t);
        if (t.getPositionIncrement() != 0) {
            positionCount += t.getPositionIncrement();
        } else {
            // A zero increment means this token stacks on the previous position.
            severalTokensAtSamePosition = true;
        }
    }
    try {
        source.close();
    } catch (IOException e) {
        // ignore
    }
    if (v.size() == 0) {
        // null's will get cleaned up in visitBooleanOp
        return null;
    } else if (v.size() == 1) {
        t = v.get(0);
        TermQuery termQuery = new TermQuery(new Term(field, new String(t.buffer(), 0, t.length())));
        termQuery.setBoost(this.boost);
        return termQuery;
    } else {
        if (severalTokensAtSamePosition) {
            if (positionCount == 1) {
                // no phrase query: all tokens occupy one position, so OR them.
                BooleanQuery q = new BooleanQuery(true);
                for (int i = 0; i < v.size(); i++) {
                    t = v.get(i);
                    TermQuery currentQuery = new TermQuery(
                            new Term(field, new String(t.buffer(), 0, t.length())));
                    currentQuery.setBoost(this.boost);
                    q.add(currentQuery, BooleanClause.Occur.SHOULD);
                }
                return q;
            } else {
                // All the Tokens in each sub-list are positioned at the same location.
                ArrayList<ArrayList<Token>> identicallyPositionedTokenLists = new ArrayList<ArrayList<Token>>();
                for (int i = 0; i < v.size(); i++) {
                    // Start a new group whenever the position advances.
                    if ((i == 0) || (v.get(i).getPositionIncrement() > 0)) {
                        identicallyPositionedTokenLists.add(new ArrayList<Token>());
                    }
                    ArrayList<Token> curList = identicallyPositionedTokenLists
                            .get(identicallyPositionedTokenLists.size() - 1);
                    curList.add(v.get(i));
                }
                // Each same-position group becomes one SpanNear sub-clause:
                // a single SpanTermQuery, or a SpanOrQuery over the group.
                ArrayList<SpanQuery> spanNearSubclauses = new ArrayList<SpanQuery>();
                for (int listNum = 0; listNum < identicallyPositionedTokenLists.size(); listNum++) {
                    ArrayList<Token> curTokens = identicallyPositionedTokenLists.get(listNum);
                    ArrayList<SpanTermQuery> curTermQueries = new ArrayList<SpanTermQuery>();
                    for (int tokenNum = 0; tokenNum < curTokens.size(); tokenNum++) {
                        SpanTermQuery termQuery = new SpanTermQuery(
                                new Term(field, curTokens.get(tokenNum).term()));
                        termQuery.setBoost(this.boost);
                        curTermQueries.add(termQuery);
                    }
                    int size = curTermQueries.size();
                    if (size <= 0)
                        continue;
                    else if (size == 1)
                        spanNearSubclauses.add(curTermQueries.get(0));
                    else
                        spanNearSubclauses.add(new SpanOrQuery(curTermQueries.toArray(new SpanQuery[0])));
                }
                SpanNearQuery query = new SpanNearQuery(
                        (SpanQuery[]) spanNearSubclauses.toArray(new SpanQuery[0]), slop, true);
                return query;
            }
        } else {
            // Plain multi-token case: build an ordered span-near "phrase".
            SpanTermQuery[] clauses = new SpanTermQuery[v.size()];
            for (int i = 0; i < v.size(); i++) {
                Token t2 = v.get(i);
                SpanTermQuery spanQuery = new SpanTermQuery(
                        new Term(field, new String(t2.buffer(), 0, t2.length())));
                spanQuery.setBoost(boost);
                clauses[i] = spanQuery;
            }
            // Note: There's a bug here (not by me) where term offsets are not respected.
            SpanNearQuery query = new SpanNearQuery(clauses, slop, true);
            return query;
        }
    }
}
From source file:com.zb.mmseg.analysis.CutLetterDigitFilter.java
License:Open Source License
/**
 * Returns the next token, splitting mixed letter/digit tokens into
 * separate sub-tokens (one per run of same-class characters).
 * Sub-tokens produced by a split are buffered in {@code tokenQueue} and
 * drained first on subsequent calls.
 *
 * @param reusableToken scratch token reused by the underlying stream; must not be null
 * @return the next token, or null when the input is exhausted
 */
private Token nextToken(Token reusableToken) throws IOException {
    assert reusableToken != null;
    // Drain any sub-tokens queued by a previous split before reading new input.
    Token nextToken = tokenQueue.poll();
    if (nextToken != null) {
        return nextToken;
    }
    nextToken = TokenUtils.nextToken(input, reusableToken);
    // Only tokens typed as mixed letter/digit get split; others pass through unchanged.
    if (nextToken != null && (MMSegWord.TYPE_LETTER_OR_DIGIT.equalsIgnoreCase(nextToken.type())
            || MMSegWord.TYPE_DIGIT_OR_LETTER.equalsIgnoreCase(nextToken.type()))) {
        final char[] buffer = nextToken.buffer();
        final int length = nextToken.length();
        // NOTE(review): lastType takes the raw Character.getType of buffer[0] without
        // the letter-collapsing applied below; if buffer[0] is e.g. an uppercase letter,
        // the first loop iteration sees a "change" and calls addToken with length 0 —
        // confirm addToken tolerates zero-length runs.
        byte lastType = (byte) Character.getType(buffer[0]);
        int termBufferOffset = 0;
        int termBufferLength = 0;
        for (int i = 0; i < length; i++) {
            byte type = (byte) Character.getType(buffer[i]);
            // Collapse all letter categories (UPPERCASE..MODIFIER_LETTER) into one
            // class so case/letter-kind changes do not split a run of letters.
            if (type <= Character.MODIFIER_LETTER) {
                type = Character.LOWERCASE_LETTER;
            }
            if (type != lastType) {
                // Character class changed: queue the finished run as a sub-token.
                addToken(nextToken, termBufferOffset, termBufferLength, lastType);
                termBufferOffset += termBufferLength;
                termBufferLength = 0;
                lastType = type;
            }
            termBufferLength++;
        }
        if (termBufferLength > 0) {
            // Queue the trailing run.
            addToken(nextToken, termBufferOffset, termBufferLength, lastType);
        }
        // Return the first queued sub-token.
        nextToken = tokenQueue.poll();
    }
    return nextToken;
}
From source file:com.zb.mmseg.analysis.CutLetterDigitFilter.java
License:Open Source License
/**
 * Advances the stream by one (possibly split) token, copying its term,
 * offsets and type into this stream's attributes.
 *
 * @return true if a token was produced, false once the input is exhausted
 */
public final boolean incrementToken() throws IOException {
    clearAttributes();
    final Token next = nextToken(reusableToken);
    if (next == null) {
        // Input exhausted: signal end-of-stream to downstream consumers.
        end();
        return false;
    }
    termAtt.copyBuffer(next.buffer(), 0, next.length());
    offsetAtt.setOffset(next.startOffset(), next.endOffset());
    typeAtt.setType(next.type());
    return true;
}
From source file:org.apache.solr.analysis.BufferedTokenStream.java
License:Apache License
/** old api emulation for back compat */ private boolean writeToken(Token token) throws IOException { clearAttributes();//from w w w. j a v a2 s . c o m termAtt.copyBuffer(token.buffer(), 0, token.length()); offsetAtt.setOffset(token.startOffset(), token.endOffset()); typeAtt.setType(token.type()); flagsAtt.setFlags(token.getFlags()); posIncAtt.setPositionIncrement(token.getPositionIncrement()); payloadAtt.setPayload(token.getPayload()); return true; }
From source file:org.apache.solr.analysis.SlowSynonymFilter.java
License:Apache License
/**
 * Emits the next token, applying synonym substitution. Buffered replacement
 * tokens are drained first; otherwise the next input token is matched
 * against the synonym map and, on a match, a merged replacement sequence
 * is generated with position increments adjusted so original and synonym
 * tokens interleave at the correct positions.
 */
@Override
public boolean incrementToken() throws IOException {
    while (true) {
        // if there are any generated tokens, return them... don't try any
        // matches against them, as we specifically don't want recursion.
        if (replacement != null && replacement.hasNext()) {
            copy(this, replacement.next());
            return true;
        }
        // common case fast-path of first token not matching anything
        AttributeSource firstTok = nextTok();
        if (firstTok == null)
            return false;
        CharTermAttribute termAtt = firstTok.addAttribute(CharTermAttribute.class);
        SlowSynonymMap result = map.submap != null ? map.submap.get(termAtt.buffer(), 0, termAtt.length())
                : null;
        if (result == null) {
            copy(this, firstTok);
            return true;
        }
        // fast-path failed, clone ourselves if needed
        if (firstTok == this)
            firstTok = cloneAttributes();
        // OK, we matched a token, so find the longest match.
        matched = new LinkedList<AttributeSource>();
        result = match(result);
        if (result == null) {
            // no match, simply return the first token read.
            copy(this, firstTok);
            return true;
        }
        // reuse, or create new one each time?
        ArrayList<AttributeSource> generated = new ArrayList<AttributeSource>(
                result.synonyms.length + matched.size() + 1);
        //
        // there was a match... let's generate the new tokens, merging
        // in the matched tokens (position increments need adjusting)
        //
        AttributeSource lastTok = matched.isEmpty() ? firstTok : matched.getLast();
        boolean includeOrig = result.includeOrig();
        AttributeSource origTok = includeOrig ? firstTok : null;
        PositionIncrementAttribute firstPosIncAtt = firstTok.addAttribute(PositionIncrementAttribute.class);
        int origPos = firstPosIncAtt.getPositionIncrement(); // position of origTok in the original stream
        int repPos = 0; // curr position in replacement token stream
        int pos = 0; // current position in merged token stream
        for (int i = 0; i < result.synonyms.length; i++) {
            Token repTok = result.synonyms[i];
            AttributeSource newTok = firstTok.cloneAttributes();
            CharTermAttribute newTermAtt = newTok.addAttribute(CharTermAttribute.class);
            OffsetAttribute newOffsetAtt = newTok.addAttribute(OffsetAttribute.class);
            PositionIncrementAttribute newPosIncAtt = newTok.addAttribute(PositionIncrementAttribute.class);
            OffsetAttribute lastOffsetAtt = lastTok.addAttribute(OffsetAttribute.class);
            // Span the replacement token's offsets across the whole matched region.
            newOffsetAtt.setOffset(newOffsetAtt.startOffset(), lastOffsetAtt.endOffset());
            newTermAtt.copyBuffer(repTok.buffer(), 0, repTok.length());
            repPos += repTok.getPositionIncrement();
            if (i == 0)
                repPos = origPos; // make position of first token equal to original
            // if necessary, insert original tokens and adjust position increment
            while (origTok != null && origPos <= repPos) {
                PositionIncrementAttribute origPosInc = origTok.addAttribute(PositionIncrementAttribute.class);
                origPosInc.setPositionIncrement(origPos - pos);
                generated.add(origTok);
                pos += origPosInc.getPositionIncrement();
                origTok = matched.isEmpty() ? null : matched.removeFirst();
                if (origTok != null) {
                    origPosInc = origTok.addAttribute(PositionIncrementAttribute.class);
                    origPos += origPosInc.getPositionIncrement();
                }
            }
            newPosIncAtt.setPositionIncrement(repPos - pos);
            generated.add(newTok);
            pos += newPosIncAtt.getPositionIncrement();
        }
        // finish up any leftover original tokens
        while (origTok != null) {
            PositionIncrementAttribute origPosInc = origTok.addAttribute(PositionIncrementAttribute.class);
            origPosInc.setPositionIncrement(origPos - pos);
            generated.add(origTok);
            pos += origPosInc.getPositionIncrement();
            origTok = matched.isEmpty() ? null : matched.removeFirst();
            if (origTok != null) {
                origPosInc = origTok.addAttribute(PositionIncrementAttribute.class);
                origPos += origPosInc.getPositionIncrement();
            }
        }
        // what if we replaced a longer sequence with a shorter one?
        // a/0 b/5 => foo/0
        // should I re-create the gap on the next buffered token?
        replacement = generated.iterator();
        // Now return to the top of the loop to read and return the first
        // generated token.. The reason this is done is that we may have generated
        // nothing at all, and may need to continue with more matching logic.
    }
}
From source file:org.apache.solr.analysis.SlowSynonymMap.java
License:Apache License
/** * Merge two lists of tokens, producing a single list with manipulated positionIncrements so that * the tokens end up at the same position. * * Example: [a b] merged with [c d] produces [a/b c/d] ('/' denotes tokens in the same position) * Example: [a,5 b,2] merged with [c d,4 e,4] produces [c a,5/d b,2 e,2] (a,n means a has posInc=n) * *///from w w w .jav a 2s. com public static List<Token> mergeTokens(List<Token> lst1, List<Token> lst2) { ArrayList<Token> result = new ArrayList<Token>(); if (lst1 == null || lst2 == null) { if (lst2 != null) result.addAll(lst2); if (lst1 != null) result.addAll(lst1); return result; } int pos = 0; Iterator<Token> iter1 = lst1.iterator(); Iterator<Token> iter2 = lst2.iterator(); Token tok1 = iter1.hasNext() ? iter1.next() : null; Token tok2 = iter2.hasNext() ? iter2.next() : null; int pos1 = tok1 != null ? tok1.getPositionIncrement() : 0; int pos2 = tok2 != null ? tok2.getPositionIncrement() : 0; while (tok1 != null || tok2 != null) { while (tok1 != null && (pos1 <= pos2 || tok2 == null)) { Token tok = new Token(tok1.startOffset(), tok1.endOffset(), tok1.type()); tok.copyBuffer(tok1.buffer(), 0, tok1.length()); tok.setPositionIncrement(pos1 - pos); result.add(tok); pos = pos1; tok1 = iter1.hasNext() ? iter1.next() : null; pos1 += tok1 != null ? tok1.getPositionIncrement() : 0; } while (tok2 != null && (pos2 <= pos1 || tok1 == null)) { Token tok = new Token(tok2.startOffset(), tok2.endOffset(), tok2.type()); tok.copyBuffer(tok2.buffer(), 0, tok2.length()); tok.setPositionIncrement(pos2 - pos); result.add(tok); pos = pos2; tok2 = iter2.hasNext() ? iter2.next() : null; pos2 += tok2 != null ? tok2.getPositionIncrement() : 0; } } return result; }
From source file:org.apache.solr.analysis.TestSynonymMap.java
License:Apache License
/**
 * Asserts that the synonym entry stored under {@code src} includes a token
 * whose text equals {@code exp}.
 */
private void assertTokIncludes(SynonymMap map, String src, String exp) throws Exception {
    boolean found = false;
    for (Token candidate : map.submap.get(src).synonyms) {
        // Compare against the token's term text.
        String text = new String(candidate.buffer(), 0, candidate.length());
        found = found || exp.equals(text);
    }
    assertTrue(found);
}
From source file:org.apache.solr.handler.component.SpellCheckComponent.java
License:Apache License
protected NamedList toNamedList(boolean shardRequest, SpellingResult spellingResult, String origQuery, boolean extendedResults, boolean collate, boolean correctlySpelled) { NamedList result = new NamedList(); Map<Token, LinkedHashMap<String, Integer>> suggestions = spellingResult.getSuggestions(); boolean hasFreqInfo = spellingResult.hasTokenFrequencyInfo(); boolean hasSuggestions = false; boolean hasZeroFrequencyToken = false; for (Map.Entry<Token, LinkedHashMap<String, Integer>> entry : suggestions.entrySet()) { Token inputToken = entry.getKey(); String tokenString = new String(inputToken.buffer(), 0, inputToken.length()); Map<String, Integer> theSuggestions = new LinkedHashMap<String, Integer>(entry.getValue()); Iterator<String> sugIter = theSuggestions.keySet().iterator(); while (sugIter.hasNext()) { String sug = sugIter.next(); if (sug.equals(tokenString)) { sugIter.remove();/*from w w w . j a va 2 s . c o m*/ } } if (theSuggestions.size() > 0) { hasSuggestions = true; } if (theSuggestions != null && (theSuggestions.size() > 0 || shardRequest)) { SimpleOrderedMap suggestionList = new SimpleOrderedMap(); suggestionList.add("numFound", theSuggestions.size()); suggestionList.add("startOffset", inputToken.startOffset()); suggestionList.add("endOffset", inputToken.endOffset()); // Logical structure of normal (non-extended) results: // "suggestion":["alt1","alt2"] // // Logical structure of the extended results: // "suggestion":[ // {"word":"alt1","freq":7}, // {"word":"alt2","freq":4} // ] if (extendedResults && hasFreqInfo) { suggestionList.add("origFreq", spellingResult.getTokenFrequency(inputToken)); ArrayList<SimpleOrderedMap> sugs = new ArrayList<SimpleOrderedMap>(); suggestionList.add("suggestion", sugs); for (Map.Entry<String, Integer> suggEntry : theSuggestions.entrySet()) { SimpleOrderedMap sugEntry = new SimpleOrderedMap(); sugEntry.add("word", suggEntry.getKey()); sugEntry.add("freq", suggEntry.getValue()); sugs.add(sugEntry); } } else { 
suggestionList.add("suggestion", theSuggestions.keySet()); } if (hasFreqInfo) { int tokenFrequency = spellingResult.getTokenFrequency(inputToken); if (tokenFrequency == 0) { hasZeroFrequencyToken = true; } } result.add(tokenString, suggestionList); } } if (extendedResults) { result.add("correctlySpelled", correctlySpelled); } return result; }
From source file:org.apache.solr.spelling.AbstractLuceneSpellChecker.java
License:Apache License
/**
 * Produces spelling suggestions for every token in {@code options.tokens}
 * using the wrapped Lucene SpellChecker, optionally attaching document
 * frequencies (extendedResults) and optionally treating already-indexed
 * terms as correctable (alternativeTermCount).
 *
 * @param options tokens, accuracy, counts and flags controlling suggestion behavior
 * @return suggestions (and, when requested, frequencies) per input token
 * @throws IOException on index access failure
 */
@Override
public SpellingResult getSuggestions(SpellingOptions options) throws IOException {
    SpellingResult result = new SpellingResult(options.tokens);
    IndexReader reader = determineReader(options.reader);
    Term term = field != null ? new Term(field, "") : null;
    // Float.MIN_VALUE is the "unset" sentinel: fall back to the checker's accuracy.
    float theAccuracy = (options.accuracy == Float.MIN_VALUE) ? spellChecker.getAccuracy() : options.accuracy;
    int count = Math.max(options.count, AbstractLuceneSpellChecker.DEFAULT_SUGGESTION_COUNT);
    for (Token token : options.tokens) {
        String tokenText = new String(token.buffer(), 0, token.length());
        term = new Term(field, tokenText);
        int docFreq = 0;
        if (reader != null) {
            docFreq = reader.docFreq(term);
        }
        // For terms already in the index, limit suggestions to alternativeTermCount.
        String[] suggestions = spellChecker.suggestSimilar(tokenText,
                ((options.alternativeTermCount == null || docFreq == 0) ? count
                        : options.alternativeTermCount),
                field != null ? reader : null, // workaround LUCENE-1295
                field, options.suggestMode, theAccuracy);
        if (suggestions.length == 1 && suggestions[0].equals(tokenText)
                && options.alternativeTermCount == null) {
            // These are spelled the same, continue on
            continue;
        }
        // If considering alternatives to "correctly-spelled" terms, then add the
        // original as a viable suggestion.
        if (options.alternativeTermCount != null && docFreq > 0) {
            boolean foundOriginal = false;
            String[] suggestionsWithOrig = new String[suggestions.length + 1];
            for (int i = 0; i < suggestions.length; i++) {
                if (suggestions[i].equals(tokenText)) {
                    // Original already present: the partially-filled copy is discarded below.
                    foundOriginal = true;
                    break;
                }
                suggestionsWithOrig[i + 1] = suggestions[i];
            }
            if (!foundOriginal) {
                // Prepend the original term to the suggestion list.
                suggestionsWithOrig[0] = tokenText;
                suggestions = suggestionsWithOrig;
            }
        }
        if (options.extendedResults == true && reader != null && field != null) {
            // Extended path: record the original term's frequency and each
            // suggestion's frequency (up to options.count).
            result.addFrequency(token, docFreq);
            int countLimit = Math.min(options.count, suggestions.length);
            if (countLimit > 0) {
                for (int i = 0; i < countLimit; i++) {
                    term = new Term(field, suggestions[i]);
                    result.add(token, suggestions[i], reader.docFreq(term));
                }
            } else {
                List<String> suggList = Collections.emptyList();
                result.add(token, suggList);
            }
        } else {
            // Plain path: just the suggestion strings, truncated to options.count.
            if (suggestions.length > 0) {
                List<String> suggList = Arrays.asList(suggestions);
                if (suggestions.length > options.count) {
                    suggList = suggList.subList(0, options.count);
                }
                result.add(token, suggList);
            } else {
                List<String> suggList = Collections.emptyList();
                result.add(token, suggList);
            }
        }
    }
    return result;
}