List of usage examples for org.apache.lucene.analysis Token setPositionIncrement
@Override public void setPositionIncrement(int positionIncrement)
From source file:analysis.StandardTokenizer.java
License:Apache License
@SuppressWarnings("deprecation") public Token next(final Token reusableToken) throws IOException { assert reusableToken != null; int posIncr = 1; Token result = reusableToken;//from w w w .ja va 2 s.co m if (tokenList.size() > 0) return tokenList.remove(); while (true) { int tokenType = scanner.getNextToken(); if (tokenType == StandardTokenizerImpl.YYEOF) { return null; } if (scanner.yylength() <= maxTokenLength) { reusableToken.clear(); reusableToken.setPositionIncrement(posIncr); scanner.getText(reusableToken); final int start = scanner.yychar(); reusableToken.setStartOffset(start); reusableToken.setEndOffset(start + reusableToken.termLength()); // This 'if' should be removed in the next release. For now, it // converts // invalid acronyms to HOST. When removed, only the 'else' part // should // remain. if (tokenType == StandardTokenizerImpl.ACRONYM_DEP) { if (replaceInvalidAcronym) { reusableToken.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.HOST]); reusableToken.setTermLength(reusableToken.termLength() - 1); // remove // extra // '.' 
tokenType = StandardTokenizerImpl.HOST; } else { reusableToken.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM]); tokenType = StandardTokenizerImpl.ACRONYM; } } else { reusableToken.setType(StandardTokenizerImpl.TOKEN_TYPES[tokenType]); } if (tokenType == StandardTokenizerImpl.HOST || tokenType == StandardTokenizerImpl.NUM || tokenType == StandardTokenizerImpl.ALPHANUM) { Tokenizer lt = new LetterDigitBreakTokenizer(new StringReader(reusableToken.term())); Token tk = null; int st = reusableToken.startOffset(); final Token token = new Token(); while ((tk = lt.next(token)) != null) { tk.setStartOffset(tk.startOffset() + st); tk.setEndOffset(tk.endOffset() + st); tk.setType(reusableToken.type()); tokenList.add((Token) tk.clone()); } } if (tokenList.size() > 0) result = tokenList.remove(); return result; } else // When we skip a too-long term, we still increment the // position increment posIncr++; } }
From source file:com.globalsight.ling.lucene.analysis.pl.PolishFilter.java
License:Apache License
/** Returns the next input Token, after being stemmed */ public final Token next() throws IOException { Token token = getNextToken();//w ww . j a va2 s .c o m if (token == null) { return null; } else { String s = stemmer.stem(token.toString(), true); if (!s.equals(token.toString())) { // reconstruct the input token. This is silly... Token res = new Token(s, token.startOffset(), token.endOffset(), token.type()); res.setPositionIncrement(token.getPositionIncrement()); return res; } return token; } }
From source file:com.mhs.qsol.proximity.ProximityVisitor.java
License:Apache License
/** * Converts a token, as defined in the qsol.jtb JavaCC file, into an * appropriate query./*from w w w . ja v a2 s.c o m*/ * * @param token * @return */ protected Query tokenToQuery(String token) { if (logger.isLoggable(Level.FINE)) { // logger.fine("Query tokenToQuery(String token) : token:" + token); } if (logger.isLoggable(Level.FINE)) { logger.fine("Query tokenToQuery(String token) : token:" + token); } token = removeEscapeChars(token); TokenStream source = analyzer.tokenStream(field, new StringReader(token)); CharTermAttribute charTermAtrib = source.getAttribute(CharTermAttribute.class); OffsetAttribute offsetAtrib = source.getAttribute(OffsetAttribute.class); PositionIncrementAttribute posIncAtt = source.addAttribute(PositionIncrementAttribute.class); ArrayList<Token> v = new ArrayList<Token>(); Token t; int positionCount = 0; boolean severalTokensAtSamePosition = false; while (true) { try { if (!source.incrementToken()) { break; } t = new Token(charTermAtrib.buffer(), 0, charTermAtrib.length(), offsetAtrib.startOffset(), offsetAtrib.endOffset()); t.setPositionIncrement(posIncAtt.getPositionIncrement()); } catch (IOException e) { t = null; } if (t == null) { break; } v.add(t); if (t.getPositionIncrement() != 0) { positionCount += t.getPositionIncrement(); } else { severalTokensAtSamePosition = true; } } try { source.close(); } catch (IOException e) { // ignore } if (v.size() == 0) { return null; } else if (v.size() == 1) { t = v.get(0); SpanTermQuery stq = new SpanTermQuery(new Term(field, new String(t.buffer(), 0, t.length()))); stq.setBoost(this.boost); return stq; } else { if (severalTokensAtSamePosition) { if (positionCount == 1) { // no phrase query: SpanQuery[] spanQueries = new SpanQuery[v.size()]; StringBuilder regex = new StringBuilder(); for (int i = 0; i < v.size(); i++) { spanQueries[i] = new SpanTermQuery(new Term(field, regex.toString())); } return new SpanOrQuery(spanQueries); } else { // All the Tokens in each sub-list are positioned at the 
the same location. ArrayList<ArrayList<Token>> identicallyPositionedTokenLists = new ArrayList<ArrayList<Token>>(); for (int i = 0; i < v.size(); i++) { if ((i == 0) || (v.get(i).getPositionIncrement() > 0)) { identicallyPositionedTokenLists.add(new ArrayList<Token>()); } ArrayList<Token> curList = identicallyPositionedTokenLists .get(identicallyPositionedTokenLists.size() - 1); curList.add(v.get(i)); } ArrayList<SpanQuery> spanNearSubclauses = new ArrayList<SpanQuery>(); for (int listNum = 0; listNum < identicallyPositionedTokenLists.size(); listNum++) { ArrayList<Token> curTokens = identicallyPositionedTokenLists.get(listNum); ArrayList<SpanTermQuery> curTermQueries = new ArrayList<SpanTermQuery>(); for (int tokenNum = 0; tokenNum < curTokens.size(); tokenNum++) { SpanTermQuery termQuery = new SpanTermQuery( new Term(field, curTokens.get(tokenNum).term())); termQuery.setBoost(this.boost); curTermQueries.add(termQuery); } int size = curTermQueries.size(); if (size <= 0) continue; else if (size == 1) spanNearSubclauses.add(curTermQueries.get(0)); else spanNearSubclauses.add(new SpanOrQuery(curTermQueries.toArray(new SpanQuery[0]))); } SpanNearQuery query = new SpanNearQuery( (SpanQuery[]) spanNearSubclauses.toArray(new SpanQuery[0]), slop, true); return query; } } else { SpanTermQuery[] clauses = new SpanTermQuery[v.size()]; for (int i = 0; i < v.size(); i++) { Token t2 = v.get(i); clauses[i] = new SpanTermQuery(new Term(field, new String(t2.buffer(), 0, t2.length()))); } SpanNearQuery query = new SpanNearQuery(clauses, slop, true); return query; } } }
From source file:com.mhs.qsol.QsolToQueryVisitor.java
License:Apache License
/** * Converts a token, as defined in the qsol.jtb JavaCC file, into an * appropriate query./* w ww. j av a 2s. com*/ * * @param token * @return */ protected Query tokenToQuery(String token) { token = removeEscapeChars(token); TokenStream source = analyzer.tokenStream(field, new StringReader(token)); ArrayList<Token> v = new ArrayList<Token>(); Token t; int positionCount = 0; boolean severalTokensAtSamePosition = false; CharTermAttribute charTermAtrib = source.getAttribute(CharTermAttribute.class); OffsetAttribute offsetAtrib = source.getAttribute(OffsetAttribute.class); PositionIncrementAttribute posIncAtt = source.addAttribute(PositionIncrementAttribute.class); while (true) { try { if (!source.incrementToken()) { break; } t = new Token(charTermAtrib.buffer(), 0, charTermAtrib.length(), offsetAtrib.startOffset(), offsetAtrib.endOffset()); t.setPositionIncrement(posIncAtt.getPositionIncrement()); } catch (IOException e) { t = null; } if (t == null) { break; } v.add(t); if (t.getPositionIncrement() != 0) { positionCount += t.getPositionIncrement(); } else { severalTokensAtSamePosition = true; } } try { source.close(); } catch (IOException e) { // ignore } if (v.size() == 0) { // null's will get cleaned up in visitBooleanOp return null; } else if (v.size() == 1) { t = v.get(0); TermQuery termQuery = new TermQuery(new Term(field, new String(t.buffer(), 0, t.length()))); termQuery.setBoost(this.boost); return termQuery; } else { if (severalTokensAtSamePosition) { if (positionCount == 1) { // no phrase query: BooleanQuery q = new BooleanQuery(true); for (int i = 0; i < v.size(); i++) { t = v.get(i); TermQuery currentQuery = new TermQuery( new Term(field, new String(t.buffer(), 0, t.length()))); currentQuery.setBoost(this.boost); q.add(currentQuery, BooleanClause.Occur.SHOULD); } return q; } else { // All the Tokens in each sub-list are positioned at the the same location. 
ArrayList<ArrayList<Token>> identicallyPositionedTokenLists = new ArrayList<ArrayList<Token>>(); for (int i = 0; i < v.size(); i++) { if ((i == 0) || (v.get(i).getPositionIncrement() > 0)) { identicallyPositionedTokenLists.add(new ArrayList<Token>()); } ArrayList<Token> curList = identicallyPositionedTokenLists .get(identicallyPositionedTokenLists.size() - 1); curList.add(v.get(i)); } ArrayList<SpanQuery> spanNearSubclauses = new ArrayList<SpanQuery>(); for (int listNum = 0; listNum < identicallyPositionedTokenLists.size(); listNum++) { ArrayList<Token> curTokens = identicallyPositionedTokenLists.get(listNum); ArrayList<SpanTermQuery> curTermQueries = new ArrayList<SpanTermQuery>(); for (int tokenNum = 0; tokenNum < curTokens.size(); tokenNum++) { SpanTermQuery termQuery = new SpanTermQuery( new Term(field, curTokens.get(tokenNum).term())); termQuery.setBoost(this.boost); curTermQueries.add(termQuery); } int size = curTermQueries.size(); if (size <= 0) continue; else if (size == 1) spanNearSubclauses.add(curTermQueries.get(0)); else spanNearSubclauses.add(new SpanOrQuery(curTermQueries.toArray(new SpanQuery[0]))); } SpanNearQuery query = new SpanNearQuery( (SpanQuery[]) spanNearSubclauses.toArray(new SpanQuery[0]), slop, true); return query; } } else { SpanTermQuery[] clauses = new SpanTermQuery[v.size()]; for (int i = 0; i < v.size(); i++) { Token t2 = v.get(i); SpanTermQuery spanQuery = new SpanTermQuery( new Term(field, new String(t2.buffer(), 0, t2.length()))); spanQuery.setBoost(boost); clauses[i] = spanQuery; } // Note: There's a bug here (not by me) that where term offsets are not respected. SpanNearQuery query = new SpanNearQuery(clauses, slop, true); return query; } } }
From source file:com.stimulus.archiva.search.EmailFilter.java
License:Open Source License
/**
 * Splits an e-mail address token into its constituent parts and pushes each
 * part onto {@code emailTokenStack} as a sub-token at the same position
 * (position increment 0), carrying the original token's offsets.
 *
 * @param token token whose term text is an e-mail address, possibly wrapped
 *              in angle brackets or quotes
 * @throws IOException declared for interface compatibility
 */
private void putPart(Token token) throws IOException {
    // Strip address decoration: <>, and surrounding quotes.
    String emailAddress = token.termText();
    emailAddress = emailAddress.replaceAll("<", "");
    emailAddress = emailAddress.replaceAll(">", "");
    emailAddress = emailAddress.replaceAll("\"", "");
    String[] parts = extractEmailParts(emailAddress);
    // FIX: removed the dead "partout" accumulator the original built but
    // never used (it also concatenated parts[i] before the null check).
    for (int i = 0; i < parts.length; i++) {
        if (parts[i] != null) {
            Token subToken = new Token(parts[i].trim(), token.startOffset(), token.endOffset());
            subToken.setPositionIncrement(0);
            emailTokenStack.push(subToken);
        }
    }
}
From source file:edu.mit.ll.vizlinc.highlight.TokenStreamFromTermPositionVector.java
License:Apache License
/**
 * Constructor. Rebuilds a TokenStream from the stored term vector: every
 * (term, position) pair becomes a Token, the tokens are sorted by position,
 * and absolute positions are converted into position increments.
 *
 * @param termPositionVector TermPositionVector that contains the data for
 * creating the TokenStream. Must have positions and offsets.
 */
public TokenStreamFromTermPositionVector(final TermPositionVector termPositionVector) {
    termAttribute = addAttribute(CharTermAttribute.class);
    positionIncrementAttribute = addAttribute(PositionIncrementAttribute.class);
    offsetAttribute = addAttribute(OffsetAttribute.class);
    final String[] terms = termPositionVector.getTerms();
    for (int i = 0; i < terms.length; i++) {
        final TermVectorOffsetInfo[] offsets = termPositionVector.getOffsets(i);
        final int[] termPositions = termPositionVector.getTermPositions(i);
        for (int j = 0; j < termPositions.length; j++) {
            Token token;
            if (offsets != null) {
                // Offsets available: build the token with its start/end.
                token = new Token(terms[i].toCharArray(), 0, terms[i].length(),
                        offsets[j].getStartOffset(), offsets[j].getEndOffset());
            } else {
                token = new Token();
                token.setEmpty().append(terms[i]);
            }
            // Yes - this is the position, not the increment! This is for
            // sorting. This value will be corrected before use (see the
            // second pass below).
            token.setPositionIncrement(termPositions[j]);
            this.positionedTokens.add(token);
        }
    }
    CollectionUtil.mergeSort(this.positionedTokens, tokenComparator);
    // Second pass: convert the absolute positions stashed in the increment
    // field into real position increments (delta from the previous token).
    int lastPosition = -1;
    for (final Token token : this.positionedTokens) {
        int thisPosition = token.getPositionIncrement();
        token.setPositionIncrement(thisPosition - lastPosition);
        lastPosition = thisPosition;
    }
    this.tokensAtCurrentPosition = this.positionedTokens.iterator();
}
From source file:edu.uci.ics.sourcerer.search.analysis.CamelCaseSplitFilter.java
License:Open Source License
/**
 * Splits a token at camel-case boundaries and appends the resulting
 * sub-tokens to {@code tokens}. Per the examples in the original comments,
 * "URIs" produces UR, URI, Is and "DBPool" produces DB, DBP, Pool; the
 * extra token emitted for a run of upper-case letters gets position
 * increment 0 so it stacks at the same position.
 */
protected void decompose(Token token) {
    int start = 0, end = 0;
    char[] buffer = token.termBuffer();
    int i = 0;
    for (; i < token.termLength(); i++) {
        // only compare two consecutive letters
        if (i > 0 && Character.isLetter(buffer[i]) && Character.isLetter(buffer[i - 1])) {
            // lower -> upper boundary (e.g. "camelCase" at 'lC')
            if (Character.isLowerCase(buffer[i - 1]) && Character.isUpperCase(buffer[i])) {
                // create a new token upto buffer[i-1]
                tokens.add(newTok(token, start, end + 1));
                start = i;
                // end = i;
            }
            // upper -> lower boundary (e.g. "DBPool" at 'Po')
            else if (Character.isUpperCase(buffer[i - 1]) && Character.isLowerCase(buffer[i])) {
                if (start < i - 1) {
                    // create a new token upto buffer[i-2]
                    tokens.add(newTok(token, start, i - 1));
                    // also go back and check for consecutive Upper letters
                    // and create a token if more than one found
                    // URIs will produce UR, URI, Is
                    // DBPool will produce DB, DBP, Pool
                    if (Character.isLetter(buffer[i - 2]) && Character.isUpperCase(buffer[i - 2])) {
                        Token _tok = newTok(token, start, i);
                        _tok.setPositionIncrement(0);
                        tokens.add(_tok);
                    }
                    start = i - 1;
                }
            } // end upper -> lower
        } // end compare consecutive letters
        end = i;
    } // end for
    // NOTE(review): for a non-empty token, end == i - 1 always holds after
    // the loop, so this condition effectively reduces to start == 0.
    if (start == 0 && end == i - 1) {
        // no camel case found
    } else {
        // add the last token
        tokens.add(newTok(token, start, end + 1));
    }
}
From source file:edu.uci.ics.sourcerer.search.analysis.LetterDigitSplitFilter.java
License:Open Source License
/**
 * Splits a token at letter/digit boundaries (letter followed by digit, or
 * digit followed by letter) and appends the resulting sub-tokens to
 * {@code tokens}. When at least one split occurred and
 * {@code preserveOriginal > 0}, the unsplit original is also emitted with
 * position increment 0 so it stacks at the same position.
 */
protected void decompose(Token token) {
    int start = 0, end = 0;
    char[] buffer = token.termBuffer();
    int i = 0;
    for (; i < token.termLength(); i++) {
        if (i > 0) {
            if (// letter -> digit
            (Character.isLetter(buffer[i - 1]) && Character.isDigit(buffer[i])) ||
            // digit -> letter
                    (Character.isDigit(buffer[i - 1]) && Character.isLetter(buffer[i]))) {
                // create a new token upto buffer[i-1]
                tokens.add(newTok(token, start, end + 1));
                start = i;
                end = i;
            }
        }
        end = i;
    } // end for
    // NOTE(review): for a non-empty token, end == i - 1 always holds after
    // the loop, so this condition effectively reduces to start == 0.
    if (start == 0 && end == i - 1) {
        // either all numbers or letters: no split, emit nothing extra
    } else {
        // add the last token
        tokens.add(newTok(token, start, end + 1));
        if (preserveOriginal > 0) {
            // Stack the unsplit original at the same position.
            Token _tok = newTok(token, 0, token.termLength());
            _tok.setPositionIncrement(0);
            tokens.add(_tok);
        }
    }
}
From source file:edu.usu.cosl.analysis.es.SpanishPorterFilterFactory.java
License:Apache License
@Override public Token next() throws IOException { Token tok = input.next();/*from ww w .j a va 2 s . c o m*/ if (tok == null) return null; String tokstr = tok.termText(); // if protected, don't stem. use this to avoid stemming collisions. if (protWords != null && protWords.contains(tokstr)) { return tok; } stemmer.setCurrent(tokstr); stemmer.stem(); String newstr = stemmer.getCurrent(); if (tokstr.equals(newstr)) { return tok; } else { // TODO: it would be nice if I could just set termText directly like // lucene packages can. Token newtok = new Token(newstr, tok.startOffset(), tok.endOffset(), tok.type()); newtok.setPositionIncrement(tok.getPositionIncrement()); return newtok; } }
From source file:gpl.pierrick.brihaye.aramorph.lucene.ArabicGlosser.java
License:Open Source License
/** Returns the next gloss for the given stem.
 * @param firstOne Whether or not this gloss is the first one
 * @return The gloss. Its <CODE>termText</CODE> is the gloss of the
 * <STRONG>stem</STRONG>. Its <CODE>type</CODE> is the grammatical category
 * of the <STRONG>stem</STRONG>. When several glosses are available, every
 * emitted token's <CODE>PositionIncrement</CODE> but the first one is set
 * to <CODE>0</CODE>
 * @see org.apache.lucene.analysis.Token#setPositionIncrement(int)
 */
private Token nextGloss(boolean firstOne) {
    Token emittedToken = null;
    String tokenText = null;
    String tokenType = null;
    try {
        tokenText = (String) tokenGlosses.getFirst();
        if (tokenText == null)
            tokenText = "";
        // Token is typed so it can be filtered later.
        tokenType = (String) tokenPOS.getFirst();
        if (tokenType == null)
            tokenType = "NO_STEM";
        // OK: we're done with this gloss — consume it from both lists.
        tokenGlosses.removeFirst();
        tokenPOS.removeFirst();
        // Will there be further treatment for this token?
        processingToken = !tokenGlosses.isEmpty();
    }
    // Should not normally be possible (gloss/POS lists exhausted early);
    // kept by the original author as a defensive fallback.
    catch (IndexOutOfBoundsException e) {
        System.err.println("Something went wrong in nextGloss");
        processingToken = false;
        // Re-emit the same token text (romanized): not the best solution :-(
        tokenText = romanizedToken;
        tokenType = "PLACE_HOLDER";
    }
    // Emit the gloss with the offsets of the originally received token.
    emittedToken = new Token(tokenText, receivedToken.startOffset(), receivedToken.endOffset(), tokenType);
    // Stack every gloss after the first at the same position.
    if (!firstOne)
        emittedToken.setPositionIncrement(0);
    if (debug)
        System.out.println(
                emittedToken.termText() + "\t" + emittedToken.type() + "\t" + "[" + emittedToken.startOffset()
                        + "-" + emittedToken.endOffset() + "]" + "\t" + emittedToken.getPositionIncrement());
    return emittedToken;
}