List of usage examples for org.apache.lucene.analysis.Token.getPositionIncrement()
@Override public int getPositionIncrement()
From source file:com.globalsight.ling.lucene.analysis.GSTokenizer.java
License:Apache License
@Override public boolean incrementToken() throws IOException { clearAttributes();// w w w . j av a 2s . co m Token tt = next(); if (tt == null) { return false; } else { gsAtt.setToken(tt); termAtt.append(tt.toString()); offsetAtt.setOffset(tt.startOffset(), tt.endOffset()); posIncrAtt.setPositionIncrement(tt.getPositionIncrement()); return true; } }
From source file:com.globalsight.ling.lucene.analysis.pl.PolishFilter.java
License:Apache License
/** Returns the next input Token, after being stemmed */ public final Token next() throws IOException { Token token = getNextToken(); if (token == null) { return null; } else {// w w w. j a v a2 s. c om String s = stemmer.stem(token.toString(), true); if (!s.equals(token.toString())) { // reconstruct the input token. This is silly... Token res = new Token(s, token.startOffset(), token.endOffset(), token.type()); res.setPositionIncrement(token.getPositionIncrement()); return res; } return token; } }
From source file:com.mathworks.xzheng.analysis.nutch.NutchExample.java
License:Apache License
public static void main(String[] args) throws IOException { Configuration conf = Configuration.getConfiguration(); conf.addResource("nutch-default.xml"); NutchDocumentAnalyzer analyzer = new NutchDocumentAnalyzer(conf); //1 TokenStream ts = analyzer.tokenStream("content", new StringReader("The quick brown fox...")); int position = 0; Token token; while (ts.incrementToken()) { // 2 token = ts.getAttribute(org.apache.lucene.analysis.Token.class); if (token == null) { break; }//from www . j a va 2s . c om int increment = token.getPositionIncrement(); if (increment > 0) { position = position + increment; System.out.println(); System.out.print(position + ": "); } System.out.print("[" + token.termBuffer().toString() + ":" + token.startOffset() + "->" + token.endOffset() + ":" + token.type() + "] "); } System.out.println(); Query nutchQuery = Query.parse("\"the quick brown\"", conf); // 3 org.apache.lucene.search.Query luceneQuery; luceneQuery = new QueryFilters(conf).filter(nutchQuery); // A System.out.println("Translated: " + luceneQuery); }
From source file:com.mhs.qsol.proximity.ProximityVisitor.java
License:Apache License
/** * Converts a token, as defined in the qsol.jtb JavaCC file, into an * appropriate query./*from w ww . java 2s . c o m*/ * * @param token * @return */ protected Query tokenToQuery(String token) { if (logger.isLoggable(Level.FINE)) { // logger.fine("Query tokenToQuery(String token) : token:" + token); } if (logger.isLoggable(Level.FINE)) { logger.fine("Query tokenToQuery(String token) : token:" + token); } token = removeEscapeChars(token); TokenStream source = analyzer.tokenStream(field, new StringReader(token)); CharTermAttribute charTermAtrib = source.getAttribute(CharTermAttribute.class); OffsetAttribute offsetAtrib = source.getAttribute(OffsetAttribute.class); PositionIncrementAttribute posIncAtt = source.addAttribute(PositionIncrementAttribute.class); ArrayList<Token> v = new ArrayList<Token>(); Token t; int positionCount = 0; boolean severalTokensAtSamePosition = false; while (true) { try { if (!source.incrementToken()) { break; } t = new Token(charTermAtrib.buffer(), 0, charTermAtrib.length(), offsetAtrib.startOffset(), offsetAtrib.endOffset()); t.setPositionIncrement(posIncAtt.getPositionIncrement()); } catch (IOException e) { t = null; } if (t == null) { break; } v.add(t); if (t.getPositionIncrement() != 0) { positionCount += t.getPositionIncrement(); } else { severalTokensAtSamePosition = true; } } try { source.close(); } catch (IOException e) { // ignore } if (v.size() == 0) { return null; } else if (v.size() == 1) { t = v.get(0); SpanTermQuery stq = new SpanTermQuery(new Term(field, new String(t.buffer(), 0, t.length()))); stq.setBoost(this.boost); return stq; } else { if (severalTokensAtSamePosition) { if (positionCount == 1) { // no phrase query: SpanQuery[] spanQueries = new SpanQuery[v.size()]; StringBuilder regex = new StringBuilder(); for (int i = 0; i < v.size(); i++) { spanQueries[i] = new SpanTermQuery(new Term(field, regex.toString())); } return new SpanOrQuery(spanQueries); } else { // All the Tokens in each sub-list are positioned at the 
the same location. ArrayList<ArrayList<Token>> identicallyPositionedTokenLists = new ArrayList<ArrayList<Token>>(); for (int i = 0; i < v.size(); i++) { if ((i == 0) || (v.get(i).getPositionIncrement() > 0)) { identicallyPositionedTokenLists.add(new ArrayList<Token>()); } ArrayList<Token> curList = identicallyPositionedTokenLists .get(identicallyPositionedTokenLists.size() - 1); curList.add(v.get(i)); } ArrayList<SpanQuery> spanNearSubclauses = new ArrayList<SpanQuery>(); for (int listNum = 0; listNum < identicallyPositionedTokenLists.size(); listNum++) { ArrayList<Token> curTokens = identicallyPositionedTokenLists.get(listNum); ArrayList<SpanTermQuery> curTermQueries = new ArrayList<SpanTermQuery>(); for (int tokenNum = 0; tokenNum < curTokens.size(); tokenNum++) { SpanTermQuery termQuery = new SpanTermQuery( new Term(field, curTokens.get(tokenNum).term())); termQuery.setBoost(this.boost); curTermQueries.add(termQuery); } int size = curTermQueries.size(); if (size <= 0) continue; else if (size == 1) spanNearSubclauses.add(curTermQueries.get(0)); else spanNearSubclauses.add(new SpanOrQuery(curTermQueries.toArray(new SpanQuery[0]))); } SpanNearQuery query = new SpanNearQuery( (SpanQuery[]) spanNearSubclauses.toArray(new SpanQuery[0]), slop, true); return query; } } else { SpanTermQuery[] clauses = new SpanTermQuery[v.size()]; for (int i = 0; i < v.size(); i++) { Token t2 = v.get(i); clauses[i] = new SpanTermQuery(new Term(field, new String(t2.buffer(), 0, t2.length()))); } SpanNearQuery query = new SpanNearQuery(clauses, slop, true); return query; } } }
From source file:com.mhs.qsol.QsolToQueryVisitor.java
License:Apache License
/** * Converts a token, as defined in the qsol.jtb JavaCC file, into an * appropriate query.//from www . j a v a2 s .c o m * * @param token * @return */ protected Query tokenToQuery(String token) { token = removeEscapeChars(token); TokenStream source = analyzer.tokenStream(field, new StringReader(token)); ArrayList<Token> v = new ArrayList<Token>(); Token t; int positionCount = 0; boolean severalTokensAtSamePosition = false; CharTermAttribute charTermAtrib = source.getAttribute(CharTermAttribute.class); OffsetAttribute offsetAtrib = source.getAttribute(OffsetAttribute.class); PositionIncrementAttribute posIncAtt = source.addAttribute(PositionIncrementAttribute.class); while (true) { try { if (!source.incrementToken()) { break; } t = new Token(charTermAtrib.buffer(), 0, charTermAtrib.length(), offsetAtrib.startOffset(), offsetAtrib.endOffset()); t.setPositionIncrement(posIncAtt.getPositionIncrement()); } catch (IOException e) { t = null; } if (t == null) { break; } v.add(t); if (t.getPositionIncrement() != 0) { positionCount += t.getPositionIncrement(); } else { severalTokensAtSamePosition = true; } } try { source.close(); } catch (IOException e) { // ignore } if (v.size() == 0) { // null's will get cleaned up in visitBooleanOp return null; } else if (v.size() == 1) { t = v.get(0); TermQuery termQuery = new TermQuery(new Term(field, new String(t.buffer(), 0, t.length()))); termQuery.setBoost(this.boost); return termQuery; } else { if (severalTokensAtSamePosition) { if (positionCount == 1) { // no phrase query: BooleanQuery q = new BooleanQuery(true); for (int i = 0; i < v.size(); i++) { t = v.get(i); TermQuery currentQuery = new TermQuery( new Term(field, new String(t.buffer(), 0, t.length()))); currentQuery.setBoost(this.boost); q.add(currentQuery, BooleanClause.Occur.SHOULD); } return q; } else { // All the Tokens in each sub-list are positioned at the the same location. 
ArrayList<ArrayList<Token>> identicallyPositionedTokenLists = new ArrayList<ArrayList<Token>>(); for (int i = 0; i < v.size(); i++) { if ((i == 0) || (v.get(i).getPositionIncrement() > 0)) { identicallyPositionedTokenLists.add(new ArrayList<Token>()); } ArrayList<Token> curList = identicallyPositionedTokenLists .get(identicallyPositionedTokenLists.size() - 1); curList.add(v.get(i)); } ArrayList<SpanQuery> spanNearSubclauses = new ArrayList<SpanQuery>(); for (int listNum = 0; listNum < identicallyPositionedTokenLists.size(); listNum++) { ArrayList<Token> curTokens = identicallyPositionedTokenLists.get(listNum); ArrayList<SpanTermQuery> curTermQueries = new ArrayList<SpanTermQuery>(); for (int tokenNum = 0; tokenNum < curTokens.size(); tokenNum++) { SpanTermQuery termQuery = new SpanTermQuery( new Term(field, curTokens.get(tokenNum).term())); termQuery.setBoost(this.boost); curTermQueries.add(termQuery); } int size = curTermQueries.size(); if (size <= 0) continue; else if (size == 1) spanNearSubclauses.add(curTermQueries.get(0)); else spanNearSubclauses.add(new SpanOrQuery(curTermQueries.toArray(new SpanQuery[0]))); } SpanNearQuery query = new SpanNearQuery( (SpanQuery[]) spanNearSubclauses.toArray(new SpanQuery[0]), slop, true); return query; } } else { SpanTermQuery[] clauses = new SpanTermQuery[v.size()]; for (int i = 0; i < v.size(); i++) { Token t2 = v.get(i); SpanTermQuery spanQuery = new SpanTermQuery( new Term(field, new String(t2.buffer(), 0, t2.length()))); spanQuery.setBoost(boost); clauses[i] = spanQuery; } // Note: There's a bug here (not by me) that where term offsets are not respected. SpanNearQuery query = new SpanNearQuery(clauses, slop, true); return query; } } }
From source file:edu.mit.ll.vizlinc.highlight.TokenStreamFromTermPositionVector.java
License:Apache License
/** * Constructor.//from www.ja v a 2 s.c o m * * @param termPositionVector TermPositionVector that contains the data for * creating the TokenStream. Must have positions and offsets. */ public TokenStreamFromTermPositionVector(final TermPositionVector termPositionVector) { termAttribute = addAttribute(CharTermAttribute.class); positionIncrementAttribute = addAttribute(PositionIncrementAttribute.class); offsetAttribute = addAttribute(OffsetAttribute.class); final String[] terms = termPositionVector.getTerms(); for (int i = 0; i < terms.length; i++) { final TermVectorOffsetInfo[] offsets = termPositionVector.getOffsets(i); final int[] termPositions = termPositionVector.getTermPositions(i); for (int j = 0; j < termPositions.length; j++) { Token token; if (offsets != null) { token = new Token(terms[i].toCharArray(), 0, terms[i].length(), offsets[j].getStartOffset(), offsets[j].getEndOffset()); } else { token = new Token(); token.setEmpty().append(terms[i]); } // Yes - this is the position, not the increment! This is for // sorting. This value // will be corrected before use. token.setPositionIncrement(termPositions[j]); this.positionedTokens.add(token); } } CollectionUtil.mergeSort(this.positionedTokens, tokenComparator); int lastPosition = -1; for (final Token token : this.positionedTokens) { int thisPosition = token.getPositionIncrement(); token.setPositionIncrement(thisPosition - lastPosition); lastPosition = thisPosition; } this.tokensAtCurrentPosition = this.positionedTokens.iterator(); }
From source file:edu.mit.ll.vizlinc.highlight.TokenStreamFromTermPositionVector.java
License:Apache License
/**
 * Publishes the next pre-built token through this stream's attributes.
 *
 * @return {@code true} when a token was published, {@code false} once the
 *         token iterator is exhausted
 * @throws IOException declared by the overridden method
 */
@Override
public boolean incrementToken() throws IOException {
    if (!this.tokensAtCurrentPosition.hasNext()) {
        return false;
    }

    final Token current = this.tokensAtCurrentPosition.next();
    clearAttributes();
    termAttribute.setEmpty().append(current);
    positionIncrementAttribute.setPositionIncrement(current.getPositionIncrement());
    offsetAttribute.setOffset(current.startOffset(), current.endOffset());
    return true;
}
From source file:edu.uci.ics.sourcerer.search.analysis.TestCamelCase.java
License:Open Source License
public void testCCFromJdkNames() throws Exception { // /*from w w w. j av a2 s . co m*/ ccf = getCCF("attWildcardAsURIs"); final Token reusableToken = new Token(); Token nextToken = ccf.next(reusableToken); assertEquals("att", nextToken.term()); nextToken = ccf.next(reusableToken); assertEquals("Wildcard", nextToken.term()); nextToken = ccf.next(reusableToken); assertEquals("As", nextToken.term()); nextToken = ccf.next(reusableToken); assertEquals("UR", nextToken.term()); nextToken = ccf.next(reusableToken); assertEquals("URI", nextToken.term()); assertEquals(0, nextToken.getPositionIncrement()); nextToken = ccf.next(reusableToken); assertEquals("Is", nextToken.term()); assertEquals(1, nextToken.getPositionIncrement()); assertNull(ccf.next(reusableToken)); }
From source file:edu.usu.cosl.analysis.es.SpanishPorterFilterFactory.java
License:Apache License
@Override public Token next() throws IOException { Token tok = input.next(); if (tok == null) return null; String tokstr = tok.termText(); // if protected, don't stem. use this to avoid stemming collisions. if (protWords != null && protWords.contains(tokstr)) { return tok; }/* w ww . j ava 2s . com*/ stemmer.setCurrent(tokstr); stemmer.stem(); String newstr = stemmer.getCurrent(); if (tokstr.equals(newstr)) { return tok; } else { // TODO: it would be nice if I could just set termText directly like // lucene packages can. Token newtok = new Token(newstr, tok.startOffset(), tok.endOffset(), tok.type()); newtok.setPositionIncrement(tok.getPositionIncrement()); return newtok; } }
From source file:gpl.pierrick.brihaye.aramorph.lucene.ArabicGlosser.java
License:Open Source License
/** Returns the next gloss for the given stem. * @param firstOne Whether or not this gloss is the first one * @return The gloss. Its <CODE>termText</CODE> is the gloss of the <STRONG>stem</STRONG>. Its <CODE>type</CODE> is the grammatical category of the <STRONG>stem</STRONG>. * When several glosses are available, every emitted token's * <CODE>PositionIncrement</CODE> but the first one is set to <CODE>0</CODE> * @see org.apache.lucene.analysis.Token#setPositionIncrement(int) */// w w w . ja v a2s . co m private Token nextGloss(boolean firstOne) { Token emittedToken = null; String tokenText = null; String tokenType = null; try { tokenText = (String) tokenGlosses.getFirst(); if (tokenText == null) tokenText = ""; //Token is typed in order to filter it later tokenType = (String) tokenPOS.getFirst(); if (tokenType == null) tokenType = "NO_STEM"; //OK : we're done with this gloss tokenGlosses.removeFirst(); tokenPOS.removeFirst(); //Will there be further treatment for this token ? processingToken = !tokenGlosses.isEmpty(); } //It should not be normally possible ! catch (IndexOutOfBoundsException e) { System.err.println("Something went wrong in nextGloss"); processingToken = false; //Re-emit the same token text (romanized) : not the best solution :-( tokenText = romanizedToken; tokenType = "PLACE_HOLDER"; } emittedToken = new Token(tokenText, receivedToken.startOffset(), receivedToken.endOffset(), tokenType); if (!firstOne) emittedToken.setPositionIncrement(0); if (debug) System.out.println( emittedToken.termText() + "\t" + emittedToken.type() + "\t" + "[" + emittedToken.startOffset() + "-" + emittedToken.endOffset() + "]" + "\t" + emittedToken.getPositionIncrement()); return emittedToken; }