List of usage examples for the org.apache.lucene.analysis.Token constructor
public Token(CharSequence text, int posInc, int start, int end, int posLength)
From source file:cc.pp.analyzer.imdict.core.WordSegmenter.java
License:Apache License
/**
 * Converts a {@code SegToken} produced by the segmenter into a Lucene
 * {@code Token}, shifting its offsets from sentence-relative to
 * document-relative coordinates.
 *
 * For literal word types (strings and numbers, half- or full-width) the
 * token's char array is rebuilt from the original sentence slice before
 * filtering.
 *
 * @param st the segmenter token to convert
 * @param sentence the sentence the token was cut from
 * @param sentenceStartOffset offset of {@code sentence} within the document
 * @param type token type label (currently unused in this method body)
 * @return a Lucene Token carrying the filtered term text and absolute offsets
 */
public Token convertSegToken(SegToken st, String sentence, int sentenceStartOffset, String type) {
    Token result;
    switch (st.wordType) {
    case STRING:
    case NUMBER:
    case FULLWIDTH_NUMBER:
    case FULLWIDTH_STRING:
        // Literal types: re-read the raw characters from the sentence so the
        // char array matches the original surface form.
        st.charArray = sentence.substring(st.startOffset, st.endOffset).toCharArray();
        break;
    default:
        break;
    }
    st = tokenFilter.filter(st);
    result = new Token(st.charArray, 0, st.charArray.length, st.startOffset + sentenceStartOffset,
            st.endOffset + sentenceStartOffset);
    return result;
}
From source file:ch.systemsx.cisd.openbis.generic.server.dataaccess.db.search.SeparatorSplitterTokenFilter.java
License:Apache License
private void addToken(Token token, int startPos, int endPos) { if (startPos < endPos) { int startOffset = token.startOffset() + startPos; int endOffset = token.startOffset() + endPos; Token newToken = new Token(token.termBuffer(), startPos, endPos - startPos, startOffset, endOffset); tokens.add(newToken);//from www .j ava 2s . c o m } }
From source file:com.mhs.qsol.proximity.ProximityVisitor.java
License:Apache License
/** * Converts a token, as defined in the qsol.jtb JavaCC file, into an * appropriate query.//from ww w .ja v a 2s . c o m * * @param token * @return */ protected Query tokenToQuery(String token) { if (logger.isLoggable(Level.FINE)) { // logger.fine("Query tokenToQuery(String token) : token:" + token); } if (logger.isLoggable(Level.FINE)) { logger.fine("Query tokenToQuery(String token) : token:" + token); } token = removeEscapeChars(token); TokenStream source = analyzer.tokenStream(field, new StringReader(token)); CharTermAttribute charTermAtrib = source.getAttribute(CharTermAttribute.class); OffsetAttribute offsetAtrib = source.getAttribute(OffsetAttribute.class); PositionIncrementAttribute posIncAtt = source.addAttribute(PositionIncrementAttribute.class); ArrayList<Token> v = new ArrayList<Token>(); Token t; int positionCount = 0; boolean severalTokensAtSamePosition = false; while (true) { try { if (!source.incrementToken()) { break; } t = new Token(charTermAtrib.buffer(), 0, charTermAtrib.length(), offsetAtrib.startOffset(), offsetAtrib.endOffset()); t.setPositionIncrement(posIncAtt.getPositionIncrement()); } catch (IOException e) { t = null; } if (t == null) { break; } v.add(t); if (t.getPositionIncrement() != 0) { positionCount += t.getPositionIncrement(); } else { severalTokensAtSamePosition = true; } } try { source.close(); } catch (IOException e) { // ignore } if (v.size() == 0) { return null; } else if (v.size() == 1) { t = v.get(0); SpanTermQuery stq = new SpanTermQuery(new Term(field, new String(t.buffer(), 0, t.length()))); stq.setBoost(this.boost); return stq; } else { if (severalTokensAtSamePosition) { if (positionCount == 1) { // no phrase query: SpanQuery[] spanQueries = new SpanQuery[v.size()]; StringBuilder regex = new StringBuilder(); for (int i = 0; i < v.size(); i++) { spanQueries[i] = new SpanTermQuery(new Term(field, regex.toString())); } return new SpanOrQuery(spanQueries); } else { // All the Tokens in each sub-list are positioned at the 
the same location. ArrayList<ArrayList<Token>> identicallyPositionedTokenLists = new ArrayList<ArrayList<Token>>(); for (int i = 0; i < v.size(); i++) { if ((i == 0) || (v.get(i).getPositionIncrement() > 0)) { identicallyPositionedTokenLists.add(new ArrayList<Token>()); } ArrayList<Token> curList = identicallyPositionedTokenLists .get(identicallyPositionedTokenLists.size() - 1); curList.add(v.get(i)); } ArrayList<SpanQuery> spanNearSubclauses = new ArrayList<SpanQuery>(); for (int listNum = 0; listNum < identicallyPositionedTokenLists.size(); listNum++) { ArrayList<Token> curTokens = identicallyPositionedTokenLists.get(listNum); ArrayList<SpanTermQuery> curTermQueries = new ArrayList<SpanTermQuery>(); for (int tokenNum = 0; tokenNum < curTokens.size(); tokenNum++) { SpanTermQuery termQuery = new SpanTermQuery( new Term(field, curTokens.get(tokenNum).term())); termQuery.setBoost(this.boost); curTermQueries.add(termQuery); } int size = curTermQueries.size(); if (size <= 0) continue; else if (size == 1) spanNearSubclauses.add(curTermQueries.get(0)); else spanNearSubclauses.add(new SpanOrQuery(curTermQueries.toArray(new SpanQuery[0]))); } SpanNearQuery query = new SpanNearQuery( (SpanQuery[]) spanNearSubclauses.toArray(new SpanQuery[0]), slop, true); return query; } } else { SpanTermQuery[] clauses = new SpanTermQuery[v.size()]; for (int i = 0; i < v.size(); i++) { Token t2 = v.get(i); clauses[i] = new SpanTermQuery(new Term(field, new String(t2.buffer(), 0, t2.length()))); } SpanNearQuery query = new SpanNearQuery(clauses, slop, true); return query; } } }
From source file:com.mhs.qsol.QsolToQueryVisitor.java
License:Apache License
/**
 * Converts a token, as defined in the qsol.jtb JavaCC file, into an
 * appropriate query by running it through the configured analyzer.
 *
 * @param token raw query token text (escape characters still present)
 * @return a TermQuery, BooleanQuery, or SpanNearQuery over the analyzed
 *         terms, or {@code null} if analysis produced no terms
 */
protected Query tokenToQuery(String token) {
    token = removeEscapeChars(token);
    TokenStream source = analyzer.tokenStream(field, new StringReader(token));
    ArrayList<Token> v = new ArrayList<Token>();
    Token t;
    int positionCount = 0;
    boolean severalTokensAtSamePosition = false;
    CharTermAttribute charTermAtrib = source.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtrib = source.getAttribute(OffsetAttribute.class);
    PositionIncrementAttribute posIncAtt = source.addAttribute(PositionIncrementAttribute.class);
    // Drain the analyzer's token stream, tracking position increments so we
    // can tell synonyms (increment 0) apart from phrase terms.
    while (true) {
        try {
            if (!source.incrementToken()) {
                break;
            }
            t = new Token(charTermAtrib.buffer(), 0, charTermAtrib.length(), offsetAtrib.startOffset(),
                    offsetAtrib.endOffset());
            t.setPositionIncrement(posIncAtt.getPositionIncrement());
        } catch (IOException e) {
            t = null;
        }
        if (t == null) {
            break;
        }
        v.add(t);
        if (t.getPositionIncrement() != 0) {
            positionCount += t.getPositionIncrement();
        } else {
            severalTokensAtSamePosition = true;
        }
    }
    try {
        source.close();
    } catch (IOException e) {
        // ignore: the stream has been fully consumed at this point
    }
    if (v.size() == 0) {
        // nulls will get cleaned up in visitBooleanOp
        return null;
    } else if (v.size() == 1) {
        // Single term: a plain TermQuery.
        t = v.get(0);
        TermQuery termQuery = new TermQuery(new Term(field, new String(t.buffer(), 0, t.length())));
        termQuery.setBoost(this.boost);
        return termQuery;
    } else {
        if (severalTokensAtSamePosition) {
            if (positionCount == 1) {
                // All tokens share one position (pure synonyms): no phrase
                // query, just an OR of the terms.
                BooleanQuery q = new BooleanQuery(true);
                for (int i = 0; i < v.size(); i++) {
                    t = v.get(i);
                    TermQuery currentQuery = new TermQuery(
                            new Term(field, new String(t.buffer(), 0, t.length())));
                    currentQuery.setBoost(this.boost);
                    q.add(currentQuery, BooleanClause.Occur.SHOULD);
                }
                return q;
            } else {
                // Mixed phrase + synonyms. Group tokens by position: all the
                // tokens in each sub-list are positioned at the same location.
                ArrayList<ArrayList<Token>> identicallyPositionedTokenLists = new ArrayList<ArrayList<Token>>();
                for (int i = 0; i < v.size(); i++) {
                    if ((i == 0) || (v.get(i).getPositionIncrement() > 0)) {
                        identicallyPositionedTokenLists.add(new ArrayList<Token>());
                    }
                    ArrayList<Token> curList = identicallyPositionedTokenLists
                            .get(identicallyPositionedTokenLists.size() - 1);
                    curList.add(v.get(i));
                }
                // Each position becomes one SpanNear subclause: a single term,
                // or a SpanOr over the synonyms at that position.
                ArrayList<SpanQuery> spanNearSubclauses = new ArrayList<SpanQuery>();
                for (int listNum = 0; listNum < identicallyPositionedTokenLists.size(); listNum++) {
                    ArrayList<Token> curTokens = identicallyPositionedTokenLists.get(listNum);
                    ArrayList<SpanTermQuery> curTermQueries = new ArrayList<SpanTermQuery>();
                    for (int tokenNum = 0; tokenNum < curTokens.size(); tokenNum++) {
                        SpanTermQuery termQuery = new SpanTermQuery(
                                new Term(field, curTokens.get(tokenNum).term()));
                        termQuery.setBoost(this.boost);
                        curTermQueries.add(termQuery);
                    }
                    int size = curTermQueries.size();
                    if (size <= 0)
                        continue;
                    else if (size == 1)
                        spanNearSubclauses.add(curTermQueries.get(0));
                    else
                        spanNearSubclauses.add(new SpanOrQuery(curTermQueries.toArray(new SpanQuery[0])));
                }
                SpanNearQuery query = new SpanNearQuery(
                        (SpanQuery[]) spanNearSubclauses.toArray(new SpanQuery[0]), slop, true);
                return query;
            }
        } else {
            // Plain multi-term phrase: ordered SpanNear over each term.
            SpanTermQuery[] clauses = new SpanTermQuery[v.size()];
            for (int i = 0; i < v.size(); i++) {
                Token t2 = v.get(i);
                SpanTermQuery spanQuery = new SpanTermQuery(
                        new Term(field, new String(t2.buffer(), 0, t2.length())));
                spanQuery.setBoost(boost);
                clauses[i] = spanQuery;
            }
            // Note: there is a pre-existing bug here (not by me) where term
            // offsets are not respected.
            SpanNearQuery query = new SpanNearQuery(clauses, slop, true);
            return query;
        }
    }
}
From source file:com.zb.mmseg.analysis.CutLetterDigitFilter.java
License:Open Source License
/**
 * Emits a sub-token of {@code oriToken} onto the token queue, covering
 * {@code termBufferLength} chars starting at {@code termBufferOffset} of the
 * original term buffer, tagged as a digit or letter token by {@code type}.
 */
private void addToken(Token oriToken, int termBufferOffset, int termBufferLength, byte type) {
    final int subStart = oriToken.startOffset() + termBufferOffset;
    Token piece = new Token(oriToken.buffer(), termBufferOffset, termBufferLength,
            subStart, subStart + termBufferLength);
    // Decimal digits become DIGIT tokens; everything else is treated as LETTER.
    piece.setType(type == Character.DECIMAL_DIGIT_NUMBER ? MMSegWord.TYPE_DIGIT : MMSegWord.TYPE_LETTER);
    tokenQueue.offer(piece);
}
From source file:edu.mit.ll.vizlinc.highlight.TokenStreamFromTermPositionVector.java
License:Apache License
/**
 * Constructor.
 *
 * Builds the token list from a TermPositionVector: one Token per term
 * occurrence, carrying offsets when available, sorted by position, with
 * stored positions converted to position increments at the end.
 *
 * @param termPositionVector TermPositionVector that contains the data for
 * creating the TokenStream. Must have positions and offsets.
 */
public TokenStreamFromTermPositionVector(final TermPositionVector termPositionVector) {
    termAttribute = addAttribute(CharTermAttribute.class);
    positionIncrementAttribute = addAttribute(PositionIncrementAttribute.class);
    offsetAttribute = addAttribute(OffsetAttribute.class);
    final String[] terms = termPositionVector.getTerms();
    for (int i = 0; i < terms.length; i++) {
        final TermVectorOffsetInfo[] offsets = termPositionVector.getOffsets(i);
        final int[] termPositions = termPositionVector.getTermPositions(i);
        for (int j = 0; j < termPositions.length; j++) {
            Token token;
            if (offsets != null) {
                token = new Token(terms[i].toCharArray(), 0, terms[i].length(), offsets[j].getStartOffset(),
                        offsets[j].getEndOffset());
            } else {
                token = new Token();
                token.setEmpty().append(terms[i]);
            }
            // Yes - this is the position, not the increment! This is for
            // sorting. This value will be corrected before use (see the
            // increment-fixup loop below).
            token.setPositionIncrement(termPositions[j]);
            this.positionedTokens.add(token);
        }
    }
    CollectionUtil.mergeSort(this.positionedTokens, tokenComparator);
    // Convert the absolute positions stashed in the increment field into
    // true position increments (delta from the previous token's position).
    int lastPosition = -1;
    for (final Token token : this.positionedTokens) {
        int thisPosition = token.getPositionIncrement();
        token.setPositionIncrement(thisPosition - lastPosition);
        lastPosition = thisPosition;
    }
    this.tokensAtCurrentPosition = this.positionedTokens.iterator();
}
From source file:edu.uci.ics.sourcerer.search.analysis.NoTokenizer.java
License:Open Source License
/**
 * Returns the entire buffered input as a single token, or {@code null}
 * once the buffer has been drained.
 */
public Token next() throws IOException {
    if (charArr.size() == 0) {
        return null;
    }
    // Copy the buffered characters into a fresh term array.
    char[] term = new char[charSize];
    int pos = 0;
    while (pos < charSize) {
        term[pos] = charArr.get(pos);
        pos++;
    }
    charArr.clear();
    return new Token(term, 0, charSize, 0, charSize);
}
From source file:edu.uci.ics.sourcerer.search.analysis.SingleSpaceTokenizer.java
License:Open Source License
/**
 * Returns the next whitespace-delimited token from the buffered input, or
 * {@code null} when the buffer is empty. Advances {@code start} past the
 * token and its single separator character for the following call.
 */
public Token next() throws IOException {
    if (charArr.size() == 0)
        return null;
    char[] _char = new char[charSize];
    int tokenSize = 0;
    // Scan from the current start position, copying token characters until
    // the first non-token character (or the end of the buffer).
    for (int i = 0; i + start < charSize; i++) {
        char _c = charArr.get(i + start);
        if (isTokenChar(_c)) {
            tokenSize++;
            _char[i] = _c;
        } else {
            break;
        }
    }
    // args: buffer, offset, length, start offset, end offset
    Token _tok = new Token(_char, 0, tokenSize, start, start + tokenSize);
    if (start + tokenSize == charSize)
        charArr.clear();
    // NOTE(review): `start` is not reset when the buffer is cleared, and the
    // term array is sized to charSize rather than tokenSize — presumably the
    // enclosing tokenizer resets state between inputs; confirm.
    start = start + tokenSize + 1;
    return _tok;
}
From source file:magoffin.matt.lucene.DigitTokenizer.java
License:Open Source License
/**
 * Emits the digits of the entire input as one token, then signals
 * end-of-stream on subsequent calls via the {@code complete} flag.
 */
@Override
public Token next() throws IOException {
    if (complete) {
        return null;
    }
    // Read the entire underlying reader into a string buffer.
    StringWriter out = new StringWriter();
    FileCopyUtils.copy(this.input, out);
    int end = out.getBuffer().length();
    // Strip every non-digit character.
    String numbers = out.toString().replaceAll("\\D", "");
    if (maxLength > 0 && numbers.length() > maxLength) {
        // Keep only the trailing maxLength digits.
        numbers = numbers.substring(numbers.length() - maxLength);
    }
    complete = true;
    char[] numChars = numbers.toCharArray();
    // NOTE(review): the end offset is the length of the raw input, not of the
    // digits-only term — confirm downstream consumers expect that.
    return new Token(numChars, 0, numChars.length, 0, end);
}
From source file:magoffin.matt.lucene.KeyTokenizer.java
License:Open Source License
/**
 * Emits the buffered input as a single key token (optionally trimmed of
 * surrounding whitespace), then signals end-of-stream on subsequent calls.
 * Returns {@code null} when trimming leaves an empty key.
 */
@Override
public Token next() throws IOException {
    if (complete) {
        return null;
    }
    int numRead = input.read(buffer);
    String key = "";
    if (numRead > 0) {
        if (!trim) {
            key = new String(buffer, 0, numRead);
        } else {
            if (buffer.length == 1) {
                // Single-char buffer: keep reading until a non-whitespace
                // character (or end of stream) lands in buffer[0].
                while (Character.isWhitespace(buffer[0])) {
                    numRead = input.read(buffer);
                    if (numRead < 1) {
                        break;
                    }
                }
            } else {
                if (numRead < buffer.length) {
                    // Pad the unread tail with spaces so trim() below works
                    // on a fully-initialized buffer.
                    Arrays.fill(buffer, numRead, buffer.length, ' ');
                }
                int i = 0;
                while (i < buffer.length && numRead > 0) {
                    int start = i;
                    for (; i < buffer.length && Character.isWhitespace(buffer[i]); i++) {
                        // skip
                    }
                    if (i > start) {
                        // found whitespace at beginning, so discard it and
                        // read more from the stream into the freed tail
                        System.arraycopy(buffer, i, buffer, start, buffer.length - i);
                        numRead = input.read(buffer, buffer.length - i, i);
                    } else {
                        break;
                    }
                }
            }
            key = new String(buffer).trim();
            if (key.length() < 1) {
                return null;
            }
        }
    }
    complete = true;
    char[] keyChar = key.toCharArray();
    // NOTE(review): the end offset is numRead - 1, which is negative when the
    // input was empty (numRead == -1) — confirm callers tolerate this.
    return new Token(keyChar, 0, keyChar.length, 0, numRead - 1);
}