List of usage examples for org.apache.lucene.analysis Token clear
@Override public void clear()
From source file:analysis.StandardTokenizer.java
License:Apache License
@SuppressWarnings("deprecation") public Token next(final Token reusableToken) throws IOException { assert reusableToken != null; int posIncr = 1; Token result = reusableToken;// w ww. j a va 2 s. c o m if (tokenList.size() > 0) return tokenList.remove(); while (true) { int tokenType = scanner.getNextToken(); if (tokenType == StandardTokenizerImpl.YYEOF) { return null; } if (scanner.yylength() <= maxTokenLength) { reusableToken.clear(); reusableToken.setPositionIncrement(posIncr); scanner.getText(reusableToken); final int start = scanner.yychar(); reusableToken.setStartOffset(start); reusableToken.setEndOffset(start + reusableToken.termLength()); // This 'if' should be removed in the next release. For now, it // converts // invalid acronyms to HOST. When removed, only the 'else' part // should // remain. if (tokenType == StandardTokenizerImpl.ACRONYM_DEP) { if (replaceInvalidAcronym) { reusableToken.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.HOST]); reusableToken.setTermLength(reusableToken.termLength() - 1); // remove // extra // '.' 
tokenType = StandardTokenizerImpl.HOST; } else { reusableToken.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM]); tokenType = StandardTokenizerImpl.ACRONYM; } } else { reusableToken.setType(StandardTokenizerImpl.TOKEN_TYPES[tokenType]); } if (tokenType == StandardTokenizerImpl.HOST || tokenType == StandardTokenizerImpl.NUM || tokenType == StandardTokenizerImpl.ALPHANUM) { Tokenizer lt = new LetterDigitBreakTokenizer(new StringReader(reusableToken.term())); Token tk = null; int st = reusableToken.startOffset(); final Token token = new Token(); while ((tk = lt.next(token)) != null) { tk.setStartOffset(tk.startOffset() + st); tk.setEndOffset(tk.endOffset() + st); tk.setType(reusableToken.type()); tokenList.add((Token) tk.clone()); } } if (tokenList.size() > 0) result = tokenList.remove(); return result; } else // When we skip a too-long term, we still increment the // position increment posIncr++; } }
From source file:au.edu.unimelb.csse.analyser.String2NodesParser.java
License:Apache License
/**
 * Advances to the next entry of {@code nodes} and exposes it through the
 * supplied token.
 *
 * <p>The token is cleared, then given the node's {@code name} as term text
 * and the node's payload.
 *
 * @param reusableToken token instance to populate; must not be {@code null}
 * @return {@code reusableToken} describing the next node, or {@code null}
 *         when all nodes have been consumed
 * @throws IOException declared by the token stream contract
 */
public Token next(final Token reusableToken) throws IOException {
    assert reusableToken != null;

    nodesPosition++;
    if (nodesPosition >= nodes.size()) {
        return null;
    }

    reusableToken.clear();
    final Node current = nodes.get(nodesPosition);
    reusableToken.setTermBuffer(current.name);
    reusableToken.setPayload(current.getPayload());
    return reusableToken;
}
From source file:com.zb.mmseg.analysis.TokenUtils.java
License:Open Source License
/** * @param input/*from w w w . j av a 2 s. co m*/ * @param reusableToken is null well new one auto. * @return null - if not next token or input is null. * @throws IOException */ public static Token nextToken(TokenStream input, Token reusableToken) throws IOException { if (input == null) { return null; } if (!input.incrementToken()) { return null; } CharTermAttribute termAtt = input.getAttribute(CharTermAttribute.class); OffsetAttribute offsetAtt = input.getAttribute(OffsetAttribute.class); TypeAttribute typeAtt = input.getAttribute(TypeAttribute.class); if (reusableToken == null) { reusableToken = new Token(); } reusableToken.clear(); if (termAtt != null) { // lucene 3.0 // reusableToken.setTermBuffer(termAtt.termBuffer(), 0, termAtt.termLength()); // lucene 3.1 reusableToken.copyBuffer(termAtt.buffer(), 0, termAtt.length()); } if (offsetAtt != null) { // lucene 3.1 // reusableToken.setStartOffset(offsetAtt.startOffset()); // reusableToken.setEndOffset(offsetAtt.endOffset()); // lucene 4.0 reusableToken.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset()); } if (typeAtt != null) { reusableToken.setType(typeAtt.type()); } return reusableToken; }
From source file:org.apache.jackrabbit.core.query.lucene.SingletonTokenStream.java
License:Apache License
/**
 * {@inheritDoc}
 *
 * <p>Emits the stored {@code value} exactly once: the first call fills the
 * token with the value, the payload and offsets spanning the whole value,
 * then clears {@code value} so that every later call returns {@code null}.
 */
public Token next(Token reusableToken) throws IOException {
    if (value == null) {
        // Already emitted (or nothing to emit).
        return null;
    }

    final int endOffset = value.length();

    reusableToken.clear();
    reusableToken.setTermBuffer(value);
    reusableToken.setPayload(payload);
    reusableToken.setStartOffset(0);
    reusableToken.setEndOffset(endOffset);

    // Mark the single token as consumed.
    value = null;
    return reusableToken;
}
From source file:org.dutir.tokenizer.CharTokenizer.java
License:Apache License
public final Token next(Token token) throws IOException { if (bufferIndex >= dataLen) { return null; }//from www. j av a 2s .co m token.clear(); int length = 0; int start = bufferIndex; char[] buffer = token.termBuffer(); while (true) { if (bufferIndex >= dataLen) { break; } final char c = ioBuffer[bufferIndex++]; if (isTokenChar(c)) { // if it's a token char if (length == 0) // start of token start = offset + bufferIndex - 1; else if (length == buffer.length) buffer = token.resizeTermBuffer(1 + length); buffer[length++] = normalize(c); // buffer it, normalized if (length == MAX_WORD_LEN) // buffer overflow! break; } else if (length > 0) // at non-Letter w/ chars break; // return 'em } token.setTermLength(length); token.setStartOffset(start); token.setEndOffset(start + length); if (stemTag) { // token.setTermText(PorterStemmer.stem(token.termText())); } // System.out.println("entoken:" + token); return token; }
From source file:org.sindice.solr.plugins.analysis.CustomStandardTokenizer.java
License:Apache License
public Token next(Token result) throws IOException { int posIncr = 1; while (true) { int tokenType = scanner.getNextToken(); if (tokenType == CustomStandardTokenizerImpl.YYEOF) { return null; }//from ww w . j a v a2 s. c o m if (scanner.yylength() <= maxTokenLength) { result.clear(); result.setPositionIncrement(posIncr); scanner.getText(result); final int start = scanner.yychar(); result.setStartOffset(start); result.setEndOffset(start + result.termLength()); result.setType(CustomStandardTokenizerImpl.TOKEN_TYPES[tokenType]); return result; } else // When we skip a too-long term, we still increment the // position increment posIncr++; } }
From source file:org.xerela.provider.configstore.ZLuceneTokenizer.java
License:Mozilla Public License
/**
 * {@inheritDoc}
 *
 * <p>Reads characters from {@code reader} until a character flagged in
 * {@code IGNORE_CHAR} follows at least one collected character, or the
 * reader is exhausted. Leading ignored characters are skipped. The end
 * offset is exclusive (one past the last character of the token) in both
 * cases.
 *
 * @return {@code token} filled with the collected text and offsets, or
 *         {@code null} when no characters could be collected
 */
@Override
public Token next(Token token) throws IOException {
    currentToken.setLength(0);
    int startOffset = offset;
    int endOffset = offset;
    boolean tokenStarted = false;

    while (true) {
        int c = reader.read();
        if (c == -1) {
            // End of input: `offset` already points one past the last character
            // read, which is the same exclusive end used in the delimiter branch
            // below (the original used offset - 1 here, one short).
            endOffset = offset;
            break;
        }

        // NOTE(review): IGNORE_CHAR is indexed by the raw char value; confirm the
        // table covers every value reader.read() can return.
        if (IGNORE_CHAR[c]) {
            if (tokenStarted) {
                // Delimiter terminates the token; record the exclusive end before
                // stepping over the delimiter itself.
                endOffset = offset;
                ++offset;
                break;
            }
            // Leading delimiter: skip it.
            ++offset;
            continue;
        }

        if (!tokenStarted) {
            startOffset = offset;
            tokenStarted = true;
        }
        currentToken.append((char) c);
        ++offset;
    }

    if (currentToken.length() == 0) {
        return null;
    }

    token.clear();
    token.setTermText(currentToken.toString());
    token.setTermLength(currentToken.length());
    token.setStartOffset(startOffset);
    token.setEndOffset(endOffset);
    return token;
}