List of usage examples for org.apache.lucene.analysis Token endOffset
@Override public final int endOffset()
From source file:analysis.StandardTokenizer.java
License:Apache License
/**
 * Produces the next token, reusing {@code reusableToken} where possible.
 *
 * <p>Tokens of type HOST/NUM/ALPHANUM are additionally split by
 * {@code LetterDigitBreakTokenizer}; the resulting sub-tokens are queued in
 * {@code tokenList} and drained one per call before the scanner is consulted
 * again. Returns {@code null} at end of input.
 *
 * @param reusableToken token instance to (re)populate; must not be null
 * @return the next token, or {@code null} at EOF
 * @throws IOException if the underlying scanner fails
 */
@SuppressWarnings("deprecation")
public Token next(final Token reusableToken) throws IOException {
    assert reusableToken != null;
    int posIncr = 1;
    Token result = reusableToken;
    // Drain any sub-tokens queued by a previous call before scanning further.
    if (tokenList.size() > 0)
        return tokenList.remove();
    while (true) {
        int tokenType = scanner.getNextToken();
        if (tokenType == StandardTokenizerImpl.YYEOF) {
            return null;
        }
        if (scanner.yylength() <= maxTokenLength) {
            reusableToken.clear();
            // posIncr > 1 only when earlier too-long terms were skipped below.
            reusableToken.setPositionIncrement(posIncr);
            scanner.getText(reusableToken);
            final int start = scanner.yychar();
            reusableToken.setStartOffset(start);
            reusableToken.setEndOffset(start + reusableToken.termLength());
            // This 'if' should be removed in the next release. For now, it
            // converts invalid acronyms to HOST. When removed, only the
            // 'else' part should remain.
            if (tokenType == StandardTokenizerImpl.ACRONYM_DEP) {
                if (replaceInvalidAcronym) {
                    reusableToken.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.HOST]);
                    // Drop the trailing '.' of the invalid acronym.
                    reusableToken.setTermLength(reusableToken.termLength() - 1);
                    tokenType = StandardTokenizerImpl.HOST;
                } else {
                    reusableToken.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM]);
                    tokenType = StandardTokenizerImpl.ACRONYM;
                }
            } else {
                reusableToken.setType(StandardTokenizerImpl.TOKEN_TYPES[tokenType]);
            }
            // Split host/number/alphanumeric terms at letter/digit boundaries
            // and queue the pieces; offsets are rebased onto the parent token.
            if (tokenType == StandardTokenizerImpl.HOST || tokenType == StandardTokenizerImpl.NUM
                    || tokenType == StandardTokenizerImpl.ALPHANUM) {
                Tokenizer lt = new LetterDigitBreakTokenizer(new StringReader(reusableToken.term()));
                Token tk = null;
                int st = reusableToken.startOffset();
                final Token token = new Token();
                while ((tk = lt.next(token)) != null) {
                    tk.setStartOffset(tk.startOffset() + st);
                    tk.setEndOffset(tk.endOffset() + st);
                    tk.setType(reusableToken.type());
                    // clone() because 'token' is reused on the next iteration.
                    tokenList.add((Token) tk.clone());
                }
            }
            // Prefer the first queued sub-token over the parent token;
            // otherwise return the (reusable) parent itself.
            if (tokenList.size() > 0)
                result = tokenList.remove();
            return result;
        } else
            // When we skip a too-long term, we still increment the
            // position increment so downstream consumers see the gap.
            posIncr++;
    }
}
From source file:cc.pp.analyzer.paoding.analyzer.impl.MaxWordLengthTokenCollector.java
License:Apache License
@Override public void collect(String word, int offset, int end) { Token c = candidate != null ? candidate : last; if (c == null) { candidate = new Token(word, offset, end); } else if (offset == c.startOffset()) { if (end > c.endOffset()) { candidate = new Token(word, offset, end); }/* www. j a v a2 s . c o m*/ } else if (offset > c.startOffset()) { if (candidate != null) { select(candidate); } if (end > c.endOffset()) { candidate = new Token(word, offset, end); } else { candidate = null; } } else if (end >= c.endOffset()) { if (last != null && last.startOffset() >= offset && last.endOffset() <= end) { for (Iterator/* <Token> */<Token> iter = tokens.iterator(); iter.hasNext();) { last = iter.next(); if (last.startOffset() >= offset && last.endOffset() <= end) { iter.remove(); } } } last = null; candidate = new Token(word, offset, end); } }
From source file:com.fdt.sdl.core.analyzer.phonetix.lucene.PhoneticFilter.java
License:Open Source License
/** * Returns the next token in the stream, or <code>null</code> at EOF. *//*from www .ja va 2 s . c o m*/ public Token next() throws IOException { if (encoder == null) { return input.next(); } else if (encoder.length == 1) { // optimize, if one encoder only final Token t = input.next(); if (t == null) return null; return (encoder[0] != null) ? new Token(encoder[0].generateKey(t.termText()), t.startOffset(), t.endOffset(), t.type()) : t; } else { ++actualIndex; // get next token, if necessary if (actualIndex >= encoder.length) { actualToken = input.next(); if (actualToken == null) { actualIndex = encoder.length; return null; } actualIndex = 0; } if (encoder[actualIndex] == null) return actualToken; else return new Token( encoder[actualIndex].toString() + ":" + encoder[actualIndex].generateKey(actualToken.termText()), actualToken.startOffset(), actualToken.endOffset(), actualToken.type()); } }
From source file:com.flaptor.hounder.classifier.util.TupleTokenizer.java
License:Apache License
private Token mergeTokens(Token t1, Token t2) { Token res = new Token(); if (null == t1) { return t2; }//from w ww .j ava2s .co m char[] text = (TokenUtil.termText(t1) + "_" + TokenUtil.termText(t2)).toCharArray(); res.reinit(text, 0, text.length, t1.startOffset(), t2.endOffset()); return res; }
From source file:com.flaptor.hounder.searcher.PhraseMatchingFragmenter.java
License:Apache License
public boolean isNewFragment(Token token) { boolean isNewFrag = lineBreaker(lastOffset, token.startOffset()); logger.debug("token: " + TokenUtil.termText(token)); if (isNewFrag) logger.debug("BREAK!"); lastOffset = token.endOffset(); return isNewFrag; }
From source file:com.globalsight.ling.lucene.analysis.GSTokenizer.java
License:Apache License
@Override public boolean incrementToken() throws IOException { clearAttributes();/* w w w . ja v a2s .com*/ Token tt = next(); if (tt == null) { return false; } else { gsAtt.setToken(tt); termAtt.append(tt.toString()); offsetAtt.setOffset(tt.startOffset(), tt.endOffset()); posIncrAtt.setPositionIncrement(tt.getPositionIncrement()); return true; } }
From source file:com.globalsight.ling.lucene.analysis.ngram.NgramAnalyzer.java
License:Apache License
static void test(String p_text) throws java.io.IOException { NgramAnalyzer x = new NgramAnalyzer(3); NgramTokenizer y = new NgramTokenizer(new java.io.StringReader(p_text), 3); System.out.println("Text = " + p_text); Token t; while ((t = y.next()) != null) { System.out.println(t.toString() + " (" + t.startOffset() + ":" + t.endOffset() + ")"); }//from w ww .j av a2 s .c o m }
From source file:com.globalsight.ling.lucene.analysis.ngram.NgramNoPunctuationAnalyzer.java
License:Apache License
static void test(String p_text) throws java.io.IOException { Analyzer x = new NgramNoPunctuationAnalyzer(3); NgramNoPunctuationTokenizer y = new NgramNoPunctuationTokenizer(new java.io.StringReader(p_text), 3); System.out.println("Text = " + p_text); Token t; while ((t = y.next()) != null) { System.out.println(t.toString() + " (" + t.startOffset() + ":" + t.endOffset() + ")"); }//from ww w . ja va 2 s.c o m }
From source file:com.globalsight.ling.lucene.analysis.pl.PolishFilter.java
License:Apache License
/** Returns the next input Token, after being stemmed */ public final Token next() throws IOException { Token token = getNextToken(); if (token == null) { return null; } else {//from ww w . j a va 2 s. c om String s = stemmer.stem(token.toString(), true); if (!s.equals(token.toString())) { // reconstruct the input token. This is silly... Token res = new Token(s, token.startOffset(), token.endOffset(), token.type()); res.setPositionIncrement(token.getPositionIncrement()); return res; } return token; } }
From source file:com.globalsight.ling.lucene.analysis.ru.RussianLowerCaseFilter.java
License:Apache License
public final Token next() throws java.io.IOException { Token t = getNextToken(); if (t == null) return null; String txt = t.toString();/*w w w . j a va 2 s . co m*/ char[] chArray = txt.toCharArray(); for (int i = 0; i < chArray.length; i++) { chArray[i] = RussianCharsets.toLowerCase(chArray[i], charset); } String newTxt = new String(chArray); // create new token Token newToken = new Token(newTxt, t.startOffset(), t.endOffset()); return newToken; }