List of usage examples for org.apache.lucene.analysis Token startOffset
@Override public final int startOffset()
From source file:analysis.StandardTokenizer.java
License:Apache License
/**
 * Returns the next token in the stream, or {@code null} at end of input.
 *
 * <p>Beyond the stock StandardTokenizer behavior, any HOST/NUM/ALPHANUM token is
 * additionally split on letter/digit boundaries by {@code LetterDigitBreakTokenizer};
 * the resulting sub-tokens are queued in {@code tokenList} and handed out on
 * subsequent calls before the scanner is consulted again.
 *
 * @param reusableToken caller-provided token to (re)populate; must not be null
 * @return the next token, or {@code null} at EOF
 * @throws IOException if the underlying scanner fails
 */
@SuppressWarnings("deprecation")
public Token next(final Token reusableToken) throws IOException {
    assert reusableToken != null;
    int posIncr = 1;
    Token result = reusableToken;
    // Drain previously split sub-tokens before pulling from the scanner.
    if (tokenList.size() > 0)
        return tokenList.remove();
    while (true) {
        int tokenType = scanner.getNextToken();
        if (tokenType == StandardTokenizerImpl.YYEOF) {
            return null;
        }
        if (scanner.yylength() <= maxTokenLength) {
            reusableToken.clear();
            // posIncr > 1 when one or more over-length terms were skipped just before this one.
            reusableToken.setPositionIncrement(posIncr);
            scanner.getText(reusableToken);
            final int start = scanner.yychar();
            reusableToken.setStartOffset(start);
            reusableToken.setEndOffset(start + reusableToken.termLength());
            // This 'if' should be removed in the next release. For now, it converts
            // invalid acronyms to HOST. When removed, only the 'else' part should remain.
            if (tokenType == StandardTokenizerImpl.ACRONYM_DEP) {
                if (replaceInvalidAcronym) {
                    reusableToken.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.HOST]);
                    // Drop the trailing '.' that the deprecated acronym rule captured.
                    reusableToken.setTermLength(reusableToken.termLength() - 1);
                    tokenType = StandardTokenizerImpl.HOST;
                } else {
                    reusableToken.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM]);
                    tokenType = StandardTokenizerImpl.ACRONYM;
                }
            } else {
                reusableToken.setType(StandardTokenizerImpl.TOKEN_TYPES[tokenType]);
            }
            // Custom post-split: break mixed letter/digit runs into sub-tokens,
            // re-basing their offsets onto this token's start offset.
            if (tokenType == StandardTokenizerImpl.HOST || tokenType == StandardTokenizerImpl.NUM
                    || tokenType == StandardTokenizerImpl.ALPHANUM) {
                Tokenizer lt = new LetterDigitBreakTokenizer(new StringReader(reusableToken.term()));
                Token tk = null;
                int st = reusableToken.startOffset();
                final Token token = new Token();
                while ((tk = lt.next(token)) != null) {
                    tk.setStartOffset(tk.startOffset() + st);
                    tk.setEndOffset(tk.endOffset() + st);
                    tk.setType(reusableToken.type());
                    // clone() because 'token' is reused by the sub-tokenizer on each call.
                    tokenList.add((Token) tk.clone());
                }
            }
            // If the split produced sub-tokens, emit the first of them instead of the whole term.
            if (tokenList.size() > 0)
                result = tokenList.remove();
            return result;
        } else
            // When we skip a too-long term, we still increment the position increment.
            posIncr++;
    }
}
From source file:cc.pp.analyzer.imdict.core.WordSegmenter.java
License:Apache License
/** * HHMMSegment??sentence Token???Token List * * @param sentenceToken ??Token/* w w w. jav a2 s.co m*/ * @param shortPathCount HHMM???? * @return ?Token List */ public List<Token> segmentSentence(Token sentenceToken, int shortPathCount) { String sentence = sentenceToken.term(); List<SegToken> segTokenList = hhmmSegmenter.process(sentence); List<Token> result = new ArrayList<Token>(); // i1rawTokens.length-2##?##?RawToken for (int i = 1; i < segTokenList.size() - 1; i++) { result.add(convertSegToken(segTokenList.get(i), sentence, sentenceToken.startOffset(), "word")); } return result; }
From source file:cc.pp.analyzer.paoding.analyzer.impl.MaxWordLengthTokenCollector.java
License:Apache License
@Override public void collect(String word, int offset, int end) { Token c = candidate != null ? candidate : last; if (c == null) { candidate = new Token(word, offset, end); } else if (offset == c.startOffset()) { if (end > c.endOffset()) { candidate = new Token(word, offset, end); }//from w w w . j av a 2s. c om } else if (offset > c.startOffset()) { if (candidate != null) { select(candidate); } if (end > c.endOffset()) { candidate = new Token(word, offset, end); } else { candidate = null; } } else if (end >= c.endOffset()) { if (last != null && last.startOffset() >= offset && last.endOffset() <= end) { for (Iterator/* <Token> */<Token> iter = tokens.iterator(); iter.hasNext();) { last = iter.next(); if (last.startOffset() >= offset && last.endOffset() <= end) { iter.remove(); } } } last = null; candidate = new Token(word, offset, end); } }
From source file:ch.systemsx.cisd.openbis.generic.server.dataaccess.db.search.SeparatorSplitterTokenFilter.java
License:Apache License
private void addToken(Token token, int startPos, int endPos) { if (startPos < endPos) { int startOffset = token.startOffset() + startPos; int endOffset = token.startOffset() + endPos; Token newToken = new Token(token.termBuffer(), startPos, endPos - startPos, startOffset, endOffset); tokens.add(newToken);/* w ww. j av a 2s . c o m*/ } }
From source file:com.fdt.sdl.core.analyzer.phonetix.lucene.PhoneticFilter.java
License:Open Source License
/** * Returns the next token in the stream, or <code>null</code> at EOF. *///from ww w . j a v a 2s .c o m public Token next() throws IOException { if (encoder == null) { return input.next(); } else if (encoder.length == 1) { // optimize, if one encoder only final Token t = input.next(); if (t == null) return null; return (encoder[0] != null) ? new Token(encoder[0].generateKey(t.termText()), t.startOffset(), t.endOffset(), t.type()) : t; } else { ++actualIndex; // get next token, if necessary if (actualIndex >= encoder.length) { actualToken = input.next(); if (actualToken == null) { actualIndex = encoder.length; return null; } actualIndex = 0; } if (encoder[actualIndex] == null) return actualToken; else return new Token( encoder[actualIndex].toString() + ":" + encoder[actualIndex].generateKey(actualToken.termText()), actualToken.startOffset(), actualToken.endOffset(), actualToken.type()); } }
From source file:com.flaptor.hounder.classifier.util.TupleTokenizer.java
License:Apache License
private Token mergeTokens(Token t1, Token t2) { Token res = new Token(); if (null == t1) { return t2; }/* w w w. ja va2 s .c o m*/ char[] text = (TokenUtil.termText(t1) + "_" + TokenUtil.termText(t2)).toCharArray(); res.reinit(text, 0, text.length, t1.startOffset(), t2.endOffset()); return res; }
From source file:com.flaptor.hounder.searcher.PhraseMatchingFragmenter.java
License:Apache License
public boolean isNewFragment(Token token) { boolean isNewFrag = lineBreaker(lastOffset, token.startOffset()); logger.debug("token: " + TokenUtil.termText(token)); if (isNewFrag) logger.debug("BREAK!"); lastOffset = token.endOffset();//from w w w . j a va 2s .c om return isNewFrag; }
From source file:com.globalsight.ling.lucene.analysis.GSTokenizer.java
License:Apache License
@Override public boolean incrementToken() throws IOException { clearAttributes();//from ww w . j a v a 2 s . c o m Token tt = next(); if (tt == null) { return false; } else { gsAtt.setToken(tt); termAtt.append(tt.toString()); offsetAtt.setOffset(tt.startOffset(), tt.endOffset()); posIncrAtt.setPositionIncrement(tt.getPositionIncrement()); return true; } }
From source file:com.globalsight.ling.lucene.analysis.ngram.NgramAnalyzer.java
License:Apache License
static void test(String p_text) throws java.io.IOException { NgramAnalyzer x = new NgramAnalyzer(3); NgramTokenizer y = new NgramTokenizer(new java.io.StringReader(p_text), 3); System.out.println("Text = " + p_text); Token t; while ((t = y.next()) != null) { System.out.println(t.toString() + " (" + t.startOffset() + ":" + t.endOffset() + ")"); }/*from w w w .j a va 2s. c o m*/ }
From source file:com.globalsight.ling.lucene.analysis.ngram.NgramNoPunctuationAnalyzer.java
License:Apache License
static void test(String p_text) throws java.io.IOException { Analyzer x = new NgramNoPunctuationAnalyzer(3); NgramNoPunctuationTokenizer y = new NgramNoPunctuationTokenizer(new java.io.StringReader(p_text), 3); System.out.println("Text = " + p_text); Token t; while ((t = y.next()) != null) { System.out.println(t.toString() + " (" + t.startOffset() + ":" + t.endOffset() + ")"); }/*w w w . ja v a 2 s . c o m*/ }