List of usage examples for the org.apache.lucene.analysis Token(CharSequence text, int start, int end) constructor
public Token(CharSequence text, int start, int end)
From source file:at.lux.fotoretrieval.lucene.GraphTokenizer.java
License:Open Source License
public Token next() throws IOException { StringBuilder currenttoken = new StringBuilder(64); // currenttoken.append('['); char[] character = new char[1]; int i = reader.read(character); // reset our states :) tokenstart = false;//from ww w . ja v a 2 s .c om tokenend = false; do { // end of stream reached ... if (i == 0) return null; if (character[0] == '[') { // token starts here ... tokenstart = true; } else if (character[0] == ']') { // token ends here ... tokenend = true; } else if (tokenstart && !tokenend) { // between end and start ... currenttoken.append(character[0]); } // we found our token and return it ... if (tokenstart && tokenend) { // currenttoken.append(']'); // prepend a token because lucene does not allow leading wildcards. currenttoken.insert(0, '_'); String tokenString = currenttoken.toString().toLowerCase().replace(' ', '_').trim(); Token t = new Token(tokenString, 0, tokenString.length() - 1); return t; } i = reader.read(character); } while (i > 0 && !tokenend); return null; }
From source file:axiom.objectmodel.dom.ReferenceAnalyzer.java
License:Open Source License
/**
 * Builds a single-token stream over the reader's contents: everything up to
 * the first occurrence of {@link LuceneManager#NULL_DELIM} becomes one Token.
 * If the delimiter never appears, no token is produced.
 */
public TokenStream tokenStream(String fieldName, final Reader reader) {
    return new TokenStream() {
        // Guards against emitting more than one token per stream.
        private boolean emitted = false;

        public Token next() throws IOException {
            if (emitted) {
                return null;
            }
            emitted = true;
            final String delim = LuceneManager.NULL_DELIM;
            // Pull 512-char chunks until the delimiter shows up or the
            // reader is drained.
            StringBuffer sb = new StringBuffer();
            final char[] chunk = new char[512];
            int read;
            while (sb.indexOf(delim) < 0 && (read = reader.read(chunk)) != -1) {
                sb.append(chunk, 0, read);
            }
            String value = sb.toString();
            int cut = value.indexOf(delim);
            if (cut < 0) {
                // Delimiter absent: nothing to emit.
                return null;
            }
            String text = value.substring(0, cut);
            return new Token(text, 0, text.length());
        }
    };
}
From source file:cc.pp.analyzer.paoding.analyzer.impl.MaxWordLengthTokenCollector.java
License:Apache License
/**
 * Collects a candidate word span [offset, end) and keeps only maximal-length
 * tokens. State machine over two fields: {@code candidate} (the current
 * best token still open for extension) and {@code last} (the most recently
 * selected token). Branch order is significant.
 */
@Override
public void collect(String word, int offset, int end) {
    // Compare against the open candidate if there is one, else the last
    // selected token.
    Token c = candidate != null ? candidate : last;
    if (c == null) {
        // First word seen: open it as the candidate.
        candidate = new Token(word, offset, end);
    } else if (offset == c.startOffset()) {
        // Same start position: keep whichever span reaches further.
        if (end > c.endOffset()) {
            candidate = new Token(word, offset, end);
        }
    } else if (offset > c.startOffset()) {
        // Moved past the candidate's start: the candidate can no longer be
        // extended, so commit it before considering the new word.
        if (candidate != null) {
            select(candidate);
        }
        if (end > c.endOffset()) {
            candidate = new Token(word, offset, end);
        } else {
            // New word is fully covered by what we already emitted.
            candidate = null;
        }
    } else if (end >= c.endOffset()) {
        // offset < c.startOffset() and end reaches at least as far: the new
        // word subsumes earlier output. Drop every previously collected
        // token that lies inside [offset, end).
        if (last != null && last.startOffset() >= offset && last.endOffset() <= end) {
            for (Iterator<Token> iter = tokens.iterator(); iter.hasNext();) {
                last = iter.next();
                if (last.startOffset() >= offset && last.endOffset() <= end) {
                    iter.remove();
                }
            }
        }
        last = null;
        candidate = new Token(word, offset, end);
    }
}
From source file:com.aliasi.lingmed.lucene.LuceneTokenStream.java
License:Lingpipe license
public Token next() throws IOException { if (mTokenizer == null) return null; String nextToken = mTokenizer.nextToken(); if (nextToken == null) return null; int start = mTokenizer.lastTokenStartPosition(); int end = start + nextToken.length(); // adding length is a hack; won't work with stemmers return new Token(nextToken, start, end); }
From source file:com.duroty.lucene.analysis.KeywordAnalyzer.java
License:Apache License
/** * DOCUMENT ME!/*from w ww .j a va2s . c om*/ * * @param fieldName DOCUMENT ME! * @param reader DOCUMENT ME! * * @return DOCUMENT ME! */ public TokenStream tokenStream(String fieldName, final Reader reader) { return new TokenStream() { private boolean done; private final char[] buffer = new char[1024]; public Token next() throws IOException { if (!done) { done = true; StringBuffer buffer = new StringBuffer(); int length = 0; while (true) { length = reader.read(this.buffer); if (length == -1) { break; } buffer.append(this.buffer, 0, length); } String text = buffer.toString(); return new Token(text, 0, text.length()); } return null; } }; }
From source file:com.globalsight.ling.lucene.analysis.cn.ChineseTokenizer.java
License:Apache License
private final Token flush() { if (length > 0) { //System.out.println(new String(buffer, 0, length)); return new Token(new String(buffer, 0, length), start, start + length); } else/*from w w w . j a v a 2 s . c o m*/ return null; }
From source file:com.globalsight.ling.lucene.analysis.GSTokenFilter.java
License:Apache License
/**
 * Fetches the next token from the wrapped input: directly when the input
 * implements GSTokenNext, by copying its CharTermAttribute when available,
 * otherwise by delegating to this filter's own next().
 */
public Token getNextToken() throws IOException {
    // instanceof is already null-safe, so no separate null check needed here.
    if (input instanceof GSTokenNext) {
        return ((GSTokenNext) input).next();
    }
    if (input != null && input.hasAttribute(CharTermAttribute.class)) {
        CharTermAttribute term = input.getAttribute(CharTermAttribute.class);
        String text = term.toString();
        return new Token(text, 0, text.length());
    }
    return next();
}
From source file:com.globalsight.ling.lucene.analysis.ru.RussianLowerCaseFilter.java
License:Apache License
public final Token next() throws java.io.IOException { Token t = getNextToken();/*from w w w .j a v a 2 s . c o m*/ if (t == null) return null; String txt = t.toString(); char[] chArray = txt.toCharArray(); for (int i = 0; i < chArray.length; i++) { chArray[i] = RussianCharsets.toLowerCase(chArray[i], charset); } String newTxt = new String(chArray); // create new token Token newToken = new Token(newTxt, t.startOffset(), t.endOffset()); return newToken; }
From source file:com.globalsight.ling.lucene.analysis.ts.TswanaStemFilter.java
License:Apache License
/** * @return Returns the next token in the stream, or null at EOS *///from w ww . j a v a 2 s. c o m public final Token next() throws IOException { String s = null; if (stems != null) { if (stemsPointer < stems.length) { token = new Token(stems[stemsPointer], 0, stems[stemsPointer].length()); stemsPointer++; if (stemsPointer == stems.length) { stems = null; stemsPointer = -1; } } } else { token = getNextToken(); } if (token == null) { return null; } // Check the exclusiontable else if (exclusionSet != null && exclusionSet.contains(token.toString())) { return token; } else { if (stems == null) { stems = stemmer.multipleStems(token.toString()); if (stems != null) { stemsPointer = 0; token = new Token(stems[stemsPointer], 0, stems[stemsPointer].length()); stemsPointer++; } } s = stemmer.stem(token.toString()); //s = stemmer.stem(token.termText()); // If not stemmed, dont waste the time creating a new token if (!s.equals(token.toString())) { return new Token(s, token.startOffset(), token.endOffset(), token.type()); } return token; } }
From source file:com.globalsight.ling.tm2.lucene.GsTokenizer.java
License:Apache License
/** Returns the next token in the stream, or null at EOS. *///from w w w. j a v a2s.c om final public Token next() { Token token = null; int start = m_wordIterator.current(); int end = m_wordIterator.next(); if (end != BreakIterator.DONE) { String tokenString = m_text.substring(start, end).toLowerCase(); token = new Token(tokenString, start, end); } return token; }