Example usage for org.apache.lucene.analysis Token Token

List of usage examples for org.apache.lucene.analysis Token Token

Introduction

On this page you can find example usages of the org.apache.lucene.analysis Token constructor Token(CharSequence, int, int).

Prototype

public Token(CharSequence text, int start, int end) 

Source Link

Document

Constructs a Token with the given term text, start and end offsets.

Usage

From source file:at.lux.fotoretrieval.lucene.GraphTokenizer.java

License:Open Source License

/**
 * Reads the next bracket-delimited token (text between '[' and ']') from
 * the underlying reader and returns it as a Lucene Token, or null at end
 * of stream or when no complete token remains.
 *
 * The returned term text is lower-cased, has spaces replaced by '_', and
 * is prefixed with '_' because Lucene does not allow leading wildcards.
 *
 * @return the next Token, or null at end of stream
 * @throws IOException if the underlying reader fails
 */
public Token next() throws IOException {
    StringBuilder currenttoken = new StringBuilder(64);
    char[] character = new char[1];
    int i = reader.read(character);
    // reset our states
    tokenstart = false;
    tokenend = false;
    do {
        // End of stream reached. Reader.read(char[]) returns -1 at EOS
        // (never 0 for a non-empty buffer), so the check must be against
        // -1 -- the original "i == 0" test could never fire.
        if (i == -1)
            return null;

        if (character[0] == '[') { // token starts here ...
            tokenstart = true;
        } else if (character[0] == ']') { // token ends here ...
            tokenend = true;
        } else if (tokenstart && !tokenend) { // inside the brackets ...
            currenttoken.append(character[0]);
        }
        // we found our token and return it ...
        if (tokenstart && tokenend) {
            // prepend a character because lucene does not allow leading wildcards.
            currenttoken.insert(0, '_');
            String tokenString = currenttoken.toString().toLowerCase().replace(' ', '_').trim();
            // NOTE(review): Lucene's convention is endOffset == one past the
            // last character (i.e. length()), so length() - 1 looks off by
            // one -- confirm against consumers before changing.
            Token t = new Token(tokenString, 0, tokenString.length() - 1);
            return t;
        }
        i = reader.read(character);
    } while (i > 0 && !tokenend);
    return null;
}

From source file:axiom.objectmodel.dom.ReferenceAnalyzer.java

License:Open Source License

/**
 * Returns a single-token stream over the reader's contents: everything up
 * to the first occurrence of LuceneManager.NULL_DELIM becomes one token.
 * If the delimiter never appears, no token is produced.
 *
 * @param fieldName the field being analyzed (unused)
 * @param reader the source of the field text
 * @return a TokenStream yielding at most one token
 */
public TokenStream tokenStream(String fieldName, final Reader reader) {
    return new TokenStream() {
        private boolean done = false;
        private static final String DELIM = LuceneManager.NULL_DELIM;

        public Token next() throws IOException {
            if (done) {
                return null;
            }
            done = true;

            // Accumulate input until the delimiter shows up or the reader
            // runs dry.
            final StringBuffer collected = new StringBuffer();
            final char[] chunk = new char[512];
            int read;
            while (collected.indexOf(DELIM) < 0 && (read = reader.read(chunk)) != -1) {
                collected.append(chunk, 0, read);
            }

            final String value = collected.toString();
            final int delimAt = value.indexOf(DELIM);
            // No delimiter at all -> nothing to emit.
            if (delimAt < 0) {
                return null;
            }
            final String term = value.substring(0, delimAt);
            return new Token(term, 0, term.length());
        }
    };
}

From source file:cc.pp.analyzer.paoding.analyzer.impl.MaxWordLengthTokenCollector.java

License:Apache License

/**
 * Collects a segmented word, keeping as candidate the word that extends
 * furthest from each start offset (consistent with the enclosing class
 * name, a "max word length" selection -- TODO confirm against the Paoding
 * collector contract).
 *
 * @param word   the segmented word text
 * @param offset the word's start offset
 * @param end    the word's end offset
 */
@Override
public void collect(String word, int offset, int end) {
    // Compare against the pending candidate if any, otherwise the last token.
    Token c = candidate != null ? candidate : last;
    if (c == null) {
        // First word seen: it becomes the candidate.
        candidate = new Token(word, offset, end);
    } else if (offset == c.startOffset()) {
        // Same start offset as the reference: keep whichever reaches further.
        if (end > c.endOffset()) {
            candidate = new Token(word, offset, end);
        }
    } else if (offset > c.startOffset()) {
        // The new word starts later, so the pending candidate can no longer
        // be beaten -- emit it.
        if (candidate != null) {
            select(candidate);
        }
        // The new word only stays a candidate if it reaches past the reference.
        if (end > c.endOffset()) {
            candidate = new Token(word, offset, end);
        } else {
            candidate = null;
        }
    } else if (end >= c.endOffset()) {
        // The new word starts earlier and covers the reference: remove
        // previously collected tokens that lie entirely inside [offset, end].
        if (last != null && last.startOffset() >= offset && last.endOffset() <= end) {
            for (Iterator<Token> iter = tokens.iterator(); iter.hasNext();) {
                last = iter.next();
                if (last.startOffset() >= offset && last.endOffset() <= end) {
                    iter.remove();
                }
            }
        }
        last = null;
        candidate = new Token(word, offset, end);
    }
}

From source file:com.aliasi.lingmed.lucene.LuceneTokenStream.java

License:Lingpipe license

/**
 * Returns the next token from the wrapped tokenizer, or null when the
 * tokenizer is absent or exhausted.
 *
 * @return the next Token, or null at end of stream
 * @throws IOException declared for interface compatibility
 */
public Token next() throws IOException {
    if (mTokenizer == null) {
        return null;
    }
    final String text = mTokenizer.nextToken();
    if (text == null) {
        return null;
    }
    final int begin = mTokenizer.lastTokenStartPosition();
    // End = start + length is an approximation; as the original noted, it
    // won't hold if a stemmer changed the token's length.
    return new Token(text, begin, begin + text.length());
}

From source file:com.duroty.lucene.analysis.KeywordAnalyzer.java

License:Apache License

/**
 * DOCUMENT ME!/*from  w ww .j  a  va2s .  c  om*/
 *
 * @param fieldName DOCUMENT ME!
 * @param reader DOCUMENT ME!
 *
 * @return DOCUMENT ME!
 */
public TokenStream tokenStream(String fieldName, final Reader reader) {
    return new TokenStream() {
        private boolean done;
        private final char[] buffer = new char[1024];

        public Token next() throws IOException {
            if (!done) {
                done = true;

                StringBuffer buffer = new StringBuffer();
                int length = 0;

                while (true) {
                    length = reader.read(this.buffer);

                    if (length == -1) {
                        break;
                    }

                    buffer.append(this.buffer, 0, length);
                }

                String text = buffer.toString();

                return new Token(text, 0, text.length());
            }

            return null;
        }
    };
}

From source file:com.globalsight.ling.lucene.analysis.cn.ChineseTokenizer.java

License:Apache License

/**
 * Emits the characters accumulated in the buffer as a Token, or null when
 * the buffer holds nothing.
 */
private final Token flush() {
    if (length <= 0) {
        return null;
    }
    final String term = new String(buffer, 0, length);
    return new Token(term, start, start + length);
}

From source file:com.globalsight.ling.lucene.analysis.GSTokenFilter.java

License:Apache License

/**
 * Fetches the next token from the wrapped input: prefers the GSTokenNext
 * fast path, then rebuilds a Token from the stream's term attribute, and
 * finally falls back to this filter's own next().
 *
 * @return the next Token, or whatever next() yields as a last resort
 * @throws IOException if the wrapped stream fails
 */
public Token getNextToken() throws IOException {
    // instanceof is null-safe, so no separate null check is needed here.
    if (input instanceof GSTokenNext) {
        return ((GSTokenNext) input).next();
    }

    // Rebuild a Token from the stream's term attribute, if one is present.
    if (input != null && input.hasAttribute(CharTermAttribute.class)) {
        final String term = input.getAttribute(CharTermAttribute.class).toString();
        return new Token(term, 0, term.length());
    }

    return next();
}

From source file:com.globalsight.ling.lucene.analysis.ru.RussianLowerCaseFilter.java

License:Apache License

/**
 * Returns the next token with its text lower-cased character by character
 * via the Russian charset mapping, or null at end of stream.
 *
 * @return a new Token carrying the lower-cased text and the source token's
 *         offsets, or null at end of stream
 */
public final Token next() throws java.io.IOException {
    final Token source = getNextToken();
    if (source == null) {
        return null;
    }

    // Lower-case every character in place; the mapping is per-character,
    // so iteration order is irrelevant.
    final char[] chars = source.toString().toCharArray();
    for (int idx = chars.length - 1; idx >= 0; idx--) {
        chars[idx] = RussianCharsets.toLowerCase(chars[idx], charset);
    }

    return new Token(new String(chars), source.startOffset(), source.endOffset());
}

From source file:com.globalsight.ling.lucene.analysis.ts.TswanaStemFilter.java

License:Apache License

/**
 * Returns the next token in the stream, or null at EOS. When the stemmer
 * previously produced multiple stems for a token, those stems are emitted
 * one per call (tracked by stems/stemsPointer) before the next input
 * token is pulled.
 */
public final Token next() throws IOException {
    String s = null;

    if (stems != null) {
        // Still draining an earlier multi-stem result: emit the next stem.
        if (stemsPointer < stems.length) {
            token = new Token(stems[stemsPointer], 0, stems[stemsPointer].length());

            stemsPointer++;

            // Exhausted: reset so a later call pulls a fresh input token.
            if (stemsPointer == stems.length) {
                stems = null;
                stemsPointer = -1;
            }
        }
    } else {
        token = getNextToken();
    }

    if (token == null) {
        return null;
    }
    // Words in the exclusion table pass through unstemmed.
    else if (exclusionSet != null && exclusionSet.contains(token.toString())) {
        return token;
    } else {
        if (stems == null) {
            stems = stemmer.multipleStems(token.toString());

            if (stems != null) {
                // A new multi-stem sequence starts: emit its first element now.
                stemsPointer = 0;
                token = new Token(stems[stemsPointer], 0, stems[stemsPointer].length());
                stemsPointer++;
            }
        }

        s = stemmer.stem(token.toString());

        // If stemming changed nothing, don't waste the time creating a new token.
        if (!s.equals(token.toString())) {
            return new Token(s, token.startOffset(), token.endOffset(), token.type());
        }

        return token;
    }
}

From source file:com.globalsight.ling.tm2.lucene.GsTokenizer.java

License:Apache License

/**
 * Returns the next token in the stream, or null at EOS.
 *
 * The token text is the lower-cased substring between the word iterator's
 * current and next boundaries.
 */
final public Token next() {
    final int begin = m_wordIterator.current();
    final int finish = m_wordIterator.next();

    if (finish == BreakIterator.DONE) {
        return null;
    }

    // NOTE(review): toLowerCase() here uses the default locale; if
    // locale-independent folding is intended (e.g. for index matching),
    // toLowerCase(Locale.ROOT) would be safer -- confirm before changing.
    final String term = m_text.substring(begin, finish).toLowerCase();
    return new Token(term, begin, finish);
}