Example usage for org.apache.lucene.analysis Token type

List of usage examples for org.apache.lucene.analysis Token type

Introduction

On this page you can find example usages of the org.apache.lucene.analysis Token type.

Prototype

@Override
public final String type() 

Source Link

Usage

From source file:analysis.StandardFilter.java

License:Apache License

/**
 * Returns the next token in the stream, or null at EOS.
 * <p>
 * Removes <tt>'s</tt> from the end of words.
 * <p>
 * Removes dots from acronyms.
 *
 * @param reusableToken token instance to reuse; must not be null
 * @return the filtered token, or null at end of stream
 * @throws java.io.IOException if reading the underlying stream fails
 */
public final Token next(final Token reusableToken) throws java.io.IOException {
    assert reusableToken != null;
    Token nextToken = input.next(reusableToken);

    if (nextToken == null)
        return null;

    char[] buffer = nextToken.termBuffer();
    final int bufferLength = nextToken.termLength();
    final String type = nextToken.type();

    // Use equals() instead of == so the filter still works when the
    // tokenizer does not intern its type strings; for interned strings
    // the result is identical.
    if (APOSTROPHE_TYPE.equals(type) && // remove 's
            bufferLength >= 2 && buffer[bufferLength - 2] == '\''
            && (buffer[bufferLength - 1] == 's' || buffer[bufferLength - 1] == 'S')) {
        // Strip the trailing "'s" / "'S" by shortening the term in place.
        nextToken.setTermLength(bufferLength - 2);
    } else if (ACRONYM_TYPE.equals(type)) { // remove dots
        // Compact the term buffer in place, skipping every '.'.
        int upto = 0;
        for (int i = 0; i < bufferLength; i++) {
            char c = buffer[i];
            if (c != '.')
                buffer[upto++] = c;
        }
        nextToken.setTermLength(upto);
    }

    return nextToken;
}

From source file:analysis.StandardTokenizer.java

License:Apache License

@SuppressWarnings("deprecation")
// Returns the next token from the scanner. HOST/NUM/ALPHANUM terms are
// additionally split at letter/digit boundaries; the sub-tokens are queued
// in tokenList and emitted one per call before the scanner is consulted again.
public Token next(final Token reusableToken) throws IOException {
    assert reusableToken != null;
    int posIncr = 1;
    Token result = reusableToken;
    // Drain sub-tokens queued by a previous split before scanning further.
    if (tokenList.size() > 0)
        return tokenList.remove();
    while (true) {
        int tokenType = scanner.getNextToken();

        if (tokenType == StandardTokenizerImpl.YYEOF) {
            return null;
        }

        if (scanner.yylength() <= maxTokenLength) {
            reusableToken.clear();
            // posIncr > 1 accounts for any too-long terms skipped below.
            reusableToken.setPositionIncrement(posIncr);
            scanner.getText(reusableToken);
            final int start = scanner.yychar();
            reusableToken.setStartOffset(start);
            reusableToken.setEndOffset(start + reusableToken.termLength());
            // This 'if' should be removed in the next release. For now, it
            // converts invalid acronyms to HOST. When removed, only the
            // 'else' part should remain.
            if (tokenType == StandardTokenizerImpl.ACRONYM_DEP) {
                if (replaceInvalidAcronym) {
                    reusableToken.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.HOST]);
                    reusableToken.setTermLength(reusableToken.termLength() - 1); // remove extra '.'
                    tokenType = StandardTokenizerImpl.HOST;
                } else {
                    reusableToken.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM]);
                    tokenType = StandardTokenizerImpl.ACRONYM;
                }
            } else {

                reusableToken.setType(StandardTokenizerImpl.TOKEN_TYPES[tokenType]);
            }

            // Split host names, numbers and alphanumerics at letter/digit
            // boundaries; all pieces are cloned into tokenList and the first
            // piece is returned immediately.
            if (tokenType == StandardTokenizerImpl.HOST || tokenType == StandardTokenizerImpl.NUM
                    || tokenType == StandardTokenizerImpl.ALPHANUM) {

                Tokenizer lt = new LetterDigitBreakTokenizer(new StringReader(reusableToken.term()));
                Token tk = null;
                int st = reusableToken.startOffset();
                final Token token = new Token();
                while ((tk = lt.next(token)) != null) {
                    // Rebase sub-token offsets onto the parent token's start.
                    tk.setStartOffset(tk.startOffset() + st);
                    tk.setEndOffset(tk.endOffset() + st);
                    tk.setType(reusableToken.type());
                    tokenList.add((Token) tk.clone());
                }
            }
            if (tokenList.size() > 0)
                result = tokenList.remove();

            return result;
        } else
            // When we skip a too-long term, we still increment the
            // position increment
            posIncr++;
    }
}

From source file:ch.systemsx.cisd.openbis.generic.server.dataaccess.db.search.SeparatorSplitterTokenFilter.java

License:Apache License

/**
 * Tells whether the given token's term should be considered for further
 * splitting: true for ALPHANUM and HOST tokens, and for NUM tokens whose
 * first character is actually a letter.
 */
private static boolean isSplittableToken(Token token) {
    final String tokenType = token.type();
    if (tokenType.equals(NUM_TOKEN_TYPE)) {
        // sometimes the original tokenizer lies to us and reports terms like
        // 'version_3' to be numbers. This is a heuristic to correct those lies.
        return Character.isLetter(token.term().charAt(0));
    }
    return tokenType.equals(ALPHANUM_TOKEN_TYPE) || tokenType.equals(HOST_TOKEN_TYPE);
}

From source file:com.fdt.sdl.core.analyzer.phonetix.lucene.PhoneticFilter.java

License:Open Source License

/**
 * Returns the next token in the stream, or <code>null</code> at EOF.
 * <p>
 * With no encoders configured, tokens pass through unchanged. With exactly
 * one encoder, each token is replaced by its phonetic key. With several
 * encoders, one output token is produced per encoder for each input token,
 * its key prefixed with the encoder's toString().
 */
public Token next() throws IOException {
    if (encoder == null) {
        // No phonetic encoding configured: pass tokens through unchanged.
        return input.next();
    } else if (encoder.length == 1) { // optimize, if one encoder only
        final Token t = input.next();
        if (t == null)
            return null;
        // A null encoder slot means "emit the original token" for that slot.
        return (encoder[0] != null)
                ? new Token(encoder[0].generateKey(t.termText()), t.startOffset(), t.endOffset(), t.type())
                : t;
    } else {
        // Round-robin over the encoders: actualIndex walks 0..length-1 for
        // the current actualToken, then a fresh token is pulled.
        ++actualIndex;

        // get next token, if necessary
        if (actualIndex >= encoder.length) {
            actualToken = input.next();
            if (actualToken == null) {
                // Park the index past the end so subsequent calls keep
                // returning null instead of re-reading the input.
                actualIndex = encoder.length;
                return null;
            }
            actualIndex = 0;
        }

        if (encoder[actualIndex] == null)
            return actualToken;
        else
            // Prefix the key with the encoder's toString() so keys produced
            // by different encoders stay distinguishable.
            return new Token(
                    encoder[actualIndex].toString() + ":"
                            + encoder[actualIndex].generateKey(actualToken.termText()),
                    actualToken.startOffset(), actualToken.endOffset(), actualToken.type());
    }
}

From source file:com.globalsight.ling.lucene.analysis.pl.PolishFilter.java

License:Apache License

/** Returns the next input Token, after being stemmed */
public final Token next() throws IOException {
    final Token token = getNextToken();
    if (token == null) {
        return null;
    }

    final String original = token.toString();
    final String stemmed = stemmer.stem(original, true);
    if (stemmed.equals(original)) {
        // Stemming changed nothing; reuse the input token as-is.
        return token;
    }

    // Rebuild the token around the stemmed text, preserving offsets,
    // type and position increment. This is silly...
    final Token result = new Token(stemmed, token.startOffset(), token.endOffset(), token.type());
    result.setPositionIncrement(token.getPositionIncrement());
    return result;
}

From source file:com.globalsight.ling.lucene.analysis.snowball.SnowballFilter.java

License:Apache License

/**
 * Returns the next input Token, after being stemmed.
 */
public final Token next() throws IOException {
    final Token token = getNextToken();
    if (token == null) {
        return null;
    }

    // Run the Snowball stemmer over the term text.
    stemmer.setCurrent(token.toString());
    stemmer.stem();
    final String stemmed = stemmer.getCurrent();

    // Wrap the stemmed text in a fresh token, keeping offsets and type.
    final int start = token.startOffset();
    final int end = token.endOffset();
    return new Token(stemmed, start, end, token.type());
}

From source file:com.globalsight.ling.lucene.analysis.th.BreakIteratorTokenTokenizer.java

License:Apache License

/**
 * Creates a break-iterator sub-stream for tokens of the configured type;
 * any other token yields null.
 */
protected TokenStream createSubStream(Token t) {
    if (!t.type().equals(type)) {
        return null;
    }

    final String text = t.toString();
    bi.setText(text);
    return new BreakIteratorAdaptor(text, bi, t.type(), t.startOffset());
}

From source file:com.globalsight.ling.tm2.lucene.GsStemFilter.java

License:Apache License

/**
 * Stems the next input Token and returns it.
 */
public final Token next() throws IOException {
    final Token token = getNextToken();
    if (token == null) {
        return null;
    }

    final String original = token.toString();
    final String stemmed = m_stemmer.stem(original);
    if (stemmed.equals(original)) {
        // Nothing changed; hand the input token straight through.
        return token;
    }

    // Replace the term text but keep the original offsets and type.
    return new Token(stemmed, token.startOffset(), token.endOffset(), token.type());
}

From source file:com.ideabase.repository.core.index.filter.TermUsageFilter.java

License:Open Source License

/**
 * Passes tokens through unchanged while counting occurrences of each
 * ALPHANUM term via {@code storeAndIncrementCount}.
 *
 * @return the next token from the input stream, or null at end of stream
 * @throws IOException if reading the underlying stream fails
 */
@Override
public Token next() throws IOException {
    final Token token = input.next();
    if (token != null) {
        if (token.type().equals(TYPE_ALPHANUM)) {
            // BUG FIX: String.valueOf(char[]) converts the ENTIRE internal
            // buffer, including stale characters beyond termLength(). Only
            // the first termLength() chars belong to the current term.
            storeAndIncrementCount(new String(token.termBuffer(), 0, token.termLength()));
        }
        return token;
    } else {
        return null;
    }
}

From source file:com.mathworks.xzheng.analysis.nutch.NutchExample.java

License:Apache License

/**
 * Demonstrates Nutch's document analyzer: tokenizes a sample sentence,
 * prints each token with its position, offsets and type, then translates
 * a Nutch phrase query into a Lucene query.
 *
 * @param args unused
 * @throws IOException if tokenization fails
 */
public static void main(String[] args) throws IOException {
    Configuration conf = Configuration.getConfiguration();
    conf.addResource("nutch-default.xml");
    NutchDocumentAnalyzer analyzer = new NutchDocumentAnalyzer(conf); //1

    TokenStream ts = analyzer.tokenStream("content", new StringReader("The quick brown fox..."));
    int position = 0;
    Token token;
    while (ts.incrementToken()) { // 2
        token = ts.getAttribute(org.apache.lucene.analysis.Token.class);
        if (token == null) {
            break;
        }
        int increment = token.getPositionIncrement();

        if (increment > 0) {
            position = position + increment;
            System.out.println();
            System.out.print(position + ": ");
        }

        // BUG FIX: calling toString() on a char[] prints the array's
        // identity (e.g. "[C@1f2a3b"), not the term text. Build a String
        // from the valid region of the term buffer instead.
        String term = new String(token.termBuffer(), 0, token.termLength());
        System.out.print("[" + term + ":" + token.startOffset() + "->"
                + token.endOffset() + ":" + token.type() + "] ");
    }
    System.out.println();

    Query nutchQuery = Query.parse("\"the quick brown\"", conf); // 3
    org.apache.lucene.search.Query luceneQuery;
    luceneQuery = new QueryFilters(conf).filter(nutchQuery); // A
    System.out.println("Translated: " + luceneQuery);
}