List of usage examples for org.apache.lucene.analysis.TokenStream#incrementToken()
public abstract boolean incrementToken() throws IOException;
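incrementToken() advances the stream to its next token, returning false once the input is exhausted. Consumers never receive token objects directly; instead they register attribute instances (for example CharTermAttribute) with addAttribute(), and each successful call to incrementToken() updates those instances in place. The documented contract also requires reset() before the first call, and end() followed by close() after the last. A minimal sketch of that workflow, assuming a recent Lucene version in which StandardAnalyzer has a no-argument constructor (the field name and sample text are illustrative):

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

Analyzer analyzer = new StandardAnalyzer();
try (TokenStream stream = analyzer.tokenStream("field", "some sample text")) {
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();                   // mandatory before the first incrementToken()
    while (stream.incrementToken()) { // false signals the end of the stream
        System.out.println(termAtt.toString());
    }
    stream.end();                     // finalizes end-of-stream state such as the final offset
}

The examples below, collected from real projects, all follow some variant of this loop.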
From source file: com.github.le11.nls.lucene.UIMAPayloadsAnalyzerTest.java
License: Apache License

@Test
public void baseUIMAPayloadsAnalyzerStreamTest() {
    try {
        TokenStream ts = analyzer.tokenStream("text", new StringReader("the big brown fox jumped on the wood"));
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        PayloadAttribute payloadAttribute = ts.addAttribute(PayloadAttribute.class);
        ts.reset(); // required before the first incrementToken() call
        while (ts.incrementToken()) {
            assertNotNull(termAtt);
            assertNotNull(payloadAttribute);
            System.out.println("token '" + termAtt.toString() + "' has payload "
                    + new String(payloadAttribute.getPayload().getData()));
        }
        ts.end();
        ts.close();
    } catch (Exception e) {
        e.printStackTrace();
        fail(e.getLocalizedMessage());
    }
}
From source file: com.github.le11.nls.lucene.UIMATypeAwareAnalyzerTest.java
License: Apache License

@Test
public void testSimpleUsage() {
    try {
        UIMATypeAwareAnalyzer analyzer = new UIMATypeAwareAnalyzer("/HmmTaggerAggregate.xml",
                "org.apache.uima.TokenAnnotation", "posTag");
        TokenStream ts = analyzer.tokenStream("text", new StringReader("the big brown fox jumped on the wood"));
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
        TypeAttribute typeAttr = ts.addAttribute(TypeAttribute.class);
        PositionIncrementAttribute posAtt = ts.addAttribute(PositionIncrementAttribute.class);
        ts.reset(); // required before the first incrementToken() call
        while (ts.incrementToken()) {
            assertNotNull(offsetAtt);
            assertNotNull(termAtt);
            assertNotNull(posAtt);
            assertNotNull(typeAttr);
            System.out.println("token '" + termAtt.toString() + "' has type " + typeAttr.type());
        }
        ts.end();
        ts.close();
    } catch (Exception e) {
        e.printStackTrace();
        fail(e.getLocalizedMessage());
    }
}
From source file: com.github.mosuka.apache.lucene.example.utils.LuceneExampleUtilTest.java
License: Apache License

public void testCreateAnalyzerWrapper() throws IOException {
    PerFieldAnalyzerWrapper wrapper = LuceneExampleUtil.createAnalyzerWrapper();
    TokenStream tokenStream = null;
    CharTermAttribute charTermAttribute = null;

    List<String> expectedIdTermList = new LinkedList<String>(Arrays.asList("1"));
    List<String> actualIdTermList = new LinkedList<String>();
    tokenStream = wrapper.tokenStream("id", "1");
    charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        actualIdTermList.add(charTermAttribute.toString());
    }
    tokenStream.close();
    assertEquals(expectedIdTermList, actualIdTermList);

    List<String> expectedTextTermList = new LinkedList<String>(
            Arrays.asList("lucene", "is", "a", "full", "text", "search", "library"));
    List<String> actualTextTermList = new LinkedList<String>();
    tokenStream = wrapper.tokenStream("text", "Lucene is a Full-text search library.");
    charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        actualTextTermList.add(charTermAttribute.toString());
    }
    tokenStream.close();
    assertEquals(expectedTextTermList, actualTextTermList);
}
From source file: com.github.pmerienne.trident.ml.preprocessing.EnglishTokenizer.java
License: Apache License

public List<String> tokenize(String text) {
    List<String> words = new ArrayList<String>();
    if (text != null && !text.isEmpty()) {
        // createTokenStream(...) is assumed to return a stream that has already been reset()
        TokenStream tokenStream = this.createTokenStream(text);
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        try {
            while (tokenStream.incrementToken()) {
                String term = charTermAttribute.toString();
                words.add(term);
            }
        } catch (IOException ioe) {
            LOGGER.error("Unable to analyze text. Cause: " + ioe.getMessage(), ioe);
        } finally {
            try {
                tokenStream.end();
                tokenStream.close();
            } catch (IOException e) {
                // Nothing more we can do at this point.
                LOGGER.error("Unable to close token stream: " + e.getMessage());
            }
        }
    }
    return words;
}
From source file: com.github.riccardove.easyjasub.lucene.LuceneParser.java
License: Apache License

private List<LuceneToken> readTokens(TokenStream tokenStream) throws IOException {
    ArrayList<LuceneToken> tokens = new ArrayList<LuceneToken>();
    HashMap<Integer, LuceneToken> tokensByStartOffset = new HashMap<Integer, LuceneToken>();
    addAttributes(tokenStream);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        if (tokenStream.hasAttributes()) {
            LuceneToken token = new LuceneToken();
            readOffset(tokenStream, token);
            // Lucene may output multiple tokens for compound words;
            // keep only the shortest token starting at each offset
            LuceneToken tokenWithSameStartOffset = tokensByStartOffset.get(token.getStartOffset());
            if (tokenWithSameStartOffset != null) {
                if (token.getEndOffset() >= tokenWithSameStartOffset.getEndOffset()) {
                    continue;
                } else {
                    tokens.remove(tokenWithSameStartOffset);
                }
            }
            readReading(tokenStream, token);
            readPartOfSpeech(tokenStream, token);
            readInflection(tokenStream, token);
            readBaseForm(tokenStream, token);
            tokensByStartOffset.put(token.getStartOffset(), token);
            tokens.add(token);
        }
    }
    tokenStream.end();
    tokenStream.close();
    return tokens;
}
From source file: com.github.rnewson.couchdb.lucene.util.AnalyzersTest.java
License: Apache License

private String[] analyze(final String analyzerName, final String text) throws Exception {
    final Analyzer analyzer = Analyzers.getAnalyzer(analyzerName);
    final TokenStream stream = analyzer.tokenStream("default", new StringReader(text));
    stream.reset();
    final List<String> result = new ArrayList<String>();
    while (stream.incrementToken()) {
        final CharTermAttribute c = stream.getAttribute(CharTermAttribute.class);
        result.add(c.toString());
    }
    return result.toArray(new String[0]);
}
From source file: com.github.tteofili.looseen.MinHashClassifier.java
License: Apache License

private ArrayList<String> getTokens(Analyzer analyzer, String field, String value) throws IOException {
    ArrayList<String> tokens = new ArrayList<String>();
    TokenStream ts = analyzer.tokenStream(field, value);
    ts.reset();
    while (ts.incrementToken()) {
        CharTermAttribute termAttribute = ts.getAttribute(CharTermAttribute.class);
        String token = new String(termAttribute.buffer(), 0, termAttribute.length());
        tokens.add(token);
    }
    ts.end();
    ts.close();
    return tokens;
}
From source file: com.globalsight.ling.tm2.lucene.LuceneUtil.java
License: Apache License

public static org.apache.lucene.analysis.Token getNextToken(TokenStream input) throws IOException {
    org.apache.lucene.analysis.Token token = null;
    if (input.incrementToken()) {
        CharTermAttribute ccc = input.addAttribute(CharTermAttribute.class);
        Iterator<AttributeImpl> attIt = input.getAttributeImplsIterator();
        if (attIt == null || !attIt.hasNext()) {
            return null;
        }
        // Prefer the GlobalSight-specific token if the stream exposes one...
        AttributeImpl att = attIt.next();
        if (att instanceof GSAttributeImpl) {
            token = ((GSAttributeImpl) att).getToken();
        }
        // ...otherwise fall back to building a Token from the term text.
        if (token == null && ccc != null && ccc.length() > 0) {
            String ttt = ccc.toString();
            token = new org.apache.lucene.analysis.Token(ttt, 0, ttt.length());
        }
    }
    return token;
}
From source file: com.globalsight.ling.tm2.lucene.LuceneUtil.java
License: Apache License

/**
 * Create GlobalSight TM tokens from a provided segment string
 * using GsAnalyzer.
 *
 * @param p_text fuzzy match format string
 * @return List of c.g.l.tm2.index.Tokens
 */
public static List<Token> createGsTokens(String p_text, GlobalSightLocale p_locale) throws Exception {
    GsAnalyzer analyzer = new GsAnalyzer(p_locale);
    TokenStream tokenStream = analyzer.tokenStream("blah", new StringReader(p_text));
    tokenStream.reset();

    List<String> tokens = new ArrayList<String>();
    while (tokenStream.incrementToken()) {
        CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class);
        tokens.add(termAtt.toString());
    }
    tokenStream.close();

    return buildTokenList(tokens);
}
From source file: com.globalsight.ling.tm2.lucene.LuceneUtil.java
License: Apache License

/**
 * Create GlobalSight TM tokens from a provided segment string
 * using GsAnalyzer. This method is suitable for use with TM3
 * fuzzy indices, and does two things differently than createGsTokens():
 * 1) It returns tokens in the order in which they appear
 * 2) It does not collapse duplicate tokens (and correspondingly does
 *    not return count information)
 *
 * @param p_text fuzzy match format string
 * @return List of Strings, each representing one token
 */
public static List<String> createTm3Tokens(String p_text, GlobalSightLocale p_locale) throws Exception {
    GsAnalyzer analyzer = new GsAnalyzer(p_locale);
    TokenStream tokenStream = analyzer.tokenStream("blah", new StringReader(p_text));
    tokenStream.reset();

    List<String> tokens = new ArrayList<String>();
    while (tokenStream.incrementToken()) {
        CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class);
        tokens.add(termAtt.toString());
    }
    tokenStream.close();
    return tokens;
}