Example usage for org.apache.lucene.analysis TokenStream incrementToken

Introduction

On this page you can find example usages of org.apache.lucene.analysis TokenStream incrementToken.

Prototype

public abstract boolean incrementToken() throws IOException;

Document

Consumers (i.e., IndexWriter) use this method to advance the stream to the next token.
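
The consumer contract is: add the attributes you want to read, call reset(), loop on incrementToken() until it returns false (each successful call advances the stream and updates the registered attribute instances in place), then call end() and close(). Below is a minimal sketch of that workflow; the StandardAnalyzer, field name "field", and sample text are illustrative choices, not part of the examples that follow:

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class IncrementTokenDemo {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        // TokenStream is Closeable, so try-with-resources handles close()
        try (TokenStream ts = analyzer.tokenStream("field", "Hello token stream world")) {
            // Register the attribute before consuming; the same instance is
            // updated in place by every successful incrementToken() call.
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset(); // mandatory before the first incrementToken() call
            while (ts.incrementToken()) { // returns false at end of stream
                System.out.println(termAtt.toString());
            }
            ts.end(); // records final offset/state for the stream
        }
    }
}

Forgetting reset() is a common mistake; newer Lucene versions throw an IllegalStateException ("TokenStream contract violation") when incrementToken() is called on a stream that has not been reset. Several of the examples below were written against older Lucene versions and omit one or more of these steps.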

Usage

From source file:com.github.le11.nls.lucene.UIMAPayloadsAnalyzerTest.java

License:Apache License

@Test
public void baseUIMAPayloadsAnalyzerStreamTest() {
    try {
        TokenStream ts = analyzer.tokenStream("text", new StringReader("the big brown fox jumped on the wood"));
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        PayloadAttribute payloadAttribute = ts.addAttribute(PayloadAttribute.class);
        ts.reset(); // the stream must be reset before the first incrementToken() call
        while (ts.incrementToken()) {
            assertNotNull(termAtt);
            assertNotNull(payloadAttribute);
            System.out.println("token '" + termAtt.toString() + "' has payload "
                    + new String(payloadAttribute.getPayload().getData()));
        }
        ts.end();
        ts.close();
    } catch (Exception e) {
        e.printStackTrace();
        fail(e.getLocalizedMessage());
    }
}

From source file:com.github.le11.nls.lucene.UIMATypeAwareAnalyzerTest.java

License:Apache License

@Test
public void testSimpleUsage() {
    try {
        UIMATypeAwareAnalyzer analyzer = new UIMATypeAwareAnalyzer("/HmmTaggerAggregate.xml",
                "org.apache.uima.TokenAnnotation", "posTag");
        TokenStream ts = analyzer.tokenStream("text", new StringReader("the big brown fox jumped on the wood"));
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
        TypeAttribute typeAttr = ts.addAttribute(TypeAttribute.class);
        PositionIncrementAttribute posAtt = ts.addAttribute(PositionIncrementAttribute.class);
        ts.reset(); // the stream must be reset before the first incrementToken() call
        while (ts.incrementToken()) {
            assertNotNull(offsetAtt);
            assertNotNull(termAtt);
            assertNotNull(posAtt);
            assertNotNull(typeAttr);
            System.out.println("token '" + termAtt.toString() + "' has type " + typeAttr.type());
        }
        ts.end();
        ts.close();
    } catch (Exception e) {
        e.printStackTrace();
        fail(e.getLocalizedMessage());
    }
}

From source file:com.github.mosuka.apache.lucene.example.utils.LuceneExampleUtilTest.java

License:Apache License

public void testCreateAnalyzerWrapper() throws IOException {
    PerFieldAnalyzerWrapper wrapper = LuceneExampleUtil.createAnalyzerWrapper();

    TokenStream tokenStream = null;
    CharTermAttribute charTermAttribute = null;

    List<String> expectedIdTermList = new LinkedList<String>(Arrays.asList("1"));
    List<String> actualIdTermList = new LinkedList<String>();
    tokenStream = wrapper.tokenStream("id", "1");
    charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        actualIdTermList.add(charTermAttribute.toString());
    }
    tokenStream.close();
    assertEquals(expectedIdTermList, actualIdTermList);

    List<String> expectedTextTermList = new LinkedList<String>(
            Arrays.asList("lucene", "is", "a", "full", "text", "search", "library"));
    List<String> actualTextTermList = new LinkedList<String>();
    tokenStream = wrapper.tokenStream("text", "Lucene is a Full-text search library.");
    charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        actualTextTermList.add(charTermAttribute.toString());
    }
    tokenStream.close();
    assertEquals(expectedTextTermList, actualTextTermList);
}

From source file:com.github.pmerienne.trident.ml.preprocessing.EnglishTokenizer.java

License:Apache License

public List<String> tokenize(String text) {
    List<String> words = new ArrayList<String>();
    if (text != null && !text.isEmpty()) {
        TokenStream tokenStream = this.createTokenStream(text);
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);

        try {
            while (tokenStream.incrementToken()) {
                String term = charTermAttribute.toString();
                words.add(term);
            }
        } catch (IOException ioe) {
            LOGGER.error("Unable to analyze text. Cause : " + ioe.getMessage(), ioe);
        } finally {
            try {
                tokenStream.end();
                tokenStream.close();
            } catch (IOException e) {
                // Nothing more we can do here.
                LOGGER.error("Unable to close token stream : " + e.getMessage());
            }
        }
    }

    return words;
}

From source file:com.github.riccardove.easyjasub.lucene.LuceneParser.java

License:Apache License

private List<LuceneToken> readTokens(TokenStream tokenStream) throws IOException {
    ArrayList<LuceneToken> tokens = new ArrayList<LuceneToken>();
    HashMap<Integer, LuceneToken> tokensByStartOffset = new HashMap<Integer, LuceneToken>();
    addAttributes(tokenStream);
    tokenStream.reset();

    while (tokenStream.incrementToken()) {
        if (tokenStream.hasAttributes()) {
            LuceneToken token = new LuceneToken();

            readOffset(tokenStream, token);

            // Lucene may output multiple tokens for compound words
            LuceneToken tokenWithSameStartOffset = tokensByStartOffset.get(token.getStartOffset());
            if (tokenWithSameStartOffset != null) {
                if (token.getEndOffset() >= tokenWithSameStartOffset.getEndOffset()) {
                    continue;
                } else {
                    tokens.remove(tokenWithSameStartOffset);
                }
            }

            readReading(tokenStream, token);
            readPartOfSpeech(tokenStream, token);
            readInflection(tokenStream, token);
            readBaseForm(tokenStream, token);

            tokensByStartOffset.put(token.getStartOffset(), token);
            tokens.add(token);
        }
    }

    tokenStream.end();
    tokenStream.close();
    return tokens;
}

From source file:com.github.rnewson.couchdb.lucene.util.AnalyzersTest.java

License:Apache License

private String[] analyze(final String analyzerName, final String text) throws Exception {
    final Analyzer analyzer = Analyzers.getAnalyzer(analyzerName);
    final TokenStream stream = analyzer.tokenStream("default", new StringReader(text));
    stream.reset();
    final List<String> result = new ArrayList<String>();
    while (stream.incrementToken()) {
        final CharTermAttribute c = stream.getAttribute(CharTermAttribute.class);
        result.add(c.toString());
    }
    return result.toArray(new String[0]);
}

From source file:com.github.tteofili.looseen.MinHashClassifier.java

License:Apache License

private ArrayList<String> getTokens(Analyzer analyzer, String field, String value) throws IOException {
    ArrayList<String> tokens = new ArrayList<String>();
    TokenStream ts = analyzer.tokenStream(field, value);
    ts.reset();
    while (ts.incrementToken()) {
        CharTermAttribute termAttribute = ts.getAttribute(CharTermAttribute.class);
        String token = new String(termAttribute.buffer(), 0, termAttribute.length());
        tokens.add(token);
    }
    ts.end();
    ts.close();
    return tokens;
}

From source file:com.globalsight.ling.tm2.lucene.LuceneUtil.java

License:Apache License

public static org.apache.lucene.analysis.Token getNextToken(TokenStream input) throws IOException {
    org.apache.lucene.analysis.Token token = null;
    if (input.incrementToken()) {
        CharTermAttribute ccc = input.addAttribute(CharTermAttribute.class);
        Iterator<AttributeImpl> attIt = input.getAttributeImplsIterator();

        if (attIt == null || !attIt.hasNext()) {
            return null;
        }

        AttributeImpl att = attIt.next();
        if (att instanceof GSAttributeImpl) {
            token = ((GSAttributeImpl) att).getToken();
        }

        if (token == null && ccc != null && ccc.length() > 0) {
            String ttt = ccc.toString();
            token = new org.apache.lucene.analysis.Token(ttt, 0, ttt.length());
        }
    }

    return token;
}

From source file:com.globalsight.ling.tm2.lucene.LuceneUtil.java

License:Apache License

/**
 * Create GlobalSight TM tokens from a provided segment string
 * using GsAnalyzer.
 *
 * @param p_text fuzzy match format string
 * @return List of c.g.l.tm2.index.Tokens
 */
public static List<Token> createGsTokens(String p_text, GlobalSightLocale p_locale) throws Exception {
    GsAnalyzer analyzer = new GsAnalyzer(p_locale);
    TokenStream tokenStream = analyzer.tokenStream("blah", new StringReader(p_text));

    tokenStream.reset();
    //GSAttribute gsAtt = tokenStream.addAttribute(GSAttribute.class);
    //org.apache.lucene.analysis.Token luceneToken = null;
    List<String> tokens = new ArrayList<String>();

    while (tokenStream.incrementToken()) {
        // luceneToken = gsAtt.getToken();

        CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class);
        tokens.add(termAtt.toString());

    }
    tokenStream.close();
    return buildTokenList(tokens);
}

From source file:com.globalsight.ling.tm2.lucene.LuceneUtil.java

License:Apache License

/**
 * Create GlobalSight TM tokens from a provided segment string
 * using GsAnalyzer.  This method is suitable for use with TM3
 * fuzzy indices, and does two things differently than createGsTokens():
 * 1) It returns tokens in the order in which they appear
 * 2) It does not collapse duplicate tokens (and correspondingly does
 *    not return count information)
 *
 * @param p_text fuzzy match format string
 * @return List of Strings, each representing one token
 */
public static List<String> createTm3Tokens(String p_text, GlobalSightLocale p_locale) throws Exception {
    GsAnalyzer analyzer = new GsAnalyzer(p_locale);
    TokenStream tokenStream = analyzer.tokenStream("blah", new StringReader(p_text));
    tokenStream.reset();

    List<String> tokens = new ArrayList<String>();
    while (tokenStream.incrementToken()) {
        CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class);
        tokens.add(termAtt.toString());
    }
    tokenStream.close();

    return tokens;
}