List of usage examples for org.apache.lucene.analysis TokenStream getAttribute
public final <T extends Attribute> T getAttribute(Class<T> attClass)
The caller must pass in a Class&lt;? extends Attribute&gt; object identifying the attribute to retrieve; the method throws IllegalArgumentException if the stream does not have that attribute.
From source file:fry.future.plugin.example.APP.java
/**
 * Tokenizes {@code str} with the given analyzer and returns the term text of each token.
 *
 * @param analyzer analyzer used to produce the token stream
 * @param str      input text to tokenize
 * @return list of token terms in stream order
 * @throws IOException if the token stream fails during consumption
 */
private static List<String> tokenString(Analyzer analyzer, String str) throws IOException {
    List<String> result = new ArrayList<>();
    // try-with-resources: the original never closed the TokenStream, violating the
    // Lucene contract (reset -> incrementToken* -> end -> close) and leaking the
    // analyzer's reusable stream.
    try (TokenStream tokenStream = analyzer.tokenStream("Test", new StringReader(str))) {
        // Fetch the attribute once; it is updated in place on every incrementToken().
        CharTermAttribute termAttr = tokenStream.getAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            result.add(termAttr.toString());
        }
        tokenStream.end();
    }
    return result;
}
From source file:hivemall.nlp.tokenizer.KuromojiUDF.java
License:Apache License
private static void analyzeTokens(@Nonnull TokenStream stream, @Nonnull List<Text> results) throws IOException { // instantiate an attribute placeholder once CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class); stream.reset();//from w w w . java 2 s . c o m while (stream.incrementToken()) { String term = termAttr.toString(); results.add(new Text(term)); } }
From source file:info.johtani.jjug.lucene.sample.TokenizeSample.java
License:Apache License
private static void printToken(String text, Analyzer analyzer) { System.out.println("--- Original: [" + text + "]"); try {//from w w w . j av a 2 s . c o m TokenStream tokens = analyzer.tokenStream("content", text); tokens.reset(); CharTermAttribute termAttr = tokens.getAttribute(CharTermAttribute.class); while (tokens.incrementToken()) { System.out.println("[" + termAttr.toString() + "]"); } tokens.reset(); } catch (IOException e) { e.printStackTrace(); } }
From source file:ivory.core.tokenize.Tokenizer.java
License:Apache License
/**
 * Converts a token stream into a single space-delimited string of its terms.
 *
 * <p>NOTE(review): this method never calls {@code reset()} on the stream; it
 * appears to assume the caller has already done so — confirm against call sites,
 * since newer Lucene versions throw if the contract is not followed.
 *
 * @param tokenStream object returned by a Lucene tokenizer
 * @return string of the tokens output by {@code tokenStream}, separated by single spaces
 */
protected static String streamToString(TokenStream tokenStream) {
    CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class);
    tokenStream.clearAttributes();
    StringBuilder tokenized = new StringBuilder();
    try {
        while (tokenStream.incrementToken()) {
            // Append the CharSequence attribute directly; the original built a
            // throwaway String via concatenation inside the StringBuilder append.
            tokenized.append(termAtt).append(' ');
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    // trim() drops the trailing separator (and returns "" for an empty stream).
    return tokenized.toString().trim();
}
From source file:jaligner.Sequence.java
License:Open Source License
/**
 * Constructs a Sequence by tokenizing {@code sequence} with the given analyzer and
 * capturing up to {@code max_length} Lucene {@link Token}s (term text, offsets, and,
 * when the stream provides them, position increment and type).
 *
 * <p>NOTE(review): the stream is consumed without a prior {@code reset()} call; on
 * Lucene versions that enforce the TokenStream contract this throws an
 * IllegalStateException — confirm the target Lucene version and analyzer behavior.
 *
 * @param sequence   raw text to tokenize and store
 * @param analyzer   analyzer producing the token stream
 * @param max_length maximum number of tokens to retain
 * @throws IOException if tokenization fails
 */
public Sequence(String sequence, Analyzer analyzer, int max_length) throws IOException {
    super();
    this.sequence = sequence;
    TokenStream stream = analyzer.tokenStream("contents", new StringReader(sequence));
    // Factory that builds Token instances compatible with this stream's attribute setup.
    Token.TokenAttributeFactory tokenAttributeFactory = new Token.TokenAttributeFactory(
            stream.getAttributeFactory());
    Vector<Token> tokenVector = new Vector<Token>();
    while (stream.incrementToken() && tokenVector.size() < max_length) {
        // Token token = new Token();
        // Token token = (Token) stream.getAttribute(CharTermAttribute.class);
        // A fresh Token per iteration: the stream's attributes are reused in place,
        // so their state must be copied out before the next incrementToken().
        Token token = (Token) tokenAttributeFactory.createAttributeInstance(Token.class);
        CharTermAttribute charTerm = stream.getAttribute(CharTermAttribute.class);
        OffsetAttribute offset = stream.getAttribute(OffsetAttribute.class);
        // PayloadAttribute payload = stream.getAttribute(PayloadAttribute.class);
        // FlagsAttribute flags = stream.getAttribute(FlagsAttribute.class);
        // public Token reinit(char[] newTermBuffer, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset, String newType) {
        // Copy term characters and character offsets into the detached token.
        token.reinit(charTerm.buffer(), 0, charTerm.length(), offset.startOffset(), offset.endOffset());
        token.setOffset(offset.startOffset(), offset.endOffset());
        // token.setPayload(payload.getPayload());
        // token.setFlags(flags.getFlags());
        // Position increment and type are optional attributes; copy only when present.
        if (stream.hasAttribute(PositionIncrementAttribute.class)) {
            PositionIncrementAttribute positionIncrement = stream
                    .getAttribute(PositionIncrementAttribute.class);
            token.setPositionIncrement(positionIncrement.getPositionIncrement());
        }
        if (stream.hasAttribute(TypeAttribute.class)) {
            TypeAttribute type = stream.getAttribute(TypeAttribute.class);
            token.setType(type.type());
        }
        tokenVector.add(token);
    }
    stream.end();
    stream.close();
    this.tokens = tokenVector.toArray(new Token[tokenVector.size()]);
}
From source file:jobs.LoadOntologyJob.java
private int getTotalLength(String label) throws IOException { //Analyzer doesn't remomve stop words Analyzer customanalyzer = new CustomStopWordsStandardAnalyzer(Version.LUCENE_47); List<String> resultStop = new ArrayList<String>(); TokenStream customstream = customanalyzer.tokenStream(null, new StringReader(label)); customstream.reset();//from w w w. ja v a 2s . c om while (customstream.incrementToken()) { resultStop.add(customstream.getAttribute(CharTermAttribute.class).toString()); } return resultStop.size(); }
From source file:jobs.LoadOntologyJob.java
/**
 * Counts the tokens of {@code label} after standard analysis, which removes stop
 * words — i.e. the token count excluding stop words.
 *
 * @param label text to tokenize
 * @return number of tokens remaining after stop-word removal
 * @throws IOException if tokenization fails
 */
private int getLengthWithoutStopWords(String label) throws IOException {
    int count = 0;
    // try-with-resources: the original leaked both the Analyzer and the TokenStream;
    // Lucene requires end() + close() after consuming a stream. Counting directly
    // also avoids building a throwaway List just to take its size.
    try (Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_47);
            TokenStream stream = analyzer.tokenStream(null, new StringReader(label))) {
        stream.reset();
        while (stream.incrementToken()) {
            count++;
        }
        stream.end();
    }
    return count;
}
From source file:jp.co.atware.solr.analizers.cjk.CJKBigramFilterTest.java
License:Apache License
@Theory public void testIncrementToken(Fixture testData) throws Exception { TokenStream tokenStream = getTokenStream(testData.input); CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class); List<String> actual = new ArrayList<String>(); while (tokenStream.incrementToken()) { actual.add(termAtt.toString());//w ww. j a v a 2 s . c o m } assertThat(actual.toArray(EMPTY_STRING_ARRAY), is(testData.expected)); }
From source file:jp.co.atware.solr.analizers.cjk.CranioCaudalFilterTest.java
License:Apache License
@Theory public void testIncrementToken(TestData testData) throws Exception { TokenStream tokenStream = createTokenStream(testData.input); CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class); List<String> actual = new ArrayList<String>(); while (tokenStream.incrementToken()) { actual.add(termAtt.toString());//www . j a v a 2 s .c o m } assertThat(actual.toArray(EMPTY_STRING_ARRAY), is(testData.expected)); }
From source file:jp.co.atware.solr.analizers.cjk.MultistageMappingCharFilterTest.java
License:Apache License
@Theory public void testMultiMappingAndOffset(TestData testData) throws Exception { Reader reader = charFilterFactory.create(new StringReader(testData.input)); TokenStream tokenStream = tokenizerFactory.create(reader); OffsetAttribute actualOffset = tokenStream.getAttribute(OffsetAttribute.class); CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class); tokenStream.reset();//from ww w . j a va 2s . co m assertThat(tokenStream.incrementToken(), is(true)); assertThat(termAtt.toString(), is(testData.expected)); assertThat(actualOffset.startOffset(), is(testData.start)); assertThat(actualOffset.endOffset(), is(testData.end)); assertThat(tokenStream.incrementToken(), is(false)); }