Example usage for org.apache.lucene.analysis TokenStream getAttribute

Introduction

This page collects example usages of org.apache.lucene.analysis.TokenStream#getAttribute.

Prototype

public final <T extends Attribute> T getAttribute(Class<T> attClass) 

Document

Returns the instance of the passed-in Attribute contained in this AttributeSource. The caller must pass in a Class<? extends Attribute> value; if this AttributeSource does not contain the requested attribute, an IllegalArgumentException is thrown (addAttribute, by contrast, creates the attribute on demand).
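Most of the examples below follow the same consumption pattern. Here is a minimal sketch of that pattern (the method name and field name are illustrative, not taken from any of the sources below):

private static List<String> collectTerms(Analyzer analyzer, String text) throws IOException {
    List<String> terms = new ArrayList<>();
    // try-with-resources works because TokenStream implements Closeable
    try (TokenStream stream = analyzer.tokenStream("field", new StringReader(text))) {
        // getAttribute requires the attribute to already exist on the stream;
        // standard tokenizers register CharTermAttribute, so this succeeds here
        CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
        stream.reset();                   // mandatory before the first incrementToken()
        while (stream.incrementToken()) { // refills the same termAtt instance each time
            terms.add(termAtt.toString());
        }
        stream.end();                     // records the final offset state
    }
    return terms;
}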

Usage

From source file:fry.future.plugin.example.APP.java

private static List<String> tokenString(Analyzer analyzer, String str) throws IOException {
    List<String> result = new ArrayList<>();
    TokenStream tokenStream = analyzer.tokenStream("Test", new StringReader(str));
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        result.add(tokenStream.getAttribute(CharTermAttribute.class).toString());
    }
    tokenStream.end();   // finish consuming, then release the stream
    tokenStream.close();
    return result;
}
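Note that this example looks the attribute up inside the loop; AttributeSource keeps a single instance per attribute class and returns the same object on every call, so this works, but the later examples hoist the getAttribute call out of the loop, which is the more common style.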

From source file:hivemall.nlp.tokenizer.KuromojiUDF.java

License:Apache License

private static void analyzeTokens(@Nonnull TokenStream stream, @Nonnull List<Text> results) throws IOException {
    // instantiate an attribute placeholder once
    CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
    stream.reset();

    while (stream.incrementToken()) {
        String term = termAttr.toString();
        results.add(new Text(term));
    }
}
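Unlike most of the other examples, this method neither ends nor closes the stream: it receives the TokenStream as a parameter, so its caller owns the stream's lifecycle.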

From source file:info.johtani.jjug.lucene.sample.TokenizeSample.java

License:Apache License

private static void printToken(String text, Analyzer analyzer) {
    System.out.println("--- Original: [" + text + "]");
    try {
        TokenStream tokens = analyzer.tokenStream("content", text);
        tokens.reset();
        CharTermAttribute termAttr = tokens.getAttribute(CharTermAttribute.class);
        while (tokens.incrementToken()) {
            System.out.println("[" + termAttr.toString() + "]");
        }
        tokens.end();   // end() + close(), not reset(): the stream is fully consumed here
        tokens.close();
    } catch (IOException e) {
        e.printStackTrace();
    }
}

From source file:ivory.core.tokenize.Tokenizer.java

License:Apache License

/**
 * Convert tokenStream object into a string.
 *
 * @param tokenStream
 *    object returned by Lucene tokenizer
 * @return
 *    String corresponding to the tokens output by tokenStream
 */
protected static String streamToString(TokenStream tokenStream) {
    CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class);
    tokenStream.clearAttributes();
    // Note: on Lucene 4.0+ the stream must also be reset() before incrementToken();
    // this snippet targets an older Lucene where that was not required.
    StringBuilder tokenized = new StringBuilder();
    try {
        while (tokenStream.incrementToken()) {
            tokenized.append(termAtt.toString() + " ");
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    return tokenized.toString().trim();
}

From source file:jaligner.Sequence.java

License:Open Source License

/**
 * Constructor
 * 
 * @param sequence the raw text to tokenize
 * @param analyzer the Lucene analyzer used to produce tokens
 * @param max_length maximum number of tokens to keep
 */
public Sequence(String sequence, Analyzer analyzer, int max_length) throws IOException {
    super();
    this.sequence = sequence;

    TokenStream stream = analyzer.tokenStream("contents", new StringReader(sequence));
    Token.TokenAttributeFactory tokenAttributeFactory = new Token.TokenAttributeFactory(
            stream.getAttributeFactory());

    Vector<Token> tokenVector = new Vector<Token>();

    stream.reset(); // required before the first incrementToken()
    while (stream.incrementToken() && tokenVector.size() < max_length) {
        Token token = (Token) tokenAttributeFactory.createAttributeInstance(Token.class);

        CharTermAttribute charTerm = stream.getAttribute(CharTermAttribute.class);
        OffsetAttribute offset = stream.getAttribute(OffsetAttribute.class);

        token.reinit(charTerm.buffer(), 0, charTerm.length(), offset.startOffset(), offset.endOffset());
        token.setOffset(offset.startOffset(), offset.endOffset());

        if (stream.hasAttribute(PositionIncrementAttribute.class)) {
            PositionIncrementAttribute positionIncrement = stream
                    .getAttribute(PositionIncrementAttribute.class);
            token.setPositionIncrement(positionIncrement.getPositionIncrement());
        }

        if (stream.hasAttribute(TypeAttribute.class)) {
            TypeAttribute type = stream.getAttribute(TypeAttribute.class);
            token.setType(type.type());
        }

        tokenVector.add(token);
    }

    stream.end();
    stream.close();

    this.tokens = tokenVector.toArray(new Token[tokenVector.size()]);
}
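This constructor shows the guard pattern for optional attributes: PositionIncrementAttribute and TypeAttribute are only fetched after a hasAttribute check, because getAttribute throws for attributes the stream does not contain.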

From source file:jobs.LoadOntologyJob.java

private int getTotalLength(String label) throws IOException {
    // This analyzer doesn't remove stop words
    Analyzer customanalyzer = new CustomStopWordsStandardAnalyzer(Version.LUCENE_47);
    List<String> resultStop = new ArrayList<String>();
    TokenStream customstream = customanalyzer.tokenStream(null, new StringReader(label));
    customstream.reset();
    while (customstream.incrementToken()) {
        resultStop.add(customstream.getAttribute(CharTermAttribute.class).toString());
    }
    return resultStop.size();
}

From source file:jobs.LoadOntologyJob.java

private int getLengthWithoutStopWords(String label) throws IOException {
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_47);
    List<String> result = new ArrayList<String>();
    TokenStream stream = analyzer.tokenStream(null, new StringReader(label));
    stream.reset();
    while (stream.incrementToken()) {
        result.add(stream.getAttribute(CharTermAttribute.class).toString());
    }
    return result.size();
}
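The two helpers differ only in the analyzer used, so the difference between getTotalLength and getLengthWithoutStopWords gives the number of stop words in the label.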

From source file:jp.co.atware.solr.analizers.cjk.CJKBigramFilterTest.java

License:Apache License

@Theory
public void testIncrementToken(Fixture testData) throws Exception {
    TokenStream tokenStream = getTokenStream(testData.input);
    CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class);
    List<String> actual = new ArrayList<String>();
    while (tokenStream.incrementToken()) {
        actual.add(termAtt.toString());
    }
    assertThat(actual.toArray(EMPTY_STRING_ARRAY), is(testData.expected));
}

From source file:jp.co.atware.solr.analizers.cjk.CranioCaudalFilterTest.java

License:Apache License

@Theory
public void testIncrementToken(TestData testData) throws Exception {
    TokenStream tokenStream = createTokenStream(testData.input);
    CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class);
    List<String> actual = new ArrayList<String>();
    while (tokenStream.incrementToken()) {
        actual.add(termAtt.toString());
    }
    assertThat(actual.toArray(EMPTY_STRING_ARRAY), is(testData.expected));
}

From source file:jp.co.atware.solr.analizers.cjk.MultistageMappingCharFilterTest.java

License:Apache License

@Theory
public void testMultiMappingAndOffset(TestData testData) throws Exception {
    Reader reader = charFilterFactory.create(new StringReader(testData.input));
    TokenStream tokenStream = tokenizerFactory.create(reader);
    OffsetAttribute actualOffset = tokenStream.getAttribute(OffsetAttribute.class);
    CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class);

    tokenStream.reset();

    assertThat(tokenStream.incrementToken(), is(true));
    assertThat(termAtt.toString(), is(testData.expected));
    assertThat(actualOffset.startOffset(), is(testData.start));
    assertThat(actualOffset.endOffset(), is(testData.end));
    assertThat(tokenStream.incrementToken(), is(false));
}
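The trailing incrementToken() assertion confirms that the stream is exhausted after the single expected token, so the char filter and tokenizer together produced exactly one term.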