Usage examples for org.apache.lucene.analysis.TokenStream#addAttribute.
public final <T extends Attribute> T addAttribute(Class<T> attClass)
From source file:com.flaptor.indextank.query.IndexEngineParser.java
License:Apache License
public Iterator<AToken> parseDocumentField(String fieldName, String content) { final TokenStream tkstream = analyzer.tokenStream(fieldName, new StringReader(content)); final TermAttribute termAtt = tkstream.addAttribute(TermAttribute.class); final PositionIncrementAttribute posIncrAttribute = tkstream.addAttribute(PositionIncrementAttribute.class); final OffsetAttribute offsetAtt = tkstream.addAttribute(OffsetAttribute.class); return new AbstractIterator<AToken>() { int currentPosition = 0; @Override/*from w ww. jav a 2s . c om*/ protected AToken computeNext() { try { if (!tkstream.incrementToken()) { tkstream.end(); tkstream.close(); return endOfData(); } } catch (IOException e) { //This should never happen, as the reader is a StringReader } //final org.apache.lucene.analysis.Token luceneTk = tkstream.getAttribute(org.apache.lucene.analysis.Token.class); currentPosition += posIncrAttribute.getPositionIncrement(); final int position = currentPosition; final int startOffset = offsetAtt.startOffset(); final int endOffset = offsetAtt.endOffset(); final String text = termAtt.term(); return new AToken() { @Override public String getText() { return text; //luceneTk.term(); } @Override public int getPosition() { return position; //luceneTk.getPositionIncrement(); } @Override public int getStartOffset() { return startOffset; } @Override public int getEndOffset() { return endOffset; } }; } }; }
From source file:com.github.ippeiukai.externaltoken.lucene.analysis.TestPatternTokenizer.java
License:Apache License
/** * TODO: rewrite tests not to use string comparison. *///from ww w . ja va 2 s . c om private static String tsToString(TokenStream in) throws IOException { StringBuilder out = new StringBuilder(); CharTermAttribute termAtt = in.addAttribute(CharTermAttribute.class); // extra safety to enforce, that the state is not preserved and also // assign bogus values in.clearAttributes(); termAtt.setEmpty().append("bogusTerm"); while (in.incrementToken()) { out.append(termAtt.toString()); in.clearAttributes(); termAtt.setEmpty().append("bogusTerm"); out.append(' '); } if (out.length() > 0) out.deleteCharAt(out.length() - 1); in.close(); return out.toString(); }
From source file:com.github.le11.nls.lucene.UIMABaseAnalyzerTest.java
License:Apache License
/**
 * Smoke test: streams a fixed sentence through the UIMA analyzer and prints each
 * token with its character offsets, asserting the attributes are present.
 */
@Test
public void baseUIMAAnalyzerStreamTest() {
    try {
        TokenStream ts = analyzer.tokenStream("text", new StringReader("the big brown fox jumped on the wood"));
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
        // TokenStream workflow contract: reset() before the first incrementToken().
        ts.reset();
        while (ts.incrementToken()) {
            assertNotNull(offsetAtt);
            assertNotNull(termAtt);
            System.out.println("token '" + termAtt.toString() + "' has offset " + offsetAtt.startOffset() + ","
                    + offsetAtt.endOffset());
        }
        // Finish the workflow and release the stream.
        ts.end();
        ts.close();
    } catch (Exception e) {
        e.printStackTrace();
        fail(e.getLocalizedMessage());
    }
}
From source file:com.github.le11.nls.lucene.UIMAPayloadsAnalyzerTest.java
License:Apache License
/**
 * Smoke test: streams a fixed sentence through the payload-producing UIMA analyzer
 * and prints each token with its payload bytes, asserting the attributes are present.
 */
@Test
public void baseUIMAPayloadsAnalyzerStreamTest() {
    try {
        TokenStream ts = analyzer.tokenStream("text", new StringReader("the big brown fox jumped on the wood"));
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        PayloadAttribute payloadAttribute = ts.addAttribute(PayloadAttribute.class);
        // TokenStream workflow contract: reset() before the first incrementToken().
        ts.reset();
        while (ts.incrementToken()) {
            assertNotNull(termAtt);
            assertNotNull(payloadAttribute);
            System.out.println("token '" + termAtt.toString() + "' has payload "
                    + new String(payloadAttribute.getPayload().getData()));
        }
        // Finish the workflow and release the stream.
        ts.end();
        ts.close();
    } catch (Exception e) {
        e.printStackTrace();
        fail(e.getLocalizedMessage());
    }
}
From source file:com.github.le11.nls.lucene.UIMATypeAwareAnalyzerTest.java
License:Apache License
/**
 * Smoke test for the type-aware analyzer: streams a fixed sentence and prints each
 * token with the POS-derived type, asserting all registered attributes are present.
 */
@Test
public void testSimpleUsage() {
    try {
        UIMATypeAwareAnalyzer analyzer = new UIMATypeAwareAnalyzer("/HmmTaggerAggregate.xml",
                "org.apache.uima.TokenAnnotation", "posTag");
        TokenStream ts = analyzer.tokenStream("text", new StringReader("the big brown fox jumped on the wood"));
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
        TypeAttribute typeAttr = ts.addAttribute(TypeAttribute.class);
        PositionIncrementAttribute posAtt = ts.addAttribute(PositionIncrementAttribute.class);
        // TokenStream workflow contract: reset() before the first incrementToken().
        ts.reset();
        while (ts.incrementToken()) {
            assertNotNull(offsetAtt);
            assertNotNull(termAtt);
            assertNotNull(posAtt);
            assertNotNull(typeAttr);
            System.out.println("token '" + termAtt.toString() + "' has type " + typeAttr.type());
        }
        // Finish the workflow and release the stream.
        ts.end();
        ts.close();
    } catch (Exception e) {
        e.printStackTrace();
        fail(e.getLocalizedMessage());
    }
}
From source file:com.github.mosuka.apache.lucene.example.utils.LuceneExampleUtilTest.java
License:Apache License
/**
 * Verifies the per-field analyzer wrapper: the "id" field keeps the whole value as a
 * single term, while the "text" field is tokenized and lowercased.
 *
 * @throws IOException if tokenization fails
 */
public void testCreateAnalyzerWrapper() throws IOException {
    PerFieldAnalyzerWrapper wrapper = LuceneExampleUtil.createAnalyzerWrapper();

    List<String> expectedIdTermList = new LinkedList<String>(Arrays.asList("1"));
    assertEquals(expectedIdTermList, collectTerms(wrapper, "id", "1"));

    List<String> expectedTextTermList = new LinkedList<String>(
            Arrays.asList("lucene", "is", "a", "full", "text", "search", "library"));
    assertEquals(expectedTextTermList, collectTerms(wrapper, "text", "Lucene is a Full-text search library."));
}

/**
 * Runs {@code text} through the wrapper's analysis chain for {@code field} and
 * collects every produced term, closing the stream even if tokenization throws.
 */
private static List<String> collectTerms(PerFieldAnalyzerWrapper wrapper, String field, String text)
        throws IOException {
    List<String> terms = new LinkedList<String>();
    TokenStream tokenStream = wrapper.tokenStream(field, text);
    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
    try {
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            terms.add(charTermAttribute.toString());
        }
        tokenStream.end();
    } finally {
        tokenStream.close();
    }
    return terms;
}
From source file:com.github.pmerienne.trident.ml.preprocessing.EnglishTokenizer.java
License:Apache License
public List<String> tokenize(String text) { List<String> words = new ArrayList<String>(); if (text != null && !text.isEmpty()) { TokenStream tokenStream = this.createTokenStream(text); CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class); try {//w ww . j a va2 s. co m while (tokenStream.incrementToken()) { String term = charTermAttribute.toString(); words.add(term); } } catch (IOException ioe) { LOGGER.error("Unable to analyze text. Cause : " + ioe.getMessage(), ioe); } finally { try { tokenStream.end(); tokenStream.close(); } catch (IOException e) { // Can't do nothing!! LOGGER.error("Unable to close token stream : " + e.getMessage()); } } } return words; }
From source file:com.github.riccardove.easyjasub.lucene.LuceneParser.java
License:Apache License
/**
 * Registers on the given stream every token attribute this parser later reads:
 * offsets, reading, part of speech, inflection, and base form. Registration order
 * is irrelevant; attributes are keyed by class.
 */
private void addAttributes(TokenStream tokenStream) {
    tokenStream.addAttribute(BaseFormAttribute.class);
    tokenStream.addAttribute(InflectionAttribute.class);
    tokenStream.addAttribute(OffsetAttribute.class);
    tokenStream.addAttribute(PartOfSpeechAttribute.class);
    tokenStream.addAttribute(ReadingAttribute.class);
}
From source file:com.globalsight.ling.tm2.lucene.LuceneUtil.java
License:Apache License
/**
 * Pulls the next token from {@code input}, preferring a pre-built Lucene Token
 * carried by a GSAttributeImpl and falling back to synthesizing one from the
 * stream's CharTermAttribute text.
 *
 * NOTE(review): addAttribute() is called after incrementToken(); presumably the
 * attribute was already registered by the analysis chain and this only fetches
 * it — confirm, since registering attributes mid-stream is unusual.
 *
 * @param input token stream to advance; callers are expected to have prepared it
 * @return the next token, or null when the stream is exhausted or exposes no
 *         attribute implementations
 * @throws IOException propagated from incrementToken()
 */
public static org.apache.lucene.analysis.Token getNextToken(TokenStream input) throws IOException {
    org.apache.lucene.analysis.Token token = null;
    if (input.incrementToken()) {
        CharTermAttribute ccc = input.addAttribute(CharTermAttribute.class);
        Iterator<AttributeImpl> attIt = input.getAttributeImplsIterator();
        if (attIt == null || !attIt.hasNext()) {
            return null;
        }
        AttributeImpl att = attIt.next();
        // The first attribute impl may be the GS-specific wrapper carrying a full Token.
        if (att instanceof GSAttributeImpl) {
            token = ((GSAttributeImpl) att).getToken();
        }
        // Fallback: build a Token spanning the term text when no wrapper token exists.
        if (token == null && ccc != null && ccc.length() > 0) {
            String ttt = ccc.toString();
            token = new org.apache.lucene.analysis.Token(ttt, 0, ttt.length());
        }
    }
    return token;
}
From source file:com.ibm.watson.developer_cloud.professor_languo.primary_search.SpanBigramQueryGeneratorTest.java
License:Open Source License
/**
 * Checks that the generated span-bigram query matches the reference: every
 * SpanNearQuery clause must pair consecutive stemmed question terms, and the
 * TermQuery clauses must be exactly the set of stemmed terms (unigrams).
 *
 * NOTE(review): the bigram check assumes clauses are grouped per term pair across
 * {@code numFields} fields in order (hence {@code bigramidx / numFields}) — this
 * depends on the generator's clause ordering; confirm against SpanBigramQueryGenerator.
 */
private void test_that_generated_bigram_query_match_the_referenced_query() throws IOException {
    // Get stemmed question: run the question through the English analyzer and
    // collect every stemmed term in order.
    SingletonAnalyzer.generateAnalyzer(PrimarySearchConstants.ENGLISH_ANALYZER);
    EnglishAnalyzer ea = (EnglishAnalyzer) SingletonAnalyzer.getAnalyzer();
    TokenStream ts = ea.tokenStream("field", question);
    CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    List<String> stemmedQuestion = new ArrayList<String>();
    while (ts.incrementToken())
        stemmedQuestion.add(charTermAttribute.toString());
    // get query terms
    BooleanClause[] clauses = ((BooleanQuery) spanBigramQuery).getClauses();
    SpanQuery[] queries;
    String term1, term2;
    List<String> unigrams = new ArrayList<String>();
    // For n terms there are n unigrams + (n-1) bigrams = 2n-1 clauses per field,
    // so total clause count divided by (2n-1) yields the number of fields queried.
    int numFields = clauses.length / (2 * stemmedQuestion.size() - 1);
    // test bigrams
    int bigramidx = 0;
    for (int idx = 0; idx < clauses.length; idx++) {
        Query q = clauses[idx].getQuery();
        if (q instanceof SpanNearQuery) {
            queries = ((SpanNearQuery) clauses[idx].getQuery()).getClauses();
            // Map the running bigram counter back to the term-pair index: each
            // pair appears once per field.
            int termidx = bigramidx / numFields;
            term1 = ((SpanTermQuery) queries[0]).getTerm().text();
            term2 = ((SpanTermQuery) queries[1]).getTerm().text();
            assertEquals("Extracted first term doesn't match the stemmed term", stemmedQuestion.get(termidx),
                    term1);
            assertEquals("Extracted second term doesn't match the stemmed term",
                    stemmedQuestion.get(termidx + 1), term2);
            bigramidx++;
        } else if (q instanceof TermQuery) {
            unigrams.add(((TermQuery) clauses[idx].getQuery()).getTerm().text());
        } else {
            assertTrue("Unknown type of query found!", false);
        }
    }
    // test unigrams: both sets must cover each other (set equality, duplicates aside).
    for (String s : unigrams)
        assertTrue(stemmedQuestion.contains(s));
    for (String s : stemmedQuestion)
        assertTrue(unigrams.contains(s));
}