Example usage for org.apache.lucene.analysis TokenStream addAttribute

List of usage examples for org.apache.lucene.analysis TokenStream addAttribute

Introduction

In this page you can find the example usage for org.apache.lucene.analysis TokenStream addAttribute.

Prototype

public final <T extends Attribute> T addAttribute(Class<T> attClass) 

Source Link

Document

The caller must pass in a Class&lt;? extends Attribute&gt; token; the stream returns its existing instance of that attribute, or creates and registers a new one if none is present.

Usage

From source file:com.flaptor.indextank.query.IndexEngineParser.java

License:Apache License

/**
 * Tokenizes {@code content} as field {@code fieldName} and exposes the result
 * as a lazy iterator of {@link AToken}s. The underlying TokenStream is ended
 * and closed when the stream is exhausted.
 *
 * @param fieldName the field whose analyzer configuration should be used
 * @param content   the raw text to tokenize
 * @return a lazy iterator over the produced tokens
 */
public Iterator<AToken> parseDocumentField(String fieldName, String content) {
    final TokenStream tkstream = analyzer.tokenStream(fieldName, new StringReader(content));
    final TermAttribute termAtt = tkstream.addAttribute(TermAttribute.class);
    final PositionIncrementAttribute posIncrAttribute = tkstream.addAttribute(PositionIncrementAttribute.class);
    final OffsetAttribute offsetAtt = tkstream.addAttribute(OffsetAttribute.class);

    return new AbstractIterator<AToken>() {
        // Running absolute position, accumulated from the per-token increments.
        int currentPosition = 0;

        @Override
        protected AToken computeNext() {
            try {
                if (!tkstream.incrementToken()) {
                    tkstream.end();
                    tkstream.close();
                    return endOfData();
                }
            } catch (IOException e) {
                // A StringReader should never throw, but if it does we must not
                // fall through: the previous code swallowed the exception and then
                // re-read stale attribute state, fabricating a bogus token.
                throw new IllegalStateException("Unexpected IOException while tokenizing field content", e);
            }
            // Snapshot the attribute values now: the attributes are mutated in
            // place on the next incrementToken() call.
            currentPosition += posIncrAttribute.getPositionIncrement();
            final int position = currentPosition;
            final int startOffset = offsetAtt.startOffset();
            final int endOffset = offsetAtt.endOffset();
            final String text = termAtt.term();
            return new AToken() {
                @Override
                public String getText() {
                    return text;
                }

                @Override
                public int getPosition() {
                    return position;
                }

                @Override
                public int getStartOffset() {
                    return startOffset;
                }

                @Override
                public int getEndOffset() {
                    return endOffset;
                }
            };
        }
    };

}

From source file:com.github.ippeiukai.externaltoken.lucene.analysis.TestPatternTokenizer.java

License:Apache License

/** 
 * TODO: rewrite tests not to use string comparison.
 *///from  ww  w . ja va  2  s  .  c  om
/**
 * Drains {@code in} and returns its terms joined by single spaces; closes the
 * stream when done. Before every increment the term attribute is cleared and
 * poisoned with a sentinel value, so any stream that illegally preserves state
 * across calls will surface "bogusTerm" in the output.
 *
 * TODO: rewrite tests not to use string comparison.
 */
private static String tsToString(TokenStream in) throws IOException {
    final StringBuilder result = new StringBuilder();
    final CharTermAttribute term = in.addAttribute(CharTermAttribute.class);

    // Poison the attribute so a stream that fails to set it is detectable.
    in.clearAttributes();
    term.setEmpty().append("bogusTerm");

    boolean first = true;
    while (in.incrementToken()) {
        if (!first) {
            result.append(' ');
        }
        first = false;
        result.append(term.toString());
        // Re-poison before the next token.
        in.clearAttributes();
        term.setEmpty().append("bogusTerm");
    }

    in.close();
    return result.toString();
}

From source file:com.github.le11.nls.lucene.UIMABaseAnalyzerTest.java

License:Apache License

@Test
public void baseUIMAAnalyzerStreamTest() {
    try {
        final TokenStream stream = analyzer.tokenStream("text",
                new StringReader("the big brown fox jumped on the wood"));
        final CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        final OffsetAttribute offsets = stream.addAttribute(OffsetAttribute.class);

        // Walk the stream, printing each token with its character offsets.
        while (stream.incrementToken()) {
            assertNotNull(offsets);
            assertNotNull(term);
            System.out.println("token '" + term.toString() + "' has offset " + offsets.startOffset() + ","
                    + offsets.endOffset());
        }
    } catch (Exception e) {
        // Any failure during analysis fails the test with its message.
        e.printStackTrace();
        fail(e.getLocalizedMessage());
    }
}

From source file:com.github.le11.nls.lucene.UIMAPayloadsAnalyzerTest.java

License:Apache License

@Test
public void baseUIMAPayloadsAnalyzerStreamTest() {
    try {
        final TokenStream stream = analyzer.tokenStream("text",
                new StringReader("the big brown fox jumped on the wood"));
        final CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        final PayloadAttribute payload = stream.addAttribute(PayloadAttribute.class);

        // Walk the stream, printing each token alongside its raw payload bytes.
        while (stream.incrementToken()) {
            assertNotNull(term);
            assertNotNull(payload);
            System.out.println("token '" + term.toString() + "' has payload "
                    + new String(payload.getPayload().getData()));
        }
    } catch (Exception e) {
        // Any failure during analysis fails the test with its message.
        e.printStackTrace();
        fail(e.getLocalizedMessage());
    }
}

From source file:com.github.le11.nls.lucene.UIMATypeAwareAnalyzerTest.java

License:Apache License

@Test
public void testSimpleUsage() {
    try {
        final UIMATypeAwareAnalyzer analyzer = new UIMATypeAwareAnalyzer("/HmmTaggerAggregate.xml",
                "org.apache.uima.TokenAnnotation", "posTag");
        final TokenStream stream = analyzer.tokenStream("text",
                new StringReader("the big brown fox jumped on the wood"));
        final CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        final OffsetAttribute offsets = stream.addAttribute(OffsetAttribute.class);
        final TypeAttribute type = stream.addAttribute(TypeAttribute.class);
        final PositionIncrementAttribute position = stream.addAttribute(PositionIncrementAttribute.class);

        // Walk the stream, printing each token with its UIMA-derived type.
        while (stream.incrementToken()) {
            assertNotNull(offsets);
            assertNotNull(term);
            assertNotNull(position);
            assertNotNull(type);
            System.out.println("token '" + term.toString() + "' has type " + type.type());
        }
    } catch (Exception e) {
        // Any failure during analysis fails the test with its message.
        e.printStackTrace();
        fail(e.getLocalizedMessage());
    }
}

From source file:com.github.mosuka.apache.lucene.example.utils.LuceneExampleUtilTest.java

License:Apache License

/**
 * Verifies that the per-field analyzer wrapper tokenizes the "id" field
 * verbatim and the "text" field with full-text analysis (lowercasing,
 * hyphen splitting, trailing punctuation removal).
 *
 * @throws IOException if tokenization fails
 */
public void testCreateAnalyzerWrapper() throws IOException {
    PerFieldAnalyzerWrapper wrapper = LuceneExampleUtil.createAnalyzerWrapper();

    // "id" uses a keyword-style analyzer: the value passes through unchanged.
    List<String> expectedIdTermList = new LinkedList<String>(Arrays.asList("1"));
    assertEquals(expectedIdTermList, collectTerms(wrapper.tokenStream("id", "1")));

    // "text" uses a full-text analyzer: lowercased, split on hyphens, no punctuation.
    List<String> expectedTextTermList = new LinkedList<String>(
            Arrays.asList("lucene", "is", "a", "full", "text", "search", "library"));
    assertEquals(expectedTextTermList,
            collectTerms(wrapper.tokenStream("text", "Lucene is a Full-text search library.")));
}

/**
 * Drains {@code tokenStream} and returns its terms in order. Follows the
 * mandatory TokenStream workflow (reset / incrementToken / end / close).
 *
 * @param tokenStream the stream to consume; closed by this method
 * @return the stream's terms in emission order
 * @throws IOException if tokenization fails
 */
private static List<String> collectTerms(TokenStream tokenStream) throws IOException {
    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
    List<String> terms = new LinkedList<String>();
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        terms.add(charTermAttribute.toString());
    }
    // end() finalizes end-of-stream attribute state; the original omitted it.
    tokenStream.end();
    tokenStream.close();
    return terms;
}

From source file:com.github.pmerienne.trident.ml.preprocessing.EnglishTokenizer.java

License:Apache License

/**
 * Splits {@code text} into analyzed terms using this tokenizer's token stream.
 * Returns an empty list for null or empty input. Analysis failures are logged
 * and yield whatever tokens were collected before the failure.
 *
 * @param text the text to tokenize; may be null or empty
 * @return the analyzed terms, possibly empty, never null
 */
public List<String> tokenize(String text) {
    final List<String> tokens = new ArrayList<String>();
    // Guard clause: nothing to analyze.
    if (text == null || text.isEmpty()) {
        return tokens;
    }

    final TokenStream stream = this.createTokenStream(text);
    final CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);

    try {
        while (stream.incrementToken()) {
            tokens.add(termAttr.toString());
        }
    } catch (IOException ioe) {
        LOGGER.error("Unable to analyze text. Cause : " + ioe.getMessage(), ioe);
    } finally {
        try {
            stream.end();
            stream.close();
        } catch (IOException e) {
            // Nothing sensible to do beyond reporting the failure.
            LOGGER.error("Unable to close token stream : " + e.getMessage());
        }
    }

    return tokens;
}

From source file:com.github.riccardove.easyjasub.lucene.LuceneParser.java

License:Apache License

/**
 * Pre-registers on {@code tokenStream} every attribute this parser later reads,
 * so the attribute instances exist before tokenization starts.
 * The Kuromoji-specific attributes (reading, part-of-speech, inflection, base
 * form) suggest a Japanese analysis chain — TODO confirm against the analyzer
 * configured elsewhere in this class.
 *
 * @param tokenStream the stream to register attributes on
 */
private void addAttributes(TokenStream tokenStream) {
    tokenStream.addAttribute(OffsetAttribute.class);
    tokenStream.addAttribute(ReadingAttribute.class);
    tokenStream.addAttribute(PartOfSpeechAttribute.class);
    tokenStream.addAttribute(InflectionAttribute.class);
    tokenStream.addAttribute(BaseFormAttribute.class);
}

From source file:com.globalsight.ling.tm2.lucene.LuceneUtil.java

License:Apache License

/**
 * Advances {@code input} by one token and converts the resulting attribute
 * state into a classic Lucene {@code Token}, or returns null when the stream
 * is exhausted or carries no usable attributes.
 *
 * Resolution order: a token carried by a custom {@code GSAttributeImpl} wins;
 * otherwise a new Token is built from the char-term text (with offsets
 * 0..length, i.e. offsets relative to the term itself, not the original input).
 *
 * @param input the token stream to advance; caller is responsible for reset/close
 * @return the next token, or null at end of stream or when no attribute yields one
 * @throws IOException if the underlying stream fails
 */
public static org.apache.lucene.analysis.Token getNextToken(TokenStream input) throws IOException {
    org.apache.lucene.analysis.Token token = null;
    if (input.incrementToken()) {
        // addAttribute returns the stream's existing CharTermAttribute instance
        // (or registers one) — it does not consume or alter the current token.
        CharTermAttribute ccc = input.addAttribute(CharTermAttribute.class);
        Iterator<AttributeImpl> attIt = input.getAttributeImplsIterator();

        if (attIt == null || !attIt.hasNext()) {
            return null;
        }

        // NOTE(review): only the first attribute impl is inspected — presumably
        // GSAttributeImpl is always registered first; verify against the analyzer setup.
        AttributeImpl att = attIt.next();
        if (att instanceof GSAttributeImpl) {
            token = ((GSAttributeImpl) att).getToken();
        }

        // Fallback: synthesize a Token from the term text when the custom
        // attribute produced nothing and the term is non-empty.
        if (token == null && ccc != null && ccc.length() > 0) {
            String ttt = ccc.toString();
            token = new org.apache.lucene.analysis.Token(ttt, 0, ttt.length());
        }
    }

    return token;
}

From source file:com.ibm.watson.developer_cloud.professor_languo.primary_search.SpanBigramQueryGeneratorTest.java

License:Open Source License

/**
 * Checks that {@code spanBigramQuery} contains exactly the span-bigram and
 * unigram clauses expected for the stemmed form of {@code question}:
 * each SpanNearQuery clause must pair consecutive stemmed terms, and the
 * unigram TermQuery clauses must cover the same term set as the stemmed question.
 *
 * @throws IOException if tokenizing the question fails
 */
private void test_that_generated_bigram_query_match_the_referenced_query() throws IOException {
    // Get stemmed question
    SingletonAnalyzer.generateAnalyzer(PrimarySearchConstants.ENGLISH_ANALYZER);
    EnglishAnalyzer ea = (EnglishAnalyzer) SingletonAnalyzer.getAnalyzer();
    TokenStream ts = ea.tokenStream("field", question);
    CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    List<String> stemmedQuestion = new ArrayList<String>();
    while (ts.incrementToken())
        stemmedQuestion.add(charTermAttribute.toString());

    // get query terms
    BooleanClause[] clauses = ((BooleanQuery) spanBigramQuery).getClauses();
    SpanQuery[] queries;
    String term1, term2;
    List<String> unigrams = new ArrayList<String>();
    // For n stemmed terms the generator emits n unigrams + (n-1) bigrams per
    // field, i.e. (2n-1) clauses per field — hence this division recovers the
    // number of fields. Assumes clauses.length is an exact multiple; TODO confirm.
    int numFields = clauses.length / (2 * stemmedQuestion.size() - 1);

    // test bigrams
    int bigramidx = 0;
    for (int idx = 0; idx < clauses.length; idx++) {
        Query q = clauses[idx].getQuery();
        if (q instanceof SpanNearQuery) {
            queries = ((SpanNearQuery) clauses[idx].getQuery()).getClauses();
            // Bigrams are assumed grouped by term pair across fields, so the
            // running bigram count divided by the field count gives the index
            // of the pair's first term in the stemmed question.
            int termidx = bigramidx / numFields;
            term1 = ((SpanTermQuery) queries[0]).getTerm().text();
            term2 = ((SpanTermQuery) queries[1]).getTerm().text();
            assertEquals("Extracted first term doesn't match the stemmed term", stemmedQuestion.get(termidx),
                    term1);
            assertEquals("Extracted second term doesn't match the stemmed term",
                    stemmedQuestion.get(termidx + 1), term2);
            bigramidx++;
        } else if (q instanceof TermQuery) {
            unigrams.add(((TermQuery) clauses[idx].getQuery()).getTerm().text());
        } else {
            // Any clause type other than SpanNearQuery/TermQuery is unexpected.
            assertTrue("Unknown type of query found!", false);
        }
    }

    // test unigrams: set equality in both directions (duplicates not checked).
    for (String s : unigrams)
        assertTrue(stemmedQuestion.contains(s));
    for (String s : stemmedQuestion)
        assertTrue(unigrams.contains(s));
}