Example usage for org.apache.lucene.analysis TokenStream addAttribute

Introduction

On this page you can find example usages of org.apache.lucene.analysis.TokenStream.addAttribute.

Prototype

public final <T extends Attribute> T addAttribute(Class<T> attClass) 

Document

The caller must pass in a Class<? extends Attribute> value. The method first checks whether an instance of that attribute class already exists in the stream's attribute source and, if so, returns it; otherwise a new instance is created, registered with the stream, and returned.
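
Before the collected examples, here is a minimal self-contained sketch of the usual pattern. The field name, sample text, and choice of StandardAnalyzer are illustrative; it assumes a recent Lucene release where reset() must be called before incrementToken() and StandardAnalyzer has a no-argument constructor:

import java.io.IOException;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class AddAttributeSketch {
    public static void main(String[] args) throws IOException {
        StandardAnalyzer analyzer = new StandardAnalyzer();
        TokenStream ts = analyzer.tokenStream("body", "a quick example of token streams");
        // addAttribute registers the attribute up front; the same instance is
        // updated in place on every successful incrementToken() call
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println(term.toString());
        }
        ts.end();   // finalize end-of-stream state (e.g. final offset)
        ts.close(); // release resources held by the stream
        analyzer.close();
    }
}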

Usage

From source file: com.ibm.watson.developer_cloud.professor_languo.primary_search.SpanTrigramQueryGeneratorTest.java

License: Open Source License

private void test_that_generated_trigram_query_match_the_referenced_query() throws IOException {
    Set<Term> queryTerms = new HashSet<Term>();

    // Get stemmed question
    SingletonAnalyzer.generateAnalyzer(PrimarySearchConstants.ENGLISH_ANALYZER);
    EnglishAnalyzer ea = (EnglishAnalyzer) SingletonAnalyzer.getAnalyzer();
    TokenStream ts = ea.tokenStream("field", question);
    CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    List<String> stemmedQuestion = new ArrayList<String>();
    while (ts.incrementToken())
        stemmedQuestion.add(charTermAttribute.toString());
    ts.end();
    ts.close();

    // get query terms
    spanTrigramQuery.extractTerms(queryTerms);
    BooleanClause[] clauses = ((BooleanQuery) spanTrigramQuery).getClauses();
    SpanQuery[] qs;
    String term1, term2, term3;
    int numFields = clauses.length / (3 * stemmedQuestion.size() - 3);
    List<String> unigrams = new ArrayList<String>();
    int idx = 0;

    // test trigrams
    int trigramidx = 0;
    for (idx = clauses.length - numFields * (stemmedQuestion.size() - 2); idx < clauses.length; idx++) {
        qs = ((SpanNearQuery) clauses[idx].getQuery()).getClauses();
        int termidx = trigramidx / numFields;
        term1 = ((SpanTermQuery) qs[0]).getTerm().text();
        term2 = ((SpanTermQuery) qs[1]).getTerm().text();
        term3 = ((SpanTermQuery) qs[2]).getTerm().text();
        assertEquals("Extracted first term in the trigram doesn't match the stemmed term",
                stemmedQuestion.get(termidx), term1);
        assertEquals("Extracted second term in the trigram doesn't match the stemmed term",
                stemmedQuestion.get(termidx + 1), term2);
        assertEquals("Extracted third term in the trigram doesn't match the stemmed term",
                stemmedQuestion.get(termidx + 2), term3);
        trigramidx++;
    }

    // test bigrams
    int bigramidx = 0;
    for (idx = 0; idx < (2 * stemmedQuestion.size() - 1) * numFields; idx++) {
        Query q = clauses[idx].getQuery();
        if (q instanceof SpanNearQuery) {
            qs = ((SpanNearQuery) clauses[idx].getQuery()).getClauses();
            int termidx = bigramidx / numFields;
            term1 = ((SpanTermQuery) qs[0]).getTerm().text();
            term2 = ((SpanTermQuery) qs[1]).getTerm().text();
            assertEquals("Extracted first term in the bigram doesn't match the stemmed term",
                    stemmedQuestion.get(termidx), term1);
            assertEquals("Extracted second term in the bigram doesn't match the stemmed term",
                    stemmedQuestion.get(termidx + 1), term2);
            bigramidx++;
        } else if (q instanceof TermQuery) {
            unigrams.add(((TermQuery) clauses[idx].getQuery()).getTerm().text());
        } else {
            assertTrue("Unknown type of query found!", false);
        }
    }

    // test unigrams
    for (String s : unigrams)
        assertTrue(stemmedQuestion.contains(s));
    for (String s : stemmedQuestion)
        assertTrue(unigrams.contains(s));
}
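
In this test, addAttribute(CharTermAttribute.class) is used only to recover the stemmed form of each question term; the resulting stemmedQuestion list then serves as the reference against which the trigram, bigram, and unigram clauses extracted from the span query are checked.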

From source file: com.ikon.analysis.AnalyzerDemo.java

License: Open Source License

/**
 * Analyze and display tokens
 */
private static void analyze(String string, Analyzer analyzer) throws IOException {
    StringBuffer buffer = new StringBuffer();
    TokenStream stream = analyzer.tokenStream("contents", new StringReader(string));
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    buffer.append(analyzer.getClass().getName());
    buffer.append(" -> ");

    stream.reset(); // required before incrementToken() on recent Lucene versions
    while (stream.incrementToken()) {
        buffer.append(" [");
        buffer.append(term.toString());
        buffer.append("]");
    }
    stream.end();
    stream.close();

    String output = buffer.toString();
    log.info(output);
}

From source file: com.jamespot.glifpix.index.ResourceDocument.java

License: Open Source License

private void addLiteralField(String literal) throws IOException {
    _luceneDocument
            .add(new Field("literal", replaceUnicodeStr(literal), Store.YES, Index.NOT_ANALYZED_NO_NORMS));

    String coolLiteral = literal.replaceAll("\\\"", "");
    coolLiteral = replaceUnicodeStr(coolLiteral);

    Analyzer resAnalyzer = new ContentAnalyzer();
    TokenStream ts = resAnalyzer.tokenStream("dummyField", new StringReader(coolLiteral));

    TermAttribute termAttribute = ts.addAttribute(TermAttribute.class);

    int length = 0;
    StringBuffer sb = new StringBuffer();
    while (ts.incrementToken()) {
        sb.append("_" + termAttribute.term());
        length++;
    }
    sb.insert(0, length);
    _resourceLength = length;
    ts.end();
    ts.close();

    String finalToken = sb.toString();
    _luceneDocument.add(new Field("token", finalToken, Store.YES, Index.NOT_ANALYZED_NO_NORMS));
    _luceneDocument.add(new Field("crc", Utils.getCRC(finalToken), Store.YES, Index.NOT_ANALYZED_NO_NORMS));
}
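
Note that TermAttribute and its term() method belong to the Lucene 3.x API: they were deprecated in favor of CharTermAttribute and removed in Lucene 4.0. Against Lucene 4.x or later, the attribute handling above would read roughly as follows, and the same substitution applies to the remaining TermAttribute examples on this page:

    CharTermAttribute termAttribute = ts.addAttribute(CharTermAttribute.class);
    // ... then, inside the consumption loop:
    sb.append("_" + termAttribute.toString());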

From source file: com.jamespot.glifpix.library.TagsExtractorImpl.java

License: Open Source License

public Map<String, Integer> getTagsFreq(String content, String lng) {

    Map<String, Integer> items = new HashMap<String, Integer>();
    TokensArray tokArray = new TokensArray(_MaxExpressionLength);

    TokenStream ts = _contentAnalyzer.tokenStream("dummyField", new StringReader(content));
    TermAttribute termAttribute = ts.addAttribute(TermAttribute.class);

    try {
        while (ts.incrementToken()) {
            tokArray.pushString(termAttribute.term());
            Map<String, Integer> tagCandidates = tokArray.check(_resStores.get(lng).getCRCs(),
                    _lngStopTags.get(lng));

            if (tagCandidates.size() > 0) {
                for (Map.Entry<String, Integer> s : tagCandidates.entrySet()) {
                    String tag = _resStores.get(lng).getTag(s.getKey());
                    if (tag != null && tag.length() >= _MinWordLength) {
                        if (items.containsKey(tag)) {
                            items.put(tag, items.get(tag) + s.getValue());
                        } else {
                            items.put(tag, s.getValue());
                        }
                    }
                }
            }
        }
        ts.end();
        ts.close();

    } catch (IOException e) {
        logger.error(e);
    }

    return items;
}

From source file: com.jamespot.glifpix.library.TagsExtractorImpl.java

License: Open Source License

public Map<String, Float> getWeightedTagsFreq(String content, String lng) {

    Map<String, Float> items = new HashMap<String, Float>();
    TokensArray tokArray = new TokensArray(_MaxExpressionLength);

    TokenStream ts = _contentAnalyzer.tokenStream("dummyField", new StringReader(content));
    TermAttribute termAttribute = ts.addAttribute(TermAttribute.class);

    try {
        while (ts.incrementToken()) {
            tokArray.pushString(termAttribute.term());
            Map<String, Integer> tagCandidates = tokArray.check(_resStores.get(lng).getCRCs(),
                    _lngStopTags.get(lng));

            if (tagCandidates.size() > 0) {
                for (Map.Entry<String, Integer> s : tagCandidates.entrySet()) {
                    String tag = _resStores.get(lng).getTag(s.getKey());
                    if (tag != null && tag.length() >= _MinWordLength) {
                        if (items.containsKey(tag)) {
                            items.put(tag, items.get(tag)
                                    + (s.getValue().floatValue()) * getTagWeight(s.getKey(), lng));
                        } else {
                            items.put(tag, (s.getValue().floatValue()) * getTagWeight(s.getKey(), lng));
                        }
                    }
                }
            }
        }
        ts.end();
        ts.close();

    } catch (IOException e) {
        logger.error(e);
    }

    return items;
}
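
getWeightedTagsFreq differs from getTagsFreq above only in the accumulation step: each candidate's count is scaled by getTagWeight(s.getKey(), lng) before being added to the running total for its tag.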

From source file: com.jamespot.glifpix.library.TagsExtractorImpl.java

License: Open Source License

public Set<String> getTokens(String content, String lng) {

    Set<String> tokens = new HashSet<String>();
    TokensArray tokArray = new TokensArray(15);

    TokenStream ts = _contentAnalyzer.tokenStream("dummyField", new StringReader(content));
    TermAttribute termAttribute = ts.addAttribute(TermAttribute.class);

    try {
        while (ts.incrementToken()) {
            tokArray.pushString(termAttribute.term());
            Map<String, Integer> tagCandidates = tokArray.check(_resStores.get(lng).getCRCs(),
                    _lngStopTags.get(lng));

            if (tagCandidates.size() > 0) {
                for (Map.Entry<String, Integer> s : tagCandidates.entrySet()) {
                    tokens.add(s.getKey());
                }
            }
        }
        ts.end();
        ts.close();

    } catch (IOException e) {
        logger.error(e);
    }

    return tokens;
}

From source file: com.leavesfly.lia.advsearching.SpanQueryTest.java

License: Apache License

private void dumpSpans(SpanQuery query) throws IOException {
    Spans spans = query.getSpans(reader);
    System.out.println(query + ":");
    int numSpans = 0;

    TopDocs hits = searcher.search(query, 10);
    float[] scores = new float[2];
    for (ScoreDoc sd : hits.scoreDocs) {
        scores[sd.doc] = sd.score;
    }

    while (spans.next()) { // step through each span the query matches
        numSpans++;

        int id = spans.doc();
        Document doc = reader.document(id); // load the document containing the span

        // re-analyze the stored text so token positions can be walked
        TokenStream stream = analyzer.tokenStream("contents",
                new StringReader(doc.get("f")));
        TermAttribute term = stream.addAttribute(TermAttribute.class);

        StringBuilder buffer = new StringBuilder();
        buffer.append("   ");
        int i = 0;
        while (stream.incrementToken()) { // walk the tokens, marking the span
            if (i == spans.start()) {
                buffer.append("<"); // opening bracket at the span's first token
            }
            buffer.append(term.term());
            if (i + 1 == spans.end()) {
                buffer.append(">"); // closing bracket after the span's last token
            }
            buffer.append(" ");
            i++;
        }
        buffer.append("(").append(scores[id]).append(") ");
        System.out.println(buffer);
    }

    if (numSpans == 0) {
        System.out.println("   No spans");
    }
    System.out.println();
}
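
Because spans.start() and spans.end() are token positions rather than character offsets, dumpSpans re-analyzes each matching document's stored text with the same analyzer so that the span can be marked inline with < and > while walking the tokens.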

From source file: com.leavesfly.lia.analysis.AnalyzerUtils.java

License: Apache License

public static void displayTokens(TokenStream stream) throws IOException {

    TermAttribute term = stream.addAttribute(TermAttribute.class);
    while (stream.incrementToken()) {
        System.out.print("[" + term.term() + "] "); // B
    }/* ww w.  j ava 2  s . c o  m*/
}
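
A caller would hand this helper any token stream; for example (the analyzer choice and sample text are illustrative, using the same Lucene 3.x API as the rest of this class):

    displayTokens(new SimpleAnalyzer().tokenStream("contents", new StringReader("The quick brown fox")));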

From source file: com.leavesfly.lia.analysis.AnalyzerUtils.java

License: Apache License

public static void displayTokensWithFullDetails(Analyzer analyzer, String text) throws IOException {

    TokenStream stream = analyzer.tokenStream("contents", // analyze the given text
            new StringReader(text));

    // register every attribute of interest before consuming the stream
    TermAttribute term = stream.addAttribute(TermAttribute.class);
    PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
    OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
    TypeAttribute type = stream.addAttribute(TypeAttribute.class);

    int position = 0;
    while (stream.incrementToken()) {

        // derive the token's absolute position from its position increment
        int increment = posIncr.getPositionIncrement();
        if (increment > 0) {
            position = position + increment;
            System.out.println();
            System.out.print(position + ": ");
        }

        // print term text, character offsets, and token type
        System.out.print("[" + term.term() + ":" + offset.startOffset() + "->"
                + offset.endOffset() + ":" + type.type() + "] ");
    }
    System.out.println();
}
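
For example, analyzing the text "the quick brown fox" with a lowercasing analyzer such as SimpleAnalyzer would print output along the lines of:

    1: [the:0->3:word] 
    2: [quick:4->9:word] 
    3: [brown:10->15:word] 
    4: [fox:16->19:word] 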

From source file: com.leavesfly.lia.analysis.Fragments.java

License: Apache License

public void frag3() throws Exception {
    Analyzer analyzer = null; // placeholder; a real analyzer is assumed
    String text = null; // placeholder; real input text is assumed
    // START
    TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));
    PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
    while (stream.incrementToken()) {
        System.out.println("posIncr=" + posIncr.getPositionIncrement());
    }
    // END
}
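
Because analyzer and text are placeholders, this fragment is not runnable as-is; it simply illustrates the consumption pattern: register PositionIncrementAttribute once via addAttribute, then read the increment from that same instance on each successful incrementToken() call.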