Usage examples for org.apache.lucene.analysis.TokenStream#reset()
public void reset() throws IOException
From source file:com.globalsight.ling.lucene.TbTextIndex.java
License:Apache License
protected Query getQuery(String p_text) throws IOException { PhraseQuery result = new PhraseQuery(); TokenStream tokens = m_analyzer.tokenStream(IndexDocument.TEXT, new StringReader(p_text)); tokens.reset(); Token t;// w w w.ja va2 s. c o m while ((t = LuceneUtil.getNextToken(tokens)) != null) { result.add(new Term(IndexDocument.TEXT, t.toString())); } return result; }
From source file:com.globalsight.ling.lucene.TmFuzzyIndex.java
License:Apache License
protected Query getQuery(String p_text) throws IOException { BooleanQuery result = new BooleanQuery(); TokenStream tokens = m_analyzer.tokenStream(IndexDocument.TEXT, new StringReader(p_text)); tokens.reset(); Token t;//from w w w . j ava2s . c o m while ((t = LuceneUtil.getNextToken(tokens)) != null) { result.add(new BooleanClause(new TermQuery(new Term(IndexDocument.TEXT, t.toString())), Occur.SHOULD)); } return result; }
From source file:com.globalsight.ling.tm2.lucene.LuceneUtil.java
License:Apache License
/** * Create GlobalSight TM tokens from a provided segment string * using GsAnalyzer./*from w w w . jav a2 s .co m*/ * * @param p_text fuzzy match format string * @return List of c.g.l.tm2.index.Tokens */ public static List<Token> createGsTokens(String p_text, GlobalSightLocale p_locale) throws Exception { GsAnalyzer analyzer = new GsAnalyzer(p_locale); TokenStream tokenStream = analyzer.tokenStream("blah", new StringReader(p_text)); tokenStream.reset(); //GSAttribute gsAtt = tokenStream.addAttribute(GSAttribute.class); //org.apache.lucene.analysis.Token luceneToken = null; List<String> tokens = new ArrayList<String>(); while (tokenStream.incrementToken()) { // luceneToken = gsAtt.getToken(); CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class); tokens.add(termAtt.toString()); } tokenStream.close(); return buildTokenList(tokens); }
From source file:com.globalsight.ling.tm2.lucene.LuceneUtil.java
License:Apache License
/** * Create GlobalSight TM tokens from a provided segment string * using GsAnalyzer. This method is suitable for use with TM3 * fuzzy indices, and does two things differently than createGsTokens(): * 1) It returns tokens in the order in which they appear * 2) It does not collapse duplicate tokens (and correspondingly does * not return count information)/*from w w w . j av a 2s . c o m*/ * * @param p_text fuzzy match format string * @return List of Strings, each representing one token */ public static List<String> createTm3Tokens(String p_text, GlobalSightLocale p_locale) throws Exception { GsAnalyzer analyzer = new GsAnalyzer(p_locale); TokenStream tokenStream = analyzer.tokenStream("blah", new StringReader(p_text)); tokenStream.reset(); List<String> tokens = new ArrayList<String>(); while (tokenStream.incrementToken()) { CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class); tokens.add(termAtt.toString()); } tokenStream.close(); return tokens; }
From source file:com.globalsight.ling.tm2.lucene.LuceneUtil.java
License:Apache License
@SuppressWarnings("resource") public static List<String> createTm3TokensNoStopWord(String p_text, GlobalSightLocale p_locale) throws Exception { GsAnalyzer analyzer = new GsAnalyzer(p_locale, false); TokenStream tokenStream = analyzer.tokenStream("blah", new StringReader(p_text)); tokenStream.reset(); List<String> tokens = new ArrayList<String>(); while (tokenStream.incrementToken()) { CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class); tokens.add(termAtt.toString());/* w w w .j a v a 2 s. c o m*/ } tokenStream.close(); return tokens; }
From source file:com.globalsight.ling.tm2.lucene.TuvDocument.java
License:Apache License
private int getTotalTokenCount(String text, Analyzer analyzer) throws Exception { TokenStream tokenStream = analyzer.tokenStream("blah", new StringReader(text)); tokenStream.reset(); int tokenCount = 0; while (tokenStream.incrementToken()) { tokenCount++;// w w w . ja v a 2 s . c o m } return tokenCount; }
From source file:com.ibm.watson.developer_cloud.professor_languo.primary_search.SpanBigramQueryGeneratorTest.java
License:Open Source License
private void test_that_generated_bigram_query_match_the_referenced_query() throws IOException { // Get stemmed question SingletonAnalyzer.generateAnalyzer(PrimarySearchConstants.ENGLISH_ANALYZER); EnglishAnalyzer ea = (EnglishAnalyzer) SingletonAnalyzer.getAnalyzer(); TokenStream ts = ea.tokenStream("field", question); CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class); ts.reset(); List<String> stemmedQuestion = new ArrayList<String>(); while (ts.incrementToken()) stemmedQuestion.add(charTermAttribute.toString()); // get query terms BooleanClause[] clauses = ((BooleanQuery) spanBigramQuery).getClauses(); SpanQuery[] queries;/*from ww w . j a va 2 s . c o m*/ String term1, term2; List<String> unigrams = new ArrayList<String>(); int numFields = clauses.length / (2 * stemmedQuestion.size() - 1); // test bigrams int bigramidx = 0; for (int idx = 0; idx < clauses.length; idx++) { Query q = clauses[idx].getQuery(); if (q instanceof SpanNearQuery) { queries = ((SpanNearQuery) clauses[idx].getQuery()).getClauses(); int termidx = bigramidx / numFields; term1 = ((SpanTermQuery) queries[0]).getTerm().text(); term2 = ((SpanTermQuery) queries[1]).getTerm().text(); assertEquals("Extracted first term doesn't match the stemmed term", stemmedQuestion.get(termidx), term1); assertEquals("Extracted second term doesn't match the stemmed term", stemmedQuestion.get(termidx + 1), term2); bigramidx++; } else if (q instanceof TermQuery) { unigrams.add(((TermQuery) clauses[idx].getQuery()).getTerm().text()); } else { assertTrue("Unknown type of query found!", false); } } // test unigrams for (String s : unigrams) assertTrue(stemmedQuestion.contains(s)); for (String s : stemmedQuestion) assertTrue(unigrams.contains(s)); }
From source file:com.ibm.watson.developer_cloud.professor_languo.primary_search.SpanTrigramQueryGeneratorTest.java
License:Open Source License
private void test_that_generated_trigram_query_match_the_referenced_query() throws IOException { Set<Term> queryTerms = new HashSet<Term>(); // Get stemmed question SingletonAnalyzer.generateAnalyzer(PrimarySearchConstants.ENGLISH_ANALYZER); EnglishAnalyzer ea = (EnglishAnalyzer) SingletonAnalyzer.getAnalyzer(); TokenStream ts = ea.tokenStream("field", question); CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class); ts.reset(); List<String> stemmedQuestion = new ArrayList<String>(); while (ts.incrementToken()) stemmedQuestion.add(charTermAttribute.toString()); // get query terms spanTrigramQuery.extractTerms(queryTerms); BooleanClause[] clauses = ((BooleanQuery) spanTrigramQuery).getClauses(); SpanQuery[] qs;//from w ww. jav a 2 s . com String term1, term2, term3; int numFields = clauses.length / (3 * stemmedQuestion.size() - 3); List<String> unigrams = new ArrayList<String>(); int idx = 0; // test trigrams int trigramidx = 0; for (idx = clauses.length - numFields * (stemmedQuestion.size() - 2); idx < clauses.length; idx++) { qs = ((SpanNearQuery) clauses[idx].getQuery()).getClauses(); int termidx = trigramidx / numFields; term1 = ((SpanTermQuery) qs[0]).getTerm().text(); term2 = ((SpanTermQuery) qs[1]).getTerm().text(); term3 = ((SpanTermQuery) qs[2]).getTerm().text(); assertEquals("Extracted first term in the trigram doesn't match the stemmed term", stemmedQuestion.get(termidx), term1); assertEquals("Extracted second term in the trigram doesn't match the stemmed term", stemmedQuestion.get(termidx + 1), term2); assertEquals("Extracted third term in the trigram doesn't match the stemmed term", stemmedQuestion.get(termidx + 2), term3); trigramidx++; } // test bigrams int bigramidx = 0; for (idx = 0; idx < (2 * stemmedQuestion.size() - 1) * numFields; idx++) { Query q = clauses[idx].getQuery(); if (q instanceof SpanNearQuery) { qs = ((SpanNearQuery) clauses[idx].getQuery()).getClauses(); int termidx = bigramidx / numFields; term1 = 
((SpanTermQuery) qs[0]).getTerm().text(); term2 = ((SpanTermQuery) qs[1]).getTerm().text(); assertEquals("Extracted first term in the bigram doesn't match the stemmed term", stemmedQuestion.get(termidx), term1); assertEquals("Extracted second term in the bigram doesn't match the stemmed term", stemmedQuestion.get(termidx + 1), term2); bigramidx++; } else if (q instanceof TermQuery) { unigrams.add(((TermQuery) clauses[idx].getQuery()).getTerm().text()); } else { assertTrue("Unknown type of query found!", false); } } // test unigrams for (String s : unigrams) assertTrue(stemmedQuestion.contains(s)); for (String s : stemmedQuestion) assertTrue(unigrams.contains(s)); }
From source file:com.isotrol.impe3.lucene.PortalSpanishAnalyzerTest.java
License:Open Source License
private void test(String name, Analyzer a, String text) throws IOException { final Reader r = new StringReader(text); final TokenStream s = a.tokenStream(null, r); List<String> list = Lists.newLinkedList(); s.reset(); while (s.incrementToken()) { if (s.hasAttribute(CharTermAttribute.class)) { list.add(s.getAttribute(CharTermAttribute.class).toString()); }// w ww. j a va2s . c om } System.out.printf("[%s] %s => %s\n", name, text, list); }
From source file:com.liferay.events.global.mobile.Utils.java
License:Open Source License
public static String removeStopWords(String words) throws IOException { if (Validator.isNull(EventContactServiceImpl.stopWords)) { EventContactServiceImpl.stopWords = new TreeSet<String>(); BufferedReader r = new BufferedReader(new InputStreamReader( EventContactService.class.getClassLoader().getResourceAsStream("stopwords/words.txt"))); String nextLine;/*from w ww . j av a2 s. c o m*/ while ((nextLine = r.readLine()) != null) { String word = nextLine.trim(); if (Validator.isNotNull(word)) { EventContactServiceImpl.stopWords.add(nextLine.trim()); } } r.close(); } // remove punctuation and stuff final CharArraySet stopSet = new CharArraySet(Version.LUCENE_35, EventContactServiceImpl.stopWords, true); TokenStream tokenStream = new StopFilter(Version.LUCENE_35, new StandardTokenizer(Version.LUCENE_35, new StringReader(words)), stopSet); StringBuilder sb = new StringBuilder(); CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class); tokenStream.reset(); while (tokenStream.incrementToken()) { String term = charTermAttribute.toString(); sb.append(term).append(" "); } return sb.toString(); }