Usage examples for org.apache.lucene.analysis.TokenStream#incrementToken
public abstract boolean incrementToken() throws IOException;
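The general contract: call reset() before the first incrementToken(), consume tokens while it returns true, then call end() and close(). Below is a minimal sketch of that loop against a recent Lucene API; the StandardAnalyzer, the "body" field name, and the sample text are illustrative assumptions, not taken from the examples that follow.

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class IncrementTokenSketch {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer(); // assumed analyzer, for illustration only
        try (TokenStream ts = analyzer.tokenStream("body", "some sample text")) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();                   // mandatory before the first incrementToken()
            while (ts.incrementToken()) { // returns false once the stream is exhausted
                System.out.println(term.toString());
            }
            ts.end();                     // records end-of-stream attribute state
        }                                 // try-with-resources calls close()
    }
}

Note that several of the examples below skip end(), and the oldest ones (written against Lucene 3.x) also skip reset(); recent Lucene versions throw an IllegalStateException if reset() is omitted.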
From source file:com.globalsight.ling.tm2.lucene.LuceneUtil.java
License:Apache License
@SuppressWarnings("resource") public static List<String> createTm3TokensNoStopWord(String p_text, GlobalSightLocale p_locale) throws Exception { GsAnalyzer analyzer = new GsAnalyzer(p_locale, false); TokenStream tokenStream = analyzer.tokenStream("blah", new StringReader(p_text)); tokenStream.reset();/*from ww w . j a va 2 s . co m*/ List<String> tokens = new ArrayList<String>(); while (tokenStream.incrementToken()) { CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class); tokens.add(termAtt.toString()); } tokenStream.close(); return tokens; }
From source file:com.globalsight.ling.tm2.lucene.TuvDocument.java
License:Apache License
private int getTotalTokenCount(String text, Analyzer analyzer) throws Exception {
    TokenStream tokenStream = analyzer.tokenStream("blah", new StringReader(text));
    tokenStream.reset();
    int tokenCount = 0;
    while (tokenStream.incrementToken()) {
        tokenCount++;
    }
    tokenStream.close(); // release the stream
    return tokenCount;
}
From source file:com.grantingersoll.intell.index.BayesUpdateRequestProcessor.java
License:Apache License
public String[] tokenizeField(String input) throws IOException {
    ArrayList<String> tokenList = new ArrayList<String>(256);
    TokenStream ts = analyzer.tokenStream(inputField, new StringReader(input));
    ts.reset(); // required before the first incrementToken() call
    while (ts.incrementToken()) {
        tokenList.add(ts.getAttribute(CharTermAttribute.class).toString());
    }
    ts.close();
    return tokenList.toArray(new String[tokenList.size()]);
}
From source file:com.ibm.watson.developer_cloud.professor_languo.primary_search.SpanBigramQueryGeneratorTest.java
License:Open Source License
private void test_that_generated_bigram_query_match_the_referenced_query() throws IOException {
    // Get stemmed question
    SingletonAnalyzer.generateAnalyzer(PrimarySearchConstants.ENGLISH_ANALYZER);
    EnglishAnalyzer ea = (EnglishAnalyzer) SingletonAnalyzer.getAnalyzer();
    TokenStream ts = ea.tokenStream("field", question);
    CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    List<String> stemmedQuestion = new ArrayList<String>();
    while (ts.incrementToken())
        stemmedQuestion.add(charTermAttribute.toString());

    // Get query terms
    BooleanClause[] clauses = ((BooleanQuery) spanBigramQuery).getClauses();
    SpanQuery[] queries;
    String term1, term2;
    List<String> unigrams = new ArrayList<String>();
    int numFields = clauses.length / (2 * stemmedQuestion.size() - 1);

    // Test bigrams
    int bigramidx = 0;
    for (int idx = 0; idx < clauses.length; idx++) {
        Query q = clauses[idx].getQuery();
        if (q instanceof SpanNearQuery) {
            queries = ((SpanNearQuery) clauses[idx].getQuery()).getClauses();
            int termidx = bigramidx / numFields;
            term1 = ((SpanTermQuery) queries[0]).getTerm().text();
            term2 = ((SpanTermQuery) queries[1]).getTerm().text();
            assertEquals("Extracted first term doesn't match the stemmed term",
                    stemmedQuestion.get(termidx), term1);
            assertEquals("Extracted second term doesn't match the stemmed term",
                    stemmedQuestion.get(termidx + 1), term2);
            bigramidx++;
        } else if (q instanceof TermQuery) {
            unigrams.add(((TermQuery) clauses[idx].getQuery()).getTerm().text());
        } else {
            assertTrue("Unknown type of query found!", false);
        }
    }

    // Test unigrams
    for (String s : unigrams)
        assertTrue(stemmedQuestion.contains(s));
    for (String s : stemmedQuestion)
        assertTrue(unigrams.contains(s));
}
From source file:com.ibm.watson.developer_cloud.professor_languo.primary_search.SpanTrigramQueryGeneratorTest.java
License:Open Source License
private void test_that_generated_trigram_query_match_the_referenced_query() throws IOException {
    Set<Term> queryTerms = new HashSet<Term>();

    // Get stemmed question
    SingletonAnalyzer.generateAnalyzer(PrimarySearchConstants.ENGLISH_ANALYZER);
    EnglishAnalyzer ea = (EnglishAnalyzer) SingletonAnalyzer.getAnalyzer();
    TokenStream ts = ea.tokenStream("field", question);
    CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    List<String> stemmedQuestion = new ArrayList<String>();
    while (ts.incrementToken())
        stemmedQuestion.add(charTermAttribute.toString());

    // Get query terms
    spanTrigramQuery.extractTerms(queryTerms);
    BooleanClause[] clauses = ((BooleanQuery) spanTrigramQuery).getClauses();
    SpanQuery[] qs;
    String term1, term2, term3;
    int numFields = clauses.length / (3 * stemmedQuestion.size() - 3);
    List<String> unigrams = new ArrayList<String>();
    int idx = 0;

    // Test trigrams
    int trigramidx = 0;
    for (idx = clauses.length - numFields * (stemmedQuestion.size() - 2); idx < clauses.length; idx++) {
        qs = ((SpanNearQuery) clauses[idx].getQuery()).getClauses();
        int termidx = trigramidx / numFields;
        term1 = ((SpanTermQuery) qs[0]).getTerm().text();
        term2 = ((SpanTermQuery) qs[1]).getTerm().text();
        term3 = ((SpanTermQuery) qs[2]).getTerm().text();
        assertEquals("Extracted first term in the trigram doesn't match the stemmed term",
                stemmedQuestion.get(termidx), term1);
        assertEquals("Extracted second term in the trigram doesn't match the stemmed term",
                stemmedQuestion.get(termidx + 1), term2);
        assertEquals("Extracted third term in the trigram doesn't match the stemmed term",
                stemmedQuestion.get(termidx + 2), term3);
        trigramidx++;
    }

    // Test bigrams
    int bigramidx = 0;
    for (idx = 0; idx < (2 * stemmedQuestion.size() - 1) * numFields; idx++) {
        Query q = clauses[idx].getQuery();
        if (q instanceof SpanNearQuery) {
            qs = ((SpanNearQuery) clauses[idx].getQuery()).getClauses();
            int termidx = bigramidx / numFields;
            term1 = ((SpanTermQuery) qs[0]).getTerm().text();
            term2 = ((SpanTermQuery) qs[1]).getTerm().text();
            assertEquals("Extracted first term in the bigram doesn't match the stemmed term",
                    stemmedQuestion.get(termidx), term1);
            assertEquals("Extracted second term in the bigram doesn't match the stemmed term",
                    stemmedQuestion.get(termidx + 1), term2);
            bigramidx++;
        } else if (q instanceof TermQuery) {
            unigrams.add(((TermQuery) clauses[idx].getQuery()).getTerm().text());
        } else {
            assertTrue("Unknown type of query found!", false);
        }
    }

    // Test unigrams
    for (String s : unigrams)
        assertTrue(stemmedQuestion.contains(s));
    for (String s : stemmedQuestion)
        assertTrue(unigrams.contains(s));
}
From source file:com.ikon.analysis.AnalyzerDemo.java
License:Open Source License
/**
 * Analyze and display tokens.
 */
private static void analyze(String string, Analyzer analyzer) throws IOException {
    StringBuffer buffer = new StringBuffer();
    TokenStream stream = analyzer.tokenStream("contents", new StringReader(string));
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    stream.reset(); // required before the first incrementToken() call
    buffer.append(analyzer.getClass().getName());
    buffer.append(" -> ");
    while (stream.incrementToken()) {
        buffer.append(" [");
        buffer.append(term.toString());
        buffer.append("]");
    }
    String output = buffer.toString();
    log.info(output);
}
From source file:com.intel.hadoop.graphbuilder.demoapps.wikipedia.docwordgraph.WordCountGraphTokenizer.java
License:Open Source License
public void parse(String s) {
    DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
    factory.setNamespaceAware(true);
    DocumentBuilder builder;
    counts = new HashMap<String, Integer>();
    try {
        builder = factory.newDocumentBuilder();
        Document doc = builder.parse(new InputSource(new StringReader(s)));
        XPathFactory xfactory = XPathFactory.newInstance();
        XPath xpath = xfactory.newXPath();
        title = xpath.evaluate("//page/title/text()", doc);
        title = title.replaceAll("\\s", "_");
        // title = title.replaceAll("^[^a-zA-Z0-9]", "#");
        // title = title.replaceAll("[^a-zA-Z0-9.]", "_");
        id = xpath.evaluate("//page/id/text()", doc);
        String text = xpath.evaluate("//page/revision/text/text()", doc);
        if (!text.isEmpty()) {
            Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
            TokenStream stream = analyzer.tokenStream(null, new StringReader(text));
            while (stream.incrementToken()) {
                String token = stream.getAttribute(TermAttribute.class).term();
                if (dictionary != null && !dictionary.contains(token))
                    continue;
                if (counts.containsKey(token))
                    counts.put(token, counts.get(token) + 1);
                else
                    counts.put(token, 1);
            }
        }
    } catch (ParserConfigurationException e) {
        e.printStackTrace();
    } catch (SAXException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    } catch (XPathExpressionException e) {
        e.printStackTrace();
    }
}
From source file:com.isotrol.impe3.lucene.PortalSpanishAnalyzerTest.java
License:Open Source License
private void test(String name, Analyzer a, String text) throws IOException {
    final Reader r = new StringReader(text);
    final TokenStream s = a.tokenStream(null, r);
    List<String> list = Lists.newLinkedList();
    s.reset();
    while (s.incrementToken()) {
        if (s.hasAttribute(CharTermAttribute.class)) {
            list.add(s.getAttribute(CharTermAttribute.class).toString());
        }
    }
    System.out.printf("[%s] %s => %s\n", name, text, list);
}
From source file:com.isotrol.impe3.lucene.PrefixAnalyzedQueryParser.java
License:Open Source License
@Override
protected org.apache.lucene.search.Query getPrefixQuery(String field, String termStr)
        throws ParseException {
    try {
        TokenStream ts = analyzer.tokenStream(field, new StringReader(termStr));
        if (ts.incrementToken() && ts.hasAttribute(CharTermAttribute.class)) {
            String term = ts.getAttribute(CharTermAttribute.class).toString();
            if (term != null) {
                return super.getPrefixQuery(field, term);
            }
        }
    } catch (IOException e) {
        // ignore and fall back to the unanalyzed term below
    }
    return super.getPrefixQuery(field, termStr);
}
From source file:com.jaeksoft.searchlib.analysis.CompiledAnalyzer.java
License:Open Source License
public void justTokenize(String text, List<TokenTerm> tokenTerms) throws IOException {
    StringReader reader = new StringReader(text);
    TokenStream ts1 = tokenizer.create(reader);
    TokenStream ts2 = new TokenTermPopulateFilter(tokenTerms, ts1);
    try {
        while (ts2.incrementToken())
            ;
    } finally {
        IOUtils.close(ts2, ts1);
    }
}