Example usage for org.apache.lucene.analysis TokenStream incrementToken

List of usage examples for org.apache.lucene.analysis TokenStream incrementToken

Introduction

In this page you can find the example usage for org.apache.lucene.analysis TokenStream incrementToken.

Prototype

public abstract boolean incrementToken() throws IOException;

Source Link

Document

Consumers (i.e., IndexWriter) use this method to advance the stream to the next token.

Usage

From source file:com.globalsight.ling.tm2.lucene.LuceneUtil.java

License:Apache License

/**
 * Tokenizes {@code p_text} with a locale-specific {@code GsAnalyzer}
 * (stop-word filtering disabled) and returns the tokens in stream order.
 *
 * @param p_text   the text to tokenize
 * @param p_locale locale used to select the analysis chain
 * @return token strings in order of appearance (possibly empty, never null)
 * @throws Exception if tokenization fails
 */
@SuppressWarnings("resource")
public static List<String> createTm3TokensNoStopWord(String p_text, GlobalSightLocale p_locale)
        throws Exception {
    GsAnalyzer analyzer = new GsAnalyzer(p_locale, false);
    List<String> tokens = new ArrayList<String>();
    TokenStream tokenStream = analyzer.tokenStream("blah", new StringReader(p_text));
    try {
        // Look the attribute up once: it is the same instance for the life of
        // the stream, so there is no need to fetch it on every token.
        CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            tokens.add(termAtt.toString());
        }
        tokenStream.end();
    } finally {
        // Close even when incrementToken() throws, so the stream is not leaked.
        tokenStream.close();
    }
    return tokens;
}

From source file:com.globalsight.ling.tm2.lucene.TuvDocument.java

License:Apache License

/**
 * Counts how many tokens {@code analyzer} produces for {@code text}.
 *
 * @param text     text to analyze
 * @param analyzer analyzer providing the token stream
 * @return total number of tokens emitted
 * @throws Exception if analysis fails
 */
private int getTotalTokenCount(String text, Analyzer analyzer) throws Exception {
    TokenStream tokenStream = analyzer.tokenStream("blah", new StringReader(text));
    try {
        tokenStream.reset();
        int tokenCount = 0;
        while (tokenStream.incrementToken()) {
            tokenCount++;
        }
        tokenStream.end();
        return tokenCount;
    } finally {
        // Original leaked the stream; always close it, even on exception.
        tokenStream.close();
    }
}

From source file:com.grantingersoll.intell.index.BayesUpdateRequestProcessor.java

License:Apache License

/**
 * Tokenizes {@code input} with this processor's analyzer (using
 * {@code inputField} to select the per-field analysis chain) and returns
 * the tokens as an array.
 *
 * @param input raw field text
 * @return token strings in stream order
 * @throws IOException if the token stream fails
 */
public String[] tokenizeField(String input) throws IOException {
    ArrayList<String> tokenList = new ArrayList<String>(256);
    TokenStream ts = analyzer.tokenStream(inputField, new StringReader(input));
    try {
        // The attribute instance is stable across the stream; fetch it once.
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        // TokenStream contract: reset() must be called before incrementToken();
        // the original skipped it, which throws IllegalStateException on Lucene 4+.
        ts.reset();
        while (ts.incrementToken()) {
            tokenList.add(termAtt.toString());
        }
        ts.end();
    } finally {
        // Original never closed the stream; close it even on exception.
        ts.close();
    }
    return tokenList.toArray(new String[tokenList.size()]);
}

From source file:com.ibm.watson.developer_cloud.professor_languo.primary_search.SpanBigramQueryGeneratorTest.java

License:Open Source License

/**
 * Verifies that the clauses of {@code spanBigramQuery} line up with the
 * stemmed terms of {@code question}: each SpanNearQuery clause must pair
 * consecutive stemmed terms, and the TermQuery clauses (unigrams) must be
 * exactly the set of stemmed terms.
 *
 * @throws IOException if analysis of the question fails
 */
private void test_that_generated_bigram_query_match_the_referenced_query() throws IOException {
    // Stem the question with the same English analyzer used at index time.
    SingletonAnalyzer.generateAnalyzer(PrimarySearchConstants.ENGLISH_ANALYZER);
    EnglishAnalyzer ea = (EnglishAnalyzer) SingletonAnalyzer.getAnalyzer();
    TokenStream ts = ea.tokenStream("field", question);
    CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class);
    List<String> stemmedQuestion = new ArrayList<String>();
    try {
        ts.reset();
        while (ts.incrementToken())
            stemmedQuestion.add(charTermAttribute.toString());
        ts.end();
    } finally {
        // Original leaked the stream; release it even if the assertions below fail early.
        ts.close();
    }

    // get query terms
    BooleanClause[] clauses = ((BooleanQuery) spanBigramQuery).getClauses();
    SpanQuery[] queries;
    String term1, term2;
    List<String> unigrams = new ArrayList<String>();
    // For n stemmed terms the generator emits (2n - 1) clauses per field:
    // n unigrams + (n - 1) bigrams. Invert that to recover the field count.
    int numFields = clauses.length / (2 * stemmedQuestion.size() - 1);

    // test bigrams: each SpanNearQuery must pair stemmed term i with term i+1
    int bigramidx = 0;
    for (int idx = 0; idx < clauses.length; idx++) {
        Query q = clauses[idx].getQuery();
        if (q instanceof SpanNearQuery) {
            queries = ((SpanNearQuery) clauses[idx].getQuery()).getClauses();
            // Bigrams are interleaved across fields, so the term index advances
            // once per numFields bigram clauses.
            int termidx = bigramidx / numFields;
            term1 = ((SpanTermQuery) queries[0]).getTerm().text();
            term2 = ((SpanTermQuery) queries[1]).getTerm().text();
            assertEquals("Extracted first term doesn't match the stemmed term", stemmedQuestion.get(termidx),
                    term1);
            assertEquals("Extracted second term doesn't match the stemmed term",
                    stemmedQuestion.get(termidx + 1), term2);
            bigramidx++;
        } else if (q instanceof TermQuery) {
            unigrams.add(((TermQuery) clauses[idx].getQuery()).getTerm().text());
        } else {
            assertTrue("Unknown type of query found!", false);
        }
    }

    // test unigrams: collected unigrams and stemmed terms must match as sets
    for (String s : unigrams)
        assertTrue(stemmedQuestion.contains(s));
    for (String s : stemmedQuestion)
        assertTrue(unigrams.contains(s));
}

From source file:com.ibm.watson.developer_cloud.professor_languo.primary_search.SpanTrigramQueryGeneratorTest.java

License:Open Source License

/**
 * Verifies that {@code spanTrigramQuery} is built correctly from the stemmed
 * question: trailing SpanNearQuery clauses must cover consecutive term
 * triples, the leading clauses must cover consecutive term pairs, and the
 * TermQuery clauses (unigrams) must be exactly the set of stemmed terms.
 *
 * @throws IOException if analysis of the question fails
 */
private void test_that_generated_trigram_query_match_the_referenced_query() throws IOException {
    Set<Term> queryTerms = new HashSet<Term>();

    // Stem the question with the same English analyzer used at index time.
    SingletonAnalyzer.generateAnalyzer(PrimarySearchConstants.ENGLISH_ANALYZER);
    EnglishAnalyzer ea = (EnglishAnalyzer) SingletonAnalyzer.getAnalyzer();
    TokenStream ts = ea.tokenStream("field", question);
    CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class);
    List<String> stemmedQuestion = new ArrayList<String>();
    try {
        ts.reset();
        while (ts.incrementToken())
            stemmedQuestion.add(charTermAttribute.toString());
        ts.end();
    } finally {
        // Original leaked the stream; release it even if the assertions below fail early.
        ts.close();
    }

    // get query terms
    spanTrigramQuery.extractTerms(queryTerms);
    BooleanClause[] clauses = ((BooleanQuery) spanTrigramQuery).getClauses();
    SpanQuery[] qs;
    String term1, term2, term3;
    // For n stemmed terms the generator emits (3n - 3) clauses per field:
    // n unigrams + (n - 1) bigrams + (n - 2) trigrams. Invert to get field count.
    int numFields = clauses.length / (3 * stemmedQuestion.size() - 3);
    List<String> unigrams = new ArrayList<String>();
    int idx = 0;

    // test trigrams: the trailing numFields*(n-2) clauses are the trigram spans
    int trigramidx = 0;
    for (idx = clauses.length - numFields * (stemmedQuestion.size() - 2); idx < clauses.length; idx++) {
        qs = ((SpanNearQuery) clauses[idx].getQuery()).getClauses();
        // Trigrams are interleaved across fields, so the term index advances
        // once per numFields trigram clauses.
        int termidx = trigramidx / numFields;
        term1 = ((SpanTermQuery) qs[0]).getTerm().text();
        term2 = ((SpanTermQuery) qs[1]).getTerm().text();
        term3 = ((SpanTermQuery) qs[2]).getTerm().text();
        assertEquals("Extracted first term in the trigram doesn't match the stemmed term",
                stemmedQuestion.get(termidx), term1);
        assertEquals("Extracted second term in the trigram doesn't match the stemmed term",
                stemmedQuestion.get(termidx + 1), term2);
        assertEquals("Extracted third term in the trigram doesn't match the stemmed term",
                stemmedQuestion.get(termidx + 2), term3);
        trigramidx++;
    }

    // test bigrams: the leading (2n - 1)*numFields clauses hold bigrams + unigrams
    int bigramidx = 0;
    for (idx = 0; idx < (2 * stemmedQuestion.size() - 1) * numFields; idx++) {
        Query q = clauses[idx].getQuery();
        if (q instanceof SpanNearQuery) {
            qs = ((SpanNearQuery) clauses[idx].getQuery()).getClauses();
            int termidx = bigramidx / numFields;
            term1 = ((SpanTermQuery) qs[0]).getTerm().text();
            term2 = ((SpanTermQuery) qs[1]).getTerm().text();
            assertEquals("Extracted first term in the bigram doesn't match the stemmed term",
                    stemmedQuestion.get(termidx), term1);
            assertEquals("Extracted second term in the bigram doesn't match the stemmed term",
                    stemmedQuestion.get(termidx + 1), term2);
            bigramidx++;
        } else if (q instanceof TermQuery) {
            unigrams.add(((TermQuery) clauses[idx].getQuery()).getTerm().text());
        } else {
            assertTrue("Unknown type of query found!", false);
        }
    }

    // test unigrams: collected unigrams and stemmed terms must match as sets
    for (String s : unigrams)
        assertTrue(stemmedQuestion.contains(s));
    for (String s : stemmedQuestion)
        assertTrue(unigrams.contains(s));
}

From source file:com.ikon.analysis.AnalyzerDemo.java

License:Open Source License

/**
 * Analyzes {@code string} with the given analyzer and logs the resulting
 * tokens, one per bracketed group, prefixed by the analyzer's class name.
 *
 * @param string   text to analyze
 * @param analyzer analyzer whose token output is displayed
 * @throws IOException if the token stream fails
 */
private static void analyze(String string, Analyzer analyzer) throws IOException {
    // StringBuilder: this is method-local, so the synchronized StringBuffer
    // the original used buys nothing.
    StringBuilder buffer = new StringBuilder();
    TokenStream stream = analyzer.tokenStream("contents", new StringReader(string));
    try {
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        buffer.append(analyzer.getClass().getName());
        buffer.append(" -> ");
        // TokenStream contract: reset() must precede incrementToken(); the
        // original skipped it, which throws on Lucene 4+.
        stream.reset();
        while (stream.incrementToken()) {
            buffer.append(" [");
            buffer.append(term.toString());
            buffer.append("]");
        }
        stream.end();
    } finally {
        // Original never closed the stream; close it even on exception.
        stream.close();
    }

    String output = buffer.toString();
    log.info(output);
}

From source file:com.intel.hadoop.graphbuilder.demoapps.wikipedia.docwordgraph.WordCountGraphTokenizer.java

License:Open Source License

/**
 * Parses one wiki page XML document: extracts the page title (whitespace
 * replaced by underscores), the page id, and a per-token count of the
 * revision text, filtered by {@code dictionary} when one is set.
 *
 * Side effects: (re)populates the {@code counts} map and sets the
 * {@code title} and {@code id} fields. Parse errors are logged to stderr
 * and leave the fields partially populated.
 *
 * @param s the page XML as a string
 */
public void parse(String s) {
    DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
    factory.setNamespaceAware(true);
    // NOTE(review): DOCTYPEs/external entities are not disabled on this parser.
    // If the XML can come from an untrusted source this is open to XXE; consider
    // factory.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true).
    DocumentBuilder builder;
    counts = new HashMap<String, Integer>();
    try {
        builder = factory.newDocumentBuilder();
        Document doc = builder.parse(new InputSource(new StringReader(s)));
        XPathFactory xfactory = XPathFactory.newInstance();
        XPath xpath = xfactory.newXPath();
        title = xpath.evaluate("//page/title/text()", doc);
        // Normalize the title into a single token usable as a vertex name.
        title = title.replaceAll("\\s", "_");
        id = xpath.evaluate("//page/id/text()", doc);
        String text = xpath.evaluate("//page/revision/text/text()", doc);

        if (!text.isEmpty()) {
            Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
            TokenStream stream = analyzer.tokenStream(null, new StringReader(text));
            try {
                // The attribute instance is stable; fetch it once instead of per token.
                TermAttribute termAtt = stream.addAttribute(TermAttribute.class);
                while (stream.incrementToken()) {
                    String token = termAtt.term();

                    // Skip tokens outside the dictionary when one is configured.
                    if (dictionary != null && !dictionary.contains(token))
                        continue;

                    // Single lookup instead of containsKey + get.
                    Integer prev = counts.get(token);
                    counts.put(token, prev == null ? 1 : prev + 1);
                }
            } finally {
                // Original leaked the stream; close it even on exception.
                stream.close();
            }
        }
    } catch (ParserConfigurationException e) {
        e.printStackTrace();
    } catch (SAXException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    } catch (XPathExpressionException e) {
        e.printStackTrace();
    }
}

From source file:com.isotrol.impe3.lucene.PortalSpanishAnalyzerTest.java

License:Open Source License

/**
 * Analyzes {@code text} with the given analyzer and prints the token list
 * to stdout, labeled by {@code name}.
 *
 * @param name label used in the printed output
 * @param a    analyzer under test
 * @param text input text
 * @throws IOException if the token stream fails
 */
private void test(String name, Analyzer a, String text) throws IOException {
    final Reader r = new StringReader(text);
    final TokenStream s = a.tokenStream(null, r);
    List<String> list = Lists.newLinkedList();
    try {
        s.reset();
        // Whether the stream carries CharTermAttribute is fixed for its whole
        // life, so check once instead of once per token.
        if (s.hasAttribute(CharTermAttribute.class)) {
            CharTermAttribute term = s.getAttribute(CharTermAttribute.class);
            while (s.incrementToken()) {
                list.add(term.toString());
            }
        } else {
            while (s.incrementToken()) {
                // No term attribute: consume the stream but collect nothing,
                // matching the original's per-token hasAttribute guard.
            }
        }
        s.end();
    } finally {
        // Original never closed the stream; close it even on exception.
        s.close();
    }
    System.out.printf("[%s] %s => %s\n", name, text, list);
}

From source file:com.isotrol.impe3.lucene.PrefixAnalyzedQueryParser.java

License:Open Source License

/**
 * Builds a prefix query whose prefix has been run through the analyzer, so
 * that e.g. stemming/folding applied at index time also applies to the
 * prefix. Falls back to the raw {@code termStr} when analysis yields no
 * token or fails with an I/O error.
 *
 * @param field   field the prefix query targets
 * @param termStr raw (unanalyzed) prefix text
 * @return the prefix query for the analyzed (or raw) prefix
 * @throws ParseException propagated from the superclass
 */
@Override
protected org.apache.lucene.search.Query getPrefixQuery(String field, String termStr) throws ParseException {
    try {
        TokenStream ts = analyzer.tokenStream(field, new StringReader(termStr));
        try {
            // TokenStream contract: reset() before incrementToken(); the
            // original skipped it, which throws on Lucene 4+.
            ts.reset();
            if (ts.incrementToken() && ts.hasAttribute(CharTermAttribute.class)) {
                String term = ts.getAttribute(CharTermAttribute.class).toString();
                if (term != null) {
                    return super.getPrefixQuery(field, term);
                }
            }
        } finally {
            // Original leaked the stream; close it on every path.
            ts.close();
        }
    } catch (IOException e) {
        // Deliberate best-effort: if analysis fails we fall through and use
        // the raw term below rather than aborting the whole query parse.
    }
    return super.getPrefixQuery(field, termStr);
}

From source file:com.jaeksoft.searchlib.analysis.CompiledAnalyzer.java

License:Open Source License

/**
 * Runs {@code text} through the tokenizer alone (skipping the rest of the
 * analysis chain) and collects the produced tokens into {@code tokenTerms}.
 * The collection happens as a side effect of draining the stream through
 * {@code TokenTermPopulateFilter}; the tokens themselves are discarded.
 *
 * @param text       text to tokenize
 * @param tokenTerms output list the populate filter appends to
 * @throws IOException if the token stream fails
 */
public void justTokenize(String text, List<TokenTerm> tokenTerms) throws IOException {
    StringReader reader = new StringReader(text);
    TokenStream ts1 = tokenizer.create(reader);
    // Wrapping filter that records each consumed token into tokenTerms.
    TokenStream ts2 = new TokenTermPopulateFilter(tokenTerms, ts1);
    // NOTE(review): no reset() before incrementToken(); tolerated by Lucene 3.x
    // tokenizers but required by the 4+ contract — confirm the Lucene version.
    try {
        // Drain the stream purely for the populate side effect.
        while (ts2.incrementToken())
            ;
    } finally {
        // Close filter then tokenizer, even if iteration throws.
        IOUtils.close(ts2, ts1);
    }
}