Example usage for org.apache.lucene.analysis TokenStream reset

Introduction

On this page you can find example usages of reset() from the org.apache.lucene.analysis.TokenStream class.

Prototype

public void reset() throws IOException 

Document

This method is called by a consumer before it begins consumption using incrementToken().
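
For context, a typical consumer follows the reset() → incrementToken() → end() → close() lifecycle. The sketch below illustrates that workflow; it assumes Lucene 4.x on the classpath (matching the Version.LUCENE_46 used in the examples that follow), and the field name and input text are placeholders.

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class TokenStreamResetExample {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_46);
        TokenStream ts = analyzer.tokenStream("body", new StringReader("hello token stream reset"));
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        try {
            ts.reset();                    // must be called before the first incrementToken()
            while (ts.incrementToken()) {  // advance the stream one token at a time
                System.out.println(termAtt.toString());
            }
            ts.end();                      // perform end-of-stream operations (e.g. final offset)
        } finally {
            ts.close();                    // release resources held by the stream
            analyzer.close();
        }
    }
}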

Usage

From source file:org.apache.mahout.utils.nlp.collocations.llr.BloomTokenFilterTest.java

License:Apache License

/** normal case, unfiltered analyzer */
@Test
public void testAnalyzer() throws IOException {
    Reader reader = new StringReader(input);
    Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_46);
    TokenStream ts = analyzer.tokenStream(null, reader);
    ts.reset();
    validateTokens(allTokens, ts);
    ts.end();
    ts.close();
}

From source file:org.apache.mahout.utils.nlp.collocations.llr.BloomTokenFilterTest.java

License:Apache License

/** filtered analyzer */
@Test
public void testNonKeepdAnalyzer() throws IOException {
    Reader reader = new StringReader(input);
    Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_46);
    TokenStream ts = analyzer.tokenStream(null, reader);
    ts.reset();
    TokenStream f = new BloomTokenFilter(getFilter(filterTokens), false /* toss matching tokens */, ts);
    validateTokens(expectedNonKeepTokens, f);
    ts.end();
    ts.close();
}

From source file:org.apache.mahout.utils.nlp.collocations.llr.BloomTokenFilterTest.java

License:Apache License

/** keep analyzer */
@Test
public void testKeepAnalyzer() throws IOException {
    Reader reader = new StringReader(input);
    Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_46);
    TokenStream ts = analyzer.tokenStream(null, reader);
    ts.reset();
    TokenStream f = new BloomTokenFilter(getFilter(filterTokens), true /* keep matching tokens */, ts);
    validateTokens(expectedKeepTokens, f);
    ts.end();
    ts.close();
}

From source file:org.apache.mahout.utils.nlp.collocations.llr.BloomTokenFilterTest.java

License:Apache License

/** shingles, keep those matching whitelist */
@Test
public void testShingleFilteredAnalyzer() throws IOException {
    Reader reader = new StringReader(input);
    Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_46);
    TokenStream ts = analyzer.tokenStream(null, reader);
    ts.reset();
    ShingleFilter sf = new ShingleFilter(ts, 3);
    TokenStream f = new BloomTokenFilter(getFilter(shingleKeepTokens), true, sf);
    validateTokens(expectedShingleTokens, f);
    ts.end();
    ts.close();
}

From source file:org.apache.mahout.utils.regex.AnalyzerTransformer.java

License:Apache License

@Override
public String transformMatch(String match) {
    StringBuilder result = new StringBuilder();
    TokenStream ts = null;
    try {
        ts = analyzer.tokenStream(fieldName, new StringReader(match));
        ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        TokenStreamIterator iter = new TokenStreamIterator(ts);
        while (iter.hasNext()) {
            result.append(iter.next()).append(' ');
        }
        ts.end();
    } catch (IOException e) {
        throw new IllegalStateException(e);
    } finally {
        try {
            Closeables.close(ts, true);
        } catch (IOException e) {
            log.error(e.getMessage(), e);
        }
    }
    return result.toString();
}

From source file:org.apache.mahout.vectorizer.document.SequenceFileTokenizerMapper.java

License:Apache License

@Override
protected void map(Text key, Text value, Context context) throws IOException, InterruptedException {
    TokenStream stream = analyzer.tokenStream(key.toString(), new StringReader(value.toString()));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    StringTuple document = new StringTuple();
    while (stream.incrementToken()) {
        if (termAtt.length() > 0) {
            document.add(new String(termAtt.buffer(), 0, termAtt.length()));
        }
    }
    stream.end();
    Closeables.close(stream, true);
    context.write(key, document);
}

From source file:org.apache.maven.index.DefaultQueryCreator.java

License:Apache License

protected int countTerms(final IndexerField indexerField, final String query) {
    try {
        TokenStream ts = nexusAnalyzer.tokenStream(indexerField.getKey(), new StringReader(query));
        ts.reset();

        int result = 0;

        while (ts.incrementToken()) {
            result++;
        }

        ts.end();
        ts.close();

        return result;
    } catch (IOException e) {
        // will not happen
        return 1;
    }
}

From source file:org.apache.nutch.scoring.similarity.cosine.Model.java

License:Apache License

/**
 * Used to create a DocVector from given String text. Used during the parse stage of the crawl 
 * cycle to create a DocVector of the currently parsed page from the parseText attribute value
 * @param content The text to tokenize
 * @param mingram Value of mingram for tokenizing
 * @param maxgram Value of maxgram for tokenizing
 */
public static DocVector createDocVector(String content, int mingram, int maxgram) {
    LuceneTokenizer tokenizer;

    if (mingram > 1 && maxgram > 1) {
        LOG.info("Using Ngram Cosine Model, user specified mingram value : {} maxgram value : {}", mingram,
                maxgram);
        tokenizer = new LuceneTokenizer(content, TokenizerType.STANDARD, StemFilterType.PORTERSTEM_FILTER,
                mingram, maxgram);
    } else if (mingram > 1) {
        maxgram = mingram;
        LOG.info("Using Ngram Cosine Model, user specified mingram value : {} maxgram value : {}", mingram,
                maxgram);
        tokenizer = new LuceneTokenizer(content, TokenizerType.STANDARD, StemFilterType.PORTERSTEM_FILTER,
                mingram, maxgram);
    } else if (stopWords != null) {
        tokenizer = new LuceneTokenizer(content, TokenizerType.STANDARD, stopWords, true,
                StemFilterType.PORTERSTEM_FILTER);
    } else {
        tokenizer = new LuceneTokenizer(content, TokenizerType.STANDARD, true,
                StemFilterType.PORTERSTEM_FILTER);
    }
    TokenStream tStream = tokenizer.getTokenStream();
    HashMap<String, Integer> termVector = new HashMap<>();
    try {
        CharTermAttribute charTermAttribute = tStream.addAttribute(CharTermAttribute.class);
        tStream.reset();
        while (tStream.incrementToken()) {
            String term = charTermAttribute.toString();
            LOG.debug(term);
            if (termVector.containsKey(term)) {
                int count = termVector.get(term);
                count++;
                termVector.put(term, count);
            } else {
                termVector.put(term, 1);
            }
        }
        DocVector docVector = new DocVector();
        docVector.setTermFreqVector(termVector);
        return docVector;
    } catch (IOException e) {
        LOG.error("Error creating DocVector : {}", StringUtils.stringifyException(e));
    }
    return null;
}

From source file:org.apache.roller.weblogger.business.search.IndexUtil.java

License:Apache License

/**
 * Create a lucene term from the first token of the input string.
 *
 * @param field
 *            The lucene document field to create a term with
 * @param input
 *            The input you wish to convert into a term
 * 
 * @return Lucene search term
 */
public static Term getTerm(String field, String input) {
    if (input == null || field == null) {
        return null;
    }
    Analyzer analyzer = IndexManagerImpl.getAnalyzer();
    Term term = null;
    try {
        TokenStream tokens = analyzer.tokenStream(field, new StringReader(input));
        CharTermAttribute termAtt = tokens.addAttribute(CharTermAttribute.class);
        tokens.reset();

        if (tokens.incrementToken()) {
            String termt = termAtt.toString();
            term = new Term(field, termt);
        }
    } catch (IOException e) {
        // ignored
    }
    return term;
}

From source file:org.apache.solr.analysis.DoubleMetaphoneFilterFactoryTest.java

License:Apache License

/**
 * Ensure that reset() removes any state (buffered tokens)
 */
public void testReset() throws Exception {
    DoubleMetaphoneFilterFactory factory = new DoubleMetaphoneFilterFactory();
    factory.init(new HashMap<String, String>());
    TokenStream inputStream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("international"));

    TokenStream filteredStream = factory.create(inputStream);
    CharTermAttribute termAtt = filteredStream.addAttribute(CharTermAttribute.class);
    assertEquals(DoubleMetaphoneFilter.class, filteredStream.getClass());

    assertTrue(filteredStream.incrementToken());
    assertEquals(13, termAtt.length());
    assertEquals("international", termAtt.toString());
    filteredStream.reset();

    // ensure there are no more tokens, such as ANTRNXNL
    assertFalse(filteredStream.incrementToken());
}