Example usage for org.apache.lucene.analysis TokenStream incrementToken

Introduction

On this page you can find example usage of org.apache.lucene.analysis.TokenStream.incrementToken().

Prototype

public abstract boolean incrementToken() throws IOException;

Document

Consumers (i.e., IndexWriter) use this method to advance the stream to the next token. It returns true while more tokens are available and false once the end of the stream is reached.
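
Before the individual examples, here is a minimal, self-contained sketch of the full consumer workflow around incrementToken(): reset(), the incrementToken() loop, end(), and close(). It assumes a recent Lucene release in which the no-argument StandardAnalyzer constructor and the tokenStream(String, String) overload are available; the field name and sample text are placeholders.

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class IncrementTokenExample {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        // "body" is an arbitrary field name; the text is sample input
        try (TokenStream stream = analyzer.tokenStream("body", "Hello token stream world")) {
            CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
            stream.reset(); // mandatory before the first incrementToken() call
            while (stream.incrementToken()) { // false signals end of stream
                System.out.println(termAtt.toString());
            }
            stream.end(); // records attribute state for the end of the stream
        } // try-with-resources closes the stream
    }
}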

Usage

From source file: org.apache.mahout.utils.vectors.text.document.SequenceFileTokenizerMapper.java

License: Apache License

@Override
protected void map(Text key, Text value, Context context) throws IOException, InterruptedException {
    TokenStream stream = analyzer.tokenStream(key.toString(), new StringReader(value.toString()));
    TermAttribute termAtt = stream.addAttribute(TermAttribute.class);
    StringTuple document = new StringTuple();
    while (stream.incrementToken()) {
        if (termAtt.termLength() > 0) {
            document.add(new String(termAtt.termBuffer(), 0, termAtt.termLength()));
        }
    }
    context.write(key, document);
}
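
Note that this older Mahout variant predates the stricter Lucene 4 TokenStream contract: it uses the since-removed TermAttribute API and never calls reset(), end(), or close(). The next example is the updated form of the same mapper and shows the full workflow.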

From source file: org.apache.mahout.vectorizer.document.SequenceFileTokenizerMapper.java

License: Apache License

@Override
protected void map(Text key, Text value, Context context) throws IOException, InterruptedException {
    TokenStream stream = analyzer.tokenStream(key.toString(), new StringReader(value.toString()));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset(); // mandatory before the first incrementToken() call
    StringTuple document = new StringTuple();
    while (stream.incrementToken()) {
        if (termAtt.length() > 0) {
            document.add(new String(termAtt.buffer(), 0, termAtt.length()));
        }
    }
    stream.end();
    Closeables.close(stream, true);
    context.write(key, document);
}

From source file: org.apache.maven.index.DefaultQueryCreator.java

License: Apache License

protected int countTerms(final IndexerField indexerField, final String query) {
    try {
        TokenStream ts = nexusAnalyzer.tokenStream(indexerField.getKey(), new StringReader(query));
        ts.reset();

        int result = 0;

        while (ts.incrementToken()) {
            result++;
        }

        ts.end();
        ts.close();

        return result;
    } catch (IOException e) {
        // will not happen
        return 1;
    }
}
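
Since TokenStream implements Closeable, the same count can be written with try-with-resources so the stream is released even if incrementToken() throws. This is a sketch of an alternative, not the Maven Indexer code:

protected int countTerms(final IndexerField indexerField, final String query) {
    try (TokenStream ts = nexusAnalyzer.tokenStream(indexerField.getKey(), new StringReader(query))) {
        ts.reset();
        int result = 0;
        while (ts.incrementToken()) {
            result++; // one increment per emitted token
        }
        ts.end();
        return result;
    } catch (IOException e) {
        return 1; // mirrors the original fallback
    }
}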

From source file: org.apache.nutch.scoring.similarity.cosine.Model.java

License: Apache License

/**
 * Creates a DocVector from the given String. Used during the parse stage of the crawl
 * cycle to build a DocVector of the currently parsed page from the parseText attribute value.
 * @param content The text to tokenize
 * @param mingram Value of mingram for tokenizing
 * @param maxgram Value of maxgram for tokenizing
 */
public static DocVector createDocVector(String content, int mingram, int maxgram) {
    LuceneTokenizer tokenizer;

    if (mingram > 1 && maxgram > 1) {
        LOG.info("Using Ngram Cosine Model, user specified mingram value : {} maxgram value : {}", mingram,
                maxgram);
        tokenizer = new LuceneTokenizer(content, TokenizerType.STANDARD, StemFilterType.PORTERSTEM_FILTER,
                mingram, maxgram);
    } else if (mingram > 1) {
        maxgram = mingram;
        LOG.info("Using Ngram Cosine Model, user specified mingram value : {} maxgram value : {}", mingram,
                maxgram);
        tokenizer = new LuceneTokenizer(content, TokenizerType.STANDARD, StemFilterType.PORTERSTEM_FILTER,
                mingram, maxgram);
    } else if (stopWords != null) {
        tokenizer = new LuceneTokenizer(content, TokenizerType.STANDARD, stopWords, true,
                StemFilterType.PORTERSTEM_FILTER);
    } else {
        tokenizer = new LuceneTokenizer(content, TokenizerType.STANDARD, true,
                StemFilterType.PORTERSTEM_FILTER);
    }
    TokenStream tStream = tokenizer.getTokenStream();
    HashMap<String, Integer> termVector = new HashMap<>();
    try {
        CharTermAttribute charTermAttribute = tStream.addAttribute(CharTermAttribute.class);
        tStream.reset();
        while (tStream.incrementToken()) {
            String term = charTermAttribute.toString();
            LOG.debug(term);
            if (termVector.containsKey(term)) {
                int count = termVector.get(term);
                count++;
                termVector.put(term, count);
            } else {
                termVector.put(term, 1);
            }
        }
        DocVector docVector = new DocVector();
        docVector.setTermFreqVector(termVector);
        return docVector;
    } catch (IOException e) {
        LOG.error("Error creating DocVector : {}", StringUtils.stringifyException(e));
    }
    return null;
}
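
A side note on the counting loop above: on Java 8 and later, the containsKey/get/put sequence collapses into a single call with identical behavior:

    // Increment the term count, starting at 1 for unseen terms
    termVector.merge(term, 1, Integer::sum);

The example also never closes tStream; try-with-resources, as sketched earlier, would close it on every path.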

From source file: org.apache.roller.weblogger.business.search.IndexUtil.java

License: Apache License

/**
 * Create a lucene term from the first token of the input string.
 *
 * @param field
 *            The lucene document field to create a term with
 * @param input
 *            The input you wish to convert into a term
 * 
 * @return Lucene search term
 */
public static Term getTerm(String field, String input) {
    if (input == null || field == null) {
        return null;
    }
    Analyzer analyzer = IndexManagerImpl.getAnalyzer();
    Term term = null;
    try {
        TokenStream tokens = analyzer.tokenStream(field, new StringReader(input));
        CharTermAttribute termAtt = tokens.addAttribute(CharTermAttribute.class);
        tokens.reset();

        if (tokens.incrementToken()) {
            String termt = termAtt.toString();
            term = new Term(field, termt);
        }
    } catch (IOException e) {
        // ignored
    }
    return term;
}

From source file: org.apache.solr.analysis.DoubleMetaphoneFilterFactoryTest.java

License: Apache License

/**
 * Ensure that reset() removes any state (buffered tokens)
 */
public void testReset() throws Exception {
    DoubleMetaphoneFilterFactory factory = new DoubleMetaphoneFilterFactory();
    factory.init(new HashMap<String, String>());
    TokenStream inputStream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("international"));

    TokenStream filteredStream = factory.create(inputStream);
    CharTermAttribute termAtt = filteredStream.addAttribute(CharTermAttribute.class);
    assertEquals(DoubleMetaphoneFilter.class, filteredStream.getClass());

    assertTrue(filteredStream.incrementToken());
    assertEquals(13, termAtt.length());
    assertEquals("international", termAtt.toString());
    filteredStream.reset();

    // ensure there are no more tokens, such as ANTRNXNL
    assertFalse(filteredStream.incrementToken());
}

From source file: org.apache.solr.analysis.SlowSynonymFilterFactory.java

License: Apache License

private static List<String> splitByTokenizer(String source, TokenizerFactory tokFactory) {
    StringReader reader = new StringReader(source);
    TokenStream ts = loadTokenizer(tokFactory, reader);
    List<String> tokList = new ArrayList<String>();
    try {
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        while (ts.incrementToken()) {
            if (termAtt.length() > 0)
                tokList.add(termAtt.toString());
        }
    } catch (IOException e) {
        throw new RuntimeException(e);
    } finally {
        reader.close();
    }
    return tokList;
}

From source file: org.apache.solr.analysis.TestBufferedTokenStream.java

License: Apache License

public void testReset() throws Exception {
    final String input = "How now A B brown A cow B like A B thing?";
    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
    TokenStream ts = new AB_AAB_Stream(tokenizer);
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    assertTrue(ts.incrementToken());
    assertEquals("How", term.toString());
    assertTrue(ts.incrementToken());
    assertEquals("now", term.toString());
    assertTrue(ts.incrementToken());
    assertEquals("A", term.toString());
    // reset back to input, 
    // if reset() does not work correctly then previous buffered tokens will remain 
    tokenizer.reset(new StringReader(input));
    ts.reset();
    assertTrue(ts.incrementToken());
    assertEquals("How", term.toString());
}

From source file: org.apache.solr.analysis.TestCollationKeyFilterFactory.java

License: Apache License

private void assertCollatesToSame(TokenStream stream1, TokenStream stream2) throws IOException {
    CharTermAttribute term1 = stream1.addAttribute(CharTermAttribute.class);
    CharTermAttribute term2 = stream2.addAttribute(CharTermAttribute.class);
    assertTrue(stream1.incrementToken());
    assertTrue(stream2.incrementToken());
    assertEquals(term1.toString(), term2.toString());
    assertFalse(stream1.incrementToken());
    assertFalse(stream2.incrementToken());
}

From source file: org.apache.solr.analysis.TestPatternReplaceCharFilterFactory.java

License: Apache License

public void testReplaceByEmpty() throws IOException {
    final String BLOCK = "aa bb cc";
    PatternReplaceCharFilterFactory factory = new PatternReplaceCharFilterFactory();
    Map<String, String> args = new HashMap<String, String>();
    args.put("pattern", "(aa)\\s+(bb)\\s+(cc)");
    factory.init(args);
    CharStream cs = factory.create(CharReader.get(new StringReader(BLOCK)));
    TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs);
    assertFalse(ts.incrementToken());
}