Example usage for org.apache.lucene.analysis TokenStream incrementToken

Introduction

On this page you can find example usage of org.apache.lucene.analysis.TokenStream.incrementToken().

Prototype

public abstract boolean incrementToken() throws IOException;

Document

Consumers (i.e., IndexWriter) use this method to advance the stream to the next token. It returns true while more tokens are available and false once the end of the stream is reached.
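
Before the individual examples, here is a minimal, self-contained sketch of the full consumer workflow around incrementToken(): reset(), the incrementToken() loop, end(), and close(). It assumes a recent Lucene release in which the no-argument StandardAnalyzer constructor and the tokenStream(String, String) overload are available; the field name and sample text are placeholders.

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class IncrementTokenExample {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        // "body" is an arbitrary field name; the text is sample input
        try (TokenStream stream = analyzer.tokenStream("body", "Hello token stream world")) {
            CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
            stream.reset(); // mandatory before the first incrementToken() call
            while (stream.incrementToken()) { // false signals end of stream
                System.out.println(termAtt.toString());
            }
            stream.end(); // records attribute state for the end of the stream
        } // try-with-resources closes the stream
    }
}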

Usage

From source file: org.apache.mahout.utils.vectors.text.document.SequenceFileTokenizerMapper.java

License: Apache License

@Override
protected void map(Text key, Text value, Context context) throws IOException, InterruptedException {
    TokenStream stream = analyzer.tokenStream(key.toString(), new StringReader(value.toString()));
    TermAttribute termAtt = stream.addAttribute(TermAttribute.class);
    StringTuple document = new StringTuple();
    while (stream.incrementToken()) {
        if (termAtt.termLength() > 0) {
            document.add(new String(termAtt.termBuffer(), 0, termAtt.termLength()));
        }
    }
    context.write(key, document);
}
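
Note that this older Mahout variant predates the stricter Lucene 4 TokenStream contract: it uses the since-removed TermAttribute API and never calls reset(), end(), or close(). The next example is the updated form of the same mapper and shows the full workflow.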

From source file: org.apache.mahout.vectorizer.document.SequenceFileTokenizerMapper.java

License: Apache License

@Override
protected void map(Text key, Text value, Context context) throws IOException, InterruptedException {
    TokenStream stream = analyzer.tokenStream(key.toString(), new StringReader(value.toString()));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset(); // mandatory before the first incrementToken() call
    StringTuple document = new StringTuple();
    while (stream.incrementToken()) {
        if (termAtt.length() > 0) {
            document.add(new String(termAtt.buffer(), 0, termAtt.length()));
        }
    }
    stream.end();
    Closeables.close(stream, true);
    context.write(key, document);
}

From source file: org.apache.maven.index.DefaultQueryCreator.java

License: Apache License

protected int countTerms(final IndexerField indexerField, final String query) {
    try {
        TokenStream ts = nexusAnalyzer.tokenStream(indexerField.getKey(), new StringReader(query));
        ts.reset();

        int result = 0;

        while (ts.incrementToken()) {
            result++;
        }

        ts.end();
        ts.close();

        return result;
    } catch (IOException e) {
        // will not happen
        return 1;
    }
}
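
Since TokenStream implements Closeable, the same count can be written with try-with-resources so the stream is released even if incrementToken() throws. This is a sketch of an alternative, not the Maven Indexer code:

protected int countTerms(final IndexerField indexerField, final String query) {
    try (TokenStream ts = nexusAnalyzer.tokenStream(indexerField.getKey(), new StringReader(query))) {
        ts.reset();
        int result = 0;
        while (ts.incrementToken()) {
            result++; // one increment per emitted token
        }
        ts.end();
        return result;
    } catch (IOException e) {
        return 1; // mirrors the original fallback
    }
}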

From source file: org.apache.nutch.scoring.similarity.cosine.Model.java

License: Apache License

/**
 * Creates a DocVector from the given String. Used during the parse stage of the crawl
 * cycle to build a DocVector of the currently parsed page from the parseText attribute value.
 * @param content The text to tokenize
 * @param mingram Value of mingram for tokenizing
 * @param maxgram Value of maxgram for tokenizing
 */
public static DocVector createDocVector(String content, int mingram, int maxgram) {
    LuceneTokenizer tokenizer;

    if (mingram > 1 && maxgram > 1) {
        LOG.info("Using Ngram Cosine Model, user specified mingram value : {} maxgram value : {}", mingram,
                maxgram);
        tokenizer = new LuceneTokenizer(content, TokenizerType.STANDARD, StemFilterType.PORTERSTEM_FILTER,
                mingram, maxgram);
    } else if (mingram > 1) {
        maxgram = mingram;
        LOG.info("Using Ngram Cosine Model, user specified mingram value : {} maxgram value : {}", mingram,
                maxgram);
        tokenizer = new LuceneTokenizer(content, TokenizerType.STANDARD, StemFilterType.PORTERSTEM_FILTER,
                mingram, maxgram);
    } else if (stopWords != null) {
        tokenizer = new LuceneTokenizer(content, TokenizerType.STANDARD, stopWords, true,
                StemFilterType.PORTERSTEM_FILTER);
    } else {
        tokenizer = new LuceneTokenizer(content, TokenizerType.STANDARD, true,
                StemFilterType.PORTERSTEM_FILTER);
    }
    TokenStream tStream = tokenizer.getTokenStream();
    HashMap<String, Integer> termVector = new HashMap<>();
    try {
        CharTermAttribute charTermAttribute = tStream.addAttribute(CharTermAttribute.class);
        tStream.reset();
        while (tStream.incrementToken()) {
            String term = charTermAttribute.toString();
            LOG.debug(term);
            if (termVector.containsKey(term)) {
                int count = termVector.get(term);
                count++;
                termVector.put(term, count);
            } else {
                termVector.put(term, 1);
            }
        }
        DocVector docVector = new DocVector();
        docVector.setTermFreqVector(termVector);
        return docVector;
    } catch (IOException e) {
        LOG.error("Error creating DocVector : {}", StringUtils.stringifyException(e));
    }
    return null;
}
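
A side note on the counting loop above: on Java 8 and later, the containsKey/get/put sequence collapses into a single call with identical behavior:

    // Increment the term count, starting at 1 for unseen terms
    termVector.merge(term, 1, Integer::sum);

The example also never closes tStream; try-with-resources, as sketched earlier, would close it on every path.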

From source file: org.apache.roller.weblogger.business.search.IndexUtil.java

License: Apache License

/**
 * Create a lucene term from the first token of the input string.
 *
 * @param field
 *            The lucene document field to create a term with
 * @param input
 *            The input you wish to convert into a term
 * 
 * @return Lucene search term
 */
public static Term getTerm(String field, String input) {
    if (input == null || field == null) {
        return null;
    }
    Analyzer analyzer = IndexManagerImpl.getAnalyzer();
    Term term = null;
    try {
        TokenStream tokens = analyzer.tokenStream(field, new StringReader(input));
        CharTermAttribute termAtt = tokens.addAttribute(CharTermAttribute.class);
        tokens.reset();

        if (tokens.incrementToken()) {
            String termt = termAtt.toString();
            term = new Term(field, termt);
        }
    } catch (IOException e) {
        // ignored
    }
    return term;
}

From source file: org.apache.solr.analysis.DoubleMetaphoneFilterFactoryTest.java

License: Apache License

/**
 * Ensure that reset() removes any state (buffered tokens)
 */
public void testReset() throws Exception {
    DoubleMetaphoneFilterFactory factory = new DoubleMetaphoneFilterFactory();
    factory.init(new HashMap<String, String>());
    TokenStream inputStream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("international"));

    TokenStream filteredStream = factory.create(inputStream);
    CharTermAttribute termAtt = filteredStream.addAttribute(CharTermAttribute.class);
    assertEquals(DoubleMetaphoneFilter.class, filteredStream.getClass());

    assertTrue(filteredStream.incrementToken());
    assertEquals(13, termAtt.length());
    assertEquals("international", termAtt.toString());
    filteredStream.reset();

    // ensure there are no more tokens, such as ANTRNXNL
    assertFalse(filteredStream.incrementToken());
}

From source file: org.apache.solr.analysis.SlowSynonymFilterFactory.java

License: Apache License

private static List<String> splitByTokenizer(String source, TokenizerFactory tokFactory) {
    StringReader reader = new StringReader(source);
    TokenStream ts = loadTokenizer(tokFactory, reader);
    List<String> tokList = new ArrayList<String>();
    try {
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        while (ts.incrementToken()) {
            if (termAtt.length() > 0)
                tokList.add(termAtt.toString());
        }
    } catch (IOException e) {
        throw new RuntimeException(e);
    } finally {
        reader.close();
    }
    return tokList;
}

From source file: org.apache.solr.analysis.TestBufferedTokenStream.java

License: Apache License

public void testReset() throws Exception {
    final String input = "How now A B brown A cow B like A B thing?";
    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
    TokenStream ts = new AB_AAB_Stream(tokenizer);
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    assertTrue(ts.incrementToken());
    assertEquals("How", term.toString());
    assertTrue(ts.incrementToken());
    assertEquals("now", term.toString());
    assertTrue(ts.incrementToken());
    assertEquals("A", term.toString());
    // reset back to input, 
    // if reset() does not work correctly then previous buffered tokens will remain 
    tokenizer.reset(new StringReader(input));
    ts.reset();
    assertTrue(ts.incrementToken());
    assertEquals("How", term.toString());
}

From source file: org.apache.solr.analysis.TestCollationKeyFilterFactory.java

License: Apache License

private void assertCollatesToSame(TokenStream stream1, TokenStream stream2) throws IOException {
    CharTermAttribute term1 = stream1.addAttribute(CharTermAttribute.class);
    CharTermAttribute term2 = stream2.addAttribute(CharTermAttribute.class);
    assertTrue(stream1.incrementToken());
    assertTrue(stream2.incrementToken());
    assertEquals(term1.toString(), term2.toString());
    assertFalse(stream1.incrementToken());
    assertFalse(stream2.incrementToken());
}

From source file: org.apache.solr.analysis.TestPatternReplaceCharFilterFactory.java

License: Apache License

public void testReplaceByEmpty() throws IOException {
    final String BLOCK = "aa bb cc";
    PatternReplaceCharFilterFactory factory = new PatternReplaceCharFilterFactory();
    Map<String, String> args = new HashMap<String, String>();
    args.put("pattern", "(aa)\\s+(bb)\\s+(cc)");
    factory.init(args);
    CharStream cs = factory.create(CharReader.get(new StringReader(BLOCK)));
    TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs);
    assertFalse(ts.incrementToken());
}