Example usage for org.apache.lucene.analysis TokenStream addAttribute

List of usage examples for org.apache.lucene.analysis TokenStream addAttribute

Introduction

On this page you can find example usage for org.apache.lucene.analysis TokenStream addAttribute.

Prototype

public final <T extends Attribute> T addAttribute(Class<T> attClass) 

Document

The caller must pass in a Class<? extends Attribute> value.
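
A minimal consumption sketch (an illustration, not taken from the examples below; the field name "field" and the helper method firstToken are arbitrary, and it assumes Lucene 4.x+ where TokenStream is Closeable):

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public String firstToken(Analyzer analyzer, String text) throws IOException {
    try (TokenStream ts = analyzer.tokenStream("field", new StringReader(text))) {
        // Register (or retrieve) the CharTermAttribute before consuming the stream.
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();                      // must be called before incrementToken()
        String first = ts.incrementToken() ? termAtt.toString() : null;
        ts.end();                        // finalize offsets/state after the last token
        return first;
    }
}

The examples below follow this same pattern: register an attribute with addAttribute, reset the stream, iterate with incrementToken, then end and close it.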

Usage

From source file:org.apache.mahout.utils.regex.AnalyzerTransformer.java

License:Apache License

@Override
public String transformMatch(String match) {
    StringBuilder result = new StringBuilder();
    TokenStream ts = null;
    try {
        ts = analyzer.tokenStream(fieldName, new StringReader(match));
        ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        TokenStreamIterator iter = new TokenStreamIterator(ts);
        while (iter.hasNext()) {
            result.append(iter.next()).append(' ');
        }
        ts.end();
    } catch (IOException e) {
        throw new IllegalStateException(e);
    } finally {
        try {
            Closeables.close(ts, true);
        } catch (IOException e) {
            log.error(e.getMessage(), e);
        }
    }
    return result.toString();
}

From source file:org.apache.mahout.utils.vectors.text.document.SequenceFileTokenizerMapper.java

License:Apache License

@Override
protected void map(Text key, Text value, Context context) throws IOException, InterruptedException {
    TokenStream stream = analyzer.tokenStream(key.toString(), new StringReader(value.toString()));
    TermAttribute termAtt = stream.addAttribute(TermAttribute.class);
    StringTuple document = new StringTuple();
    while (stream.incrementToken()) {
        if (termAtt.termLength() > 0) {
            document.add(new String(termAtt.termBuffer(), 0, termAtt.termLength()));
        }
    }
    context.write(key, document);
}

From source file:org.apache.mahout.vectorizer.document.SequenceFileTokenizerMapper.java

License:Apache License

@Override
protected void map(Text key, Text value, Context context) throws IOException, InterruptedException {
    TokenStream stream = analyzer.tokenStream(key.toString(), new StringReader(value.toString()));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    StringTuple document = new StringTuple();
    while (stream.incrementToken()) {
        if (termAtt.length() > 0) {
            document.add(new String(termAtt.buffer(), 0, termAtt.length()));
        }
    }
    stream.end();
    Closeables.close(stream, true);
    context.write(key, document);
}

From source file:org.apache.mahout.vectorizer.encoders.LuceneTextValueEncoder.java

License:Apache License

/**
 * Tokenizes a string using the simplest method. This should be overridden for more subtle
 * tokenization.
 */
@Override
protected Iterable<String> tokenize(CharSequence originalForm) {
    try {
        TokenStream ts = analyzer.tokenStream(getName(), new CharSequenceReader(originalForm));
        ts.addAttribute(CharTermAttribute.class);
        return new LuceneTokenIterable(ts, false);
    } catch (IOException ex) {
        throw new IllegalStateException(ex);
    }
}

From source file:org.apache.nutch.scoring.similarity.cosine.Model.java

License:Apache License

/**
 * Used to create a DocVector from given String text. Used during the parse stage of the crawl 
 * cycle to create a DocVector of the currently parsed page from the parseText attribute value
 * @param content The text to tokenize
 * @param mingram Value of mingram for tokenizing
 * @param maxgram Value of maxgram for tokenizing
 */
public static DocVector createDocVector(String content, int mingram, int maxgram) {
    LuceneTokenizer tokenizer;

    if (mingram > 1 && maxgram > 1) {
        LOG.info("Using Ngram Cosine Model, user specified mingram value : {} maxgram value : {}", mingram,
                maxgram);
        tokenizer = new LuceneTokenizer(content, TokenizerType.STANDARD, StemFilterType.PORTERSTEM_FILTER,
                mingram, maxgram);
    } else if (mingram > 1) {
        maxgram = mingram;
        LOG.info("Using Ngram Cosine Model, user specified mingram value : {} maxgram value : {}", mingram,
                maxgram);
        tokenizer = new LuceneTokenizer(content, TokenizerType.STANDARD, StemFilterType.PORTERSTEM_FILTER,
                mingram, maxgram);
    } else if (stopWords != null) {
        tokenizer = new LuceneTokenizer(content, TokenizerType.STANDARD, stopWords, true,
                StemFilterType.PORTERSTEM_FILTER);
    } else {
        tokenizer = new LuceneTokenizer(content, TokenizerType.STANDARD, true,
                StemFilterType.PORTERSTEM_FILTER);
    }
    TokenStream tStream = tokenizer.getTokenStream();
    HashMap<String, Integer> termVector = new HashMap<>();
    try {
        CharTermAttribute charTermAttribute = tStream.addAttribute(CharTermAttribute.class);
        tStream.reset();
        while (tStream.incrementToken()) {
            String term = charTermAttribute.toString();
            LOG.debug(term);
            if (termVector.containsKey(term)) {
                int count = termVector.get(term);
                count++;
                termVector.put(term, count);
            } else {
                termVector.put(term, 1);
            }
        }
        DocVector docVector = new DocVector();
        docVector.setTermFreqVector(termVector);
        return docVector;
    } catch (IOException e) {
        LOG.error("Error creating DocVector : {}", StringUtils.stringifyException(e));
    }
    return null;
}

From source file:org.apache.roller.weblogger.business.search.IndexUtil.java

License:Apache License

/**
 * Create a lucene term from the first token of the input string.
 *
 * @param field
 *            The lucene document field to create a term with
 * @param input
 *            The input you wish to convert into a term
 * 
 * @return Lucene search term
 */
public static Term getTerm(String field, String input) {
    if (input == null || field == null) {
        return null;
    }
    Analyzer analyzer = IndexManagerImpl.getAnalyzer();
    Term term = null;
    try {
        TokenStream tokens = analyzer.tokenStream(field, new StringReader(input));
        CharTermAttribute termAtt = tokens.addAttribute(CharTermAttribute.class);
        tokens.reset();

        if (tokens.incrementToken()) {
            String termt = termAtt.toString();
            term = new Term(field, termt);
        }
    } catch (IOException e) {
        // ignored
    }
    return term;
}

From source file:org.apache.solr.analysis.DoubleMetaphoneFilterFactoryTest.java

License:Apache License

/**
 * Ensure that reset() removes any state (buffered tokens)
 */
public void testReset() throws Exception {
    DoubleMetaphoneFilterFactory factory = new DoubleMetaphoneFilterFactory();
    factory.init(new HashMap<String, String>());
    TokenStream inputStream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("international"));

    TokenStream filteredStream = factory.create(inputStream);
    CharTermAttribute termAtt = filteredStream.addAttribute(CharTermAttribute.class);
    assertEquals(DoubleMetaphoneFilter.class, filteredStream.getClass());

    assertTrue(filteredStream.incrementToken());
    assertEquals(13, termAtt.length());
    assertEquals("international", termAtt.toString());
    filteredStream.reset();

    // ensure there are no more tokens, such as ANTRNXNL
    assertFalse(filteredStream.incrementToken());
}

From source file:org.apache.solr.analysis.SlowSynonymFilterFactory.java

License:Apache License

private static List<String> splitByTokenizer(String source, TokenizerFactory tokFactory) {
    StringReader reader = new StringReader(source);
    TokenStream ts = loadTokenizer(tokFactory, reader);
    List<String> tokList = new ArrayList<String>();
    try {
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        while (ts.incrementToken()) {
            if (termAtt.length() > 0)
                tokList.add(termAtt.toString());
        }
    } catch (IOException e) {
        throw new RuntimeException(e);
    } finally {
        reader.close();
    }
    return tokList;
}

From source file:org.apache.solr.analysis.TestBufferedTokenStream.java

License:Apache License

public void testReset() throws Exception {
    final String input = "How now A B brown A cow B like A B thing?";
    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
    TokenStream ts = new AB_AAB_Stream(tokenizer);
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    assertTrue(ts.incrementToken());
    assertEquals("How", term.toString());
    assertTrue(ts.incrementToken());
    assertEquals("now", term.toString());
    assertTrue(ts.incrementToken());
    assertEquals("A", term.toString());
    // reset back to input, 
    // if reset() does not work correctly then previous buffered tokens will remain 
    tokenizer.reset(new StringReader(input));
    ts.reset();
    assertTrue(ts.incrementToken());
    assertEquals("How", term.toString());
}

From source file:org.apache.solr.analysis.TestCollationKeyFilterFactory.java

License:Apache License

private void assertCollatesToSame(TokenStream stream1, TokenStream stream2) throws IOException {
    CharTermAttribute term1 = stream1.addAttribute(CharTermAttribute.class);
    CharTermAttribute term2 = stream2.addAttribute(CharTermAttribute.class);
    assertTrue(stream1.incrementToken());
    assertTrue(stream2.incrementToken());
    assertEquals(term1.toString(), term2.toString());
    assertFalse(stream1.incrementToken());
    assertFalse(stream2.incrementToken());
}