Example usage for org.apache.lucene.analysis TokenStream addAttribute

List of usage examples for org.apache.lucene.analysis TokenStream addAttribute

Introduction

On this page you can find example usage for org.apache.lucene.analysis TokenStream addAttribute.

Prototype

public final <T extends Attribute> T addAttribute(Class<T> attClass) 

Document

The caller must pass in a Class<? extends Attribute> value.
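
A minimal consumption sketch (an illustration, not taken from the examples below; the field name "field" and the helper method firstToken are arbitrary, and it assumes Lucene 4.x+ where TokenStream is Closeable):

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public String firstToken(Analyzer analyzer, String text) throws IOException {
    try (TokenStream ts = analyzer.tokenStream("field", new StringReader(text))) {
        // Register (or retrieve) the CharTermAttribute before consuming the stream.
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();                      // must be called before incrementToken()
        String first = ts.incrementToken() ? termAtt.toString() : null;
        ts.end();                        // finalize offsets/state after the last token
        return first;
    }
}

The examples below follow this same pattern: register an attribute with addAttribute, reset the stream, iterate with incrementToken, then end and close it.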

Usage

From source file:org.apache.mahout.utils.regex.AnalyzerTransformer.java

License:Apache License

@Override
public String transformMatch(String match) {
    StringBuilder result = new StringBuilder();
    TokenStream ts = null;
    try {
        ts = analyzer.tokenStream(fieldName, new StringReader(match));
        ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        TokenStreamIterator iter = new TokenStreamIterator(ts);
        while (iter.hasNext()) {
            result.append(iter.next()).append(' ');
        }
        ts.end();
    } catch (IOException e) {
        throw new IllegalStateException(e);
    } finally {
        try {
            Closeables.close(ts, true);
        } catch (IOException e) {
            log.error(e.getMessage(), e);
        }
    }
    return result.toString();
}

From source file:org.apache.mahout.utils.vectors.text.document.SequenceFileTokenizerMapper.java

License:Apache License

@Override
protected void map(Text key, Text value, Context context) throws IOException, InterruptedException {
    TokenStream stream = analyzer.tokenStream(key.toString(), new StringReader(value.toString()));
    TermAttribute termAtt = stream.addAttribute(TermAttribute.class);
    StringTuple document = new StringTuple();
    while (stream.incrementToken()) {
        if (termAtt.termLength() > 0) {
            document.add(new String(termAtt.termBuffer(), 0, termAtt.termLength()));
        }
    }
    context.write(key, document);
}

From source file:org.apache.mahout.vectorizer.document.SequenceFileTokenizerMapper.java

License:Apache License

@Override
protected void map(Text key, Text value, Context context) throws IOException, InterruptedException {
    TokenStream stream = analyzer.tokenStream(key.toString(), new StringReader(value.toString()));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    StringTuple document = new StringTuple();
    while (stream.incrementToken()) {
        if (termAtt.length() > 0) {
            document.add(new String(termAtt.buffer(), 0, termAtt.length()));
        }
    }
    stream.end();
    Closeables.close(stream, true);
    context.write(key, document);
}

From source file:org.apache.mahout.vectorizer.encoders.LuceneTextValueEncoder.java

License:Apache License

/**
 * Tokenizes a string using the simplest method. This should be overridden for more subtle
 * tokenization.
 */
@Override
protected Iterable<String> tokenize(CharSequence originalForm) {
    try {
        TokenStream ts = analyzer.tokenStream(getName(), new CharSequenceReader(originalForm));
        ts.addAttribute(CharTermAttribute.class);
        return new LuceneTokenIterable(ts, false);
    } catch (IOException ex) {
        throw new IllegalStateException(ex);
    }
}

From source file:org.apache.nutch.scoring.similarity.cosine.Model.java

License:Apache License

/**
 * Used to create a DocVector from given String text. Used during the parse stage of the crawl 
 * cycle to create a DocVector of the currently parsed page from the parseText attribute value
 * @param content The text to tokenize
 * @param mingram Value of mingram for tokenizing
 * @param maxgram Value of maxgram for tokenizing
 */
public static DocVector createDocVector(String content, int mingram, int maxgram) {
    LuceneTokenizer tokenizer;

    if (mingram > 1 && maxgram > 1) {
        LOG.info("Using Ngram Cosine Model, user specified mingram value : {} maxgram value : {}", mingram,
                maxgram);
        tokenizer = new LuceneTokenizer(content, TokenizerType.STANDARD, StemFilterType.PORTERSTEM_FILTER,
                mingram, maxgram);
    } else if (mingram > 1) {
        maxgram = mingram;
        LOG.info("Using Ngram Cosine Model, user specified mingram value : {} maxgram value : {}", mingram,
                maxgram);
        tokenizer = new LuceneTokenizer(content, TokenizerType.STANDARD, StemFilterType.PORTERSTEM_FILTER,
                mingram, maxgram);
    } else if (stopWords != null) {
        tokenizer = new LuceneTokenizer(content, TokenizerType.STANDARD, stopWords, true,
                StemFilterType.PORTERSTEM_FILTER);
    } else {
        tokenizer = new LuceneTokenizer(content, TokenizerType.STANDARD, true,
                StemFilterType.PORTERSTEM_FILTER);
    }
    TokenStream tStream = tokenizer.getTokenStream();
    HashMap<String, Integer> termVector = new HashMap<>();
    try {
        CharTermAttribute charTermAttribute = tStream.addAttribute(CharTermAttribute.class);
        tStream.reset();
        while (tStream.incrementToken()) {
            String term = charTermAttribute.toString();
            LOG.debug(term);
            if (termVector.containsKey(term)) {
                int count = termVector.get(term);
                count++;
                termVector.put(term, count);
            } else {
                termVector.put(term, 1);
            }
        }
        DocVector docVector = new DocVector();
        docVector.setTermFreqVector(termVector);
        return docVector;
    } catch (IOException e) {
        LOG.error("Error creating DocVector : {}", StringUtils.stringifyException(e));
    }
    return null;
}

From source file:org.apache.roller.weblogger.business.search.IndexUtil.java

License:Apache License

/**
 * Create a lucene term from the first token of the input string.
 *
 * @param field
 *            The lucene document field to create a term with
 * @param input
 *            The input you wish to convert into a term
 * 
 * @return Lucene search term
 */
public static Term getTerm(String field, String input) {
    if (input == null || field == null) {
        return null;
    }
    Analyzer analyzer = IndexManagerImpl.getAnalyzer();
    Term term = null;
    try {
        TokenStream tokens = analyzer.tokenStream(field, new StringReader(input));
        CharTermAttribute termAtt = tokens.addAttribute(CharTermAttribute.class);
        tokens.reset();

        if (tokens.incrementToken()) {
            String termt = termAtt.toString();
            term = new Term(field, termt);
        }
    } catch (IOException e) {
        // ignored
    }
    return term;
}

From source file:org.apache.solr.analysis.DoubleMetaphoneFilterFactoryTest.java

License:Apache License

/**
 * Ensure that reset() removes any state (buffered tokens)
 */
public void testReset() throws Exception {
    DoubleMetaphoneFilterFactory factory = new DoubleMetaphoneFilterFactory();
    factory.init(new HashMap<String, String>());
    TokenStream inputStream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("international"));

    TokenStream filteredStream = factory.create(inputStream);
    CharTermAttribute termAtt = filteredStream.addAttribute(CharTermAttribute.class);
    assertEquals(DoubleMetaphoneFilter.class, filteredStream.getClass());

    assertTrue(filteredStream.incrementToken());
    assertEquals(13, termAtt.length());
    assertEquals("international", termAtt.toString());
    filteredStream.reset();

    // ensure there are no more tokens, such as ANTRNXNL
    assertFalse(filteredStream.incrementToken());
}

From source file:org.apache.solr.analysis.SlowSynonymFilterFactory.java

License:Apache License

private static List<String> splitByTokenizer(String source, TokenizerFactory tokFactory) {
    StringReader reader = new StringReader(source);
    TokenStream ts = loadTokenizer(tokFactory, reader);
    List<String> tokList = new ArrayList<String>();
    try {
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        while (ts.incrementToken()) {
            if (termAtt.length() > 0)
                tokList.add(termAtt.toString());
        }
    } catch (IOException e) {
        throw new RuntimeException(e);
    } finally {
        reader.close();
    }
    return tokList;
}

From source file:org.apache.solr.analysis.TestBufferedTokenStream.java

License:Apache License

public void testReset() throws Exception {
    final String input = "How now A B brown A cow B like A B thing?";
    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
    TokenStream ts = new AB_AAB_Stream(tokenizer);
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    assertTrue(ts.incrementToken());
    assertEquals("How", term.toString());
    assertTrue(ts.incrementToken());
    assertEquals("now", term.toString());
    assertTrue(ts.incrementToken());
    assertEquals("A", term.toString());
    // reset back to input, 
    // if reset() does not work correctly then previous buffered tokens will remain 
    tokenizer.reset(new StringReader(input));
    ts.reset();
    assertTrue(ts.incrementToken());
    assertEquals("How", term.toString());
}

From source file:org.apache.solr.analysis.TestCollationKeyFilterFactory.java

License:Apache License

private void assertCollatesToSame(TokenStream stream1, TokenStream stream2) throws IOException {
    CharTermAttribute term1 = stream1.addAttribute(CharTermAttribute.class);
    CharTermAttribute term2 = stream2.addAttribute(CharTermAttribute.class);
    assertTrue(stream1.incrementToken());
    assertTrue(stream2.incrementToken());
    assertEquals(term1.toString(), term2.toString());
    assertFalse(stream1.incrementToken());
    assertFalse(stream2.incrementToken());
}