Example usage for org.apache.lucene.analysis.standard StandardTokenizer StandardTokenizer

List of usage examples for org.apache.lucene.analysis.standard StandardTokenizer StandardTokenizer

Introduction

In this page you can find the example usage for org.apache.lucene.analysis.standard StandardTokenizer StandardTokenizer.

Prototype

public StandardTokenizer() 

Source Link

Document

Creates a new instance of the org.apache.lucene.analysis.standard.StandardTokenizer class.

Usage

From source file:org.nlp4l.framework.builtin.kea.KEAStandardAnalyzer.java

License:Apache License

/**
 * Builds the KEA analysis chain for the given field: StandardTokenizer,
 * lower-casing, then either direct KEA stop filtering (n == 1) or fixed-size
 * shingling followed by KEA stop filtering (n >= 2).
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer source = new StandardTokenizer();
    final TokenStream lowercased = new LowerCaseFilter(source);
    if (n != 1) {
        // n-gram mode: build n-word shingles (no unigram passthrough),
        // then stop-filter the shingles
        assert n >= 2;
        final ShingleFilter shingles = new ShingleFilter(lowercased, n, n);
        shingles.setOutputUnigrams(false);
        return new TokenStreamComponents(source,
                new KEAStopFilter(shingles, n, stopWords, beginStopWords, endStopWords));
    }
    // unigram mode: stop-filter the lower-cased tokens directly
    return new TokenStreamComponents(source,
            new KEAStopFilter(lowercased, n, stopWords, beginStopWords, endStopWords));
}

From source file:org.silverpeas.core.index.indexing.model.WAAnalyzer.java

License:Open Source License

/**
 * Returns a tokens stream built on top of the given reader.
 *
 *///from w w  w.ja v a 2s  .co  m
/**
 * Returns a token stream chain built on top of a StandardTokenizer:
 * standard filtering, lower-casing, French stop-word removal, elision
 * stripping, optional Snowball stemming, then ASCII accent folding.
 */
@Override
protected TokenStreamComponents createComponents(final String s) {
    final Tokenizer source = new StandardTokenizer();
    // standard filtering (strips possessive 's and trailing dots), then lowercase
    TokenStream chain = new LowerCaseFilter(new StandardFilter(source));
    // drop common French stop words
    chain = new StopFilter(chain, FrenchAnalyzer.getDefaultStopSet());
    // strip elided articles ([cdjlmnst-qu]') from tokens
    chain = new ElisionFilter(chain, FrenchAnalyzer.DEFAULT_ARTICLES);
    if (snowballUsed) {
        // Stem BEFORE folding accents: the Snowball stemmer relies on
        // accented input to unify singular/plural, gender and conjugations.
        chain = new SnowballFilter(chain, stemmer);
    }
    // finally fold accents away
    chain = new ASCIIFoldingFilter(chain);
    return new TokenStreamComponents(source, chain);
}

From source file:org.tallison.lucene.search.concordance.ConcordanceTestBase.java

License:Apache License

/**
 * Builds a test Analyzer producing CJK bigrams over a StandardTokenizer.
 * The flag value 15 presumably enables all four CJK scripts
 * (HAN | HIRAGANA | KATAKANA | HANGUL = 1|2|4|8) — confirm against
 * CJKBigramFilter's constants. Both the position-increment gap and the
 * offset gap between multi-valued fields are fixed at 10.
 *
 * @param outputUnigrams whether unigrams are emitted alongside bigrams
 */
public static Analyzer getCJKBigramAnalyzer(final boolean outputUnigrams) {
    return new Analyzer() {

        @Override
        public TokenStreamComponents createComponents(String fieldName) {
            final Tokenizer source = new StandardTokenizer();
            return new TokenStreamComponents(source,
                    new CJKBigramFilter(source, 15, outputUnigrams));
        }

        @Override
        public int getPositionIncrementGap(String fieldName) {
            return 10;
        }

        @Override
        public int getOffsetGap(String fieldName) {
            return 10;
        }
    };
}

From source file:org.wltea.analyzer.sample.LuceneTokenizerDemo.java

License:Apache License

/**
 * ?StandardTokenizer/*  w w w .  ja  v a  2s  . c  o  m*/
 */
public void testST() {
    Tokenizer tokenizer = new StandardTokenizer();
    try {
        tokenizer.setReader(new StringReader(
                "?????IKAnalyer can analysis english text too"));
    } catch (IOException e) {
        throw new RuntimeException();
    }
    TokenStreamComponents tsc = new TokenStreamComponents(tokenizer);
    TokenStream ts = tsc.getTokenStream();
    OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    TypeAttribute type = ts.addAttribute(TypeAttribute.class);
    try {
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println(term.toString() + "->" + offset.startOffset() + "-" + offset.endOffset() + "->"
                    + type.type());
        }
        ts.end();
    } catch (IOException e) {
        throw new RuntimeException();
    }
}

From source file:ri.AnalyzerNuevo.java

/**
 * Builds the English analysis chain: StandardTokenizer -> StandardFilter ->
 * EnglishPossessiveFilter -> ASCIIFoldingFilter -> LowerCaseFilter ->
 * StopFilter(stopwords, case-insensitive) -> PorterStemFilter.
 *
 * @param string the FIELD NAME this chain is built for — Lucene passes the
 *               field name to createComponents, never the text to analyze
 */
@Override
protected TokenStreamComponents createComponents(String string) {

    final Tokenizer source = new StandardTokenizer();
    // BUG FIX: the original called source.setReader(new StringReader(string)),
    // wrapping the field NAME as if it were document text. Analyzer itself
    // supplies the real Reader to the tokenizer when a stream is requested,
    // so createComponents must not call setReader at all.

    TokenStream pipeline = source;
    pipeline = new StandardFilter(pipeline);
    // strip English possessive 's endings
    pipeline = new EnglishPossessiveFilter(pipeline);
    // fold accented characters to their ASCII equivalents, then lowercase
    pipeline = new ASCIIFoldingFilter(pipeline);
    pipeline = new LowerCaseFilter(pipeline);
    // remove stop words; 'true' makes the set match case-insensitively
    pipeline = new StopFilter(pipeline, new CharArraySet(stopwords, true));
    // Porter stemming for English
    pipeline = new PorterStemFilter(pipeline);

    return new TokenStreamComponents(source, pipeline);
}