Example usage for org.apache.lucene.analysis.standard StandardTokenizerFactory StandardTokenizerFactory

List of usage examples for org.apache.lucene.analysis.standard.StandardTokenizerFactory

Introduction

On this page you can find example usages of org.apache.lucene.analysis.standard.StandardTokenizerFactory.

Prototype

StandardTokenizerFactory

Source Link

Usage

From source file:nl.inl.blacklab.analysis.BLLatinAnalyzer.java

License:Apache License

/**
 * Builds the analysis chain for a field: a standard tokenizer, optionally followed by
 * lowercasing and ASCII folding.
 *
 * <p>The lowercase and ASCII-folding filters are added unless
 * {@code ComplexFieldUtil.isAlternative(fieldName, "s")} is true. In that case the
 * tokenizer itself is used as the end of the chain.
 *
 * @param fieldName name of the field being analyzed; decides whether the filters are applied
 * @param reader the character source handed to the tokenizer
 * @return the tokenizer together with the last stream of the chain (never a null stream)
 * @throws RuntimeException wrapping any {@link IOException} raised while resetting the streams
 */
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    try {
        Tokenizer source = new StandardTokenizerFactory().create(reader);
        // NOTE(review): reset() is normally the consumer's responsibility; confirm these
        // eager resets are still wanted for the Lucene version in use.
        source.reset();
        // Start the chain at the tokenizer so the sensitive alternative returns a usable
        // stream instead of null.
        TokenStream filter = source;
        if (!ComplexFieldUtil.isAlternative(fieldName, "s")) { // not case- and accent-sensitive?
            filter = new LowerCaseFilter(Version.LUCENE_42, filter); // lowercase all
            filter.reset();
            filter = new ASCIIFoldingFilter(filter); // remove accents
            filter.reset();
        }
        return new TokenStreamComponents(source, filter);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}

From source file:nl.inl.blacklab.analysis.BLStandardAnalyzer.java

License:Apache License

/**
 * Assembles the token stream chain for a field, starting from a standard tokenizer.
 *
 * <p>Filters are stacked depending on what {@code ComplexFieldUtil} reports for the field:
 * <ul>
 *   <li>not case-sensitive: a {@code LowerCaseFilter} is added;</li>
 *   <li>not diacritics-sensitive: a {@code RemoveAllAccentsFilter} is added;</li>
 *   <li>insensitive in at least one of those respects: a {@code RemovePunctuationFilter}
 *       is added last.</li>
 * </ul>
 *
 * @param fieldName name of the field being analyzed; drives the sensitivity checks
 * @param reader the character source handed to the tokenizer
 * @return the tokenizer paired with the outermost stream of the chain
 * @throws RuntimeException wrapping any {@link IOException} raised while resetting the streams
 */
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    try {
        final Tokenizer tokenizer = new StandardTokenizerFactory().create(reader);
        tokenizer.reset();

        // The chain begins as the bare tokenizer and is wrapped step by step.
        TokenStream chain = tokenizer;

        final boolean keepCase = ComplexFieldUtil.isCaseSensitive(fieldName);
        if (!keepCase) {
            // lowercase all
            chain = new LowerCaseFilter(Version.LUCENE_42, chain);
            chain.reset();
        }

        final boolean keepDiacritics = ComplexFieldUtil.isDiacriticsSensitive(fieldName);
        if (!keepDiacritics) {
            // remove accents
            chain = new RemoveAllAccentsFilter(chain);
            chain.reset();
        }

        if (!keepCase || !keepDiacritics) {
            // Open question carried over from the original author: is this necessary and
            // does it do what we want? E.g. should "zon" ever match "zo'n", or are there
            // examples where this is useful/required?
            chain = new RemovePunctuationFilter(chain); // remove punctuation
            chain.reset();
        }

        return new TokenStreamComponents(tokenizer, chain);
    } catch (IOException ioe) {
        throw new RuntimeException(ioe);
    }
}