List of usage examples for the org.apache.lucene.analysis.standard.StandardTokenizerFactory constructor StandardTokenizerFactory()
StandardTokenizerFactory
From source file:nl.inl.blacklab.analysis.BLLatinAnalyzer.java
License:Apache License
@Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { try {/*from w w w .j a v a 2s.co m*/ Tokenizer source = new StandardTokenizerFactory().create(reader); source.reset(); TokenStream filter = null; if (!ComplexFieldUtil.isAlternative(fieldName, "s")) // not case- and accent-sensitive? { filter = new LowerCaseFilter(Version.LUCENE_42, source);// lowercase all filter.reset(); filter = new ASCIIFoldingFilter(filter); // remove accents filter.reset(); } return new TokenStreamComponents(source, filter); } catch (IOException e) { throw new RuntimeException(e); } }
From source file:nl.inl.blacklab.analysis.BLStandardAnalyzer.java
License:Apache License
@Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { try {//from www. ja v a2 s . co m Tokenizer source = new StandardTokenizerFactory().create(reader); source.reset(); TokenStream filter = source; boolean caseSensitive = ComplexFieldUtil.isCaseSensitive(fieldName); if (!caseSensitive) { filter = new LowerCaseFilter(Version.LUCENE_42, filter);// lowercase all filter.reset(); } boolean diacSensitive = ComplexFieldUtil.isDiacriticsSensitive(fieldName); if (!diacSensitive) { filter = new RemoveAllAccentsFilter(filter); // remove accents filter.reset(); } if (!(caseSensitive && diacSensitive)) { // Is this necessary and does it do what we want? // e.g. do we want "zon" to ever match "zo'n"? Or are there examples // where this is useful/required? filter = new RemovePunctuationFilter(filter); // remove punctuation filter.reset(); } return new TokenStreamComponents(source, filter); } catch (IOException e) { throw new RuntimeException(e); } }