List of usage examples for org.apache.lucene.analysis TokenStream reset
public void reset() throws IOException
From source file:nl.inl.blacklab.analysis.BLDutchAnalyzer.java
License:Apache License
@Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { try {//w w w . j av a2 s . co m Tokenizer source = new BLDutchTokenizer(reader); source.reset(); TokenStream filter = new BLDutchTokenFilter(source); filter.reset(); boolean caseSensitive = ComplexFieldUtil.isCaseSensitive(fieldName); if (!caseSensitive) { filter = new LowerCaseFilter(Version.LUCENE_42, filter);// lowercase all filter.reset(); } boolean diacSensitive = ComplexFieldUtil.isDiacriticsSensitive(fieldName); if (!diacSensitive) { filter = new RemoveAllAccentsFilter(filter); // remove accents filter.reset(); } return new TokenStreamComponents(source, filter); } catch (IOException e) { throw new RuntimeException(e); } }
From source file:nl.inl.blacklab.analysis.BLLatinAnalyzer.java
License:Apache License
@Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { try {/*from w w w .ja va2 s .c om*/ Tokenizer source = new StandardTokenizerFactory().create(reader); source.reset(); TokenStream filter = null; if (!ComplexFieldUtil.isAlternative(fieldName, "s")) // not case- and accent-sensitive? { filter = new LowerCaseFilter(Version.LUCENE_42, source);// lowercase all filter.reset(); filter = new ASCIIFoldingFilter(filter); // remove accents filter.reset(); } return new TokenStreamComponents(source, filter); } catch (IOException e) { throw new RuntimeException(e); } }
From source file:nl.inl.blacklab.analysis.BLNonTokenizingAnalyzer.java
License:Apache License
@Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { try {//from w ww . j av a 2 s . c o m Tokenizer source = new BLNonTokenizer(reader); source.reset(); TokenStream filter = source; boolean caseSensitive = ComplexFieldUtil.isCaseSensitive(fieldName); if (!caseSensitive) { filter = new LowerCaseFilter(Version.LUCENE_42, filter);// lowercase all filter.reset(); } boolean diacSensitive = ComplexFieldUtil.isDiacriticsSensitive(fieldName); if (!diacSensitive) { filter = new RemoveAllAccentsFilter(filter); // remove accents filter.reset(); } return new TokenStreamComponents(source, filter); } catch (IOException e) { throw new RuntimeException(e); } }
From source file:nl.inl.blacklab.analysis.BLStandardAnalyzer.java
License:Apache License
@Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { try {/*from w ww. j a v a 2s . co m*/ Tokenizer source = new StandardTokenizerFactory().create(reader); source.reset(); TokenStream filter = source; boolean caseSensitive = ComplexFieldUtil.isCaseSensitive(fieldName); if (!caseSensitive) { filter = new LowerCaseFilter(Version.LUCENE_42, filter);// lowercase all filter.reset(); } boolean diacSensitive = ComplexFieldUtil.isDiacriticsSensitive(fieldName); if (!diacSensitive) { filter = new RemoveAllAccentsFilter(filter); // remove accents filter.reset(); } if (!(caseSensitive && diacSensitive)) { // Is this necessary and does it do what we want? // e.g. do we want "zon" to ever match "zo'n"? Or are there examples // where this is useful/required? filter = new RemovePunctuationFilter(filter); // remove punctuation filter.reset(); } return new TokenStreamComponents(source, filter); } catch (IOException e) { throw new RuntimeException(e); } }
From source file:nl.inl.blacklab.analysis.BLWhitespaceAnalyzer.java
License:Apache License
@Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { try {/*from w w w . j a v a 2 s . c o m*/ Tokenizer source = new WhitespaceTokenizer(Version.LUCENE_42, reader); source.reset(); TokenStream filter = source; boolean caseSensitive = ComplexFieldUtil.isCaseSensitive(fieldName); if (!caseSensitive) { filter = new LowerCaseFilter(Version.LUCENE_42, filter);// lowercase all filter.reset(); } boolean diacSensitive = ComplexFieldUtil.isDiacriticsSensitive(fieldName); if (!diacSensitive) { filter = new RemoveAllAccentsFilter(filter); // remove accents filter.reset(); } return new TokenStreamComponents(source, filter); } catch (IOException e) { throw new RuntimeException(e); } }
From source file:nl.inl.blacklab.analysis.TestBLDutchTokenFilter.java
License:Apache License
@Test public void testBasics() throws IOException { TokenStream ts = new StubTokenStream(new String[] { "hond", "a.u.b.", "bel(len)", "[pre]cursor", "zo'n", "'Hij", "zij'", "ex-man", "-" }); try {//from w ww . j a v a 2 s. c om ts = new BLDutchTokenFilter(ts); ts.reset(); CharTermAttribute ta = ts.addAttribute(CharTermAttribute.class); Assert.assertTrue(ts.incrementToken()); Assert.assertEquals("hond", new String(ta.buffer(), 0, ta.length())); Assert.assertTrue(ts.incrementToken()); Assert.assertEquals("aub", new String(ta.buffer(), 0, ta.length())); Assert.assertTrue(ts.incrementToken()); Assert.assertEquals("bellen", new String(ta.buffer(), 0, ta.length())); Assert.assertTrue(ts.incrementToken()); Assert.assertEquals("precursor", new String(ta.buffer(), 0, ta.length())); Assert.assertTrue(ts.incrementToken()); Assert.assertEquals("zo'n", new String(ta.buffer(), 0, ta.length())); Assert.assertTrue(ts.incrementToken()); Assert.assertEquals("Hij", new String(ta.buffer(), 0, ta.length())); Assert.assertTrue(ts.incrementToken()); Assert.assertEquals("zij", new String(ta.buffer(), 0, ta.length())); Assert.assertTrue(ts.incrementToken()); Assert.assertEquals("ex-man", new String(ta.buffer(), 0, ta.length())); Assert.assertFalse(ts.incrementToken()); } finally { ts.close(); } }
From source file:nl.inl.blacklab.analysis.TestBLDutchTokenizer.java
License:Apache License
@Test public void testBasics() throws IOException { Reader r = new StringReader("\"hond, a.u.b.: bl(len); \t [pre]cursor \t\nzo'n 'Hij zij' ex-man -"); TokenStream ts = new BLDutchTokenizer(r); ts.reset(); try {//from w ww . j av a2 s .c o m CharTermAttribute ta = ts.addAttribute(CharTermAttribute.class); Assert.assertTrue(ts.incrementToken()); Assert.assertEquals("hond", new String(ta.buffer(), 0, ta.length())); Assert.assertTrue(ts.incrementToken()); Assert.assertEquals("a.u.b.", new String(ta.buffer(), 0, ta.length())); Assert.assertTrue(ts.incrementToken()); Assert.assertEquals("bl(len)", new String(ta.buffer(), 0, ta.length())); Assert.assertTrue(ts.incrementToken()); Assert.assertEquals("[pre]cursor", new String(ta.buffer(), 0, ta.length())); Assert.assertTrue(ts.incrementToken()); Assert.assertEquals("zo'n", new String(ta.buffer(), 0, ta.length())); Assert.assertTrue(ts.incrementToken()); Assert.assertEquals("'Hij", new String(ta.buffer(), 0, ta.length())); Assert.assertTrue(ts.incrementToken()); Assert.assertEquals("zij'", new String(ta.buffer(), 0, ta.length())); Assert.assertTrue(ts.incrementToken()); Assert.assertEquals("ex-man", new String(ta.buffer(), 0, ta.length())); Assert.assertTrue(ts.incrementToken()); Assert.assertEquals("-", new String(ta.buffer(), 0, ta.length())); Assert.assertFalse(ts.incrementToken()); } finally { ts.close(); } }
From source file:nl.inl.blacklab.filter.TestRemoveAllAccentsFilter.java
License:Apache License
/**
 * Checks that {@code RemoveAllAccentsFilter} strips the accent from an accented token and
 * passes an unaccented token through unchanged.
 *
 * @throws IOException if the token stream fails
 */
@Test
public void testRetrieve() throws IOException {
    // "H\u00e9" is "He" with an acute accent on the e. It is written as a Unicode escape
    // so the test does not depend on the source file encoding; the accented character
    // had previously been lost, leaving an input that could never produce "He".
    TokenStream ts = new StubTokenStream(new String[] { "H\u00e9", "jij" });
    try {
        ts = new RemoveAllAccentsFilter(ts);
        ts.reset();
        CharTermAttribute ta = ts.addAttribute(CharTermAttribute.class);

        Assert.assertTrue(ts.incrementToken());
        Assert.assertEquals("He", new String(ta.buffer(), 0, ta.length()));

        Assert.assertTrue(ts.incrementToken());
        Assert.assertEquals("jij", new String(ta.buffer(), 0, ta.length()));

        Assert.assertFalse(ts.incrementToken());
    } finally {
        ts.close();
    }
}
From source file:nl.inl.blacklab.filter.TranscribeGermanAccentsSynonymFilter.java
License:Apache License
/**
 * Small manual demo: tokenizes a few place names on whitespace, runs them through
 * {@code TranscribeGermanAccentsSynonymFilter} and then {@code RemoveAllAccentsFilter},
 * and prints every resulting term on its own line.
 *
 * @param args ignored
 * @throws IOException if the token stream fails
 */
public static void main(String[] args) throws IOException {
    // Input: "Aachen Düsseldorf Köln Berlin Österreich". The umlauts are written as
    // Unicode escapes so the demo does not depend on the source file encoding; they had
    // previously been lost, leaving input with nothing for the filter to transcribe.
    TokenStream ts = new WhitespaceTokenizer(Version.LUCENE_42,
            new StringReader("Aachen D\u00fcsseldorf K\u00f6ln Berlin \u00d6sterreich"));
    try {
        ts = new TranscribeGermanAccentsSynonymFilter(ts);
        ts.reset();
        ts = new RemoveAllAccentsFilter(ts);
        ts.reset();

        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        while (ts.incrementToken()) {
            // Only the first length() chars of the buffer belong to the current term.
            System.out.println(new String(term.buffer(), 0, term.length()));
        }
    } finally {
        ts.close();
    }
}
From source file:nl.uva.lucenefacility.LuceneUtil.java
/**
 * Runs a string through an analyzer and collects the resulting terms.
 *
 * <p>The token stream is obtained with a {@code null} field name, consumed to the end,
 * then ended and closed, so it is released even when tokenizing fails part-way.
 *
 * @param analyzer analyzer used to tokenize the text
 * @param string   text to tokenize
 * @return the terms produced by the analyzer, in stream order
 * @throws RuntimeException wrapping any {@link IOException} raised by the token stream
 */
public static List<String> tokenizeString(Analyzer analyzer, String string) {
    List<String> result = new ArrayList<String>();
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(string));
        try {
            stream.reset();
            while (stream.incrementToken()) {
                result.add(stream.getAttribute(CharTermAttribute.class).toString());
            }
            // Signal end-of-stream before closing, per the TokenStream workflow.
            stream.end();
        } finally {
            // Previously the stream was never closed.
            stream.close();
        }
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    return result;
}