List of usage examples for org.apache.lucene.analysis TokenStream incrementToken
/**
 * Advances this stream to its next token.
 *
 * <p>As the usage examples in this listing show, consumers call this method
 * repeatedly (typically as a {@code while} condition) and read the current
 * token through a previously obtained attribute such as
 * {@code CharTermAttribute} after each call that returns {@code true}.
 *
 * @return {@code true} while a token is available; {@code false} once the
 *         stream is exhausted
 * @throws IOException declared for implementations that read from an underlying source
 */
public abstract boolean incrementToken() throws IOException;
From source file:nl.inl.blacklab.filter.AbstractSynonymFilter.java
License:Apache License
/**
 * Demo entry point: tokenizes a short Dutch sentence on whitespace, runs it
 * through an anonymous {@link AbstractSynonymFilter} that supplies synonyms
 * for "test" and "is", and prints every token that comes out.
 *
 * @param args ignored
 * @throws IOException if the token stream fails while being consumed
 */
public static void main(String[] args) throws IOException {
    TokenStream ts = new WhitespaceTokenizer(Version.LUCENE_42, new StringReader("Dit is een test"));
    try {
        ts = new AbstractSynonymFilter(ts) {
            @Override
            public String[] getSynonyms(String s) {
                if (s.equals("test")) {
                    return new String[] { "testje" };
                }
                if (s.equals("is")) {
                    return new String[] { "zijn" };
                }
                return null; // no synonyms for any other token
            }
        };
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        // The TokenStream consumer contract requires reset() before the
        // first incrementToken() call and end() after the last one.
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println(new String(term.buffer(), 0, term.length()));
        }
        ts.end();
    } finally {
        ts.close();
    }
}
From source file:nl.inl.blacklab.filter.TestRemoveAllAccentsFilter.java
License:Apache License
@Test public void testRetrieve() throws IOException { TokenStream ts = new StubTokenStream(new String[] { "H", "jij" }); try {// w w w . ja v a2 s . co m ts = new RemoveAllAccentsFilter(ts); ts.reset(); CharTermAttribute ta = ts.addAttribute(CharTermAttribute.class); Assert.assertTrue(ts.incrementToken()); Assert.assertEquals("He", new String(ta.buffer(), 0, ta.length())); Assert.assertTrue(ts.incrementToken()); Assert.assertEquals("jij", new String(ta.buffer(), 0, ta.length())); Assert.assertFalse(ts.incrementToken()); } finally { ts.close(); } }
From source file:nl.inl.blacklab.filter.TestTranscribeGermanAccentsFilter.java
License:Apache License
/**
 * Checks that {@link TranscribeGermanAccentsFilter} transcribes an umlaut
 * (o-umlaut becomes "oe") and leaves a token without umlauts untouched.
 *
 * @throws IOException if the token stream fails
 */
@Test
public void testRetrieve() throws IOException {
    // The first token must contain an o-umlaut for "Koeln" to be a possible
    // result; the character had been dropped by an encoding error, so it is
    // restored here as a Unicode escape.
    TokenStream ts = new StubTokenStream(new String[] { "K\u00f6ln", "Berlin" });
    try {
        ts = new TranscribeGermanAccentsFilter(ts);
        CharTermAttribute ta = ts.addAttribute(CharTermAttribute.class);
        // reset() is required by the TokenStream contract before consuming.
        ts.reset();

        Assert.assertTrue(ts.incrementToken());
        Assert.assertEquals("Koeln", new String(ta.buffer(), 0, ta.length()));

        Assert.assertTrue(ts.incrementToken());
        Assert.assertEquals("Berlin", new String(ta.buffer(), 0, ta.length()));

        // Both input tokens consumed: the stream must now be exhausted.
        Assert.assertFalse(ts.incrementToken());
    } finally {
        ts.close();
    }
}
From source file:nl.inl.blacklab.filter.TestTranscribeGermanAccentsSynonymFilter.java
License:Apache License
@Test public void testRetrieve() throws IOException { TokenStream ts = new StubTokenStream(new String[] { "Kln", "Berlin" }); try {//from w w w. j av a 2s . c om ts = new TranscribeGermanAccentsSynonymFilter(ts); CharTermAttribute ta = ts.addAttribute(CharTermAttribute.class); Assert.assertTrue(ts.incrementToken()); Assert.assertEquals("Kln", new String(ta.buffer(), 0, ta.length())); Assert.assertTrue(ts.incrementToken()); Assert.assertEquals("Koeln", new String(ta.buffer(), 0, ta.length())); Assert.assertTrue(ts.incrementToken()); Assert.assertEquals("Berlin", new String(ta.buffer(), 0, ta.length())); Assert.assertFalse(ts.incrementToken()); } finally { ts.close(); } }
From source file:nl.inl.blacklab.filter.TranscribeGermanAccentsFilter.java
License:Apache License
/** * Test program/*ww w . j a v a 2 s . co m*/ * @param args * @throws IOException */ public static void main(String[] args) throws IOException { TokenStream ts = new WhitespaceTokenizer(Version.LUCENE_42, new StringReader("Aachen Dsseldorf Kln Berlin sterreich")); try { ts = new TranscribeGermanAccentsFilter(ts); CharTermAttribute term = ts.addAttribute(CharTermAttribute.class); while (ts.incrementToken()) { System.out.println(new String(term.buffer(), 0, term.length())); } } finally { ts.close(); } }
From source file:nl.inl.blacklab.filter.TranscribeGermanAccentsSynonymFilter.java
License:Apache License
/**
 * Test program: tokenizes a few German place names on whitespace, adds
 * umlaut-transcribed synonyms via {@link TranscribeGermanAccentsSynonymFilter},
 * strips remaining accents via {@link RemoveAllAccentsFilter}, and prints
 * each resulting token on its own line.
 *
 * @param args ignored
 * @throws IOException if the token stream fails while being consumed
 */
public static void main(String[] args) throws IOException {
    // The umlauts in the sample text had been stripped by an encoding error,
    // which left the filters nothing to do. They are restored as Unicode
    // escapes so the demo does not depend on the source encoding.
    TokenStream ts = new WhitespaceTokenizer(Version.LUCENE_42,
            new StringReader("Aachen D\u00fcsseldorf K\u00f6ln Berlin \u00d6sterreich"));
    try {
        // Build the complete filter chain first ...
        ts = new TranscribeGermanAccentsSynonymFilter(ts);
        ts = new RemoveAllAccentsFilter(ts);
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);

        // ... then reset exactly once, on the outermost stream. A filter's
        // reset() propagates to its input, so resetting each layer
        // separately resets the underlying tokenizer more than once, which
        // violates the TokenStream contract.
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println(new String(term.buffer(), 0, term.length()));
        }
        ts.end();
    } finally {
        ts.close();
    }
}
From source file:nl.inl.blacklab.index.BLDefaultAnalyzer.java
License:Apache License
/**
 * Test program: analyzes the same sample text with {@link BLDefaultAnalyzer}
 * for two different field names (the plain field "test" and the property
 * field derived from it with alternative "s") and prints every token.
 *
 * @param args ignored
 * @throws IOException if a token stream fails while being consumed
 */
public static void main(String[] args) throws IOException {
    // NOTE(review): this literal appears to have lost non-ASCII characters in
    // an encoding round-trip; it is reproduced unchanged because the original
    // characters cannot be recovered from here.
    String testInput = "H jij ! ? ?. ]' ??. ";
    Analyzer a = new BLDefaultAnalyzer();
    try {
        // Each stream is reset before use and ended/closed afterwards, so the
        // first one is fully released before the analyzer hands out the next.
        TokenStream ts = a.tokenStream("test", new StringReader(testInput));
        try {
            CharTermAttribute ta = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                System.out.println(new String(ta.buffer(), 0, ta.length()));
            }
            ts.end();
        } finally {
            ts.close();
        }

        TokenStream ts2 = a.tokenStream(ComplexFieldUtil.propertyField("test", null, "s"),
                new StringReader(testInput));
        try {
            CharTermAttribute ta2 = ts2.addAttribute(CharTermAttribute.class);
            ts2.reset();
            while (ts2.incrementToken()) {
                System.out.println(new String(ta2.buffer(), 0, ta2.length()));
            }
            ts2.end();
        } finally {
            ts2.close();
        }
    } finally {
        a.close();
    }
}
From source file:nl.inl.blacklab.index.complex.TokenStreamFromList.java
License:Apache License
/**
 * Test program: builds a {@link TokenStreamFromList} from three tokens
 * ("a", "b", "c") with position increments of 1, prints each token, and
 * finally prints the result of one further {@code incrementToken()} call.
 *
 * @param args ignored
 * @throws IOException if the token stream fails while being consumed
 */
public static void main(String[] args) throws IOException {
    TokenStream s = new TokenStreamFromList(Arrays.asList("a", "b", "c"), Arrays.asList(1, 1, 1));
    try {
        CharTermAttribute term = s.addAttribute(CharTermAttribute.class);
        // The TokenStream consumer contract requires reset() before the
        // first incrementToken() call and end() after the last one.
        s.reset();

        // Three tokens were supplied, so advance and print exactly three times.
        for (int i = 0; i < 3; i++) {
            s.incrementToken();
            System.out.println(new String(term.buffer(), 0, term.length()));
        }

        // One more call: prints whether the stream still had a token.
        System.out.println(s.incrementToken());
        s.end();
    } finally {
        s.close();
    }
}
From source file:nl.uva.lucenefacility.LuceneUtil.java
public static List<String> tokenizeString(Analyzer analyzer, String string) { List<String> result = new ArrayList<String>(); try {//from www .java2 s . co m TokenStream stream = analyzer.tokenStream(null, new StringReader(string)); stream.reset(); while (stream.incrementToken()) { result.add(stream.getAttribute(CharTermAttribute.class).toString()); } } catch (IOException e) { throw new RuntimeException(e); } return result; }
From source file:nl.uva.sne.commons.SemanticUtils.java
public static List<String> tokenize(String text, boolean stem) throws IOException, JWNLException { text = text.replaceAll("", "'"); text = text.replaceAll("_", " "); text = text.replaceAll("[0-9]", ""); text = text.replaceAll("[\\p{Punct}&&[^'-]]+", " "); text = text.replaceAll("(?:'(?:[tdsm]|[vr]e|ll))+\\b", ""); text = text.toLowerCase();/*from ww w.jav a2s. c o m*/ TokenStream tokenStream; if (stem) { tokenStream = tokenStemStream("field", new StringReader(text)); } else { tokenStream = tokenStream("field", new StringReader(text)); } ArrayList<String> words = new ArrayList<>(); try { CharTermAttribute term = tokenStream.addAttribute(CharTermAttribute.class); tokenStream.reset(); while (tokenStream.incrementToken()) { words.add(term.toString()); } tokenStream.end(); } finally { tokenStream.close(); } // Logger.getLogger(SemanticUtils.class.getName()).log(Level.INFO, "Returning {0}:", words.size() + " tokens"); return words; }