Example usage for org.apache.lucene.analysis TokenStream incrementToken

Introduction

On this page you can find example usages of org.apache.lucene.analysis TokenStream incrementToken.

Prototype

public abstract boolean incrementToken() throws IOException;

Source Link

Document

Consumers (e.g., IndexWriter) use this method to advance the stream to the next token.

Usage

From source file:nl.inl.blacklab.filter.AbstractSynonymFilter.java

License:Apache License

/**
 * Demo program: tokenizes a short Dutch sentence on whitespace, wraps the
 * tokenizer in an anonymous {@code AbstractSynonymFilter} that supplies
 * synonyms for "test" and "is", and prints every term the chain emits.
 *
 * @param args ignored
 * @throws IOException if the token stream fails while being consumed
 */
public static void main(String[] args) throws IOException {
    TokenStream ts = new WhitespaceTokenizer(Version.LUCENE_42, new StringReader("Dit is een test"));
    try {
        ts = new AbstractSynonymFilter(ts) {
            @Override
            public String[] getSynonyms(String s) {
                if (s.equals("test")) {
                    return new String[] { "testje" };
                }
                if (s.equals("is")) {
                    return new String[] { "zijn" };
                }
                return null;
            }
        };

        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        // The TokenStream consumer workflow is reset() -> incrementToken()* -> end() -> close();
        // reset() must be called on the outermost stream before the first incrementToken().
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println(term.toString());
        }
        ts.end();
    } finally {
        ts.close();
    }
}

From source file:nl.inl.blacklab.filter.TestRemoveAllAccentsFilter.java

License:Apache License

/**
 * Feeds two tokens through {@code RemoveAllAccentsFilter} and checks that the
 * accented token comes out as "He", the plain token is unchanged, and the
 * stream is exhausted afterwards.
 */
@Test
public void testRetrieve() throws IOException {
    // "H\u00E9" is "H" followed by e-acute; the escape keeps the test independent of the
    // source file encoding (the accented character is what the filter is expected to strip).
    TokenStream ts = new StubTokenStream(new String[] { "H\u00E9", "jij" });
    try {
        ts = new RemoveAllAccentsFilter(ts);
        ts.reset();
        CharTermAttribute ta = ts.addAttribute(CharTermAttribute.class);
        Assert.assertTrue(ts.incrementToken());
        Assert.assertEquals("He", new String(ta.buffer(), 0, ta.length()));
        Assert.assertTrue(ts.incrementToken());
        Assert.assertEquals("jij", new String(ta.buffer(), 0, ta.length()));
        Assert.assertFalse(ts.incrementToken());
    } finally {
        ts.close();
    }
}

From source file:nl.inl.blacklab.filter.TestTranscribeGermanAccentsFilter.java

License:Apache License

/**
 * Feeds two tokens through {@code TranscribeGermanAccentsFilter} and checks
 * that the token containing an o-umlaut is emitted as "Koeln", the plain
 * token is unchanged, and the stream is exhausted afterwards.
 */
@Test
public void testRetrieve() throws IOException {
    // "K\u00F6ln" is "Koln" with o-umlaut; written as an escape so the umlaut the
    // filter must transcribe cannot be lost to a source-encoding mismatch.
    TokenStream ts = new StubTokenStream(new String[] { "K\u00F6ln", "Berlin" });
    try {
        ts = new TranscribeGermanAccentsFilter(ts);
        // reset() is required by the TokenStream contract before incrementToken().
        ts.reset();
        CharTermAttribute ta = ts.addAttribute(CharTermAttribute.class);
        Assert.assertTrue(ts.incrementToken());
        Assert.assertEquals("Koeln", new String(ta.buffer(), 0, ta.length()));
        Assert.assertTrue(ts.incrementToken());
        Assert.assertEquals("Berlin", new String(ta.buffer(), 0, ta.length()));
        Assert.assertFalse(ts.incrementToken());
    } finally {
        ts.close();
    }
}

From source file:nl.inl.blacklab.filter.TestTranscribeGermanAccentsSynonymFilter.java

License:Apache License

/**
 * Feeds two tokens through {@code TranscribeGermanAccentsSynonymFilter} and
 * checks that the umlaut token is emitted first in its original form and then
 * as the transcription "Koeln", followed by the unchanged plain token.
 */
@Test
public void testRetrieve() throws IOException {
    // "K\u00F6ln" is "Koln" with o-umlaut; escapes keep both the input and the expected
    // original-form token intact regardless of source file encoding.
    TokenStream ts = new StubTokenStream(new String[] { "K\u00F6ln", "Berlin" });
    try {
        ts = new TranscribeGermanAccentsSynonymFilter(ts);
        // reset() is required by the TokenStream contract before incrementToken().
        ts.reset();
        CharTermAttribute ta = ts.addAttribute(CharTermAttribute.class);
        Assert.assertTrue(ts.incrementToken());
        Assert.assertEquals("K\u00F6ln", new String(ta.buffer(), 0, ta.length()));
        Assert.assertTrue(ts.incrementToken());
        Assert.assertEquals("Koeln", new String(ta.buffer(), 0, ta.length()));
        Assert.assertTrue(ts.incrementToken());
        Assert.assertEquals("Berlin", new String(ta.buffer(), 0, ta.length()));
        Assert.assertFalse(ts.incrementToken());
    } finally {
        ts.close();
    }
}

From source file:nl.inl.blacklab.filter.TranscribeGermanAccentsFilter.java

License:Apache License

/**
 * Test program/*ww w  . j a v a 2  s .  co  m*/
 * @param args
 * @throws IOException
 */
public static void main(String[] args) throws IOException {
    TokenStream ts = new WhitespaceTokenizer(Version.LUCENE_42,
            new StringReader("Aachen Dsseldorf Kln Berlin sterreich"));
    try {
        ts = new TranscribeGermanAccentsFilter(ts);

        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        while (ts.incrementToken()) {
            System.out.println(new String(term.buffer(), 0, term.length()));
        }
    } finally {
        ts.close();
    }
}

From source file:nl.inl.blacklab.filter.TranscribeGermanAccentsSynonymFilter.java

License:Apache License

/**
 * Test program: tokenizes a list of German place names on whitespace, runs
 * them through {@code TranscribeGermanAccentsSynonymFilter} followed by
 * {@code RemoveAllAccentsFilter}, and prints each resulting term.
 *
 * @param args ignored
 * @throws IOException if the token stream fails while being consumed
 */
public static void main(String[] args) throws IOException {
    // Umlauts are written as Unicode escapes (u-umlaut, o-umlaut, capital O-umlaut) so the
    // characters the filters operate on survive any source-encoding mismatch.
    TokenStream ts = new WhitespaceTokenizer(Version.LUCENE_42,
            new StringReader("Aachen D\u00FCsseldorf K\u00F6ln Berlin \u00D6sterreich"));
    try {
        // Build the complete filter chain first, then reset it once: TokenFilter.reset()
        // forwards to the wrapped stream, so resetting the inner filter separately would
        // reset the underlying tokenizer twice.
        ts = new TranscribeGermanAccentsSynonymFilter(ts);
        ts = new RemoveAllAccentsFilter(ts);

        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println(term.toString());
        }
        ts.end();
    } finally {
        ts.close();
    }
}

From source file:nl.inl.blacklab.index.BLDefaultAnalyzer.java

License:Apache License

/**
 * Test program: analyzes one sample string with {@code BLDefaultAnalyzer}
 * twice, once for the field "test" and once for the property field derived
 * from it via {@code ComplexFieldUtil.propertyField("test", null, "s")},
 * printing every term produced.
 *
 * @param args ignored
 * @throws IOException if a token stream fails while being consumed
 */
public static void main(String[] args) throws IOException {
    // NOTE(review): this sample text looks mis-encoded (non-ASCII characters appear to have
    // been dropped or replaced by '?'); restore it from the original source if available.
    String testStr = "H jij !  ?  ?. ]'      ??. ";

    Analyzer a = new BLDefaultAnalyzer();
    try {
        String[] fieldNames = { "test", ComplexFieldUtil.propertyField("test", null, "s") };
        for (String fieldName : fieldNames) {
            TokenStream ts = a.tokenStream(fieldName, new StringReader(testStr));
            try {
                CharTermAttribute ta = ts.addAttribute(CharTermAttribute.class);
                // Consumer workflow: reset() -> incrementToken()* -> end() -> close().
                ts.reset();
                while (ts.incrementToken()) {
                    System.out.println(ta.toString());
                }
                ts.end();
            } finally {
                // Each stream is closed before the analyzer is asked for the next one.
                ts.close();
            }
        }
    } finally {
        a.close();
    }
}

From source file:nl.inl.blacklab.index.complex.TokenStreamFromList.java

License:Apache License

/**
 * Test program: builds a {@code TokenStreamFromList} over the tokens
 * "a", "b", "c", advances it three times printing the current term after
 * each step, then prints the boolean result of a fourth advance.
 *
 * @param args ignored
 * @throws IOException if the token stream fails while being consumed
 */
public static void main(String[] args) throws IOException {
    TokenStream s = new TokenStreamFromList(Arrays.asList("a", "b", "c"), Arrays.asList(1, 1, 1));
    try {
        CharTermAttribute term = s.addAttribute(CharTermAttribute.class);
        // reset() is required by the TokenStream contract before incrementToken().
        s.reset();
        for (int i = 0; i < 3; i++) {
            s.incrementToken();
            System.out.println(term.toString());
        }
        // Fourth advance: prints whether the stream still had a token left.
        System.out.println(s.incrementToken());
        s.end();
    } finally {
        s.close();
    }
}

From source file:nl.uva.lucenefacility.LuceneUtil.java

/**
 * Runs {@code string} through {@code analyzer} and collects the text of every
 * emitted term, in order.
 *
 * @param analyzer the analyzer used to produce the token stream
 * @param string   the text to tokenize
 * @return the term texts produced by the analyzer
 * @throws RuntimeException wrapping any {@link IOException} raised while
 *         consuming or closing the stream
 */
public static List<String> tokenizeString(Analyzer analyzer, String string) {
    List<String> result = new ArrayList<String>();
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(string));
        try {
            // Look the attribute up once; the same instance is updated on every incrementToken().
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                result.add(term.toString());
            }
            stream.end();
        } finally {
            // Always release the stream so the analyzer can hand it out again.
            stream.close();
        }
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    return result;
}

From source file:nl.uva.sne.commons.SemanticUtils.java

/**
 * Normalizes {@code text} and splits it into tokens.
 *
 * <p>Normalization steps, in order: typographic apostrophes become ASCII
 * apostrophes, underscores become spaces, digits are removed, runs of
 * punctuation other than {@code '} and {@code -} become a single space,
 * trailing English contractions ({@code 't}, {@code 's}, {@code 've},
 * {@code 'll}, ...) are removed, and the text is lower-cased. The result is
 * fed to {@code tokenStemStream} when {@code stem} is true, otherwise to
 * {@code tokenStream}.
 *
 * @param text the text to tokenize
 * @param stem whether to use the stemming token stream
 * @return the term texts produced by the chosen token stream, in order
 * @throws IOException   if the token stream fails while being consumed
 * @throws JWNLException declared by this method; not thrown directly by the code shown here
 */
public static List<String> tokenize(String text, boolean stem) throws IOException, JWNLException {
    // Map the typographic apostrophe (U+2019) to the ASCII one so the contraction pattern
    // below can match it. A literal replace is used on purpose: an empty regex passed to
    // replaceAll would match between every character and insert an apostrophe at each position.
    text = text.replace("\u2019", "'");

    text = text.replaceAll("_", " ");
    text = text.replaceAll("[0-9]", "");
    text = text.replaceAll("[\\p{Punct}&&[^'-]]+", " ");

    text = text.replaceAll("(?:'(?:[tdsm]|[vr]e|ll))+\\b", "");
    text = text.toLowerCase();

    TokenStream tokenStream;
    if (stem) {
        tokenStream = tokenStemStream("field", new StringReader(text));
    } else {
        tokenStream = tokenStream("field", new StringReader(text));
    }

    List<String> words = new ArrayList<>();
    try {
        CharTermAttribute term = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            words.add(term.toString());
        }
        tokenStream.end();
    } finally {
        tokenStream.close();
    }
    return words;
}