Example usage for org.apache.lucene.analysis TokenStream incrementToken

List of usage examples for org.apache.lucene.analysis TokenStream incrementToken

Introduction

This page collects usage examples for the incrementToken method of org.apache.lucene.analysis.TokenStream.

Prototype

public abstract boolean incrementToken() throws IOException;

Source Link

Document

Consumers (i.e., IndexWriter) use this method to advance the stream to the next token.

Usage

From source file:nl.inl.blacklab.filter.AbstractSynonymFilter.java

License:Apache License

/**
 * Demo entry point: splits the sentence "Dit is een test" on whitespace, passes the
 * tokens through an anonymous {@link AbstractSynonymFilter} that supplies synonyms for
 * "test" and "is", and prints every term the resulting stream produces.
 *
 * @param args command line arguments (not used)
 * @throws IOException if the token stream reports an I/O problem
 */
public static void main(String[] args) throws IOException {
    TokenStream ts = new WhitespaceTokenizer(Version.LUCENE_42, new StringReader("Dit is een test"));
    try {
        ts = new AbstractSynonymFilter(ts) {
            @Override
            public String[] getSynonyms(String s) {
                if (s.equals("test")) {
                    return new String[] { "testje" };
                }
                if (s.equals("is")) {
                    return new String[] { "zijn" };
                }
                return null;
            }
        };

        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        // The TokenStream consumer workflow requires reset() before the first incrementToken().
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println(new String(term.buffer(), 0, term.length()));
        }
        // Tell the chain the last token has been consumed before it is closed.
        ts.end();
    } finally {
        ts.close();
    }
}

From source file:nl.inl.blacklab.filter.TestRemoveAllAccentsFilter.java

License:Apache License

/**
 * Feeds two tokens through {@link RemoveAllAccentsFilter} and checks that the accented
 * token comes out without its accent, the plain token comes out unchanged, and the
 * stream is exhausted afterwards.
 */
@Test
public void testRetrieve() throws IOException {
    // First token ends in U+00E9 (e with acute accent). It is written as an escape so the
    // character cannot be lost when the file is re-encoded; without it the "He"
    // assertion below is unreachable.
    TokenStream ts = new StubTokenStream(new String[] { "H\u00e9", "jij" });
    try {
        ts = new RemoveAllAccentsFilter(ts);
        ts.reset();
        CharTermAttribute ta = ts.addAttribute(CharTermAttribute.class);
        Assert.assertTrue(ts.incrementToken());
        Assert.assertEquals("He", new String(ta.buffer(), 0, ta.length()));
        Assert.assertTrue(ts.incrementToken());
        Assert.assertEquals("jij", new String(ta.buffer(), 0, ta.length()));
        Assert.assertFalse(ts.incrementToken());
        // Complete the consumer workflow before the stream is closed.
        ts.end();
    } finally {
        ts.close();
    }
}

From source file:nl.inl.blacklab.filter.TestTranscribeGermanAccentsFilter.java

License:Apache License

/**
 * Feeds two tokens through {@link TranscribeGermanAccentsFilter} and checks that the
 * token containing an umlaut is transcribed ("Koeln"), the plain token is unchanged,
 * and the stream is exhausted afterwards.
 */
@Test
public void testRetrieve() throws IOException {
    // First token contains U+00F6 (o with diaeresis), written as an escape so the
    // character cannot be lost when the file is re-encoded; without it the "Koeln"
    // assertion below is unreachable.
    TokenStream ts = new StubTokenStream(new String[] { "K\u00f6ln", "Berlin" });
    try {
        ts = new TranscribeGermanAccentsFilter(ts);
        CharTermAttribute ta = ts.addAttribute(CharTermAttribute.class);
        // The TokenStream consumer workflow requires reset() before the first incrementToken().
        ts.reset();
        Assert.assertTrue(ts.incrementToken());
        Assert.assertEquals("Koeln", new String(ta.buffer(), 0, ta.length()));
        Assert.assertTrue(ts.incrementToken());
        Assert.assertEquals("Berlin", new String(ta.buffer(), 0, ta.length()));
        Assert.assertFalse(ts.incrementToken());
        ts.end();
    } finally {
        ts.close();
    }
}

From source file:nl.inl.blacklab.filter.TestTranscribeGermanAccentsSynonymFilter.java

License:Apache License

/**
 * Feeds two tokens through {@link TranscribeGermanAccentsSynonymFilter} and checks that
 * the token containing an umlaut is emitted first in its original form and then in its
 * transcribed form ("Koeln"), followed by the unchanged plain token.
 */
@Test
public void testRetrieve() throws IOException {
    // U+00F6 (o with diaeresis) is written as an escape so the character cannot be lost
    // when the file is re-encoded; without it no second "Koeln" token can be produced.
    TokenStream ts = new StubTokenStream(new String[] { "K\u00f6ln", "Berlin" });
    try {
        ts = new TranscribeGermanAccentsSynonymFilter(ts);
        CharTermAttribute ta = ts.addAttribute(CharTermAttribute.class);
        // The TokenStream consumer workflow requires reset() before the first incrementToken().
        ts.reset();
        Assert.assertTrue(ts.incrementToken());
        Assert.assertEquals("K\u00f6ln", new String(ta.buffer(), 0, ta.length()));
        Assert.assertTrue(ts.incrementToken());
        Assert.assertEquals("Koeln", new String(ta.buffer(), 0, ta.length()));
        Assert.assertTrue(ts.incrementToken());
        Assert.assertEquals("Berlin", new String(ta.buffer(), 0, ta.length()));
        Assert.assertFalse(ts.incrementToken());
        ts.end();
    } finally {
        ts.close();
    }
}

From source file:nl.inl.blacklab.filter.TranscribeGermanAccentsFilter.java

License:Apache License

/**
 * Test program/*ww w  . j a v a 2  s .  co  m*/
 * @param args
 * @throws IOException
 */
public static void main(String[] args) throws IOException {
    TokenStream ts = new WhitespaceTokenizer(Version.LUCENE_42,
            new StringReader("Aachen Dsseldorf Kln Berlin sterreich"));
    try {
        ts = new TranscribeGermanAccentsFilter(ts);

        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        while (ts.incrementToken()) {
            System.out.println(new String(term.buffer(), 0, term.length()));
        }
    } finally {
        ts.close();
    }
}

From source file:nl.inl.blacklab.filter.TranscribeGermanAccentsSynonymFilter.java

License:Apache License

/**
 * Test program: splits a sentence of German place names on whitespace, passes the
 * tokens through {@link TranscribeGermanAccentsSynonymFilter} followed by
 * {@link RemoveAllAccentsFilter}, and prints every resulting term.
 *
 * @param args command line arguments (not used)
 * @throws IOException if the token stream reports an I/O problem
 */
public static void main(String[] args) throws IOException {
    // Umlauts are written as escapes (U+00FC, U+00F6, U+00D6) so the demo input keeps
    // its accented characters regardless of the encoding the file is stored in.
    TokenStream ts = new WhitespaceTokenizer(Version.LUCENE_42,
            new StringReader("Aachen D\u00fcsseldorf K\u00f6ln Berlin \u00d6sterreich"));
    try {
        ts = new TranscribeGermanAccentsSynonymFilter(ts);
        ts = new RemoveAllAccentsFilter(ts);
        // Reset once, on the outermost filter only: the consumer contract is a single
        // reset() per pass, and resetting the inner filter separately would reset the
        // shared source twice.
        ts.reset();

        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        while (ts.incrementToken()) {
            System.out.println(new String(term.buffer(), 0, term.length()));
        }
        ts.end();
    } finally {
        ts.close();
    }
}

From source file:nl.inl.blacklab.index.BLDefaultAnalyzer.java

License:Apache License

/**
 * Test program: analyzes one test string with {@link BLDefaultAnalyzer} twice, once for
 * the field "test" and once for the property field derived from it via
 * {@code ComplexFieldUtil.propertyField("test", null, "s")}, printing every term.
 *
 * @param args command line arguments (not used)
 * @throws IOException if a token stream reports an I/O problem
 */
public static void main(String[] args) throws IOException {
    // NOTE(review): this literal looks like it has lost non-ASCII characters somewhere
    // along the way; it is kept exactly as found. Confirm against the original file.
    String testString = "H jij !  ?  ?. ]'      ??. ";

    Analyzer a = new BLDefaultAnalyzer();
    try {
        printAllTerms(a.tokenStream("test", new StringReader(testString)));
        // The first stream is closed inside printAllTerms before the next one is requested.
        printAllTerms(a.tokenStream(ComplexFieldUtil.propertyField("test", null, "s"),
                new StringReader(testString)));
    } finally {
        a.close();
    }
}

/** Runs the full reset / incrementToken / end / close workflow on {@code ts}, printing each term. */
private static void printAllTerms(TokenStream ts) throws IOException {
    try {
        CharTermAttribute ta = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println(new String(ta.buffer(), 0, ta.length()));
        }
        ts.end();
    } finally {
        ts.close();
    }
}

From source file:nl.inl.blacklab.index.complex.TokenStreamFromList.java

License:Apache License

/**
 * Test program: builds a {@link TokenStreamFromList} over the terms "a", "b", "c" (each
 * paired with the value 1), prints the three terms, and finally prints the result of
 * one further {@code incrementToken()} call.
 *
 * @param args command line arguments (not used)
 * @throws IOException if the token stream reports an I/O problem
 */
public static void main(String[] args) throws IOException {
    TokenStream s = new TokenStreamFromList(Arrays.asList("a", "b", "c"), Arrays.asList(1, 1, 1));
    try {
        CharTermAttribute term = s.addAttribute(CharTermAttribute.class);
        // The TokenStream consumer workflow requires reset() before the first incrementToken().
        s.reset();
        // Advance exactly three times, one per term supplied above.
        for (int i = 0; i < 3; i++) {
            s.incrementToken();
            System.out.println(new String(term.buffer(), 0, term.length()));
        }
        // One more advance; its boolean result is printed as-is.
        System.out.println(s.incrementToken());
        s.end();
    } finally {
        s.close();
    }
}

From source file:nl.uva.lucenefacility.LuceneUtil.java

/**
 * Runs {@code string} through {@code analyzer} and collects the text of every token.
 *
 * @param analyzer the analyzer that produces the token stream
 * @param string the text to tokenize
 * @return the token texts in the order the stream produced them
 * @throws RuntimeException wrapping any {@link IOException} raised while reading the stream
 */
public static List<String> tokenizeString(Analyzer analyzer, String string) {
    List<String> result = new ArrayList<String>();
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(string));
        try {
            // Fetch the attribute once; the same instance is updated in place per token.
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                result.add(term.toString());
            }
            stream.end();
        } finally {
            // Always release the stream, even if reading failed part-way.
            stream.close();
        }
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    return result;
}

From source file:nl.uva.sne.commons.SemanticUtils.java

/**
 * Normalizes {@code text} and splits it into tokens.
 *
 * <p>Normalization, in order: underscores become spaces, ASCII digits are removed, runs
 * of punctuation other than apostrophe and hyphen become a single space, trailing
 * contraction suffixes ('t, 'd, 's, 'm, 've, 're, 'll) are removed, and the text is
 * lower-cased. The result is read through {@code tokenStemStream} when {@code stem} is
 * true and through {@code tokenStream} otherwise.
 *
 * @param text the text to tokenize
 * @param stem whether to use the stemming token stream
 * @return the token texts in stream order
 * @throws IOException if the token stream reports an I/O problem
 * @throws JWNLException declared by the method's contract; not raised directly in this body
 */
public static List<String> tokenize(String text, boolean stem) throws IOException, JWNLException {
    // NOTE(review): the search pattern was empty in the source as found, which would
    // insert an apostrophe at every position of the text. The typographic apostrophe
    // (U+2019) is the presumed intended target; confirm against the original file.
    text = text.replace("\u2019", "'");

    text = text.replace('_', ' ');
    text = text.replaceAll("[0-9]", "");
    text = text.replaceAll("[\\p{Punct}&&[^'-]]+", " ");

    text = text.replaceAll("(?:'(?:[tdsm]|[vr]e|ll))+\\b", "");
    // Locale-independent lower-casing so results do not vary with the JVM default locale.
    text = text.toLowerCase(java.util.Locale.ROOT);

    TokenStream tokenStream;
    if (stem) {
        tokenStream = tokenStemStream("field", new StringReader(text));
    } else {
        tokenStream = tokenStream("field", new StringReader(text));
    }

    List<String> words = new ArrayList<>();
    try {
        CharTermAttribute term = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            words.add(term.toString());
        }
        tokenStream.end();
    } finally {
        tokenStream.close();
    }
    return words;
}