Example usage for org.apache.lucene.analysis TokenStream end

List of usage examples for org.apache.lucene.analysis TokenStream end

Introduction

On this page you can find example usage for org.apache.lucene.analysis TokenStream end.

Prototype

public void end() throws IOException 

Document

This method is called by the consumer after the last token has been consumed, i.e. after incrementToken() has returned false (using the new TokenStream API).
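Taken together with reset() and close(), the full consumer lifecycle is: reset(), then incrementToken() until it returns false, then end(), then close(). The sketch below shows that lifecycle end to end; the StandardAnalyzer, the field name "body", and the sample text are chosen purely for illustration:

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class TokenStreamEndSketch {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        // try-with-resources guarantees close() runs even if consumption fails
        try (TokenStream ts = analyzer.tokenStream("body", "hello token stream")) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
            ts.reset();                     // mandatory before the first incrementToken()
            while (ts.incrementToken()) {   // consume every token
                System.out.println(term.toString());
            }
            ts.end();                       // end-of-stream operations, e.g. set the final offset
            System.out.println("final offset: " + offset.endOffset());
        }
        analyzer.close();
    }
}

Several of the examples below achieve the same effect with an explicit finally block instead of try-with-resources.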

Usage

From source file:org.tallison.lucene.search.concordance.TestBigramFilter.java

License:Apache License

@Test
public void testIncludeUnigrams() throws Exception {
    List<String> expected = Arrays.asList(
            new String[] { "a", "a_b", "b", "b_c", "c", "c_d", "d", "d_e", "e", "e_f", "f", "f_g", "g", });
    Analyzer analyzer = ConcordanceTestBase.getBigramAnalyzer(MockTokenFilter.EMPTY_STOPSET, 10, 10, true);

    String s = "a b c d e f g";
    TokenStream tokenStream = analyzer.tokenStream("f", s);
    tokenStream.reset();
    CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncAttribute = tokenStream.getAttribute(PositionIncrementAttribute.class);

    List<String> returned = new ArrayList<>();
    int i = 0;
    while (tokenStream.incrementToken()) {
        String token = charTermAttribute.toString();
        if (i++ % 2 == 0) {
            assertEquals(1, posIncAttribute.getPositionIncrement());
        } else {
            assertEquals(0, posIncAttribute.getPositionIncrement());
        }
        returned.add(token);
    }
    tokenStream.end();
    tokenStream.close();
    assertEquals(expected, returned);
}

From source file:org.tallison.lucene.search.concordance.TestConcordanceSearcher.java

License:Apache License

@Test
public void testCJKNoUnigrams() throws Exception {

    final CharacterRunAutomaton stops = MockTokenFilter.EMPTY_STOPSET;
    int posIncGap = 10;
    final int charOffsetGap = 10;
    Analyzer analyzer = getCJKBigramAnalyzer(false);
    TokenStream ts = analyzer.tokenStream(FIELD, "");
    ts.reset();
    CharTermAttribute charTermAttribute = ts.getAttribute(CharTermAttribute.class);
    PositionIncrementAttribute positionIncrementAttribute = ts.getAttribute(PositionIncrementAttribute.class);

    ts.end();
    ts.close();
    String[] docs = new String[] { "" };

    Directory directory = getDirectory(analyzer, docs);
    IndexReader reader = DirectoryReader.open(directory);
    IndexSearcher indexSearcher = new IndexSearcher(reader);
    ConcordanceSearcher searcher = new ConcordanceSearcher(
            new WindowBuilder(2, 2, analyzer.getOffsetGap(FIELD)));
    Query q = new TermQuery(new Term(FIELD, ""));
    //now test straight and span wrapper
    ConcordanceWindowCollector collector = new ConcordanceWindowCollector(10);
    searcher.search(indexSearcher, FIELD, q, q, analyzer, collector);
    for (ConcordanceWindow w : collector.getWindows()) {
        //System.out.println(w);
    }
    reader.close();
    directory.close();

}

From source file:org.wltea.analyzer.ikanalyzer.IKAnalzyerCase.java

License:Apache License

public static ArrayList<String> getTopicWord(String str) {
     // build an IK analyzer in smart segmentation mode
     Analyzer analyzer = new IKAnalyzer(true);
     ArrayList<String> retData = new ArrayList<String>();
     // obtain a Lucene TokenStream
     TokenStream ts = null;
     try {
         ts = analyzer.tokenStream("myfield", new StringReader(str));
         // offset attribute (start/end position of each token)
         OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
         // term text attribute
         CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
         // token type attribute
         TypeAttribute type = ts.addAttribute(TypeAttribute.class);

         // reset the TokenStream (resets the underlying StringReader)
         ts.reset();
         // iterate over the tokens
         while (ts.incrementToken()) {
             System.out.println(offset.startOffset() + " - " + offset.endOffset() + " : " + term.toString()
                     + " | " + type.type());
             if (term.toString().length() > 1 || term.toString().matches("^[0-9]*$")) {
                 retData.add(term.toString());
             }
         }
         // perform end-of-stream operations, e.g. set the final offset
         ts.end();

     } catch (IOException e) {
         e.printStackTrace();
     } finally {
         // release all resources held by the TokenStream
         if (ts != null) {
             try {
                 ts.close();
             } catch (IOException e) {
                 e.printStackTrace();
             }
         }
     }
     return retData;
 }

From source file:org.wltea.analyzer.ikanalyzer.IKAnalzyerDemo.java

License:Apache License

public static void main(String[] args) {
    // build an IK analyzer in smart segmentation mode
    Analyzer analyzer = new IKAnalyzer(true);

    // obtain a Lucene TokenStream
    TokenStream ts = null;
    try {
        ts = analyzer.tokenStream("myfield", new StringReader("???"));
        // offset attribute (start/end position of each token)
        OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
        // term text attribute
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        // token type attribute
        TypeAttribute type = ts.addAttribute(TypeAttribute.class);

        // reset the TokenStream (resets the underlying StringReader)
        ts.reset();
        // iterate over the tokens
        while (ts.incrementToken()) {
            System.out.println(offset.startOffset() + " - " + offset.endOffset() + " : " + term.toString()
                    + " | " + type.type());
        }
        // perform end-of-stream operations, e.g. set the final offset
        ts.end();

    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // release all resources held by the TokenStream
        if (ts != null) {
            try {
                ts.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

}

From source file:org.wltea.analyzer.ik_analyzer5.IKAnalzyerTest.java

License:Apache License

@Test
public void testIK() {

    String text = "???";

    //build an IK analyzer in smart segmentation mode
    Analyzer analyzer = new IKAnalyzer(true);

    //obtain a Lucene TokenStream
    TokenStream ts = null;
    try {
        ts = analyzer.tokenStream("myfield", new StringReader(text));
        //offset attribute (start/end position of each token)
        OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
        //term text attribute
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        //token type attribute
        TypeAttribute type = ts.addAttribute(TypeAttribute.class);

        //reset the TokenStream (resets the underlying StringReader)
        ts.reset();
        //iterate over the tokens
        while (ts.incrementToken()) {
            System.out.println(offset.startOffset() + " - " + offset.endOffset() + " : " + term.toString()
                    + " | " + type.type());
        }
        //perform end-of-stream operations, e.g. set the final offset
        ts.end();

    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        //release all resources held by the TokenStream
        if (ts != null) {
            try {
                ts.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        analyzer.close();
    }

}

From source file:org.wltea.analyzer.sample.IKAnalzyerDemo.java

License:Apache License

public static void main(String[] args) {
    //build an IK analyzer in smart segmentation mode
    Analyzer analyzer = new IKAnalyzerP(true);

    //obtain a Lucene TokenStream
    TokenStream ts = null;
    try {
        //         ts = analyzer.tokenStream("myfield", new StringReader("WORLD ,.. html DATA</html>HELLO"));
        ts = analyzer.tokenStream("myfield", new StringReader(
                "?????IKAnalyer can analysis english text too"));
        //         ts = analyzer.tokenStream("myfield", new StringReader("???pinyin hanyu Contribute index to jpinyin development by creating an account on GitHub"));
        //offset attribute (start/end position of each token)
        OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
        //term text attribute
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        //token type attribute
        TypeAttribute type = ts.addAttribute(TypeAttribute.class);

        //reset the TokenStream (resets the underlying StringReader)
        ts.reset();
        //iterate over the tokens
        while (ts.incrementToken()) {
            System.out.println(offset.startOffset() + " - " + offset.endOffset() + " : " + term.toString()
                    + " | " + type.type());
        }
        //perform end-of-stream operations, e.g. set the final offset
        ts.end();

    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        //release all resources held by the TokenStream
        if (ts != null) {
            try {
                ts.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

}

From source file:org.wltea.analyzer.sample.LuceneTokenizerDemo.java

License:Apache License

/**
 * Test StandardTokenizer
 */
public void testST() {
    Tokenizer tokenizer = new StandardTokenizer();
    try {
        tokenizer.setReader(new StringReader(
                "?????IKAnalyer can analysis english text too"));
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    TokenStreamComponents tsc = new TokenStreamComponents(tokenizer);
    TokenStream ts = tsc.getTokenStream();
    OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    TypeAttribute type = ts.addAttribute(TypeAttribute.class);
    try {
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println(term.toString() + "->" + offset.startOffset() + "-" + offset.endOffset() + "->"
                    + type.type());
        }
        ts.end();
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}

From source file:org.wltea.analyzer.sample.LuceneTokenizerDemo.java

License:Apache License

/**
 * Test ClassicTokenizer
 */
public void testCT() {
    Tokenizer tokenizer = new ClassicTokenizer();
    try {
        tokenizer.setReader(new StringReader(
                "?????IKAnalyer can analysis english text too"));
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    TokenStreamComponents tsc = new TokenStreamComponents(tokenizer);
    TokenStream ts = tsc.getTokenStream();
    OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    TypeAttribute type = ts.addAttribute(TypeAttribute.class);
    try {
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println(term.toString() + "->" + offset.startOffset() + "-" + offset.endOffset() + "->"
                    + type.type());
        }
        ts.end();
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}

From source file:org.wltea.analyzer.sample.LuceneTokenizerDemo.java

License:Apache License

/**
 * Test NGramTokenizer
 * (default gram sizes: min 1, max 2)
 */
public void testNT() {
    Tokenizer tokenizer = new NGramTokenizer();
    try {
        tokenizer.setReader(new StringReader(
                "?????IKAnalyer can analysis english text too"));
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    TokenStreamComponents tsc = new TokenStreamComponents(tokenizer);
    TokenStream ts = tsc.getTokenStream();
    OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    TypeAttribute type = ts.addAttribute(TypeAttribute.class);
    try {
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println(term.toString() + "->" + offset.startOffset() + "-" + offset.endOffset() + "->"
                    + type.type());
        }
        ts.end();
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}

From source file:org.wltea.analyzer.sample.ThulacAnalzyerDemo.java

License:Apache License

public static void main(String[] args) {
    //build a Thulac analyzer in smart segmentation mode
    Analyzer analyzer = new ThulacAnalyzer(true);
    //obtain a Lucene TokenStream
    TokenStream ts = null;
    try {
        long start = System.currentTimeMillis();
        ts = analyzer.tokenStream("myfield", new StringReader(
                "?????IKAnalyer can analysis english text too"));
        //offset attribute (start/end position of each token)
        OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
        //term text attribute
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        //token type attribute
        TypeAttribute type = ts.addAttribute(TypeAttribute.class);

        //reset the TokenStream (resets the underlying StringReader)
        ts.reset();
        //iterate over the tokens
        while (ts.incrementToken()) {
            System.out.println(offset.startOffset() + " - " + offset.endOffset() + " : " + term.toString()
                    + " | " + type.type());
        }
        //perform end-of-stream operations, e.g. set the final offset
        ts.end();
        System.out.println("wast:" + (System.currentTimeMillis() - start));
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        //release all resources held by the TokenStream
        if (ts != null) {
            try {
                ts.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

}