List of usage examples for org.apache.lucene.analysis TokenStream end
public void end() throws IOException

This method is called by the consumer after the last token has been consumed, i.e. after incrementToken() returned false (using the new TokenStream API). It can be used to perform any end-of-stream operations, such as setting the final offset of a stream.
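A minimal, self-contained consumer sketch of this contract (the field name "body" and the sample text are placeholders, and StandardAnalyzer merely stands in for any analyzer): reset() before iterating, end() once incrementToken() returns false, then close().

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class TokenStreamEndDemo {
    public static void main(String[] args) throws IOException {
        try (Analyzer analyzer = new StandardAnalyzer();
                TokenStream ts = analyzer.tokenStream("body", "hello token stream")) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
            ts.reset(); // required before the first incrementToken()
            while (ts.incrementToken()) {
                System.out.println(term + " [" + offset.startOffset() + "," + offset.endOffset() + ")");
            }
            ts.end(); // sets end-of-stream attributes, e.g. the final offset
            System.out.println("final offset: " + offset.endOffset());
        } // try-with-resources closes the stream and the analyzer
    }
}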
From source file:org.tallison.lucene.search.concordance.TestBigramFilter.java
License:Apache License
@Test
public void testIncludeUnigrams() throws Exception {
    List<String> expected = Arrays.asList(
            "a", "a_b", "b", "b_c", "c", "c_d", "d", "d_e", "e", "e_f", "f", "f_g", "g");
    Analyzer analyzer = ConcordanceTestBase.getBigramAnalyzer(MockTokenFilter.EMPTY_STOPSET, 10, 10, true);
    String s = "a b c d e f g";
    TokenStream tokenStream = analyzer.tokenStream("f", s);
    tokenStream.reset();
    CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncAttribute = tokenStream.getAttribute(PositionIncrementAttribute.class);
    List<String> returned = new ArrayList<>();
    int i = 0;
    while (tokenStream.incrementToken()) {
        String token = charTermAttribute.toString();
        if (i++ % 2 == 0) {
            assertEquals(1, posIncAttribute.getPositionIncrement());
        } else {
            assertEquals(0, posIncAttribute.getPositionIncrement());
        }
        returned.add(token);
    }
    tokenStream.end();
    tokenStream.close();
    assertEquals(expected, returned);
}
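The bigram analyzer above comes from the test project itself. For readers without that code, a roughly comparable unigram-plus-bigram stream can be assembled from Lucene's stock ShingleFilter; the following is a sketch of an analogous setup, not the project's implementation:

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class ShingleSketch {
    public static void main(String[] args) throws IOException {
        Tokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("a b c d e f g"));
        ShingleFilter shingles = new ShingleFilter(tokenizer, 2, 2);
        shingles.setTokenSeparator("_");   // emit "a_b" rather than the default "a b"
        shingles.setOutputUnigrams(true);  // interleave unigrams with the bigrams
        try (TokenStream ts = shingles) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                System.out.println(term.toString()); // a, a_b, b, b_c, ...
            }
            ts.end(); // finish the stream before it is closed
        }
    }
}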
From source file:org.tallison.lucene.search.concordance.TestConcordanceSearcher.java
License:Apache License
@Test
public void testCJKNoUnigrams() throws Exception {
    final CharacterRunAutomaton stops = MockTokenFilter.EMPTY_STOPSET;
    int posIncGap = 10;
    final int charOffsetGap = 10;
    Analyzer analyzer = getCJKBigramAnalyzer(false);
    TokenStream ts = analyzer.tokenStream(FIELD, "");
    ts.reset();
    CharTermAttribute charTermAttribute = ts.getAttribute(CharTermAttribute.class);
    PositionIncrementAttribute positionIncrementAttribute = ts.getAttribute(PositionIncrementAttribute.class);
    ts.end();
    ts.close();
    String[] docs = new String[] { "" };
    Directory directory = getDirectory(analyzer, docs);
    IndexReader reader = DirectoryReader.open(directory);
    IndexSearcher indexSearcher = new IndexSearcher(reader);
    ConcordanceSearcher searcher = new ConcordanceSearcher(
            new WindowBuilder(2, 2, analyzer.getOffsetGap(FIELD)));
    Query q = new TermQuery(new Term(FIELD, ""));
    // now test straight and span wrapper
    ConcordanceWindowCollector collector = new ConcordanceWindowCollector(10);
    searcher.search(indexSearcher, FIELD, q, q, analyzer, collector);
    for (ConcordanceWindow w : collector.getWindows()) {
        // System.out.println(w);
    }
    reader.close();
    directory.close();
}
From source file:org.wltea.analyzer.ikanalyzer.IKAnalzyerCase.java
License:Apache License
public static ArrayList<String> getTopicWord(String str) {
    // Build the IK analyzer, using smart segmentation mode
    Analyzer analyzer = new IKAnalyzer(true);
    ArrayList<String> retData = new ArrayList<String>();
    // Obtain a Lucene TokenStream
    TokenStream ts = null;
    try {
        ts = analyzer.tokenStream("myfield", new StringReader(str));
        // Offset attribute of each token
        OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
        // Term text attribute of each token
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        // Type attribute of each token
        TypeAttribute type = ts.addAttribute(TypeAttribute.class);
        // Reset the TokenStream (resets the underlying StringReader)
        ts.reset();
        // Iterate over the tokens
        while (ts.incrementToken()) {
            System.out.println(offset.startOffset() + " - " + offset.endOffset() + " : " + term.toString()
                    + " | " + type.type());
            // keep multi-character terms and digit-only terms
            if (term.toString().length() > 1 || term.toString().matches("^[0-9]*$")) {
                retData.add(term.toString());
            }
        }
        // End the TokenStream: perform end-of-stream operations, e.g. set the final offset
        ts.end();
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // Release all resources held by the TokenStream
        if (ts != null) {
            try {
                ts.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
    return retData;
}
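A minimal, hypothetical call site for the helper above (the enclosing class name IKAnalzyerCase is taken from the source-file path; the input sentence is arbitrary):

ArrayList<String> topicWords = IKAnalzyerCase.getTopicWord("Lucene tokenizes text into terms");
for (String word : topicWords) {
    System.out.println(word);
}

Note that the guard inside the loop keeps only terms longer than one character, plus digit-only terms.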
From source file:org.wltea.analyzer.ikanalyzer.IKAnalzyerDemo.java
License:Apache License
public static void main(String[] args) {
    // Build the IK analyzer, using smart segmentation mode
    Analyzer analyzer = new IKAnalyzer(true);
    // Obtain a Lucene TokenStream
    TokenStream ts = null;
    try {
        ts = analyzer.tokenStream("myfield", new StringReader("???"));
        // Offset attribute of each token
        OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
        // Term text attribute of each token
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        // Type attribute of each token
        TypeAttribute type = ts.addAttribute(TypeAttribute.class);
        // Reset the TokenStream (resets the underlying StringReader)
        ts.reset();
        // Iterate over the tokens
        while (ts.incrementToken()) {
            System.out.println(offset.startOffset() + " - " + offset.endOffset() + " : " + term.toString()
                    + " | " + type.type());
        }
        // End the TokenStream: perform end-of-stream operations, e.g. set the final offset
        ts.end();
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // Release all resources held by the TokenStream
        if (ts != null) {
            try {
                ts.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
From source file:org.wltea.analyzer.ik_analyzer5.IKAnalzyerTest.java
License:Apache License
@Test
public void testIK() {
    String text = "???";
    // Build the IK analyzer, using smart segmentation mode
    Analyzer analyzer = new IKAnalyzer(true);
    // Obtain a Lucene TokenStream
    TokenStream ts = null;
    try {
        ts = analyzer.tokenStream("myfield", new StringReader(text));
        // Offset attribute of each token
        OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
        // Term text attribute of each token
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        // Type attribute of each token
        TypeAttribute type = ts.addAttribute(TypeAttribute.class);
        // Reset the TokenStream (resets the underlying StringReader)
        ts.reset();
        // Iterate over the tokens
        while (ts.incrementToken()) {
            System.out.println(offset.startOffset() + " - " + offset.endOffset() + " : " + term.toString()
                    + " | " + type.type());
        }
        // End the TokenStream: perform end-of-stream operations, e.g. set the final offset
        ts.end();
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // Release all resources held by the TokenStream
        if (ts != null) {
            try {
                ts.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        analyzer.close();
    }
}
From source file:org.wltea.analyzer.sample.IKAnalzyerDemo.java
License:Apache License
public static void main(String[] args) {
    // Build the IK analyzer, using smart segmentation mode
    Analyzer analyzer = new IKAnalyzerP(true);
    // Obtain a Lucene TokenStream
    TokenStream ts = null;
    try {
        // ts = analyzer.tokenStream("myfield", new StringReader("WORLD ,.. html DATA</html>HELLO"));
        ts = analyzer.tokenStream("myfield",
                new StringReader("?????IKAnalyer can analysis english text too"));
        // ts = analyzer.tokenStream("myfield", new StringReader("???pinyin hanyu Contribute index to jpinyin development by creating an account on GitHub"));
        // Offset attribute of each token
        OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
        // Term text attribute of each token
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        // Type attribute of each token
        TypeAttribute type = ts.addAttribute(TypeAttribute.class);
        // Reset the TokenStream (resets the underlying StringReader)
        ts.reset();
        // Iterate over the tokens
        while (ts.incrementToken()) {
            System.out.println(offset.startOffset() + " - " + offset.endOffset() + " : " + term.toString()
                    + " | " + type.type());
        }
        // End the TokenStream: perform end-of-stream operations, e.g. set the final offset
        ts.end();
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // Release all resources held by the TokenStream
        if (ts != null) {
            try {
                ts.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
From source file:org.wltea.analyzer.sample.LuceneTokenizerDemo.java
License:Apache License
/**
 * Test StandardTokenizer
 */
public void testST() {
    Tokenizer tokenizer = new StandardTokenizer();
    try {
        tokenizer.setReader(new StringReader("?????IKAnalyer can analysis english text too"));
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    TokenStreamComponents tsc = new TokenStreamComponents(tokenizer);
    TokenStream ts = tsc.getTokenStream();
    OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    TypeAttribute type = ts.addAttribute(TypeAttribute.class);
    try {
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println(term.toString() + "->" + offset.startOffset() + "-" + offset.endOffset()
                    + "->" + type.type());
        }
        ts.end();
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
From source file:org.wltea.analyzer.sample.LuceneTokenizerDemo.java
License:Apache License
/**
 * Test ClassicTokenizer
 */
public void testCT() {
    Tokenizer tokenizer = new ClassicTokenizer();
    try {
        tokenizer.setReader(new StringReader("?????IKAnalyer can analysis english text too"));
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    TokenStreamComponents tsc = new TokenStreamComponents(tokenizer);
    TokenStream ts = tsc.getTokenStream();
    OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    TypeAttribute type = ts.addAttribute(TypeAttribute.class);
    try {
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println(term.toString() + "->" + offset.startOffset() + "-" + offset.endOffset()
                    + "->" + type.type());
        }
        ts.end();
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
From source file:org.wltea.analyzer.sample.LuceneTokenizerDemo.java
License:Apache License
/**
 * Test NGramTokenizer (default minGram: 1, maxGram: 2)
 */
public void testNT() {
    Tokenizer tokenizer = new NGramTokenizer();
    try {
        tokenizer.setReader(new StringReader("?????IKAnalyer can analysis english text too"));
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    TokenStreamComponents tsc = new TokenStreamComponents(tokenizer);
    TokenStream ts = tsc.getTokenStream();
    OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    TypeAttribute type = ts.addAttribute(TypeAttribute.class);
    try {
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println(term.toString() + "->" + offset.startOffset() + "-" + offset.endOffset()
                    + "->" + type.type());
        }
        ts.end();
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
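The no-argument NGramTokenizer constructor uses the defaults noted above (minGram 1, maxGram 2). To make the gram sizes explicit, or to change them, the two-argument constructor can be used; a small sketch, not taken from the source:

// Explicit gram sizes; (1, 2) reproduces the defaults used above
Tokenizer tokenizer = new NGramTokenizer(1, 2);
tokenizer.setReader(new StringReader("IKAnalyer can analysis english text too"));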
From source file:org.wltea.analyzer.sample.ThulacAnalzyerDemo.java
License:Apache License
public static void main(String[] args) {
    // Build the Thulac analyzer, using smart segmentation mode
    Analyzer analyzer = new ThulacAnalyzer(true);
    // Obtain a Lucene TokenStream
    TokenStream ts = null;
    try {
        long start = System.currentTimeMillis();
        ts = analyzer.tokenStream("myfield",
                new StringReader("?????IKAnalyer can analysis english text too"));
        // Offset attribute of each token
        OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
        // Term text attribute of each token
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        // Type attribute of each token
        TypeAttribute type = ts.addAttribute(TypeAttribute.class);
        // Reset the TokenStream (resets the underlying StringReader)
        ts.reset();
        // Iterate over the tokens
        while (ts.incrementToken()) {
            System.out.println(offset.startOffset() + " - " + offset.endOffset() + " : " + term.toString()
                    + " | " + type.type());
        }
        // End the TokenStream: perform end-of-stream operations, e.g. set the final offset
        ts.end();
        System.out.println("wast:" + (System.currentTimeMillis() - start));
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // Release all resources held by the TokenStream
        if (ts != null) {
            try {
                ts.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}