List of usage examples for org.apache.lucene.analysis TokenStream addAttribute
public final <T extends Attribute> T addAttribute(Class<T> attClass)
From source file:org.wltea.analyzer.sample.IKAnalzyerDemo.java
License:Apache License
public static void main(String[] args) { //IK?smart??//from w w w . j a v a2s . c o m Analyzer analyzer = new IKAnalyzerP(true); //?LuceneTokenStream TokenStream ts = null; try { // ts = analyzer.tokenStream("myfield", new StringReader("WORLD ,.. html DATA</html>HELLO")); ts = analyzer.tokenStream("myfield", new StringReader( "?????IKAnalyer can analysis english text too")); // ts = analyzer.tokenStream("myfield", new StringReader("???pinyin hanyu Contribute index to jpinyin development by creating an account on GitHub")); //??? OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class); //?? CharTermAttribute term = ts.addAttribute(CharTermAttribute.class); //?? TypeAttribute type = ts.addAttribute(TypeAttribute.class); //?TokenStream?StringReader ts.reset(); //?? while (ts.incrementToken()) { System.out.println(offset.startOffset() + " - " + offset.endOffset() + " : " + term.toString() + " | " + type.type()); } //TokenStreamStringReader ts.end(); // Perform end-of-stream operations, e.g. set the final offset. } catch (IOException e) { e.printStackTrace(); } finally { //TokenStream? if (ts != null) { try { ts.close(); } catch (IOException e) { e.printStackTrace(); } } } }
From source file:org.wltea.analyzer.sample.LuceneTokenizerDemo.java
License:Apache License
/**
 * Tokenizes a sample string with Lucene's {@code StandardTokenizer} and
 * prints each token as "term->startOffset-endOffset->type".
 */
public void testST() {
    Tokenizer tokenizer = new StandardTokenizer();
    try {
        tokenizer.setReader(new StringReader(
                "?????IKAnalyer can analysis english text too"));
    } catch (IOException e) {
        // FIX: preserve the cause instead of throwing a bare RuntimeException.
        throw new RuntimeException(e);
    }
    TokenStreamComponents tsc = new TokenStreamComponents(tokenizer);
    // FIX: try-with-resources guarantees the stream is closed (the original
    // never closed it).
    try (TokenStream ts = tsc.getTokenStream()) {
        OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        TypeAttribute type = ts.addAttribute(TypeAttribute.class);
        ts.reset(); // required before the first incrementToken()
        while (ts.incrementToken()) {
            System.out.println(term.toString() + "->" + offset.startOffset() + "-" + offset.endOffset()
                    + "->" + type.type());
        }
        ts.end(); // record end-of-stream state (final offset)
    } catch (IOException e) {
        // FIX: keep the cause.
        throw new RuntimeException(e);
    }
}
From source file:org.wltea.analyzer.sample.LuceneTokenizerDemo.java
License:Apache License
/** * ?ClassTokenizer/*from ww w . j a v a2 s .c o m*/ */ public void testCT() { Tokenizer tokenizer = new ClassicTokenizer(); try { tokenizer.setReader(new StringReader( "?????IKAnalyer can analysis english text too")); } catch (IOException e) { throw new RuntimeException(); } TokenStreamComponents tsc = new TokenStreamComponents(tokenizer); TokenStream ts = tsc.getTokenStream(); OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class); CharTermAttribute term = ts.addAttribute(CharTermAttribute.class); TypeAttribute type = ts.addAttribute(TypeAttribute.class); try { ts.reset(); while (ts.incrementToken()) { System.out.println(term.toString() + "->" + offset.startOffset() + "-" + offset.endOffset() + "->" + type.type()); } ts.end(); } catch (IOException e) { throw new RuntimeException(); } }
From source file:org.wltea.analyzer.sample.LuceneTokenizerDemo.java
License:Apache License
/**
 * Tokenizes a sample string with Lucene's {@code NGramTokenizer}
 * (default min gram 1, max gram 2) and prints each token as
 * "term->startOffset-endOffset->type".
 */
public void testNT() {
    Tokenizer tokenizer = new NGramTokenizer();
    try {
        tokenizer.setReader(new StringReader(
                "?????IKAnalyer can analysis english text too"));
    } catch (IOException e) {
        // FIX: preserve the cause instead of throwing a bare RuntimeException.
        throw new RuntimeException(e);
    }
    TokenStreamComponents tsc = new TokenStreamComponents(tokenizer);
    // FIX: try-with-resources guarantees the stream is closed (the original
    // never closed it).
    try (TokenStream ts = tsc.getTokenStream()) {
        OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        TypeAttribute type = ts.addAttribute(TypeAttribute.class);
        ts.reset(); // required before the first incrementToken()
        while (ts.incrementToken()) {
            System.out.println(term.toString() + "->" + offset.startOffset() + "-" + offset.endOffset()
                    + "->" + type.type());
        }
        ts.end(); // record end-of-stream state (final offset)
    } catch (IOException e) {
        // FIX: keep the cause.
        throw new RuntimeException(e);
    }
}
From source file:org.wltea.analyzer.sample.ThulacAnalzyerDemo.java
License:Apache License
public static void main(String[] args) { //Thulac?smart?? Analyzer analyzer = new ThulacAnalyzer(true); //?LuceneTokenStream TokenStream ts = null; try {//from w w w.j av a2 s .c o m long start = System.currentTimeMillis(); ts = analyzer.tokenStream("myfield", new StringReader( "?????IKAnalyer can analysis english text too")); //??? OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class); //?? CharTermAttribute term = ts.addAttribute(CharTermAttribute.class); //?? TypeAttribute type = ts.addAttribute(TypeAttribute.class); //?TokenStream?StringReader ts.reset(); //?? while (ts.incrementToken()) { System.out.println(offset.startOffset() + " - " + offset.endOffset() + " : " + term.toString() + " | " + type.type()); } //TokenStreamStringReader ts.end(); // Perform end-of-stream operations, e.g. set the final offset. System.out.println("wast:" + (System.currentTimeMillis() - start)); } catch (IOException e) { e.printStackTrace(); } finally { //TokenStream? if (ts != null) { try { ts.close(); } catch (IOException e) { e.printStackTrace(); } } } }
From source file:org.wltea.analyzer.test.IKAnalzyerDemo.java
License:Apache License
public static void main(String[] args) { //IK?smart??/*from ww w. j a v a 2s . com*/ Analyzer analyzer = new IKAnalyzer4PinYin(true); //?LuceneTokenStream TokenStream ts = null; try { ts = analyzer.tokenStream("myfield", new StringReader( "?????IKAnalyer can analysis english text too")); //??? OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class); //?? CharTermAttribute term = ts.addAttribute(CharTermAttribute.class); //?? TypeAttribute type = ts.addAttribute(TypeAttribute.class); //?TokenStream?StringReader ts.reset(); //?? while (ts.incrementToken()) { System.out.println(offset.startOffset() + " - " + offset.endOffset() + " : " + term.toString() + " | " + type.type()); } //TokenStreamStringReader ts.end(); // Perform end-of-stream operations, e.g. set the final offset. } catch (IOException e) { e.printStackTrace(); } finally { //TokenStream? if (ts != null) { try { ts.close(); } catch (IOException e) { e.printStackTrace(); } } } }
From source file:org.xbib.elasticsearch.test.AnalyzerUtils.java
License:Apache License
/**
 * Prints every token produced by {@code analyzer} for {@code text} with full
 * details: position (printed once per position increment), term text,
 * offsets, type, and payload when present.
 *
 * @param analyzer the analyzer used to tokenize the text
 * @param text     the text to analyze
 * @throws IOException if the token stream cannot be consumed
 */
public static void displayTokensWithFullDetails(Analyzer analyzer, String text) throws IOException {
    // FIX: try-with-resources closes the stream (the original leaked it).
    try (TokenStream stream = analyzer.tokenStream("contents", new StringReader(text))) {
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
        OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
        TypeAttribute type = stream.addAttribute(TypeAttribute.class);
        PayloadAttribute payload = stream.addAttribute(PayloadAttribute.class);
        int position = 0;
        // FIX: reset() is mandatory before the first incrementToken(); without
        // it modern Lucene throws IllegalStateException.
        stream.reset();
        while (stream.incrementToken()) {
            int increment = posIncr.getPositionIncrement();
            if (increment > 0) {
                // New position: start a new output line prefixed with it.
                position = position + increment;
                System.out.println();
                System.out.print(position + ":");
            }
            BytesRef pl = payload.getPayload();
            if (pl != null) {
                System.out.print("[" + term.toString() + ":" + offset.startOffset() + "->"
                        + offset.endOffset() + ":" + type.type() + ":" + new String(pl.bytes) + "] ");
            } else {
                System.out.print("[" + term.toString() + ":" + offset.startOffset() + "->"
                        + offset.endOffset() + ":" + type.type() + "] ");
            }
        }
        // FIX: end() records end-of-stream state (e.g. the final offset).
        stream.end();
        System.out.println();
    }
}
From source file:org.zenoss.zep.index.impl.lucene.LuceneQueryBuilder.java
License:Open Source License
/** * Tokenizes the given query using the same behavior as when the field is analyzed. * * @param fieldName The field name in the index. * @param analyzer The analyzer to use to tokenize the query. * @param query The query to tokenize. * @return The tokens from the query./* w w w . j av a2 s . c o m*/ * @throws ZepException If an exception occur. */ private static List<String> getTokens(String fieldName, Analyzer analyzer, String query) throws ZepException { final List<String> tokens = new ArrayList<String>(); try { TokenStream ts = analyzer.tokenStream(fieldName, new StringReader(query)); CharTermAttribute term = ts.addAttribute(CharTermAttribute.class); try { ts.reset(); while (ts.incrementToken()) { tokens.add(term.toString()); } ts.end(); } catch (IOException e) { throw new ZepException(e.getLocalizedMessage(), e); } finally { ts.close(); } } catch (IOException e) { throw new ZepException(e.getLocalizedMessage(), e); } return tokens; }
From source file:pl.litwiniuk.rowicki.modsynonyms.SlowSynonymFilterFactory.java
License:Apache License
private static List<String> splitByTokenizer(String source, TokenizerFactory tokFactory) throws IOException { StringReader reader = new StringReader(source); TokenStream ts = loadTokenizer(tokFactory, reader); List<String> tokList = new ArrayList<String>(); try {//www .j a v a 2 s . co m CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); while (ts.incrementToken()) { if (termAtt.length() > 0) tokList.add(termAtt.toString()); } } finally { reader.close(); } return tokList; }
From source file:practica2_1.Practica2_1.java
public static List<String> tokenizeString(Analyzer analyzer, String string) { List<String> result = new ArrayList<String>(); String cad;/*from ww w .j a v a2s . com*/ try { TokenStream stream = analyzer.tokenStream(null, new StringReader(string)); //OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class); CharTermAttribute cAtt = stream.addAttribute(CharTermAttribute.class); stream.reset(); while (stream.incrementToken()) { //cad = stream.getAttribute(CharTermAttribute.class).toString(); result.add(cAtt.toString()); } stream.close(); stream.end(); } catch (IOException e) { // not thrown b/c we're using a string reader... throw new RuntimeException(e); } return result; }