Usage examples for org.apache.lucene.analysis.TokenStream#incrementToken, collected from open-source projects.
public abstract boolean incrementToken() throws IOException;
From source file:jp.mwsoft.cjkanalyzers.CJKAnalyzerNoSplitKatakana.java
License:Apache License
public static void main(String[] args) throws Exception { Set<String> stopWords = new HashSet<String>(); stopWords.add("??"); stopWords.add("??"); java.io.StringReader reader = new java.io.StringReader("??????"); CJKAnalyzer analyzer = new CJKAnalyzer(Version.LUCENE_35); TokenStream stream = analyzer.tokenStream("test", reader); for (int i = 0; i < 10; i++) { stream.incrementToken(); System.out.println(stream); }//from ww w . ja v a 2 s . com }
From source file:jp.scaleout.elasticsearch.plugins.queryparser.classic.MapperQueryParser.java
License:Apache License
/**
 * Builds a prefix query for {@code termStr} on {@code field}, optionally running
 * the term text through the field's analyzer first (when {@code analyzeWildcard}
 * is enabled).
 *
 * <p>If analysis yields exactly one token, a single prefix query is built from
 * it; if it yields several (e.g. the analyzer split the term), a SHOULD-clause
 * boolean query with one prefix query per token is returned. Any I/O failure
 * while obtaining or reading the token stream falls back to the unanalyzed
 * prefix query, so this method never propagates IOException.
 *
 * @param field   the field to query
 * @param termStr the raw (possibly unanalyzed) prefix text
 * @return a prefix query, or a boolean combination of prefix queries
 * @throws ParseException if the superclass query construction fails
 */
private Query getPossiblyAnalyzedPrefixQuery(String field, String termStr) throws ParseException {
    if (!analyzeWildcard) {
        // Feature disabled: defer to the stock (unanalyzed) prefix query.
        return super.getPrefixQuery(field, termStr);
    }
    // Get the analyzer from the superclass and tokenize the term.
    TokenStream source;
    try {
        source = getAnalyzer().tokenStream(field, termStr);
        source.reset();
    } catch (IOException e) {
        // Analyzer could not produce a stream: fall back to the raw term.
        return super.getPrefixQuery(field, termStr);
    }
    List<String> tlist = new ArrayList<>();
    CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);
    while (true) {
        try {
            if (!source.incrementToken())
                break;
        } catch (IOException e) {
            // Best-effort: stop collecting on read failure, use what we have.
            break;
        }
        tlist.add(termAtt.toString());
    }
    try {
        source.close();
    } catch (IOException e) {
        // ignore — close failure is not actionable here
    }
    if (tlist.size() == 1) {
        return super.getPrefixQuery(field, tlist.get(0));
    } else {
        // The analyzer either added or consumed tokens (common for stemmers):
        // build a boolean query with a prefix clause for each resulting token.
        List<BooleanClause> clauses = new ArrayList<>();
        for (String token : tlist) {
            clauses.add(new BooleanClause(super.getPrefixQuery(field, token), BooleanClause.Occur.SHOULD));
        }
        return getBooleanQuery(clauses, true);
    }
}
From source file:jp.scaleout.elasticsearch.plugins.queryparser.classic.MapperQueryParser.java
License:Apache License
private Query getPossiblyAnalyzedWildcardQuery(String field, String termStr) throws ParseException { if (!analyzeWildcard) { return super.getWildcardQuery(field, termStr); }/*from w w w. j a v a 2 s .c om*/ boolean isWithinToken = (!termStr.startsWith("?") && !termStr.startsWith("*")); StringBuilder aggStr = new StringBuilder(); StringBuilder tmp = new StringBuilder(); for (int i = 0; i < termStr.length(); i++) { char c = termStr.charAt(i); if (c == '?' || c == '*') { if (isWithinToken) { try { TokenStream source = getAnalyzer().tokenStream(field, tmp.toString()); source.reset(); CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class); if (source.incrementToken()) { String term = termAtt.toString(); if (term.length() == 0) { // no tokens, just use what we have now aggStr.append(tmp); } else { aggStr.append(term); } } else { // no tokens, just use what we have now aggStr.append(tmp); } source.close(); } catch (IOException e) { aggStr.append(tmp); } tmp.setLength(0); } isWithinToken = false; aggStr.append(c); } else { tmp.append(c); isWithinToken = true; } } if (isWithinToken) { try { TokenStream source = getAnalyzer().tokenStream(field, tmp.toString()); source.reset(); CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class); if (source.incrementToken()) { String term = termAtt.toString(); if (term.length() == 0) { // no tokens, just use what we have now aggStr.append(tmp); } else { aggStr.append(term); } } else { // no tokens, just use what we have now aggStr.append(tmp); } source.close(); } catch (IOException e) { aggStr.append(tmp); } } return super.getWildcardQuery(field, aggStr.toString()); }
From source file:jp.sf.fess.solr.plugin.analysis.ja.TestJapaneseNumberFilter.java
License:Apache License
public void analyze(final Analyzer analyzer, final Reader reader, final Writer writer) throws IOException { final TokenStream stream = analyzer.tokenStream("dummy", reader); stream.reset();//from w w w . j ava2 s . c o m final CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class); while (stream.incrementToken()) { writer.write(termAttr.toString()); writer.write("\n"); } reader.close(); writer.close(); }
From source file:jp.sf.fess.solr.plugin.analysis.synonym.NGramSynonymTokenizerTest.java
License:Apache License
/**
 * Asserts that {@code stream} emits exactly the tokens described by
 * {@code expectedStream}: tokens are separated by {@code '/'}, each token being
 * {@code "term[,startOffset[,endOffset[,posInc]]]"}. Also asserts the stream is
 * exhausted afterwards.
 */
private void assertTokenStream(final TokenStream stream, final String expectedStream) throws Exception {
    final String[] expectedTokens = expectedStream.split("/");
    for (int index = 0; index < expectedTokens.length; index++) {
        final String[] fields = expectedTokens[index].split(",");
        assertTrue(stream.incrementToken());
        assertAttribute(index, "term", fields[0], stream.getAttribute(CharTermAttribute.class).toString());
        // Optional positional fields, checked only when present in the spec.
        if (fields.length > 1) {
            assertAttribute(index, "startOffset", Integer.parseInt(fields[1]),
                    stream.getAttribute(OffsetAttribute.class).startOffset());
        }
        if (fields.length > 2) {
            assertAttribute(index, "endOffset", Integer.parseInt(fields[2]),
                    stream.getAttribute(OffsetAttribute.class).endOffset());
        }
        if (fields.length > 3) {
            assertAttribute(index, "posInc", Integer.parseInt(fields[3]),
                    stream.getAttribute(PositionIncrementAttribute.class).getPositionIncrement());
        }
    }
    // No extra tokens beyond the expected ones.
    assertFalse(stream.incrementToken());
}
From source file:kafka.examples.Producer.java
License:Apache License
public void run() { while (true) { String access_token = "2.009F1d9BmHHChD7abcd6de0a0jui5Y"; int count = 20; Timeline tm = new Timeline(access_token); Analyzer analyzer4 = new IKAnalyzer(false);// ? try {/*from ww w . j a va 2s . c o m*/ StatusWapper status = tm.getPublicTimeline(count, 0); //------------------------------------------- try { TokenStream tokenstream = analyzer4.tokenStream("", new StringReader(status.toString())); CharTermAttribute termAttribute = tokenstream.addAttribute(CharTermAttribute.class);// token tokenstream.reset();// ? while (tokenstream.incrementToken()) {// ??token String prTxt = new String(termAttribute.buffer(), 0, termAttribute.length()); //producer.send(new KeyedMessage<Integer, String>(topic, ptTxt + " ")); System.out.print(prTxt + " "); } //System.out.println(); tokenstream.close();//TokenStream } catch (IOException e) { e.printStackTrace(); } //------------------------------------------- producer.send(new KeyedMessage<Integer, String>(topic, status.toString())); Log.logInfo(status.toString()); } catch (WeiboException e) { e.printStackTrace(); } } }
From source file:lia.analysis.CopyOfAnalyzerDemo.java
License:Apache License
/**
 * Prints, for each analyzer in the static {@code analyzers} list, the analyzer
 * name followed by the bracketed tokens it produces for {@code text}.
 *
 * <p>Fixes over the original: each token stream is {@code end()}ed and
 * {@code close()}d (one stream per analyzer was leaked per call), and three
 * attribute registrations (offset, type, position-increment) whose values were
 * never read have been removed.
 *
 * @param text the text to analyze
 * @throws IOException if any analyzer's stream fails
 */
private static void analyze(String text) throws IOException {
    System.out.println("Analyzing \"" + text + "\"");
    for (Analyzer analyzer : analyzers) {
        String name = analyzer.getClass().getSimpleName();
        System.out.println(name);
        TokenStream stream = analyzer.tokenStream("dummy", text);
        try {
            stream.reset();
            CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
            while (stream.incrementToken()) {
                System.out.print("[" + termAttr + "] ");
            }
            stream.end();
        } finally {
            stream.close(); // original leaked every stream
        }
        System.out.println("");
    }
}
From source file:lia.analysis.i18n.ChineseDemo.java
License:Apache License
private static void analyze(String string, Analyzer analyzer) throws IOException { StringBuffer buffer = new StringBuffer(); TokenStream stream = analyzer.tokenStream("contents", new StringReader(string)); TermAttribute term = stream.addAttribute(TermAttribute.class); while (stream.incrementToken()) { //C buffer.append("["); buffer.append(term.term());//from w ww . j a v a2s. co m buffer.append("] "); } String output = buffer.toString(); Frame f = new Frame(); f.setTitle(analyzer.getClass().getSimpleName() + " : " + string); f.setResizable(true); Font font = new Font(null, Font.PLAIN, 36); int width = getWidth(f.getFontMetrics(font), output); f.setSize((width < 250) ? 250 : width + 50, 75); // NOTE: if Label doesn't render the Chinese characters // properly, try using javax.swing.JLabel instead JLabel label = new JLabel(output); //D label.setSize(width, 75); //label.setAlignment(JLabel.CENTER); label.setFont(font); f.add(label); f.setVisible(true); }
From source file:lia.chapter4.AnalyzerUtils.java
License:Apache License
public static void displayTokens(TokenStream stream) throws IOException { TermToBytesRefAttribute term = stream.addAttribute(TermToBytesRefAttribute.class); while (stream.incrementToken()) { System.out.print("[" + term.getBytesRef().utf8ToString() + "] "); //B }//ww w. jav a2s .co m }
From source file:lia.chapter4.AnalyzerUtils.java
License:Apache License
public static void displayTokensWithPositions(Analyzer analyzer, String text) throws IOException { TokenStream stream = analyzer.tokenStream("contents", new StringReader(text)); TermToBytesRefAttribute term = stream.addAttribute(TermToBytesRefAttribute.class); PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class); int position = 0; while (stream.incrementToken()) { int increment = posIncr.getPositionIncrement(); if (increment > 0) { position = position + increment; System.out.println(); System.out.print(position + ": "); }//w w w.java2 s . c o m System.out.print("[" + term.getBytesRef().utf8ToString() + "] "); } System.out.println(); }