List of usage examples for org.apache.lucene.analysis.TokenStream.addAttribute
public final <T extends Attribute> T addAttribute(Class<T> attClass)
From source file:com.leavesfly.lia.analysis.i18n.ChineseDemo.java
License:Apache License
private static void analyze(String string, Analyzer analyzer) throws IOException { StringBuffer buffer = new StringBuffer(); TokenStream stream = analyzer.tokenStream("contents", new StringReader(string)); TermAttribute term = stream.addAttribute(TermAttribute.class); while (stream.incrementToken()) { // C buffer.append("["); buffer.append(term.term());/* w w w . j a v a 2 s . c o m*/ buffer.append("] "); } String output = buffer.toString(); Frame f = new Frame(); f.setTitle(analyzer.getClass().getSimpleName() + " : " + string); f.setResizable(true); Font font = new Font(null, Font.PLAIN, 36); int width = getWidth(f.getFontMetrics(font), output); f.setSize((width < 250) ? 250 : width + 50, 75); // NOTE: if Label doesn't render the Chinese characters // properly, try using javax.swing.JLabel instead Label label = new Label(output); // D label.setSize(width, 75); label.setAlignment(Label.CENTER); label.setFont(font); f.add(label); f.setVisible(true); }
From source file:com.liferay.events.global.mobile.Utils.java
License:Open Source License
public static String removeStopWords(String words) throws IOException { if (Validator.isNull(EventContactServiceImpl.stopWords)) { EventContactServiceImpl.stopWords = new TreeSet<String>(); BufferedReader r = new BufferedReader(new InputStreamReader( EventContactService.class.getClassLoader().getResourceAsStream("stopwords/words.txt"))); String nextLine;//from ww w . j a v a2 s . c o m while ((nextLine = r.readLine()) != null) { String word = nextLine.trim(); if (Validator.isNotNull(word)) { EventContactServiceImpl.stopWords.add(nextLine.trim()); } } r.close(); } // remove punctuation and stuff final CharArraySet stopSet = new CharArraySet(Version.LUCENE_35, EventContactServiceImpl.stopWords, true); TokenStream tokenStream = new StopFilter(Version.LUCENE_35, new StandardTokenizer(Version.LUCENE_35, new StringReader(words)), stopSet); StringBuilder sb = new StringBuilder(); CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class); tokenStream.reset(); while (tokenStream.incrementToken()) { String term = charTermAttribute.toString(); sb.append(term).append(" "); } return sb.toString(); }
From source file:com.lorelib.analyzer.sample.IKAnalzyerDemo.java
License:Apache License
public static void main(String[] args) { //IK?smart??//w ww . ja va 2 s . co m Analyzer analyzer = new IKAnalyzer(true); //?LuceneTokenStream TokenStream ts = null; try { ts = analyzer.tokenStream("myfield", new StringReader( "?????IKAnalyer can analysis english text too")); //??? OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class); //?? CharTermAttribute term = ts.addAttribute(CharTermAttribute.class); //?? TypeAttribute type = ts.addAttribute(TypeAttribute.class); //?TokenStream?StringReader ts.reset(); //?? while (ts.incrementToken()) { System.out.println(offset.startOffset() + " - " + offset.endOffset() + " : " + term.toString() + " | " + type.type()); } //TokenStreamStringReader ts.end(); // Perform end-of-stream operations, e.g. set the final offset. } catch (IOException e) { e.printStackTrace(); } finally { //TokenStream? if (ts != null) { try { ts.close(); } catch (IOException e) { e.printStackTrace(); } } } }
From source file:com.lou.simhasher.seg.WordsSegment.java
License:Open Source License
/** * ?//from w w w . j a va 2s . c o m * * @param str * @return */ public static List<String> getCutWords(String str) { Analyzer analyzer = new IKAnalyzer(); Reader r = new StringReader(str); TokenStream ts = analyzer.tokenStream("searchValue", r); ts.addAttribute(CharTermAttribute.class); List<String> list = new ArrayList<String>(); try { while (ts.incrementToken()) { CharTermAttribute ta = ts.getAttribute(CharTermAttribute.class); String word = ta.toString(); list.add(word); } } catch (IOException e) { logger.error("?IO" + e.getMessage()); } return list; }
From source file:com.mathworks.xzheng.advsearching.SpanQueryTest.java
License:Apache License
private void dumpSpans(SpanQuery query) throws IOException { Spans spans = query.getSpans(reader.getContext()); System.out.println(query + ":"); int numSpans = 0; TopDocs hits = searcher.search(query, 10); float[] scores = new float[2]; for (ScoreDoc sd : hits.scoreDocs) { scores[sd.doc] = sd.score;//from w w w .j a v a2 s . com } while (spans.next()) { // A numSpans++; int id = spans.doc(); Document doc = reader.document(id); // B TokenStream stream = analyzer.tokenStream("contents", // C new StringReader(doc.get("f"))); // C CharTermAttribute term = stream.addAttribute(CharTermAttribute.class); StringBuilder buffer = new StringBuilder(); buffer.append(" "); int i = 0; while (stream.incrementToken()) { // D if (i == spans.start()) { // E buffer.append("<"); // E } // E buffer.append(term.toString()); // E if (i + 1 == spans.end()) { // E buffer.append(">"); // E } // E buffer.append(" "); i++; } buffer.append("(").append(scores[id]).append(") "); System.out.println(buffer); } if (numSpans == 0) { System.out.println(" No spans"); } System.out.println(); }
From source file:com.mathworks.xzheng.analysis.AnalyzerUtils.java
License:Apache License
public static void displayTokens(TokenStream stream) throws IOException { OffsetAttribute offsetAttribute = stream.addAttribute(OffsetAttribute.class); CharTermAttribute charTermAttribute = stream.addAttribute(CharTermAttribute.class); stream.reset();// w w w. j av a 2s . co m while (stream.incrementToken()) { int startOffset = offsetAttribute.startOffset(); int endOffset = offsetAttribute.endOffset(); System.out.print("[" + charTermAttribute.toString() + "] "); //B } }
From source file:com.mathworks.xzheng.analysis.AnalyzerUtils.java
License:Apache License
public static void displayTokensWithPositions(Analyzer analyzer, String text) throws IOException { TokenStream stream = analyzer.tokenStream("contents", new StringReader(text)); CharTermAttribute term = stream.addAttribute(CharTermAttribute.class); PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class); int position = 0; while (stream.incrementToken()) { int increment = posIncr.getPositionIncrement(); if (increment > 0) { position = position + increment; System.out.println(); System.out.print(position + ": "); }/*w ww .j av a2 s. c o m*/ System.out.print("[" + term.toString() + "] "); } System.out.println(); }
From source file:com.mathworks.xzheng.analysis.AnalyzerUtils.java
License:Apache License
public static void displayTokensWithFullDetails(Analyzer analyzer, String text) throws IOException { TokenStream stream = analyzer.tokenStream("contents", // #A new StringReader(text)); CharTermAttribute term = stream.addAttribute(CharTermAttribute.class); // #B PositionIncrementAttribute posIncr = // #B stream.addAttribute(PositionIncrementAttribute.class); // #B OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class); // #B TypeAttribute type = stream.addAttribute(TypeAttribute.class); // #B int position = 0; while (stream.incrementToken()) { // #C int increment = posIncr.getPositionIncrement(); // #D if (increment > 0) { // #D position = position + increment; // #D System.out.println(); // #D System.out.print(position + ": "); // #D }//from w w w.ja v a 2 s . c om System.out.print("[" + // #E term.toString() + ":" + // #E offset.startOffset() + "->" + // #E offset.endOffset() + ":" + // #E type.type() + "] "); // #E } System.out.println(); }
From source file:com.mathworks.xzheng.analysis.AnalyzerUtils.java
License:Apache License
public static void assertAnalyzesTo(Analyzer analyzer, String input, String[] output) throws Exception { TokenStream stream = analyzer.tokenStream("field", new StringReader(input)); CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class); for (String expected : output) { Assert.assertTrue(stream.incrementToken()); Assert.assertEquals(expected, termAttr.toString()); }/*from w w w .ja v a 2s .c om*/ Assert.assertFalse(stream.incrementToken()); stream.close(); }
From source file:com.mathworks.xzheng.analysis.i18n.ChineseDemo.java
License:Apache License
private static void analyze(String string, Analyzer analyzer) throws IOException { StringBuffer buffer = new StringBuffer(); TokenStream stream = analyzer.tokenStream("contents", new StringReader(string)); CharTermAttribute term = stream.addAttribute(CharTermAttribute.class); while (stream.incrementToken()) { //C buffer.append("["); buffer.append(term.toString());// w ww .ja v a 2 s . c o m buffer.append("] "); } String output = buffer.toString(); Frame f = new Frame(); f.setTitle(analyzer.getClass().getSimpleName() + " : " + string); f.setResizable(true); Font font = new Font(null, Font.PLAIN, 36); int width = getWidth(f.getFontMetrics(font), output); f.setSize((width < 250) ? 250 : width + 50, 75); // NOTE: if Label doesn't render the Chinese characters // properly, try using javax.swing.JLabel instead Label label = new Label(output); //D label.setSize(width, 75); label.setAlignment(Label.CENTER); label.setFont(font); f.add(label); f.setVisible(true); }