List of usage examples for org.apache.lucene.analysis TokenStream reset
public void reset() throws IOException
From source file:test.analysis.AnalyzerUtils.java
License:Apache License
public static void displayTokensWithFullDetails(Analyzer analyzer, String text) throws IOException { TokenStream stream = analyzer.tokenStream("contents", // #A new StringReader(text)); CharTermAttribute term = stream.addAttribute(CharTermAttribute.class); // #B PositionIncrementAttribute posIncr = // #B stream.addAttribute(PositionIncrementAttribute.class); // #B OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class); // #B TypeAttribute type = stream.addAttribute(TypeAttribute.class); // #B stream.reset(); int position = 0; while (stream.incrementToken()) { // #C int increment = posIncr.getPositionIncrement(); // #D if (increment > 0) { // #D position = position + increment; // #D System.out.println(); // #D System.out.print(position + ": "); // #D }//w w w . j a va 2 s . c o m System.out.print("[" + // #E term + ":" + // #E offset.startOffset() + "->" + // #E offset.endOffset() + ":" + // #E type.type() + "] "); // #E } System.out.println(); stream.close(); }
From source file:test.AnalzyerDemo.java
License:Apache License
public static void main(String[] args) { Analyzer analyzer = new BaseAnalyzer(); // Analyzer analyzer = new org.apache.lucene.analysis.cjk.CJKAnalyzer(); // ?LuceneTokenStream TokenStream ts = null; try {/* w w w . j a v a 2 s. co m*/ ts = analyzer.tokenStream("myfield", new StringReader( "????????????????2?3noneok???BaseAnalyer can analysis english text too")); // ??? OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class); // ?? CharTermAttribute term = ts.addAttribute(CharTermAttribute.class); // ?? TypeAttribute type = ts.addAttribute(TypeAttribute.class); // ?TokenStream?StringReader ts.reset(); // ?? while (ts.incrementToken()) { System.out.println(offset.startOffset() + " - " + offset.endOffset() + " : " + term.toString() + " | " + type.type()); } // TokenStreamStringReader ts.end(); // Perform end-of-stream operations, e.g. set the final offset. } catch (IOException e) { e.printStackTrace(); analyzer.close(); } finally { // TokenStream? if (ts != null) { try { ts.close(); } catch (IOException e) { e.printStackTrace(); } } } }
From source file:TesterClasses.TestAnalyzer.java
public static List tokenizeString(Analyzer analyzer, String str) { List result = new ArrayList<>(); try {//from w ww .j av a2 s. c o m TokenStream stream = analyzer.tokenStream(null, new StringReader(str)); stream.reset(); while (stream.incrementToken()) { result.add(stream.getAttribute(CharTermAttribute.class).toString()); } } catch (IOException e) { // not thrown b/c we're using a string reader... throw new RuntimeException(e); } return result; }
From source file:tfidf.TestTfIDF.java
License:CDDL License
public static ArrayList<String> cutWords(String line) throws IOException { ArrayList<String> words = new ArrayList<String>(); // String text = ReadFiles.readFile(file); IKAnalyzer analyzer = new IKAnalyzer(); TokenStream tokenStream = analyzer.tokenStream("", new StringReader(line)); tokenStream.reset(); while (tokenStream.incrementToken()) { CharTermAttribute termAttribute = tokenStream.getAttribute(CharTermAttribute.class); // System.out.println(termAttribute.toString()+"\t"+i); words.add(termAttribute.toString()); }/* www . j a v a 2 s . c om*/ return words; }
From source file:tw.com.kyle.luminance.LumPositionMap.java
public static LumPositionMap Get(String raw_text) throws IOException { StandardAnalyzer analyzer = new StandardAnalyzer(); TokenStream tstream = analyzer.tokenStream("", raw_text); CharTermAttribute termAttr = tstream.getAttribute(CharTermAttribute.class); OffsetAttribute offAttr = tstream.getAttribute(OffsetAttribute.class); // PositionIncrementAttribute posIncAttr = tstream.getAttribute(PositionIncrementAttribute.class); // PositionLengthAttribute posLenAttr = tstream.getAttribute(PositionLengthAttribute.class); List<String> tokens = new ArrayList<>(); List<Integer> pos_list = new ArrayList<>(); int pos_counter = 0; tstream.reset(); while (tstream.incrementToken()) { tokens.add(termAttr.toString()); pos_list.add(offAttr.startOffset()); }/*from w w w . ja va 2 s . c o m*/ return new LumPositionMap(tokens, pos_list); }
From source file:tw.com.kyle.luminance.LumWindow.java
public List<LumRange> BuildLumRange(long annot_uuid) throws IOException { Document adoc = lum_annot.GetAnnotDocument(annot_uuid); if (adoc == null) { return new ArrayList<>(); }//www. j a v a 2s . c o m int doc_id = lum_reader.getDocId(adoc); TokenStream tokenStream = lum_reader.GetTokenStream(doc_id, "anno"); if (tokenStream == null) { return null; } OffsetAttribute offAttr = tokenStream.getAttribute(OffsetAttribute.class); CharTermAttribute chAttr = tokenStream.getAttribute(CharTermAttribute.class); tokenStream.reset(); List<LumRange> lr_list = new ArrayList<>(); while (tokenStream.incrementToken()) { LumRange lr = new LumRange(); lr.data = chAttr.toString(); lr.start_off = offAttr.startOffset(); lr.end_off = offAttr.endOffset(); lr_list.add(lr); } return lr_list; }
From source file:tw.com.kyle.luminance.LumWindow.java
/**
 * Builds parallel position/offset lists for the token stream of the given
 * document field: for each token, its cumulative position (sum of position
 * increments so far) and its start character offset.
 *
 * @param doc_id document to read
 * @param field  field whose token stream is mapped
 * @return the mappings, or {@code null} if no token stream is available
 * @throws IOException if reading the token stream fails
 */
private Mappings prepare_mappings(int doc_id, String field) throws IOException {
    List<Integer> pos_list = new ArrayList<>();
    List<Integer> off_list = new ArrayList<>();
    TokenStream tokenStream = lum_reader.GetTokenStream(doc_id, field);
    if (tokenStream == null) {
        return null;
    }
    try {
        OffsetAttribute offsetAttr = tokenStream.getAttribute(OffsetAttribute.class);
        PositionIncrementAttribute posincAttr = tokenStream.getAttribute(PositionIncrementAttribute.class);
        tokenStream.reset();
        int pos_counter = 0;
        while (tokenStream.incrementToken()) {
            // record the position BEFORE adding this token's increment,
            // matching the original accumulation order
            pos_list.add(pos_counter);
            off_list.add(offsetAttr.startOffset());
            pos_counter += posincAttr.getPositionIncrement();
        }
        tokenStream.end(); // fix: complete the stream lifecycle
    } finally {
        tokenStream.close(); // fix: the stream was never released in the original
    }
    Mappings mappings = new Mappings();
    mappings.off_list = off_list;
    mappings.pos_list = pos_list;
    return mappings;
}
From source file:tweetembeding.AnalyzerClass.java
/**
 * Analyzes {@code txt} and returns the space-separated analyzed terms,
 * skipping the literal term "nbsp" (HTML non-breaking-space residue).
 *
 * @param FIELD field name handed to the analyzer
 * @param txt   text to analyze
 * @return space-separated terms (with a trailing space when non-empty)
 * @throws IOException if tokenization fails
 */
public String analizeString(String FIELD, String txt) throws IOException {
    this.analyzer = setAnalyzer();
    // fix: StringBuilder — no synchronization needed for a local buffer
    StringBuilder tokenizedContentBuff = new StringBuilder();
    // fix: try-with-resources — end()/close() were skipped on exception before
    try (TokenStream stream = analyzer.tokenStream(FIELD, new StringReader(txt))) {
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            String term = termAtt.toString();
            if (!term.equals("nbsp")) {
                tokenizedContentBuff.append(term).append(" ");
            }
        }
        stream.end();
    }
    return tokenizedContentBuff.toString();
}
From source file:ucas.IKAnalzyerDemo.java
License:Apache License
public static String Spilt2Words(String content) { String resString = ""; //IK?smart??/*from www . j a v a 2 s. co m*/ Analyzer analyzer = new IKAnalyzer(true); //?LuceneTokenStream TokenStream ts = null; try { //myfield?? ts = analyzer.tokenStream("myfield", new StringReader(content)); //?? CharTermAttribute term = ts.addAttribute(CharTermAttribute.class); //?TokenStream?StringReader ts.reset(); //?? while (ts.incrementToken()) { resString += term.toString() + "|"; } //TokenStreamStringReader ts.end(); // Perform end-of-stream operations, e.g. set the final offset. } catch (IOException e) { e.printStackTrace(); } finally { //TokenStream? if (ts != null) { try { ts.close(); } catch (IOException e) { e.printStackTrace(); } } } return resString; }
From source file:uib.scratch.AnalyzerUtils.java
public static Token insertB(Analyzer analyzer, String text) throws IOException { TokenStream stream = analyzer.tokenStream("contents", new StringReader(text)); TermAttribute term = stream.addAttribute(TermAttribute.class); PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class); OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class); StringBuilder currenttoken = new StringBuilder(64); // currenttoken.append('['); char[] character = new char[1]; int i = posIncr.getPositionIncrement(); // reset our states :) //posIncr/*from w ww . ja v a2s .c o m*/ boolean tokenstart = false; boolean tokenend = false; stream.reset(); while (stream.incrementToken()) { /* end of stream reached ... if (i == 0) return null; if (character[0] == '[') { // token starts here ... tokenstart = true; } else if (character[0] == ']') { // token ends here ... tokenend = true; } else if (tokenstart && !tokenend) { // between end and start ... currenttoken.append(character[0]); } // we found our token and return it ... if (tokenstart && tokenend) { // currenttoken.append(']'); // prepend a token because lucene does not allow leading wildcards. //currenttoken.insert(0, '_');*/ //String tokenString = currenttoken.toString().toLowerCase().replace(' ', '_').trim(); String tokenString = term.toString(); Token t = new Token(tokenString, 0, tokenString.length() - 1); System.out.println(t); //return t; } return null; }