List of usage examples for org.apache.lucene.analysis TokenStream addAttribute
public final <T extends Attribute> T addAttribute(Class<T> attClass)
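addAttribute registers the requested attribute with the stream if it is not already present and returns the stream's single shared instance; calling it again with the same class on the same stream returns the same object. The attribute is therefore fetched once, before consumption, and read inside the increment loop. A minimal sketch of the full consume cycle, assuming a recent Lucene version in which StandardAnalyzer has a no-argument constructor:

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class AddAttributeDemo {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        try (TokenStream stream = analyzer.tokenStream("field", new StringReader("some text to tokenize"))) {
            // addAttribute registers the attribute if absent and always returns
            // the stream's shared instance, so fetch it once before consuming.
            CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
            stream.reset(); // mandatory before the first incrementToken()
            while (stream.incrementToken()) {
                System.out.println(termAtt.toString()); // attribute state is updated in place per token
            }
            stream.end(); // records end-of-stream state such as the final offset
        } // try-with-resources closes the stream
    }
}

The examples below repeat this reset/increment/end/close pattern against different analyzers and attribute types.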
From source file:fr.inrialpes.exmo.ontosim.VectorSpaceMeasure.java
License:Open Source License
/**
 * Adds all words contained in toAnalyse to the words collection. Words are stemmed.
 * @param toAnalyse the string to be analysed
 * @param words the collection to which extracted words are added
 */
protected void analyseString(String toAnalyse, Collection<String> words) {
    TokenStream tokenS = analyzer.tokenStream("", new StringReader(toAnalyse));
    // TermAttribute is the pre-4.0 API; later versions use CharTermAttribute
    TermAttribute termAtt = tokenS.addAttribute(TermAttribute.class);
    try {
        tokenS.reset();
        while (tokenS.incrementToken()) {
            words.add(termAtt.term());
        }
        tokenS.end();
        tokenS.close();
    } catch (IOException e) {
        e.printStackTrace();
    }
}
From source file:gr.aueb.demo.PropertyRegistryBean.java
public static String removeStopWords(String textFile) {
    // CharArraySet stopWords = EnglishAnalyzer.getDefaultStopSet();
    CharArraySet stopWords = PropertyRegistryBean.stopSet;
    TokenStream tokenStream = new StandardTokenizer(Version.LUCENE_48, new StringReader(textFile.trim()));
    tokenStream = new StopFilter(Version.LUCENE_48, tokenStream, stopWords);
    StringBuilder sb = new StringBuilder();
    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
    try {
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            sb.append(charTermAttribute.toString()).append(' ');
        }
        tokenStream.end();
        tokenStream.close();
    } catch (IOException ex) {
        Logger.getLogger(PropertyRegistryBean.class.getName()).log(Level.SEVERE, null, ex);
    }
    return sb.toString();
}
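A possible invocation of the helper above, assuming PropertyRegistryBean.stopSet has been initialized with the usual English stop words:

String cleaned = PropertyRegistryBean.removeStopWords("the quick brown fox");
// cleaned == "quick brown fox " (each kept term is followed by a space)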
From source file:ikanalyzer.IKAnalzyerDemo.java
License:Apache License
public static void main(String[] args) {
    // Build the IK analyzer in smart segmentation mode
    Analyzer analyzer = new IKAnalyzer(true);
    // Obtain a Lucene TokenStream from the analyzer
    TokenStream ts = null;
    try {
        // Mixed Chinese/English sample text (the Chinese characters are garbled in this listing)
        ts = analyzer.tokenStream("myfield",
                new StringReader("?????IKAnalyer can analysis english text too"));
        // Offset attribute (start/end character positions)
        OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
        // Term text attribute
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        // Token type attribute
        TypeAttribute type = ts.addAttribute(TypeAttribute.class);
        // Reset the TokenStream (and the underlying StringReader)
        ts.reset();
        // Iterate over the tokens
        while (ts.incrementToken()) {
            System.out.println(offset.startOffset() + " - " + offset.endOffset() + " : " + term.toString()
                    + " | " + type.type());
        }
        // Perform end-of-stream operations, e.g. set the final offset
        ts.end();
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // Release the TokenStream's resources (also closes the StringReader)
        if (ts != null) {
            try {
                ts.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
From source file:in.geocoder.component.GeocodingComponent.java
License:Apache License
private List<String> tokenize(String query, Analyzer analyzer) throws IOException {
    TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(query));
    CharTermAttribute charTermAttr = tokenStream.addAttribute(CharTermAttribute.class);
    List<String> tokens = new ArrayList<String>();
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        tokens.add(charTermAttr.toString());
    }
    tokenStream.end();
    tokenStream.close();
    return tokens;
}
From source file:indexer.IndexPrinter.java
String getAnalyzedContent(String content) throws IOException {
    StringBuffer tokenizedContentBuff = new StringBuffer();
    Analyzer analyzer = new StandardAnalyzer();
    TokenStream stream = analyzer.tokenStream(TextDocIndexer.FIELD_ANALYZED_CONTENT,
            new StringReader(content));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        String term = termAtt.toString();
        tokenizedContentBuff.append(term).append(" ");
    }
    stream.end();
    stream.close();
    tokenizedContentBuff.append("\n");
    return tokenizedContentBuff.toString();
}
From source file:indexer.LineDocumentIndexer.java
Document constructDoc(FileWriter fw, String id, String line) throws Exception {
    Document doc = new Document();
    doc.add(new Field(DocVector.FIELD_ID, id, Field.Store.YES, Field.Index.NOT_ANALYZED));
    StringBuffer tokenizedContentBuff = new StringBuffer();
    TokenStream stream = analyzer.tokenStream(FIELD_WORDS, new StringReader(line));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        String term = termAtt.toString().toLowerCase();
        tokenizedContentBuff.append(term).append(" ");
    }
    stream.end();
    stream.close();
    tokenizedContentBuff.append("\n");
    fw.write(id + "\t" + tokenizedContentBuff.toString());
    // Store the raw line so it is reanalyzed at index time
    doc.add(new Field(FIELD_WORDS, line, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.YES));
    return doc;
}
From source file:indexer.Paragraph.java
List<Paragraph> constructParagraphs(int docId, String content) throws Exception {
    List<Paragraph> parList = new ArrayList<>();
    List<String> tokens = new ArrayList<>();
    TokenStream stream = analyzer.tokenStream("dummy", new StringReader(content));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    int count = 0;
    int id = 0;
    while (stream.incrementToken()) {
        tokens.add(termAtt.toString());
        count++;
        if (count == paraWindowSize) {
            // Complete a paragraph every paraWindowSize tokens
            Paragraph p = new Paragraph(docId + "_" + String.valueOf(id++), tokens);
            tokens.clear();
            count = 0;
            parList.add(p);
        }
    }
    if (count > 0) {
        // Flush the trailing partial paragraph
        Paragraph p = new Paragraph(docId + "_" + String.valueOf(id++), tokens);
        parList.add(p);
    }
    stream.end();
    stream.close();
    return parList;
}
From source file:indexer.WordVecSequenceFileGenerator.java
String embedWords(Document d) throws Exception {
    String content = d.get(AMI_FIELDS.FIELD_CONTENT);
    int decScore = Integer.parseInt(d.get(AMI_FIELDS.FIELD_DECISION_SCORE)) > 0 ? 1 : 0;
    int prefScore = Integer.parseInt(d.get(AMI_FIELDS.FIELD_PREF_SCORE)) > 0 ? 1 : 0;
    TokenStream stream = analyzer.tokenStream("dummy", new StringReader(content));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    StringBuffer buff = new StringBuffer();
    boolean labelsStoredWithWords = Boolean.parseBoolean(prop.getProperty("word.labels", "false"));
    while (stream.incrementToken()) {
        String term = termAtt.toString().toLowerCase();
        String[] wordAndLabel = null;
        if (labelsStoredWithWords) {
            wordAndLabel = term.split("\\" + AMIIndexer.WORD_LABEL_DELIM);
            term = wordAndLabel[0]; // the first part is the word
            decScore = Integer.parseInt(wordAndLabel[1]);
            prefScore = Integer.parseInt(wordAndLabel[2]);
        }
        double[] x = wvecs.getWordVector(term);
        if (x == null) {
            System.err.println("No vec found for word " + term);
            continue;
        }
        String wvec = vecToStr(x);
        if (decScore > 1)
            decScore = 1;
        if (prefScore > 1)
            prefScore = 1;
        buff.append(wvec).append("\t").append(decScore).append("\t").append(prefScore).append("\n");
    }
    stream.end();
    stream.close();
    return buff.toString();
}
From source file:indexing.ReviewTextAnalyzer.java
License:Open Source License
/**
 * @param args
 */
public static void main(String[] args) {
    ReviewTextAnalyzer r = new ReviewTextAnalyzer(new ReviewDocumentIndexer());
    String[] filenames = { "review.txt" };
    for (String filename : filenames) {
        try {
            TokenStream tokstr = r.reusableTokenStream(null, new FileReader(filename));
            TermAttribute output_term = tokstr.addAttribute(TermAttribute.class);
            TypeAttribute output_type = tokstr.addAttribute(TypeAttribute.class);
            FlagsAttribute output_flags = tokstr.addAttribute(FlagsAttribute.class);
            PayloadAttribute output_payload = tokstr.addAttribute(PayloadAttribute.class);

            int review_id = r.indexer.theReviewId.get() + 1;
            r.indexer.theReviewId.set(review_id);
            r.indexer.theStats.setCurrent(review_id, 10);

            while (tokstr.incrementToken()) {
                Token current_token = new Token(output_term.term(), output_type.type(),
                        output_flags.getFlags(), new ReviewTermPayload(output_payload.getPayload()));
                System.out.print(current_token);
                if (current_token.isDelim(false)) {
                    System.out.println();
                }
                if (current_token.isDelim(true)) {
                    System.out.println("..................................................................\n");
                }
            }
            System.out.println();
        } catch (IOException e) {
            e.printStackTrace();
        }
        System.out.println(
                "\n\n\n\n\n\n\n\n==================================================================\n\n\n\n\n\n\n\n");
    }
}
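This example targets the pre-4.0 API (reusableTokenStream, TermAttribute, and a Payload-backed PayloadAttribute). A sketch of the same multi-attribute inspection against the Lucene 4+ API, where term text comes from CharTermAttribute and payloads are plain BytesRef values; the field name and StandardAnalyzer are placeholders, and flags and payloads only carry data if the analysis chain actually sets them:

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.BytesRef;

static void printTokens(Analyzer analyzer, String text) throws IOException {
    try (TokenStream ts = analyzer.tokenStream("review", new StringReader(text))) {
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        TypeAttribute type = ts.addAttribute(TypeAttribute.class);
        FlagsAttribute flags = ts.addAttribute(FlagsAttribute.class);
        PayloadAttribute payload = ts.addAttribute(PayloadAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            BytesRef p = payload.getPayload(); // null unless a filter in the chain set one
            System.out.println(term.toString() + " | " + type.type() + " | " + flags.getFlags()
                    + " | " + (p == null ? "no payload" : p.toString()));
        }
        ts.end();
    }
}

// Example call, e.g. printTokens(new StandardAnalyzer(), "some review text");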
From source file:info.johtani.elasticsearch.action.admin.indices.extended.analyze.TransportExtendedAnalyzeAction.java
License:Apache License
private List<ExtendedAnalyzeResponse.ExtendedAnalyzeToken> processAnalysis(TokenStream stream,
        Set<String> includeAttributes, boolean shortAttrName, int lastPosition, int lastOffset)
        throws IOException {
    List<ExtendedAnalyzeResponse.ExtendedAnalyzeToken> tokens = new ArrayList<>();
    stream.reset();

    // Attributes read for each emitted token
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
    OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
    TypeAttribute type = stream.addAttribute(TypeAttribute.class);

    while (stream.incrementToken()) {
        int increment = posIncr.getPositionIncrement();
        if (increment > 0) {
            lastPosition = lastPosition + increment;
        }
        tokens.add(new ExtendedAnalyzeResponse.ExtendedAnalyzeToken(term.toString(), lastPosition,
                lastOffset + offset.startOffset(), lastOffset + offset.endOffset(), type.type(),
                extractExtendedAttributes(stream, includeAttributes, shortAttrName)));
    }
    stream.end();
    return tokens;
}