List of usage examples for org.apache.lucene.analysis TokenStream addAttribute
public final <T extends Attribute> T addAttribute(Class<T> attClass)
From source file: webdocs.WebDocAnalyzer.java
String preprocessText(String html, boolean title) throws IOException { int freqCutoffThreshold = title ? 1 : this.freqCutoffThreshold; HashMap<String, Integer> tfMap = new HashMap<>(); StringBuffer buff = new StringBuffer(); CharArraySet stopList = StopFilter.makeStopSet(Version.LUCENE_4_9, indexer.buildStopwordList("stopfile")); Analyzer webdocAnalyzer = new WebDocAnalyzer(indexer.getProperties(), stopList); TokenStream stream = webdocAnalyzer.tokenStream("field", new StringReader(html)); CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class); stream.reset();/*from w w w .ja v a 2 s. c om*/ while (stream.incrementToken()) { String token = termAtt.toString(); Integer tf = tfMap.get(token); if (tf == null) { tf = new Integer(0); } tf++; tfMap.put(token, tf); } stream.end(); stream.close(); for (Map.Entry<String, Integer> e : tfMap.entrySet()) { String word = e.getKey(); int tf = e.getValue(); if (tf >= freqCutoffThreshold) { for (int i = 0; i < tf; i++) { // print this word tf times... word order doesn't matter! buff.append(word).append(" "); } } } return buff.toString(); }
From source file: wt10g.WTDocument.java
/**
 * Runs the configured analyzer over the given text and returns the
 * lower-cased tokens joined by single spaces (with a trailing space when
 * at least one token is produced).
 *
 * @param text raw document text to tokenize
 * @return space-separated, lower-cased analyzed terms
 * @throws Exception if tokenization fails
 */
String preProcess(String text) throws Exception {
    // StringBuilder: single-threaded use, no need for StringBuffer's locking.
    StringBuilder tokenizedContentBuff = new StringBuilder();
    // try-with-resources closes the stream even if incrementToken() throws
    // (the original leaked it on that path).
    try (TokenStream stream = analyzer.tokenStream("dummy", new StringReader(text))) {
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            // NOTE(review): toLowerCase() is default-locale dependent; the
            // original behaves the same way, so this is preserved — consider
            // toLowerCase(Locale.ROOT) if locale-independence is wanted.
            tokenizedContentBuff.append(termAtt.toString().toLowerCase()).append(" ");
        }
        stream.end();
    }
    return tokenizedContentBuff.toString();
}
From source file: yasoco.TermScore.java
Query constructQuery(int docId) throws Exception { Query q = null;/*from www . ja v a 2s .co m*/ boolean formSelectiveQueries = Boolean.parseBoolean(prop.getProperty("toptermquery", "true")); /* MoreLikeThis not woking for some reason! if (formSelectiveQueries) { q = mlt.like(docId); return q; } */ Document queryDoc = reader.document(docId); q = new BooleanQuery(); int termCount = 0; TokenStream fs = null; List<IndexableField> fields = queryDoc.getFields(); for (IndexableField field : fields) { String fieldName = field.name(); if (fieldName.equals(JavaSCTree.FIELD_DOCNAME) || fieldName.equals(JavaSCTree.FIELD_SC)) continue; // ignore non-searchable fields if (formSelectiveQueries) { List<TermScore> topList = selTerms(docId, field.name(), q); for (TermScore ts : topList) { Term thisTerm = new Term(field.name(), ts.term); ((BooleanQuery) q).add(new TermQuery(thisTerm), BooleanClause.Occur.SHOULD); } } else { fs = queryDoc.getField(fieldName).tokenStream(analyzer); CharTermAttribute termAtt = fs.addAttribute(CharTermAttribute.class); fs.reset(); // print all tokens until stream is exhausted while (fs.incrementToken()) { Term thisTerm = new Term(field.name(), termAtt.toString()); termCount++; if (termCount == maxlimit) { maxlimit = maxlimit << 1; BooleanQuery.setMaxClauseCount(maxlimit); } ((BooleanQuery) q).add(new TermQuery(thisTerm), BooleanClause.Occur.SHOULD); } fs.end(); fs.close(); } } return q; }