Example usage for org.apache.lucene.analysis TokenStream addAttribute

List of usage examples for org.apache.lucene.analysis TokenStream addAttribute

Introduction

In this page you can find the example usage for org.apache.lucene.analysis TokenStream addAttribute.

Prototype

public final <T extends Attribute> T addAttribute(Class<T> attClass) 

Source Link

Document

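The caller must pass in a Class<? extends Attribute> value. This method first checks whether an instance of that class is already present in this AttributeSource and returns it; otherwise a new instance is created, added to this AttributeSource, and returned.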

Usage

From source file:webdocs.WebDocAnalyzer.java

/**
 * Tokenizes {@code html} with a {@code WebDocAnalyzer} and returns a
 * space-separated bag of words in which every retained term is repeated as
 * many times as it occurred in the input.
 *
 * <p>Terms whose frequency is below the cutoff are dropped. For titles the
 * cutoff is 1, so every term is kept; otherwise the instance-level
 * {@code freqCutoffThreshold} is used. Term order in the result follows the
 * iteration order of the internal hash map and carries no meaning.
 *
 * @param html  the text to analyze
 * @param title {@code true} to keep every term regardless of frequency
 * @return the retained terms, each followed by a single space
 * @throws IOException if the analysis chain fails while reading the text
 */
String preprocessText(String html, boolean title) throws IOException {

    int freqCutoffThreshold = title ? 1 : this.freqCutoffThreshold;

    Map<String, Integer> tfMap = new HashMap<>();
    CharArraySet stopList = StopFilter.makeStopSet(Version.LUCENE_4_9, indexer.buildStopwordList("stopfile"));

    // Both the analyzer and the stream are Closeable; try-with-resources
    // releases them even when tokenization throws part-way through.
    try (Analyzer webdocAnalyzer = new WebDocAnalyzer(indexer.getProperties(), stopList);
            TokenStream stream = webdocAnalyzer.tokenStream("field", new StringReader(html))) {
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);

        stream.reset();
        while (stream.incrementToken()) {
            String token = termAtt.toString();
            Integer tf = tfMap.get(token);
            tfMap.put(token, tf == null ? 1 : tf + 1);
        }
        // end() must be called after the last incrementToken() and before close().
        stream.end();
    }

    StringBuilder buff = new StringBuilder();
    for (Map.Entry<String, Integer> e : tfMap.entrySet()) {
        String word = e.getKey();
        int tf = e.getValue();
        if (tf >= freqCutoffThreshold) {
            // Emit the word tf times; word order doesn't matter.
            for (int i = 0; i < tf; i++) {
                buff.append(word).append(" ");
            }
        }
    }
    return buff.toString();
}

From source file:wt10g.WTDocument.java

/**
 * Runs {@code text} through the instance's {@code analyzer} and returns the
 * resulting terms, lower-cased and joined by single spaces (with a trailing
 * space after the last term).
 *
 * @param text the raw text to tokenize
 * @return the lower-cased terms, each followed by a single space
 * @throws Exception if the analysis chain fails
 */
String preProcess(String text) throws Exception {

    StringBuilder tokenizedContentBuff = new StringBuilder();

    // try-with-resources closes the stream even if tokenization throws.
    try (TokenStream stream = analyzer.tokenStream("dummy", new StringReader(text))) {
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        stream.reset();

        while (stream.incrementToken()) {
            // Locale.ROOT keeps lower-casing independent of the JVM default
            // locale (e.g. the Turkish dotless-i mapping).
            String term = termAtt.toString().toLowerCase(java.util.Locale.ROOT);
            tokenizedContentBuff.append(term).append(" ");
        }

        // end() must be called after the last incrementToken() and before close().
        stream.end();
    }
    return tokenizedContentBuff.toString();
}

From source file:yasoco.TermScore.java

/**
 * Builds a disjunctive (all clauses {@code SHOULD}) boolean query from the
 * stored fields of the document with the given id.
 *
 * <p>Fields named {@code JavaSCTree.FIELD_DOCNAME} or
 * {@code JavaSCTree.FIELD_SC} are skipped. When the {@code toptermquery}
 * property is {@code true} (the default), only the terms returned by
 * {@code selTerms} are added for each field; otherwise every token produced
 * by analyzing the field is added, and the global
 * {@code BooleanQuery} max-clause limit is doubled whenever the running
 * term count reaches {@code maxlimit}.
 *
 * @param docId id of the document to turn into a query
 * @return the constructed boolean query
 * @throws Exception if reading the document or analyzing a field fails
 */
Query constructQuery(int docId) throws Exception {
    boolean formSelectiveQueries = Boolean.parseBoolean(prop.getProperty("toptermquery", "true"));
    // NOTE: MoreLikeThis (mlt.like(docId)) was tried for the selective case
    // but was not working, so terms are selected via selTerms instead.

    Document queryDoc = reader.document(docId);
    BooleanQuery query = new BooleanQuery();
    int termCount = 0;

    for (IndexableField field : queryDoc.getFields()) {
        String fieldName = field.name();
        if (fieldName.equals(JavaSCTree.FIELD_DOCNAME) || fieldName.equals(JavaSCTree.FIELD_SC)) {
            continue; // ignore non-searchable fields
        }

        if (formSelectiveQueries) {
            List<TermScore> topList = selTerms(docId, fieldName, query);
            for (TermScore ts : topList) {
                query.add(new TermQuery(new Term(fieldName, ts.term)), BooleanClause.Occur.SHOULD);
            }
            continue;
        }

        // Tokenize the field instance being iterated. Looking it up again by
        // name would always return the first value of a multi-valued field.
        // try-with-resources closes the stream even if tokenization throws.
        try (TokenStream fs = field.tokenStream(analyzer)) {
            CharTermAttribute termAtt = fs.addAttribute(CharTermAttribute.class);
            fs.reset();

            // Add every token until the stream is exhausted.
            while (fs.incrementToken()) {
                Term thisTerm = new Term(fieldName, termAtt.toString());
                termCount++;
                if (termCount == maxlimit) {
                    // Grow the global clause limit before it would be exceeded.
                    maxlimit = maxlimit << 1;
                    BooleanQuery.setMaxClauseCount(maxlimit);
                }
                query.add(new TermQuery(thisTerm), BooleanClause.Occur.SHOULD);
            }
            // end() must be called after the last incrementToken() and before close().
            fs.end();
        }
    }
    return query;
}