NewsIR_search.TRECQuery.java Source code

Java tutorial

Introduction

Here is the source code for NewsIR_search.TRECQuery.java

Source

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package NewsIR_search;

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.StringReader;
import java.util.LinkedList;
import java.util.List;
import java.util.Properties;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 *
 * @author dwaipayan
 */
class TRECQuery extends AnalyzerClass {
    Properties prop;
    public String qId;
    public String qTitle;
    public String qDesc;
    public String qNarr;
    //    public Query        luceneQuery;
    public List<TRECQuery> queries;

    TRECQuery(Properties prop) throws IOException {
        this.prop = prop;
        this.qId = "";
        this.qTitle = "";
        this.qDesc = "";
        this.qNarr = "";
        queries = new LinkedList<>();
        setAnalyzer();
    }

    public TRECQuery(Properties prop, String qId, String qTitle, String qDesc, String qNarr) throws IOException {
        this.prop = prop;
        this.qId = qId;
        this.qTitle = qTitle;
        this.qDesc = qDesc;
        this.qNarr = qNarr;
        queries = new LinkedList<>();
        setAnalyzer();
    }

    public TRECQuery(String qId, String qTitle, String qDesc, String qNarr) throws IOException {
        this.qId = qId;
        this.qTitle = qTitle;
        this.qDesc = qDesc;
        this.qNarr = qNarr;
        queries = new LinkedList<>();
        setAnalyzer();
    }

    TRECQuery() throws IOException {
        this.qId = "";
        this.qTitle = "";
        this.qDesc = "";
        this.qNarr = "";
        queries = new LinkedList<>();
        setAnalyzer();
    }

    /**
     * Returns the content of the 'queryField' from the query text
     * @param analyzer
     * @param queryField
     * @return (String) The content of the field
     * @throws Exception 
     */
    public String queryFieldAnalyze(Analyzer analyzer, String queryField) throws Exception {
        StringBuffer buff = new StringBuffer();
        TokenStream stream = analyzer.tokenStream(CumulativeIndexer.FIELD_TEXT, new StringReader(queryField));
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            String term = termAtt.toString();
            term = term.toLowerCase();
            buff.append(term).append(" ");
        }
        stream.end();
        stream.close();
        return buff.toString();
    }

    /**
     * Sets the List-TrecQUERY named 'queries'.
     * <p/>
     * The queries are read from the file (path in prop.queryPath).
     * <p/>
    */
    void trecQueryParse() throws FileNotFoundException, IOException {
        GetProjetBaseDirAndSetProjectPropFile setPropFile = new GetProjetBaseDirAndSetProjectPropFile();
        prop = setPropFile.prop;

        FileReader fr = new FileReader(prop.getProperty("query.file"));
        BufferedReader br = new BufferedReader(fr);
        String line;
        StringBuffer txtbuff = new StringBuffer();

        while ((line = br.readLine()) != null)
            txtbuff.append(line).append("\n");
        String content = txtbuff.toString();

        org.jsoup.nodes.Document jdoc = Jsoup.parse(content);
        Elements qryElts = jdoc.select("top");

        //        String tags[] = {"num", "title", "desc", "narr"};
        TRECQuery query;
        for (Element qryElt : qryElts) {
            query = new TRECQuery(prop, qryElt.select("num").first().text(), qryElt.select("title").first().text(),
                    qryElt.select("desc").first().text(), qryElt.select("narr").first().text());
            queries.add(query);
            // System.out.println("Query num "+qryElt.select("num").first().text()+"  query title="+qryElt.select("title").first().text());
        }
    }

    /**
     * Returns the BOW representation of query as BooleanQuery;
     * takes into account the fields of the query as stated in prop.queryField.
     * 
     * <p/>
     * NOTE: if queryField is not mentioned in the properties
     * 'title' of the query is always taken as the query
     * <p/>
    */
    Query getBOWQuery(Analyzer analyzer) throws Exception {
        String[] queryFields = prop.getProperty("QueryFields2Search").split(",");

        BooleanQuery booleanQuery = new BooleanQuery();

        String[] titleTerms = queryFieldAnalyze(analyzer, qTitle).split("\\s+");
        for (String term : titleTerms) {
            Term t = new Term(CumulativeIndexer.FIELD_TEXT, term);
            Query tq = new TermQuery(t);
            booleanQuery.add(tq, BooleanClause.Occur.MUST); // previous value was SHOULD
        }

        if (queryFields != null) {
            for (int i = 0; i < queryFields.length; i++) {
                if (queryFields[i].toLowerCase().equalsIgnoreCase("desc")) {
                    System.err.println("Processing Desc");
                    //* If description to be taken in the query
                    String[] descTerms = queryFieldAnalyze(analyzer, qDesc).split("\\s+");
                    for (String term : descTerms) {
                        Term t = new Term(CumulativeIndexer.FIELD_TEXT, term);
                        Query tq = new TermQuery(t);
                        booleanQuery.add(tq, BooleanClause.Occur.MUST); // previous value was SHOULD
                    }
                }

                if (queryFields[i].toLowerCase().equalsIgnoreCase("narr")) {
                    System.err.println("Processing Narr");
                    //* If narration to be taken in the query
                    String[] narrTerms = queryFieldAnalyze(analyzer, qNarr).split("\\s+");
                    for (String term : narrTerms) {
                        Term t = new Term(CumulativeIndexer.FIELD_TEXT, term);
                        Query tq = new TermQuery(t);
                        booleanQuery.add(tq, BooleanClause.Occur.MUST); // previous value was SHOULD
                    }
                }
            }
        }
        System.err.println("query length=" + booleanQuery.toString().length() + " " + "query= "
                + booleanQuery.toString().replaceAll("TEXT:", ""));

        return booleanQuery;
    }

    public static void main(String args[]) throws IOException, Exception {

        TRECQuery trecquery = new TRECQuery();
        trecquery.trecQueryParse();
        //setAnalyzer();
        trecquery.queryFieldAnalyze(trecquery.analyzer, "title");

        String atq[];
        for (int i = 0; i < (int) trecquery.queries.size(); i++) {
            String analyzedQuery = trecquery.queryFieldAnalyze(trecquery.analyzer, trecquery.queries.get(i).qTitle);
            atq = analyzedQuery.split(" ");
            for (int j = 0; j < atq.length; j++)
                System.out.print(" " + atq[j]);
            System.out.println();
        }

    }
}