com.bindez.nlp.extract.ngram.term_frequency.TermFrequency.java Source code

Java tutorial

Introduction

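Here is the source code for com.bindez.nlp.extract.ngram.term_frequency.TermFrequency.java, a SolrJ-based class that indexes sample documents and looks up how often n-gram terms occur in the "content" field.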

Source

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package com.bindez.nlp.extract.ngram.term_frequency;

import com.bindez.nlp.extract.ngram.NgramGenerator;
import com.bindez.nlp.extract.tokenizers.Word;
import com.bindez.nlp.extract.tokenizers.utils.WordCountComparator;
import com.bindez.nlp.extract.tokenizers.utils.WordEndPositionComparator;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.HttpSolrServer;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.client.solrj.response.TermsResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrInputDocument;

/**
 *
 * @author ahk , yewint
 */
public class TermFrequency {

    /**
     * @param args the command line arguments
     */
    private HttpSolrServer server;

    public TermFrequency() {
        server = new SolrServer().getSolrServer();
    }

    public static void main(String[] args) throws SolrServerException, IOException {
        // TODO code application logic here

        List<Word> words = NgramGenerator
                .getPosition("", 7, 2);
        Set<String> input = new TreeSet<String>();
        Collections.sort(words, new WordEndPositionComparator());

        Iterator<Word> itr = words.iterator();
        while (itr.hasNext()) {
            Word wt = itr.next();
            input.add(wt.getText());
            //System.out.println(wt.getText() + " : " + wt.getStart() +"-"+wt.getEnd());
        }

        new TermFrequency().index();

        List<Word> temp = new TermFrequency().query(input);
        Collections.sort(temp, new WordCountComparator("desc"));
        for (Word temp1 : temp)
            System.out.println(temp1.getText() + "\t" + temp1.getCount());

    }

    public void index() throws SolrServerException, IOException {
        server = new SolrServer().getSolrServer();

        SolrInputDocument doc1 = new SolrInputDocument();
        doc1.addField("id", "");
        doc1.addField("content", "");
        // doc1.addField("category", "Political");
        // doc1.addField("url", "www.google.com");

        SolrInputDocument doc2 = new SolrInputDocument();
        doc2.addField("id", "?");
        doc2.addField("content", "?");
        // doc2.addField("category", "Political");
        // doc2.addField("url", "www.google.com");

        SolrInputDocument doc3 = new SolrInputDocument();
        doc3.addField("id",
                "  ?");
        doc3.addField("content",
                "  ?");
        //doc3.addField("category", "Political");
        // doc3.addField("url", "www.google.com");

        Collection<SolrInputDocument> docs = new ArrayList<SolrInputDocument>();
        docs.add(doc1);
        docs.add(doc2);
        docs.add(doc3);

        server.add(docs);
        server.commit();
    }

    public List<Word> query(Set<String> words) throws SolrServerException {
        List<Word> result = new ArrayList<Word>();
        server = new SolrServer().getSolrServer();
        for (String word : words) {
            SolrQuery query = new SolrQuery();
            query.set("q", "content:" + word);
            query.add("fl", "fl:totaltermfreq(content," + word + ")");
            query.set("rows", 1);
            QueryResponse response = server.query(query);
            SolrDocumentList results = response.getResults();
            for (SolrDocument result1 : results) {
                String count = result1.getFieldValue("fl").toString();
                Word w = new Word(word, Integer.parseInt(count));
                result.add(w);
            }

        }

        return result;
    }

    public List<Word> getTermsFrequency(Set<String> words) throws SolrServerException {
        List<Word> result = new ArrayList<Word>();
        server = new SolrServer().getSolrServer();

        for (String word : words) {
            SolrQuery query = new SolrQuery();

            query.setRequestHandler("/terms");
            query.set("terms.fl", "content");
            query.set("terms.regex", word);
            query.set("terms", "true");
            query.set("shards.qt", "/terms");
            query.set("distrib", "true");

            QueryResponse response = server.query(query);
            TermsResponse termsRes = response.getTermsResponse();
            List<TermsResponse.Term> terms = termsRes.getTerms("content");
            TermsResponse.Term term = null;
            Word w = new Word();
            if (terms != null && terms.size() > 0) {
                term = terms.get(0);
                w.setText(term.getTerm());
                w.setCount(term.getFrequency());
                result.add(w);
            }

        }

        return result;
    }
}