Java tutorial
/* * To change this license header, choose License Headers in Project Properties. * To change this template file, choose Tools | Templates * and open the template in the editor. */ package com.bindez.nlp.extract.ngram.term_frequency; import com.bindez.nlp.extract.ngram.NgramGenerator; import com.bindez.nlp.extract.tokenizers.Word; import com.bindez.nlp.extract.tokenizers.utils.WordCountComparator; import com.bindez.nlp.extract.tokenizers.utils.WordEndPositionComparator; import java.io.IOException; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.Iterator; import java.util.List; import java.util.Set; import java.util.TreeSet; import org.apache.solr.client.solrj.SolrQuery; import org.apache.solr.client.solrj.SolrServerException; import org.apache.solr.client.solrj.impl.HttpSolrServer; import org.apache.solr.client.solrj.response.QueryResponse; import org.apache.solr.client.solrj.response.TermsResponse; import org.apache.solr.common.SolrDocument; import org.apache.solr.common.SolrDocumentList; import org.apache.solr.common.SolrInputDocument; /** * * @author ahk , yewint */ public class TermFrequency { /** * @param args the command line arguments */ private HttpSolrServer server; public TermFrequency() { server = new SolrServer().getSolrServer(); } public static void main(String[] args) throws SolrServerException, IOException { // TODO code application logic here List<Word> words = NgramGenerator .getPosition("", 7, 2); Set<String> input = new TreeSet<String>(); Collections.sort(words, new WordEndPositionComparator()); Iterator<Word> itr = words.iterator(); while (itr.hasNext()) { Word wt = itr.next(); input.add(wt.getText()); //System.out.println(wt.getText() + " : " + wt.getStart() +"-"+wt.getEnd()); } new TermFrequency().index(); List<Word> temp = new TermFrequency().query(input); Collections.sort(temp, new WordCountComparator("desc")); for (Word temp1 : temp) System.out.println(temp1.getText() + "\t" + temp1.getCount()); } public void index() throws SolrServerException, IOException { server = new SolrServer().getSolrServer(); SolrInputDocument doc1 = new SolrInputDocument(); doc1.addField("id", ""); doc1.addField("content", ""); // doc1.addField("category", "Political"); // doc1.addField("url", "www.google.com"); SolrInputDocument doc2 = new SolrInputDocument(); doc2.addField("id", "?"); doc2.addField("content", "?"); // doc2.addField("category", "Political"); // doc2.addField("url", "www.google.com"); SolrInputDocument doc3 = new SolrInputDocument(); doc3.addField("id", " ?"); doc3.addField("content", " ?"); //doc3.addField("category", "Political"); // doc3.addField("url", "www.google.com"); Collection<SolrInputDocument> docs = new ArrayList<SolrInputDocument>(); docs.add(doc1); docs.add(doc2); docs.add(doc3); server.add(docs); server.commit(); } public List<Word> query(Set<String> words) throws SolrServerException { List<Word> result = new ArrayList<Word>(); server = new SolrServer().getSolrServer(); for (String word : words) { SolrQuery query = new SolrQuery(); query.set("q", "content:" + word); query.add("fl", "fl:totaltermfreq(content," + word + ")"); query.set("rows", 1); QueryResponse response = server.query(query); SolrDocumentList results = response.getResults(); for (SolrDocument result1 : results) { String count = result1.getFieldValue("fl").toString(); Word w = new Word(word, Integer.parseInt(count)); result.add(w); } } return result; } public List<Word> getTermsFrequency(Set<String> words) throws SolrServerException { List<Word> result = new ArrayList<Word>(); server = new SolrServer().getSolrServer(); for (String word : words) { SolrQuery query = new SolrQuery(); query.setRequestHandler("/terms"); query.set("terms.fl", "content"); query.set("terms.regex", word); query.set("terms", "true"); query.set("shards.qt", "/terms"); query.set("distrib", "true"); QueryResponse response = server.query(query); TermsResponse termsRes = response.getTermsResponse(); List<TermsResponse.Term> terms = termsRes.getTerms("content"); TermsResponse.Term term = null; Word w = new Word(); if (terms != null && terms.size() > 0) { term = terms.get(0); w.setText(term.getTerm()); w.setCount(term.getFrequency()); result.add(w); } } return result; } }