Java tutorial: clustering Solr search results with k-means
/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package edu.csupomona.ir.solrapplication;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import net.sf.javaml.clustering.Clusterer;
import net.sf.javaml.clustering.KMeans;
import net.sf.javaml.core.Dataset;
import net.sf.javaml.core.Instance;
import net.sf.javaml.tools.data.FileHandler;

import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.HttpSolrServer;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocumentList;

import edu.csupomona.ml.SimpleKMeans;

/**
 * @author xing
 */
public class Searcher {

    private final Pattern ptnEnglish = Pattern.compile("^[a-zA-Z]{1,20}$");
    private HashMap<String, Integer> dict; // record unigrams and their counts

    public Searcher() {
        dict = new HashMap<String, Integer>();
        Stopword.init();
    }

    /**
     * Update the count for a word in the hashmap.
     * @param map  the word:count mapping HashMap
     * @param word the word whose count is updated
     */
    private void updateHashMap(HashMap<String, Integer> map, String word) {
        if (map.containsKey(word))
            map.put(word, map.get(word) + 1);
        else
            map.put(word, 1);
    }

    /**
     * Trim the hashmap dict by removing low-count words; only the first 40
     * entries of the value-sorted map are kept.
     */
    private void trimDict() {
        // put dict into a treemap and sort it by value
        Map<String, Integer> tmp = new TreeMap<String, Integer>();
        tmp.putAll(dict);
        tmp = MapUtil.sortByValue(tmp);

        // remove everything after the first "num" entries
        Integer num = 40;
        for (String key : tmp.keySet()) {
            System.out.println(key + ":" + tmp.get(key).toString());
            if (num <= 0)
                dict.remove(key);
            num--;
        }
    }

    /**
     * Parse the text by breaking it into words and removing stopwords and
     * non-English words. Returns a word:count mapping as the term-frequency
     * record for the text.
     * @param text the input string text
     * @return HashMap<String, Integer> the word:count mapping
     */
    public HashMap<String, Integer> parseText(String text) {
        HashMap<String, Integer> term_frequency = new HashMap<String, Integer>();

        // update the hashmaps for each word
        String[] words = text.split(" ");
        for (String word : words) {
            word = word.toLowerCase(); // unify to lowercase

            // skip obvious non-English words and stopwords
            Matcher mthEnglish = ptnEnglish.matcher(word);
            if (!mthEnglish.find() || Stopword.isStopword(word))
                continue;

            updateHashMap(dict, word);
            updateHashMap(term_frequency, word);
        }
        return term_frequency;
    }

    public void write(List<HashMap<String, Integer>> tf_list, String filename) throws IOException {
        // trim the dict to remove words with too few counts
        trimDict();

        // generate sparse term-frequency vectors for each document
        FileWriter fw = new FileWriter(filename, false);
        BufferedWriter bw = new BufferedWriter(fw);

        Iterator<HashMap<String, Integer>> it_tf = tf_list.iterator();
        Integer index = 0;
        while (it_tf.hasNext()) {
            // for each document
            HashMap<String, Integer> tf = it_tf.next();
            for (String key : dict.keySet()) {
                // get the value for the key
                String value = "0";
                if (tf.containsKey(key))
                    value = tf.get(key).toString();

                // write the value and separate with a comma
                bw.write(value + ",");
            }
            bw.write(index.toString() + "\n");
            index++;
        }
        bw.close();
    }

    public void TestKMeansLib(SolrDocumentList results) throws IOException {
        // load the file into a Dataset
        Dataset data = FileHandler.loadDataset(new File("data.txt"), 40, ",");

        // create an instance of the k-means algorithm:
        // 5 clusters, 100 iterations
        Clusterer km = new KMeans(5, 100);

        // cluster the data; the result is an array of data sets,
        // with each dataset representing a cluster
        Dataset[] clusters = km.cluster(data);
        System.out.println("Cluster count: " + clusters.length);
        for (Dataset cluster : clusters) {
            System.out.println("cluster size: " + cluster.size());
            for (Instance item : cluster) {
                // String idx = (String) item.classValue();
                System.out.println(results.get(Integer.parseInt((String) item.classValue())).get("title"));
            }
        }
        System.out.println("total number of results: " + results.size());
    }

    public void TestKMeans(Integer size) {
        SimpleKMeans kmeans = new SimpleKMeans(4, 40, size);
        kmeans.readDataSet("/data/data.txt");
        kmeans.run();
    }

    public static void main(String[] args) throws IOException, MalformedURLException, SolrServerException {
        HttpSolrServer solr = new HttpSolrServer("http://localhost:8983/solr");

        SolrQuery param = new SolrQuery();
        param.set("q", "news");
        param.set("rows", "1000");
        // param.addFilterQuery("cat:electronics", "store:amazon.com");
        // param.setFields("id", "price", "merchant", "cat", "store");
        // param.setStart(0);
        // param.set("defType", "edismax");

        Searcher sch = new Searcher();
        QueryResponse response = solr.query(param);
        SolrDocumentList results = response.getResults();

        // parse the result text and obtain a term-frequency mapping per document
        List<HashMap<String, Integer>> tf_list = new ArrayList<HashMap<String, Integer>>();
        for (int i = 0; i < results.size(); ++i) {
            tf_list.add(sch.parseText(results.get(i).get("content").toString()));
        }

        // write the most significant term frequencies into a file
        sch.write(tf_list, "data.txt");

        // test KMeans from the Java-ML library
        // sch.TestKMeansLib(results);

        // test a simple implementation of KMeans
        sch.TestKMeans(results.size());
    }
}
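A note on the data flow in the listing: write() emits one line per Solr document, consisting of 40 comma-separated term counts (one per word kept in dict by trimDict()) followed by the document's position in the result list. That trailing column is why TestKMeansLib() passes 40 as the class-label column to FileHandler.loadDataset(), and why item.classValue() can be parsed back into an index into results. Also note that TestKMeans() reads "/data/data.txt" while main() writes to "data.txt", so the two paths have to point at the same file when running that test.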
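The listing relies on a few helper classes that are not shown. The first is Stopword, called from the constructor and from parseText(). The class below is only a hypothetical stand-in, inferred from the calls Stopword.init() and Stopword.isStopword(word); the word list is a placeholder, and the original presumably loads a much longer stopword list.

package edu.csupomona.ir.solrapplication;

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

public class Stopword {

    private static Set<String> stopwords;

    // Load the stopword list once before parsing starts.
    // Placeholder words only; swap in a full stopword list for real use.
    public static void init() {
        stopwords = new HashSet<String>(Arrays.asList(
                "a", "an", "and", "are", "as", "at", "be", "by", "for", "from",
                "in", "is", "it", "of", "on", "or", "that", "the", "to", "with"));
    }

    // True if the (already lowercased) word should be skipped during parsing.
    public static boolean isStopword(String word) {
        return stopwords != null && stopwords.contains(word);
    }
}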
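The second helper is MapUtil.sortByValue(), used by trimDict(). The original is not included here; the sketch below is one common way to write it. Because trimDict() keeps only the first 40 entries of the returned map, this version is assumed to sort by count in descending order and to return a LinkedHashMap that preserves that order.

package edu.csupomona.ir.solrapplication;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

public class MapUtil {

    // Return a copy of the map whose iteration order goes from highest to lowest
    // value, so the most frequent words come before the cutoff in trimDict().
    public static <K, V extends Comparable<? super V>> Map<K, V> sortByValue(Map<K, V> map) {
        List<Map.Entry<K, V>> entries = new ArrayList<Map.Entry<K, V>>(map.entrySet());
        Collections.sort(entries, new Comparator<Map.Entry<K, V>>() {
            public int compare(Map.Entry<K, V> a, Map.Entry<K, V> b) {
                return b.getValue().compareTo(a.getValue());
            }
        });
        Map<K, V> sorted = new LinkedHashMap<K, V>();
        for (Map.Entry<K, V> e : entries) {
            sorted.put(e.getKey(), e.getValue());
        }
        return sorted;
    }
}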
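Finally, SimpleKMeans from edu.csupomona.ml is a custom class that is also not part of the listing. The sketch below is a hypothetical stand-in, not the original: it runs plain Lloyd-style k-means over the file produced by write(), and it assumes the constructor arguments mean (number of clusters, features per document, number of documents), which matches the call new SimpleKMeans(4, 40, size) in main().

package edu.csupomona.ml;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.Random;

public class SimpleKMeans {

    private final int k;      // number of clusters
    private final int dim;    // number of features per document
    private final int rows;   // number of documents
    private double[][] data;  // rows x dim term-frequency matrix
    private int[] labels;     // trailing index column from data.txt

    public SimpleKMeans(int k, int dim, int rows) {
        this.k = k;
        this.dim = dim;
        this.rows = rows;
        this.data = new double[rows][dim];
        this.labels = new int[rows];
    }

    // Read the comma-separated file produced by Searcher.write():
    // dim counts per line followed by the document index.
    public void readDataSet(String path) {
        try (BufferedReader br = new BufferedReader(new FileReader(path))) {
            String line;
            int r = 0;
            while ((line = br.readLine()) != null && r < rows) {
                String[] parts = line.split(",");
                for (int c = 0; c < dim; c++)
                    data[r][c] = Double.parseDouble(parts[c]);
                labels[r] = Integer.parseInt(parts[dim]);
                r++;
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    // Standard Lloyd iterations: assign each document to the nearest centroid,
    // then recompute centroids, for a fixed number of passes.
    public void run() {
        Random rnd = new Random(42);
        double[][] centroids = new double[k][dim];
        for (int i = 0; i < k; i++)
            centroids[i] = data[rnd.nextInt(rows)].clone();

        int[] assign = new int[rows];
        for (int iter = 0; iter < 100; iter++) {
            // assignment step: nearest centroid by squared Euclidean distance
            for (int r = 0; r < rows; r++) {
                double best = Double.MAX_VALUE;
                for (int c = 0; c < k; c++) {
                    double d = 0;
                    for (int j = 0; j < dim; j++) {
                        double diff = data[r][j] - centroids[c][j];
                        d += diff * diff;
                    }
                    if (d < best) { best = d; assign[r] = c; }
                }
            }
            // update step: move each centroid to the mean of its members
            double[][] sums = new double[k][dim];
            int[] counts = new int[k];
            for (int r = 0; r < rows; r++) {
                counts[assign[r]]++;
                for (int j = 0; j < dim; j++)
                    sums[assign[r]][j] += data[r][j];
            }
            for (int c = 0; c < k; c++)
                if (counts[c] > 0)
                    for (int j = 0; j < dim; j++)
                        centroids[c][j] = sums[c][j] / counts[c];
        }
        for (int r = 0; r < rows; r++)
            System.out.println("doc " + labels[r] + " -> cluster " + assign[r]);
    }
}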