Java tutorial: computing TF-IDF with the IK Analyzer. The example below segments each line of a text file into words (treating each line as one document), then computes normalized term frequency, inverse document frequency, and the combined TF-IDF weight for every word.
package tfidf;

/*
 * Copyright 2015 Future TV, Inc.
 *
 * The contents of this file are subject to the terms
 * of the Common Development and Distribution License
 * (the License). You may not use this file except in
 * compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.icntv.tv/licenses/LICENSE-1.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

import com.google.common.base.Charsets;
import com.google.common.collect.Maps;
import com.google.common.io.Files;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.wltea.analyzer.lucene.IKAnalyzer;

import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Created by leixw
 *
 * Author: leixw
 * Date: 2015/04/21
 * Time: 09:22
 */
public class TestTfIDF {

    /** Word segmentation: split a line of text into terms with the IK Analyzer. */
    public static ArrayList<String> cutWords(String line) throws IOException {
        ArrayList<String> words = new ArrayList<String>();
        IKAnalyzer analyzer = new IKAnalyzer();
        TokenStream tokenStream = analyzer.tokenStream("", new StringReader(line));
        CharTermAttribute termAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            words.add(termAttribute.toString());
        }
        tokenStream.end();
        tokenStream.close();
        analyzer.close();
        return words;
    }

    /** Raw term frequency: how many times each word occurs in one document. */
    public static HashMap<String, Integer> normalTF(List<String> cutwords) {
        HashMap<String, Integer> resTF = new HashMap<String, Integer>();
        for (String word : cutwords) {
            if (resTF.get(word) == null) {
                resTF.put(word, 1);
            } else {
                resTF.put(word, resTF.get(word) + 1);
            }
        }
        return resTF;
    }

    /** Normalized term frequency: raw count divided by the document length. */
    public static HashMap<String, Float> tf(List<String> cutwords) {
        HashMap<String, Float> resTF = new HashMap<String, Float>();
        int wordLen = cutwords.size();
        HashMap<String, Integer> intTF = normalTF(cutwords);
        for (Map.Entry<String, Integer> entry : intTF.entrySet()) {
            resTF.put(entry.getKey(), entry.getValue() / (float) wordLen);
        }
        return resTF;
    }

    /** Inverse document frequency: idf(w) = log(docNum / df(w) + 0.01). */
    public static Map<String, Float> idf(Map<Integer, Map<String, Float>> all_tf) {
        HashMap<String, Float> resIdf = new HashMap<String, Float>();
        HashMap<String, Integer> dict = new HashMap<String, Integer>(); // document frequency of each word
        int docNum = all_tf.size();
        for (int i = 0; i < docNum; i++) {
            Map<String, Float> temp = all_tf.get(i);
            for (String word : temp.keySet()) {
                if (dict.get(word) == null) {
                    dict.put(word, 1);
                } else {
                    dict.put(word, dict.get(word) + 1);
                }
            }
        }
        for (Map.Entry<String, Integer> entry : dict.entrySet()) {
            float value = (float) Math.log(docNum / (float) entry.getValue() + 0.01);
            resIdf.put(entry.getKey(), value);
        }
        return resIdf;
    }

    /** TF-IDF: multiply each document's tf values by the corpus-wide idf values. */
    public static Map<Integer, Map<String, Double>> tf_idf(Map<Integer, Map<String, Float>> all_tf,
                                                           Map<String, Float> idfs) {
        Map<Integer, Map<String, Double>> resTfIdf = Maps.newConcurrentMap();
        int docNum = all_tf.size();
        for (int i = 0; i < docNum; i++) {
            Map<String, Double> tfidf = new HashMap<String, Double>();
            for (Map.Entry<String, Float> entry : all_tf.get(i).entrySet()) {
                String word = entry.getKey();
                double value = entry.getValue() * idfs.get(word);
                tfidf.put(word, value);
            }
            resTfIdf.put(i, tfidf);
        }
        System.out.println("TF-IDF for every document is:");
        DisTfIdf(resTfIdf);
        return resTfIdf;
    }

    /** Print the TF-IDF map of every document. */
    public static void DisTfIdf(Map<Integer, Map<String, Double>> tfidf) {
        for (Map.Entry<Integer, Map<String, Double>> docEntry : tfidf.entrySet()) {
            System.out.println("Document: " + docEntry.getKey());
            System.out.print("{");
            for (Map.Entry<String, Double> entry : docEntry.getValue().entrySet()) {
                System.out.print(entry.getKey() + " = " + entry.getValue() + ", ");
            }
            System.out.println("}");
        }
    }

    public static void main(String[] args) throws IOException {
        // Each line of the input file is treated as one document.
        List<String> lines = Files.readLines(new File("d:\\video-1.txt"), Charsets.UTF_8);
        // tf of every document, keyed by line index
        Map<Integer, Map<String, Float>> all_tf = Maps.newConcurrentMap();
        for (int i = 0; i < lines.size(); i++) {
            List<String> words = cutWords(lines.get(i));
            all_tf.put(i, tf(words));
        }
        Map<String, Float> idfs = idf(all_tf);
        tf_idf(all_tf, idfs);
    }
}
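To sanity-check the arithmetic without IKAnalyzer or an input file, here is a minimal, hypothetical driver (the class name TfIdfDemo and its toy words are made up for illustration; it assumes TestTfIDF above is compiled in the same tfidf package and that Guava is on the classpath). It pushes two tiny pre-segmented "documents" through tf(), idf() and tf_idf() so you can verify the numbers by hand.

package tfidf;

import com.google.common.collect.Maps;

import java.util.Arrays;
import java.util.Map;

public class TfIdfDemo {
    public static void main(String[] args) {
        // Two toy documents, already segmented into words.
        Map<Integer, Map<String, Float>> allTf = Maps.newConcurrentMap();
        allTf.put(0, TestTfIDF.tf(Arrays.asList("cat", "sat", "cat")));
        allTf.put(1, TestTfIDF.tf(Arrays.asList("dog", "sat")));

        Map<String, Float> idfs = TestTfIDF.idf(allTf);
        // "cat" appears in 1 of 2 documents: idf = log(2/1 + 0.01) ≈ 0.698
        // "sat" appears in both documents:   idf = log(2/2 + 0.01) ≈ 0.010
        Map<Integer, Map<String, Double>> tfidf = TestTfIDF.tf_idf(allTf, idfs);

        // tf("cat", doc 0) = 2/3 ≈ 0.667, so tf-idf ≈ 0.667 * 0.698 ≈ 0.465
        System.out.println(tfidf.get(0).get("cat"));
    }
}

With the formula used by idf(), a word that occurs in every document gets an idf close to log(1.01) ≈ 0.01 rather than exactly zero, so common words are down-weighted but never erased.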