Java tutorial: computing TF-IDF with the IK Analyzer. The example below segments each line of a text file into words (treating each line as one document), then computes normalized term frequency, inverse document frequency, and the combined TF-IDF weight for every word.
package tfidf;

/*
 * Copyright 2015 Future TV, Inc.
 *
 * The contents of this file are subject to the terms
 * of the Common Development and Distribution License
 * (the License). You may not use this file except in
 * compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.icntv.tv/licenses/LICENSE-1.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

import com.google.common.base.Charsets;
import com.google.common.collect.Maps;
import com.google.common.io.Files;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.wltea.analyzer.lucene.IKAnalyzer;

import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Created by leixw
 *
 * Author: leixw
 * Date: 2015/04/21
 * Time: 09:22
 */
public class TestTfIDF {

    /** Word segmentation: split a line of text into terms with the IK Analyzer. */
    public static ArrayList<String> cutWords(String line) throws IOException {
        ArrayList<String> words = new ArrayList<String>();
        IKAnalyzer analyzer = new IKAnalyzer();
        TokenStream tokenStream = analyzer.tokenStream("", new StringReader(line));
        CharTermAttribute termAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            words.add(termAttribute.toString());
        }
        tokenStream.end();
        tokenStream.close();
        analyzer.close();
        return words;
    }

    /** Raw term frequency: how many times each word occurs in one document. */
    public static HashMap<String, Integer> normalTF(List<String> cutwords) {
        HashMap<String, Integer> resTF = new HashMap<String, Integer>();
        for (String word : cutwords) {
            if (resTF.get(word) == null) {
                resTF.put(word, 1);
            } else {
                resTF.put(word, resTF.get(word) + 1);
            }
        }
        return resTF;
    }

    /** Normalized term frequency: raw count divided by the document length. */
    public static HashMap<String, Float> tf(List<String> cutwords) {
        HashMap<String, Float> resTF = new HashMap<String, Float>();
        int wordLen = cutwords.size();
        HashMap<String, Integer> intTF = normalTF(cutwords);
        for (Map.Entry<String, Integer> entry : intTF.entrySet()) {
            resTF.put(entry.getKey(), entry.getValue() / (float) wordLen);
        }
        return resTF;
    }

    /** Inverse document frequency: idf(w) = log(docNum / df(w) + 0.01). */
    public static Map<String, Float> idf(Map<Integer, Map<String, Float>> all_tf) {
        HashMap<String, Float> resIdf = new HashMap<String, Float>();
        HashMap<String, Integer> dict = new HashMap<String, Integer>(); // document frequency of each word
        int docNum = all_tf.size();
        for (int i = 0; i < docNum; i++) {
            Map<String, Float> temp = all_tf.get(i);
            for (String word : temp.keySet()) {
                if (dict.get(word) == null) {
                    dict.put(word, 1);
                } else {
                    dict.put(word, dict.get(word) + 1);
                }
            }
        }
        for (Map.Entry<String, Integer> entry : dict.entrySet()) {
            float value = (float) Math.log(docNum / (float) entry.getValue() + 0.01);
            resIdf.put(entry.getKey(), value);
        }
        return resIdf;
    }

    /** TF-IDF: multiply each document's tf values by the corpus-wide idf values. */
    public static Map<Integer, Map<String, Double>> tf_idf(Map<Integer, Map<String, Float>> all_tf,
                                                           Map<String, Float> idfs) {
        Map<Integer, Map<String, Double>> resTfIdf = Maps.newConcurrentMap();
        int docNum = all_tf.size();
        for (int i = 0; i < docNum; i++) {
            Map<String, Double> tfidf = new HashMap<String, Double>();
            for (Map.Entry<String, Float> entry : all_tf.get(i).entrySet()) {
                String word = entry.getKey();
                double value = entry.getValue() * idfs.get(word);
                tfidf.put(word, value);
            }
            resTfIdf.put(i, tfidf);
        }
        System.out.println("TF-IDF for every document is:");
        DisTfIdf(resTfIdf);
        return resTfIdf;
    }

    /** Print the TF-IDF map of every document. */
    public static void DisTfIdf(Map<Integer, Map<String, Double>> tfidf) {
        for (Map.Entry<Integer, Map<String, Double>> docEntry : tfidf.entrySet()) {
            System.out.println("Document: " + docEntry.getKey());
            System.out.print("{");
            for (Map.Entry<String, Double> entry : docEntry.getValue().entrySet()) {
                System.out.print(entry.getKey() + " = " + entry.getValue() + ", ");
            }
            System.out.println("}");
        }
    }

    public static void main(String[] args) throws IOException {
        // Each line of the input file is treated as one document.
        List<String> lines = Files.readLines(new File("d:\\video-1.txt"), Charsets.UTF_8);
        // tf of every document, keyed by line index
        Map<Integer, Map<String, Float>> all_tf = Maps.newConcurrentMap();
        for (int i = 0; i < lines.size(); i++) {
            List<String> words = cutWords(lines.get(i));
            all_tf.put(i, tf(words));
        }
        Map<String, Float> idfs = idf(all_tf);
        tf_idf(all_tf, idfs);
    }
}
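To sanity-check the arithmetic without IKAnalyzer or an input file, here is a minimal, hypothetical driver (the class name TfIdfDemo and its toy words are made up for illustration; it assumes TestTfIDF above is compiled in the same tfidf package and that Guava is on the classpath). It pushes two tiny pre-segmented "documents" through tf(), idf() and tf_idf() so you can verify the numbers by hand.

package tfidf;

import com.google.common.collect.Maps;

import java.util.Arrays;
import java.util.Map;

public class TfIdfDemo {
    public static void main(String[] args) {
        // Two toy documents, already segmented into words.
        Map<Integer, Map<String, Float>> allTf = Maps.newConcurrentMap();
        allTf.put(0, TestTfIDF.tf(Arrays.asList("cat", "sat", "cat")));
        allTf.put(1, TestTfIDF.tf(Arrays.asList("dog", "sat")));

        Map<String, Float> idfs = TestTfIDF.idf(allTf);
        // "cat" appears in 1 of 2 documents: idf = log(2/1 + 0.01) ≈ 0.698
        // "sat" appears in both documents:   idf = log(2/2 + 0.01) ≈ 0.010
        Map<Integer, Map<String, Double>> tfidf = TestTfIDF.tf_idf(allTf, idfs);

        // tf("cat", doc 0) = 2/3 ≈ 0.667, so tf-idf ≈ 0.667 * 0.698 ≈ 0.465
        System.out.println(tfidf.get(0).get("cat"));
    }
}

With the formula used by idf(), a word that occurs in every document gets an idf close to log(1.01) ≈ 0.01 rather than exactly zero, so common words are down-weighted but never erased.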