com.github.pffy.chinese.freq.ImmutableChineseFrequency.java Source code

Java tutorial

Introduction

Here is the source code for com.github.pffy.chinese.freq.ImmutableChineseFrequency.java:

Source

/**
 * This is free and unencumbered software released into the public domain.
 * 
 * Anyone is free to copy, modify, publish, use, compile, sell, or distribute
 * this software, either in source code form or as a compiled binary, for any
 * purpose, commercial or non-commercial, and by any means.
 * 
 * In jurisdictions that recognize copyright laws, the author or authors of this
 * software dedicate any and all copyright interest in the software to the
 * public domain. We make this dedication for the benefit of the public at large
 * and to the detriment of our heirs and successors. We intend this dedication
 * to be an overt act of relinquishment in perpetuity of all present and future
 * rights to this software under copyright law.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 * 
 * For more information, please refer to <http://unlicense.org/>
 */

package com.github.pffy.chinese.freq;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Scanner;

import com.google.common.collect.HashMultiset;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Multiset;
import com.google.common.collect.Multisets;

/**
 * ImmutableChineseFrequency.java - Builds Chinese character frequency list.
 *
 * <p>
 * An instance is built from one input string. The constructor strips
 * non-Hanzi characters, counts how often each remaining character occurs,
 * looks up the pinyin of each character in a bundled index file and renders
 * the result as CSV, TSV and padded text. All results are computed once in
 * the constructor and stored in final fields.
 *
 * <p>
 * Characters are counted as Unicode code points, so a Hanzi outside the Basic
 * Multilingual Plane counts as one character rather than two.
 *
 * @author The Pffy Authors
 * @version 0.8.1
 * @since 2014-05-25
 *
 */
public final class ImmutableChineseFrequency {

    // Constants
    private static final String CRLF = System.lineSeparator();

    private static final String MSG_TOTAL_COUNT = "Total Characters";
    private static final String MSG_REMOVED_COUNT = "Characters Removed";
    private static final String MSG_HANZI_COUNT = "Hanzi Characters";
    private static final String MSG_UNIQUE_COUNT = "Unique Hanzi Characters";
    private static final String MSG_PROCESSED_COUNT = "Characters Processed";

    private static final String FILE_NOT_LOADED = "PFFY SAYS: File not loaded.";
    private static final String FILE_IO_PROBLEM = "PFFY SAYS: File I/O problem.";
    private static final String MSG_EMPTYNULL_STRING = "PFFY SAYS: " + "Input string cannot be null or empty.";

    private static final int PADSIZE_SUMMARY = 25;

    private static final String HEADER_ROW_CSV = "hz,py,freq";
    private static final String HEADER_ROW_TSV = "hz\tpy\tfreq";
    private static final String HEADER_ROW_TXT = padSummary("hz [py]", PADSIZE_SUMMARY) + "freq";

    // classpath locations of the index files
    private static final String HANYU_PINYIN_IDX = "txt/IdxHanyuPinyin.txt";
    private static final String EXTRA_PINYIN_IDX = "txt/IdxExtraPinyin.txt";

    // Objects
    // maps a Hanzi to its pinyin
    private final Map<String, String> hpdx;
    // keys are extra strings that get removed from the input before counting
    private final Map<String, String> xpdx;

    // Counts
    private final int inputCount;
    private final int removedCount;
    private final int hanziCount;
    private final int uniqueHanziCount;
    private final int processedCount;

    // outputs
    private final String summary;
    private final String csvOutput;
    private final String tsvOutput;
    private final String txtOutput;
    private final ImmutableList<Multiset.Entry<String>> hanziList;

    private final String input;

    /**
     * Builds this object with an input text. Expecting Chinese characters.
     *
     * <p>
     * If an index file cannot be loaded, a message is printed to
     * {@code System.err} and construction continues with an empty index.
     *
     * @param input Chinese text for frequency analysis
     * @throws NullPointerException if {@code input} is null or empty
     */
    public ImmutableChineseFrequency(String input) {

        // Validate first so that bad input fails before any resource is read.
        // NullPointerException is kept for the empty case as well, because
        // existing callers may already catch that type.
        if (input == null || input.isEmpty()) {
            throw new NullPointerException(MSG_EMPTYNULL_STRING);
        }

        // load idx files
        this.xpdx = loadIdx(EXTRA_PINYIN_IDX);
        this.hpdx = loadIdx(HANYU_PINYIN_IDX);

        final int totalCount = input.codePointCount(0, input.length());

        final String hanziText = retainHanzi(input);
        final int retainedCount = hanziText.codePointCount(0, hanziText.length());
        final int strippedCount = totalCount - retainedCount;

        // accumulate: counts occurrences, one code point at a time so that
        // surrogate pairs are never split into two bogus "characters"
        final Multiset<String> hanziSet = HashMultiset.create();
        for (int i = 0; i < hanziText.length();) {
            final int codePoint = hanziText.codePointAt(i);
            hanziSet.add(new String(Character.toChars(codePoint)));
            i += Character.charCount(codePoint);
        }

        final int uniqueCount = hanziSet.elementSet().size();

        // sorted once; used both for the public list and for the rendering
        final ImmutableList<Multiset.Entry<String>> sortedEntries =
                Multisets.copyHighestCountFirst(hanziSet).entrySet().asList();

        final StringBuilder csv = new StringBuilder(HEADER_ROW_CSV);
        final StringBuilder tsv = new StringBuilder(HEADER_ROW_TSV);
        final StringBuilder txt = new StringBuilder(HEADER_ROW_TXT);

        int processed = 0;

        for (Multiset.Entry<String> entry : sortedEntries) {

            final String hz = entry.getElement();
            final String py = this.hpdx.get(hz);

            if (py == null || py.isEmpty()) {
                // not mapped yet. that is okay move on.
                continue;
            }

            final int freq = entry.getCount();

            csv.append(CRLF).append(hz).append(',').append(py).append(',').append(freq);
            tsv.append(CRLF).append(hz).append('\t').append(py).append('\t').append(freq);
            txt.append(CRLF).append(padSummary(hz + " [" + py + "]", PADSIZE_SUMMARY)).append(freq);

            processed++;
        }

        final StringBuilder summaryText = new StringBuilder();
        summaryText.append(padSummary(MSG_TOTAL_COUNT, PADSIZE_SUMMARY)).append(totalCount);
        summaryText.append(CRLF).append(padSummary(MSG_REMOVED_COUNT, PADSIZE_SUMMARY)).append(strippedCount);
        summaryText.append(CRLF).append(padSummary(MSG_HANZI_COUNT, PADSIZE_SUMMARY)).append(retainedCount);
        summaryText.append(CRLF).append(padSummary(MSG_UNIQUE_COUNT, PADSIZE_SUMMARY)).append(uniqueCount);
        summaryText.append(CRLF).append(padSummary(MSG_PROCESSED_COUNT, PADSIZE_SUMMARY)).append(processed);

        // when nothing was processed the outputs stay empty (no header row)
        final boolean hasRows = processed > 0;

        this.input = hanziText;
        this.inputCount = totalCount;
        this.removedCount = strippedCount;
        this.hanziCount = retainedCount;
        this.uniqueHanziCount = uniqueCount;
        this.processedCount = processed;
        this.summary = summaryText.toString();
        this.hanziList = sortedEntries;

        this.csvOutput = hasRows ? csv.toString() : "";
        this.tsvOutput = hasRows ? tsv.toString() : "";
        this.txtOutput = hasRows ? txt.toString() : "";
    }

    /**
     * Returns the string representation of this object, which is the summary
     * of counts.
     */
    @Override
    public String toString() {
        return getSummary();
    }

    /**
     * Returns the input text as it was counted, that is, after the non-Hanzi
     * characters have been removed.
     *
     * @return the filtered text input
     */
    public String getInput() {
        return this.input;
    }

    /**
     * Returns the summary of counts based on input text.
     *
     * @return a summary of character counts
     */
    public String getSummary() {
        return this.summary;
    }

    /**
     * Returns the CSV-formatted output of the Chinese character frequency counts.
     * First row displays the headers "hz,py,freq", followed by more rows of data.
     * The result is an empty string when no character was processed.
     *
     * @return a list of characters, pinyin and frequency in CSV format
     */
    public String getCsvOutput() {
        return this.csvOutput;
    }

    /**
     * Returns the TSV-formatted output of the Chinese character frequency counts.
     * First row displays the headers "hz\tpy\tfreq", followed by more rows of
     * data. The result is an empty string when no character was processed.
     *
     * @return a list of characters, pinyin and frequency in TSV format
     */
    public String getTsvOutput() {
        return this.tsvOutput;
    }

    /**
     * Returns padded text output of the Chinese character frequency counts. Rows
     * padded to match summary. The result is an empty string when no character
     * was processed.
     *
     * @return a list of characters, pinyin and frequency in padded text format
     */
    public String getTxtOutput() {
        return this.txtOutput;
    }

    /**
     * Returns total number of characters (code points) in input text.
     *
     * @return the number of input characters
     */
    public int getInputCount() {
        return this.inputCount;
    }

    /**
     * Returns total number of characters removed from input text. The characters
     * removed comprise ASCII letters and digits, a fixed set of ASCII
     * punctuation, the entries of the extra index, and whitespace.
     *
     * @return the number of characters removed from input text
     */
    public int getRemovedCount() {
        return this.removedCount;
    }

    /**
     * Returns total number of Hanzi (Chinese characters) ready for counting.
     *
     * @return the number of Chinese characters remaining after pre-processing
     */
    public int getHanziCount() {
        return this.hanziCount;
    }

    /**
     * Returns total number of unique Hanzi that will be counted.
     *
     * @return the number of unique Chinese characters
     */
    public int getUniqueHanziCount() {
        return this.uniqueHanziCount;
    }

    /**
     * Returns total number of actual Hanzi processed. Functions as a checksum.
     * <i>Should match number of unique Hanzi.</i> Otherwise, the character (or
     * other extra data) has not been mapped by the idx.
     *
     * @return the number of Chinese characters processed
     */
    public int getProcessedCount() {
        return this.processedCount;
    }

    /**
     * Returns immutable list of Chinese characters of type
     * Multiset.Entry&lt;String&gt;, ordered by highest count first. The list
     * also contains characters that have no pinyin mapping.
     *
     * @return immutable list of Hanzi.
     */
    public ImmutableList<Multiset.Entry<String>> getHanziList() {
        return hanziList;
    }

    // pads text right to the given width and appends the " : " separator
    private static String padSummary(String str, int size) {
        // NOTE(review): as written, replace(' ', ' ') swaps a space for a
        // space and has no effect; the replacement may originally have been a
        // different space-like character - confirm before removing the call.
        return String.format("%-" + size + "s", str).replace(' ', ' ') + " : ";
    }

    // removes non-Hanzi characters.
    private String retainHanzi(String str) {

        // removes ASCII letters, numbers, and punctuation
        String result = str.replaceAll("[a-zA-Z0-9]|[@?$%\\^:/&!.,;+(){}'<>#=]", "");

        // removes every string listed as a key of the extra index
        for (String extra : this.xpdx.keySet()) {
            result = result.replace(extra, "");
        }

        // remove all spaces
        return result.replaceAll("\\s{1,}", "");
    }

    /**
     * Loads a TXT idx file from the classpath into a Map. Each line is split
     * on ":"; the first part becomes the key and the second part the value.
     * Lines that do not contain both parts are skipped.
     *
     * <p>
     * Loading is best effort: on a missing resource or an I/O error a message
     * is printed to {@code System.err} and whatever was read so far is
     * returned.
     *
     * @param file classpath-relative name of the idx file
     * @return the entries read, in file order; never null
     */
    private static Map<String, String> loadIdx(String file) {

        final Map<String, String> map = new LinkedHashMap<String, String>();

        final InputStream is =
                ImmutableChineseFrequency.class.getClassLoader().getResourceAsStream(file);

        if (is == null) {
            System.err.println(FILE_NOT_LOADED + " " + file);
            return map;
        }

        // Decode explicitly instead of relying on the platform default charset.
        // Assumes the bundled idx files are UTF-8 encoded - TODO confirm.
        try (BufferedReader br = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8))) {

            String line;

            // read idx
            while ((line = br.readLine()) != null) {
                final String[] idx = line.split(":");

                if (idx.length < 2) {
                    // blank or malformed line: skip it and keep reading
                    continue;
                }

                map.put(idx[0], idx[1]);
            }

        } catch (IOException ioex) {
            System.err.println(FILE_IO_PROBLEM + " " + ioex.getMessage());
        }

        return map;
    }
}