Java tutorial
/** * This is free and unencumbered software released into the public domain. * * Anyone is free to copy, modify, publish, use, compile, sell, or distribute * this software, either in source code form or as a compiled binary, for any * purpose, commercial or non-commercial, and by any means. * * In jurisdictions that recognize copyright laws, the author or authors of this * software dedicate any and all copyright interest in the software to the * public domain. We make this dedication for the benefit of the public at large * and to the detriment of our heirs and successors. We intend this dedication * to be an overt act of relinquishment in perpetuity of all present and future * rights to this software under copyright law. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. * * For more information, please refer to <http://unlicense.org/> */ package com.github.pffy.chinese.freq; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.util.HashMap; import java.util.Iterator; import java.util.LinkedHashMap; import java.util.Map; import java.util.Map.Entry; import java.util.Scanner; import com.google.common.collect.HashMultiset; import com.google.common.collect.ImmutableList; import com.google.common.collect.Multiset; import com.google.common.collect.Multisets; /** * ImmutableChineseFrequency.java - Builds Chinese character frequency list. * * @author The Pffy Authors * @version 0.8.1 * @since 2014-05-25 * */ final public class ImmutableChineseFrequency { // Constants private final String CRLF = System.lineSeparator(); private final String MSG_TOTAL_COUNT = "Total Characters"; private final String MSG_REMOVED_COUNT = "Characters Removed"; private final String MSG_HANZI_COUNT = "Hanzi Characters"; private final String MSG_UNIQUE_COUNT = "Unique Hanzi Characters"; private final String MSG_PROCESSED_COUNT = "Characters Processed"; private final String FILE_NOT_LOADED = "PFFY SAYS: File not loaded."; private final String FILE_IO_PROBLEM = "PFFY SAYS: File I/O problem."; private final String MSG_EMPTYNULL_STRING = "PFFY SAYS: " + "Input string cannot be null or empty."; private final int PADSIZE_SUMMARY = 25; private final String HEADER_ROW_CSV = "hz,py,freq"; private final String HEADER_ROW_TSV = "hz\tpy\tfreq"; private final String HEADER_ROW_TXT = padSummary("hz [py]", this.PADSIZE_SUMMARY) + "freq"; private final String hptxt = "txt/IdxHanyuPinyin.txt"; private final String xptxt = "txt/IdxExtraPinyin.txt"; // Objects private Map<String, String> hpdx = new LinkedHashMap<String, String>(); private Map<String, String> xpdx = new HashMap<String, String>(); // Counts private final int inputCount; private final int removedCount; private final int hanziCount; private final int uniqueHanziCount; private final int processedCount; // outputs private final String summary; private final String csvOutput; private final String tsvOutput; private final String txtOutput; private final ImmutableList<Multiset.Entry<String>> hanziList; private final String input; /** * Builds this object with an input text. Expecting Chinese characters. * * @param input Chinese text for frequency analysis */ public ImmutableChineseFrequency(String input) { init(); // Counts int inputCount, removedCount, hanziCount, uniqueHanziCount, processedCount; Scanner sc; // Output data String csvOutput = ""; String tsvOutput = ""; String txtOutput = ""; String csv, tsv, txt; String summaryString = ""; String hz, py; int freq; // Google Guava magic String hanzi; Multiset<String> hanziSet = HashMultiset.create(); Iterable<Multiset.Entry<String>> hanziSortedByCount; Iterator<Multiset.Entry<String>> keys; Multiset.Entry<String> key; ImmutableList<Multiset.Entry<String>> hanziList; if (input == null || input.isEmpty()) { throw new NullPointerException(this.MSG_EMPTYNULL_STRING); } inputCount = input.length(); input = retainHanzi(input); removedCount = inputCount - input.length(); hanziCount = input.length(); sc = new Scanner(input); sc.useDelimiter(""); // accumulate: counts occurrences while (sc.hasNext()) { hanzi = (String) sc.next(); hanziSet.add(hanzi, 1); } sc.close(); uniqueHanziCount = hanziSet.elementSet().size(); processedCount = 0; hanziSortedByCount = Multisets.copyHighestCountFirst(hanziSet).entrySet(); hanziList = Multisets.copyHighestCountFirst(hanziSet).entrySet().asList(); keys = hanziSortedByCount.iterator(); while (keys.hasNext()) { key = (Multiset.Entry<String>) keys.next(); hz = (String) key.getElement().replaceAll("x \\d{1,}", ""); py = (String) this.hpdx.get(hz); freq = (int) key.getCount(); // check null first to avoid NullPointerException. lazy code. if (py == null || py.isEmpty()) { // not mapped yet. that is okay move on. continue; } csv = this.CRLF + hz + "," + py + "," + freq; csvOutput += csv; tsv = this.CRLF + hz + "\t" + py + "\t" + freq; tsvOutput += tsv; txt = this.CRLF + padSummary(hz + " [" + py + "]", this.PADSIZE_SUMMARY) + freq; txtOutput += txt; processedCount++; } summaryString += padSummary(this.MSG_TOTAL_COUNT, this.PADSIZE_SUMMARY) + inputCount; summaryString += this.CRLF + padSummary(this.MSG_REMOVED_COUNT, this.PADSIZE_SUMMARY) + removedCount; summaryString += this.CRLF + padSummary(this.MSG_HANZI_COUNT, this.PADSIZE_SUMMARY) + hanziCount; summaryString += this.CRLF + padSummary(this.MSG_UNIQUE_COUNT, this.PADSIZE_SUMMARY) + uniqueHanziCount; summaryString += this.CRLF + padSummary(this.MSG_PROCESSED_COUNT, this.PADSIZE_SUMMARY) + processedCount; if (processedCount > 0) { csvOutput = this.HEADER_ROW_CSV + csvOutput; tsvOutput = this.HEADER_ROW_TSV + tsvOutput; txtOutput = this.HEADER_ROW_TXT + txtOutput; } this.input = input; this.inputCount = inputCount; this.removedCount = removedCount; this.hanziCount = hanziCount; this.uniqueHanziCount = uniqueHanziCount; this.processedCount = processedCount; this.summary = summaryString; this.hanziList = hanziList; this.csvOutput = csvOutput; this.tsvOutput = tsvOutput; this.txtOutput = txtOutput; } /** * Returns the string representation of this object. */ @Override public String toString() { return getSummary(); } /** * Returns the input text. * * @return the text input */ public String getInput() { return this.input; } /** * Returns the summary of counts based on input text. * * @return a summary of character counts */ public String getSummary() { return this.summary; } /** * Returns the CSV-formatted output of the Chinese character frequency counts. * First row displays the headers "hz,py,freq", followed by more rows of data. * * @return a list of characters, pinyin and frequency in CSV format */ public String getCsvOutput() { return this.csvOutput; } /** * Returns the TSV-formatted output of the Chinese character frequency counts. * First row displays the headers "hz\tpy\tfreq", followed by more rows of * data. * * @return a list of characters, pinyin and frequency in TSV format */ public String getTsvOutput() { return this.tsvOutput; } /** * Returns padded text output of the Chinese character frequency counts. Rows * padded to match summary. * * @return a list of characters, pinyin and frequency in padded text format */ public String getTxtOutput() { return this.txtOutput; } /** * Returns total number of characters in input text. * * @return the number of input characters */ public int getInputCount() { return this.inputCount; } /** * Returns total number of characters removed by input text. The characters * removed comprise alphanumeric characters, punctuation, and other symbols * that should not be counted. * * @return the number of characters removed from input text */ public int getRemovedCount() { return this.removedCount; } /** * Returns total number of Hanzi (Chinese characters) ready for counting. * * @return the number of Chinese characters remaining after pre-processing */ public int getHanziCount() { return this.hanziCount; } /** * Returns total number of unique Hanzi that will be counted. * * @return the number of unique Chinese characters */ public int getUniqueHanziCount() { return this.uniqueHanziCount; } /** * Returns total number of actual Hanzi processed. Functions as a checksum. * <i>Should match number of unique Hanzi.</i> Otherwise, the character (or * other extra data) has not been mapped by the idx. * * @return the number of Chinese characters processed */ public int getProcessedCount() { return this.processedCount; } /** * Returns immutable list of Chinese characters of type * Multiset.Entry<String>. * * @return immutable list of Hanzi. */ public ImmutableList<Multiset.Entry<String>> getHanziList() { return hanziList; } // pads text right private String padSummary(String str, int size) { return String.format("%-" + size + "s", str).replace(' ', ' ') + " : "; } // removals non-Hanzi characters. private String retainHanzi(String str) { String extra; // removes ASCII letters, numbers, and punctuation str = str.replaceAll("[a-zA-Z0-9]|[@?$%\\^:/&!.,;+(){}'<>#=]", ""); // remove all extra character characters? for (Entry<String, String> extras : this.xpdx.entrySet()) { extra = (String) extras.getKey(); str = str.replace(extra, ""); } // remove all spaces return str.replaceAll("\\s{1,}", ""); } // startup method private void init() { try { // load idx files this.xpdx = loadIdx(this.xptxt); this.hpdx = loadIdx(this.hptxt); } catch (Exception ex) { System.err.println(this.FILE_NOT_LOADED + ex.getMessage()); } } // loads TXT idx files into Map private Map<String, String> loadIdx(String file) { Map<String, String> map = new LinkedHashMap<String, String>(); try { InputStream is = this.getClass().getClassLoader().getResourceAsStream(file); BufferedReader br = new BufferedReader(new InputStreamReader(is)); try { String line = null; String[] idx; // read idx while ((line = br.readLine()) != null) { idx = line.split(":"); map.put(idx[0], idx[1]); } } finally { // nuff read. closing. br.close(); } } catch (IOException ioex) { System.err.println(this.FILE_IO_PROBLEM + ioex.getMessage()); } catch (Exception ex) { System.err.println(this.FILE_NOT_LOADED + ex.getMessage()); } return map; } }