org.apdplat.superword.tools.PrefixSuffixOptimizer.java Source code

Introduction

Here is the source code for org.apdplat.superword.tools.PrefixSuffixOptimizer.java
Source

/**
 *
 * APDPlat - Application Product Development Platform Copyright (c) 2013, ??,
 * yang-shangchuan@qq.com
 *
 * This program is free software: you can redistribute it and/or modify it under
 * the terms of the GNU General Public License as published by the Free Software
 * Foundation, either version 3 of the License, or (at your option) any later
 * version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
 * details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program. If not, see <http://www.gnu.org/licenses/>.
 *
 */
package org.apdplat.superword.tools;

import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.commons.lang.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

/**
 *  http://www.palosverdes.com/jesse/pvphs/www-freecollege-com-vocab.htm 
 * ??????????
 * @author ??
 */
public class PrefixSuffixOptimizer {

    private static final String SRC_HTML = "/tools/prefix_suffix.txt";
    private static final String DST_HTML = "target/prefix_suffix_replaced.txt";
    private static final String DST_WORD = "target/prefix_suffix_words.txt";
    private static final Set<String> WORDS = new HashSet<>();

    public static void main(String[] args) throws Exception {
        InputStream in = PrefixSuffixOptimizer.class.getResourceAsStream(SRC_HTML);
        Document document = Jsoup.parse(in, "utf-8", "");
        document.select("table tbody tr td p").stream().forEach(PrefixSuffixOptimizer::replace);
        Files.write(Paths.get(DST_HTML), document.body().html().getBytes("utf-8"));

        AtomicInteger i = new AtomicInteger();
        StringBuilder text = new StringBuilder();
        WORDS.stream().sorted()
                .forEach(word -> text.append(i.incrementAndGet()).append("\t").append(word).append("\n"));
        Files.write(Paths.get(DST_WORD), text.toString().getBytes("utf-8"));
    }

    /**
     * ?????
     *
     * @param element
     */
    public static void replace(Element element) {
        String oldText = element.text();
        StringBuilder newText = new StringBuilder();
        System.out.println("oldText: " + oldText);
        String[] items = oldText.trim().replace(".", ",").split(",");
        for (String item : items) {
            item = item.trim();
            if (!StringUtils.isAlpha(item)) {
                newText.append(item).append(", ");
                continue;
            }
            if (StringUtils.isAllUpperCase(item)) {
                newText.append("<strong><a target=\"_blank\" href=\"http://www.iciba.com/").append(item)
                        .append("\">").append(item).append("</a></strong>").append(", ");
            } else {
                newText.append("<a target=\"_blank\" href=\"http://www.iciba.com/").append(item).append("\">")
                        .append(item).append("</a>").append(", ");
            }
            WORDS.add(item.toLowerCase());
        }
        if (newText.length() > 2) {
            String text = newText.substring(0, newText.length() - 2);
            System.out.println("newText: " + text);
            element.html(text);
        }
    }
}