opennlp.tools.similarity.apps.utils.Utils.java Source code

Java tutorial

Introduction

Here is the source code for opennlp.tools.similarity.apps.utils.Utils.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package opennlp.tools.similarity.apps.utils;

import java.awt.Graphics2D;
import java.awt.geom.AffineTransform;
import java.awt.image.BufferedImage;
import java.io.File;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.List;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.imageio.ImageIO;

import org.apache.commons.lang.StringUtils;

public class Utils {

    private static final Logger LOG = Logger.getLogger("opennlp.tools.similarity.apps.utils.Utils");

    protected static final ArrayList<String[]> characterMappings = new ArrayList<String[]>();

    static {
        characterMappings.add(new String[] {
                "[?]",
                " " }); // was a
        characterMappings.add(new String[] {
                "[??]",
                "A" });
        characterMappings.add(new String[] {
                "[?]",
                "c" });
        characterMappings.add(new String[] {
                "[]",
                "C" });
        characterMappings.add(new String[] { "[?]", "d" });
        characterMappings.add(new String[] {
                "[??]", "D" });
        characterMappings.add(new String[] {
                "[]",
                " " }); // was e
        characterMappings.add(new String[] {
                "[]",
                "'" }); // was E
        characterMappings.add(new String[] {
                "[?]", "g" });
        characterMappings.add(new String[] {
                "[]",
                "G" });
        characterMappings.add(new String[] { "[]", "h" });
        characterMappings.add(new String[] { "[]", "H" });
        characterMappings.add(new String[] {
                "[]",
                "i" });
        characterMappings.add(new String[] {
                "[??]",
                "I" });
        characterMappings.add(new String[] { "[]", "k" });
        characterMappings.add(new String[] { "[]", "K" });
        characterMappings.add(new String[] {
                "[??]",
                "o" });
        characterMappings.add(new String[] {
                "[?]",
                "O" });
        characterMappings.add(new String[] {
                "[]",
                "n" });
        characterMappings.add(new String[] {
                "[]",
                "N" });
        characterMappings.add(new String[] {
                "[]",
                "l" });
        characterMappings.add(new String[] {
                "[?]",
                "L" });
        characterMappings.add(new String[] {
                "[]",
                "u" });
        characterMappings.add(new String[] {
                "[]",
                "U" });
        characterMappings.add(new String[] { "[]", "y" });
        characterMappings
                .add(new String[] { "[?]", "Y" });
        characterMappings.add(new String[] {
                "[]", "r" });
        characterMappings.add(new String[] {
                "[]", "R" });
        characterMappings.add(new String[] {
                "[?]",
                "s" });
        characterMappings.add(new String[] {
                "[]",
                "S" });
        characterMappings.add(new String[] { "", "ss" });
        characterMappings.add(new String[] { "", "th" });
        characterMappings.add(new String[] { "", "Th" });
        characterMappings
                .add(new String[] { "[]", "t" });
        characterMappings
                .add(new String[] { "[]", "T" });
        characterMappings.add(new String[] { "[]", "w" });
        characterMappings.add(new String[] { "[]", "W" });
        characterMappings.add(new String[] {
                "[]",
                "z" });
        characterMappings.add(new String[] {
                "[]",
                "Z" });
        characterMappings.add(new String[] { "[]", "'" });
        characterMappings.add(new String[] { "[]", "'" });
        characterMappings.add(new String[] { "&#39;", "'" });
        characterMappings.add(new String[] { "e", "" });
        characterMappings.add(new String[] { "'AG", "" });
        characterMappings.add(new String[] { "A", " " });
        characterMappings.add(new String[] { "&quot;", "\"" });
        characterMappings.add(new String[] { "&amp;", "&" });
        characterMappings.add(new String[] { "&nbsp;", " " });
        characterMappings.add(new String[] { "", " " });
        characterMappings.add(new String[] { "", " " });
        characterMappings.add(new String[] { "", "" });
        characterMappings.add(new String[] { "", "'" });
    }

    public static String stripNonAsciiChars(String s) {
        StringBuffer b = new StringBuffer();
        if (s != null) {
            for (int i = 0; i < s.length(); i++) {
                if (((int) s.charAt(i)) <= 256) {
                    b.append(s.charAt(i));
                }
            }
        }

        return b.toString().trim().replaceAll("\\s+", " "); // replace any multiple
        // spaces with a single
        // space
    }

    public static String convertToASCII(String s) {
        s = s.replace("&amp", "");
        s = s.replaceAll("", "__apostrophe__");
        String tmp = s;
        if (tmp != null) {
            for (String[] mapping : characterMappings) {
                tmp = tmp.replaceAll(mapping[0], mapping[1]);
            }
        }
        return stripNonAsciiChars(tmp.replaceAll("__apostrophe__", "'"));
    }

    public static class KeyValue {
        public Object key = null;

        public float value = 0;

        public KeyValue(Object o, Float i) {
            this.key = o;
            this.value = i;
        }

        public static class SortByValue implements Comparator {
            public int compare(Object obj1, Object obj2) {
                float i1 = ((KeyValue) obj1).value;
                float i2 = ((KeyValue) obj2).value;

                if (i1 < i2)
                    return 1;
                return -1;
            }
        }
    }

    public static boolean createResizedCopy(String originalImage, String newImage, int scaledWidth,
            int scaledHeight) {
        boolean retVal = true;
        try {
            File o = new File(originalImage);
            BufferedImage bsrc = ImageIO.read(o);
            BufferedImage bdest = new BufferedImage(scaledWidth, scaledHeight, BufferedImage.TYPE_INT_RGB);

            Graphics2D g = bdest.createGraphics();
            AffineTransform at = AffineTransform.getScaleInstance((double) scaledWidth / bsrc.getWidth(),
                    (double) scaledHeight / bsrc.getHeight());
            g.drawRenderedImage(bsrc, at);
            ImageIO.write(bdest, "jpeg", new File(newImage));

        } catch (Exception e) {
            retVal = false;
            LOG.severe("Failed creating thumbnail for image: " + originalImage + e);
        }

        return retVal;
    }

    private static int minimum(int a, int b, int c) {
        int mi;

        mi = a;
        if (b < mi) {
            mi = b;
        }
        if (c < mi) {
            mi = c;
        }
        return mi;

    }

    public static int computeEditDistance(String s, String t) {
        int d[][]; // matrix
        int n; // length of s
        int m; // length of t
        int i; // iterates through s
        int j; // iterates through t
        char s_i; // ith character of s
        char t_j; // jth character of t
        int cost; // cost

        // Step 1
        n = s.length();
        m = t.length();
        if (n == 0) {
            return m;
        }
        if (m == 0) {
            return n;
        }
        d = new int[n + 1][m + 1];
        // Step 2
        for (i = 0; i <= n; i++) {
            d[i][0] = i;
        }
        for (j = 0; j <= m; j++) {
            d[0][j] = j;
        }
        // Step 3
        for (i = 1; i <= n; i++) {
            s_i = s.charAt(i - 1);
            // Step 4
            for (j = 1; j <= m; j++) {
                t_j = t.charAt(j - 1);
                // Step 5
                if (s_i == t_j) {
                    cost = 0;
                } else {
                    cost = 1;
                }
                // Step 6
                d[i][j] = minimum(d[i - 1][j] + 1, d[i][j - 1] + 1, d[i - 1][j - 1] + cost);
            }
        }
        // Step 7
        return d[n][m];
    }

    public static ArrayList<KeyValue> sortByValue(HashMap<Object, Float> h) {
        ArrayList<KeyValue> res = new ArrayList<KeyValue>();
        for (Object o : h.keySet()) {
            // form a pair
            res.add(new KeyValue(o, h.get(o)));
        }

        Collections.sort(res, new KeyValue.SortByValue());

        return res;
    }

    public static String convertKeyValueToString(ArrayList<KeyValue> l) {
        StringBuffer retVal = new StringBuffer();
        for (KeyValue kv : l) {
            retVal.append(kv.key);
            retVal.append("-");
            retVal.append(kv.value);
            retVal.append(",");
        }

        return retVal.toString();
    }

    public static String convertStringArrayToString(ArrayList<String> l) {
        StringBuffer b = new StringBuffer();
        for (String s : l) {
            b.append(s);
            b.append(", ");
        }

        return b.toString();
    }

    public static String convertStringArrayToPlainString(ArrayList<String> l) {
        StringBuffer b = new StringBuffer();
        for (String s : l) {
            b.append(s);
            b.append(" ");
        }

        return b.toString();
    }

    public static boolean noDomainInUrl(String siteUrl, String url) {
        if (StringUtils.isEmpty(url)) {
            return true;
        }
        if (!url.startsWith("http://")) {
            return true;
        }
        return false;
    }

    public static String addDomainToUrl(String siteUrl, String url) {
        if (StringUtils.isEmpty(url)) {
            return null; // should we return siteUrl here ??
        }
        if (!url.startsWith("http://")) {
            String domain = StringUtils.substringBetween(siteUrl, "http://", "/");
            if (domain == null) {
                url = siteUrl + (url.startsWith("/") ? "" : "/") + url;
            } else {
                if (!url.startsWith("/")) {
                    int lastIndex = StringUtils.lastIndexOf(siteUrl, "/");
                    url = siteUrl.substring(0, lastIndex) + "/" + url;
                } else {
                    url = "http://" + domain + url;
                }
            }
        }
        return url;
    }

    public static int countValues(Hashtable<String, Float> b1) {
        int retVal = 0;
        for (String s : b1.keySet()) {
            retVal += b1.get(s);
        }

        return retVal;
    }

    public static int countValues(HashMap<String, Integer> b1) {
        int retVal = 0;
        for (String s : b1.keySet()) {
            retVal += b1.get(s);
        }

        return retVal;
    }

    public static String convertHashMapToString(HashMap<String, Integer> m) {
        StringBuffer s = new StringBuffer();
        for (String x : m.keySet()) {
            s.append(x);
            s.append("-");
            s.append(m.get(x));
            s.append(",");
        }

        return s.toString();
    }

    public static boolean isTokenAllDigitOrPunc(String token) {
        for (int i = 0; i < token.length(); i++) {
            if (java.lang.Character.isLetter(token.charAt(i))) {
                return false;
            }
        }
        return true;
    }

    public static boolean containsDigit(String token) {
        for (int i = 0; i < token.length(); i++) {
            if (java.lang.Character.isDigit(token.charAt(i))) {
                return true;
            }
        }
        return false;
    }

    public static String CleanCharacter(String txt, int uValue) {
        StringBuffer retVal = new StringBuffer();
        for (int i = 0; i < txt.length(); i++) {
            int uChar = (txt.charAt(i));
            if (uChar != uValue) {
                retVal.append(txt.charAt(i));
            } else {
                retVal.append(" ");
            }
        }
        return retVal.toString();
    }

    public static String removeHTMLTagsFromStr(String inputStr) {
        String[] removeTags = StringUtils.substringsBetween(inputStr, "<", ">");

        if (removeTags != null && removeTags.length > 0) {
            for (String tag : removeTags) {
                inputStr = StringUtils.remove(inputStr, "<" + tag + ">");
            }
        }

        return inputStr;
    }

    public static String unescapeHTML(String text) {
        return org.apache.commons.lang.StringEscapeUtils.unescapeHtml(text);
    }

    public static String stripHTML(String text) {
        return text.replaceAll("\\<.*?>", "");
    }

    public static String stripScriptTags(String text) {
        Pattern p = java.util.regex.Pattern.compile("\\<SCRIPT.*?</SCRIPT>",
                Pattern.DOTALL | Pattern.CASE_INSENSITIVE);
        Matcher matcher = p.matcher(text);
        String tmp = matcher.replaceAll("");
        return tmp;
    }

    public static String stripNoScriptTags(String text) {
        Pattern p = java.util.regex.Pattern.compile("\\<NOSCRIPT.*?</NOSCRIPT>",
                Pattern.DOTALL | Pattern.CASE_INSENSITIVE);
        Matcher matcher = p.matcher(text);
        String tmp = matcher.replaceAll("");
        return tmp;
    }

    public static String stripHTMLMultiLine(String text, HashSet<String> allowedHtmlTags, String escGtCh,
            String escLtCh) {
        if (StringUtils.isNotEmpty(text)) {

            boolean hadAllowedHtmlTags = false;

            if (allowedHtmlTags != null) {
                for (String htmlTag : allowedHtmlTags) {
                    String tmp = text.replaceAll("<" + htmlTag + ">", escLtCh + htmlTag + escGtCh);
                    tmp = tmp.replaceAll("</" + htmlTag + ">", escLtCh + "/" + htmlTag + escGtCh);
                    if (!tmp.equals(text)) {
                        text = tmp;
                        hadAllowedHtmlTags = true;
                    }
                }
            }

            text = stripHTMLMultiLine(text);

            if (hadAllowedHtmlTags) {
                text = text.replaceAll(escLtCh, "<");
                text = text.replaceAll(escGtCh, ">");
            }
        }

        return text;
    }

    public static String stripHTMLMultiLine(String text) {
        Pattern p = java.util.regex.Pattern.compile("\\<.*?>", Pattern.DOTALL);
        Matcher matcher = p.matcher(text);
        String tmp = matcher.replaceAll("");
        return tmp;
    }

    public static String stripHTMLCommentsMultiLine(String text) {
        Pattern p = java.util.regex.Pattern.compile("\\<!--.*?-->", Pattern.DOTALL);
        Matcher matcher = p.matcher(text);
        String tmp = matcher.replaceAll("");
        return tmp;
    }

    public static boolean isFlagSet(Integer flags, Integer flagToCheck) {
        if (flags != null && flagToCheck != null) {
            return ((flags & flagToCheck) == flagToCheck);
        }
        return false;
    }

    public static Integer updateFlag(Integer flags, Integer flagToCheck, boolean shouldSet) {
        if (shouldSet) {
            return setFlag(flags, flagToCheck);
        } else {
            return resetFlag(flags, flagToCheck);
        }
    }

    public static Integer setFlag(Integer flags, Integer flagToCheck) {
        if (flags == null) {
            flags = new Integer(0);
        }
        if (!isFlagSet(flags, flagToCheck)) {
            flags = flags + flagToCheck;
            ;
        }
        return flags;
    }

    public static Integer resetFlag(Integer flags, Integer flagToCheck) {
        if (flags == null) {
            // nothing to reset
            flags = new Integer(0);
            return flags;
        }

        if (isFlagSet(flags, flagToCheck)) {
            flags = flags - flagToCheck;
        }
        return flags;
    }

    public static String truncateOnSpace(String text, Integer length) {
        String retVal = "";
        if (text.length() <= length) {
            retVal = text;
        } else {
            StringBuffer b = new StringBuffer();
            for (int i = 0; i < text.length(); i++) {
                if (b.length() >= length && Character.isWhitespace(text.charAt(i))) { // iterate
                    // until
                    // we
                    // hit
                    // whitespace
                    b.append("...");
                    break;
                }
                b.append(text.charAt(i));
            }
            retVal = b.toString();
        }

        return retVal.trim();
    }

    public static String sanitizeString(String text) {
        text = Utils.stripHTMLCommentsMultiLine(text);
        text = Utils.stripHTMLMultiLine(text);
        text = Utils.unescapeHTML(text);
        text = StringUtils.trimToEmpty(text);
        text = text.replaceAll("\\s+", " ");
        return text;
    }

    public static String makeStringUrlSafe(String text) {
        StringBuffer b = new StringBuffer();
        for (int i = 0; i < text.length(); i++) {
            if (StringUtils.isAlphanumericSpace(String.valueOf(text.charAt(i)))) {
                b.append(text.charAt(i));
            }
        }
        return Utils.convertToASCII(b.toString().replaceAll("\\s+", " "));
    }

    public static String getEventIdFromNewsUrl(String url) {
        String eventId = null;
        String p = "news/([0-9]+)";
        Pattern pattern = Pattern.compile(p);
        Matcher matcher = pattern.matcher(url);
        while (matcher.find()) {
            // System.out.println("found: " + matcher.group(2));
            eventId = matcher.group(1);
        }
        return eventId;
    }

    public static String buildCommaSeparatedIds(List ids) {

        if (ids != null && ids.size() > 0) {
            StringBuffer sbuf = new StringBuffer();

            for (int count = 0; count < ids.size(); count++) {
                if (count > 0) {
                    sbuf.append(",");
                }
                sbuf.append(ids.get(count));
            }
            return sbuf.toString();
        }

        return null;
    }

    public static float computeScoreForRanking(List<Float> scores, int desiredRanking) {
        float newScore = 0f;

        if (desiredRanking == 1) {
            newScore = scores.get(0) + 50000;
        } else if (desiredRanking == scores.size()) {
            newScore = scores.get(scores.size() - 1) - 1;
        } else {
            newScore = (scores.get(desiredRanking - 2) + scores.get(desiredRanking - 1)) / 2;
        }

        return newScore;
    }

    public static String fullStripHTML(String text) {
        text = Utils.stripScriptTags(text);
        text = Utils.stripNoScriptTags(text);
        text = Utils.stripStyleTags(text);
        return text.replaceAll("\\<.*?>", "");
    }

    public static String stripStyleTags(String text) {
        Pattern p = java.util.regex.Pattern.compile("\\<STYLE.*?</STYLE>",
                Pattern.DOTALL | Pattern.CASE_INSENSITIVE);
        Matcher matcher = p.matcher(text);
        String tmp = matcher.replaceAll("");
        return tmp;
    }

    public static boolean isLatinWord(String word) {
        for (int i = 0; i < word.length(); i++) {
            int asciiCode = (int) word.charAt(i);
            if (asciiCode > 128)
                return false;
        }

        return true;
    }

    static public void main(String[] args) {
        System.out.println(isLatinWord("Performing Arts Center (SPAC)"));
        System.out.println(isLatinWord("Jazz Age"));

        System.out.println(isLatinWord(
                ""));
        System.out.println(isLatinWord(" "));
        System.out.println(isLatinWord("  "));
        System.out.println(isLatinWord(
                ", "));

        System.out.println(convertToASCII(
                "Irvine Bay Hotel & Golf Club on Sunday, May 01 duringJazz on the Beach,Tobago Jazz Experience alongsideThe Jazz Singer"));
        System.out.println(convertToASCII(
                "This years event, held again at the wonderful Saratoga Performing Arts Center (SPAC)"));
        System.out.println(convertToASCII(
                "and the great saxophone playing of Sam Rogers                Rush Hour Blues 2010     .  "));
        System.out.println(
                convertToASCII("&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; Ron Carter is among the most original, prolific "));
        System.out.println(convertToASCII(
                ".  Ron Carter is among the most original, prolific. "));
        // TODO deal with
        // www.wmot.org/program-guide/program-listings/28th_annual_playboy_jazz_festiva_2006.htm
        System.out.println(convertToASCII(
                "By the mid 1920s,    during the period referred to as the Jazz Age, jazz music was heard    in most major cities from the East Coast"));

    }
}