dk.statsbiblioteket.util.CachedCollator.java Source code

Introduction

Here is the source code for dk.statsbiblioteket.util.CachedCollator.java
Source

/* $Id: CachedCollator.java,v 1.3 2007/12/04 13:22:01 mke Exp $
 * $Revision: 1.3 $
 * $Date: 2007/12/04 13:22:01 $
 * $Author: mke $
 *
 * The SB Util Library.
 * Copyright (C) 2005-2007  The State and University Library of Denmark
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 */
/* $Id: CachedCollator.java,v 1.3 2007/12/04 13:22:01 mke Exp $
 * $Revision: 1.3 $
 * $Date: 2007/12/04 13:22:01 $
 * $Author: mke $
 *
 * Copyright 2007 Statsbiblioteket, Denmark
 */
package dk.statsbiblioteket.util;

import dk.statsbiblioteket.util.qa.QAInfo;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import java.io.IOException;
import java.io.StringWriter;
import java.text.CollationKey;
import java.text.Collator;
import java.text.ParseException;
import java.text.RuleBasedCollator;
import java.util.*;

/**
 * A {@link Collator} that speeds up comparisons with a per-character cache.
 *
 * Uses char-statistics to build a cache for the specified locale. Users should
 * provide implementation-specific char-statistics in order to achieve the
 * maximum possible speed-up. If no statistics are given, the resource
 * {@link #CHARSTATS} is tried and, failing that, the characters from
 * 0x20 to 0xFF are used. Note that this might give faulty sorting for some
 * languages.
 *
 * When two Strings are compared, the characters are looked up in the cache
 * position by position. As long as both characters are cached, the comparison
 * is done with the cache-table, which is fast. If a character that is not in
 * the cache is encountered before a difference is found, the comparison is
 * done with the underlying Collator.
 *
 * null is handled explicitly and always occurs last.
 *
 * The characters given must all be single-character comparable. Any characters
 * used in more complex sorting rules (e.g. "aa" in Danish), should not be
 * used for the cache. Note that the given characters are not required to be
 * sorted, but should appear in order of popularity.
 */
@QAInfo(state = QAInfo.State.QA_NEEDED, level = QAInfo.Level.NORMAL, author = "te", comment = "getCollationKey is poorly implemented due to the CollationKeyclass being final")
public class CachedCollator extends Collator {
    private static final Log log = LogFactory.getLog(CachedCollator.class);

    /**
     * ASCII-chars that are not special characters and not letters. It should be
     * safe to use these as commonChars as they are normally compared 1:1.
     * Normally it is advisable to extend this range with safe letters for the
     * wanted locale.
     */
    public static final String COMMON_NON_LETTER = " !\"#$%&'()*+,-./0123456789:;<=>?[\\]^_{|}~@";

    /**
     * ASCII-chars a-z and A-Z. Depending on local rules for sorting, these
     * might not be safe to use as commonChars (e.g. "aa" comes after "ab" in
     * some standard Danish sorts). Normally this will be used together with
     * {@link #COMMON_NON_LETTER} and other characters.
     */
    public static final String COMMON_AZ = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";

    /**
     * The Danish letters &aelig;, &oslash; and &aring; in lower and upper
     * case. For Danish sorting, these are safe to use as commonChars, together
     * with {@link #COMMON_NON_LETTER} and {@link #COMMON_AZ}.
     *
     * The letters are written as Unicode escapes so that the constant survives
     * source files being re-encoded.
     */
    public static final String COMMON_DK_SPECIFIC = "\u00e6\u00f8\u00e5\u00c6\u00d8\u00c5";

    /**
     * Most commonly used characters from the corpus at Statsbiblioteket.
     * If the local sorting rules are Danish and if "aa" comes before "ab",
     * these should be safe (and recommended) to use as commonChars.
     * These chars are a "complete" list, meaning that they can be used directly
     * as commonChars, without the need of extending.
     *
     * NOTE(review): the '?' characters in the literal look like non-ASCII
     * characters that were lost in an encoding conversion - verify against the
     * original corpus statistics.
     */
    public static final String COMMON_SUMMA_EXTRACTED = "eaoi 0ntr1s24cl93857hd6pgum.bfv:xwykj_z/-qASPCXIU"
            + "NEG$>#V&^?L~T*@%?BM|FYDR"
            + "O,HJKZQ?"
            + "?"
            + "W??;'????"
            + "?=?????"
            + "? ??";

    /**
     * If a CachedCollator is created without char statistics, it looks for
     * the resource CHARSTATS. If the resource can be fetched, the content is
     * used as char statistics.
     */
    public static final String CHARSTATS = "charstats.dat";

    /**
     * The fall-back collator that is used when the cache does not contain the
     * relevant characters. It is also the authority used for ordering the
     * characters when the cache is built.
     */
    private final Collator subCollator;

    /**
     * Mappings for cached characters, indexed directly by char value. Cached
     * characters map to sort-positions &gt; 0; a sort-position of 0 indicates
     * that the character is not cached.
     *
     * The array has one entry for every possible char value (0..65535), so a
     * lookup with any char is always inside the array.
     */
    private final int[] cachedPositions = new int[Character.MAX_VALUE + 1];

    /**
     * Create a cached collator with the default character statistics (see
     * {@link #getBasicChars()}) as the most common characters. It is
     * recommended to use the constructor
     * {@link #CachedCollator(Locale, String)} instead, in order to achieve
     * maximum speed-up and valid comparisons.
     *
     * @param locale the wanted locale for the Collator.
     */
    public CachedCollator(Locale locale) {
        log.debug("Creating default character collator for locale '" + locale + "'");
        subCollator = Collator.getInstance(locale);
        buildCache(getBasicChars());
    }

    /**
     * Create a cached collator with the given character statistics.
     *
     * @param locale     the wanted locale for the Collator.
     * @param mostCommon the most common characters for the given locale in the
     *                   setting where the collator is used. It can contain any
     *                   number of characters.
     *                   See the class documentation for details.
     *                   Duplicate characters are removed.
     *                   Example: "eaoi 0ntr1"...
     */
    public CachedCollator(Locale locale, String mostCommon) {
        log.debug("Creating collator for locale '" + locale + "' with most common characters '" + mostCommon + "'");
        subCollator = Collator.getInstance(locale);
        buildCache(mostCommon);
    }

    /**
     * Create a cached collator with the given character statistics.
     *
     * @param locale     the wanted locale for the Collator.
     * @param mostCommon the most common characters for the given locale in the
     *                   setting where the collator is used. It can contain any
     *                   number of characters.
     *                   See the class documentation for details.
     *                   Duplicate characters are removed.
     *                   Example: "eaoi 0ntr1"...
     * @param spaceFirst if true, the generated Collator is modified to sort
     *                   spaces before other characters: {"a b", "aa"}.
     */
    public CachedCollator(Locale locale, String mostCommon, boolean spaceFirst) {
        Collator base = Collator.getInstance(locale);
        subCollator = spaceFirst ? fixCollator(base, false) : base;
        buildCache(mostCommon);
    }

    /**
     * Create a cached collator with the default character statistics (see
     * {@link #getBasicChars()}) as the most common characters. It is
     * recommended to use {@link #CachedCollator(Locale, String, boolean)}
     * instead, in order to achieve maximum speed-up and valid comparisons.
     *
     * @param locale     the wanted locale for the Collator.
     * @param spaceFirst if true, the generated Collator is modified to sort
     *                   spaces before other characters: {"a b", "aa"}.
     */
    public CachedCollator(Locale locale, boolean spaceFirst) {
        Collator base = Collator.getInstance(locale);
        subCollator = spaceFirst ? fixCollator(base, false) : base;
        buildCache(getBasicChars());
    }

    /**
     * Create a cached collator with the default character statistics (see
     * {@link #getBasicChars()}) as the most common characters. It is
     * recommended to use the constructor
     * {@link #CachedCollator(Collator, String)} instead, in order to achieve
     * maximum speed-up and valid comparisons.
     *
     * @param collator the inner Collator that the cache is wrapped around.
     */
    public CachedCollator(Collator collator) {
        subCollator = collator;
        buildCache(getBasicChars());
    }

    /**
     * Create a cached collator with the given character statistics.
     *
     * @param collator   the inner Collator that the cache is wrapped around.
     * @param mostCommon the most common characters for the given collator in
     *                   the setting where the cached collator is used. It can
     *                   contain any number of characters.
     *                   See the class documentation for details.
     *                   Duplicate characters are removed.
     *                   Example: "eaoi 0ntr1"...
     */
    public CachedCollator(Collator collator, String mostCommon) {
        subCollator = collator;
        buildCache(mostCommon);
    }

    /**
     * Provides the characters used for the cache when the caller did not
     * supply any statistics.
     *
     * @return the content of the resource {@link #CHARSTATS} if it can be
     *         fetched, otherwise the characters from 0x20 to 0xFF (inclusive)
     *         in char-value order.
     */
    protected String getBasicChars() {
        log.trace("geBasicChars called");
        try {
            return Streams.getUTF8Resource(CHARSTATS);
        } catch (IOException e) {
            // A missing resource is not an error: fall through to the built-in range
            log.debug("Could not fetch the resource '" + CHARSTATS + "'. Defaulting to 0x20-0xFF");
        }

        final int first = 0x20;
        final int last = 0xFF;
        StringBuilder basic = new StringBuilder(last - first + 1);
        for (int value = first; value <= last; value++) {
            basic.append((char) value);
        }
        return basic.toString();
    }

    /**
     * Fills the cache, based on mostCommon and {@link #subCollator}.
     *
     * The distinct characters are sorted with the sub-collator and numbered
     * from 1 in that order. Characters that the sub-collator considers equal
     * share a position; every other character gets a position strictly higher
     * than its predecessor. Any positions from a previous call are discarded.
     *
     * @param mostCommon the most common characters in the application-specific
     *                   domain in prioritized order.
     */
    protected void buildCache(String mostCommon) {
        log.debug("Building cache for '" + mostCommon + "'");
        // Start from a clean slate so that a rebuild cannot leave stale entries behind
        Arrays.fill(cachedPositions, 0);

        // Make sure the characters are unique
        Set<String> unique = new LinkedHashSet<String>(mostCommon.length());
        int highest = 0;
        for (int i = 0; i < mostCommon.length(); i++) {
            char c = mostCommon.charAt(i);
            unique.add(Character.toString(c));
            highest = Math.max(highest, c);
        }

        // Sort the characters
        List<String> sorted = new ArrayList<String>(unique);
        Collections.sort(sorted, subCollator);
        if (log.isTraceEnabled()) {
            log.trace("mostCommon sorted: '" + Logs.expand(sorted, 5000) + "'");
        }

        int position = 0;
        int cached = 0;
        String previous = null;
        for (String current : sorted) {
            if (current.length() != 1) {
                log.warn("The expected character '" + current + "' was of length " + current.length()
                        + ". Skipping");
                continue;
            }
            // Advance the position BEFORE assigning it, so that a character only shares its position with
            // predecessors that the sub-collator considers equal to it
            if (previous == null || subCollator.compare(previous, current) != 0) {
                position++;
            }
            cachedPositions[current.charAt(0)] = position;
            cached++;
            previous = current;
        }
        log.debug("Finished building cache for " + cached + " characters (" + (mostCommon.length() - unique.size())
                + " duplicates removed, " + position + " unique positions) " + "of which the highest was "
                + highest);
    }

    /**
     * Looks up the cached sort-position for a character.
     *
     * @param c the character to look up.
     * @return the sort-position (&gt; 0) for c or 0 if c is not cached.
     */
    protected int getPosition(char c) {
        // Holds the full char range so no boundary check is needed
        return cachedPositions[c];
    }

    /**
     * Compares two Strings, using the cache where possible and the
     * sub-collator otherwise. null sorts after everything else.
     *
     * @param source the first String, may be null.
     * @param target the second String, may be null.
     * @return a negative value, 0 or a positive value if source sorts before,
     *         equal to or after target.
     */
    @Override
    public int compare(final String source, final String target) {
        if (source == null) {
            return target == null ? 0 : 1;
        }
        if (target == null) {
            return -1;
        }
        /*
         The cache is only consulted up to, but not including, the last index shared by both Strings:
         a decision at index i also inspects index i+1, and prefix relations such as "fo" vs. "foobar" as well
         as differences in the final shared character are always left to the sub-collator.
         */
        final int limit = Math.min(source.length(), target.length()) - 1;
        for (int i = 0; i < limit; i++) {
            final int sPos = cachedPositions[source.charAt(i)];
            final int tPos = cachedPositions[target.charAt(i)];
            if (sPos == 0 || tPos == 0) { // Non-cached char
                return subCollator.compare(source, target);
            }
            if (sPos != tPos) {
                // A space right after the differing character is not decided by the cache
                if (source.charAt(i + 1) == ' ' || target.charAt(i + 1) == ' ') {
                    return subCollator.compare(source, target);
                }
                // Positions are small positive ints, so the subtraction cannot overflow
                return sPos - tPos;
            }
        }
        return subCollator.compare(source, target);
    }

    /**
     * Compares two objects that must both be Strings (or null).
     *
     * @param source the first String.
     * @param target the second String.
     * @return the result of {@link #compare(String, String)}.
     * @throws ClassCastException if any of the arguments is not a String.
     */
    @Override
    public int compare(Object source, Object target) {
        return compare((String) source, (String) target);
    }

    /**
     * Delegates directly to the sub-collator; the cache is not involved.
     *
     * @param source the String to create a key for.
     * @return the CollationKey produced by the sub-collator.
     */
    @Override
    public CollationKey getCollationKey(String source) {
        return subCollator.getCollationKey(source);
    }

    /**
     * Two CachedCollators are equal if they are equal as Collators, wrap equal
     * sub-collators and hold identical caches.
     *
     * @param that the object to compare with.
     * @return true if that is a CachedCollator that compares exactly like this.
     */
    @Override
    public boolean equals(Object that) {
        if (this == that) {
            return true;
        }
        // super.equals rejects null and objects of other classes, making the cast below safe
        if (!super.equals(that)) {
            return false;
        }
        CachedCollator other = (CachedCollator) that;
        return subCollator.equals(other.subCollator)
                && Arrays.equals(cachedPositions, other.cachedPositions);
    }

    /**
     * @return the hash code of the sub-collator, which is consistent with
     *         {@link #equals(Object)} as equal instances wrap equal
     *         sub-collators.
     */
    @Override
    public int hashCode() {
        return subCollator.hashCode();
    }

    /**
     * @return the cached chars in collator order.
     */
    public String getCachedChars() {
        List<String> chars = new ArrayList<String>(5000);
        for (int i = 0; i < cachedPositions.length; i++) {
            if (cachedPositions[i] > 0) {
                chars.add(Character.toString((char) i));
            }
        }
        Collections.sort(chars, this);
        StringBuilder ordered = new StringBuilder(chars.size());
        for (String c : chars) {
            ordered.append(c);
        }
        return ordered.toString();
    }

    /**
     * Creates a variant of the given Collator that sorts space before other
     * characters by inserting a rule for space in front of the rule for
     * underscore.
     *
     * @param collator the Collator to modify. Only RuleBasedCollators can be
     *                 modified; any other Collator is returned unchanged.
     * @param check    if true, the rules are inspected first and the Collator
     *                 is returned unchanged if the space rule is already there.
     * @return a Collator that sorts spaces first, or the given Collator if it
     *         could not or did not need to be modified.
     * @throws RuntimeException if the modified rules could not be parsed.
     */
    private static Collator fixCollator(Collator collator, boolean check) {
        if (!(collator instanceof RuleBasedCollator)) {
            log.warn(String.format("fixCollator expected a RuleBasedCollator but got %s. Unable to update Collator",
                    collator.getClass()));
            return collator;
        }
        String rules = ((RuleBasedCollator) collator).getRules();
        // Only skip the update when the space rule is actually present
        if (check && rules.contains("<' '<'\u005f'")) {
            log.debug("fixCollator: The received Collator already sorts spaces first");
            return collator;
        }
        try {
            RuleBasedCollator newCollator = new RuleBasedCollator(rules.replace("<'\u005f'", "<' '<'\u005f'"));
            log.trace("Successfully updated Collator to prioritize spaces before other characters");
            return newCollator;
        } catch (ParseException e) {
            throw new RuntimeException("ParseException while parsing\n" + rules, e);
        }
    }

}