Android Open Source - sdk-hyphenation Hyphenator






From Project

Back to project page sdk-hyphenation.

License

The source code is released under:

GNU Lesser General Public License

If you think the Android project sdk-hyphenation listed in this page is inappropriate, such as containing malicious code/tools or violating the copyright, please email info at java2s dot com, thanks.

Java Source Code

package org.silpa.hyphenation.text;
//w w w.  j  av a 2 s  . c om
import android.content.Context;

import org.silpa.guesslanguage.GuessLanguage;
import org.silpa.hyphenation.R;
import org.silpa.hyphenation.text.Utf8TexParser.TexParserException;
import org.silpa.hyphenation.util.ErrorHandler;
import org.silpa.hyphenation.util.List;
import org.silpa.hyphenation.util.LoggingErrorHandler;

import java.io.IOException;
import java.io.Reader;
import java.util.HashMap;
import java.util.Map;
import java.util.logging.Logger;

/**
 * Inserts soft hyphens at all allowed locations, using TeX hyphenation tables.
 */
public class Hyphenator {

    // Hyphen code points, taken from the Wikipedia article:
    // https://en.wikipedia.org/wiki/Hyphen#Unicode
    // U+2010; when found in a phrase, hyphenate() appends a zero width space after it.
    public static final char HYPHEN = '\u2010';
    // U+002D (ASCII '-'); treated the same way as HYPHEN by hyphenate().
    public static final char HYPHEN_MINUS = '\u002d';
    // U+00AD; the character hyphenate() inserts at every allowed break position.
    public static final char SOFT_HYPHEN = '\u00ad';
    // U+2011; declared for callers, not referenced by the code in this class.
    public static final char NON_BREAKING_HYPHEN = '\u2011';
    // U+200B; appended after HYPHEN / HYPHEN_MINUS in the hyphenated output.
    private static final char ZERO_WIDTH_SPACE = '\u200b';

    // Stable handler object handed to the ByteScanner; its target can be swapped
    // later through setErrorHandler().
    private final ForwardingErrorHandler errorHandler;
    // Rule set used by hyphenate(); stays null until loadTable() or setRuleSet() is called.
    private RuleDefinition ruleSet;
    // Scanner used by the InputStream based loadTable() overloads; it becomes the
    // rule set after a scan.
    private final ByteScanner b;

    // Language detection used by hyphenateWithDetectLangauge().
    private GuessLanguage guessLanguage;
    // Android context; used to open the bundled English table as a raw resource.
    private Context mContext;
    // Maps a language code (for example "hi" or "ta") to the TeX source of its
    // hyphenation table; filled by the static initializer below.
    // NOTE(review): in this copy of the file the pattern characters show up as '?',
    // which looks like an encoding loss -- verify the tables against the upstream source.
    private static Map<String, String> indicHyphenRules = new HashMap<>();

    static {
        indicHyphenRules.put("as", "\\patterns{\n2??2\n1?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n??1\n??1\n?1\n?1\n?1\n?1\n?1\n??1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n2?2\n?1\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1??\n1?\n1?\n1?\n1?\n1?\n1?\n1??\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n?1\n2?1\n2?1\n2??1\n2?1\n2??2\n}\n\\hyphenation{\n}");
        indicHyphenRules.put("bn", "\\patterns{\n2??2\n1?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n??1\n??1\n?1\n?1\n?1\n?1\n?1\n??1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n2?2\n?1\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1??\n1?\n1?\n1?\n1?\n1?\n1?\n1??\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n?1\n2?1\n2?1\n2??1\n2?1\n2??2\n}\n\\hyphenation{\n}");
        indicHyphenRules.put("gu", "\\patterns{\n2??2\n1?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n??1\n??1\n?1\n?1\n?1\n?1\n?1\n??1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1??\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n2??1\n2?1\n2?1\n2??2\n2?2\n}\n\\hyphenation{\n}");
        indicHyphenRules.put("hi", "\\patterns{\n2??2\n1?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n??1\n??1\n?1\n?1\n?1\n?1\n?1\n??1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1??\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n2??1\n2?1\n2?1\n2?1\n2?1\n2?1\n2??2\n}\n\\hyphenation{\n}");
        indicHyphenRules.put("kn", "\\patterns{\n2??2\n1?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n??1\n??1\n?1\n?1\n?1\n?1\n?1\n?1\n??1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1??\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n2?1\n2?1\n2?1\n2?1\n2?1\n2??2\n}\n\\hyphenation{\n}");
        indicHyphenRules.put("ml", "\\patterns{\n2??2\n1?1\n1?1\n1?1\n1?1\n1?1\n1?1\n1?1\n1?1\n1?1\n1?1\n1?1\n1?1\n1??1\n1??1\n1?1\n1?1\n1?1\n?1\n?1\n?1\n??1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1??\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n2?1\n2?1\n2??2\n???2\n???2\n???2\n???2\n???2\n???2\n2?????\n2?????\n2?????\n2?????\n2?????\n2?????\n2?\n2?\n2?\n2?\n2?\n2?\n}\n\\hyphenation{\n}");
        indicHyphenRules.put("mr", "\\patterns{\n2??2\n1?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n??1\n??1\n?1\n?1\n?1\n?1\n?1\n??1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1??\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n2??1\n2?1\n2?1\n2?1\n2?1\n2?1\n2??2\n}\n\\hyphenation{\n}");
        indicHyphenRules.put("or", "\\patterns{\n2??2\n1?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n??1\n??1\n?1\n?1\n?1\n?1\n?1\n??1\n?1\n?1\n?1\n?1\n?1\n?1\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1??\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n2?1\n2?1\n2?1\n2??1\n2??2\n}\n\\hyphenation{\n}");
        indicHyphenRules.put("pa", "\\patterns{\n2??2\n1?1\n?1\n?1\n?1\n?1\n?1\n?1\n??1\n??1\n?1\n?1\n?1\n?1\n?1\n??1\n?1\n?1\n?1\n?1\n?1\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1??\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n2??1\n2?1\n2?1\n2??2\n2?2\n2?2\n}\n\\hyphenation{\n}");
        indicHyphenRules.put("ta", "\\patterns{\n2??2\n1?1\n1?1\n1?1\n1?1\n1?1\n1?1\n1?1\n1?1\n1??1\n1??1\n1?1\n1?1\n1?1\n?1\n?1\n?1\n??1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n2???1\n2???1\n2???1\n2???1\n2???1\n2???1\n2???1\n2???1\n2???1\n2???1\n2???1\n2???1\n2???1\n2???1\n2???1\n2???1\n2???1\n2???1\n2???1\n2???1\n2???1\n2?1\n2?1\n2?1\n2??1\n}\n\\hyphenation{\n}");
        indicHyphenRules.put("te", "\\patterns{\n2??2\n1?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n??1\n??1\n?1\n?1\n?1\n?1\n?1\n?1\n??1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n?1\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1??\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n1?\n2??1\n2?1\n2?1\n2?1\n2?1\n2??2\n}\n\\hyphenation{\n}");
    }

    /**
     * Constructor
     * Creates an uninitialized instance of Hyphenator. The same instance can be
     * reused for different hyphenation tables.
     *
     * @param context context of application
     */
    public Hyphenator(Context context) {
        errorHandler = new ForwardingErrorHandler(new LoggingErrorHandler(Logger.getLogger(this.getClass().getCanonicalName())));
        b = new ByteScanner(errorHandler);
        this.mContext = context;
        this.guessLanguage = new GuessLanguage(this.mContext);
    }

    /**
     * Returns the rule set that is currently used for hyphenation.
     *
     * @return the active rule set, or {@code null} when no table has been loaded
     * or set yet
     */
    public RuleDefinition getRuleSet() {
        return this.ruleSet;
    }

    /**
     * Replaces the rule set used for hyphenation.
     *
     * @param scanner the rule set that subsequent {@code hyphenate} calls consult
     */
    public void setRuleSet(RuleDefinition scanner) {
        ruleSet = scanner;
    }

    /**
     * Returns the error handler that currently receives this hyphenator's
     * messages.
     *
     * @return the handler installed by the constructor or by
     * {@link #setErrorHandler(ErrorHandler)}
     */
    public ErrorHandler getErrorHandler() {
        final ErrorHandler current = this.errorHandler.getTarget();
        return current;
    }

    /**
     * Installs the error handler that receives this hyphenator's messages from
     * now on.
     *
     * @param eh handler to forward debug, info, warning, error and exception
     *           reports to
     * @see org.silpa.hyphenation.util.ErrorHandler
     */
    public void setErrorHandler(ErrorHandler eh) {
        this.errorHandler.setTarget(eh);
    }

    /**
     * Parses a hyphenation table from a character stream and makes it the active
     * rule set, which allows UTF-8 pattern files to be used.
     * <p>
     * Escape codes of the original TeX files (for example ^^f6) are not
     * supported. Tables from several calls are not merged: only the most recently
     * parsed one is used. Only "\pattern{" and "\hyphenation{" groups are
     * supported.
     *
     * @param reader source of the hyphenation patterns (most likely a file)
     * @throws TexParserException if there are problems reading the input
     */
    public void loadTable(Reader reader) throws TexParserException {
        this.ruleSet = new Utf8TexParser().parse(reader);
    }

    /**
     * Loads a hyphenation table from a byte stream, mapping every one-byte code
     * to the character with the same numeric value.
     *
     * @param in hyphenation table
     * @throws java.io.IOException IOException while reading rules
     */
    public void loadTable(java.io.InputStream in) throws java.io.IOException {
        // Identity mapping: code n stands for character n.
        final int[] identityCodes = new int[256];
        for (int code = 0; code < identityCodes.length; code++) {
            identityCodes[code] = code;
        }
        loadTable(in, identityCodes);
    }

    /**
     * Loads a hyphenation table from a byte stream that uses a non-UCS one-byte
     * encoding, and makes the scanned table the active rule set.
     *
     * @param in       hyphenation table
     * @param codelist an array of 256 elements that maps one-byte codes to UTF codes
     * @throws java.io.IOException IOException while reading rules
     */
    public void loadTable(java.io.InputStream in, int[] codelist)
            throws java.io.IOException {
        final ByteScanner scanner = this.b;
        scanner.scan(in, codelist);
        this.ruleSet = scanner;
    }

    /**
     * Hyphenates a phrase, keeping at least one character unbroken at the start
     * and at the end of every word.
     *
     * @param phrase string to hyphenate
     * @return the string with soft hyphens inserted
     */
    public String hyphenate(String phrase) {
        final int minCharsPerSide = 1;
        return hyphenate(phrase, minCharsPerSide, minCharsPerSide);
    }

    /**
     * Hyphenates every word of a phrase with the currently loaded rule set.
     * <p>
     * A word is a maximal run of characters for which
     * {@link Character#isLetter(char)} is true. Inside each sufficiently long word
     * a {@link #SOFT_HYPHEN} is inserted after every position the rules mark with
     * an odd value. Outside words, a zero width space is appended after each
     * {@link #HYPHEN_MINUS} or {@link #HYPHEN} so that a line may break there.
     * All other characters are copied unchanged.
     * <p>
     * NOTE(review): {@code Character.isLetter} is false for combining marks, and
     * many Indic vowel signs are combining marks, so such characters end a word
     * here -- confirm that this is intended for the Indic tables.
     *
     * @param phrase         string to hyphenate; {@code null} is returned as is
     * @param leftHyphenMin  unbreakable characters at the beginning of each word in the
     *                       phrase; values below 1 are treated as 1
     * @param rightHyphenMin unbreakable characters at the end of each word in the phrase;
     *                       values below 1 are treated as 1
     * @return the string with soft hyphens inserted; the phrase itself when it is
     * shorter than {@code leftHyphenMin + rightHyphenMin}
     * @throws IllegalStateException if a word has to be hyphenated but no rule set
     *                               has been loaded or set
     */
    public String hyphenate(String phrase, int leftHyphenMin, int rightHyphenMin) {
        if (phrase == null) {
            return null;
        }

        final int left = Math.max(leftHyphenMin, 1);
        final int right = Math.max(rightHyphenMin, 1);
        // Summed as long: two very large minimums must not wrap around to a
        // negative int and defeat the length checks below.
        final long minWordLength = (long) left + (long) right;

        // Ignore short phrases (early out)
        final int phraseLength = phrase.length();
        if (phraseLength < minWordLength) {
            return phrase;
        }

        final char[] source = phrase.toCharArray();
        final StringBuilder hyphenated = new StringBuilder(phraseLength + 16);

        // The scan is bounded by the array length instead of a NUL sentinel, so a
        // phrase that contains U+0000 is no longer cut off at that character.
        int pos = 0;
        while (pos < phraseLength) {
            final char current = source[pos];
            if (!Character.isLetter(current)) {
                hyphenated.append(current);
                if (current == HYPHEN_MINUS || current == HYPHEN) {
                    hyphenated.append(ZERO_WIDTH_SPACE);
                }
                pos++;
                continue;
            }

            // Collect the whole run of letters starting here.
            final int wordStart = pos;
            while (pos < phraseLength && Character.isLetter(source[pos])) {
                pos++;
            }
            final int wordLength = pos - wordStart;

            if (wordLength < minWordLength) {
                // Word is too short to hyphenate, so just copy it; no rule lookup needed.
                hyphenated.append(source, wordStart, wordLength);
                continue;
            }

            final int[] points = resolveHyphenationPoints(source, wordStart, wordLength);
            // A break is allowed after word character i only for
            // left - 1 <= i < wordLength - right.
            final int firstBreakable = left - 1;
            final int endBreakable = wordLength - right;
            for (int i = 0; i < wordLength; i++) {
                hyphenated.append(source[wordStart + i]);
                if (i >= firstBreakable && i < endBreakable && points[i] % 2 == 1) {
                    hyphenated.append(SOFT_HYPHEN);
                }
            }
        }
        return hyphenated.toString();
    }

    /**
     * Looks up the hyphen qualification points of one word: the rule set's
     * exception entry if it has one, otherwise the result of applying the patterns.
     *
     * @param source     characters of the whole phrase
     * @param wordStart  index of the first character of the word in {@code source}
     * @param wordLength number of characters in the word
     * @return one value per word character; an odd value allows a break after it
     */
    private int[] resolveHyphenationPoints(char[] source, int wordStart, int wordLength) {
        if (ruleSet == null) {
            throw new IllegalStateException(
                    "No hyphenation table loaded; call loadTable() or setRuleSet() first");
        }
        // Locale.ROOT keeps the lookup key independent of the device locale, in
        // line with the per-character lower-casing done by extractWord().
        final String word = new String(source, wordStart, wordLength)
                .toLowerCase(java.util.Locale.ROOT);
        int[] points = ruleSet.getException(word);
        if (points == null) {
            points = applyHyphenationRules(extractWord(source, wordStart, wordLength), wordLength);
        }
        return points;
    }

    /**
     * Detects the language of a phrase and hyphenates it with the matching
     * table, keeping at least one character unbroken at the start and at the end
     * of every word.
     * Object must be created with Hyphenator(Context context)
     *
     * @param phrase string to hyphenate
     * @return hyphenated string
     */
    public String hyphenateWithDetectLangauge(String phrase) {
        final int minCharsPerSide = 1;
        return hyphenateWithDetectLangauge(phrase, minCharsPerSide, minCharsPerSide);
    }

    /**
     * Detects the language of a phrase, loads the matching hyphenation table and
     * hyphenates the phrase with it.
     * Object must be created with Hyphenator(Context context)
     * <p>
     * English ("en") uses the table bundled as a raw resource; the other supported
     * languages use the built-in tables of this class. The loaded table replaces
     * the current rule set of this instance. If the detected language has no
     * table, or the table cannot be loaded, the phrase is returned unchanged; a
     * load failure is reported to the installed error handler.
     *
     * @param phrase         string to hyphenate
     * @param leftHyphenMin  unbreakable characters at the beginning of each word in the
     *                       phrase
     * @param rightHyphenMin unbreakable characters at the end of each word in the phrase
     * @return the string with soft hyphens inserted, the unchanged phrase when no
     * table is available, or {@code null} when no language guesser is present
     */
    public String hyphenateWithDetectLangauge(String phrase, int leftHyphenMin, int rightHyphenMin) {
        if (guessLanguage == null) {
            // Kept for backward compatibility with existing callers.
            return null;
        }
        final String lang = guessLanguage.guessLanguage(phrase);

        try {
            // English must be handled before the built-in table lookup: "en" has
            // no entry in indicHyphenRules, so looking it up first would always
            // return early and the English table would never be loaded.
            if ("en".equals(lang)) {
                final java.io.InputStream in =
                        this.mContext.getResources().openRawResource(R.raw.silpa_sdk_hyph_en);
                try {
                    this.loadTable(in);
                } finally {
                    in.close(); // the resource stream was previously left open
                }
            } else {
                final String texRules = indicHyphenRules.get(lang);
                if (texRules == null) {
                    // No table for this language.
                    return phrase;
                }
                this.setRuleSet(new Utf8TexParser().parse(texRules));
            }
        } catch (TexParserException e) {
            // Do not hyphenate with a missing or stale rule set from another language.
            errorHandler.exception("Could not parse hyphenation rules for language '" + lang + "'", e);
            return phrase;
        } catch (IOException e) {
            errorHandler.exception("Could not read hyphenation rules for language '" + lang + "'", e);
            return phrase;
        }
        return hyphenate(phrase, leftHyphenMin, rightHyphenMin);
    }

    /**
     * Copies one word out of a character array in the form the pattern matcher
     * expects: every character lower-cased, with a '.' marker placed before the
     * first and after the last character.
     *
     * @param chars      The character array to extract a smaller section from
     * @param wordStart  First character to include from the source array <b>chars</b>.
     * @param wordLength Number of characters to include from the source array
     *                   <b>chars</b>
     * @return Word converted to lower case and surrounded by '.'
     */
    private char[] extractWord(char[] chars, int wordStart, int wordLength) {
        final char[] framed = new char[wordLength + 2];
        framed[framed.length - 1] = '.';
        framed[0] = '.';
        int target = 1;
        for (int offset = 0; offset < wordLength; offset++) {
            framed[target++] = Character.toLowerCase(chars[wordStart + offset]);
        }
        return framed;
    }

    /**
     * Generates the hyphen qualification points for a word by applying the
     * patterns of the current rule set.
     * <p>
     * For every start position the pattern tree for the character at that
     * position is walked along the following characters of the word. Each
     * matching node contributes an array of values, and the largest value seen
     * for each position is kept.
     *
     * @param wordChars Word surrounded by '.' characters
     * @param length    Length of the word (excluding '.' characters)
     * @return hyphen qualification points for the word, one per word character;
     * the caller inserts a soft hyphen after character i when entry i is odd
     */
    @SuppressWarnings("rawtypes")
    private int[] applyHyphenationRules(final char[] wordChars, final int length) {
        // Working array indexed in wordChars coordinates, with one spare slot.
        int[] hyphenQualificationPoints = new int[wordChars.length + 1];

        // NOTE(review): wordChars holds length + 2 characters, so the start
        // positions "length" (last letter) and "length + 1" (closing '.') are
        // never tried as a pattern start -- confirm that this is intended,
        // in particular for a right minimum of 1.
        for (int istart = 0; istart < length; istart++) {
            List rules = ruleSet.getPatternTree((int) wordChars[istart]);
            int i = istart;

            // Candidate nodes for the character at position i.
            java.util.Enumeration rulesEnumeration = rules.elements();
            while (rulesEnumeration.hasMoreElements()) {
                rules = (List) rulesEnumeration.nextElement();

                // The head of a node is its character; other candidates are skipped.
                if (((Character) rules.head()).charValue() == wordChars[i]) {
                    // After the character comes the int[] of values for this node
                    // (presumably longTail() is the list without its head -- verify
                    // against util.List).
                    rules = rules.longTail();
                    int[] nodevalues = (int[]) rules.head();
                    // Keep the maximum per position, offset by the pattern start.
                    for (int inv = 0; inv < nodevalues.length; inv++) {
                        if (nodevalues[inv] > hyphenQualificationPoints[istart
                                + inv]) {
                            hyphenQualificationPoints[istart + inv] = nodevalues[inv];
                        }
                    }
                    i++;

                    // End of the word reached: nothing left to match.
                    if (i == wordChars.length) {
                        break;
                    }
                    // Descend: continue with the child nodes of the matched node.
                    rulesEnumeration = rules.longTail().elements();
                }
            }
        }

        // Drop the first two working slots so that entry i of the result is
        // working entry i + 2, and trim the result to one entry per word character.
        int[] newvalues = new int[length];
        System.arraycopy(hyphenQualificationPoints, 2, newvalues, 0, length);
        hyphenQualificationPoints = newvalues;
        return hyphenQualificationPoints;
    }

    private class ForwardingErrorHandler implements ErrorHandler {
        private ErrorHandler target;

        public ForwardingErrorHandler(ErrorHandler target) {
            this.target = target;
        }

        public ErrorHandler getTarget() {
            return target;
        }

        public void setTarget(ErrorHandler target) {
            this.target = target;
        }

        public void debug(String domain, String message) {
            target.debug(domain, message);
        }

        public void info(String s) {
            target.info(s);
        }

        public void warning(String s) {
            target.warning(s);
        }

        public void error(String s) {
            target.error(s);
        }

        public void exception(String s, Exception e) {
            target.exception(s, e);
        }
    }

}




Java Source Code List

org.silpa.hyphenation.text.ByteScanner.java
org.silpa.hyphenation.text.Hyphenator.java
org.silpa.hyphenation.text.RuleDefinition.java
org.silpa.hyphenation.text.TreeNode.java
org.silpa.hyphenation.text.Utf8TexParser.java
org.silpa.hyphenation.util.Applicator.java
org.silpa.hyphenation.util.ErrorHandler.java
org.silpa.hyphenation.util.Hashtable.java
org.silpa.hyphenation.util.List.java
org.silpa.hyphenation.util.LoggingErrorHandler.java