us.eharning.atomun.mnemonic.spi.electrum.v2.CJKCleanupUtility.java Source code

Java tutorial

Introduction

Here is the source code for us.eharning.atomun.mnemonic.spi.electrum.v2.CJKCleanupUtility.java

Source

/*
 * Copyright 2014, 2015 Thomas Harning Jr. <harningt@gmail.com>
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package us.eharning.atomun.mnemonic.spi.electrum.v2;

import com.google.common.base.CharMatcher;
import com.google.common.base.Charsets;
import com.google.common.base.Splitter;
import com.google.common.collect.ImmutableRangeSet;
import com.google.common.collect.Range;
import com.google.common.collect.RangeSet;
import com.google.common.collect.TreeRangeSet;
import com.google.common.io.LineProcessor;
import com.google.common.io.Resources;

import java.io.IOException;
import java.util.Iterator;
import javax.annotation.Nonnull;

/**
 * Utility class to help handle cleanup implemented in Electrum for CJK.
 *
 * <p>The cleanup removes the single space separating two words when the code point
 * immediately before the space and the code point immediately after it are both
 * contained in the ranges loaded from the {@code cjk_ranges.dat} resource.
 */
class CJKCleanupUtility {
    /** Code point ranges parsed from {@code cjk_ranges.dat}; an immutable copy. */
    private static final RangeSet<Integer> CJK_RANGES = buildRanges();

    /**
     * Parse the {@code cjk_ranges.dat} resource into a set of closed code point ranges.
     *
     * <p>Each meaningful line is expected to look like {@code 0xHEX-0xHEX}. Anything
     * from a {@code #} to the end of the line is ignored, as are blank lines.
     *
     * @return immutable set of the ranges listed in the resource.
     *
     * @throws IllegalStateException if the resource cannot be read or contains a malformed line.
     */
    private static RangeSet<Integer> buildRanges() {
        LineProcessor<RangeSet<Integer>> lineProcess = new LineProcessor<RangeSet<Integer>>() {
            private final RangeSet<Integer> resultBuilder = TreeRangeSet.create();

            @Override
            public boolean processLine(@Nonnull String line) throws IOException {
                /* Strip trailing comments, then skip lines with nothing left */
                int commentIndex = line.indexOf('#');
                if (commentIndex >= 0) {
                    line = line.substring(0, commentIndex);
                }
                line = CharMatcher.WHITESPACE.trimFrom(line);
                if (line.isEmpty()) {
                    return true;
                }
                /* NOTE: Assuming 0xHEX-0xHEX notation */
                int splitMarker = line.indexOf('-');
                /*
                 * Validate explicitly instead of via assert: asserts are disabled by default and
                 * a malformed line would otherwise surface as an opaque index/number error.
                 * Each side needs the two-character prefix plus at least one digit.
                 */
                if (splitMarker < 3 || line.length() <= splitMarker + 3) {
                    throw new IOException("Malformed CJK range line (expected 0xHEX-0xHEX): " + line);
                }
                int start;
                int stop;
                try {
                    /* Skip the two-character "0x" prefix on each side of the dash */
                    start = Integer.parseInt(line.substring(2, splitMarker), 16);
                    stop = Integer.parseInt(line.substring(splitMarker + 3), 16);
                } catch (NumberFormatException e) {
                    throw new IOException("Malformed CJK range line (expected 0xHEX-0xHEX): " + line, e);
                }
                resultBuilder.add(Range.closed(start, stop));
                return true;
            }

            @Override
            public RangeSet<Integer> getResult() {
                return ImmutableRangeSet.copyOf(resultBuilder);
            }
        };
        try {
            return Resources.readLines(MnemonicUtility.class.getResource("cjk_ranges.dat"), Charsets.UTF_8,
                    lineProcess);
        } catch (IOException e) {
            throw new IllegalStateException("Unable to load CJK ranges from cjk_ranges.dat", e);
        }
    }

    /**
     * Remove the spaces that separate two CJK characters.
     *
     * <p>The input is split on the space character. A separating space is kept unless the
     * last code point of the word before it and the first code point of the word after it
     * are both within the loaded CJK ranges. Leading, trailing and repeated spaces produce
     * empty words; spaces adjacent to an empty word are always kept, so such input is
     * passed through without failing.
     *
     * @param input text to clean up.
     *
     * @return the input itself when it is entirely ASCII, otherwise the rebuilt text.
     */
    public String cleanup(String input) {
        /* Check for non-ascii range to short-circuit fast */
        if (CharMatcher.ASCII.matchesAllOf(input)) {
            /* All ascii, skip */
            return input;
        }
        /* Split into words and join specially */
        StringBuilder cleanBuilder = new StringBuilder(input.length());
        Iterator<String> splitIterator = Splitter.on(' ').split(input).iterator();
        /* Defensive guard: nothing to join if the splitter produced no words */
        if (!splitIterator.hasNext()) {
            return input;
        }
        String previousWord = splitIterator.next();
        cleanBuilder.append(previousWord);
        while (splitIterator.hasNext()) {
            String nextWord = splitIterator.next();
            if (hasSpaceBetween(previousWord, nextWord)) {
                cleanBuilder.append(' ');
            }
            cleanBuilder.append(nextWord);
            previousWord = nextWord;
        }
        return cleanBuilder.toString();
    }

    /**
     * Decide whether the space between two adjacent words must be retained.
     *
     * @param previousWord word preceding the space, possibly empty.
     * @param nextWord word following the space, possibly empty.
     *
     * @return false only when both bordering code points are in the CJK ranges.
     */
    private boolean hasSpaceBetween(String previousWord, String nextWord) {
        /*
         * An empty word has no bordering code point to inspect (and indexing into it would
         * throw), so it cannot be CJK on that side: keep the space.
         */
        if (previousWord.isEmpty() || nextWord.isEmpty()) {
            return true;
        }
        int previousCodepoint = previousWord.codePointBefore(previousWord.length());
        if (!CJK_RANGES.contains(previousCodepoint)) {
            return true;
        }
        int nextCodePoint = nextWord.codePointAt(0);
        if (!CJK_RANGES.contains(nextCodePoint)) {
            return true;
        }
        /* Both are in CJK range, no space */
        return false;
    }
}