com.zz.langchecker.LangSwitcherTokenizer.java Source code

Introduction

Here is the source code for com.zz.langchecker.LangSwitcherTokenizer.java
Source

/*
 *
 *     Copyright 2013-2014 https://github.com/blizznets authors
 *
 *    Licensed under the Apache License, Version 2.0 (the "License");
 *    you may not use this file except in compliance with the License.
 *    You may obtain a copy of the License at
 *
 *        http://www.apache.org/licenses/LICENSE-2.0
 *
 *    Unless required by applicable law or agreed to in writing, software
 *    distributed under the License is distributed on an "AS IS" BASIS,
 *    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *    See the License for the specific language governing permissions and
 *    limitations under the License.
 *
 */

package com.zz.langchecker;

import com.google.common.base.Charsets;
import com.google.common.base.Joiner;
import com.google.common.base.Optional;
import com.google.common.base.Predicate;
import com.google.common.base.Splitter;
import com.google.common.base.Throwables;
import com.google.common.collect.FluentIterable;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;
import com.google.common.io.LineProcessor;
import com.google.common.io.Resources;
import java.io.IOException;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import static com.zz.langchecker.Characters.*;

public final class LangSwitcherTokenizer implements Tokenizer {
    private static final Joiner JOINER = Joiner.on("");

    private static final char APOSTROPHE = '\'';
    private static final char APOSTROPHE_1 = '`';

    final LangChecker langChecker;
    final Map<String, String> exceptions;

    final int minTokenLength;

    LangSwitcherTokenizer(LangChecker langChecker, int minTokenLength) {
        this.langChecker = langChecker;

        this.minTokenLength = minTokenLength;

        try {
            this.exceptions = Resources.readLines(this.getClass().getResource("exceptions.csv"), Charsets.UTF_8,
                    new ExceptionsLineProcessor());
        } catch (IOException e) {
            throw Throwables.propagate(e);
        }
    }

    public static LangSwitcherTokenizer create() {
        return new LangSwitcherTokenizer(LangChecker.create(), 0);
    }

    public static LangSwitcherTokenizer create(int minTokenLength) {
        return new LangSwitcherTokenizer(LangChecker.create(), minTokenLength);
    }

    @Override
    public TokenizerResponse tokenize(String input) {
        List<Integer> uppercasePositions = uppercasePositions(input);
        String canonical = canonical(input);
        List<Token> allTokens = split(canonical);
        List<String> wordTokens = FluentIterable.from(allTokens).filter(TokenFunctions.isWord())
                .transform(TokenFunctions.corrected()).toList();
        String corrected = restoreUppercase(JOINER.join(Lists.transform(allTokens, TokenFunctions.corrected())),
                uppercasePositions);

        return ImmutableTokenizerResponse.builder().original(input).addAllTokens(wordTokens)
                .corrected(
                        canonical(corrected).equals(canonical) ? Optional.<String>absent() : Optional.of(corrected))
                .build();
    }

    List<Token> split(String input) {
        List<Token> tokens = Lists.newArrayList();
        for (Token token : splitBySpecificSeparators(input, isSeparator(), false)) {
            tokens.addAll(splitPossibleSubTokens(token));
        }
        return ImmutableList.copyOf(tokens);
    }

    List<Token> splitBySpecificSeparators(String original, Predicate<Character> isSeparator,
            boolean useExceptions) {
        if (original.isEmpty()) {
            return ImmutableList.of();
        }

        List<Token> tokens = Lists.newArrayList();

        char[] chars = original.toCharArray();
        boolean isPrevSeparator = false;
        boolean isPrevDigit = false;
        int start = 0;
        CharType.Set charTypes = CharType.createSet();

        for (int i = 0; i < chars.length; i++) {
            char ch = chars[i];

            boolean isCurrentSeparator = isSeparator.apply(ch);
            boolean isCurrentDigit = Character.isDigit(ch);

            if (i > 0 && (isCurrentSeparator ^ isPrevSeparator || isCurrentDigit ^ isPrevDigit)) {
                String token = original.substring(start, i);
                tokens.add(
                        buildToken(TokenType.of(isPrevSeparator), token, token, token, charTypes, useExceptions));

                start = i;
                charTypes = CharType.createSet();
            }

            isPrevSeparator = isCurrentSeparator;
            isPrevDigit = isCurrentDigit;
            charTypes.add(CharType.of(Character.toLowerCase(ch)));
        }

        String token = original.substring(start);
        tokens.add(buildToken(TokenType.of(isPrevSeparator), token, token, token, charTypes, useExceptions));

        return ImmutableList.copyOf(tokens);
    }

    List<Token> splitPossibleSubTokens(Token token) {
        CharType.Set charTypes = token.charTypes();

        if (charTypes.containsOnly(CharType.EN_OR_POSSIBLE_RU)) {
            return enOrPossibleRu(token);
        }

        if (charTypes.containsOnlyFirstOrBoth(CharType.SEPARATOR_OR_POSSIBLE_RU, CharType.EN_OR_POSSIBLE_RU)) {
            return separatorOrPossibleEn(token);
        }

        if (charTypes.containsOnly(CharType.RU_OR_POSSIBLE_EN)) {
            return ruOrPossibleEn(token);
        }

        if (charTypes.containsOnlyFirstOrBoth(CharType.RU_OR_POSSIBLE_SEPARATOR, CharType.RU_OR_POSSIBLE_EN)) {
            return ruOrPossibleSeparator(token);
        }

        return ImmutableList.of(token);
    }

    private List<Token> enOrPossibleRu(Token token) {
        String corrected = token.canonical();
        if (!langChecker.check(Lang.EN, token.canonical())) {
            String switched = Characters.switchLang(token.canonical(), Lang.RU);
            if (langChecker.check(Lang.RU, switched)) {
                corrected = switched;
            }
        }
        return ImmutableList.of(buildToken(token.type(), token.corrected(), token.canonical(), corrected));
    }

    private List<Token> separatorOrPossibleEn(Token token) {
        if (isAbbreviation(token.canonical())) {
            return splitBySpecificSeparators(token.canonical(), isSeparatorOrPossibleRu(), false);
        } else {
            String switched = Characters.switchLang(token.canonical(), Lang.RU);
            if (langChecker.check(Lang.RU, switched)) {
                return ImmutableList.of(buildToken(token.type(), token.corrected(), token.canonical(), switched));
            } else {
                return splitBySpecificSeparators(token.canonical(), isSeparatorOrPossibleRu(), true);
            }
        }
    }

    private List<Token> ruOrPossibleEn(Token token) {
        String corrected = token.canonical();
        if (!langChecker.check(Lang.RU, token.canonical())) {
            String switched = Characters.switchLang(token.canonical(), Lang.EN);
            if (langChecker.check(Lang.EN, switched)) {
                corrected = switched;
            }
        }
        return ImmutableList.of(buildToken(token.type(), token.corrected(), token.canonical(), corrected));
    }

    private List<Token> ruOrPossibleSeparator(Token token) {
        boolean correct = langChecker.check(Lang.RU, token.canonical());
        List<Token> splitByPossibleSeparators = ImmutableList.of();

        if (!correct) {
            splitByPossibleSeparators = splitBySpecificSeparators(Characters.switchLang(token.canonical(), Lang.EN),
                    isSeparatorOrPossibleRu(), true);
            correct = !checkAll(splitByPossibleSeparators, Lang.EN);
        }

        if (correct) {
            return ImmutableList
                    .of(buildToken(token.type(), token.corrected(), token.canonical(), token.canonical()));
        } else {
            return splitByPossibleSeparators;
        }
    }

    private String canonical(String candidate) {
        return candidate.replace(APOSTROPHE_1, APOSTROPHE).toLowerCase();
    }

    private Token buildToken(TokenType tokenType, String original, String canonical, String corrected) {
        return buildToken(tokenType, original, canonical, corrected, CharType.createSet());
    }

    private Token buildToken(TokenType tokenType, String original, String canonical, String corrected,
            CharType.Set charTypes) {
        return buildToken(tokenType, original, canonical, corrected, charTypes, true);
    }

    private Token buildToken(TokenType tokenType, String original, String canonical, String corrected,
            CharType.Set charTypes, boolean useException) {

        return ImmutableToken.builder()
                // XXX assume that we use exceptions for words
                .type(useException && exceptions.containsKey(canonical) ? TokenType.WORD : tokenType)
                .original(original).canonical(canonical)
                .corrected(useException && exceptions.containsKey(canonical) ? exceptions.get(canonical)
                        : canonical.length() < minTokenLength ? canonical : corrected)
                .charTypes(charTypes).build();
    }

    private boolean checkAll(Iterable<Token> tokens, Lang lang) {
        boolean atLeastOneWord = false;
        for (Token token : tokens) {
            atLeastOneWord = token.isWord();
            if (token.isWord() && !langChecker.check(lang, token.corrected())) {
                return false;
            }
        }
        return atLeastOneWord;
    }

    private static final class ExceptionsLineProcessor implements LineProcessor<Map<String, String>> {
        static final Splitter SPLITTER = Splitter.on("|").trimResults();
        ImmutableMap.Builder<String, String> builder = ImmutableMap.builder();

        @Override
        public boolean processLine(String line) throws IOException {
            Iterator<String> iterator = SPLITTER.split(line).iterator();
            builder.put(iterator.next(), iterator.next());
            return true;
        }

        @Override
        public Map<String, String> getResult() {
            return builder.build();
        }
    }
}