com.github.stagirs.lingvo.morph.MorphPredictor.java Source code

Java tutorial

Introduction

Here is the source code for com.github.stagirs.lingvo.morph.MorphPredictor.java

Source

/*
 * Copyright 2017 Dmitriy Malakhov.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.github.stagirs.lingvo.morph;

import com.github.stagirs.lingvo.morph.model.Morph;
import static java.lang.Byte.parseByte;
import java.util.HashMap;
import java.util.Map;
import org.apache.commons.codec.language.DaitchMokotoffSoundex;
import org.apache.commons.lang.StringUtils;

/**
 * Predicts a {@code Morph} for a word by choosing the closest candidate form.
 * <p>
 * Candidates are selected by feeding the word's trailing characters, last
 * character first, into {@code MorphStateMachine}. They are then ranked by
 * Levenshtein distance to the word; ties are broken by the distance between
 * the Daitch-Mokotoff Soundex codes of the transliterated strings.
 *
 * @author Dmitriy Malakhov
 */
public class MorphPredictor {
    /** Number of trailing characters of the word used to select candidates. */
    private static final int SUFFIX_LENGTH = 2;

    /**
     * Cyrillic-to-Latin transliteration table, keyed by a single Cyrillic
     * character. Keys are written as Unicode escapes so the table does not
     * depend on the encoding the source file is stored or compiled with.
     */
    private static final Map<Character, String> letters = new HashMap<Character, String>();
    static {
        // Latin equivalents of the 32 letters U+0410..U+042F (capital A .. YA)
        // in code-point order; the small letters U+0430..U+044F follow the
        // same order. IO (U+0401 / U+0451) lies outside both ranges.
        String[] upper = {
            "A", "B", "V", "G", "D", "E", "ZH", "Z",
            "I", "I", "K", "L", "M", "N", "O", "P",
            "R", "S", "T", "U", "F", "H", "C", "CH",
            "SH", "SH", "'", "Y", "'", "E", "U", "YA"
        };
        String[] lower = {
            "a", "b", "v", "g", "d", "e", "zh", "z",
            "i", "i", "k", "l", "m", "n", "o", "p",
            "r", "s", "t", "u", "f", "h", "c", "ch",
            "sh", "sh", "'", "y", "'", "e", "u", "ya"
        };
        for (int i = 0; i < upper.length; i++) {
            letters.put((char) ('\u0410' + i), upper[i]);
            letters.put((char) ('\u0430' + i), lower[i]);
        }
        letters.put('\u0401', "E"); // capital IO
        letters.put('\u0451', "e"); // small IO
    }

    /**
     * Returns the candidate morph closest to {@code word}.
     * <p>
     * The chosen {@code Morph} is modified in place: its word is set to
     * {@code word} before it is returned.
     *
     * @param word the word to predict a morph for; must not be {@code null}
     * @return the closest candidate, or {@code null} if {@code word} is empty
     *         or the iterator yields no candidates
     */
    public static Morph get(String word) {
        if (word.length() == 0) {
            return null;
        }
        MorphStateMachine.State state = MorphStateMachine.begin();
        // Walk at most SUFFIX_LENGTH characters from the end, never past index 0
        // (words shorter than the suffix would otherwise hit charAt(-1)).
        int stop = Math.max(0, word.length() - SUFFIX_LENGTH);
        for (int i = word.length() - 1; i >= stop; i--) {
            MorphStateMachine.State next = MorphStateMachine.getState(state, word.charAt(i));
            if (next == null) {
                break;
            }
            state = next;
        }

        DaitchMokotoffSoundex soundex = new DaitchMokotoffSoundex();
        byte[] encoded = getBytes(soundex, toTranslit(word));

        Morph minMorph = null;
        int minDistanceLev = Integer.MAX_VALUE;
        int minDistanceSoundex = Integer.MAX_VALUE;
        MorphIterator iterator = new MorphIterator(state);
        while (iterator.hasNext()) {
            Morph morph = iterator.next();
            int distanceLev = StringUtils.getLevenshteinDistance(word, morph.getRaw());
            if (minDistanceLev < distanceLev) {
                continue;
            }
            // The Soundex distance is only computed for candidates that are at
            // least as good as the current best on Levenshtein distance.
            int distanceSoundex = getDistance(soundex, morph.getRaw(), encoded);
            if (minDistanceLev == distanceLev && minDistanceSoundex < distanceSoundex) {
                continue;
            }
            // A candidate equal on both distances replaces the current best.
            minMorph = morph;
            minDistanceLev = distanceLev;
            minDistanceSoundex = distanceSoundex;
        }
        if (minMorph == null) {
            return null;
        }
        minMorph.setWord(word);
        return minMorph;
    }

    /**
     * Replaces every character found in the transliteration table with its
     * Latin equivalent; all other characters are copied unchanged.
     *
     * @param text the text to transliterate
     * @return the transliterated text
     */
    private static String toTranslit(String text) {
        StringBuilder sb = new StringBuilder(text.length());
        for (int i = 0; i < text.length(); i++) {
            char c = text.charAt(i);
            String latin = letters.get(c);
            if (latin != null) {
                sb.append(latin);
            } else {
                sb.append(c);
            }
        }
        return sb.toString();
    }

    /**
     * Computes the distance between the Soundex code of {@code word} and an
     * already encoded code: the sum of absolute per-position differences over
     * the common prefix plus the difference in length.
     *
     * @param soundex the encoder to use
     * @param word    the word to transliterate and encode
     * @param encoded the code to compare against, as returned by {@code getBytes}
     * @return the distance between the two codes
     */
    private static int getDistance(DaitchMokotoffSoundex soundex, String word, byte[] encoded) {
        byte[] bytes = getBytes(soundex, toTranslit(word));
        int common = Math.min(bytes.length, encoded.length);
        int distance = Math.abs(bytes.length - encoded.length);
        for (int i = 0; i < common; i++) {
            distance += Math.abs(encoded[i] - bytes[i]);
        }
        return distance;
    }

    /**
     * Encodes {@code translitWord} with {@code soundex} and returns the
     * characters of the resulting code as numeric values, one per character.
     *
     * @param soundex      the encoder to use
     * @param translitWord the transliterated word to encode
     * @return the numeric value of each character of the code
     * @throws NumberFormatException if the code contains a character that is
     *         not a decimal digit
     */
    private static byte[] getBytes(DaitchMokotoffSoundex soundex, String translitWord) {
        String code = soundex.encode(translitWord);
        byte[] bytes = new byte[code.length()];
        // Walk the code by index rather than split(""), which yields a leading
        // empty string on Java 7 and earlier.
        for (int i = 0; i < bytes.length; i++) {
            bytes[i] = parseByte(code.substring(i, i + 1));
        }
        return bytes;
    }
}