Java tutorial
/* * Copyright 2017 Dmitriy Malakhov. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.github.stagirs.lingvo.build; import com.github.stagirs.lingvo.build.model.Lemma; import com.github.stagirs.lingvo.model.WordForm; import com.github.stagirs.lingvo.morph.model.RuleMapping; import com.github.stagirs.lingvo.morph.model.Rule; import com.github.stagirs.lingvo.morph.model.RuleItem; import gnu.trove.map.hash.TObjectIntHashMap; import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import org.apache.commons.io.FileUtils; /** * * @author Dmitriy Malakhov */ public class MorphStateMachineBuilder { private static class Struct { TObjectIntHashMap prefcommons = new TObjectIntHashMap(); List<Rule> rules = new ArrayList<Rule>(); Map<String, Integer> rule2id = new HashMap<String, Integer>(); } public static void main(String[] args) throws IOException { FileUtils.writeLines(new File("src/main/resources/MorphStateMachine"), "utf-8", suf2mapping()); } private static Collection<RuleMapping> suf2mapping() throws IOException { Set<String> suffixes = getSuffixes(); Map<String, Struct> sufs = new HashMap<String, Struct>(); for (Map.Entry<String, List<WordForm[]>> raw2WordForms : getRaw2WordForms().entrySet()) { String raw = raw2WordForms.getKey(); int[] index = getCommon(suffixes, raw, raw2WordForms.getValue()); String common = index[0] < index[1] ? raw.substring(index[0], index[1]) : ""; String rawPref = raw.substring(0, Math.min(index[0], index[1])); String rawSuf = raw.substring(index[1]); Map<String, List<RuleItem>> map = new HashMap(); for (WordForm[] wordForm : raw2WordForms.getValue()) { String normSuf = wordForm[1].getWord().substring(common.length()); if (!map.containsKey(normSuf)) { map.put(normSuf, new ArrayList<RuleItem>()); } map.get(normSuf).add(new RuleItem(normSuf, wordForm[1].getForm(), wordForm[0].getForm())); } RuleItem[][] items = new RuleItem[map.size()][]; int i = 0; for (List<RuleItem> item : map.values()) { items[i++] = item.toArray(new RuleItem[item.size()]); } Rule rule = new Rule(rawPref, rawSuf, items); String ruleId = Rule.serialize(rule); if (!sufs.containsKey(rawSuf)) { sufs.put(rawSuf, new Struct()); } Struct struct = sufs.get(rawSuf); if (!struct.rule2id.containsKey(ruleId)) { struct.rule2id.put(ruleId, struct.rule2id.size()); struct.rules.add(rule); } struct.prefcommons.put(rawPref + common, struct.rule2id.get(ruleId)); } List<RuleMapping> list = new ArrayList<RuleMapping>(); for (Map.Entry<String, Struct> entrySet : sufs.entrySet()) { list.add(new RuleMapping(entrySet.getKey(), entrySet.getValue().prefcommons, entrySet.getValue().rules.toArray(new Rule[entrySet.getValue().rules.size()]))); } return list; } private static int[] getCommon(Set<String> suffixes, String raw, List<WordForm[]> wordForms) { int[] index = new int[] { 0, raw.length() }; for (WordForm[] record : wordForms) { int[] com = getCommon(record[1].getWord(), raw); index[0] = Math.max(com[0], index[0]); index[1] = Math.min(com[1], index[1]); } for (int i = 0; i < index[1]; i++) { if (suffixes.contains(raw.substring(i))) { index[1] = i; break; } } return index; } private static Map<String, List<WordForm[]>> getRaw2WordForms() throws IOException { Map<String, List<WordForm[]>> result = new HashMap<String, List<WordForm[]>>(); for (String line : FileUtils.readLines(new File("dict.opcorpora.plain"), "utf-8")) { if (line.isEmpty()) { continue; } Lemma lemma = Lemma.parse(line); WordForm norm = lemma.getNorm(); for (WordForm raw : lemma.getItems()) { add(result, raw, norm); } } return result; } private static void add(Map<String, List<WordForm[]>> result, WordForm raw, WordForm norm) { if (!result.containsKey(raw.getWord())) { result.put(raw.getWord(), new ArrayList<WordForm[]>()); } result.get(raw.getWord()).add(new WordForm[] { raw, norm }); } private static Set<String> getSuffixes() throws IOException { Set<String> suffixes = new HashSet(); for (String line : FileUtils.readLines(new File("dict.opcorpora.plain"), "utf-8")) { if (line.isEmpty()) { continue; } Lemma lemma = Lemma.parse(line); WordForm norm = lemma.getNorm(); for (WordForm raw : lemma.getItems()) { int[] com = getCommon(norm.getWord(), raw.getWord()); for (int j = com[1]; j < raw.getWord().length(); j++) { suffixes.add(raw.getWord().substring(j)); } } } return suffixes; } private static int[] getCommon(String norm, String form) { for (int i = 0; i < norm.length() - 2; i++) { String common = norm.substring(0, norm.length() - i); if (form.contains(common)) { int index = form.indexOf(common); return new int[] { index, index + common.length() }; } } return new int[] { 0, 0 }; } }