com.github.stagirs.lingvo.build.Xml2Plain.java Source code

Introduction

Here is the source code for com.github.stagirs.lingvo.build.Xml2Plain.java
Source

/*
 * Copyright 2017 Dmitriy Malakhov.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.github.stagirs.lingvo.build;

import com.github.stagirs.lingvo.build.model.Annotation;
import com.github.stagirs.lingvo.build.model.Lemma;
import com.github.stagirs.lingvo.model.WordForm;
import com.github.stagirs.lingvo.model.Attr;
import static com.github.stagirs.lingvo.model.Attr.ms_f;
import com.github.stagirs.lingvo.model.Form;
import com.github.stagirs.lingvo.model.Type;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;

/**
 *
 * @author Dmitriy Malakhov
 */
public class Xml2Plain {

    /**
     * Converts the two zipped XML inputs found in the working directory into plain-text files.
     *
     * <p>First the lemmas parsed from {@code dict.opcorpora.xml.zip} are written to
     * {@code dict.opcorpora.plain}, then the annotations parsed from
     * {@code annot.opcorpora.no_ambig.xml.zip} are written to
     * {@code annot.opcorpora.no_ambig.plain}. Both outputs are written as UTF-8, one element
     * per line.
     *
     * @param args ignored
     * @throws Exception if reading an archive or writing an output file fails
     */
    public static void main(String[] args) throws Exception {
        File lemmaTarget = new File("dict.opcorpora.plain");
        List<Lemma> lemmas = getLemmas(new File("dict.opcorpora.xml.zip"));
        FileUtils.writeLines(lemmaTarget, "utf-8", lemmas);

        File annotationTarget = new File("annot.opcorpora.no_ambig.plain");
        List<Annotation> annotations = getAnnotations(new File("annot.opcorpora.no_ambig.xml.zip"));
        FileUtils.writeLines(annotationTarget, "utf-8", annotations);
    }

/**
 * Parses lemmas and their word forms out of a zipped XML file.
 *
 * <p>The lines of the archive's first entry are obtained through {@code extract(file)} and
 * every line is matched on its own, so a {@code <lemma>} element is only recognised when its
 * opening and closing tags sit on the same line. For each recognised lemma the attributes of
 * its {@code <l>} element become the base attributes of every {@code <f>} form; the form's
 * own {@code <g v="..."/>} attributes are appended to a copy of that base list.
 *
 * <p>A lemma is left out of the result when it has no {@code <l>} element, when its base
 * attributes contain {@code Attr.Init}, or when no word form survives the filters applied in
 * the inner loop.
 *
 * @param file zip archive whose first entry contains the XML text
 * @return the lemmas that kept at least one word form, in input order
 * @throws IOException propagated from reading the archive
 */
public static List<Lemma> getLemmas(File file) throws IOException{
    List<String> dictLines = extract(file);
    // Whole <lemma ...>...</lemma> element; group(0) is used below as the search scope.
    Pattern lemmaPattern = Pattern.compile("<lemma id=\"(.*?)\".*?/lemma>", Pattern.MULTILINE|Pattern.DOTALL);
    // <l t="...">body</l>: group 1 is the t attribute, group 2 the element body.
    Pattern normPattern = Pattern.compile("<l.*?t=\"(.*?)\">(.*?)</l>", Pattern.MULTILINE|Pattern.DOTALL);
    // <f t="...">body</f>: group 1 is the t attribute (the word), group 2 the element body.
    Pattern rawPattern = Pattern.compile("<f.*?t=\"(.*?)\">(.*?)</f>", Pattern.MULTILINE|Pattern.DOTALL);
    // Self-closing <g v="..."/>: group 1 is the attribute name handed to fillAttributes.
    Pattern gPattern = Pattern.compile("<g.*?v=\"(.*?)\".*?/>", Pattern.MULTILINE|Pattern.DOTALL);
    List<Lemma> lemmas = new ArrayList<Lemma>();
    for (String dictLine : dictLines) {
        Matcher lemmaMatcher = lemmaPattern.matcher(dictLine);
        if(!lemmaMatcher.find()){
            continue;
        }
        String lemma = lemmaMatcher.group(0);
        Matcher normMatcher = normPattern.matcher(lemma);
        if(!normMatcher.find()){
            continue;
        }
        // Base attributes come from the <g> elements inside the <l> element.
        List<Attr> mainAttr = fillAttributes(new ArrayList<Attr>(), gPattern.matcher(normMatcher.group(2)));
        if(mainAttr.contains(Attr.Init)){
            continue;
        }
        Matcher rawMatcher = rawPattern.matcher(lemma);
        List<WordForm> items = new ArrayList<WordForm>();
        while(rawMatcher.find()){
            // NOTE(review): the literals on the next two statements look damaged - the char
            // literals are empty (which is not valid Java) and the character class appears to
            // have lost characters. Presumably non-ASCII characters were dropped by an encoding
            // conversion; restore them from the upstream source before building.
            String word = rawMatcher.group(1).replace('', '').replace("", "");
            if(!word.matches("[-?\\-]+")){
                continue;
            }
            // Each form starts from a fresh copy of the base attributes, so forms do not affect each other.
            WordForm wf = new WordForm(word, new Form(fillAttributes(new ArrayList<Attr>(mainAttr), gPattern.matcher(rawMatcher.group(2)))));
            // Single-character words are kept only when the form reports isStop().
            if(wf.getWord().length() == 1 && !wf.getForm().isStop()){
                continue;
            }
            if(items.isEmpty()){
                // If the first accepted form carries Erro or Dist, stop reading forms altogether;
                // items stays empty, so the lemma is skipped by the check after the loop.
                if(wf.getForm().getAttrs().contains(Attr.Erro) || wf.getForm().getAttrs().contains(Attr.Dist)){
                    break;
                }
            }
            items.add(wf);
        }
        if(items.isEmpty()){
            continue;
        }
        lemmas.add(new Lemma(items));
    }
    return lemmas;
}

/**
 * Parses annotated sentences out of a zipped XML file.
 *
 * <p>The lines of the archive's first entry are obtained through {@code extract(file)}. Lines
 * found between a line that starts (after trimming) with {@code <sentence } and the next line
 * that starts with {@code </sentence>} are concatenated without any separator; the two tag
 * lines themselves are not included. From that text the {@code <source>} content and the
 * {@code <tokens>} content are taken, and every {@code <token>} whose {@code <g v="...">}
 * attributes yield a non-empty list becomes a {@code WordForm}.
 *
 * <p>An {@code Annotation} is added for every sentence in which both {@code <source>} and
 * {@code <tokens>} were found, even when none of its tokens produced a word form.
 *
 * @param file zip archive whose first entry contains the XML text
 * @return the annotations in input order
 * @throws IOException propagated from reading the archive
 */
public static List<Annotation> getAnnotations(File file) throws IOException{
    List<String> annotLines = extract(file);
    // <token ... text="...">body</token>: group 1 is the text attribute, group 2 the element body.
    Pattern tokenPattern = Pattern.compile("<token.*?text=\"(.*?)\">(.*?)</token>", Pattern.MULTILINE|Pattern.DOTALL);
    // <g ... v="...": group 1 is the attribute name handed to fillAttributes.
    Pattern gPattern = Pattern.compile("<g.*?v=\"(.*?)\"", Pattern.MULTILINE|Pattern.DOTALL);
    // group 1 is the <source> content, group 2 the <tokens> content.
    Pattern mainPattern = Pattern.compile("<source>(.*?)</source>.*?<tokens>(.*?)</tokens>", Pattern.MULTILINE|Pattern.DOTALL);
    List<Annotation> result = new ArrayList<Annotation>();
    // Non-null only while inside a sentence; collects the sentence's lines.
    StringBuilder sb = null;
    for (String annotLine : annotLines) {
        if(annotLine.trim().startsWith("<sentence ")){
            sb = new StringBuilder();
            continue;
        }
        if(annotLine.trim().startsWith("</sentence>")){
            // NOTE(review): sb is null here when a closing tag line is reached without an open
            // sentence (e.g. before any opening tag, or two closing tags in a row), which would
            // raise a NullPointerException - confirm the input never does that.
            Matcher mainMatcher = mainPattern.matcher(sb.toString());
            sb = null;
            if(!mainMatcher.find()){
                continue;
            }
            // NOTE(review): the first replace() argument below is an empty string, and the
            // replace() calls on the token text further down have empty char/string literals
            // (an empty char literal is not valid Java). Presumably non-ASCII characters were
            // dropped by an encoding conversion; restore them from the upstream source.
            String text = mainMatcher.group(1).replace("", "").replace("\t", "");
            List<WordForm> items = new ArrayList<WordForm>();

            Matcher tokenMatcher = tokenPattern.matcher(mainMatcher.group(2));
            while(tokenMatcher.find()){
                String word = tokenMatcher.group(1).replace('', '').replace("", "").toLowerCase();
                List<Attr> attrs = fillAttributes(new ArrayList<Attr>(), gPattern.matcher(tokenMatcher.group(2)));
                // Tokens for which fillAttributes kept no attribute are left out of the annotation.
                if(attrs.isEmpty()){
                    continue;
                }
                items.add(new WordForm(word, new Form(attrs)));
            }
            result.add(new Annotation(text, items));
            continue;
        }
        // Lines outside a sentence are ignored; lines inside are appended as-is, with no separator.
        if(sb != null){
            sb.append(annotLine);
        }
    }
    return result;
}

    /**
     * Appends the attributes produced by {@code attrMatcher} to {@code list}, normalising them
     * on the way, and returns the same list instance.
     *
     * <p>Group 1 of every match is turned into an {@code Attr} name by replacing {@code -} with
     * {@code _} and prefixing names that start with a digit with {@code N}. The following rules
     * are then applied, in this order:
     * <ul>
     *   <li>{@code ms_f} (any letter case) removes {@code femn}, {@code masc}, {@code neut} and
     *       {@code GNdr} from the list and is added itself;</li>
     *   <li>{@code inan} is dropped when {@code anim} is already present, and {@code anim}
     *       removes an earlier {@code inan};</li>
     *   <li>gender attributes are dropped once {@code ms_f} is present;</li>
     *   <li>{@code loc1}/{@code loc2} become {@code loct}, {@code gen1}/{@code gen2} become
     *       {@code gent}, {@code acc2} becomes {@code accs};</li>
     *   <li>genitive attributes are dropped when the first element of the list is
     *       {@code ADJS};</li>
     *   <li>attributes whose type is {@code Type.Other} are dropped.</li>
     * </ul>
     * After all matches are consumed, lists containing {@code Name}, {@code Surn} or
     * {@code Patr} get {@code inan} removed and {@code anim} ensured, and a list containing
     * {@code VERB}, {@code Impe} and {@code neut} loses {@code neut}.
     *
     * @param list        list to append to; modified in place and may already hold attributes
     * @param attrMatcher matcher whose group 1 yields one attribute name per match
     * @return {@code list}
     * @throws IllegalArgumentException presumably raised by {@code Attr.valueOf} for a name
     *         that is not a known attribute (assumes {@code Attr} is an enum - confirm)
     */
    private static List<Attr> fillAttributes(List<Attr> list, Matcher attrMatcher) {
        while (attrMatcher.find()) {
            String attr = attrMatcher.group(1).replace("-", "_");
            // Java identifiers cannot start with a digit, so such names carry an "N" prefix.
            if (Character.isDigit(attr.charAt(0))) {
                attr = "N" + attr;
            }
            if (attr.toLowerCase().equals("ms_f")) {
                // ms_f replaces any gender recorded so far; remove(Object) does nothing when
                // the element is absent, so no contains() check is needed first.
                list.remove(Attr.femn);
                list.remove(Attr.masc);
                list.remove(Attr.neut);
                list.remove(Attr.GNdr);
                list.add(Attr.ms_f);
                continue;
            }
            Attr attrVal = Attr.valueOf(attr);
            // anim wins over inan regardless of the order in which the two arrive.
            if (attrVal == Attr.inan && list.contains(Attr.anim)) {
                continue;
            }
            if (attrVal == Attr.anim) {
                list.remove(Attr.inan);
            }
            boolean isGender = attrVal == Attr.femn || attrVal == Attr.masc
                    || attrVal == Attr.neut || attrVal == Attr.GNdr;
            if (isGender && list.contains(Attr.ms_f)) {
                continue;
            }
            if (attrVal == Attr.loc1 || attrVal == Attr.loc2) {
                attrVal = Attr.loct;
            }
            if (attrVal == Attr.gent || attrVal == Attr.gen1 || attrVal == Attr.gen2) {
                // The list may still be empty when a genitive attribute is the first one
                // parsed; guard the get(0) so that case no longer throws.
                if (!list.isEmpty() && list.get(0) == Attr.ADJS) {
                    continue;
                }
                attrVal = Attr.gent;
            }
            if (attrVal == Attr.acc2) {
                attrVal = Attr.accs;
            }
            if (attrVal.getType() == Type.Other) {
                continue;
            }
            list.add(attrVal);
        }
        if (list.contains(Attr.Name) || list.contains(Attr.Surn) || list.contains(Attr.Patr)) {
            list.remove(Attr.inan);
            if (!list.contains(Attr.anim)) {
                list.add(Attr.anim);
            }
        }
        if (list.contains(Attr.VERB) && list.contains(Attr.Impe) && list.contains(Attr.neut)) {
            list.remove(Attr.neut);
        }
        return list;
    }

    /**
     * Reads the first entry of a zip archive as UTF-8 text and returns its lines.
     *
     * <p>Only the first entry is read; any further entries are ignored. The stream is closed
     * before the method returns or throws.
     *
     * @param file zip archive to read
     * @return the lines of the archive's first entry
     * @throws IOException      if the archive cannot be opened or read
     * @throws RuntimeException if the archive contains no entry
     */
    private static List<String> extract(File file) throws IOException {
        final ZipInputStream zipStream = new ZipInputStream(new FileInputStream(file));
        try {
            // Position the stream on the first entry; a null result means there is none.
            if (zipStream.getNextEntry() != null) {
                return IOUtils.readLines(zipStream, Charset.forName("utf-8"));
            }
            throw new RuntimeException("can't unzip file");
        } finally {
            zipStream.close();
        }
    }
}