de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.impl.CybozuLanguageIdentifier.java Source code

Java tutorial

Introduction

Here is the source code for de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.impl.CybozuLanguageIdentifier.java

Source

/*
 * Copyright 2016
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.impl;

import com.cybozu.labs.langdetect.Detector;
import com.cybozu.labs.langdetect.DetectorFactory;
import com.cybozu.labs.langdetect.LangDetectException;
import com.cybozu.labs.langdetect.Language;
import de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.LanguageIdentifier;
import org.apache.commons.io.IOUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.List;

/**
 * {@link LanguageIdentifier} implementation backed by the Cybozu Labs {@code langdetect}
 * library.
 * <p>
 * The language profiles named in {@link #PROFILES} are read from the classpath (resources
 * {@code profiles/<name>}) and registered with the static {@link DetectorFactory} the first
 * time an instance of this class is created; later instances reuse the already loaded profiles.
 *
 * @author Ivan Habernal
 */
public class CybozuLanguageIdentifier implements LanguageIdentifier {
    /** The best candidate must have a probability strictly above this value to be reported. */
    private static final double PROBABILITY_THRESHOLD = 0.80;

    /** Names of the profile resources located under {@code profiles/} on the classpath. */
    static final String[] PROFILES = { "af", "bn", "de", "es", "fi", "he", "hu", "ja", "lt", "ml", "nl", "pl", "ru",
            "so", "sw", "th", "uk", "zh-cn", "ar", "cs", "el", "et", "fr", "hi", "id", "kn", "lv", "mr", "no", "pt",
            "sk", "sq", "ta", "tl", "ur", "zh-tw", "bg", "da", "en", "fa", "gu", "hr", "it", "ko", "mk", "ne", "pa",
            "ro", "sl", "sv", "te", "tr", "vi" };

    /** Whether the profiles were already handed to {@link DetectorFactory}; guarded by the class lock. */
    private static boolean profilesLoaded = false;

    /**
     * Creates the identifier and makes sure the language profiles are loaded.
     *
     * @throws RuntimeException if a profile resource is missing or unreadable, or if the
     *                          library rejects the profiles
     */
    public CybozuLanguageIdentifier() {
        loadProfilesOnce();
    }

    /**
     * Loads all profiles into {@link DetectorFactory} exactly once per class loader.
     * <p>
     * {@code DetectorFactory.loadProfile} is a static call that registers profiles process-wide
     * and fails with a duplicate-profile error when the same language is loaded again, so
     * without this guard every instance after the first one would fail to construct.
     */
    private static synchronized void loadProfilesOnce() {
        if (profilesLoaded) {
            return;
        }

        List<String> jsonProfiles = new ArrayList<String>(PROFILES.length);
        for (String profile : PROFILES) {
            jsonProfiles.add(readProfile(profile));
        }

        try {
            DetectorFactory.loadProfile(jsonProfiles);
        } catch (LangDetectException e) {
            throw new RuntimeException(e);
        }

        // only mark as loaded after the library accepted all profiles
        profilesLoaded = true;
    }

    /**
     * Reads a single profile resource into a string (decoded as UTF-8).
     *
     * @param profile profile name, one of {@link #PROFILES}
     * @return content of the resource {@code profiles/<profile>}
     * @throws RuntimeException if the resource is not on the classpath or cannot be read
     */
    private static String readProfile(String profile) {
        String resourceName = "profiles/" + profile;
        InputStream inputStream = CybozuLanguageIdentifier.class.getClassLoader()
                .getResourceAsStream(resourceName);

        if (inputStream == null) {
            throw new RuntimeException("Cannot locate resource " + resourceName + " on the classpath.");
        }

        StringWriter sw = new StringWriter();
        try {
            try {
                IOUtils.copy(inputStream, sw, "utf-8");
            } finally {
                // always release the classpath stream, also when copying fails
                inputStream.close();
            }
        } catch (IOException e) {
            throw new RuntimeException("Cannot read resource " + resourceName, e);
        }

        return sw.toString();
    }

    /**
     * Identifies the language of the visible text of an HTML document.
     *
     * @param html HTML source; its markup is stripped with Jsoup before detection
     * @return the language of the most probable candidate if its probability is above
     *         {@link #PROBABILITY_THRESHOLD}; {@code UNKNOWN_LANGUAGE} if the document has no
     *         text, no candidate is confident enough, or the detector fails
     * @throws IOException declared by the interface; not thrown by this implementation itself
     */
    @Override
    public String identifyLanguage(String html) throws IOException {
        // keep only the plain text of the document
        String text = Jsoup.parse(html).text();

        // we might have removed everything -> no lang
        if (text.isEmpty()) {
            return UNKNOWN_LANGUAGE;
        }

        try {
            Detector detector = DetectorFactory.create();
            detector.append(text);

            // candidates come sorted by descending probability; the list can be empty when
            // no language scores high enough, so get(0) must not be called unconditionally
            List<Language> candidates = detector.getProbabilities();
            if (candidates.isEmpty()) {
                return UNKNOWN_LANGUAGE;
            }

            Language best = candidates.get(0);
            return best.prob > PROBABILITY_THRESHOLD ? best.lang : UNKNOWN_LANGUAGE;
        } catch (LangDetectException e) {
            // best effort: a detection failure is reported as an unknown language
            return UNKNOWN_LANGUAGE;
        }
    }

}