jp.go.nict.langrid.wrapper.ws_1_2.translation.AbstractTranslationService.java Source code

Java tutorial

Introduction

Here is the source code for jp.go.nict.langrid.wrapper.ws_1_2.translation.AbstractTranslationService.java

Source

/*
 * $Id: AbstractTranslationService.java 507 2012-05-24 04:34:29Z t-nakaguchi $
 *
 * This is a program to wrap language resources as Web services.
 * Copyright (C) 2005-2008 NICT Language Grid Project.
 *
 * This program is free software: you can redistribute it and/or modify it
 * under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 2.1 of the License, or (at
 * your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser
 * General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
package jp.go.nict.langrid.wrapper.ws_1_2.translation;

import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.Scanner;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.MatchResult;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import jp.go.nict.langrid.commons.util.ArrayUtil;
import jp.go.nict.langrid.commons.ws.ServiceContext;
import jp.go.nict.langrid.language.Language;
import jp.go.nict.langrid.language.LanguagePair;
import jp.go.nict.langrid.service_1_2.AccessLimitExceededException;
import jp.go.nict.langrid.service_1_2.InvalidParameterException;
import jp.go.nict.langrid.service_1_2.LanguagePairNotUniquelyDecidedException;
import jp.go.nict.langrid.service_1_2.LanguagePath;
import jp.go.nict.langrid.service_1_2.LanguagePathNotUniquelyDecidedException;
import jp.go.nict.langrid.service_1_2.NoAccessPermissionException;
import jp.go.nict.langrid.service_1_2.NoValidEndpointsException;
import jp.go.nict.langrid.service_1_2.ProcessFailedException;
import jp.go.nict.langrid.service_1_2.ServerBusyException;
import jp.go.nict.langrid.service_1_2.ServiceNotActiveException;
import jp.go.nict.langrid.service_1_2.ServiceNotFoundException;
import jp.go.nict.langrid.service_1_2.UnsupportedLanguagePairException;
import jp.go.nict.langrid.service_1_2.UnsupportedLanguagePathException;
import jp.go.nict.langrid.service_1_2.backtranslation.BackTranslationResult;
import jp.go.nict.langrid.service_1_2.backtranslation.BackTranslationService;
import jp.go.nict.langrid.service_1_2.multihoptranslation.MultihopTranslationResult;
import jp.go.nict.langrid.service_1_2.multihoptranslation.MultihopTranslationService;
import jp.go.nict.langrid.service_1_2.transformer.LanguagePath_LanguageToWITransformer;
import jp.go.nict.langrid.service_1_2.translation.TranslationService;
import jp.go.nict.langrid.service_1_2.util.validator.LanguagePairValidator;
import jp.go.nict.langrid.service_1_2.util.validator.LanguagePathValidator;
import jp.go.nict.langrid.service_1_2.util.validator.LanguageValidator;
import jp.go.nict.langrid.service_1_2.util.validator.StringValidator;
import jp.go.nict.langrid.wrapper.ws_1_2.AbstractLanguagePairService;
import jp.go.nict.langrid.wrapper.ws_1_2.util.TextParser;

import org.apache.commons.lang.StringUtils;

/**
 * 
 * Base class for the translation service.
 * 
 * @author $Author: t-nakaguchi $
 * @version $Revision: 507 $
 */
public abstract class AbstractTranslationService extends AbstractLanguagePairService
        implements TranslationService, BackTranslationService, MultihopTranslationService {
    /**
     * 
     * Constructor that doesn't take parameter(s).
     * 
     */
    public AbstractTranslationService() {
        init();
    }

    /**
     * 
     * Constructor that takes the service context as a parameter(s).
     * @param serviceContext Service context
     * 
     */
    public AbstractTranslationService(ServiceContext serviceContext) {
        super(serviceContext);
        init();
    }

    /**
     * 
     * Constructor that takes the service context and supported language pair(s) as a parameter(s).
     * @param serviceContext Service context
     * @param supportedPairs Supported language pairs
     * 
     */
    public AbstractTranslationService(ServiceContext serviceContext, Collection<LanguagePair> supportedPairs) {
        super(serviceContext);
        init();
        setSupportedLanguagePairs(supportedPairs);
    }

    public void setMaxSourceLength(int maxSourceLength) {
        this.maxSourceLength = maxSourceLength;
    }

    public void setSentenceDivision(DivisionType sentenceDivision) {
        this.sentenceDivision = sentenceDivision;
    }

    private void init() {
        maxHop = getInitParameterInt("langrid.multihopTranslation.maxHop", 20);
        maxSourceLength = getInitParameterInt("langrid.maxSourceLength", 5000);
        try {
            String parameter = getInitParameter("langrid.translation.sentenceDivision");
            if (parameter == null || parameter.length() == 0) {
                sentenceDivision = DivisionType.NONE;
            } else {
                sentenceDivision = DivisionType.valueOf(parameter);
            }
        } catch (IllegalArgumentException e) {
            sentenceDivision = DivisionType.NONE;
        }
    }

    public String translate(String sourceLang, String targetLang, String source)
            throws AccessLimitExceededException, InvalidParameterException, LanguagePairNotUniquelyDecidedException,
            NoAccessPermissionException, NoValidEndpointsException, ProcessFailedException, ServerBusyException,
            ServiceNotActiveException, ServiceNotFoundException, UnsupportedLanguagePairException {
        checkStartupException();
        return invokeDoTranslation(sourceLang, targetLang, source);
    }

    public final BackTranslationResult backTranslate(String sourceLang, String interMediateLang, String source)
            throws AccessLimitExceededException, InvalidParameterException, LanguagePairNotUniquelyDecidedException,
            NoAccessPermissionException, NoValidEndpointsException, ProcessFailedException, ServerBusyException,
            ServiceNotActiveException, UnsupportedLanguagePairException {
        checkStartupException();
        String intermediate = invokeDoTranslation(sourceLang, interMediateLang, source);
        String r = invokeDoTranslation(interMediateLang, sourceLang, intermediate);
        return new BackTranslationResult(intermediate, r);
    }

    /**
     * 
     * <p>When the LanguagePathNotUnizuelyDecidedException occurs, only one language path data is returned as a candidate.
     * (It is not a formal interface, so the constructor of the language path data is omitted).</p>
     * 
     */
    public final MultihopTranslationResult multihopTranslate(String sourceLang, String[] intermediateLangs,
            String targetLang, String source)
            throws AccessLimitExceededException, InvalidParameterException, LanguagePathNotUniquelyDecidedException,
            NoAccessPermissionException, NoValidEndpointsException, ProcessFailedException, ServerBusyException,
            ServiceNotActiveException, UnsupportedLanguagePathException {
        checkStartupException();
        if (intermediateLangs.length >= maxHop) {
            throw new InvalidParameterException("intermediateLangs",
                    "too many hop: " + intermediateLangs.length + "(max: " + maxHop + ")");
        }
        jp.go.nict.langrid.language.LanguagePath path = new LanguagePathValidator("sourceLang", sourceLang,
                "intermediateLangs", intermediateLangs, "targetLang", targetLang).notNull().trim().notEmpty()
                        .getLanguagePath();
        Language[] langs = path.getPath();
        String[] intermediates = new String[langs.length - 2];
        int n = intermediates.length;
        String src = source;
        acquireSemaphore();
        try {
            for (int i = 0; i < n; i++) {
                Language sl = langs[i];
                Language tl = langs[i + 1];
                src = invokeDoTranslation(sl.getCode(), tl.getCode(), src);
                intermediates[i] = src;
            }
            String target = invokeDoTranslation(langs[n].getCode(), langs[n + 1].getCode(), src);
            return new MultihopTranslationResult(intermediates, target);
        } catch (LanguagePairNotUniquelyDecidedException e) {
            throw new LanguagePathNotUniquelyDecidedException(
                    new String[] { "sourceLang", "intermediateLangs", "targetLang" },
                    ArrayUtil.collect(new jp.go.nict.langrid.language.LanguagePath[] { path }, LanguagePath.class,
                            new LanguagePath_LanguageToWITransformer()));
        } finally {
            releaseSemaphore();
        }
    }

    /**
     * 
     * 
     */
    public final String multistatementTranslate(String sourceLang, String targetLang, String source,
            String delimiterRegx)
            throws AccessLimitExceededException, InvalidParameterException, LanguagePairNotUniquelyDecidedException,
            NoAccessPermissionException, NoValidEndpointsException, ProcessFailedException, ServerBusyException,
            ServiceNotActiveException, ServiceNotFoundException, UnsupportedLanguagePairException {
        checkStartupException();
        if (StringUtils.isBlank(delimiterRegx)) {
            throw new InvalidParameterException("delimiterRegx", "is Blank.");
        }
        StringBuilder sb = new StringBuilder();
        Scanner s = new Scanner(source).useDelimiter(delimiterRegx);
        int i = 0;
        while (s.hasNext()) {
            String text = s.next();
            MatchResult m = s.match();
            if (i != m.start()) {
                String tag = source.substring(i, m.start());
                sb.append(tag);
            }
            i = m.end();
            sb.append(invokeDoTranslation(sourceLang, targetLang, text));
        }
        if (source.length() != i) {
            String tag = source.substring(i);
            sb.append(tag);
        }

        return sb.toString();
    }

    protected String[] doMultistatementTranslation(Language sourceLang, Language targetLang, String[] sources)
            throws InvalidParameterException, ProcessFailedException {
        String[] ret = new String[sources.length];
        int n = ret.length;
        for (int i = 0; i < n; i++) {
            ret[i] = doTranslation(sourceLang, targetLang, sources[i]);
        }
        return ret;
    }

    /**
     * 
     * Template method supporting transtlate.
     * @param sourceLang Source language
     * @param targetLang Target language
     * @param source String to be translated
     * @return The translated string
     * @throws InvalidParameterException An invalid parameter was passed
     * @throws ProcessFailedException Process failed
     * 
     */
    protected abstract String doTranslation(Language sourceLang, Language targetLang, String source)
            throws InvalidParameterException, ProcessFailedException;

    /**
     * 
     * @param languagePair
     * @return
     * @throws InvalidParameterException An invalid parameter was passed
     * @throws LanguagePairNotUniquelyDecidedException Multiple candidate language pairs exist
     * @throws UnsupportedLanguagePairException An unsupported language pair was specified
     * 
     */
    protected LanguagePair validateLanguagePair(LanguagePairValidator languagePair)
            throws InvalidParameterException, LanguagePairNotUniquelyDecidedException,
            UnsupportedLanguagePairException {
        return languagePair.notNull().trim().notEmpty().getUniqueLanguagePair(getSupportedLanguagePairCollection());
    }

    public enum DivisionType {
        NONE, WORD, FULL, FULL_WITH_PUNCTUATION,;
    }

    private String invokeDoTranslation(String sourceLang, String targetLang, String source)
            throws InvalidParameterException, LanguagePairNotUniquelyDecidedException, ProcessFailedException,
            UnsupportedLanguagePairException {
        LanguagePair pair = validateLanguagePair(new LanguagePairValidator(
                new LanguageValidator("sourceLang", sourceLang), new LanguageValidator("targetLang", targetLang)));
        String src = new StringValidator("source", source).notNull().trim().notEmpty().getValue();
        src = toUpperCaseCharacterBehindDelimiter(src);

        processStart();
        try {
            acquireSemaphore();
            try {
                Language s = pair.getSource();
                Language t = pair.getTarget();
                String ret = null;
                switch (sentenceDivision) {
                default:
                case NONE:
                    ret = translateByNone(s, t, src);
                    break;
                case WORD:
                    ret = translateByWord(s, t, src);
                    break;
                case FULL:
                    ret = translateByFull(s, t, src);
                    break;
                case FULL_WITH_PUNCTUATION:
                    ret = translateByFullWithPunctuation(s, t, src);
                    break;
                }
                if (ret != null) {
                    ret = toLowerCaseInternalCode(ret);
                    ret = convertDelimiter(t.getCode(), ret);
                }
                return ret;
            } catch (InvalidParameterException e) {
                throw e;
            } catch (ProcessFailedException e) {
                logger.log(Level.WARNING, "process failed.", e);
                throw e;
            } catch (Throwable t) {
                logger.log(Level.SEVERE, "unknown error occurred.", t);
                throw new ProcessFailedException(t);
            } finally {
                releaseSemaphore();
            }
        } finally {
            processEnd();
        }
    }

    private String translateByNone(Language s, Language t, String src) throws InvalidParameterException,
            LanguagePairNotUniquelyDecidedException, ProcessFailedException, UnsupportedLanguagePairException {
        if (src.length() > this.maxSourceLength) {
            String[] sources = divideSource(src, this.maxSourceLength);
            String[] results = doMultistatementTranslation(s, t, sources);
            StringBuilder sb = new StringBuilder();
            for (String result : results) {
                sb.append(result);
            }
            return sb.toString();
        } else {
            return doTranslation(s, t, src);
        }
    }

    private String translateByWord(Language s, Language t, String src) throws InvalidParameterException,
            LanguagePairNotUniquelyDecidedException, ProcessFailedException, UnsupportedLanguagePairException {
        return divideByDelimiterAndTranslation(s, t, src, java.util.regex.Pattern.compile(" *\\Q*$%*\\E[.] *"));
    }

    private String translateByFull(Language s, Language t, String src) throws InvalidParameterException,
            LanguagePairNotUniquelyDecidedException, ProcessFailedException, UnsupportedLanguagePairException {
        return divideByDelimiterAndTranslation(s, t, src,
                java.util.regex.Pattern.compile(" *((\\Q*$%*\\E)|(\\Q*%$*\\E))[.] *"));
    }

    /**
     * ????<br/>
     * ??? translateByFull ? translateByWord ???<br/>
     * @author Hitoshi Sugihara
     * @param s ?
     * @param t ?
     * @param src 
     * @param deli ???Pattern
     * @return ?
     * @throws InvalidParameterException
     * @throws ProcessFailedException
     */
    private String divideByDelimiterAndTranslation(Language s, Language t, String src, java.util.regex.Pattern deli)
            throws InvalidParameterException, ProcessFailedException {
        java.util.List<String> delimiters; // ????????
        String[] sources; // ??
        String[] results; // ???
        delimiters = new java.util.LinkedList<String>();
        sources = divideSourceByDelimiter(src, deli, delimiters);
        results = doMultistatementTranslation(s, t, sources);
        return combineResult(results, delimiters, t);
    }

    private String translateByFullWithPunctuation(Language s, Language t, String src)
            throws InvalidParameterException, LanguagePairNotUniquelyDecidedException, ProcessFailedException,
            UnsupportedLanguagePairException {
        src = TextParser.preprocessOriginal(s.getCode(), src);
        ArrayList<String> sentences = new ArrayList<String>();
        while (src.length() != 0) {
            HashMap<String, String> parsed = TextParser.getFirstSentence(s.getCode(), src);
            sentences.add(parsed.get("first"));
            src = parsed.get("remain");
        }

        String[] retCodes = new String[sentences.size()];
        ArrayList<String> contents = new ArrayList<String>();
        for (int i = 0; i < retCodes.length; i++) {
            String aStr = sentences.get(i);
            if (aStr.equals(TextParser.getRetSymbol("\r"))) {
                retCodes[i] = "\r";
            } else if (aStr.equals(TextParser.getRetSymbol("\n"))) {
                retCodes[i] = "\n";
            } else if (aStr.equals(TextParser.getRetSymbol("\r\n"))) {
                retCodes[i] = "\r\n";
            } else {
                contents.add(aStr);
            }
        }

        String[] res = doMultistatementTranslation(s, t, contents.toArray(new String[] {}));

        StringBuilder sb = new StringBuilder();
        int index = 0;
        for (int i = 0; i < retCodes.length; i++) {
            if (retCodes[i] == null) {
                sb.append(res[index]);
                index++;
            } else {
                sb.append(retCodes[i]);
            }
        }

        return sb.toString();
    }

    private String[] divideSource(String source, int maxSourceLength) {
        ArrayList<String> resultArray = new ArrayList<String>();

        Pattern p = Pattern.compile("(\\Q*$%*\\E|\\Q*%$*\\E)");
        Matcher m = p.matcher(source);

        int index = 0;
        int end = 0;
        while (index + maxSourceLength < source.length()) {
            Matcher region = m.region(index, index + maxSourceLength);
            while (region.find()) {
                end = m.end();
            }
            if (end > index) {
                resultArray.add(source.substring(index, end + 1));
                index = end + 2;
                end = index;
            } else {
                break;
            }
        }

        if (index < source.length()) {
            resultArray.add(source.substring(index));
        }

        return resultArray.toArray(new String[] {});
    }

    /**
     * ??<br/>
     * ????divideByDelimiterAndTranslation ???<br/>
     * @author Hitoshi Sugihara
     * @param source ?
     * @param deli ?? Pattern 
     * @param delis ?????????
     * @return ???????
     */
    private String[] divideSourceByDelimiter(String source, java.util.regex.Pattern deli,
            java.util.List<String> delis) {
        java.util.List<String> sources = new java.util.ArrayList<String>(); // ????????????
        java.util.regex.Matcher mth = deli.matcher(source); // ??? Matcher 
        StringBuffer hoge; // ????????
        while (mth.find()) {
            delis.add(mth.group());
            hoge = new StringBuffer();
            mth.appendReplacement(hoge, "");
            sources.add(hoge.toString());
        }
        hoge = new StringBuffer();
        mth.appendTail(hoge);
        if (hoge.length() > 0) {
            sources.add(hoge.toString());
        }
        return sources.toArray(new String[] {});
    }

    /**
     * ???????????<br/>
     * ????divideByDelimiterAndTranslation ???
     * @author Hitoshi Sugihara
     * @param results ??
     * @param delis ???????
     * @param t ??
     * @return ????
     */
    private String combineResult(String[] results, java.util.List<String> delis, Language t) {
        StringBuilder sb = new StringBuilder();// ???? StringBuilder
        java.util.Iterator<String> deli = delis.iterator();// ????????Iterator
        for (int i = 0; i < results.length; i++) {
            sb.append(results[i]);
            if (deli.hasNext()) {
                sb.append(deli.next());
            }
        }
        return sb.toString();
    }

    private String toUpperCaseCharacterBehindDelimiter(String source) {
        StringBuilder sb = new StringBuilder(source);
        Pattern p = Pattern.compile("(\\Q*$%*\\E|\\Q*%$*\\E)\\. *[a-z]");
        Matcher m = p.matcher(sb);

        while (m.find()) {
            String sub = sb.substring(m.start(), m.end());
            sb.replace(m.start(), m.end(), sub.toUpperCase());
        }

        return sb.toString();
    }

    private String toLowerCaseInternalCode(String source) {
        return source.replaceAll("Xxx([a-z]+)xxx", "xxx$1xxx");
    }

    private String convertDelimiter(String targetLang, String src) {
        // ?'ja'???'zh'?????""????????"."?
        if (targetLang.equals("ja") || targetLang.equals("zh") || targetLang.startsWith("zh-")) {
            src = src.replace("*$%*.", "*$%*");
            src = src.replace("*%$*.", "*%$*");
        } else {
            src = src.replace("*$%*", "*$%*.");
            src = src.replace("*%$*", "*%$*.");
        }

        // ?????????????
        if (targetLang.equals("ja") || targetLang.equals("zh") || targetLang.startsWith("zh-")) {
            src = src.replaceAll("(\\Q*$%*\\E)(\\S)", "$1 $2");
            src = src.replaceAll("(\\Q*%$*\\E)(\\S)", "$1 $2");
        } else {
            src = src.replaceAll("(\\Q*$%*\\E.)(\\S)", "$1 $2");
            src = src.replaceAll("(\\Q*%$*\\E.)(\\S)", "$1 $2");
        }

        // ??? + ??????????
        if (targetLang.equals("ja") || targetLang.equals("zh") || targetLang.startsWith("zh-")) {
            src = src.replaceAll("([^\\s])\\s*(\\Q*$%*\\E)", "$1 $2");
            src = src.replaceAll("([^.??!\\s])\\s*(\\Q*%$*\\E)", "$1 $2");
        } else {
            src = src.replaceAll("([^.\\s])\\s*(\\Q*$%*\\E.)", "$1. $2");
            src = src.replaceAll("([^.?!\\s])\\s*(\\Q*%$*\\E.)", "$1. $2");
        }

        // ?????????????
        if (targetLang.equals("ja") || targetLang.equals("zh") || targetLang.startsWith("zh-")) {
            src = src.replaceAll("([])(\\Q*$%*\\E)", "$1 $2");
            src = src.replaceAll("([.??!])(\\Q*%$*\\E)", "$1 $2");
        } else {
            src = src.replaceAll("([.])(\\Q*$%*\\E.)", "$1 $2");
            src = src.replaceAll("([.?!])(\\Q*%$*\\E.)", "$1 $2");
        }

        return src;
    }

    private int maxHop;
    private int maxSourceLength;
    private DivisionType sentenceDivision;
    private static Logger logger = Logger.getLogger(AbstractTranslationService.class.getName());
}