org.asqatasun.rules.elementchecker.lang.LangChecker.java Source code

Java tutorial

Introduction

Here is the source code for the class org.asqatasun.rules.elementchecker.lang.LangChecker (file LangChecker.java).

Source

/*
 *  Asqatasun - Automated webpage assessment
 *  Copyright (C) 2008-2015  Asqatasun.org
 * 
 *  This file is part of Asqatasun.
 * 
 *  Asqatasun is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Affero General Public License as
 *  published by the Free Software Foundation, either version 3 of the
 *  License, or (at your option) any later version.
 * 
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Affero General Public License for more details.
 * 
 *  You should have received a copy of the GNU Affero General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 * 
 *  Contact us by mail: asqatasun AT asqatasun DOT org
 */

package org.asqatasun.rules.elementchecker.lang;

import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.validator.GenericValidator;
import org.apache.log4j.Logger;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.asqatasun.entity.audit.EvidenceElement;
import org.asqatasun.entity.audit.TestSolution;
import org.asqatasun.entity.reference.Nomenclature;
import org.asqatasun.processor.SSPHandler;
import org.asqatasun.ruleimplementation.TestSolutionHandler;
import org.asqatasun.rules.elementchecker.NomenclatureBasedElementChecker;
import org.asqatasun.rules.elementchecker.lang.detector.LanguageDetectionResult;
import org.asqatasun.rules.elementchecker.lang.detector.LanguageDetector;
import org.asqatasun.rules.keystore.AttributeStore;
import static org.asqatasun.rules.keystore.AttributeStore.LANG_ATTR;
import static org.asqatasun.rules.keystore.AttributeStore.XML_LANG_ATTR;
import static org.asqatasun.rules.keystore.EvidenceStore.*;
import org.asqatasun.rules.keystore.HtmlElementStore;
import static org.asqatasun.rules.keystore.RemarkMessageStore.*;
import org.asqatasun.rules.textbuilder.CompleteTextElementBuilder;
import org.asqatasun.rules.textbuilder.TextElementBuilder;

/**
 * Base class of the element checkers that deal with language declarations.
 * <p>
 * It provides utility methods to extract the language declared on an element
 * through its {@code lang} / {@code xml:lang} attributes, to validate that
 * declaration against the "ValidLanguageCode" nomenclature, to extract the
 * text of an element and to compare the declared language with the one
 * returned by the {@link LanguageDetector}.
 * <p>
 * Concrete checkers implement {@link #doCheckLanguage(Element, SSPHandler)},
 * which is invoked once per selected element by
 * {@link #doCheck(SSPHandler, Elements, TestSolutionHandler)}.
 */
public abstract class LangChecker extends NomenclatureBasedElementChecker {

    private static final Logger LOGGER = Logger.getLogger(LangChecker.class);

    /*
     * Used with Matcher.matches(): the whole text has to be made of digits
     * and non-word characters only. Inside the character class the '+' signs
     * are literal (and already covered by \W); the reluctant quantifier has
     * no effect on a full match.
     */
    private static final String NON_ALPHANUMERIC_PATTERN_STR = "[\\d+\\W+]+?";
    /*
     * UNICODE_CHARACTER_CLASS is required here: without it \W is ASCII-only,
     * so any text written in a non-latin script (cyrillic, greek, CJK, ...)
     * would entirely match the pattern and be seen as "not testable".
     */
    private static final Pattern NON_ALPHANUMERIC_PATTERN = Pattern.compile(NON_ALPHANUMERIC_PATTERN_STR,
            Pattern.UNICODE_CHARACTER_CLASS);
    private static final String LANG_DECLARATION_PATTERN_STR = "\\w{2,3}(\\-\\w{2,})?$";
    private static final Pattern LANG_DECLARATION_PATTERN = Pattern.compile(LANG_DECLARATION_PATTERN_STR);
    /* one or more consecutive spaces, collapsed to one in the extracted text */
    private static final Pattern MULTIPLE_SPACES_PATTERN = Pattern.compile(" +");
    private static final String XHTML_DOCTYPE_NOM = "XhtmlDoctypeDeclarations";
    private static final String LANG_NOM = "ValidLanguageCode";
    /* maximum number of characters of the tested text kept as evidence */
    private static final int DISPLAYABLE_TEXT_SIZE = 200;
    private static final String[] EXCLUDED_ELEMENTS = { HtmlElementStore.SCRIPT_ELEMENT,
            HtmlElementStore.CODE_ELEMENT, HtmlElementStore.KBD_ELEMENT, HtmlElementStore.SAMP_ELEMENT,
            HtmlElementStore.TT_ELEMENT, HtmlElementStore.VAR_ELEMENT, HtmlElementStore.NO_FRAMES_ELEMENT,
            HtmlElementStore.NO_SCRIPT_ELEMENT, };
    /* elements whose text is never extracted (see extractTextFromElement) */
    private static final Collection<String> EXCLUDED_ELEMENTS_LIST = Collections
            .unmodifiableSet(new HashSet<String>(Arrays.asList(EXCLUDED_ELEMENTS)));

    /* values of the nomenclatures, lazily loaded by doCheck */
    private Collection<String> xhtmlDoctypesSet;
    private Collection<String> validLanguagesSet;

    /* message used when the detection is unreliable and languages are identical */
    private String suspectedIdenticalLangMsg;

    public void setSuspectedIdenticalLangMsg(String suspectedIdenticalLangMsg) {
        this.suspectedIdenticalLangMsg = suspectedIdenticalLangMsg;
    }

    /* message used when the detection is unreliable and languages are different */
    private String suspectedDifferentLangMsg;

    public void setSuspectedDifferentLangMsg(String suspectedDifferentLangMsg) {
        this.suspectedDifferentLangMsg = suspectedDifferentLangMsg;
    }

    /* message used when the detection is reliable and languages are different */
    private String differentLangMsg;

    public void setDifferentLangMsg(String differentLangMsg) {
        this.differentLangMsg = differentLangMsg;
    }

    /* message used when the detection is reliable and languages are identical */
    private String identicalLangMsg;

    public void setIdenticalLangMsg(String identicalLangMsg) {
        this.identicalLangMsg = identicalLangMsg;
    }

    /** the number of elements tested */
    private int nbOfElementsTested = 0;

    public int getNbOfElementsTested() {
        return nbOfElementsTested;
    }

    /**
     * Increments the counter of tested elements.
     */
    public void newElementTested() {
        nbOfElementsTested++;
    }

    /* the text Element Builder used to extract text to test */
    private TextElementBuilder testableTextElementBuilder;

    public void setTestableTextElementBuilder(TextElementBuilder testableTextElementBuilder) {
        this.testableTextElementBuilder = testableTextElementBuilder;
    }

    /**
     * Default constructor
     */
    public LangChecker() {
        super();
    }

    /**
     * Constructor. Beware of the order of the two last parameters: the
     * "different" message comes before the "identical" one.
     * 
     * @param identicalLangMsg
     *      message used when the detected language is reliably identical
     * @param differentLangMsg
     *      message used when the detected language is reliably different
     * @param suspectedDifferentLangMsg
     *      message used when the detected language is different but the
     *      detection is not reliable
     * @param suspectedIdenticalLangMsg
     *      message used when the detected language is identical but the
     *      detection is not reliable
     */
    public LangChecker(String identicalLangMsg, String differentLangMsg, String suspectedDifferentLangMsg,
            String suspectedIdenticalLangMsg) {
        super();
        this.identicalLangMsg = identicalLangMsg;
        this.differentLangMsg = differentLangMsg;
        this.suspectedIdenticalLangMsg = suspectedIdenticalLangMsg;
        this.suspectedDifferentLangMsg = suspectedDifferentLangMsg;
    }

    /**
     * Loads the nomenclatures (once) then checks each element and adds the
     * resulting solution to the test solution handler.
     * 
     * @param sspHandler
     * @param elements
     * @param testSolutionHandler
     */
    @Override
    protected void doCheck(SSPHandler sspHandler, Elements elements, TestSolutionHandler testSolutionHandler) {
        loadXhtmlDoctypes();
        loadValidLanguages();
        // the handler may contain the html element or nothing
        for (Element el : elements) {
            testSolutionHandler.addTestSolution(doCheckLanguage(el, sspHandler));
        }
    }

    /**
     * Performs the language check on one element.
     * 
     * @param element
     * @param sspHandler 
     * @return  the solution of the check
     */
    protected abstract TestSolution doCheckLanguage(Element element, SSPHandler sspHandler);

    /**
     * Checks that a language declaration is well-formed and that its
     * language code belongs to the valid languages nomenclature. The
     * nomenclature has to be loaded first (this is done by doCheck).
     * 
     * @param element
     *      the element that carries the declaration
     * @param langDefinition
     *      the complete declaration, checked against the declaration pattern
     * @param effectiveLang
     *      the language code, looked up (lower-cased) in the nomenclature
     * @param createProcessRemarkOnFailure
     *      whether a remark has to be created when the check fails
     * @return FAILED when the declaration is malformed or when the language
     * code is unknown, PASSED instead
     */
    protected TestSolution checkLanguageDeclarationValidity(Element element, String langDefinition,
            String effectiveLang, boolean createProcessRemarkOnFailure) {
        if (!isLangWellDeclared(langDefinition)) {
            if (createProcessRemarkOnFailure) {
                addInvalidDeclarationSourceCodeRemark(element, langDefinition, TestSolution.FAILED,
                        MALFORMED_LANGUAGE_DECLARATION_MSG);
            }
            return TestSolution.FAILED;
        }
        // Locale.ROOT makes the lookup independent from the default locale of
        // the JVM (with a turkish locale, "IT" would be lower-cased to "ıt").
        if (!validLanguagesSet.contains(effectiveLang.toLowerCase(Locale.ROOT))) {
            if (createProcessRemarkOnFailure) {
                addInvalidDeclarationSourceCodeRemark(element, effectiveLang, TestSolution.FAILED,
                        WRONG_LANGUAGE_DECLARATION_MSG);
            }
            return TestSolution.FAILED;
        }
        return TestSolution.PASSED;
    }

    /**
     * Detects the language of a text and compares it (ignoring case) with
     * the default language. A remark is created with the message that
     * corresponds to the situation, unless the solution is PASSED or the
     * message is blank.
     * 
     * @param element
     * @param defaultLang
     *      the language the detected one is compared with
     * @param currentLang
     *      only used as evidence of the created remark
     * @param text
     *      the text submitted to the language detector
     * @param solutionOnIdentical
     *      the solution returned when languages are reliably identical
     * @param solutionOnDifferent
     *      the solution returned when languages are reliably different
     * @return NEED_MORE_INFO when no language is detected or when the
     * detection is not reliable, solutionOnIdentical or solutionOnDifferent
     * instead
     */
    protected TestSolution checkLanguageRelevancy(Element element, String defaultLang, String currentLang,
            String text, TestSolution solutionOnIdentical, TestSolution solutionOnDifferent) {

        // the debug state is read once, so that the start time is always set
        // when the elapsed time is computed
        final boolean debugEnabled = LOGGER.isDebugEnabled();
        final long startDetection = debugEnabled ? System.currentTimeMillis() : 0L;
        LanguageDetectionResult ldr = LanguageDetector.getInstance().detectLanguage(text);
        if (debugEnabled) {
            LOGGER.debug("Detection took " + (System.currentTimeMillis() - startDetection) + " ms on "
                    + (text == null ? 0 : text.length()) + " characters");
        }
        if (ldr == null) {
            addSourceCodeRemark(TestSolution.NEED_MORE_INFO, element, UNDETECTED_LANG_MSG, defaultLang, currentLang,
                    "", text);
            return TestSolution.NEED_MORE_INFO;
        }

        final String detectedLang = ldr.getDetectedLanguage();
        final boolean isLangIdentical = StringUtils.equalsIgnoreCase(defaultLang, detectedLang);

        final TestSolution testSolution;
        final String message;
        if (!ldr.isReliable()) {
            // an unreliable detection always needs a human decision
            testSolution = TestSolution.NEED_MORE_INFO;
            message = isLangIdentical ? suspectedIdenticalLangMsg : suspectedDifferentLangMsg;
        } else if (isLangIdentical) {
            testSolution = solutionOnIdentical;
            message = identicalLangMsg;
        } else {
            testSolution = solutionOnDifferent;
            message = differentLangMsg;
        }
        addSourceCodeRemark(testSolution, element, message, defaultLang, currentLang, detectedLang, text);
        return testSolution;
    }

    /**
     * Extracts the language declaration of an element from its trimmed
     * "lang" and "xml:lang" attributes. When only one of them is set, its
     * value is returned. When both are set and differ (ignoring case), the
     * doctype of the page decides: "xml:lang" for a xhtml doctype, "lang"
     * instead.
     * 
     * @param element
     * @param sspHandler
     *      only used when both attributes are set with different values
     * @return the language declaration, empty when no attribute is set
     */
    protected String extractLangDefinitionFromElement(Element element, SSPHandler sspHandler) {
        String lang = element.attr(LANG_ATTR).trim();
        String xmlLang = element.attr(XML_LANG_ATTR).trim();
        // no xml:lang : the lang attribute is the only candidate (it may be
        // empty when none of the attributes is set)
        if (xmlLang.isEmpty()) {
            return lang;
        }
        // no lang : the xml:lang attribute is the only candidate
        if (lang.isEmpty()) {
            return xmlLang;
        }
        // if the lang attributes are identical, return one of them
        if (xmlLang.equalsIgnoreCase(lang)) {
            return lang;
        }
        // the attributes differ : if the doctype defines a xhtml document,
        // returns the xml:lang attribute value, the lang one instead
        return hasSSPXhtmlDoctype(sspHandler) ? xmlLang : lang;
    }

    /**
     * 
     * @param langDefinition
     * @return the language code (truncate language definition when option is
     * defined), i.e. what precedes the first '-' character if any
     */
    protected String extractEffectiveLang(String langDefinition) {
        int separatorIndex = StringUtils.indexOf(langDefinition, '-');
        if (separatorIndex == -1) {
            return langDefinition;
        }
        return StringUtils.substring(langDefinition, 0, separatorIndex);
    }

    /**
     * Extracts the text of an element with the testable text element builder
     * (a CompleteTextElementBuilder is used when none has been set). On
     * recursive extraction, the text of each child is appended, except for
     * the children that define their own language or that are excluded.
     * Consecutive spaces of the result are collapsed.
     * 
     * @param element
     * @param extractRecursively
     *      whether the text of the children has to be extracted as well
     * @return the extracted text, or null when the element is an excluded one
     */
    protected String extractTextFromElement(Element element, boolean extractRecursively) {
        if (EXCLUDED_ELEMENTS_LIST.contains(element.tagName())) {
            return null;
        }
        if (testableTextElementBuilder == null) {
            testableTextElementBuilder = new CompleteTextElementBuilder();
        }
        StringBuilder strb = new StringBuilder();
        strb.append(testableTextElementBuilder.buildTextFromElement(element));

        if (extractRecursively) {
            for (Element child : element.children()) {
                // excluded children are filtered out here, so the recursive
                // call below never returns null
                if (!isLangDefinedForElement(child) && !EXCLUDED_ELEMENTS_LIST.contains(child.tagName())) {
                    strb.append(TextElementBuilder.SPACER);
                    strb.append(extractTextFromElement(child, true));
                }
            }
        }
        return MULTIPLE_SPACES_PATTERN.matcher(strb.toString()).replaceAll(" ");
    }

    /**
     * 
     * @param extractedText
     * @return false when the text is blank, only made of digits and non-word
     * characters, an email or an url; true instead
     */
    protected boolean isTextTestable(String extractedText) {
        if (StringUtils.isBlank(extractedText)) {
            return false;
        }
        String textToTest = StringUtils.trim(extractedText);
        if (NON_ALPHANUMERIC_PATTERN.matcher(textToTest).matches()) {
            return false;
        }
        return !GenericValidator.isEmail(textToTest) && !GenericValidator.isUrl(textToTest);
    }

    /**
     * 
     * @param element
     * @return whether the current element is defined with a "lang" attribute
     * or a "xml:lang" attribute
     */
    protected boolean isLangDefinedForElement(Element element) {
        return element.hasAttr(AttributeStore.LANG_ATTR) || element.hasAttr(AttributeStore.XML_LANG_ATTR);
    }

    /**
     * The xhtml doctypes nomenclature has to be loaded first (this is done
     * by doCheck).
     * 
     * @param sspHandler
     * @return
     *      true if the ssp embeds a xhtml doctype, false instead.
     */
    protected boolean hasSSPXhtmlDoctype(SSPHandler sspHandler) {
        return xhtmlDoctypesSet.contains(sspHandler.getSSP().getDoctype());
    }

    /**
     * This method loads the allowed xhtml doctypes nomenclature, unless it
     * has already been loaded
     */
    protected void loadXhtmlDoctypes() {
        if (xhtmlDoctypesSet != null) {
            return;
        }
        Nomenclature xhtmlDoctypes = getNomenclatureLoaderService().loadByCode(XHTML_DOCTYPE_NOM);
        xhtmlDoctypesSet = xhtmlDoctypes.getValueList();
    }

    /**
     * This method loads the valid languages nomenclature, unless it has
     * already been loaded
     */
    protected void loadValidLanguages() {
        if (validLanguagesSet != null) {
            return;
        }
        Nomenclature validLanguages = getNomenclatureLoaderService().loadByCode(LANG_NOM);
        validLanguagesSet = validLanguages.getValueList();
    }

    /**
     * 
     * @param extractedLang
     * @return whether a lang is well-defined regarding the lang declaration 
     * pattern (a null lang is never well-defined)
     */
    protected boolean isLangWellDeclared(String extractedLang) {
        return extractedLang != null && LANG_DECLARATION_PATTERN.matcher(extractedLang).matches();
    }

    /**
     * Create a sourceCodeRemark with the involved languages and the tested
     * text as evidence elements. Nothing is created when the solution is
     * PASSED or when the message is blank.
     * 
     * @param testSolution
     * @param element
     * @param message
     * @param defaultLang
     * @param currentLang
     *      when blank, the default language is the only declared language
     *      added to the evidence elements
     * @param detectedLang
     * @param testedText
     *      truncated to DISPLAYABLE_TEXT_SIZE characters in the evidence
     */
    private void addSourceCodeRemark(TestSolution testSolution, Element element, String message, String defaultLang,
            String currentLang, String detectedLang, String testedText) {
        if (testSolution.equals(TestSolution.PASSED) || StringUtils.isBlank(message)) {
            return;
        }
        List<EvidenceElement> evidenceElementList = new ArrayList<>();

        if (StringUtils.isNotBlank(currentLang)) {
            evidenceElementList.add(getEvidenceElement(DEFAULT_LANGUAGE_EE, defaultLang));
            evidenceElementList.add(getEvidenceElement(CURRENT_LANGUAGE_EE, currentLang));
        } else {
            evidenceElementList.add(getEvidenceElement(LANGUAGE_EE, defaultLang));
        }
        evidenceElementList.add(getEvidenceElement(DETECTED_LANGUAGE_EE, detectedLang));

        // a null text is displayed as an empty one instead of raising a NPE
        String displayedText = testedText == null ? "" : testedText;
        if (displayedText.length() > DISPLAYABLE_TEXT_SIZE) {
            displayedText = displayedText.substring(0, DISPLAYABLE_TEXT_SIZE);
        }
        evidenceElementList.add(getEvidenceElement(EXTRACTED_TEXT_EE, displayedText));

        addSourceCodeRemark(testSolution, element, message, evidenceElementList);
    }

    /**
     * Create a sourceCodeRemark with the invalid language declaration as
     * only evidence element.
     * 
     * @param element
     * @param extractedLang
     * @param testSolution
     * @param message
     */
    private void addInvalidDeclarationSourceCodeRemark(Element element, String extractedLang,
            TestSolution testSolution, String message) {
        List<EvidenceElement> evidenceElementList = new ArrayList<>();
        evidenceElementList.add(getEvidenceElement(LANGUAGE_EE, extractedLang));

        addSourceCodeRemark(testSolution, element, message, evidenceElementList);
    }

}