org.aksw.dice.eaglet.uri.impl.WikipidiaUriChecker.java Source code

Java tutorial

Introduction

Here is the source code for org.aksw.dice.eaglet.uri.impl.WikipidiaUriChecker.java

Source

/**
 * This file is part of General Entity Annotator Benchmark.
 *
 * General Entity Annotator Benchmark is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * General Entity Annotator Benchmark is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with General Entity Annotator Benchmark.  If not, see <http://www.gnu.org/licenses/>.
 */
package org.aksw.dice.eaglet.uri.impl;

import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;

import org.aksw.dice.eaglet.entitytypemodify.NamedEntityCorrections.Correction;
import org.aksw.dice.eaglet.entitytypemodify.NamedEntityCorrections.ErrorType;
import org.aksw.dice.eaglet.uri.UriChecker;
import org.aksw.gerbil.http.AbstractHttpRequestEmitter;
import org.aksw.gerbil.semantic.sameas.impl.SimpleDomainExtractor;
import org.aksw.gerbil.semantic.sameas.impl.wiki.WikipediaXMLParser;
import org.aksw.gerbil.utils.WikipediaHelper;
import org.apache.commons.io.IOUtils;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.util.EntityUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.escape.Escaper;
import com.google.common.net.UrlEscapers;

/**
 * {@link UriChecker} for Wikipedia URIs. It asks the wiki API of the URI's
 * domain whether the page title taken from the URI is redirected to a
 * different title and reports the URI as outdated if that is the case.
 */
public class WikipidiaUriChecker extends AbstractHttpRequestEmitter implements UriChecker {

    private static final Logger LOGGER = LoggerFactory.getLogger(WikipidiaUriChecker.class);

    private static final String URL_PROTOCOL_PART = "http://";
    private static final String URL_QUERY_PART = "/w/api.php?format=xml&action=query&redirects=true&titles=";
    /**
     * Charset used to decode the API response. UTF-8 is guaranteed to be
     * available on every JVM, so no lookup-with-fallback is needed.
     */
    private static final Charset CHARSET = StandardCharsets.UTF_8;
    private static final Escaper TITLE_ESCAPER = UrlEscapers.urlFormParameterEscaper();

    private final WikipediaXMLParser parser = new WikipediaXMLParser();

    /**
     * Creates a checker with a fresh {@link WikipediaXMLParser}.
     */
    public WikipidiaUriChecker() {
        super();
    }

    /**
     * Checks the given URI against the wiki API of its domain.
     *
     * @param uri
     *            the URI that should be checked, may be {@code null}
     * @return {@link ErrorType#INVALIDURIERR} if the URI is {@code null} or no
     *         Wikipedia title can be derived from it,
     *         {@link ErrorType#OUTDATEDURIERR} if the API reports a redirect
     *         to a title that differs from the URI's title, and
     *         {@link ErrorType#NOERROR} otherwise (including the case that
     *         the API could not be queried)
     */
    @Override
    public ErrorType checkUri(String uri) {
        String title = (uri == null) ? null : WikipediaHelper.getWikipediaTitle(uri);
        if (title == null) {
            LOGGER.info("INVALID_URI \"{}\"", uri);
            return ErrorType.INVALIDURIERR;
        }
        String xml = queryRedirect(SimpleDomainExtractor.extractDomain(uri), title);
        if (xml == null) {
            // The request failed (already logged by queryRedirect). Without a
            // response there is no evidence of a redirect, so the parser is
            // not called with null and no error is reported.
            LOGGER.warn("Couldn't retrieve redirect information for \"{}\". Reporting no error.", uri);
            return ErrorType.NOERROR;
        }
        String redirect = parser.extractRedirect(xml);
        if ((redirect != null) && (!title.equals(redirect))) {
            LOGGER.info("OUTDATED_URI \"{}\"", uri);
            return ErrorType.OUTDATEDURIERR;
        }
        return ErrorType.NOERROR;
    }

    /**
     * Sends a redirect query for the given title to the wiki API of the given
     * domain and returns the raw response body.
     *
     * @param domain
     *            the domain of the wiki, used as host of the request URL
     * @param title
     *            the page title; it is escaped before being appended to the
     *            request URL
     * @return the response body decoded as UTF-8, or {@code null} if the
     *         request could not be created, failed, or had no body
     */
    public String queryRedirect(String domain, String title) {
        StringBuilder urlBuilder = new StringBuilder(150);
        urlBuilder.append(URL_PROTOCOL_PART);
        urlBuilder.append(domain);
        urlBuilder.append(URL_QUERY_PART);
        urlBuilder.append(TITLE_ESCAPER.escape(title));

        HttpGet request = null;
        try {
            request = createGetRequest(urlBuilder.toString());
        } catch (IllegalArgumentException e) {
            LOGGER.error("Got an exception while creating a request querying the wiki api of \"{}\". Returning null.",
                    domain, e);
            return null;
        }
        CloseableHttpResponse response = null;
        HttpEntity entity = null;
        try {
            response = sendRequest(request);
            entity = response.getEntity();
            if (entity == null) {
                // A response is allowed to carry no entity; handle it
                // explicitly instead of relying on a caught NPE.
                LOGGER.error("Got a response without content while querying the wiki api of \"{}\". Returning null.",
                        domain);
                return null;
            }
            return IOUtils.toString(entity.getContent(), CHARSET);
        } catch (Exception e) {
            LOGGER.error("Got an exception while querying the wiki api of \"{}\". Returning null.", domain, e);
            return null;
        } finally {
            if (entity != null) {
                try {
                    EntityUtils.consume(entity);
                } catch (IOException e) {
                    // Cleanup is best effort; a failure here must not mask
                    // the result, but it should not vanish silently either.
                    LOGGER.debug("Couldn't consume the response entity of the wiki api of \"{}\".", domain, e);
                }
            }
            IOUtils.closeQuietly(response);
            closeRequest(request);
        }
    }

}