jp.ac.ipu.ds.nekottyo.toolmaven.CrowlInternetArchive.java Source code

Java tutorial

Introduction

Here is the source code for jp.ac.ipu.ds.nekottyo.toolmaven.CrowlInternetArchive.java

Source

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package jp.ac.ipu.ds.nekottyo.toolmaven;

import jp.ac.ipu.ds.nekottyo.kttestclient.ProcessBroker;

import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.ListMultimap;
import java.io.File;
import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.text.Format;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.logging.FileHandler;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.stream.Stream;
import org.jsoup.nodes.Document;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 *
 * @author nekottyo
 */
public class CrowlInternetArchive {

    private static final String sourceFile = "data/source.txt";
    private static final String resultFile = "data/result1.txt";
    private static final String htmlFile = "data/";

    private static final String queryURL = "https://web.archive.org/web/*/";

    public static void main(String[] args) throws IOException {

        try {
            Date date = new Date();
            Format sdf = new SimpleDateFormat("dd_km");
            String now = sdf.format(date);
            FileHandler fh = new FileHandler("DSNotify_" + now + ".log", true);
            fh.setFormatter(new java.util.logging.SimpleFormatter());
            LOG.addHandler(fh);
        } catch (SecurityException ex) {
            Logger.getLogger(CrowlInternetArchive.class.getName()).log(Level.SEVERE, null, ex);
        }

        List list = new ArrayList();

        try (Stream<String> stream = Files.lines(Paths.get(sourceFile), StandardCharsets.UTF_8)) {
            stream.parallel().forEach((String l) -> {
                try {
                    ListMultimap<Integer, String> resultmap = crowlPastRDF(l);

                    StringBuilder logInfo = new StringBuilder();
                    logInfo.append(l).append("\n\t");

                    resultmap.keySet().stream().forEach((Integer key) -> {
                        logInfo.append(key).append(":").append(resultmap.get(key).size()).append(" ");
                    });
                    String except404 = String
                            .valueOf(1 - ((double) resultmap.get(200).size() / resultmap.values().size()));
                    System.out.println(resultmap.get(404));
                    Path dist = Paths.get("./data/404list.txt");
                    Files.write(dist, resultmap.get(404), Charset.defaultCharset());
                    list.add(l + "," + except404);
                    LOG.info(logInfo.append("\n\t").append(except404).toString());
                } catch (InterruptedException ex) {
                    Logger.getLogger(CrowlInternetArchive.class.getName()).log(Level.SEVERE, null, ex);
                } catch (IOException ex) {
                    Logger.getLogger(CrowlInternetArchive.class.getName()).log(Level.SEVERE, null, ex);
                }
            });
        }

        try {
            Path dest = Paths.get(resultFile);
            Files.write(dest, list, Charset.defaultCharset());
        } catch (Exception e) {
            LOG.severe(e.toString());
        }
    }

    public static ListMultimap<Integer, String> crowlPastRDF(String baseURL) throws InterruptedException {
        ListMultimap result = ArrayListMultimap.create();
        try {
            String queryString = queryURL + baseURL + "*";
            //            System.setProperty("http.proxyHost", "27.96.46.110");
            //            System.setProperty("http.proxyPort", "80");
            //            System.out.println(queryString);

            //            String command = "";
            //            String osName = System.getProperty("os.name").toLowerCase();
            //            if (osName.contains("windows")) {
            //                command = "./downloadArchive.bat " + queryString + " " + baseURL.replaceAll("/", ".") + ".html";
            //            } else if (osName.contains("mac")) {
            //                command = "./downloadArchive.sh " + queryString + " " + baseURL.replaceAll("/", ".") + ".html";
            //            }
            //            ProcessBroker pb = new ProcessBroker(command.split(" "));
            //            pb.execute();
            //            Document document = Jsoup.connect(queryString)
            //                    .timeout(2000000000)
            //                    .userAgent("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.52 Safari/537.36")
            //                    .get();
            //            Document document = Jsoup.parse(new File(htmlFile + baseURL.replaceAll("/", ".") + ".html"), "UTF-8");
            Stream<String> htmlStream = Files.lines(Paths.get(htmlFile + baseURL.replaceAll("/", ".") + ".html"),
                    Charset.defaultCharset());
            //            Elements elements = document.select(".url a");

            htmlStream.parallel().filter(s -> s.contains(baseURL)).map(s -> s.replaceAll("<.+?>", "").trim())
                    //                    .filter(s -> s.text().endsWith("rdf"))
                    //                    .filter(s -> s.text().endsWith("rdf") || s.text().endsWith("nt") || s.text().endsWith("n3"))
                    //                    .filter(s -> s.text().contains("foaf"))
                    .limit(20000).forEach((String e) -> {
                        //                        System.out.println("checking -> " + e);
                        result.put(getStatusCode(e), e);
                    });

            result.keySet().stream().forEach(k -> {
                //                System.out.println(k + " " + result.get(k));
            });

            return result;

        } catch (IOException ex) {
            LOG.severe(ex.toString());
        }
        return result;
    }

    public static int getStatusCode(String urlString) {
        HttpURLConnection connection = null;

        try {
            URL url = new URL(urlString);
            connection = (HttpURLConnection) url.openConnection();
            connection.setRequestProperty("User-Agent",
                    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.52 Safari/537.36");
            connection.setRequestMethod("GET");
            connection.setReadTimeout(5000);

            return connection.getResponseCode();
        } catch (Exception e) {
        } finally {
            connection.disconnect();
        }
        return -1;
    }

    private static final Logger LOG = Logger.getLogger(CrowlInternetArchive.class.getName());

}