org.polimi.zarathustra.experiment.DOMsDumpExperiment.java Source code

Java tutorial

Introduction

Here is the source code for org.polimi.zarathustra.experiment.DOMsDumpExperiment.java

Source

/*
 * Copyright 2014 Google Inc. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 * in compliance with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License
 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing permissions and limitations under
 * the License.
 */

package org.polimi.zarathustra.experiment;

import java.io.File;
import java.io.IOException;
import java.math.BigInteger;
import java.net.URI;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;

import org.openqa.selenium.TimeoutException;
import org.polimi.zarathustra.DOMHelper;
import org.polimi.zarathustra.webdriver.LocalWebdriverWorker;
import org.w3c.dom.Document;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Charsets;
import com.google.common.io.Files;

/**
 * This experiment dumps the DOMs from a set of URLs to files in a directory. It
 * also creates a manifest.txt file in the directory.
 * <p>
 * For every URL that is captured successfully, three things are written to the
 * output directory: the serialized DOM, a per-URL {@code .manifest} file
 * holding the URL and the capture timestamp, and one additional line in the
 * global {@code manifest.txt}.
 */
public class DOMsDumpExperiment {

    /**
     * Builds the name of the dump file for a URL, in the form
     * {@code <host>.<md5-of-url-in-hex><DOMHelper.DOM_DUMP_SUFFIX>}.
     * <p>
     * The hex digest is produced through {@link BigInteger#toString(int)} and
     * is therefore not zero-padded; this is kept on purpose so that file names
     * stay identical to the ones produced by earlier runs.
     *
     * @param url the URL being dumped; must be parseable by {@link URI#create(String)}
     * @return the file name (without directory) to use for the dump of {@code url}
     * @throws NoSuchAlgorithmException if the MD5 digest is not available
     */
    private static String getFileName(String url) throws NoSuchAlgorithmException {
        String domain = URI.create(url).getHost();
        MessageDigest md5 = MessageDigest.getInstance("MD5");
        // Hash every byte of the URL in a fixed charset. Using the platform
        // charset together with url.length() (a char count) as byte length
        // would hash only a prefix of URLs containing multi-byte characters.
        md5.update(url.getBytes(Charsets.UTF_8));
        String urlMd5 = new BigInteger(1, md5.digest()).toString(16);
        return String.format("%s.%s%s", domain, urlMd5, DOMHelper.DOM_DUMP_SUFFIX);
    }

    /**
     * Dumps the DOMs from a set of URLs to a directory. Sample invocation: java
     * -jar domdump.jar urllist.txt /tmp/output. urllist.txt is expected to have
     * one URL per line; lines starting with '#' and blank lines are ignored.
     * <p>
     * NOTE: for testing purposes, this can be run with an extra argument,
     * FIREFOX: if so, it runs a firefox webdriver instead of explorer.
     * <p>
     * A failure on a single URL is reported on standard output and does not
     * stop the processing of the remaining URLs.
     *
     * @param args {@code urlFileList outputDir [FIREFOX]}
     * @throws Exception if the URL list cannot be read or the global manifest
     *         cannot be written
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 2) {
            System.err.println("Invoke this experiment with 2 parameters:" + "urlFileList and outputDir");
            System.exit(-1);
        }
        paramsAreOkOrDie(args);
        File inputFile = new File(args[0]);
        File outputDir = new File(args[1]);

        boolean firefoxMode = (args.length > 2) && "FIREFOX".equals(args[2]);
        if (firefoxMode) {
            System.out.println("Running in FIREFOX mode, for testing only");
        }
        // The worker is constructed with 'false' in FIREFOX mode and 'true' otherwise.
        LocalWebdriverWorker worker = new LocalWebdriverWorker(!firefoxMode);

        try {
            List<String> urls = removeComments(Files.readLines(inputFile, Charsets.UTF_8));
            File manifest = new File(outputDir, "manifest.txt");
            Files.append("Start capture. Version 1.0\n", manifest, Charsets.UTF_8);
            for (String url : urls) {
                try {
                    storeDOM(url, outputDir, worker);
                    // One timestamp per capture, so that the per-URL manifest
                    // and the global manifest always agree.
                    long capturedAt = new Date().getTime();
                    storeDOMManifest(url, outputDir, capturedAt);
                    updateManifest(url, capturedAt, manifest);
                } catch (TimeoutException e2) {
                    System.out.println("WebDriver Timeout: " + e2.toString());
                } catch (org.openqa.selenium.UnhandledAlertException e1) {
                    System.out.println(url + " --- UnhandledAlertException caught" + e1.toString());
                } catch (Exception e) {
                    // Deliberately broad: one bad URL must not abort the whole run.
                    System.out.println("Generic exception caught: " + e.toString());
                }
            }
        } finally {
            // Always release the webdriver, even if reading the URL list or
            // writing the manifest header failed.
            worker.quit();
        }
    }

    /**
     * Validates the command line arguments and terminates the JVM with exit
     * code 1 if the input file is not readable or the output directory is
     * missing or not writable.
     *
     * @param args the command line arguments; {@code args[0]} is the URL list
     *        file and {@code args[1]} the output directory
     */
    private static void paramsAreOkOrDie(String[] args) {
        File inputFile = new File(args[0]);
        File outputDir = new File(args[1]);
        if (!inputFile.canRead()) {
            System.err.println("Cannot read input file");
            System.exit(1);
        }
        // Existence is checked before writability so that a missing directory
        // is reported as missing rather than as not writable.
        if (!outputDir.isDirectory()) {
            System.err.println("Cannot find output dir");
            System.exit(1);
        }
        if (!outputDir.canWrite()) {
            System.err.println("Cannot write to output dir");
            System.exit(1);
        }
    }

    /**
     * Filters the lines of the URL list, dropping comment lines (first
     * character is '#') and blank lines.
     *
     * @param urls the raw lines read from the URL list file
     * @return a new list with the remaining lines, unmodified and in their
     *         original order
     */
    static List<String> removeComments(List<String> urls) {
        List<String> selectedLines = new ArrayList<String>();
        for (String url : urls) {
            // Blank lines carry no URL; they are skipped before looking at the
            // first character, which does not exist on an empty line.
            if (url.trim().isEmpty()) {
                continue;
            }
            if (url.charAt(0) != '#') {
                selectedLines.add(url);
            }
        }
        return selectedLines;
    }

    /**
     * Saves the DOM at the provided URL to the given outputDir.
     * <p>
     * The document is obtained through
     * {@code worker.getDocumentAndStoreSource(url, outputDir, fileName)} and
     * then serialized to {@code outputDir/fileName} with
     * {@link DOMHelper#serializeDocument}.
     *
     * @param url the URL to capture
     * @param outputDir the directory receiving the dump
     * @param worker the webdriver worker used to load the page
     * @throws IOException declared for I/O failures while storing the dump
     * @throws NoSuchAlgorithmException if the MD5 digest is not available
     * @throws TimeoutException declared for webdriver timeouts while loading the page
     */
    @VisibleForTesting
    static void storeDOM(String url, File outputDir, LocalWebdriverWorker worker)
            throws IOException, NoSuchAlgorithmException, TimeoutException {
        String fileName = getFileName(url);
        Document dom = worker.getDocumentAndStoreSource(url, outputDir, fileName);
        File output = new File(outputDir, fileName);
        DOMHelper.serializeDocument(dom, output.getAbsolutePath());
    }

    /**
     * Writes the per-URL manifest: a file named like the dump file plus the
     * {@code .manifest} extension, containing {@code "<url>, <time>"}.
     *
     * @param url the captured URL
     * @param outputDir the directory receiving the manifest
     * @param time the capture time, in milliseconds since the epoch
     * @throws NoSuchAlgorithmException if the MD5 digest is not available
     * @throws IOException if the manifest file cannot be written
     */
    private static void storeDOMManifest(String url, File outputDir, long time)
            throws NoSuchAlgorithmException, IOException {
        String fileName = getFileName(url) + ".manifest";
        File output = new File(outputDir, fileName);
        Files.write(url + ", " + time, output, Charsets.UTF_8);
    }

    /**
     * Updates the manifest file adding a track of the current request, as one
     * {@code "<url>, <timestamp>"} line.
     *
     * @param url the captured URL
     * @param timestamp the capture time, in milliseconds since the epoch
     * @param manifest the global manifest file to append to
     * @throws IOException if the manifest file cannot be written
     */
    private static void updateManifest(String url, Long timestamp, File manifest) throws IOException {
        String trace = String.format("%s, %s\n", url, timestamp.toString());
        Files.append(trace, manifest, Charsets.UTF_8);
    }
}