uk.bl.wa.hadoop.indexer.mdx.WARCMDXGeneratorIntegrationTest.java Source code

Introduction

Here is the source code for uk.bl.wa.hadoop.indexer.mdx.WARCMDXGeneratorIntegrationTest.java: an integration test that spins up in-process MiniDFSCluster and MiniMRCluster instances, runs the WARCMDXGenerator MapReduce job over two gov.uk revisit WARC files, checks that 114 MDX records are produced, and then merges the resulting sequence files with MDXSeqMerger.

Source

package uk.bl.wa.hadoop.indexer.mdx;

/*
 * #%L
 * warc-hadoop-indexer
 * %%
 * Copyright (C) 2013 - 2018 The webarchive-discovery project contributors
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as
 * published by the Free Software Foundation, either version 2 of the
 * License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public
 * License along with this program.  If not, see
 * <http://www.gnu.org/licenses/gpl-2.0.html>.
 * #L%
 */

import static org.junit.Assert.assertEquals;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.Writer;

import org.apache.commons.io.IOUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MiniMRCluster;
import org.apache.hadoop.mapred.OutputLogFilter;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;

import uk.bl.wa.hadoop.mapreduce.mdx.MDX;
import uk.bl.wa.hadoop.mapreduce.mdx.MDXSeqMerger;

public class WARCMDXGeneratorIntegrationTest {

    private static final Log log = LogFactory.getLog(WARCMDXGeneratorIntegrationTest.class);

    // Test cluster:
    private MiniDFSCluster dfsCluster = null;
    private MiniMRCluster mrCluster = null;

    // Input files:
    public final static String[] testWarcs = new String[] {
            "gov.uk-revisit-warcs/BL-20140325121225068-00000-32090~opera~8443.warc.gz",
            "gov.uk-revisit-warcs/BL-20140325122341434-00000-32090~opera~8443.warc.gz" };

    private final Path input = new Path("inputs");
    private final Path output = new Path("outputs");
    private final Path outputMerged = new Path("outputs-merged");

    // Exported results
    public static File outputSeq = new File("target/test.seq");
    public static File outputMergedSeq = new File("target/test-merged.seq");

    @Before
    public void setUp() throws Exception {
        // Print out the full config for debugging purposes:
        // Config index_conf = ConfigFactory.load();
        // LOG.debug(index_conf.root().render());

        log.warn("Spinning up test cluster...");
        // make sure the log folder exists,
        // otherwise the test will fail
        new File("target/test-logs").mkdirs();
        //
        System.setProperty("hadoop.log.dir", "target/test-logs");
        System.setProperty("javax.xml.parsers.SAXParserFactory",
                "com.sun.org.apache.xerces.internal.jaxp.SAXParserFactoryImpl");

        //
        Configuration conf = new Configuration();
        System.setProperty("test.build.data", new File("target/mini-dfs").getAbsolutePath());
        dfsCluster = new MiniDFSCluster(conf, 1, true, null);
        // NB: makeQualified() returns a qualified copy of the Path and has no
        // side effects, so the return values here are simply discarded:
        dfsCluster.getFileSystem().makeQualified(input);
        dfsCluster.getFileSystem().makeQualified(output);
        //
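        // Single-tracker MR mini-cluster, backed by the mini-DFS filesystem
        // (1 task tracker, 1 local directory):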
        mrCluster = new MiniMRCluster(1, getFileSystem().getUri().toString(), 1);

        // prepare for tests
        for (String filename : testWarcs) {
            copyFileToTestCluster(getFileSystem(), input, "../warc-indexer/src/test/resources/", filename);
        }

        log.warn("Spun up test cluster.");
    }

    protected FileSystem getFileSystem() throws IOException {
        return dfsCluster.getFileSystem();
    }

    public static void copyFileToTestCluster(FileSystem fs, Path input, String prefix, String filename)
            throws IOException {
        Path targetPath = new Path(input, filename);
        File sourceFile = new File(prefix + filename);
        log.info("Copying " + filename + " into cluster at " + targetPath.toUri() + "...");
        FSDataOutputStream os = fs.create(targetPath);
        InputStream is = new FileInputStream(sourceFile);
        try {
            IOUtils.copy(is, os);
        } finally {
            is.close();
            os.close();
        }
        log.info("Copy completed.");
    }

    public static File writeInputFile(Path[] inputFiles) throws Exception {
        // Make a list:
        File tmpInputsFile = File.createTempFile("inputs", ".txt");
        tmpInputsFile.deleteOnExit();
        Writer s = new FileWriter(tmpInputsFile);
        for (Path p : inputFiles) {
            s.write(p.toString() + "\n");
        }
        s.close();
        return tmpInputsFile;
    }

    @SuppressWarnings("deprecation")
    @Test
    public void testMDXGenerator() throws Exception {
        // prepare for test
        // createTextInputFile();

        log.info("Checking input file is present...");
        // Check that the input file is present:
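        // (OutputLogFilter simply excludes Hadoop's "_logs" side-files from
        // the directory listing.)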
        Path[] inputFiles = FileUtil.stat2Paths(
                getFileSystem().listStatus(new Path(input, "gov.uk-revisit-warcs/"), new OutputLogFilter()));
        Assert.assertEquals(2, inputFiles.length);
        // Create a file of the inputs
        File tmpInputsFile = writeInputFile(inputFiles);

        // Set up arguments for the job:
        String[] args = { "-i", tmpInputsFile.getAbsolutePath(), "-o", this.output.getName() };

        // Set up the WARCMDXGenerator:
        WARCMDXGenerator wir = new WARCMDXGenerator();

        // run job
        // Job configuration:
        log.info("Setting up job config...");
        JobConf jobConf = this.mrCluster.createJobConf();
        jobConf.setInt(WARCMDXGenerator.WARC_HADOOP_NUM_REDUCERS, 1);
        jobConf.set("mapred.child.java.opts", "-Xmx512m");
        wir.createJobConf(jobConf, args);
        log.info("Running job...");
        JobClient.runJob(jobConf);
        log.info("Job finished, checking the results...");

        // check the output exists
        Path[] outputFiles = FileUtil.stat2Paths(getFileSystem().listStatus(output, new OutputLogFilter()));
        // Default is one reducer (as knitting together multiple sequence files
        // is not a mere matter of concatenation):
        Assert.assertEquals(1, outputFiles.length);

        // Copy the output out of HDFS and onto local FS:
        FileOutputStream fout = new FileOutputStream(outputSeq);
        for (Path output : outputFiles) {
            log.info(" --- output : " + output);
            if (getFileSystem().isFile(output)) {
                InputStream is = getFileSystem().open(output);
                IOUtils.copy(is, fout);
            } else {
                log.info(" --- ...skipping directory...");
            }
            fout.flush();
        }
        fout.close();

        // Check contents of the output:
        Configuration config = new Configuration();
        Path path = new Path(outputSeq.getAbsolutePath());
        SequenceFile.Reader reader = new SequenceFile.Reader(FileSystem.get(config), path, config);
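        // Instantiate the key and value types recorded in the sequence file's
        // header, via reflection: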
        WritableComparable key = (WritableComparable) reader.getKeyClass().newInstance();
        Writable value = (Writable) reader.getValueClass().newInstance();

        MDX mdx;
        int counter = 0;
        while (reader.next(key, value)) {
            mdx = new MDX(value.toString());
            System.out.println(
                    "Key is: " + key + " record_type: " + mdx.getRecordType() + " SURT: " + mdx.getUrlAsSURT());
            counter++;
        }
        assertEquals(114, counter);
        reader.close();

        // Now test the MDXSeqMerger
        testSeqMerger(outputFiles);
    }

    private void testSeqMerger(Path[] inputFiles) throws Exception {

        // Create a file of the inputs
        File tmpInputsFile = writeInputFile(inputFiles);

        // Set up arguments for the job:
        String[] args = { "-i", tmpInputsFile.getAbsolutePath(), "-o", this.outputMerged.getName(), "-r", "1" };

        // Set up the MDXSeqMerger:
        MDXSeqMerger msm = new MDXSeqMerger();

        // run job
        log.info("Setting up job config...");
        JobConf jobConf = this.mrCluster.createJobConf();
        msm.createJobConf(jobConf, args);
        log.info("Running job...");
        JobClient.runJob(jobConf);
        log.info("Job finished, checking the results...");

        // Copy the output out of HDFS and onto local FS:
        FileOutputStream fout = new FileOutputStream(outputMergedSeq);
        Path[] outputFiles = FileUtil.stat2Paths(getFileSystem().listStatus(outputMerged, new OutputLogFilter()));
        for (Path output : outputFiles) {
            log.info(" --- output : " + output);
            if (getFileSystem().isFile(output)) {
                InputStream is = getFileSystem().open(output);
                IOUtils.copy(is, fout);
            } else {
                log.info(" --- ...skipping directory...");
            }
            fout.flush();
        }
        fout.close();

    }

    @After
    public void tearDown() throws Exception {
        log.warn("Tearing down test cluster...");
        if (dfsCluster != null) {
            dfsCluster.shutdown();
            dfsCluster = null;
        }
        if (mrCluster != null) {
            mrCluster.shutdown();
            mrCluster = null;
        }
        log.warn("Torn down test cluster.");
    }
}
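
For reference, the job under test can also be launched outside JUnit. Below is a minimal, hypothetical launcher sketch: it assumes WARCMDXGenerator implements Hadoop's Tool interface (the createJobConf(JobConf, String[]) call in the test suggests a standard Tool-style runner), and the class name LocalMDXLauncher is illustrative rather than part of the project.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ToolRunner;

import uk.bl.wa.hadoop.indexer.mdx.WARCMDXGenerator;

public class LocalMDXLauncher {

    public static void main(String[] args) throws Exception {
        // Same flags the test passes: "-i" names a local text file listing
        // the input WARC paths (one per line), "-o" names the output
        // directory for the MDX sequence file(s).
        int exitCode = ToolRunner.run(new Configuration(),
                new WARCMDXGenerator(), args);
        System.exit(exitCode);
    }
}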