org.archive.hadoop.jobs.ArchiveFileExtractor.java Source code

Introduction

Here is the source code for org.archive.hadoop.jobs.ArchiveFileExtractor.java, a map-only Hadoop tool that repackages records from ARC/WARC files stored in HDFS into new ARC/WARC files.

Source

/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.hadoop.jobs;

import com.google.common.io.ByteStreams;
import com.google.common.io.LimitInputStream;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.text.ParseException;
import java.util.UUID;

import org.apache.commons.io.IOUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import org.archive.format.gzip.GZIPFormatException;
import org.archive.format.gzip.GZIPMemberSeries;
import org.archive.format.gzip.GZIPMemberWriter;
import org.archive.format.gzip.GZIPSeriesMember;
import org.archive.server.FileBackedInputStream;
import org.archive.streamcontext.SimpleStream;
import org.archive.util.DateUtils;
import org.archive.util.FileNameSpec;
import org.archive.util.HMACSigner;
import org.archive.util.IAUtils;

/**
* ArchiveFileExtractor - Repackage records from ARC/WARC files stored in HDFS into new ARC/WARC files
*/
public class ArchiveFileExtractor extends Configured implements Tool {

    public final static String TOOL_NAME = "ArchiveFileExtractor";
    public final static String TOOL_DESCRIPTION = "Repackage records from ARC/WARC files into new ARC/WARC files in HDFS";

    public static final Log LOG = LogFactory.getLog(ArchiveFileExtractor.class);

    public static class ArchiveFileExtractorMapper extends MapReduceBase
            implements Mapper<Object, Text, Text, Text> {
        private JobConf jobConf;

        private static final Charset UTF8 = Charset.forName("UTF-8");
        private static final int CR = 13;
        private static final int LF = 10;

        private String timestamp14;
        private String timestampZ;

        private byte[] warcHeaderContents;

        private final static String ARC_PATTERN = "filedesc://%s 0.0.0.0 %s text/plain 76\n"
                + "1 0 InternetArchive\n" + "URL IP-address Archive-date Content-type Archive-length\n\n";

        private final static String WARC_PATTERN = "WARC/1.0\r\n" + "WARC-Type: warcinfo\r\n" + "WARC-Date: %s\r\n"
                + "WARC-Filename: %s\r\n" + "WARC-Record-ID: <%s>\r\n"
                + "Content-Type: application/warc-fields\r\n" + "Content-Length: %d\r\n\r\n";

        private String getWARCRecordID() {
            return "urn:uuid:" + UUID.randomUUID().toString();
        }

        private byte[] getARCHeader(String name) {
            return String.format(ARC_PATTERN, name, timestamp14).getBytes(UTF8);
        }

        private byte[] getWARCHeader(String name) throws IOException {
            String t = String.format(WARC_PATTERN, timestampZ, name, getWARCRecordID(),
                    warcHeaderContents.length + 4);
            byte[] b = t.getBytes(UTF8);
            ByteArrayOutputStream baos = new ByteArrayOutputStream();
            baos.write(b);
            baos.write(warcHeaderContents);
            baos.write(CR);
            baos.write(LF);
            baos.write(CR);
            baos.write(LF);
            return baos.toByteArray();
        }

        public byte[] getWarcHeaderContents() {
            return warcHeaderContents;
        }

        public void setWarcHeaderContents(byte[] warcHeaderContents) {
            this.warcHeaderContents = warcHeaderContents;
        }

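        /**
         * Skip a single GZIP member of the stream and return its compressed
         * (on-disk) length in bytes.
         */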
        private long getGZLength(InputStream is) throws IOException, GZIPFormatException {

            SimpleStream s = new SimpleStream(is);
            GZIPMemberSeries gzs = new GZIPMemberSeries(s, "range", 0, true);
            GZIPSeriesMember m = gzs.getNextMember();
            m.skipMember();
            return m.getCompressedBytesRead();
        }

        /**
         * Configures the job.
         *
         * @param job the job configuration
         */
        public void configure(JobConf job) {
            this.jobConf = job;
        }

        /**
         * Repackage the (W)ARC records listed in <code>value</code> into new
         * ARC/WARC files under the configured output directory.
         */
        public void map(Object key, Text value, OutputCollector<Text, Text> output, Reporter reporter)
                throws IOException {

            String inputString = value.toString();
            String[] inputParts = inputString.split("\t");

            if (inputParts.length != 2) {
                throw new IOException("invalid input");
            }

            boolean openArc = false;
            boolean openWarc = false;

            FileNameSpec warcNamer;
            FileNameSpec arcNamer;

            String hmacName = this.jobConf.get("hmacName", "");
            String hmacSignature = this.jobConf.get("hmacSignature", "");

            HMACSigner signer = null;
            if (!hmacName.isEmpty() && !hmacSignature.isEmpty()) {
                signer = new HMACSigner(hmacSignature, hmacName);
            }

            timestamp14 = this.jobConf.get("timestamp14", DateUtils.get14DigitDate(System.currentTimeMillis()));
            String warcHeaderString = this.jobConf.get("warcHeaderString");
            warcHeaderContents = warcHeaderString.getBytes(UTF8);

            String outputDir = this.jobConf.get("outputDir");

            FileSystem hdfsSys = null;

            FSDataOutputStream currentArcOS = null;
            FSDataOutputStream currentWarcOS = null;

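            // Convert the 14-digit timestamp into the Log17 form used for the WARC-Date header.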
            try {
                long msse = DateUtils.parse14DigitDate(timestamp14).getTime();
                timestampZ = DateUtils.getLog17Date(msse);
            } catch (ParseException e) {
                LOG.error("Error parsing timestamp: ", e);
                throw new IOException(e);
            }

            String prefix = inputParts[0];
            prefix += "-";

            String resourceLocationBagString = inputParts[1];

            arcNamer = new FileNameSpec(prefix, ".arc.gz");
            warcNamer = new FileNameSpec(prefix, ".warc.gz");

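            // Lazily open at most one new ARC and one new WARC output per input line.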
            boolean firstArcRecord = true;
            boolean firstWarcRecord = true;

            //remove braces
            resourceLocationBagString = resourceLocationBagString.replaceAll("[{}]", "");

            //split into tuples
            resourceLocationBagString = resourceLocationBagString.replace("),(", ")\t(");

            //remove parentheses
            resourceLocationBagString = resourceLocationBagString.replaceAll("[()]", "");

            //inputs
            String[] resourceLocations = resourceLocationBagString.split("\t");

            FileBackedInputStream fbis = null;
            InputStream is = null;
            long millis = System.currentTimeMillis();
            String destArcOutputFileString = null;
            String destWarcOutputFileString = null;

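            // Each tuple identifies one compressed record: fetch it, measure it, and append it.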
            for (int i = 0; i < resourceLocations.length; i++) {

                String[] offLoc = resourceLocations[i].split(",");
                long offset = Long.parseLong(offLoc[0]);
                String url = offLoc[1];

                boolean isArc = false;

                try {
                    if (url.endsWith(".arc.gz")) {
                        isArc = true;
                    } else if (url.endsWith(".warc.gz")) {
                        // isArc stays false: the record is copied into the WARC output below
                    } else {
                        throw new IOException("URL (" + url + ") must end with '.arc.gz' or '.warc.gz'");
                    }

                    if (url.startsWith("http://")) {
                        URL u = new URL(url);
                        URLConnection conn = u.openConnection();
                        conn.setRequestProperty("Range", String.format("bytes=%d-", offset));
                        if (signer != null)
                            conn.setRequestProperty("Cookie", signer.getHMacCookieStr(1000));
                        conn.connect();
                        is = conn.getInputStream();
                    } else if (url.startsWith("hdfs://")) {
                        URI u = new URI(url);
                        // only initialize the FileSystem once
                        if (hdfsSys == null) {
                            URI defaultURI = new URI(u.getScheme() + "://" + u.getHost() + ":" + u.getPort() + "/");
                            hdfsSys = FileSystem.get(defaultURI, new Configuration());
                        }
                        Path path = new Path(u.getPath());
                        FSDataInputStream fis = hdfsSys.open(path);
                        fis.seek(offset);
                        is = fis;
                    } else {
                        throw new IOException("URL (" + url + ") must start with 'http://' or 'hdfs://'");
                    }

                    fbis = new FileBackedInputStream(is);
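                    // getGZLength() consumes the stream, so measure the record's compressed
                    // length from the file-backed copy, then re-read the buffered bytes below.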
                    long length = getGZLength(fbis);
                    InputStream orig = fbis.getInputStream();
                    if (isArc) {
                        if (firstArcRecord) {
                            String newArcName = arcNamer.getNextName();
                            destArcOutputFileString = outputDir + "/" + newArcName;
                            String outputFileString = destArcOutputFileString + "." + millis + ".TMP";
                            currentArcOS = FileSystem.get(new java.net.URI(outputFileString), this.jobConf)
                                    .create(new Path(outputFileString), false);
                            byte[] header = getARCHeader(newArcName);
                            GZIPMemberWriter w = new GZIPMemberWriter(currentArcOS);
                            w.write(new ByteArrayInputStream(header));
                            firstArcRecord = false;
                        }
                        LimitInputStream lis = new LimitInputStream(orig, length);
                        ByteStreams.copy(lis, currentArcOS);
                    } else {
                        if (firstWarcRecord) {
                            String newWarcName = warcNamer.getNextName();
                            destWarcOutputFileString = outputDir + "/" + newWarcName;
                            String outputFileString = destWarcOutputFileString + "." + millis + ".TMP";
                            currentWarcOS = FileSystem.get(new java.net.URI(outputFileString), this.jobConf)
                                    .create(new Path(outputFileString), false);
                            byte[] header = getWARCHeader(newWarcName);
                            GZIPMemberWriter w = new GZIPMemberWriter(currentWarcOS);
                            w.write(new ByteArrayInputStream(header));
                            firstWarcRecord = false;
                        }
                        LimitInputStream lis = new LimitInputStream(orig, length);
                        ByteStreams.copy(lis, currentWarcOS);
                    }
                    output.collect("SUCCESS", offset + "\t" + url);
                } catch (Exception e) {
                    LOG.error("Error processing: ", e);
                    output.collect("FAIL", offset + "\t" + url);
                    if (!this.jobConf.getBoolean("soft", false)) {
                        throw new IOException(e.toString() + " offset: " + offset + " url: " + url);
                    }
                } finally {
                    if (is != null) {
                        is.close();
                    }
                    if (fbis != null) {
                        fbis.resetBacker();
                    }
                }
            }

            // After the record loop: close the outputs and rename the .TMP files into place.
            try {
                if (currentArcOS != null) {
                    currentArcOS.close();
                    FileSystem.get(new java.net.URI(destArcOutputFileString + "." + millis + ".TMP"), this.jobConf)
                            .rename(new Path(destArcOutputFileString + "." + millis + ".TMP"),
                                    new Path(destArcOutputFileString));
                }

                if (currentWarcOS != null) {
                    currentWarcOS.close();
                    FileSystem.get(new java.net.URI(destWarcOutputFileString + "." + millis + ".TMP"), this.jobConf)
                            .rename(new Path(destWarcOutputFileString + "." + millis + ".TMP"),
                                    new Path(destWarcOutputFileString));
                }
            } catch (Exception e) {
                LOG.error("Error processing: ", e);
                throw new IOException("Error finalizing output files: " + e.toString());
            }
        }
    }

    /** 
    * Print usage
    */
    public void printUsage() {
        String usage = "Usage: ArchiveFileExtractor [OPTIONS] <taskfile> <outputdir>\n";
        usage += "\tOptions:\n";
        usage += "\t\t-mappers NUM - try to run with approximately NUM map tasks (default: 10)\n";
        usage += "\t\t-timestamp14 TS - The 14 digit timestamp to use\n";
        usage += "\t\t-hmacname HMACNAME - The HMAC Name string\n";
        usage += "\t\t-hmacsignature HMACSIG - The HMAC Signature string\n";
        usage += "\t\t-warc-header-local-file LOCALPATH_TO_WARCHEADERFILE - The local file containing the WARC header to use\n";
        usage += "\t\t-soft - tolerate task exceptions\n";
        usage += "\t\t-timeout MILLISECONDS - mapred.task.timeout setting (default: 72000000)\n";
        usage += "\t\t-failpct PCT - mapred.max.map.failures.percent (default: 0). Set to 10 to allow 10% of map tasks to fail\n";
        usage += "\tThe taskfile contains lines of the form:\n";
        usage += "\t\tFilePrefix<tab>Bag of (offset,FilePath) tuples\n";
        usage += "\t\tFilePrefix is the prefix to be used by the extracted files\n";
        usage += "\t\toffset is the start offset of a W/ARC record\n";
        usage += "\t\tFilePath is a HTTP or HDFS URL to the file to extract from\n";
        System.out.println(usage);
    }

    /**
    * Run the job.
    */
    public int run(String[] args) throws Exception {
        if (args.length < 2) {
            printUsage();
            return 1;
        }

        // Create a job configuration
        JobConf job = new JobConf(getConf());

        // Job name uses output dir to help identify it to the operator.
        job.setJobName("Archive File Extractor");

        // This is a map-only job, no reducers.
        job.setNumReduceTasks(0);

        // turn off speculative execution
        job.setBoolean("mapred.map.tasks.speculative.execution", false);

        // set timeout to a high value - 20 hours
        job.setInt("mapred.task.timeout", 72000000);

        // by default, do not tolerate task exceptions (override with -soft)
        job.setBoolean("soft", false);

        int arg = 0;
        int numMaps = 10;

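        // Default warcinfo record body; it can be replaced with -warc-header-local-file.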
        String DEFAULT_WARC_PATTERN = "software: %s Extractor\r\n" + "format: WARC File Format 1.0\r\n"
                + "conformsTo: http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf\r\n"
                + "publisher: Internet Archive\r\n" + "created: %s\r\n\r\n";

        String warcHeaderString = String.format(DEFAULT_WARC_PATTERN, IAUtils.COMMONS_VERSION,
                DateUtils.getLog17Date(System.currentTimeMillis()));

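        // Parse options; the two remaining arguments are <taskfile> and <outputdir>.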
        while (arg < args.length - 1) {
            if (args[arg].equals("-soft")) {
                job.setBoolean("soft", true);
                arg++;
            } else if (args[arg].equals("-mappers")) {
                arg++;
                numMaps = Integer.parseInt(args[arg]);
                job.setNumMapTasks(numMaps);
                arg++;
            } else if (args[arg].equals("-timestamp14")) {
                arg++;
                String timestamp14 = DateUtils.get14DigitDate(DateUtils.parse14DigitDate(args[arg]));
                job.set("timestamp14", timestamp14);
                arg++;
            } else if (args[arg].equals("-warc-header-local-file")) {
                arg++;
                File f = new File(args[arg]);
                FileInputStream fis = new FileInputStream(f);
                warcHeaderString = IOUtils.toString(fis, "UTF-8");
                arg++;
            } else if (args[arg].equals("-hmacname")) {
                arg++;
                String hmacName = args[arg];
                job.set("hmacName", hmacName);
                arg++;
            } else if (args[arg].equals("-hmacsignature")) {
                arg++;
                String hmacSignature = args[arg];
                job.set("hmacSignature", hmacSignature);
                arg++;
            } else if (args[arg].equals("-timeout")) {
                arg++;
                int taskTimeout = Integer.parseInt(args[arg]);
                job.setInt("mapred.task.timeout", taskTimeout);
                arg++;
            } else if (args[arg].equals("-failpct")) {
                arg++;
                int failPct = Integer.parseInt(args[arg]);
                job.setInt("mapred.max.map.failures.percent", failPct);
                arg++;
            } else {
                break;
            }
        }

        job.set("warcHeaderString", warcHeaderString);

        if (args.length - 2 != arg) {
            printUsage();
            return 1;
        }

        Path inputPath = new Path(args[arg]);
        arg++;

        String outputDir = args[arg];
        arg++;

        job.set("outputDir", outputDir);
        Path outputPath = new Path(outputDir);

        job.setInputFormat(TextInputFormat.class);
        job.setOutputFormat(TextOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        job.setMapperClass(ArchiveFileExtractorMapper.class);
        job.setJarByClass(ArchiveFileExtractor.class);

        TextInputFormat.addInputPath(job, inputPath);
        FileOutputFormat.setOutputPath(job, outputPath);

        // Run the job!
        RunningJob rj = JobClient.runJob(job);
        if (!rj.isSuccessful()) {
            LOG.error("FAILED: " + rj.getID());
            return 2;
        }
        return 0;
    }

    /**
    * Command-line driver.  Runs the ArchiveFileExtractor as a Hadoop job.
    */
    public static void main(String[] args) throws Exception {
        int result = ToolRunner.run(new Configuration(), new ArchiveFileExtractor(), args);
        System.exit(result);
    }

}
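
Usage example

A hypothetical invocation is sketched below; the jar name, host name, and paths are illustrative assumptions, not taken from the source above:

hadoop jar archive-hadoop-jobs.jar org.archive.hadoop.jobs.ArchiveFileExtractor \
    -mappers 20 -timestamp14 20130615000000 -soft taskfile.txt /user/operator/extracted

Per the parsing in map(), each taskfile line pairs an output-file prefix with a brace-delimited bag of (offset,FilePath) tuples, separated by a tab, for example:

extract-001<TAB>{(0,hdfs://namenode:8020/data/crawl-00.warc.gz),(2048,http://archive.example.org/crawl-01.arc.gz)}

Records pulled from the tuples are appended to new GZIP-compressed ARC and WARC files whose names are derived from the prefix, written under the job's output directory.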