// Origin: org.commoncrawl.hadoop.io.JetS3tARCSource.java (CommonCrawl Foundation)

/**
 * Copyright 2008 - CommonCrawl Foundation
 * 
 *    This program is free software: you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation, either version 3 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 **/
package org.commoncrawl.hadoop.io;

import java.io.IOException;
import java.io.InputStream;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.JobConfigurable;
import org.commoncrawl.util.EscapeUtils;
import org.jets3t.service.S3Service;
import org.jets3t.service.S3ServiceException;
import org.jets3t.service.impl.rest.httpclient.RestS3Service;
import org.jets3t.service.model.S3Bucket;
import org.jets3t.service.model.S3Object;
import org.jets3t.service.security.AWSCredentials;

/**
 * An {@link ARCSource} for gzipped ARC files stored on Amazon S3 that uses <a
 * href="http://jets3t.s3.amazonaws.com/index.html">JetS3t</a> to interact with
 * S3.
 * 
 * @author Albert Chern
 */
public class JetS3tARCSource extends ARCSplitCalculator implements ARCSource, JobConfigurable {

    /**
     * <tt>jets3t.arc.source.input.prefixes.csv</tt> - the property where the
     * prefixes to match for input files are stored (in the future, we may want to
     * extend this to do some sort of globbing or regular expression matching).
     * 
     * @see #setInputPrefixes
     */
    public static final String P_INPUT_PREFIXES = "jets3t.arc.source.input.prefixes.csv";

    /**
     * <tt>jets3t.arc.source.aws.access.key.id</tt> - the property where the AWS
     * Access Key ID to access the S3 account is stored.
     * 
     * @see #setAWSAccessKeyID
     */
    public static final String P_AWS_ACCESS_KEY_ID = "jets3t.arc.source.aws.access.key.id";

    /**
     * <tt>jets3t.arc.source.aws.secret.access.key</tt> - the property where the
     * AWS Secret Access Key to access the S3 account is stored.
     * 
     * @see #setAWSSecretAccessKey
     */
    public static final String P_AWS_SECRET_ACCESS_KEY = "jets3t.arc.source.aws.secret.access.key";

    /**
     * <tt>jets3t.arc.source.bucket.name</tt> - the property where the name of the
     * S3 bucket to access is stored.
     * 
     * @see #setBucketName
     */
    public static final String P_BUCKET_NAME = "jets3t.arc.source.bucket.name";

    /**
     * <tt>jets3t.arc.source.max.tries</tt> - the property where the maximum
     * number of times to try reading each file is stored.
     * 
     * @see #setMaxRetries
     */
    public static final String P_MAX_TRIES = "jets3t.arc.source.max.tries";

    private static final Log LOG = LogFactory.getLog(JetS3tARCSource.class);

    /**
     * Returns the set of input prefixes for a given job as set by
     * {@link #setInputPrefixes}.
     * 
     * @param job
     *          the job to get the input prefixes from
     * 
     * @see #P_INPUT_PREFIXES
     * 
     * @return a <tt>String[]</tt> with the input prefixes, or <tt>null</tt> if
     *         they have not been set
     */
    public static String[] getInputPrefixes(JobConf job) {
        String inputPrefixes = job.get(P_INPUT_PREFIXES);
        return inputPrefixes == null ? null : EscapeUtils.split(',', inputPrefixes);
    }

    /**
     * Sets the AWS access key ID of the S3 account to use.
     * 
     * @param job
     *          the job to set the AWS access key ID for
     * @param awsAccessKeyId
     *          the AWS access key ID
     * 
     * @see #P_AWS_ACCESS_KEY_ID
     */
    public static final void setAWSAccessKeyID(JobConf job, String awsAccessKeyId) {
        job.set(P_AWS_ACCESS_KEY_ID, awsAccessKeyId);
    }

    /**
     * Sets the AWS secret access key of the S3 account to use.
     * 
     * @param job
     *          the job to set the AWS secret access key for
     * @param awsSecretAccessKey
     *          the AWS secret access key
     * 
     * @see #P_AWS_SECRET_ACCESS_KEY
     */
    public static final void setAWSSecretAccessKey(JobConf job, String awsSecretAccessKey) {
        job.set(P_AWS_SECRET_ACCESS_KEY, awsSecretAccessKey);
    }

    /**
     * Sets the name of the S3 bucket to read from.
     * 
     * @param job
     *          the job to set the bucket name for
     * @param bucketName
     *          the bucket name
     * 
     * @see #P_BUCKET_NAME
     */
    public static final void setBucketName(JobConf job, String bucketName) {
        job.set(P_BUCKET_NAME, bucketName);
    }

    /**
     * Sets the key prefixes used to decide which S3 objects are included for
     * processing.
     * 
     * <p>
     * Call this method to set up the input parameters for a job. The matching
     * will be a pure prefix query with no delimiters.
     * 
     * @param job
     *          the job to set the prefix for
     * @param prefixes
     *          the key prefixes to match S3 objects with
     * 
     * @see #P_INPUT_PREFIXES
     */
    public static void setInputPrefixes(JobConf job, String... prefixes) {
        job.set(P_INPUT_PREFIXES, EscapeUtils.concatenate(',', prefixes));
    }

    /**
     * Sets the maximum number of times to try reading a file.
     * 
     * <p>
     * Default is 4.
     * 
     * @param job
     *          the job to set the maximum number of tries for
     * @param maxTries
     *          the maximum number of attempts per file
     * 
     * @see #P_MAX_TRIES
     */
    public static final void setMaxRetries(JobConf job, int maxTries) {
        job.setInt(P_MAX_TRIES, maxTries);
    }

    // S3 connection and read settings; populated once by configureImpl and
    // read-only thereafter (cannot be final because configuration happens via
    // the JobConfigurable callback, not the constructor).
    private S3Service service;
    private S3Bucket bucket;
    private int maxTries;

    /**
     * {@inheritDoc}
     * 
     * <p>
     * Reads the AWS credentials and bucket name from the job configuration and
     * opens a JetS3t REST connection to S3.
     * 
     * @throws RuntimeException
     *           if a required property is missing or the S3 service cannot be
     *           created
     */
    @Override
    protected void configureImpl(JobConf job) {
        try {

            // Pull credentials from the configuration; all three are mandatory
            String awsAccessKeyId = getProperty(job, P_AWS_ACCESS_KEY_ID);
            String awsSecretAccessKey = getProperty(job, P_AWS_SECRET_ACCESS_KEY);
            String bucketName = getProperty(job, P_BUCKET_NAME);

            // Instantiate JetS3t classes
            AWSCredentials awsCredentials = new AWSCredentials(awsAccessKeyId, awsSecretAccessKey);
            service = new RestS3Service(awsCredentials);
            bucket = new S3Bucket(bucketName);

            maxTries = job.getInt(P_MAX_TRIES, 4);

        } catch (S3ServiceException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * {@inheritDoc}
     * 
     * <p>
     * Lists the objects under each configured input prefix and returns one
     * {@link ARCResource} per distinct key (a key matched by several prefixes
     * is only returned once).
     * 
     * @throws IllegalArgumentException
     *           if no input prefixes have been set on the job
     * @throws IOException
     *           if the S3 listing fails
     */
    @Override
    protected Collection<ARCResource> getARCResources(JobConf job) throws IOException {

        try {
            String[] inputPrefixes = getInputPrefixes(job);
            if (inputPrefixes == null) {
                throw new IllegalArgumentException("No input prefixes set");
            }

            // Keyed by object key so overlapping prefixes don't yield duplicates
            Map<String, ARCResource> resources = new HashMap<String, ARCResource>();
            for (String prefix : inputPrefixes) {
                for (S3Object object : service.listObjects(bucket, prefix, null)) {
                    String key = object.getKey();
                    resources.put(key, new ARCResource(key, object.getContentLength()));
                }
            }
            return resources.values();
        } catch (S3ServiceException e) {
            // Preserve the original exception as the cause instead of
            // flattening it to a string
            IOException ioe = new IOException("Error listing S3 objects: " + e.toString());
            ioe.initCause(e);
            throw ioe;
        }
    }

    /**
     * Gets a property from a job and throws an exception if it is not set.
     * 
     * @param job
     *          the job to read the property from
     * @param property
     *          the name of the required property
     * 
     * @return the property value, never <tt>null</tt>
     * 
     * @throws RuntimeException
     *           if the property is not set
     */
    private String getProperty(JobConf job, String property) {
        String result = job.get(property);
        if (result == null) {
            throw new RuntimeException(property + " is not set");
        }
        return result;
    }

    /**
     * {@inheritDoc}
     * 
     * <p>
     * Opens the S3 object named by <tt>resource</tt> starting at
     * <tt>streamPosition</tt>. Returns <tt>null</tt> once
     * <tt>previousFailures</tt> has reached the configured maximum number of
     * tries, signaling the caller to give up on this resource.
     */
    public InputStream getStream(String resource, long streamPosition, Throwable lastError, int previousFailures)
            throws Throwable {

        // lastError == null means this is the first attempt; otherwise only
        // retry while we are under the configured cap
        if (lastError == null || previousFailures < maxTries) {

            LOG.info("Opening " + resource + " at byte position " + streamPosition + ", attempt "
                    + (previousFailures + 1) + " out of " + maxTries);
            S3Object object = service.getObject(bucket, resource, null, null, null, null, streamPosition, null);
            return object.getDataInputStream();

        } else {

            LOG.info("Too many failures for " + resource + ", aborting");
            return null;
        }
    }
}