org.commoncrawl.hadoop.io.ARCSplitCalculator.java Source code


Introduction

Here is the source code for org.commoncrawl.hadoop.io.ARCSplitCalculator.java, an abstract JobConfigurable that groups CommonCrawl ARC files into Hadoop input splits by file count and total size.

Source

/**
 * Copyright 2008 - CommonCrawl Foundation
 * 
 *    This program is free software: you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation, either version 3 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 **/
package org.commoncrawl.hadoop.io;

import java.io.IOException;
import java.util.Collection;
import java.util.LinkedList;
import java.util.List;

import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.JobConfigurable;

/**
 * Calculates splits based on the desired number of files per split and the
 * desired size of each split.
 * 
 * <p>
 * Concrete implementations should override {@link #getARCResources}.
 * 
 * @author Albert Chern
 */
public abstract class ARCSplitCalculator implements JobConfigurable {

    /**
     * <tt>arc.split.calculator.files.per.split</tt> - the property where the
     * number of files per input split is stored.
     * 
     * @see #setFilesPerSplit
     */
    public static final String P_FILES_PER_SPLIT = "arc.split.calculator.files.per.split";

    /**
     * <tt>arc.split.calculator.mb.per.split</tt> - the property where the desired
     * size in megabytes of the split is stored.
     * 
     * @see #setMegabytesPerSplit
     */
    public static final String P_MB_PER_SPLIT = "arc.split.calculator.mb.per.split";

    /**
     * Sets the desired number of files per input split.
     * 
     * <p>
     * Default is 1.
     * 
     * @param job
     *          the job to set the number of files per split for
     * @param filesPerSplit
     *          the desired number of ARC files per split
     * 
     * @see #P_FILES_PER_SPLIT
     */
    public static final void setFilesPerSplit(JobConf job, int filesPerSplit) {
        job.setInt(P_FILES_PER_SPLIT, filesPerSplit);
    }

    /**
     * Sets the desired number of megabytes per split.
     * 
     * <p>
     * New files will be added to a split until the total size of the split
     * exceeds this threshold. Default is no limit.
     * 
     * @param job
     *          the job to set the number of megabytes per split for
     * @param mbPerSplit
     *          the desired number of megabytes per split
     * 
     * @see #P_MB_PER_SPLIT
     */
    public static final void setMegabytesPerSplit(JobConf job, int mbPerSplit) {
        job.setInt(P_MB_PER_SPLIT, mbPerSplit);
    }

    private int filesPerSplit;
    private long bytesPerSplit;

    private void addSplit(List<ARCSplit> splits, ARCResource[] resources, int size) {
        if (size > 0) {
            ARCResource[] copy = new ARCResource[size];
            System.arraycopy(resources, 0, copy, 0, size);
            splits.add(new ARCSplit(copy));
        }
    }

    /**
     * {@inheritDoc}
     */
    public final void configure(JobConf job) {
        filesPerSplit = job.getInt(P_FILES_PER_SPLIT, 1);
        bytesPerSplit = job.get(P_MB_PER_SPLIT) == null ? Long.MAX_VALUE
                : Long.parseLong(job.get(P_MB_PER_SPLIT)) * 1024 * 1024;
        configureImpl(job);
    }

    /**
     * Hook for subclass configuration.
     * 
     * @param job
     *          the {@link JobConf} of the job
     * 
     * @see JobConfigurable#configure
     */
    protected void configureImpl(JobConf job) {
    }

    /**
     * Given a job, returns the {@link ARCResource}s it should process.
     * 
     * @param job
     *          the job for which to get the {@link ARCResource}s
     * 
     * @return the {@link ARCResource}s to process
     * 
     * @throws IOException
     *           if an IO error occurs
     */
    protected abstract Collection<ARCResource> getARCResources(JobConf job) throws IOException;

    /**
     * {@inheritDoc}
     */
    public ARCSplit[] getARCSplits(JobConf job) throws IOException {

        List<ARCSplit> splits = new LinkedList<ARCSplit>();

        ARCResource[] resources = new ARCResource[filesPerSplit];
        int nResources = 0;
        long length = 0;

        for (ARCResource resource : getARCResources(job)) {
            resources[nResources++] = resource;
            length += resource.getSize();
            // Close the split once it reaches either the file-count or size threshold
            if (nResources >= filesPerSplit || length >= bytesPerSplit) {
                addSplit(splits, resources, nResources);
                nResources = 0;
                length = 0;
            }
        }

        // Add the final split
        addSplit(splits, resources, nResources);
        return splits.toArray(new ARCSplit[splits.size()]);
    }
}
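
Because ARCSplitCalculator is abstract, a job supplies a concrete subclass that implements getARCResources. The sketch below is hypothetical: the class name DirectoryARCSplitCalculator and the property arc.split.calculator.input.dir are made up for illustration, and it assumes ARCResource exposes a (name, size) constructor. It simply lists .arc.gz files under a configured directory and reports each one as a resource.

package org.commoncrawl.hadoop.io;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;

/**
 * Hypothetical calculator: every .arc.gz file under a configured
 * directory becomes one ARCResource.
 */
public class DirectoryARCSplitCalculator extends ARCSplitCalculator {

    /** Hypothetical property naming the input directory. */
    public static final String P_INPUT_DIR = "arc.split.calculator.input.dir";

    @Override
    protected Collection<ARCResource> getARCResources(JobConf job) throws IOException {
        Path dir = new Path(job.get(P_INPUT_DIR));
        FileSystem fs = dir.getFileSystem(job);
        Collection<ARCResource> resources = new ArrayList<ARCResource>();
        for (FileStatus status : fs.listStatus(dir)) {
            if (status.getPath().getName().endsWith(".arc.gz")) {
                // Assumes ARCResource has a (name, size) constructor; adjust to
                // the actual ARCResource API if it differs.
                resources.add(new ARCResource(status.getPath().toString(), status.getLen()));
            }
        }
        return resources;
    }
}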
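
The static setters control how getARCSplits groups resources: a split is closed as soon as it holds filesPerSplit files or its total size crosses the megabyte threshold, whichever happens first. A minimal configuration sketch (the driver class name is hypothetical):

import org.apache.hadoop.mapred.JobConf;
import org.commoncrawl.hadoop.io.ARCSplitCalculator;

public class ARCSplitConfigExample {
    public static void main(String[] args) {
        JobConf job = new JobConf();

        // Pack up to ten ARC files into each split...
        ARCSplitCalculator.setFilesPerSplit(job, 10);
        // ...but close a split early once its total size reaches about 512 MB.
        ARCSplitCalculator.setMegabytesPerSplit(job, 512);

        // configure() will then read filesPerSplit = 10 and
        // bytesPerSplit = 512 * 1024 * 1024 from these properties.
    }
}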