com.inmobi.conduit.distcp.tools.mapred.lib.DynamicInputFormat.java Source code

Introduction

Here is the source code for com.inmobi.conduit.distcp.tools.mapred.lib.DynamicInputFormat.java.

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.inmobi.conduit.distcp.tools.mapred.lib;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.FileStatus;

import com.inmobi.conduit.distcp.tools.DistCpConstants;
import com.inmobi.conduit.distcp.tools.util.DistCpUtils;
import com.inmobi.conduit.distcp.tools.util.HadoopCompat;

import java.util.List;
import java.util.ArrayList;
import java.io.IOException;

/**
 * DynamicInputFormat implements the "Worker pattern" for DistCp.
 * Rather than splitting the copy-list into a set of static splits,
 * DynamicInputFormat does the following:
 * 1. Splits the copy-list into small chunks on the DFS.
 * 2. Creates a set of empty "dynamic" splits, each of which consumes as many
 *    chunks as it can.
 * This arrangement ensures that a single slow mapper won't slow down the
 * entire job (the slack is picked up by other mappers, which consume more
 * chunks).
 * By varying the split-ratio, one can vary chunk sizes to achieve different
 * performance characteristics. 
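 *
 * For example (illustrative numbers only, not taken from the original
 * documentation): with 20 maps and a 10,000-entry copy-list, the default
 * split-ratio works out to 5, i.e. 100 chunks of 100 entries each; every map
 * is handed one chunk up front, and faster maps claim additional chunks as
 * they finish.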
 */
public class DynamicInputFormat<K, V> extends InputFormat<K, V> {
    private static final Log LOG = LogFactory.getLog(DynamicInputFormat.class);

    private static final String CONF_LABEL_LISTING_SPLIT_RATIO = "mapred.listing.split.ratio";
    private static final String CONF_LABEL_NUM_SPLITS = "mapred.num.splits";
    private static final String CONF_LABEL_NUM_ENTRIES_PER_CHUNK = "mapred.num.entries.per.chunk";

    /**
     * Implementation of InputFormat::getSplits(). This method splits up the
     * copy-listing file into chunks, and assigns the first batch to different
     * tasks.
     * @param jobContext JobContext for the map job.
     * @return The list of (empty) dynamic input-splits.
     * @throws IOException on failure.
     * @throws InterruptedException
     */
    @Override
    public List<InputSplit> getSplits(JobContext jobContext) throws IOException, InterruptedException {
        LOG.info("DynamicInputFormat: Getting splits for job:" + HadoopCompat.getJobId(jobContext));
        return createSplits(jobContext, splitCopyListingIntoChunksWithShuffle(jobContext));
    }

    private List<InputSplit> createSplits(JobContext jobContext, List<DynamicInputChunk> chunks)
            throws IOException {
        int numMaps = getNumMapTasks(HadoopCompat.getConfiguration(jobContext));

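        // Only the first min(numMaps, #chunks) chunks are pre-assigned to tasks here;
        // the remaining chunks are picked up dynamically by maps that finish early.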
        final int nSplits = Math.min(numMaps, chunks.size());
        List<InputSplit> splits = new ArrayList<InputSplit>(nSplits);

        for (int i = 0; i < nSplits; ++i) {
            TaskID taskId = new TaskID(HadoopCompat.getJobId(jobContext), true, i);
            chunks.get(i).assignTo(taskId);
            splits.add(new FileSplit(chunks.get(i).getPath(), 0, 0, null));
        }
        DistCpUtils.publish(HadoopCompat.getConfiguration(jobContext), CONF_LABEL_NUM_SPLITS, splits.size());
        return splits;
    }

    private static final int N_CHUNKS_OPEN_AT_ONCE_DEFAULT = 16;

    private List<DynamicInputChunk> splitCopyListingIntoChunksWithShuffle(JobContext context) throws IOException {

        final Configuration configuration = HadoopCompat.getConfiguration(context);
        int numRecords = getNumberOfRecords(configuration);
        int numMaps = getNumMapTasks(configuration);
        // Number of chunks each map will process, on average.
        int splitRatio = getListingSplitRatio(configuration, numMaps, numRecords);
        validateNumChunksUsing(splitRatio, numMaps);

        int numEntriesPerChunk = (int) Math.ceil((float) numRecords / (splitRatio * numMaps));
        DistCpUtils.publish(configuration, CONF_LABEL_NUM_ENTRIES_PER_CHUNK, numEntriesPerChunk);

        final int nChunksTotal = (int) Math.ceil((float) numRecords / numEntriesPerChunk);
        int nChunksOpenAtOnce = Math.min(N_CHUNKS_OPEN_AT_ONCE_DEFAULT, nChunksTotal);

        Path listingPath = getListingFilePath(configuration);
        SequenceFile.Reader reader = new SequenceFile.Reader(listingPath.getFileSystem(configuration), listingPath,
                configuration);

        List<DynamicInputChunk> openChunks = new ArrayList<DynamicInputChunk>();
        List<DynamicInputChunk> chunksFinal = new ArrayList<DynamicInputChunk>();
        FileStatus fileStatus = new FileStatus();
        Text relPath = new Text();
        int recordCounter = 0;
        int chunkCount = 0;
        DynamicInputChunkSet chunkSet = new DynamicInputChunkSet(configuration);

        try {

            while (reader.next(relPath, fileStatus)) {
                if (recordCounter % (nChunksOpenAtOnce * numEntriesPerChunk) == 0) {
                    // All open chunks are full. Create the next batch of chunks.
                    closeAll(openChunks);
                    chunksFinal.addAll(openChunks);

                    openChunks = createChunks(chunkSet, chunkCount, nChunksTotal, nChunksOpenAtOnce);

                    chunkCount += openChunks.size();

                    nChunksOpenAtOnce = openChunks.size();
                    recordCounter = 0;
                }

                // Shuffle records round-robin across the currently open chunks.
                openChunks.get(recordCounter % nChunksOpenAtOnce).write(relPath, fileStatus);
                ++recordCounter;
            }

        } finally {
            closeAll(openChunks);
            chunksFinal.addAll(openChunks);
            IOUtils.closeStream(reader);
        }

        LOG.info("Number of dynamic-chunk-files created: " + chunksFinal.size());
        return chunksFinal;
    }

    private static void validateNumChunksUsing(int splitRatio, int numMaps) throws IOException {
        if (splitRatio * numMaps > MAX_CHUNKS_TOLERABLE)
            throw new IOException("Too many chunks created with splitRatio:" + splitRatio + ", numMaps:" + numMaps
                    + ". Reduce numMaps or decrease split-ratio to proceed.");
    }

    private static void closeAll(List<DynamicInputChunk> chunks) throws IOException {
        for (DynamicInputChunk chunk : chunks)
            chunk.close();
    }

    private static List<DynamicInputChunk> createChunks(DynamicInputChunkSet chunkSet, int chunkCount,
            int nChunksTotal, int nChunksOpenAtOnce) throws IOException {
        List<DynamicInputChunk> chunks = new ArrayList<DynamicInputChunk>();
        int chunkIdUpperBound = Math.min(nChunksTotal, chunkCount + nChunksOpenAtOnce);

        // If there will be fewer than nChunksOpenAtOnce chunks left after
        // the current batch of chunks, fold the remaining chunks into
        // the current batch.
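        // For example (illustrative numbers): with 100 total chunks and batches of
        // 16, the final pass would otherwise leave only 4 chunks, so those 4 are
        // folded in and the last batch opens 20 chunks instead of 16.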
        if (nChunksTotal - chunkIdUpperBound < nChunksOpenAtOnce)
            chunkIdUpperBound = nChunksTotal;

        for (int i = chunkCount; i < chunkIdUpperBound; ++i)
            chunks.add(createChunk(i, chunkSet));
        return chunks;
    }

    private static DynamicInputChunk createChunk(int chunkId, DynamicInputChunkSet chunkSet) throws IOException {
        return chunkSet.createChunkForWrite(String.format("%05d", chunkId));
    }

    private static Path getListingFilePath(Configuration configuration) {
        String listingFilePathString = configuration.get(DistCpConstants.CONF_LABEL_LISTING_FILE_PATH, "");

        assert !listingFilePathString.equals("") : "Listing file not found.";

        Path listingFilePath = new Path(listingFilePathString);
        try {
            assert listingFilePath.getFileSystem(configuration).exists(listingFilePath) : "Listing file: "
                    + listingFilePath + " not found.";
        } catch (IOException e) {
            assert false : "Listing file: " + listingFilePath + " couldn't be accessed. " + e.getMessage();
        }
        return listingFilePath;
    }

    private static int getNumberOfRecords(Configuration configuration) {
        return DistCpUtils.getInt(configuration, DistCpConstants.CONF_LABEL_TOTAL_NUMBER_OF_RECORDS);
    }

    private static int getNumMapTasks(Configuration configuration) {
        return DistCpUtils.getInt(configuration, DistCpConstants.CONF_LABEL_NUM_MAPS);
    }

    private static int getListingSplitRatio(Configuration configuration, int numMaps, int numPaths) {
        return configuration.getInt(CONF_LABEL_LISTING_SPLIT_RATIO, getSplitRatio(numMaps, numPaths));
    }

    private static final int MAX_CHUNKS_TOLERABLE = 400;
    private static final int MAX_CHUNKS_IDEAL = 100;
    private static final int MIN_RECORDS_PER_CHUNK = 5;
    private static final int SPLIT_RATIO_DEFAULT = 2;

    /**
     * Package private, for testability.
     * @param nMaps The number of maps requested.
     * @param nRecords The number of records to be copied.
     * @return The ideal number of chunks each map should handle (the split-ratio).
     */
    static int getSplitRatio(int nMaps, int nRecords) {
        if (nMaps == 1) {
            LOG.warn("nMaps == 1. Why use DynamicInputFormat?");
            return 1;
        }

        if (nMaps > MAX_CHUNKS_IDEAL)
            return SPLIT_RATIO_DEFAULT;

        int nPickups = (int) Math.ceil((float) MAX_CHUNKS_IDEAL / nMaps);
        int nRecordsPerChunk = (int) Math.ceil((float) nRecords / (nMaps * nPickups));

        return nRecordsPerChunk < MIN_RECORDS_PER_CHUNK ? SPLIT_RATIO_DEFAULT : nPickups;
    }

    private int getNumEntriesPerChunk(Configuration configuration) {
        return DistCpUtils.getInt(configuration, CONF_LABEL_NUM_ENTRIES_PER_CHUNK);
    }

    /**
     * Implementation of InputFormat::createRecordReader().
     * @param inputSplit The split for which the RecordReader is required.
     * @param taskAttemptContext TaskAttemptContext for the current attempt.
     * @return DynamicRecordReader instance.
     * @throws IOException on failure.
     * @throws InterruptedException
     */
    @Override
    public RecordReader<K, V> createRecordReader(InputSplit inputSplit, TaskAttemptContext taskAttemptContext)
            throws IOException, InterruptedException {
        return new DynamicRecordReader<K, V>(
                getNumEntriesPerChunk(HadoopCompat.getTaskConfiguration(taskAttemptContext)));
    }
}
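
Example

The split-ratio helper is deliberately package-private "for testability", so the quickest way to see how the chunking arithmetic behaves is a small driver in the same package. The sketch below is not part of the original source: the class name SplitRatioDemo and the sample numbers are illustrative assumptions, and the expected values follow from the constants defined above (MAX_CHUNKS_IDEAL = 100, MIN_RECORDS_PER_CHUNK = 5, SPLIT_RATIO_DEFAULT = 2).

package com.inmobi.conduit.distcp.tools.mapred.lib;

// Hypothetical demo class (not part of the original source). It must live in
// this package to reach the package-private DynamicInputFormat.getSplitRatio().
public class SplitRatioDemo {
    public static void main(String[] args) {
        // 20 maps, 10,000 records: nPickups = ceil(100/20) = 5 and
        // nRecordsPerChunk = ceil(10000 / (20 * 5)) = 100 >= 5, so the ratio is 5.
        System.out.println(DynamicInputFormat.getSplitRatio(20, 10000));  // 5

        // More maps than MAX_CHUNKS_IDEAL (100): fall back to the default ratio.
        System.out.println(DynamicInputFormat.getSplitRatio(150, 10000)); // 2

        // Fewer than MIN_RECORDS_PER_CHUNK records per chunk: default ratio again.
        System.out.println(DynamicInputFormat.getSplitRatio(20, 50));     // 2

        // A single map gains nothing from dynamic chunking; logs a warning, returns 1.
        System.out.println(DynamicInputFormat.getSplitRatio(1, 10000));   // 1
    }
}

In an actual job the ratio is normally left to this computed default, or overridden through the mapred.listing.split.ratio property, which getListingSplitRatio() consults before falling back to getSplitRatio().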