com.inmobi.conduit.distcp.tools.mapred.TestUniformSizeInputFormat.java Source code

Introduction

Here is the source code for com.inmobi.conduit.distcp.tools.mapred.TestUniformSizeInputFormat.java, a unit test that checks how DistCp's UniformSizeInputFormat divides a copy listing into input splits of roughly equal total byte size.

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.inmobi.conduit.distcp.tools.mapred;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.security.Credentials;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.junit.AfterClass;
import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.Test;
import org.mockito.Mockito;

import com.inmobi.conduit.distcp.tools.CopyListing;
import com.inmobi.conduit.distcp.tools.DistCpOptions;
import com.inmobi.conduit.distcp.tools.mapred.UniformSizeInputFormat;

import java.io.DataOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;

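// Tests for UniformSizeInputFormat: verifies that the DistCp copy listing is
// divided into input splits of roughly equal total byte size, using files
// written to a single-node MiniDFSCluster.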
public class TestUniformSizeInputFormat {
    private static final Log LOG = LogFactory.getLog(TestUniformSizeInputFormat.class);

    private static MiniDFSCluster cluster;
    private static final int N_FILES = 20;
    private static final int SIZEOF_EACH_FILE = 1024;
    private static final Random random = new Random();
    private static int totalFileSize = 0;

    private static final Credentials CREDENTIALS = new Credentials();

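    // Starts a one-datanode MiniDFSCluster and seeds /tmp/source with N_FILES
    // files of randomized size, accumulating the total number of bytes written.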
    @BeforeClass
    public static void setup() throws Exception {
        Configuration conf = new Configuration();
        conf.set("dfs.datanode.max.xcievers", "512");
        cluster = new MiniDFSCluster(conf, 1, true, null);
        totalFileSize = 0;

        for (int i = 0; i < N_FILES; ++i)
            totalFileSize += createFile("/tmp/source/" + String.valueOf(i), SIZEOF_EACH_FILE);
    }

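    // Builds DistCpOptions for copying /tmp/source to /tmp/target with at most
    // nMaps map tasks.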
    private static DistCpOptions getOptions(int nMaps) throws Exception {
        Path sourcePath = new Path(cluster.getFileSystem().getUri().toString() + "/tmp/source");
        Path targetPath = new Path(cluster.getFileSystem().getUri().toString() + "/tmp/target");

        List<Path> sourceList = new ArrayList<Path>();
        sourceList.add(sourcePath);
        final DistCpOptions distCpOptions = new DistCpOptions(sourceList, targetPath);
        distCpOptions.setMaxMaps(nMaps);
        return distCpOptions;
    }

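    // Writes a file of between fileSize and 2 * fileSize bytes (randomized)
    // and returns the number of bytes actually written.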
    private static int createFile(String path, int fileSize) throws Exception {
        FileSystem fileSystem = null;
        DataOutputStream outputStream = null;
        try {
            fileSystem = cluster.getFileSystem();
            outputStream = fileSystem.create(new Path(path), true, 0);
            int size = (int) Math.ceil(fileSize + (1 - random.nextFloat()) * fileSize);
            outputStream.write(new byte[size]);
            return size;
        } finally {
            IOUtils.cleanup(null, fileSystem, outputStream);
        }
    }

    @AfterClass
    public static void tearDown() {
        cluster.shutdown();
    }

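    // Builds a copy listing for the given number of maps, asks
    // UniformSizeInputFormat for its splits, and verifies that consecutive
    // splits differ in size by less than 10% of the ideal size per map and
    // that the split sizes add up to the total size of the source files.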
    public void testGetSplits(int nMaps) throws Exception {
        DistCpOptions options = getOptions(nMaps);
        Configuration configuration = new Configuration();
        configuration.set("mapred.map.tasks", String.valueOf(options.getMaxMaps()));
        Path listFile = new Path(cluster.getFileSystem().getUri().toString() + "/tmp/testGetSplits_1/fileList.seq");
        CopyListing.getCopyListing(configuration, CREDENTIALS, options).buildListing(listFile, options);

        JobContext jobContext = Mockito.mock(JobContext.class);
        Mockito.when(jobContext.getConfiguration()).thenReturn(configuration);
        Mockito.when(jobContext.getJobID()).thenReturn(new JobID());
        UniformSizeInputFormat uniformSizeInputFormat = new UniformSizeInputFormat();
        List<InputSplit> splits = uniformSizeInputFormat.getSplits(jobContext);

        // The legacy per-map size check was removed here; refer to HADOOP-9230.
        int sizePerMap = totalFileSize / nMaps;

        checkSplits(listFile, splits);

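        // Re-read each split through its RecordReader and recompute the split
        // size from the lengths of the underlying source files.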
        int doubleCheckedTotalSize = 0;
        int previousSplitSize = -1;
        for (int i = 0; i < splits.size(); ++i) {
            InputSplit split = splits.get(i);
            int currentSplitSize = 0;
            TaskAttemptID taskId = new TaskAttemptID("", 0, true, 0, 0);
            final TaskAttemptContext taskAttemptContext = Mockito.mock(TaskAttemptContext.class);
            Mockito.when(taskAttemptContext.getConfiguration()).thenReturn(configuration);
            Mockito.when(taskAttemptContext.getTaskAttemptID()).thenReturn(taskId);
            RecordReader<Text, FileStatus> recordReader = uniformSizeInputFormat.createRecordReader(split,
                    taskAttemptContext);
            recordReader.initialize(split, taskAttemptContext);
            while (recordReader.nextKeyValue()) {
                Path sourcePath = recordReader.getCurrentValue().getPath();
                FileSystem fs = sourcePath.getFileSystem(configuration);
                FileStatus[] fileStatus = fs.listStatus(sourcePath);
                Assert.assertEquals(1, fileStatus.length);
                currentSplitSize += fileStatus[0].getLen();
            }
            Assert.assertTrue(previousSplitSize == -1
                    || Math.abs(currentSplitSize - previousSplitSize) < 0.1 * sizePerMap || i == splits.size() - 1);

            doubleCheckedTotalSize += currentSplitSize;
        }

        Assert.assertEquals(totalFileSize, doubleCheckedTotalSize);
    }

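    // Verifies that the splits tile the listing file contiguously and that
    // nothing in the listing remains beyond the end of the last split.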
    private void checkSplits(Path listFile, List<InputSplit> splits) throws IOException {
        long lastEnd = 0;

        //Verify that each split starts exactly where the previous one ended,
        //so that no part of the listing file is skipped
        for (InputSplit split : splits) {
            FileSplit fileSplit = (FileSplit) split;
            long start = fileSplit.getStart();
            Assert.assertEquals(lastEnd, start);
            lastEnd = start + fileSplit.getLength();
        }

        //Verify there is nothing more to read from the input file
        SequenceFile.Reader reader = new SequenceFile.Reader(cluster.getFileSystem(), listFile,
                cluster.getFileSystem().getConf());
        try {
            reader.seek(lastEnd);
            FileStatus srcFileStatus = new FileStatus();
            Text srcRelPath = new Text();
            Assert.assertFalse(reader.next(srcRelPath, srcFileStatus));
        } finally {
            IOUtils.closeStream(reader);
        }
    }

    //@Test
    public void testGetSplits() throws Exception {
        testGetSplits(9);
        for (int i = 1; i < N_FILES; ++i)
            testGetSplits(i);
    }
}