com.flipkart.fdp.migration.distcp.core.MirrorFileInputFormat.java Source code

Java tutorial

Introduction

Here is the source code for com.flipkart.fdp.migration.distcp.core.MirrorFileInputFormat.java

Source

/*
 *
 *  Copyright 2015 Flipkart Internet Pvt. Ltd.
 *
 *     Licensed under the Apache License, Version 2.0 (the "License");
 *     you may not use this file except in compliance with the License.
 *     You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 *     Unless required by applicable law or agreed to in writing, software
 *     distributed under the License is distributed on an "AS IS" BASIS,
 *     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *     See the License for the specific language governing permissions and
 *     limitations under the License.
 *
 */

package com.flipkart.fdp.migration.distcp.core;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

import com.flipkart.fdp.migration.distcp.codec.DCMCodec;
import com.flipkart.fdp.migration.distcp.codec.DCMCodecFactory;
import com.flipkart.fdp.migration.distcp.codec.optimizer.WorkloadOptimizer;
import com.flipkart.fdp.migration.distcp.config.DCMConfig;
import com.flipkart.fdp.migration.distcp.config.DCMConstants.Status;
import com.flipkart.fdp.migration.distcp.state.StateManager;
import com.flipkart.fdp.migration.distcp.state.StateManagerFactory;
import com.flipkart.fdp.migration.distcp.state.TransferStatus;
import com.flipkart.fdp.migration.filter.FilterType;
import com.flipkart.fdp.migration.vo.FileTuple;
import com.flipkart.fdp.optimizer.OptimTuple;

/**
 * {@link InputFormat} that plans the work of a mirror (copy) job.
 *
 * <p>
 * {@link #getSplits(JobContext)} lists the candidate files through the source
 * {@link DCMCodec}, drops the ones rejected by the configured filters or
 * already recorded as completed by the {@link StateManager}, and hands the
 * remaining files to a {@link WorkloadOptimizer} which groups them into
 * {@link InputSplit}s.
 *
 * <p>
 * The instance fields below are (re)assigned on every call to
 * {@link #getSplits(JobContext)}; {@code dcmConfig} is additionally read by
 * the private filtering helper.
 */
public class MirrorFileInputFormat extends InputFormat<Text, Text> {

    /** Configuration key constant; not referenced inside this class. */
    public static final String DCM_CONFIG = "dcm_config";
    /** Configuration key under which the inclusion file list is stored. */
    public static final String INCLUDE_FILES = "include_files";
    /** Configuration key under which the exclusion file list is stored. */
    public static final String EXCLUDE_FILES = "exclude_files";

    private DCMCodec dcmInCodec = null;
    private Configuration conf = null;
    private DCMConfig dcmConfig = null;
    private StateManager stateManager = null;

    /**
     * Computes the input splits for the job.
     *
     * @param context job context supplying the Hadoop configuration
     * @return the splits produced by the workload optimizer, ordered by
     *         descending length
     * @throws IOException          if an {@link IOException} occurs while
     *                              planning, or wrapping any other failure
     * @throws InterruptedException if planning is interrupted
     */
    @Override
    public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {

        System.out.println("Calculating Job Splits...");

        conf = context.getConfiguration();
        dcmConfig = MirrorUtils.getConfigFromConf(conf);

        Set<String> excludeList = getExclusionsFileList(conf);
        Set<String> includeList = getInclusionFileList(conf);

        HashMap<String, FileTuple> inputFileMap = new HashMap<String, FileTuple>();
        Map<String, TransferStatus> previousState = null;

        List<FileTuple> fileTuples = null;
        List<InputSplit> splits = new ArrayList<InputSplit>();
        // TODO: per the original author's note, OptimTuple does not override
        // equals/hashCode, so this HashSet compares tuples by identity.
        Set<OptimTuple> locations = new HashSet<OptimTuple>();

        // Sum of the sizes of all files accepted for transfer (reported below).
        long totalBatchSize = 0;

        try {

            System.out.println("Scanning source location...");

            dcmInCodec = DCMCodecFactory.getCodec(conf, dcmConfig.getSourceConfig().getDefaultConnectionConfig());

            // When an include list is given it replaces the configured source
            // path; the exclude list is applied in both cases.
            if (includeList != null && includeList.size() > 0) {
                fileTuples = dcmInCodec.getInputPaths(includeList, excludeList);
            } else {
                fileTuples = dcmInCodec.getInputPaths(dcmConfig.getSourceConfig().getPath(), excludeList);
            }

            stateManager = StateManagerFactory.getStateManager(conf, dcmConfig);

            System.out.println("Fetching previous transfer states from StateManager...");
            previousState = stateManager.getPreviousTransferStatus();

            System.out.println("Filtering Input File Set based on User defined filters.");
            for (FileTuple fileTuple : fileTuples) {

                if (!ignoreFile(fileTuple, excludeList, previousState)) {

                    locations.add(new OptimTuple(fileTuple.getFileName(), fileTuple.getSize()));
                    inputFileMap.put(fileTuple.getFileName(), fileTuple);
                    totalBatchSize += fileTuple.getSize();
                }
            }

            System.out.println("Optimizing Splits...");

            WorkloadOptimizer optimizer = DCMCodecFactory
                    .getCodecWorkloadOptimizer(dcmConfig.getSinkConfig().getDefaultConnectionConfig());
            splits.addAll(optimizer.optimizeWorkload(dcmConfig, locations, inputFileMap));

            sortSplits(splits);
            System.out.println(
                    "Total input paths to process: " + locations.size() + ", Total input splits: " + splits.size());
            System.out.println("Total Data to Transfer: " + totalBatchSize);

            stateManager.savePreviousTransferStatus(previousState);
        } catch (Exception e) {
            // Both of these are declared by this method: propagate them
            // unchanged instead of hiding them inside a new IOException.
            if (e instanceof InterruptedException) {
                throw (InterruptedException) e;
            }
            if (e instanceof IOException) {
                throw (IOException) e;
            }
            throw new IOException(e);
        }

        System.out.println("Done Calculating splits...");
        return splits;
    }

    /**
     * Sorts the given splits in place by descending {@link InputSplit#getLength()}.
     *
     * @param splits the list to sort
     */
    private void sortSplits(List<InputSplit> splits) {
        Collections.sort(splits, new Comparator<InputSplit>() {
            @Override
            public int compare(InputSplit f0, InputSplit f1) {
                try {
                    // Read each length once; getLength() may throw.
                    long length0 = f0.getLength();
                    long length1 = f1.getLength();
                    if (length1 > length0) {
                        return 1;
                    }
                    if (length1 < length0) {
                        return -1;
                    }
                    return 0;
                } catch (Exception ignored) {
                    // Best effort: ordering is not essential, so a split whose
                    // length cannot be read is treated as equal to the other.
                    return 0;
                }
            }
        });
    }

    /**
     * Decides whether a source file should be left out of the current run.
     *
     * <p>
     * A file is ignored when any {@link FilterType} filter matches it, or when
     * the previous state records it as {@code COMPLETED} - unless updated files
     * are included by the source configuration and the file's timestamp is
     * newer than the recorded one.
     *
     * @param fileTuple     the candidate file
     * @param excludeList   exclusion list (not consulted by this method)
     * @param previousState previous transfer status keyed by file name; a
     *                      {@code null} map is treated as empty
     * @return {@code true} if the file must be skipped
     */
    private boolean ignoreFile(FileTuple fileTuple, Set<String> excludeList,
            Map<String, TransferStatus> previousState) {

        for (FilterType filterType : FilterType.values()) {
            if (filterType.getFilter().doFilter(dcmConfig, fileTuple)) {
                return true;
            }
        }

        if (previousState == null) {
            return false;
        }

        TransferStatus details = previousState.get(fileTuple.getFileName());
        if (details == null || details.getStatus() != Status.COMPLETED) {
            // Never seen, or not completed: transfer it.
            return false;
        }

        // Completed earlier: transfer again only if updated files are included
        // and the source timestamp is newer than the recorded one.
        boolean updatedSinceTransfer = dcmConfig.getSourceConfig().isIncludeUpdatedFiles()
                && details.getTs() < fileTuple.getTs();
        return !updatedSinceTransfer;
    }

    /**
     * Creates the record reader used for every split of this input format.
     *
     * @param arg0 the split to read (not used here)
     * @param arg1 the task attempt context (not used here)
     * @return a new {@code MirrorFileRecordReader}
     */
    @Override
    public RecordReader<Text, Text> createRecordReader(InputSplit arg0, TaskAttemptContext arg1)
            throws IOException, InterruptedException {
        return new MirrorFileRecordReader();
    }

    /**
     * Stores the exclusion file list in the configuration under
     * {@link #EXCLUDE_FILES}.
     *
     * @param conf  configuration to update
     * @param files files to exclude
     */
    public static void setExclusionsFileList(Configuration conf, Collection<String> files) {
        conf.set(MirrorFileInputFormat.EXCLUDE_FILES, MirrorUtils.getListAsString(files));
    }

    /**
     * Stores the inclusion file list in the configuration under
     * {@link #INCLUDE_FILES}.
     *
     * @param conf  configuration to update
     * @param files files to include
     */
    public static void setInclusionFileList(Configuration conf, Collection<String> files) {
        conf.set(MirrorFileInputFormat.INCLUDE_FILES, MirrorUtils.getListAsString(files));
    }

    /**
     * Reads the exclusion file list stored under {@link #EXCLUDE_FILES}.
     *
     * @param conf configuration to read
     * @return the value as parsed by {@code MirrorUtils.getStringAsLists}
     */
    public static Set<String> getExclusionsFileList(Configuration conf) {
        return MirrorUtils.getStringAsLists(conf.get(EXCLUDE_FILES));
    }

    /**
     * Reads the inclusion file list stored under {@link #INCLUDE_FILES}.
     *
     * @param conf configuration to read
     * @return the value as parsed by {@code MirrorUtils.getStringAsLists}
     */
    public static Set<String> getInclusionFileList(Configuration conf) {
        return MirrorUtils.getStringAsLists(conf.get(INCLUDE_FILES));
    }

}