com.datatorrent.lib.io.fs.HDFSFileMerger.java Source code

Java tutorial

Introduction

Here is the source code for com.datatorrent.lib.io.fs.HDFSFileMerger.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package com.datatorrent.lib.io.fs;

import java.io.IOException;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import com.google.common.annotations.VisibleForTesting;

import com.datatorrent.api.Context.OperatorContext;
import com.datatorrent.lib.io.fs.Synchronizer.OutputFileMetadata;

/**
 * HDFS file merger extends file merger to optimize for HDFS file copy usecase.
 * This uses fast merge from HDFS if destination filesystem is same as
 * application filesystem.
 *
 * @since 3.4.0
 */
@org.apache.hadoop.classification.InterfaceStability.Evolving
public class HDFSFileMerger extends FileMerger {
    /**
     * Fast merge is possible if append is allowed for output file system.
     */
    private transient boolean fastMergeActive;
    /**
     * Default block size for output file system
     */
    private transient long defaultBlockSize;
    /**
     * Decision maker to enable fast merge based on blocks directory, application
     * directory, block size
     */
    private transient FastMergerDecisionMaker fastMergerDecisionMaker;

    /**
     * Initializations based on output file system configuration
     */
    @Override
    public void setup(OperatorContext context) {
        super.setup(context);
        fastMergeActive = outputFS.getConf().getBoolean("dfs.support.append", true)
                && appFS.getUri().equals(outputFS.getUri());
        LOG.debug("appFS.getUri():{}", appFS.getUri());
        LOG.debug("outputFS.getUri():{}", outputFS.getUri());
        defaultBlockSize = outputFS.getDefaultBlockSize(new Path(filePath));
        fastMergerDecisionMaker = new FastMergerDecisionMaker(blocksDirectoryPath, appFS, defaultBlockSize);
    }

    /**
     * Uses fast merge if possible. Else, fall back to serial merge.
     */
    @Override
    protected void mergeBlocks(OutputFileMetadata outputFileMetadata) throws IOException {

        try {
            LOG.debug("fastMergeActive: {}", fastMergeActive);
            if (fastMergeActive && fastMergerDecisionMaker.isFastMergePossible(outputFileMetadata)
                    && outputFileMetadata.getNumberOfBlocks() > 0) {
                LOG.debug("Using fast merge on HDFS.");
                concatBlocks(outputFileMetadata);
                return;
            }
            LOG.debug("Falling back to slow merge on HDFS.");
            super.mergeBlocks(outputFileMetadata);

        } catch (BlockNotFoundException e) {
            if (recover(outputFileMetadata)) {
                LOG.debug("Recovery attempt successful.");
                successfulFiles.add(outputFileMetadata);
            } else {
                failedFiles.add(outputFileMetadata);
            }
        }
    }

    /**
     * Fast merge using HDFS block concat
     * 
     * @param outputFileMetadata
     * @throws IOException
     */
    private void concatBlocks(OutputFileMetadata outputFileMetadata) throws IOException {
        Path outputFilePath = new Path(filePath, outputFileMetadata.getRelativePath());

        int numBlocks = outputFileMetadata.getNumberOfBlocks();
        long[] blocksArray = outputFileMetadata.getBlockIds();

        Path firstBlock = new Path(blocksDirectoryPath, Long.toString(blocksArray[0]));
        if (numBlocks > 1) {
            Path[] blockFiles = new Path[numBlocks - 1]; // Leave the first block

            for (int index = 1; index < numBlocks; index++) {
                blockFiles[index - 1] = new Path(blocksDirectoryPath, Long.toString(blocksArray[index]));
            }

            outputFS.concat(firstBlock, blockFiles);
        }

        moveToFinalFile(firstBlock, outputFilePath);
    }

    /**
     * Attempt for recovery if block concat is successful but temp file is not
     * moved to final file
     * 
     * @param outputFileMetadata
     * @throws IOException
     */
    @VisibleForTesting
    protected boolean recover(OutputFileMetadata outputFileMetadata) throws IOException {
        Path firstBlockPath = new Path(blocksDirectoryPath + Path.SEPARATOR + outputFileMetadata.getBlockIds()[0]);
        Path outputFilePath = new Path(filePath, outputFileMetadata.getRelativePath());
        if (appFS.exists(firstBlockPath)) {
            FileStatus status = appFS.getFileStatus(firstBlockPath);
            if (status.getLen() == outputFileMetadata.getFileLength()) {
                moveToFinalFile(firstBlockPath, outputFilePath);
                return true;
            }
            LOG.error("Unable to recover in FileMerger for file: {}", outputFilePath);
            return false;
        }

        if (outputFS.exists(outputFilePath)) {
            LOG.debug("Output file already present at the destination, nothing to recover.");
            return true;
        }
        LOG.error("Unable to recover in FileMerger for file: {}", outputFilePath);
        return false;
    }

    private static final Logger LOG = LoggerFactory.getLogger(HDFSFileMerger.class);

    /**
     * Utility class to decide fast merge possibility
     */
    public static class FastMergerDecisionMaker {

        private String blocksDir;
        private FileSystem appFS;
        private long defaultBlockSize;

        public FastMergerDecisionMaker(String blocksDir, FileSystem appFS, long defaultBlockSize) {
            this.blocksDir = blocksDir;
            this.appFS = appFS;
            this.defaultBlockSize = defaultBlockSize;
        }

        /**
         * Checks if fast merge is possible for given settings for blocks directory,
         * application file system, block size
         * 
         * @param outputFileMetadata
         * @throws IOException
         * @throws BlockNotFoundException
         */
        public boolean isFastMergePossible(OutputFileMetadata outputFileMetadata)
                throws IOException, BlockNotFoundException {
            short replicationFactor = 0;
            boolean sameReplicationFactor = true;
            boolean multipleOfBlockSize = true;

            int numBlocks = outputFileMetadata.getNumberOfBlocks();
            LOG.debug("fileMetadata.getNumberOfBlocks(): {}", outputFileMetadata.getNumberOfBlocks());
            long[] blocksArray = outputFileMetadata.getBlockIds();
            LOG.debug("fileMetadata.getBlockIds().len: {}", outputFileMetadata.getBlockIds().length);

            for (int index = 0; index < numBlocks && (sameReplicationFactor && multipleOfBlockSize); index++) {
                Path blockFilePath = new Path(blocksDir + Path.SEPARATOR + blocksArray[index]);
                if (!appFS.exists(blockFilePath)) {
                    throw new BlockNotFoundException(blockFilePath);
                }
                FileStatus status = appFS.getFileStatus(new Path(blocksDir + Path.SEPARATOR + blocksArray[index]));
                if (index == 0) {
                    replicationFactor = status.getReplication();
                    LOG.debug("replicationFactor: {}", replicationFactor);
                } else {
                    sameReplicationFactor = (replicationFactor == status.getReplication());
                    LOG.debug("sameReplicationFactor: {}", sameReplicationFactor);
                }

                if (index != numBlocks - 1) {
                    multipleOfBlockSize = (status.getLen() % defaultBlockSize == 0);
                    LOG.debug("multipleOfBlockSize: {}", multipleOfBlockSize);
                }
            }
            return sameReplicationFactor && multipleOfBlockSize;
        }
    }

}