org.apache.hadoop.dfs.DataNode.java Source code

Introduction

Here is the source code for org.apache.hadoop.dfs.DataNode.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.dfs;

import org.apache.commons.logging.*;

import org.apache.hadoop.fs.ChecksumException;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.ipc.*;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.net.DNS;
import org.apache.hadoop.net.NetUtils;
import org.apache.hadoop.net.SocketOutputStream;
import org.apache.hadoop.util.*;
import org.apache.hadoop.util.DiskChecker.DiskErrorException;
import org.apache.hadoop.util.DiskChecker.DiskOutOfSpaceException;
import org.apache.hadoop.dfs.IncorrectVersionException;
import org.apache.hadoop.mapred.StatusHttpServer;
import org.apache.hadoop.dfs.BlockCommand;
import org.apache.hadoop.dfs.DatanodeProtocol;
import org.apache.hadoop.dfs.FSDatasetInterface.MetaDataInputStream;
import org.apache.hadoop.dfs.datanode.metrics.DataNodeMetrics;
import org.apache.hadoop.dfs.BlockMetadataHeader;

import java.io.*;
import java.net.*;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.channels.ServerSocketChannel;
import java.nio.channels.SocketChannel;
import java.util.*;
import java.util.concurrent.Semaphore;
import java.security.NoSuchAlgorithmException;
import java.security.SecureRandom;

/**********************************************************
 * DataNode is a class (and program) that stores a set of
 * blocks for a DFS deployment.  A single deployment can
 * have one or many DataNodes.  Each DataNode communicates
 * regularly with a single NameNode.  It also communicates
 * with client code and other DataNodes from time to time.
 *
 * DataNodes store a series of named blocks.  The DataNode
 * allows client code to read these blocks, or to write new
 * block data.  The DataNode may also, in response to instructions
 * from its NameNode, delete blocks or copy blocks to/from other
 * DataNodes.
 *
 * The DataNode maintains just one critical table:
 *   block-> stream of bytes (of BLOCK_SIZE or less)
 *
 * This info is stored on a local disk.  The DataNode
 * reports the table's contents to the NameNode upon startup
 * and every so often afterwards.
 *
 * DataNodes spend their lives in an endless loop of asking
 * the NameNode for something to do.  A NameNode cannot connect
 * to a DataNode directly; a NameNode simply returns values from
 * functions invoked by a DataNode.
 *
 * DataNodes maintain an open server socket so that client code 
 * or other DataNodes can read/write data.  The host/port for
 * this server is reported to the NameNode, which then sends that
 * information to clients or other DataNodes that might be interested.
 *
 **********************************************************/
public class DataNode extends Configured
        implements InterDatanodeProtocol, ClientDatanodeProtocol, FSConstants, Runnable {
    public static final Log LOG = LogFactory.getLog("org.apache.hadoop.dfs.DataNode");

    /**
     * Use {@link NetUtils#createSocketAddr(String)} instead.
     */
    @Deprecated
    public static InetSocketAddress createSocketAddr(String target) throws IOException {
        return NetUtils.createSocketAddr(target);
    }

    /**
     * Minimum buffer used while sending data to clients. Used only if
 * transferTo() is enabled. 64KB is not especially large; a larger buffer
 * might help, but it is unclear how much improvement it would bring.
     */
    private static final int MIN_BUFFER_WITH_TRANSFERTO = 64 * 1024;

    DatanodeProtocol namenode = null;
    FSDatasetInterface data = null;
    DatanodeRegistration dnRegistration = null;

    volatile boolean shouldRun = true;
    private LinkedList<Block> receivedBlockList = new LinkedList<Block>();
    private LinkedList<String> delHints = new LinkedList<String>();
    final static String EMPTY_DEL_HINT = "";
    int xmitsInProgress = 0;
    Daemon dataXceiveServer = null;
    ThreadGroup threadGroup = null;
    long blockReportInterval;
    //disallow sending a block report before being instructed to do so
    long lastBlockReport = Long.MAX_VALUE;
    boolean resetBlockReportTime = true;
    long initialBlockReportDelay = BLOCKREPORT_INITIAL_DELAY * 1000L;
    private boolean waitForFirstBlockReportRequest = false;
    long lastHeartbeat = 0;
    long heartBeatInterval;
    private DataStorage storage = null;
    private StatusHttpServer infoServer = null;
    private DataNodeMetrics myMetrics;
    private static InetSocketAddress nameNodeAddr;
    private InetSocketAddress selfAddr;
    private static DataNode datanodeObject = null;
    private Thread dataNodeThread = null;
    String machineName;
    private static String dnThreadName;
    private int socketTimeout;
    private int socketWriteTimeout = 0;
    private boolean transferToAllowed = true;
    private int writePacketSize = 0;

    DataBlockScanner blockScanner = null;
    Daemon blockScannerThread = null;

    private static final Random R = new Random();

    /**
     * Maximum number of concurrent xceivers per node.
     * Enforcing the limit is required to keep the data-node
     * from running out of memory.
     */
    private static final int MAX_XCEIVER_COUNT = 256;
    private int maxXceiverCount = MAX_XCEIVER_COUNT;

    /** A manager to make sure that cluster balancing does not
     * consume too many resources.
     *
     * It limits the number of block moves for balancing and
     * the total amount of bandwidth they can use.
     */
    private static class BlockBalanceThrottler extends Throttler {
        private int numThreads;

        /**Constructor
         *
         * @param bandwidth Total amount of bandwidth that can be used for balancing
         */
        private BlockBalanceThrottler(long bandwidth) {
            super(bandwidth);
            LOG.info("Balancing bandwith is " + bandwidth + " bytes/s");
        }

        /** Check if the block move can start.
         *
         * Return true if the thread quota is not exceeded and
         * the counter is incremented; false otherwise.
         */
        private synchronized boolean acquire() {
            if (numThreads >= Balancer.MAX_NUM_CONCURRENT_MOVES) {
                return false;
            }
            numThreads++;
            return true;
        }

        /** Mark that the move is completed. The thread counter is decremented. */
        private synchronized void release() {
            numThreads--;
        }
    }
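
    /* Illustrative sketch of how the throttler above is meant to be used:
     * an acquire/try/finally-release pattern, exactly as copyBlock() does
     * further below. doMove() here is a hypothetical stand-in for the
     * actual block transfer:
     *
     *   if (!balancingThrottler.acquire()) {
     *       // over the Balancer.MAX_NUM_CONCURRENT_MOVES quota; refuse the move
     *       sendResponse(s, (short) OP_STATUS_ERROR, socketWriteTimeout);
     *       return;
     *   }
     *   try {
     *       doMove(); // hypothetical: perform the throttled transfer
     *   } finally {
     *       balancingThrottler.release(); // always free the thread slot
     *   }
     */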

    private BlockBalanceThrottler balancingThrottler;

    /**
     * We need an estimate for block size to check if the disk partition has
     * enough space. For now we set it to be the default block size set
     * in the server side configuration, which is not ideal because the
     * default block size should be a client-side configuration.
     * A better solution is to include in the header the estimated block size,
     * i.e. either the actual block size or the default block size.
     */
    private long estimateBlockSize;

    // For InterDataNodeProtocol
    Server ipcServer;

    // Record all sockets opened for data transfer
    Map<Socket, Socket> childSockets = Collections.synchronizedMap(new HashMap<Socket, Socket>());

    /**
     * Current system time.
     * @return current time in msec.
     */
    static long now() {
        return System.currentTimeMillis();
    }

    /**
     * Create the DataNode given a configuration and an array of dataDirs.
     * 'dataDirs' is where the blocks are stored.
     */
    DataNode(Configuration conf, AbstractList<File> dataDirs) throws IOException {
        super(conf);
        datanodeObject = this;

        try {
            startDataNode(conf, dataDirs);
        } catch (IOException ie) {
            shutdown();
            throw ie;
        }
    }

    /**
     * This method starts the data node with the specified conf.
     * 
     * @param conf - the configuration;
     *  if conf's CONFIG_PROPERTY_SIMULATED property is set,
     *  then a simulated storage-based data node is created.
     * 
     * @param dataDirs - only for a non-simulated storage data node
     * @throws IOException
     */
    void startDataNode(Configuration conf, AbstractList<File> dataDirs) throws IOException {
        // use configured nameserver & interface to get local hostname
        if (conf.get("slave.host.name") != null) {
            machineName = conf.get("slave.host.name");
        }
        if (machineName == null) {
            machineName = DNS.getDefaultHost(conf.get("dfs.datanode.dns.interface", "default"),
                    conf.get("dfs.datanode.dns.nameserver", "default"));
        }
        InetSocketAddress nameNodeAddr = NameNode.getAddress(conf);

        this.estimateBlockSize = conf.getLong("dfs.block.size", DEFAULT_BLOCK_SIZE);
        this.socketTimeout = conf.getInt("dfs.socket.timeout", FSConstants.READ_TIMEOUT);
        this.socketWriteTimeout = conf.getInt("dfs.datanode.socket.write.timeout", FSConstants.WRITE_TIMEOUT);
        /* Based on results on different platforms, we might need set the default 
         * to false on some of them. */
        this.transferToAllowed = conf.getBoolean("dfs.datanode.transferTo.allowed", true);
        this.writePacketSize = conf.getInt("dfs.write.packet.size", 64 * 1024);
        String address = NetUtils.getServerAddress(conf, "dfs.datanode.bindAddress", "dfs.datanode.port",
                "dfs.datanode.address");
        InetSocketAddress socAddr = NetUtils.createSocketAddr(address);
        int tmpPort = socAddr.getPort();
        storage = new DataStorage();
        // construct registration
        this.dnRegistration = new DatanodeRegistration(machineName + ":" + tmpPort);

        // connect to name node
        this.namenode = (DatanodeProtocol) RPC.waitForProxy(DatanodeProtocol.class, DatanodeProtocol.versionID,
                nameNodeAddr, conf);
        // get version and id info from the name-node
        NamespaceInfo nsInfo = handshake();
        StartupOption startOpt = getStartupOption(conf);
        assert startOpt != null : "Startup option must be set.";

        boolean simulatedFSDataset = conf.getBoolean("dfs.datanode.simulateddatastorage", false);
        if (simulatedFSDataset) {
            setNewStorageID(dnRegistration);
            dnRegistration.storageInfo.layoutVersion = FSConstants.LAYOUT_VERSION;
            dnRegistration.storageInfo.namespaceID = nsInfo.namespaceID;
            // it would have been better to pass storage as a parameter to
            // constructor below - need to augment ReflectionUtils used below.
            conf.set("StorageId", dnRegistration.getStorageID());
            try {
                //Equivalent of following (can't do because Simulated is in test dir)
                //  this.data = new SimulatedFSDataset(conf);
                this.data = (FSDatasetInterface) ReflectionUtils
                        .newInstance(Class.forName("org.apache.hadoop.dfs.SimulatedFSDataset"), conf);
            } catch (ClassNotFoundException e) {
                throw new IOException(StringUtils.stringifyException(e));
            }
        } else { // real storage
            // read storage info, lock data dirs and transition fs state if necessary
            storage.recoverTransitionRead(nsInfo, dataDirs, startOpt);
            // adjust
            this.dnRegistration.setStorageInfo(storage);
            // initialize data node internal structure
            this.data = new FSDataset(storage, conf);
        }

        // find free port
        ServerSocket ss = (socketWriteTimeout > 0) ? ServerSocketChannel.open().socket() : new ServerSocket();
        Server.bind(ss, socAddr, 0);
        ss.setReceiveBufferSize(DEFAULT_DATA_SOCKET_SIZE);
        ss.setSoTimeout(conf.getInt("dfs.dataXceiver.timeoutInMS", 30000)); //30s
        // adjust machine name with the actual port
        tmpPort = ss.getLocalPort();
        selfAddr = new InetSocketAddress(ss.getInetAddress().getHostAddress(), tmpPort);
        this.dnRegistration.setName(machineName + ":" + tmpPort);
        LOG.info("Opened info server at " + tmpPort);

        this.maxXceiverCount = conf.getInt("dfs.datanode.max.xcievers", MAX_XCEIVER_COUNT);
        this.threadGroup = new ThreadGroup("dataXceiveServer");
        this.dataXceiveServer = new Daemon(threadGroup, new DataXceiveServer(ss));
        this.threadGroup.setDaemon(true); // auto destroy when empty

        this.blockReportInterval = conf.getLong("dfs.blockreport.intervalMsec", BLOCKREPORT_INTERVAL);
        this.initialBlockReportDelay = conf.getLong("dfs.blockreport.initialDelay", BLOCKREPORT_INITIAL_DELAY)
                * 1000L;
        if (this.initialBlockReportDelay >= blockReportInterval) {
            this.initialBlockReportDelay = 0;
            LOG.info("dfs.blockreport.initialDelay is greater than " + "dfs.blockreport.intervalMsec."
                    + " Setting initial delay to 0 msec:");
        }
        this.heartBeatInterval = conf.getLong("dfs.heartbeat.interval", HEARTBEAT_INTERVAL) * 1000L;
        DataNode.nameNodeAddr = nameNodeAddr;

        this.balancingThrottler = new BlockBalanceThrottler(
                conf.getLong("dfs.balance.bandwidthPerSec", 1024L * 1024));

        //initialize periodic block scanner
        String reason = null;
        if (conf.getInt("dfs.datanode.scan.period.hours", 0) < 0) {
            reason = "verification is turned off by configuration";
        } else if (!(data instanceof FSDataset)) {
            reason = "verifcation is supported only with FSDataset";
        }
        if (reason == null) {
            blockScanner = new DataBlockScanner(this, (FSDataset) data, conf);
        } else {
            LOG.info("Periodic Block Verification is disabled because " + reason + ".");
        }

        //create a servlet to serve full-file content
        String infoAddr = NetUtils.getServerAddress(conf, "dfs.datanode.info.bindAddress", "dfs.datanode.info.port",
                "dfs.datanode.http.address");
        InetSocketAddress infoSocAddr = NetUtils.createSocketAddr(infoAddr);
        String infoHost = infoSocAddr.getHostName();
        int tmpInfoPort = infoSocAddr.getPort();
        this.infoServer = new StatusHttpServer("datanode", infoHost, tmpInfoPort, tmpInfoPort == 0);
        InetSocketAddress secInfoSocAddr = NetUtils
                .createSocketAddr(conf.get("dfs.datanode.https.address", infoHost + ":" + 0));
        Configuration sslConf = new Configuration(conf);
        sslConf.addResource(conf.get("https.keystore.info.rsrc", "sslinfo.xml"));
        String keyloc = sslConf.get("https.keystore.location");
        if (null != keyloc) {
            this.infoServer.addSslListener(secInfoSocAddr, keyloc, sslConf.get("https.keystore.password", ""),
                    sslConf.get("https.keystore.keypassword", ""));
        }
        this.infoServer.addServlet(null, "/streamFile/*", StreamFile.class);
        this.infoServer.setAttribute("datanode.blockScanner", blockScanner);
        this.infoServer.addServlet(null, "/blockScannerReport", DataBlockScanner.Servlet.class);
        this.infoServer.start();
        // adjust info port
        this.dnRegistration.setInfoPort(this.infoServer.getPort());
        myMetrics = new DataNodeMetrics(conf, dnRegistration.getStorageID());

        //init ipc server
        InetSocketAddress ipcAddr = NetUtils.createSocketAddr(conf.get("dfs.datanode.ipc.address"));
        ipcServer = RPC.getServer(this, ipcAddr.getHostName(), ipcAddr.getPort(),
                conf.getInt("dfs.datanode.handler.count", 3), false, conf);
        ipcServer.start();
        dnRegistration.setIpcPort(ipcServer.getListenerAddress().getPort());

        LOG.info("dnRegistration = " + dnRegistration);
    }
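
    /* Illustrative sketch: the main configuration keys consulted by
     * startDataNode() above could be set programmatically before constructing
     * a DataNode. The values below are examples only, not recommended
     * settings:
     *
     *   Configuration conf = new Configuration();
     *   conf.setInt("dfs.socket.timeout", 60 * 1000);             // read timeout, msec
     *   conf.setInt("dfs.datanode.socket.write.timeout", 480000); // write timeout, msec
     *   conf.setBoolean("dfs.datanode.transferTo.allowed", true);
     *   conf.setInt("dfs.write.packet.size", 64 * 1024);
     *   conf.setInt("dfs.datanode.max.xcievers", 256);            // note: key is spelled this way upstream
     *   conf.setLong("dfs.blockreport.intervalMsec", 60L * 60 * 1000);
     *   conf.setLong("dfs.heartbeat.interval", 3);                // seconds
     *   conf.setLong("dfs.balance.bandwidthPerSec", 1024L * 1024);
     */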

    /**
     * Creates either NIO or regular depending on socketWriteTimeout.
     */
    private Socket newSocket() throws IOException {
        return (socketWriteTimeout > 0) ? SocketChannel.open().socket() : new Socket();
    }
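
    /* Note: when a write timeout is configured, the socket is created through
     * SocketChannel.open() rather than new Socket(); the assumption here is
     * that the timed output streams handed out by NetUtils.getOutputStream()
     * need the underlying channel in order to enforce write timeouts. */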

    private NamespaceInfo handshake() throws IOException {
        NamespaceInfo nsInfo = new NamespaceInfo();
        while (shouldRun) {
            try {
                nsInfo = namenode.versionRequest();
                break;
            } catch (SocketTimeoutException e) { // namenode is busy
                LOG.info("Problem connecting to server: " + getNameNodeAddr());
                try {
                    Thread.sleep(1000);
                } catch (InterruptedException ie) {
                }
            }
        }
        String errorMsg = null;
        // verify build version
        if (!nsInfo.getBuildVersion().equals(Storage.getBuildVersion())) {
            errorMsg = "Incompatible build versions: namenode BV = " + nsInfo.getBuildVersion() + "; datanode BV = "
                    + Storage.getBuildVersion();
            LOG.fatal(errorMsg);
            try {
                namenode.errorReport(dnRegistration, DatanodeProtocol.NOTIFY, errorMsg);
            } catch (SocketTimeoutException e) { // namenode is busy
                LOG.info("Problem connecting to server: " + getNameNodeAddr());
            }
            throw new IOException(errorMsg);
        }
        assert FSConstants.LAYOUT_VERSION == nsInfo
                .getLayoutVersion() : "Data-node and name-node layout versions must be the same. " + "Expected: "
                        + FSConstants.LAYOUT_VERSION + " actual " + nsInfo.getLayoutVersion();
        return nsInfo;
    }

    /** Return the DataNode object
     * 
     */
    public static DataNode getDataNode() {
        return datanodeObject;
    }

    static InterDatanodeProtocol createInterDataNodeProtocolProxy(DatanodeID datanodeid, Configuration conf)
            throws IOException {
        InetSocketAddress addr = NetUtils.createSocketAddr(datanodeid.getHost() + ":" + datanodeid.getIpcPort());
        if (InterDatanodeProtocol.LOG.isDebugEnabled()) {
            InterDatanodeProtocol.LOG.debug("InterDatanodeProtocol addr=" + addr);
        }
        return (InterDatanodeProtocol) RPC.getProxy(InterDatanodeProtocol.class, InterDatanodeProtocol.versionID,
                addr, conf);
    }

    public InetSocketAddress getNameNodeAddr() {
        return nameNodeAddr;
    }

    public InetSocketAddress getSelfAddr() {
        return selfAddr;
    }

    DataNodeMetrics getMetrics() {
        return myMetrics;
    }

    /**
     * Return the namenode's identifier
     */
    public String getNamenode() {
        //return namenode.toString();
        return "<namenode>";
    }

    static void setNewStorageID(DatanodeRegistration dnReg) {
        /* Return
         * "DS-randInt-ipaddr-currentTimeMillis"
         * It is considered extremely rare for all these numbers to match
         * on a different machine accidentally, for the following reasons:
         * a) SecureRandom(INT_MAX) is pretty much random (1 in 2 billion), and
         * b) Good chance the ip address would be different, and
         * c) Even on the same machine, Datanodes are designed to use different ports, and
         * d) Good chance that they are started at different times.
         * For a conflict to occur, all four of the above have to match!
         * The format of this string can be changed anytime in the future without
         * affecting its functionality.
         */
        String ip = "unknownIP";
        try {
            ip = DNS.getDefaultIP("default");
        } catch (UnknownHostException ignored) {
            LOG.warn("Could not find ip address of \"default\" inteface.");
        }

        int rand = 0;
        try {
            rand = SecureRandom.getInstance("SHA1PRNG").nextInt(Integer.MAX_VALUE);
        } catch (NoSuchAlgorithmException e) {
            LOG.warn("Could not use SecureRandom");
            rand = R.nextInt(Integer.MAX_VALUE);
        }
        dnReg.storageID = "DS-" + rand + "-" + ip + "-" + dnReg.getPort() + "-" + System.currentTimeMillis();
    }
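
    /* Illustrative example of a storage ID produced by the format above
     * (all values made up):
     *
     *   DS-1432512093-10.16.1.12-50010-1215247924039
     *      randInt    ipaddr     port  currentTimeMillis
     */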

    /**
     * Register datanode
     * <p>
     * The datanode needs to register with the namenode on startup in order
     * 1) to report which storage it is serving now and 
     * 2) to receive a registrationID 
     * issued by the namenode to recognize registered datanodes.
     * 
     * @see FSNamesystem#registerDatanode(DatanodeRegistration,String)
     * @throws IOException
     */
    private void register() throws IOException {
        if (dnRegistration.getStorageID().equals("")) {
            setNewStorageID(dnRegistration);
        }
        while (shouldRun) {
            try {
                // reset name to machineName. Mainly for web interface.
                dnRegistration.name = machineName + ":" + dnRegistration.getPort();
                dnRegistration = namenode.register(dnRegistration);
                break;
            } catch (SocketTimeoutException e) { // namenode is busy
                LOG.info("Problem connecting to server: " + getNameNodeAddr());
                try {
                    Thread.sleep(1000);
                } catch (InterruptedException ie) {
                }
            }
        }
        assert ("".equals(storage.getStorageID()) && !"".equals(dnRegistration.getStorageID()))
                || storage.getStorageID().equals(dnRegistration
                        .getStorageID()) : "New storageID can be assigned only if data-node is not formatted";
        if (storage.getStorageID().equals("")) {
            storage.setStorageID(dnRegistration.getStorageID());
            storage.writeAll();
            LOG.info("New storage id " + dnRegistration.getStorageID() + " is assigned to data-node "
                    + dnRegistration.getName());
        }
        if (!storage.getStorageID().equals(dnRegistration.getStorageID())) {
            throw new IOException("Inconsistent storage IDs. Name-node returned " + dnRegistration.getStorageID()
                    + ". Expecting " + storage.getStorageID());
        }
        waitForFirstBlockReportRequest = true;
    }

    /**
     * Shut down this instance of the datanode.
     * Returns only after shutdown is complete.
     * This method can only be called by the offerService thread.
     * Otherwise, deadlock might occur.
     */
    public void shutdown() {
        if (infoServer != null) {
            try {
                infoServer.stop();
            } catch (Exception e) {
            }
        }
        if (ipcServer != null) {
            ipcServer.stop();
        }
        this.shouldRun = false;
        if (dataXceiveServer != null) {
            ((DataXceiveServer) this.dataXceiveServer.getRunnable()).kill();
            this.dataXceiveServer.interrupt();

            // wait for all data receiver threads to exit
            if (this.threadGroup != null) {
                while (true) {
                    this.threadGroup.interrupt();
                    LOG.info(
                            "Waiting for threadgroup to exit, active threads is " + this.threadGroup.activeCount());
                    if (this.threadGroup.activeCount() == 0) {
                        break;
                    }
                    try {
                        Thread.sleep(1000);
                    } catch (InterruptedException e) {
                    }
                }
            }
        }

        RPC.stopProxy(namenode); // stop the RPC threads

        if (upgradeManager != null)
            upgradeManager.shutdownUpgrade();
        if (blockScanner != null)
            blockScanner.shutdown();
        if (blockScannerThread != null)
            blockScannerThread.interrupt();
        if (storage != null) {
            try {
                this.storage.unlockAll();
            } catch (IOException ie) {
            }
        }
        if (dataNodeThread != null) {
            dataNodeThread.interrupt();
            try {
                dataNodeThread.join();
            } catch (InterruptedException ie) {
            }
        }
        if (data != null) {
            data.shutdown();
        }
        if (myMetrics != null) {
            myMetrics.shutdown();
        }
    }

    /* Check whether the disk is out of space or read-only
     * when an IOException occurs.
     * If so, handle the error. */
    private void checkDiskError(IOException e) throws IOException {
        if (e.getMessage() != null && e.getMessage().startsWith("No space left on device")) {
            throw new DiskOutOfSpaceException("No space left on device");
        } else {
            checkDiskError();
        }
    }

    /* Check the data directories for disk errors and, if any are found, handle the error. */
    private void checkDiskError() throws IOException {
        try {
            data.checkDataDir();
        } catch (DiskErrorException de) {
            handleDiskError(de.getMessage());
        }
    }

    private void handleDiskError(String errMsgr) {
        LOG.warn("DataNode is shutting down.\n" + errMsgr);
        shouldRun = false;
        try {
            namenode.errorReport(dnRegistration, DatanodeProtocol.DISK_ERROR, errMsgr);
        } catch (IOException ignored) {
        }
    }

    /** Number of concurrent xceivers per node. */
    int getXceiverCount() {
        return threadGroup == null ? 0 : threadGroup.activeCount();
    }

    /**
     * Main loop for the DataNode.  Runs until shutdown,
     * forever calling remote NameNode functions.
     */
    public void offerService() throws Exception {

        LOG.info("using BLOCKREPORT_INTERVAL of " + blockReportInterval + "msec" + " Initial delay: "
                + initialBlockReportDelay + "msec");

        //
        // Now loop for a long time....
        //

        while (shouldRun) {
            try {
                long startTime = now();

                //
                // Every so often, send heartbeat or block-report
                //

                if (startTime - lastHeartbeat > heartBeatInterval) {
                    //
                    // All heartbeat messages include following info:
                    // -- Datanode name
                    // -- data transfer port
                    // -- Total capacity
                    // -- Bytes remaining
                    //
                    lastHeartbeat = startTime;
                    DatanodeCommand cmd = namenode.sendHeartbeat(dnRegistration, data.getCapacity(),
                            data.getDfsUsed(), data.getRemaining(), xmitsInProgress, getXceiverCount());
                    myMetrics.heartbeats.inc(now() - startTime);
                    //LOG.info("Just sent heartbeat, with name " + localName);
                    if (!processCommand(cmd))
                        continue;
                }

                // check if there are newly received blocks
                Block[] blockArray = null;
                String[] delHintArray = null;
                synchronized (receivedBlockList) {
                    synchronized (delHints) {
                        int numBlocks = receivedBlockList.size();
                        if (numBlocks > 0) {
                            if (numBlocks != delHints.size()) {
                                LOG.warn("Panic: receiveBlockList and delHints are not of the same length");
                            }
                            //
                            // Send newly-received blockids to namenode
                            //
                            blockArray = receivedBlockList.toArray(new Block[numBlocks]);
                            delHintArray = delHints.toArray(new String[numBlocks]);
                        }
                    }
                }
                if (blockArray != null) {
                    if (delHintArray == null || delHintArray.length != blockArray.length) {
                        LOG.warn("Panic: block array & delHintArray are not the same");
                    }
                    namenode.blockReceived(dnRegistration, blockArray, delHintArray);
                    synchronized (receivedBlockList) {
                        synchronized (delHints) {
                            for (int i = 0; i < blockArray.length; i++) {
                                receivedBlockList.remove(blockArray[i]);
                                delHints.remove(delHintArray[i]);
                            }
                        }
                    }
                }

                // send block report
                if (startTime - lastBlockReport > blockReportInterval) {
                    //
                    // Send latest blockinfo report if timer has expired.
                    // Get back a list of local block(s) that are obsolete
                    // and can be safely GC'ed.
                    //
                    long brStartTime = now();
                    Block[] bReport = data.getBlockReport();
                    DatanodeCommand cmd = namenode.blockReport(dnRegistration,
                            BlockListAsLongs.convertToArrayLongs(bReport));
                    long brTime = now() - brStartTime;
                    myMetrics.blockReports.inc(brTime);
                    LOG.info("BlockReport of " + bReport.length + " blocks got processed in " + brTime + " msecs");
                    //
                    // If we have sent the first block report, then wait a random
                    // time before we start the periodic block reports.
                    //
                    if (resetBlockReportTime) {
                        lastBlockReport = startTime - R.nextInt((int) (blockReportInterval));
                        resetBlockReportTime = false;
                    } else {
                        /* say the last block report was at 8:20:14. The current report 
                         * should have started around 9:20:14 (default 1 hour interval). 
                         * If current time is :
                         *   1) normal like 9:20:18, next report should be at 10:20:14
                         *   2) unexpected like 11:35:43, next report should be at 12:20:14
                         */
                        lastBlockReport += (now() - lastBlockReport) / blockReportInterval * blockReportInterval;
                    }
                    processCommand(cmd);
                }

                // start block scanner
                if (blockScanner != null && blockScannerThread == null && upgradeManager.isUpgradeCompleted()) {
                    LOG.info("Starting Periodic block scanner.");
                    blockScannerThread = new Daemon(blockScanner);
                    blockScannerThread.start();
                }

                //
                // There is no work to do; sleep until the heartbeat timer elapses,
                // or work arrives, and then iterate again.
                //
                long waitTime = heartBeatInterval - (System.currentTimeMillis() - lastHeartbeat);
                synchronized (receivedBlockList) {
                    if (waitTime > 0 && receivedBlockList.size() == 0) {
                        try {
                            receivedBlockList.wait(waitTime);
                        } catch (InterruptedException ie) {
                        }
                    }
                } // synchronized
            } catch (RemoteException re) {
                String reClass = re.getClassName();
                if (UnregisteredDatanodeException.class.getName().equals(reClass)
                        || DisallowedDatanodeException.class.getName().equals(reClass)
                        || IncorrectVersionException.class.getName().equals(reClass)) {
                    LOG.warn("DataNode is shutting down: " + StringUtils.stringifyException(re));
                    shutdown();
                    return;
                }
                LOG.warn(StringUtils.stringifyException(re));
            } catch (IOException e) {
                LOG.warn(StringUtils.stringifyException(e));
            }
        } // while (shouldRun)
    } // offerService
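
    /* Worked example of the block-report rescheduling arithmetic above,
     * using the numbers from the inline comment (1 hour interval, last
     * report at 8:20:14): if now() is 11:35:43, then now - lastBlockReport
     * is 3h15m29s; integer division by the 1h interval gives 3, so
     * lastBlockReport advances by exactly 3 intervals to 11:20:14, and the
     * next report fires once startTime - lastBlockReport exceeds 1h,
     * i.e. shortly after 12:20:14. */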

    /**
     * 
     * @param cmd
     * @return true if further processing may be required or false otherwise. 
     * @throws IOException
     */
    private boolean processCommand(DatanodeCommand cmd) throws IOException {
        if (cmd == null)
            return true;
        final BlockCommand bcmd = cmd instanceof BlockCommand ? (BlockCommand) cmd : null;

        switch (cmd.getAction()) {
        case DatanodeProtocol.DNA_TRANSFER:
            // Send a copy of a block to another datanode
            transferBlocks(bcmd.getBlocks(), bcmd.getTargets());
            myMetrics.blocksReplicated.inc(bcmd.getBlocks().length);
            break;
        case DatanodeProtocol.DNA_INVALIDATE:
            //
            // Some local block(s) are obsolete and can be 
            // safely garbage-collected.
            //
            Block toDelete[] = bcmd.getBlocks();
            try {
                if (blockScanner != null) {
                    blockScanner.deleteBlocks(toDelete);
                }
                data.invalidate(toDelete);
            } catch (IOException e) {
                checkDiskError();
                throw e;
            }
            myMetrics.blocksRemoved.inc(toDelete.length);
            break;
        case DatanodeProtocol.DNA_SHUTDOWN:
            // shut down the data node
            this.shutdown();
            return false;
        case DatanodeProtocol.DNA_REGISTER:
            // namenode requested a registration - at start or if NN lost contact
            if (shouldRun) {
                register();
            }
            break;
        case DatanodeProtocol.DNA_FINALIZE:
            storage.finalizeUpgrade();
            break;
        case UpgradeCommand.UC_ACTION_START_UPGRADE:
            // start distributed upgrade here
            processDistributedUpgradeCommand((UpgradeCommand) cmd);
            break;
        case DatanodeProtocol.DNA_BLOCKREPORT:
            // only send a block report the first time a request is received
            if (waitForFirstBlockReportRequest) {
                // drop all subsequent BR requests
                waitForFirstBlockReportRequest = false;
                // random short delay - helps scatter the BR from all DNs
                scheduleBlockReport(initialBlockReportDelay);
            }
            break;
        case DatanodeProtocol.DNA_RECOVERBLOCK:
            recoverBlocks(bcmd.getBlocks(), bcmd.getTargets());
            break;
        default:
            LOG.warn("Unknown DatanodeCommand action: " + cmd.getAction());
        }
        return true;
    }

    // Distributed upgrade manager
    UpgradeManagerDatanode upgradeManager = new UpgradeManagerDatanode(this);

    private void processDistributedUpgradeCommand(UpgradeCommand comm) throws IOException {
        assert upgradeManager != null : "DataNode.upgradeManager is null.";
        upgradeManager.processUpgradeCommand(comm);
    }

    /**
     * Start distributed upgrade if it should be initiated by the data-node.
     */
    private void startDistributedUpgradeIfNeeded() throws IOException {
        UpgradeManagerDatanode um = DataNode.getDataNode().upgradeManager;
        assert um != null : "DataNode.upgradeManager is null.";
        if (!um.getUpgradeState())
            return;
        um.setUpgradeState(false, um.getUpgradeVersion());
        um.startUpgrade();
        return;
    }

    private void transferBlocks(Block blocks[], DatanodeInfo xferTargets[][]) throws IOException {
        for (int i = 0; i < blocks.length; i++) {
            if (!data.isValidBlock(blocks[i])) {
                String errStr = "Can't send invalid block " + blocks[i];
                LOG.info(errStr);
                namenode.errorReport(dnRegistration, DatanodeProtocol.INVALID_BLOCK, errStr);
                break;
            }
            int numTargets = xferTargets[i].length;
            if (numTargets > 0) {
                if (LOG.isInfoEnabled()) {
                    StringBuilder xfersBuilder = new StringBuilder();
                    for (int j = 0; j < numTargets; j++) {
                        DatanodeInfo nodeInfo = xferTargets[i][j];
                        xfersBuilder.append(nodeInfo.getName());
                        if (j < (numTargets - 1)) {
                            xfersBuilder.append(", ");
                        }
                    }
                    String xfersTo = xfersBuilder.toString();
                    LOG.info(dnRegistration + " Starting thread to transfer block " + blocks[i] + " to " + xfersTo);
                }
                new Daemon(new DataTransfer(xferTargets[i], blocks[i])).start();
            }
        }
    }

    /* utility function for receiving a response */
    private static void receiveResponse(Socket s, int numTargets) throws IOException {
        // check the response
        DataInputStream reply = new DataInputStream(
                new BufferedInputStream(NetUtils.getInputStream(s), BUFFER_SIZE));
        try {
            for (int i = 0; i < numTargets; i++) {
                short opStatus = reply.readShort();
                if (opStatus != OP_STATUS_SUCCESS) {
                    throw new IOException("operation failed at " + s.getInetAddress());
                }
            }
        } finally {
            IOUtils.closeStream(reply);
        }
    }

    /* utility function for sending a response */
    private static void sendResponse(Socket s, short opStatus, long timeout) throws IOException {
        DataOutputStream reply = new DataOutputStream(NetUtils.getOutputStream(s, timeout));
        try {
            reply.writeShort(opStatus);
            reply.flush();
        } finally {
            IOUtils.closeStream(reply);
        }
    }
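
    /* receiveResponse() and sendResponse() above are the two halves of a
     * short status handshake: the responding side writes a single short
     * (OP_STATUS_SUCCESS or an error code) and flushes, while the requesting
     * side reads one status short per target from its reply stream and
     * treats anything other than OP_STATUS_SUCCESS as a failed operation. */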

    /*
     * Informing the name node could take a long long time! Should we wait
     * till namenode is informed before responding with success to the
     * client? For now we don't.
     */
    private void notifyNamenodeReceivedBlock(Block block, String delHint) {
        if (block == null || delHint == null) {
            throw new IllegalArgumentException(block == null ? "Block is null" : "delHint is null");
        }
        synchronized (receivedBlockList) {
            synchronized (delHints) {
                receivedBlockList.add(block);
                delHints.add(delHint);
                receivedBlockList.notifyAll();
            }
        }
    }
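
    /* Note on locking: receivedBlockList and delHints are always locked in
     * the same order (receivedBlockList first, then delHints), both here and
     * in offerService(); the consistent order avoids deadlock between the
     * data-receiving threads and the main service loop. The two lists are
     * parallel: element i of delHints is the hint for block i. */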

    /**
     * Server used for receiving/sending a block of data.
     * This is created to listen for requests from clients or 
     * other DataNodes.  This small server does not use the 
     * Hadoop IPC mechanism.
     */
    class DataXceiveServer implements Runnable {
        ServerSocket ss;

        public DataXceiveServer(ServerSocket ss) {
            this.ss = ss;
        }

        /**
         */
        public void run() {
            while (shouldRun) {
                try {
                    Socket s = ss.accept();
                    s.setTcpNoDelay(true);
                    new Daemon(threadGroup, new DataXceiver(s)).start();
                } catch (SocketTimeoutException ignored) {
                    // wake up to see if should continue to run
                } catch (IOException ie) {
                    LOG.warn(dnRegistration + ":DataXceiveServer: " + StringUtils.stringifyException(ie));
                } catch (Throwable te) {
                    LOG.error(dnRegistration + ":DataXceiveServer: Exiting due to:"
                            + StringUtils.stringifyException(te));
                    shouldRun = false;
                }
            }
            try {
                ss.close();
            } catch (IOException ie) {
                LOG.warn(dnRegistration + ":DataXceiveServer: " + StringUtils.stringifyException(ie));
            }
        }

        public void kill() {
            assert shouldRun == false : "shouldRun should be set to false before killing";
            try {
                this.ss.close();
            } catch (IOException ie) {
                LOG.warn(dnRegistration + ":DataXceiveServer.kill(): " + StringUtils.stringifyException(ie));
            }

            // close all the sockets that were accepted earlier
            synchronized (childSockets) {
                for (Iterator<Socket> it = childSockets.values().iterator(); it.hasNext();) {
                    Socket thissock = it.next();
                    try {
                        thissock.close();
                    } catch (IOException e) {
                    }
                }
            }
        }
    }

    /**
     * Thread for processing incoming/outgoing data stream
     */
    class DataXceiver implements Runnable {
        Socket s;
        String remoteAddress; // address of remote side
        String localAddress; // local address of this daemon

        public DataXceiver(Socket s) {
            this.s = s;
            childSockets.put(s, s);
            InetSocketAddress isock = (InetSocketAddress) s.getRemoteSocketAddress();
            remoteAddress = isock.toString();
            localAddress = s.getInetAddress() + ":" + s.getLocalPort();
            LOG.debug("Number of active connections is: " + getXceiverCount());
        }

        /**
         * Read/write data from/to the DataXceiveServer.
         */
        public void run() {
            DataInputStream in = null;
            try {
                in = new DataInputStream(new BufferedInputStream(NetUtils.getInputStream(s), SMALL_BUFFER_SIZE));
                short version = in.readShort();
                if (version != DATA_TRANSFER_VERSION) {
                    throw new IOException("Version Mismatch");
                }
                boolean local = s.getInetAddress().equals(s.getLocalAddress());
                byte op = in.readByte();
                // Make sure the xceiver count is not exceeded
                int curXceiverCount = getXceiverCount();
                if (curXceiverCount > maxXceiverCount) {
                    throw new IOException("xceiverCount " + curXceiverCount
                            + " exceeds the limit of concurrent xcievers " + maxXceiverCount);
                }
                long startTime = now();
                switch (op) {
                case OP_READ_BLOCK:
                    readBlock(in);
                    myMetrics.readBlockOp.inc(now() - startTime);
                    if (local)
                        myMetrics.readsFromLocalClient.inc();
                    else
                        myMetrics.readsFromRemoteClient.inc();
                    break;
                case OP_WRITE_BLOCK:
                    writeBlock(in);
                    myMetrics.writeBlockOp.inc(now() - startTime);
                    if (local)
                        myMetrics.writesFromLocalClient.inc();
                    else
                        myMetrics.writesFromRemoteClient.inc();
                    break;
                case OP_READ_METADATA:
                    readMetadata(in);
                    myMetrics.readMetadataOp.inc(now() - startTime);
                    break;
                case OP_REPLACE_BLOCK: // for balancing purpose; send to a destination
                    replaceBlock(in);
                    myMetrics.replaceBlockOp.inc(now() - startTime);
                    break;
                case OP_COPY_BLOCK: // for balancing purpose; send to a proxy source
                    copyBlock(in);
                    myMetrics.copyBlockOp.inc(now() - startTime);
                    break;
                default:
                    throw new IOException("Unknown opcode " + op + " in data stream");
                }
            } catch (Throwable t) {
                LOG.error(dnRegistration + ":DataXceiver: " + StringUtils.stringifyException(t));
            } finally {
                LOG.debug(dnRegistration + ":Number of active connections is: " + getXceiverCount());
                IOUtils.closeStream(in);
                IOUtils.closeSocket(s);
                childSockets.remove(s);
            }
        }

        /**
         * Read a block from the disk
         * @param in The stream to read from
         * @throws IOException
         */
        private void readBlock(DataInputStream in) throws IOException {
            //
            // Read in the header
            //
            long blockId = in.readLong();
            Block block = new Block(blockId, 0, in.readLong());

            long startOffset = in.readLong();
            long length = in.readLong();
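
            /* Request header layout for OP_READ_BLOCK, as parsed above:
             *   blockId (long), generationStamp (long),
             *   startOffset (long), length (long). */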

            // send the block
            OutputStream baseStream = NetUtils.getOutputStream(s, socketWriteTimeout);
            DataOutputStream out = new DataOutputStream(new BufferedOutputStream(baseStream, SMALL_BUFFER_SIZE));

            BlockSender blockSender = null;
            try {
                try {
                    blockSender = new BlockSender(block, startOffset, length, true, true, false);
                } catch (IOException e) {
                    out.writeShort(OP_STATUS_ERROR);
                    throw e;
                }

                out.writeShort(DataNode.OP_STATUS_SUCCESS); // send op status
                long read = blockSender.sendBlock(out, baseStream, null); // send data

                if (blockSender.isBlockReadFully()) {
                    // See if client verification succeeded. 
                    // This is an optional response from client.
                    try {
                        if (in.readShort() == OP_STATUS_CHECKSUM_OK && blockScanner != null) {
                            blockScanner.verifiedByClient(block);
                        }
                    } catch (IOException ignored) {
                    }
                }

                myMetrics.bytesRead.inc((int) read);
                myMetrics.blocksRead.inc();
                LOG.info(dnRegistration + " Served block " + block + " to " + s.getInetAddress());
            } catch (SocketException ignored) {
                // It's OK for the remote side to close the connection at any time.
                myMetrics.blocksRead.inc();
            } catch (IOException ioe) {
                /* What exactly should we do here?
                 * Earlier versions shut down the datanode on a disk error.
                 */
                LOG.warn(dnRegistration + ":Got exception while serving " + block + " to " + s.getInetAddress()
                        + ":\n" + StringUtils.stringifyException(ioe));
                throw ioe;
            } finally {
                IOUtils.closeStream(out);
                IOUtils.closeStream(blockSender);
            }
        }

        /**
         * Write a block to disk.
         * 
         * @param in The stream to read from
         * @throws IOException
         */
        private void writeBlock(DataInputStream in) throws IOException {
            DatanodeInfo srcDataNode = null;
            LOG.debug("writeBlock receive buf size " + s.getReceiveBufferSize() + " tcp no delay "
                    + s.getTcpNoDelay());
            //
            // Read in the header
            //
            Block block = new Block(in.readLong(), estimateBlockSize, in.readLong());
            LOG.info("Receiving block " + block + " src: " + remoteAddress + " dest: " + localAddress);
            int pipelineSize = in.readInt(); // num of datanodes in entire pipeline
            boolean isRecovery = in.readBoolean(); // is this part of recovery?
            String client = Text.readString(in); // working on behalf of this client
            boolean hasSrcDataNode = in.readBoolean(); // is src node info present
            if (hasSrcDataNode) {
                srcDataNode = new DatanodeInfo();
                srcDataNode.readFields(in);
            }
            int numTargets = in.readInt();
            if (numTargets < 0) {
                throw new IOException("Mislabelled incoming datastream.");
            }
            DatanodeInfo targets[] = new DatanodeInfo[numTargets];
            for (int i = 0; i < targets.length; i++) {
                DatanodeInfo tmp = new DatanodeInfo();
                tmp.readFields(in);
                targets[i] = tmp;
            }
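
            /* Request header layout for OP_WRITE_BLOCK, as parsed above (and
             * re-emitted verbatim to the mirror further below):
             *   blockId (long), generationStamp (long), pipelineSize (int),
             *   isRecovery (boolean), client (Text), hasSrcDataNode (boolean),
             *   srcDataNode (DatanodeInfo, only if hasSrcDataNode),
             *   numTargets (int), then numTargets DatanodeInfo entries. */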

            DataOutputStream mirrorOut = null; // stream to next target
            DataInputStream mirrorIn = null; // reply from next target
            DataOutputStream replyOut = null; // stream to prev target
            Socket mirrorSock = null; // socket to next target
            BlockReceiver blockReceiver = null; // responsible for data handling
            String mirrorNode = null; // the name:port of next target
            String firstBadLink = ""; // first datanode that failed in connection setup
            try {
                // open a block receiver and check if the block does not exist
                blockReceiver = new BlockReceiver(block, in, s.getInetAddress().toString(), isRecovery, client,
                        srcDataNode);

                // get a connection back to the previous target
                replyOut = new DataOutputStream(NetUtils.getOutputStream(s, socketWriteTimeout));

                //
                // Open network conn to backup machine, if 
                // appropriate
                //
                if (targets.length > 0) {
                    InetSocketAddress mirrorTarget = null;
                    // Connect to backup machine
                    mirrorNode = targets[0].getName();
                    mirrorTarget = NetUtils.createSocketAddr(mirrorNode);
                    mirrorSock = newSocket();
                    try {
                        int timeoutValue = numTargets * socketTimeout;
                        int writeTimeout = socketWriteTimeout + (WRITE_TIMEOUT_EXTENSION * numTargets);
                        mirrorSock.connect(mirrorTarget, timeoutValue);
                        mirrorSock.setSoTimeout(timeoutValue);
                        mirrorSock.setSendBufferSize(DEFAULT_DATA_SOCKET_SIZE);
                        mirrorOut = new DataOutputStream(new BufferedOutputStream(
                                NetUtils.getOutputStream(mirrorSock, writeTimeout), SMALL_BUFFER_SIZE));
                        mirrorIn = new DataInputStream(NetUtils.getInputStream(mirrorSock));

                        // Write header: Copied from DFSClient.java!
                        mirrorOut.writeShort(DATA_TRANSFER_VERSION);
                        mirrorOut.write(OP_WRITE_BLOCK);
                        mirrorOut.writeLong(block.getBlockId());
                        mirrorOut.writeLong(block.getGenerationStamp());
                        mirrorOut.writeInt(pipelineSize);
                        mirrorOut.writeBoolean(isRecovery);
                        Text.writeString(mirrorOut, client);
                        mirrorOut.writeBoolean(hasSrcDataNode);
                        if (hasSrcDataNode) { // pass src node information
                            srcDataNode.write(mirrorOut);
                        }
                        mirrorOut.writeInt(targets.length - 1);
                        for (int i = 1; i < targets.length; i++) {
                            targets[i].write(mirrorOut);
                        }

                        blockReceiver.writeChecksumHeader(mirrorOut);
                        mirrorOut.flush();

                        // read connect ack (only for clients, not for replication req)
                        if (client.length() != 0) {
                            firstBadLink = Text.readString(mirrorIn);
                            if (LOG.isDebugEnabled() || firstBadLink.length() > 0) {
                                LOG.info("Datanode " + targets.length + " got response for connect ack "
                                        + " from downstream datanode with firstbadlink as " + firstBadLink);
                            }
                        }

                    } catch (IOException e) {
                        if (client.length() != 0) {
                            Text.writeString(replyOut, mirrorNode);
                            replyOut.flush();
                        }
                        IOUtils.closeStream(mirrorOut);
                        mirrorOut = null;
                        IOUtils.closeStream(mirrorIn);
                        mirrorIn = null;
                        IOUtils.closeSocket(mirrorSock);
                        mirrorSock = null;
                        if (client.length() > 0) {
                            throw e;
                        } else {
                            LOG.info(dnRegistration + ":Exception transfering block " + block + " to mirror "
                                    + mirrorNode + ". continuing without the mirror.\n"
                                    + StringUtils.stringifyException(e));
                        }
                    }
                }

                // send connect ack back to source (only for clients)
                if (client.length() != 0) {
                    if (LOG.isDebugEnabled() || firstBadLink.length() > 0) {
                        LOG.info("Datanode " + targets.length
                                + " forwarding connect ack to upstream firstbadlink is " + firstBadLink);
                    }
                    Text.writeString(replyOut, firstBadLink);
                    replyOut.flush();
                }

                // receive the block and mirror to the next target
                String mirrorAddr = (mirrorSock == null) ? null : mirrorNode;
                blockReceiver.receiveBlock(mirrorOut, mirrorIn, replyOut, mirrorAddr, null, targets.length);

                // if this write is for a replication request (and not
                // from a client), then confirm block. For client-writes,
                // the block is finalized in the PacketResponder.
                if (client.length() == 0) {
                    notifyNamenodeReceivedBlock(block, EMPTY_DEL_HINT);
                    LOG.info("Received block " + block + " src: " + remoteAddress + " dest: " + localAddress
                            + " of size " + block.getNumBytes());
                }

                if (blockScanner != null) {
                    blockScanner.addBlock(block);
                }

            } catch (IOException ioe) {
                LOG.info("writeBlock " + block + " received exception " + ioe);
                throw ioe;
            } finally {
                // close all opened streams
                IOUtils.closeStream(mirrorOut);
                IOUtils.closeStream(mirrorIn);
                IOUtils.closeStream(replyOut);
                IOUtils.closeSocket(mirrorSock);
                IOUtils.closeStream(blockReceiver);
            }
        }

        /**
         * Reads the metadata and sends the data in one 'DATA_CHUNK'
         * @param in
         */
        void readMetadata(DataInputStream in) throws IOException {
            Block block = new Block(in.readLong(), 0, in.readLong());
            MetaDataInputStream checksumIn = null;
            DataOutputStream out = null;

            try {

                checksumIn = data.getMetaDataInputStream(block);

                long fileSize = checksumIn.getLength();

                if (fileSize >= 1L << 31 || fileSize <= 0) {
                    throw new IOException("Unexpected size for checksumFile of block" + block);
                }

                byte[] buf = new byte[(int) fileSize];
                IOUtils.readFully(checksumIn, buf, 0, buf.length);

                out = new DataOutputStream(NetUtils.getOutputStream(s, socketWriteTimeout));

                out.writeByte(OP_STATUS_SUCCESS);
                out.writeInt(buf.length);
                out.write(buf);

                //last DATA_CHUNK
                out.writeInt(0);
            } finally {
                IOUtils.closeStream(out);
                IOUtils.closeStream(checksumIn);
            }
        }
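
        /*
         * Editor's note: an illustrative sketch, not part of the original
         * source. It shows how a caller might consume the response that
         * readMetadata() writes above: a 1-byte status, a 4-byte length,
         * the raw checksum-file bytes, and a trailing 4-byte zero marking
         * the last DATA_CHUNK. The method name parseMetadataResponse is
         * hypothetical.
         *
         *   static byte[] parseMetadataResponse(java.io.DataInputStream in)
         *           throws java.io.IOException {
         *       byte status = in.readByte();  // expect OP_STATUS_SUCCESS
         *       if (status != OP_STATUS_SUCCESS) {
         *           throw new java.io.IOException("readMetadata failed: " + status);
         *       }
         *       int len = in.readInt();       // size of the checksum file
         *       byte[] meta = new byte[len];
         *       in.readFully(meta);
         *       in.readInt();                 // trailing zero: last DATA_CHUNK
         *       return meta;
         *   }
         */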

        /**
         * Read a block from the disk and then sends it to a destination
         * 
         * @param in
         *          The stream to read from
         * @throws IOException
         */
        private void copyBlock(DataInputStream in) throws IOException {
            // Read in the header
            long blockId = in.readLong(); // read block id
            Block block = new Block(blockId, 0, in.readLong());
            String source = Text.readString(in); // read del hint
            DatanodeInfo target = new DatanodeInfo(); // read target
            target.readFields(in);

            if (!balancingThrottler.acquire()) { // not able to start
                LOG.info("Not able to copy block " + blockId + " to " + s.getRemoteSocketAddress()
                        + " because threads quota is exceeded.");
                sendResponse(s, (short) OP_STATUS_ERROR, socketWriteTimeout);
                return;
            }

            Socket targetSock = null;
            short opStatus = OP_STATUS_SUCCESS;
            BlockSender blockSender = null;
            DataOutputStream targetOut = null;
            try {
                // check if the block exists or not
                blockSender = new BlockSender(block, 0, -1, false, false, false);

                // get the output stream to the target
                InetSocketAddress targetAddr = NetUtils.createSocketAddr(target.getName());
                targetSock = newSocket();
                targetSock.connect(targetAddr, socketTimeout);
                targetSock.setSoTimeout(socketTimeout);

                OutputStream baseStream = NetUtils.getOutputStream(targetSock, socketWriteTimeout);
                targetOut = new DataOutputStream(new BufferedOutputStream(baseStream, SMALL_BUFFER_SIZE));

                /* send request to the target */
                // first write header info
                targetOut.writeShort(DATA_TRANSFER_VERSION); // transfer version
                targetOut.writeByte(OP_REPLACE_BLOCK); // op code
                targetOut.writeLong(block.getBlockId()); // block id
                targetOut.writeLong(block.getGenerationStamp()); // generation stamp
                Text.writeString(targetOut, source); // del hint

                // then send data
                long read = blockSender.sendBlock(targetOut, baseStream, balancingThrottler);

                myMetrics.bytesRead.inc((int) read);
                myMetrics.blocksRead.inc();

                // check the response from target
                receiveResponse(targetSock, 1);

                LOG.info("Copied block " + block + " to " + targetAddr);
            } catch (IOException ioe) {
                opStatus = OP_STATUS_ERROR;
                LOG.warn("Got exception while serving " + block + " to " + target.getName() + ": "
                        + StringUtils.stringifyException(ioe));
                throw ioe;
            } finally {
                // now release the thread resource
                balancingThrottler.release();

                /* send response to the requester */
                try {
                    sendResponse(s, opStatus, socketWriteTimeout);
                } catch (IOException replyE) {
                    LOG.warn("Error writing the response back to " + s.getRemoteSocketAddress() + "\n"
                            + StringUtils.stringifyException(replyE));
                }
                IOUtils.closeStream(targetOut);
                IOUtils.closeStream(blockSender);
            }
        }

        /**
         * Receives a block and writes it to disk; it then notifies the namenode
         * to remove the copy from the source.
         * 
         * @param in
         *          The stream to read from
         * @throws IOException
         */
        private void replaceBlock(DataInputStream in) throws IOException {
            /* read header */
            long blockId = in.readLong();
            Block block = new Block(blockId, estimateBlockSize, in.readLong()); // estimated size & genstamp
            String sourceID = Text.readString(in);

            if (!balancingThrottler.acquire()) { // not able to start
                LOG.warn("Not able to receive block " + blockId + " from " + s.getRemoteSocketAddress()
                        + " because threads quota is exceeded.");
                return;
            }

            short opStatus = OP_STATUS_SUCCESS;
            BlockReceiver blockReceiver = null;
            try {
                // open a block receiver; this also checks that the block does not already exist
                blockReceiver = new BlockReceiver(block, in, s.getRemoteSocketAddress().toString(), false, "",
                        null);

                // receive a block
                blockReceiver.receiveBlock(null, null, null, null, balancingThrottler, -1);

                // notify name node
                notifyNamenodeReceivedBlock(block, sourceID);

                LOG.info("Moved block " + block + " from " + s.getRemoteSocketAddress());
            } catch (IOException ioe) {
                opStatus = OP_STATUS_ERROR;
                throw ioe;
            } finally {
                balancingThrottler.release();

                // send response back
                try {
                    sendResponse(s, opStatus, socketWriteTimeout);
                } catch (IOException ioe) {
                    LOG.warn("Error writing reply back to " + s.getRemoteSocketAddress());
                }
                IOUtils.closeStream(blockReceiver);
            }
        }
    }

    /** A class to throttle block transfers.
     * This class is thread safe. It can be shared by multiple threads.
     * The parameter bandwidthPerSec specifies the total bandwidth shared by all threads.
     */
    static class Throttler {
        private long period; // period over which bw is imposed
        private long periodExtension; // Max period over which bw accumulates.
        private long bytesPerPeriod; // total number of bytes that can be sent in each period
        private long curPeriodStart; // current period starting time
        private long curReserve; // bytes remaining to be sent in the current period
        private long bytesAlreadyUsed;

        /** Constructor 
         * @param bandwidthPerSec bandwidth allowed in bytes per second. 
         */
        Throttler(long bandwidthPerSec) {
            this(500, bandwidthPerSec); // by default throttling period is 500ms 
        }

        /**
         * Constructor
         * @param period in milliseconds. Bandwidth is enforced over this
         *        period.
         * @param bandwidthPerSec bandwidth allowed in bytes per second. 
         */
        Throttler(long period, long bandwidthPerSec) {
            this.curPeriodStart = System.currentTimeMillis();
            this.period = period;
            this.curReserve = this.bytesPerPeriod = bandwidthPerSec * period / 1000;
            this.periodExtension = period * 3;
        }

        /**
         * @return current throttle bandwidth in bytes per second.
         */
        public synchronized long getBandwidth() {
            return bytesPerPeriod * 1000 / period;
        }

        /**
         * Sets throttle bandwidth. This takes effect by the end of the
         * current period at the latest.
         * 
         * @param bytesPerSecond 
         */
        public synchronized void setBandwidth(long bytesPerSecond) {
            if (bytesPerSecond <= 0) {
                throw new IllegalArgumentException("" + bytesPerSecond);
            }
            bytesPerPeriod = bytesPerSecond * period / 1000;
        }

        /** Given the numOfBytes sent/received since the last time throttle was
         * called, make the current thread sleep if the I/O rate is too fast
         * compared to the given bandwidth.
         *
         * @param numOfBytes
         *     number of bytes sent/received since the last time throttle was called
         */
        public synchronized void throttle(long numOfBytes) {
            if (numOfBytes <= 0) {
                return;
            }

            curReserve -= numOfBytes;
            bytesAlreadyUsed += numOfBytes;

            while (curReserve <= 0) {
                long now = System.currentTimeMillis();
                long curPeriodEnd = curPeriodStart + period;

                if (now < curPeriodEnd) {
                    // Wait for next period so that curReserve can be increased.
                    try {
                        wait(curPeriodEnd - now);
                    } catch (InterruptedException ignored) {
                    }
                } else if (now < (curPeriodStart + periodExtension)) {
                    curPeriodStart = curPeriodEnd;
                    curReserve += bytesPerPeriod;
                } else {
                    // discard the prev period. Throttler might not have
                    // been used for a long time.
                    curPeriodStart = now;
                    curReserve = bytesPerPeriod - bytesAlreadyUsed;
                }
            }

            bytesAlreadyUsed -= numOfBytes;
        }
    }
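
    /*
     * Editor's note: an illustrative usage sketch, not part of the original
     * source. A sender drives the Throttler above by calling throttle()
     * after each buffer it transfers; throttle() puts the thread to sleep
     * whenever the bytes consumed in the current period exceed the reserve.
     * The streams and the 1 MB/s figure below are arbitrary examples.
     *
     *   Throttler throttler = new Throttler(1024 * 1024); // 1 MB per second
     *   byte[] buf = new byte[4096];
     *   int n;
     *   while ((n = in.read(buf)) >= 0) {
     *       out.write(buf, 0, n);
     *       throttler.throttle(n); // may block to keep the rate under 1 MB/s
     *   }
     */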

    /* ********************************************************************
    Protocol when a client reads data from Datanode (Cur Ver: 9):
        
    Client's Request :
    =================
         
       Processed in DataXceiver:
       +----------------------------------------------+
       | Common Header   | 1 byte OP == OP_READ_BLOCK |
       +----------------------------------------------+
           
       Processed in readBlock() :
       +-------------------------------------------------------------------------+
       | 8 byte Block ID | 8 byte genstamp | 8 byte start offset | 8 byte length |
       +-------------------------------------------------------------------------+
           
       Client sends optional response only at the end of receiving data.
             
    DataNode Response :
    ===================
         
      In readBlock() :
      If there is an error while initializing BlockSender :
         +---------------------------+
         | 2 byte OP_STATUS_ERROR    | and connection will be closed.
         +---------------------------+
      Otherwise
         +---------------------------+
         | 2 byte OP_STATUS_SUCCESS  |
         +---------------------------+
             
      Actual data, sent by BlockSender.sendBlock() :
          
        ChecksumHeader :
        +--------------------------------------------------+
        | 1 byte CHECKSUM_TYPE | 4 byte BYTES_PER_CHECKSUM |
        +--------------------------------------------------+
        Followed by actual data in the form of PACKETS: 
        +------------------------------------+
        | Sequence of data PACKETs ....      |
        +------------------------------------+
          
      A "PACKET" is defined further below.
          
      The client reads data until it receives a packet with 
      "LastPacketInBlock" set to true or with a zero length. If there is 
      no checksum error, it replies to DataNode with OP_STATUS_CHECKSUM_OK:
          
      Client optional response at the end of data transmission :
        +------------------------------+
        | 2 byte OP_STATUS_CHECKSUM_OK |
        +------------------------------+
          
      PACKET : Contains a packet header, checksum and data. Amount of data
      ======== carried is set by BUFFER_SIZE.
          
        +-----------------------------------------------------+
        | 4 byte packet length (excluding packet header)      |
        +-----------------------------------------------------+
        | 8 byte offset in the block | 8 byte sequence number |
        +-----------------------------------------------------+
        | 1 byte isLastPacketInBlock                          |
        +-----------------------------------------------------+
        | 4 byte Length of actual data                        |
        +-----------------------------------------------------+
        | x byte checksum data. x is defined below            |
        +-----------------------------------------------------+
        | actual data ......                                  |
        +-----------------------------------------------------+
            
        x = (length of data + BYTES_PER_CHECKSUM - 1)/BYTES_PER_CHECKSUM *
      CHECKSUM_SIZE
          
        CHECKSUM_SIZE depends on CHECKSUM_TYPE (usually, 4 for CRC32)
            
        The above packet format is used while writing data to DFS also.
        Not all the fields might be used while reading.
          
     ************************************************************************ */

    /** Header size for a packet */
    static final int PKT_HEADER_LEN = (4 + /* Packet payload length */
            8 + /* offset in block */
            8 + /* seqno */
            1 /* isLastPacketInBlock */);
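
    /*
     * Editor's note: an illustrative sketch, not part of the original source,
     * of filling a packet buffer according to the layout documented above.
     * It mirrors what sendChunks() below does; dataLen, offsetInBlock, seqno,
     * last, bytesPerChecksum and checksumSize stand for values computed by
     * the caller.
     *
     *   int numChunks = (dataLen + bytesPerChecksum - 1) / bytesPerChecksum;
     *   int packetLen = dataLen + numChunks * checksumSize + 4;
     *   ByteBuffer pkt = ByteBuffer.allocate(PKT_HEADER_LEN + packetLen);
     *   pkt.putInt(packetLen);          // payload length (excludes header)
     *   pkt.putLong(offsetInBlock);     // offset of this packet in the block
     *   pkt.putLong(seqno);             // sequence number
     *   pkt.put((byte) (last ? 1 : 0)); // isLastPacketInBlock
     *   pkt.putInt(dataLen);            // length of actual data
     *   // ... followed by numChunks * checksumSize checksum bytes
     *   // ... followed by dataLen bytes of block data
     *
     * For example, with bytesPerChecksum = 512 and CRC32 (checksumSize = 4),
     * a 10240-byte payload carries 20 chunks and 80 bytes of checksum data,
     * so packetLen = 10240 + 80 + 4 = 10324.
     */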

    class BlockSender implements java.io.Closeable {
        private Block block; // the block to read from
        private InputStream blockIn; // data stream
        private long blockInPosition = -1; // updated while using transferTo().
        private DataInputStream checksumIn; // checksum datastream
        private DataChecksum checksum; // checksum algorithm and parameters
        private long offset; // starting position to read
        private long endOffset; // ending position
        private long blockLength;
        private int bytesPerChecksum; // chunk size
        private int checksumSize; // checksum size
        private boolean corruptChecksumOk; // if true, a corrupt or missing checksum is tolerated
        private boolean chunkOffsetOK; // if true, send the starting chunk offset
        private long seqno; // sequence number of packet

        private boolean blockReadFully; //set when the whole block is read
        private boolean verifyChecksum; //if true, checksum is verified while reading
        private Throttler throttler;

        BlockSender(Block block, long startOffset, long length, boolean corruptChecksumOk, boolean chunkOffsetOK,
                boolean verifyChecksum) throws IOException {

            try {
                this.block = block;
                this.chunkOffsetOK = chunkOffsetOK;
                this.corruptChecksumOk = corruptChecksumOk;
                this.verifyChecksum = verifyChecksum;
                this.blockLength = data.getLength(block);

                if (!corruptChecksumOk || data.metaFileExists(block)) {
                    checksumIn = new DataInputStream(
                            new BufferedInputStream(data.getMetaDataInputStream(block), BUFFER_SIZE));

                    // read and handle the common header here. For now just a version
                    BlockMetadataHeader header = BlockMetadataHeader.readHeader(checksumIn);
                    short version = header.getVersion();

                    if (version != FSDataset.METADATA_VERSION) {
                        LOG.warn("Wrong version (" + version + ") for metadata file for " + block
                                + ", ignoring ...");
                    }
                    checksum = header.getChecksum();
                } else {
                    LOG.warn("Could not find metadata file for " + block);
                    // This only decides the buffer size. Use BUFFER_SIZE?
                    checksum = DataChecksum.newDataChecksum(DataChecksum.CHECKSUM_NULL, 16 * 1024);
                }

                /* If bytesPerChecksum is very large, then the metadata file
                 * is most likely corrupted. For now just truncate bytesPerChecksum
                 * to blockLength.
                 */
                bytesPerChecksum = checksum.getBytesPerChecksum();
                if (bytesPerChecksum > 10 * 1024 * 1024 && bytesPerChecksum > blockLength) {
                    checksum = DataChecksum.newDataChecksum(checksum.getChecksumType(),
                            Math.max((int) blockLength, 10 * 1024 * 1024));
                    bytesPerChecksum = checksum.getBytesPerChecksum();
                }
                checksumSize = checksum.getChecksumSize();

                if (length < 0) {
                    length = blockLength;
                }

                endOffset = blockLength;
                if (startOffset < 0 || startOffset > endOffset || (length + startOffset) > endOffset) {
                    String msg = " Offset " + startOffset + " and length " + length + " don't match block " + block
                            + " ( blockLen " + endOffset + " )";
                    LOG.warn(dnRegistration + ":sendBlock() : " + msg);
                    throw new IOException(msg);
                }

                offset = (startOffset - (startOffset % bytesPerChecksum));
                if (length >= 0) {
                    // Make sure endOffset points to the end of a checksummed chunk.
                    long tmpLen = startOffset + length + (startOffset - offset);
                    if (tmpLen % bytesPerChecksum != 0) {
                        tmpLen += (bytesPerChecksum - tmpLen % bytesPerChecksum);
                    }
                    if (tmpLen < endOffset) {
                        endOffset = tmpLen;
                    }
                }

                // seek to the right offsets
                if (offset > 0) {
                    long checksumSkip = (offset / bytesPerChecksum) * checksumSize;
                    // note blockIn is positioned (seeked to offset) when created below
                    if (checksumSkip > 0) {
                        // Should we use seek() for checksum file as well?
                        IOUtils.skipFully(checksumIn, checksumSkip);
                    }
                }
                seqno = 0;

                blockIn = data.getBlockInputStream(block, offset); // seek to offset
            } catch (IOException ioe) {
                IOUtils.closeStream(this);
                IOUtils.closeStream(blockIn);
                throw ioe;
            }
        }

        // close opened files
        public void close() throws IOException {
            IOException ioe = null;
            // close checksum file
            if (checksumIn != null) {
                try {
                    checksumIn.close();
                } catch (IOException e) {
                    ioe = e;
                }
                checksumIn = null;
            }
            // close data file
            if (blockIn != null) {
                try {
                    blockIn.close();
                } catch (IOException e) {
                    ioe = e;
                }
                blockIn = null;
            }
            // throw IOException if there is any
            if (ioe != null) {
                throw ioe;
            }
        }

        /**
         * Sends up to maxChunks chunks of data.
         * 
         * When blockInPosition is >= 0, assumes 'out' is a 
         * {@link SocketOutputStream} and tries 
         * {@link SocketOutputStream#transferToFully(FileChannel, long, int)} to
         * send data (and updates blockInPosition).
         */
        private int sendChunks(ByteBuffer pkt, int maxChunks, OutputStream out) throws IOException {
            // Sends multiple chunks in one packet with a single write().

            int len = Math.min((int) (endOffset - offset), bytesPerChecksum * maxChunks);
            if (len == 0) {
                return 0;
            }

            int numChunks = (len + bytesPerChecksum - 1) / bytesPerChecksum;
            int packetLen = len + numChunks * checksumSize + 4;
            pkt.clear();

            // write packet header
            pkt.putInt(packetLen);
            pkt.putLong(offset);
            pkt.putLong(seqno);
            pkt.put((byte) ((offset + len >= endOffset) ? 1 : 0));
            //why no ByteBuffer.putBoolean()?
            pkt.putInt(len);

            int checksumOff = pkt.position();
            int checksumLen = numChunks * checksumSize;
            byte[] buf = pkt.array();

            if (checksumSize > 0 && checksumIn != null) {
                try {
                    checksumIn.readFully(buf, checksumOff, checksumLen);
                } catch (IOException e) {
                    LOG.warn(" Could not read or failed to veirfy checksum for data" + " at offset " + offset
                            + " for block " + block + " got : " + StringUtils.stringifyException(e));
                    IOUtils.closeStream(checksumIn);
                    checksumIn = null;
                    if (corruptChecksumOk) {
                        if (checksumLen > 0) {
                            // Fill the checksum portion of the packet with zeros.
                            Arrays.fill(buf, checksumOff, checksumOff + checksumLen, (byte) 0);
                        }
                    } else {
                        throw e;
                    }
                }
            }

            int dataOff = checksumOff + checksumLen;

            if (blockInPosition < 0) {
                //normal transfer
                IOUtils.readFully(blockIn, buf, dataOff, len);

                if (verifyChecksum) {
                    int dOff = dataOff;
                    int cOff = checksumOff;
                    int dLeft = len;

                    for (int i = 0; i < numChunks; i++) {
                        checksum.reset();
                        int dLen = Math.min(dLeft, bytesPerChecksum);
                        checksum.update(buf, dOff, dLen);
                        if (!checksum.compare(buf, cOff)) {
                            throw new ChecksumException("Checksum failed at " + (offset + len - dLeft), len);
                        }
                        dLeft -= dLen;
                        dOff += dLen;
                        cOff += checksumSize;
                    }
                }
                //writing is done below (mainly to handle IOException)
            }

            try {
                if (blockInPosition >= 0) {
                    //use transferTo(). Checks on out and blockIn are already done. 

                    SocketOutputStream sockOut = (SocketOutputStream) out;
                    //first write the packet
                    sockOut.write(buf, 0, dataOff);
                    // no need to flush, since we know out is not a buffered stream.

                    sockOut.transferToFully(((FileInputStream) blockIn).getChannel(), blockInPosition, len);

                    blockInPosition += len;
                } else {
                    // normal transfer
                    out.write(buf, 0, dataOff + len);
                }

            } catch (IOException e) {
                /* Exception while writing to the client (with transferTo(),
                 * it could also be while reading from the local file). Many times
                 * this error can be ignored. We rethrow a plain IOException as a
                 * SocketException so that callers can distinguish it from other
                 * IOException subclasses.
                 */
                if (e.getClass().equals(IOException.class)) {
                    // "se" could be a new exception class instead of SocketException.
                    IOException se = new SocketException("Original Exception : " + e);
                    se.initCause(e);
                    /* Change the stack trace so that the original trace is not
                     * truncated when printed. */
                    se.setStackTrace(e.getStackTrace());
                    throw se;
                }
                throw e;
            }

            if (throttler != null) { // rebalancing so throttle
                throttler.throttle(packetLen);
            }

            return len;
        }
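
        /*
         * Editor's note: an illustrative sketch, not part of the original
         * source, of the zero-copy path taken above when blockInPosition >= 0.
         * After the header and checksums are written normally,
         * FileChannel.transferTo() moves the block data from the file to the
         * socket without copying through user space. A plain-Java equivalent
         * of SocketOutputStream.transferToFully() is roughly:
         *
         *   static void transferFully(java.nio.channels.FileChannel fc,
         *           long pos, long count,
         *           java.nio.channels.WritableByteChannel target)
         *           throws java.io.IOException {
         *       while (count > 0) {
         *           long n = fc.transferTo(pos, count, target);
         *           pos += n;
         *           count -= n;
         *       }
         *   }
         */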

        /**
         * sendBlock() is used to read a block and its metadata and stream the
         * data to either a client or to another datanode.
         * 
         * @param out  stream to which the block is written
         * @param baseStream optional. if non-null, <code>out</code> is assumed to 
         *        be a wrapper over this stream. This enables optimizations for
         *        sending the data, e.g. 
         *        {@link SocketOutputStream#transferToFully(FileChannel, 
         *        long, int)}.
         * @param throttler for sending data.
         * @return total bytes read, including crc.
         */
        long sendBlock(DataOutputStream out, OutputStream baseStream, Throttler throttler) throws IOException {
            if (out == null) {
                throw new IOException("out stream is null");
            }
            this.throttler = throttler;

            long initialOffset = offset;
            long totalRead = 0;
            OutputStream streamForSendChunks = out;

            try {
                checksum.writeHeader(out);
                if (chunkOffsetOK) {
                    out.writeLong(offset);
                }
                out.flush();

                int maxChunksPerPacket;
                int pktSize = PKT_HEADER_LEN + SIZE_OF_INTEGER;

                if (transferToAllowed && !verifyChecksum && baseStream instanceof SocketOutputStream
                        && blockIn instanceof FileInputStream) {

                    FileChannel fileChannel = ((FileInputStream) blockIn).getChannel();

                    // blockInPosition also indicates sendChunks() uses transferTo.
                    blockInPosition = fileChannel.position();
                    streamForSendChunks = baseStream;

                    // ensure a minimum buffer size.
                    maxChunksPerPacket = (Math.max(BUFFER_SIZE, MIN_BUFFER_WITH_TRANSFERTO) + bytesPerChecksum - 1)
                            / bytesPerChecksum;

                    // allocate smaller buffer while using transferTo(). 
                    pktSize += checksumSize * maxChunksPerPacket;
                } else {
                    maxChunksPerPacket = Math.max(1, (BUFFER_SIZE + bytesPerChecksum - 1) / bytesPerChecksum);
                    pktSize += (bytesPerChecksum + checksumSize) * maxChunksPerPacket;
                }

                ByteBuffer pktBuf = ByteBuffer.allocate(pktSize);

                while (endOffset > offset) {
                    long len = sendChunks(pktBuf, maxChunksPerPacket, streamForSendChunks);
                    offset += len;
                    totalRead += len + ((len + bytesPerChecksum - 1) / bytesPerChecksum * checksumSize);
                    seqno++;
                }
                out.writeInt(0); // mark the end of block        
                out.flush();
            } finally {
                close();
            }

            blockReadFully = (initialOffset == 0 && offset >= blockLength);

            return totalRead;
        }

        boolean isBlockReadFully() {
            return blockReadFully;
        }
    }

    // This information is cached by the Datanode in the ackQueue
    static private class Packet {
        long seqno;
        boolean lastPacketInBlock;

        Packet(long seqno, boolean lastPacketInBlock) {
            this.seqno = seqno;
            this.lastPacketInBlock = lastPacketInBlock;
        }
    }

    /**
     * Processes responses from downstream datanodes in the pipeline
     * and sends back replies to the upstream originator.
     */
    class PacketResponder implements Runnable {
        private LinkedList<Packet> ackQueue = new LinkedList<Packet>(); // packet waiting for ack
        private volatile boolean running = true;
        private Block block;
        DataInputStream mirrorIn; // input from downstream datanode
        DataOutputStream replyOut; // output to upstream datanode
        private int numTargets; // number of downstream datanodes in the pipeline
        private String clientName; // The name of the client (if any)
        private BlockReceiver receiver; // The owner of this responder.

        public String toString() {
            return "PacketResponder " + numTargets + " for Block " + this.block;
        }

        PacketResponder(BlockReceiver receiver, Block b, DataInputStream in, DataOutputStream out, int numTargets,
                String clientName) {
            this.receiver = receiver;
            this.block = b;
            mirrorIn = in;
            replyOut = out;
            this.numTargets = numTargets;
            this.clientName = clientName;
        }

        // enqueue the seqno that is still to be acked by the downstream datanode
        synchronized void enqueue(long seqno, boolean lastPacketInBlock) {
            if (running) {
                LOG.debug("PacketResponder " + numTargets + " adding seqno " + seqno + " to ack queue.");
                ackQueue.addLast(new Packet(seqno, lastPacketInBlock));
                notifyAll();
            }
        }

        // wait for all pending packets to be acked, then shut down the thread.
        synchronized void close() {
            while (running && ackQueue.size() != 0 && shouldRun) {
                try {
                    wait();
                } catch (InterruptedException e) {
                    running = false;
                }
            }
            LOG.debug("PacketResponder " + numTargets + " for block " + block + " Closing down.");
            running = false;
            notifyAll();
        }

        private synchronized void lastDataNodeRun() {
            long lastHeartbeat = System.currentTimeMillis();
            boolean lastPacket = false;

            while (running && shouldRun && !lastPacket) {
                long now = System.currentTimeMillis();
                try {

                    // wait for a packet to be sent to downstream datanode
                    while (running && shouldRun && ackQueue.size() == 0) {
                        long idle = now - lastHeartbeat;
                        long timeout = (socketTimeout / 2) - idle;
                        if (timeout <= 0) {
                            timeout = 1000;
                        }
                        try {
                            wait(timeout);
                        } catch (InterruptedException e) {
                            if (running) {
                                LOG.info("PacketResponder " + numTargets + " for block " + block + " Interrupted.");
                                running = false;
                            }
                            break;
                        }

                        // send a heartbeat if it is time.
                        now = System.currentTimeMillis();
                        if (now - lastHeartbeat > socketTimeout / 2) {
                            replyOut.writeLong(-1); // send heartbeat
                            replyOut.flush();
                            lastHeartbeat = now;
                        }
                    }

                    if (!running || !shouldRun) {
                        break;
                    }
                    Packet pkt = ackQueue.removeFirst();
                    long expected = pkt.seqno;
                    notifyAll();
                    LOG.debug("PacketResponder " + numTargets + " for block " + block + " acking for packet "
                            + expected);

                    // If this is the last packet in block, then close block
                    // file and finalize the block before responding success
                    if (pkt.lastPacketInBlock) {
                        if (!receiver.finalized) {
                            receiver.close();
                            block.setNumBytes(receiver.offsetInBlock);
                            data.finalizeBlock(block);
                            myMetrics.blocksWritten.inc();
                            notifyNamenodeReceivedBlock(block, EMPTY_DEL_HINT);
                            LOG.info("Received block " + block + " of size " + block.getNumBytes() + " from "
                                    + receiver.inAddr);
                        }
                        lastPacket = true;
                    }

                    replyOut.writeLong(expected);
                    replyOut.writeShort(OP_STATUS_SUCCESS);
                    replyOut.flush();
                } catch (Exception e) {
                    if (running) {
                        LOG.info("PacketResponder " + block + " " + numTargets + " Exception "
                                + StringUtils.stringifyException(e));
                        running = false;
                    }
                }
            }
            LOG.info("PacketResponder " + numTargets + " for block " + block + " terminating");
        }

        // Thread to process incoming acks
        public void run() {

            // If this is the last datanode in pipeline, then handle differently
            if (numTargets == 0) {
                lastDataNodeRun();
                return;
            }

            boolean lastPacketInBlock = false;
            while (running && shouldRun && !lastPacketInBlock) {

                try {
                    short op = OP_STATUS_SUCCESS;
                    boolean didRead = false;
                    long expected = -2;
                    try {
                        // read seqno from downstream datanode
                        long seqno = mirrorIn.readLong();
                        didRead = true;
                        if (seqno == -1) {
                            replyOut.writeLong(-1); // send keepalive
                            replyOut.flush();
                            LOG.debug("PacketResponder " + numTargets + " got -1");
                            continue;
                        } else if (seqno == -2) {
                            LOG.debug("PacketResponder " + numTargets + " got -2");
                        } else {
                            LOG.debug("PacketResponder " + numTargets + " got seqno = " + seqno);
                            Packet pkt = null;
                            synchronized (this) {
                                while (running && shouldRun && ackQueue.size() == 0) {
                                    if (LOG.isDebugEnabled()) {
                                        LOG.debug("PacketResponder " + numTargets + " seqno = " + seqno
                                                + " for block " + block
                                                + " waiting for local datanode to finish write.");
                                    }
                                    wait();
                                }
                                pkt = ackQueue.removeFirst();
                                expected = pkt.seqno;
                                notifyAll();
                                LOG.debug("PacketResponder " + numTargets + " seqno = " + seqno);
                                if (seqno != expected) {
                                    throw new IOException("PacketResponder " + numTargets + " for block " + block
                                            + " expected seqno:" + expected + " received:" + seqno);
                                }
                                lastPacketInBlock = pkt.lastPacketInBlock;
                            }
                        }
                    } catch (Throwable e) {
                        if (running) {
                            LOG.info("PacketResponder " + block + " " + numTargets + " Exception "
                                    + StringUtils.stringifyException(e));
                            running = false;
                        }
                    }

                    if (Thread.interrupted()) {
                        /* The receiver thread cancelled this thread. 
                         * We could also check any other status updates from the 
                         * receiver thread (e.g. if it is ok to write to replyOut). 
                         */
                        LOG.info("PacketResponder " + block + " " + numTargets + " : Thread is interrupted.");
                        running = false;
                    }

                    if (!didRead) {
                        op = OP_STATUS_ERROR;
                    }

                    // If this is the last packet in block, then close block
                    // file and finalize the block before responding success
                    if (lastPacketInBlock && !receiver.finalized) {
                        receiver.close();
                        block.setNumBytes(receiver.offsetInBlock);
                        data.finalizeBlock(block);
                        myMetrics.blocksWritten.inc();
                        notifyNamenodeReceivedBlock(block, EMPTY_DEL_HINT);
                        LOG.info("Received block " + block + " of size " + block.getNumBytes() + " from "
                                + receiver.inAddr);
                    }

                    // send my status back to upstream datanode
                    replyOut.writeLong(expected); // send seqno upstream
                    replyOut.writeShort(OP_STATUS_SUCCESS);

                    LOG.debug("PacketResponder " + numTargets + " for block " + block + " responded my status "
                            + " for seqno " + expected);

                    // forward responses from downstream datanodes.
                    for (int i = 0; i < numTargets && shouldRun; i++) {
                        try {
                            if (op == OP_STATUS_SUCCESS) {
                                op = mirrorIn.readShort();
                                if (op != OP_STATUS_SUCCESS) {
                                    LOG.debug("PacketResponder for block " + block
                                            + ": error code received from downstream " + " datanode[" + i + "] "
                                            + op);
                                }
                            }
                        } catch (Throwable e) {
                            op = OP_STATUS_ERROR;
                        }
                        replyOut.writeShort(op);
                    }
                    replyOut.flush();
                    LOG.debug("PacketResponder " + block + " " + numTargets + " responded other status "
                            + " for seqno " + expected);

                    // If we were unable to read the seqno from downstream, then stop.
                    if (expected == -2) {
                        running = false;
                    }
                    // If we forwarded an error response from a downstream datanode
                    // and we are acting on behalf of a client, then we quit. The 
                    // client will drive the recovery mechanism.
                    if (op == OP_STATUS_ERROR && clientName.length() > 0) {
                        running = false;
                    }
                } catch (IOException e) {
                    if (running) {
                        LOG.info("PacketResponder " + block + " " + numTargets + " Exception "
                                + StringUtils.stringifyException(e));
                        running = false;
                    }
                } catch (RuntimeException e) {
                    if (running) {
                        LOG.info("PacketResponder " + block + " " + numTargets + " Exception "
                                + StringUtils.stringifyException(e));
                        running = false;
                    }
                }
            }
            LOG.info("PacketResponder " + numTargets + " for block " + block + " terminating");
        }
    }
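
    /*
     * Editor's note: an illustrative sketch, not part of the original source,
     * of what the upstream side of the pipeline reads back from the
     * PacketResponder above: a long seqno (-1 is a heartbeat with no statuses
     * attached), followed by one 2-byte status for this datanode and one per
     * downstream datanode. The stream name replyIn is hypothetical.
     *
     *   long ack = replyIn.readLong();
     *   if (ack == -1) {
     *       // heartbeat; keeps the pipeline alive while writes are slow
     *   } else {
     *       short status = replyIn.readShort(); // this datanode's status
     *       // ... plus one readShort() per downstream datanode
     *   }
     */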

    /* A class that receives a block and writes it to its own disk, while
     * possibly also copying it to another datanode. If a throttler is
     * provided, streaming throttling is also supported.
     */
    private class BlockReceiver implements java.io.Closeable {
        private Block block; // the block to receive
        private boolean finalized;
        private DataInputStream in = null; // from where data are read
        private DataChecksum checksum; // checksum algorithm for the incoming data
        private OutputStream out = null; // to block file at local disk
        private DataOutputStream checksumOut = null; // to crc file at local disk
        private int bytesPerChecksum;
        private int checksumSize;
        private ByteBuffer buf; // contains one full packet.
        private int bufRead; //amount of valid data in the buf
        private int maxPacketReadLen;
        private long offsetInBlock;
        final private String inAddr;
        private String mirrorAddr;
        private DataOutputStream mirrorOut;
        private Daemon responder = null;
        private Throttler throttler;
        private FSDataset.BlockWriteStreams streams;
        private boolean isRecovery = false;
        private String clientName;
        DatanodeInfo srcDataNode = null;

        BlockReceiver(Block block, DataInputStream in, String inAddr, boolean isRecovery, String clientName,
                DatanodeInfo srcDataNode) throws IOException {
            try {
                this.block = block;
                this.in = in;
                this.inAddr = inAddr;
                this.isRecovery = isRecovery;
                this.clientName = clientName;
                this.offsetInBlock = 0;
                this.srcDataNode = srcDataNode;
                this.checksum = DataChecksum.newDataChecksum(in);
                this.bytesPerChecksum = checksum.getBytesPerChecksum();
                this.checksumSize = checksum.getChecksumSize();
                //
                // Open local disk out
                //
                streams = data.writeToBlock(block, isRecovery);
                this.finalized = data.isValidBlock(block);
                if (streams != null) {
                    this.out = streams.dataOut;
                    this.checksumOut = new DataOutputStream(
                            new BufferedOutputStream(streams.checksumOut, SMALL_BUFFER_SIZE));
                }
            } catch (IOException ioe) {
                IOUtils.closeStream(this);
                removeBlock();

                // check if there is a disk error
                IOException cause = FSDataset.getCauseIfDiskError(ioe);
                if (cause != null) { // possible disk error
                    ioe = cause;
                    checkDiskError(ioe);
                }
                throw ioe;
            }
        }

        // close files
        public void close() throws IOException {

            IOException ioe = null;
            // close checksum file
            try {
                if (checksumOut != null) {
                    checksumOut.close();
                    checksumOut = null;
                }
            } catch (IOException e) {
                ioe = e;
            }
            // close block file
            try {
                if (out != null) {
                    out.close();
                    out = null;
                }
            } catch (IOException e) {
                ioe = e;
            }
            // disk check
            if (ioe != null) {
                checkDiskError(ioe);
                throw ioe;
            }
        }

        // flush block data and metadata files to disk.
        void flush() throws IOException {
            if (checksumOut != null) {
                checksumOut.flush();
            }
            if (out != null) {
                out.flush();
            }
        }

        /**
         * While writing to mirrorOut, failure to write to mirror should not
         * affect this datanode unless a client is writing the block.
         */
        private void handleMirrorOutError(IOException ioe) throws IOException {
            LOG.info(dnRegistration + ":Exception writing block " + block + " to mirror " + mirrorAddr + "\n"
                    + StringUtils.stringifyException(ioe));
            mirrorOut = null;
            //
            // If stream-copy fails, continue
            // writing to disk for replication requests. For client
            // writes, return error so that the client can do error
            // recovery.
            //
            if (clientName.length() > 0) {
                throw ioe;
            }
        }

        /**
         * Verify multiple CRC chunks. 
         */
        private void verifyChunks(byte[] dataBuf, int dataOff, int len, byte[] checksumBuf, int checksumOff)
                throws IOException {
            while (len > 0) {
                int chunkLen = Math.min(len, bytesPerChecksum);

                checksum.update(dataBuf, dataOff, chunkLen);

                if (!checksum.compare(checksumBuf, checksumOff)) {
                    if (srcDataNode != null) {
                        try {
                            LOG.info("report corrupt block " + block + " from datanode " + srcDataNode
                                    + " to namenode");
                            LocatedBlock lb = new LocatedBlock(block, new DatanodeInfo[] { srcDataNode });
                            namenode.reportBadBlocks(new LocatedBlock[] { lb });
                        } catch (IOException e) {
                            LOG.warn("Failed to report bad block " + block + " from datanode " + srcDataNode
                                    + " to namenode");
                        }
                    }
                    throw new IOException(
                            "Unexpected checksum mismatch " + "while writing " + block + " from " + inAddr);
                }

                checksum.reset();
                dataOff += chunkLen;
                checksumOff += checksumSize;
                len -= chunkLen;
            }
        }

        /**
         * Makes sure buf.position() is zero without modifying buf.remaining().
         * It moves the data if position needs to be changed.
         */
        private void shiftBufData() {
            if (bufRead != buf.limit()) {
                throw new IllegalStateException("bufRead should be same as " + "buf.limit()");
            }

            //shift the remaining data on buf to the front
            if (buf.position() > 0) {
                int dataLeft = buf.remaining();
                if (dataLeft > 0) {
                    byte[] b = buf.array();
                    System.arraycopy(b, buf.position(), b, 0, dataLeft);
                }
                buf.position(0);
                bufRead = dataLeft;
                buf.limit(bufRead);
            }
        }
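
        /*
         * Editor's note: an illustrative aside, not part of the original
         * source. For an array-backed buffer, shiftBufData() above is roughly
         * the standard ByteBuffer idiom below, except that it also keeps the
         * bufRead bookkeeping in sync:
         *
         *   buf.compact(); // copy remaining bytes to the front;
         *                  // position = remaining, limit = capacity
         *   buf.flip();    // position = 0, limit = remaining
         *   bufRead = buf.limit();
         */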

        /**
         * Reads up to toRead bytes into buf at buf.limit() and increments the
         * limit. Throws an IOException if the read does not succeed.
         */
        private int readToBuf(int toRead) throws IOException {
            if (toRead < 0) {
                toRead = (maxPacketReadLen > 0 ? maxPacketReadLen : buf.capacity()) - buf.limit();
            }

            int nRead = in.read(buf.array(), buf.limit(), toRead);

            if (nRead < 0) {
                throw new EOFException("while trying to read " + toRead + " bytes");
            }
            bufRead = buf.limit() + nRead;
            buf.limit(bufRead);
            return nRead;
        }

        /**
         * Reads (at least) one packet and returns the packet length.
         * buf.position() points to the start of the packet and 
         * buf.limit() points to the end of the packet. There could 
         * be more data from the next packet in buf.<br><br>
         * 
         * It tries to read a full packet with a single read call.
         * Consecutive packets are usually of the same length.
         */
        private int readNextPacket() throws IOException {
            /* This dances around buf a little bit, mainly to read a 
             * full packet with a single read and to accept an arbitrary size  
             * for the next packet at the same time.
             */
            if (buf == null) {
                /* initialize buffer to the best guess size:
                 * 'chunksPerPacket' calculation here should match the same 
                 * calculation in DFSClient to make the guess accurate.
                 */
                int chunkSize = bytesPerChecksum + checksumSize;
                int chunksPerPacket = (writePacketSize - PKT_HEADER_LEN - SIZE_OF_INTEGER + chunkSize - 1)
                        / chunkSize;
                buf = ByteBuffer
                        .allocate(PKT_HEADER_LEN + SIZE_OF_INTEGER + Math.max(chunksPerPacket, 1) * chunkSize);
                buf.limit(0);
            }

            // See if there is data left in the buffer :
            if (bufRead > buf.limit()) {
                buf.limit(bufRead);
            }

            while (buf.remaining() < SIZE_OF_INTEGER) {
                if (buf.position() > 0) {
                    shiftBufData();
                }
                readToBuf(-1);
            }

            /* We mostly have the full packet or at least enough for an int
             */
            buf.mark();
            int payloadLen = buf.getInt();
            buf.reset();

            if (payloadLen == 0) {
                //end of stream!
                buf.limit(buf.position() + SIZE_OF_INTEGER);
                return 0;
            }

            // check for corrupt values of payloadLen; a 100MB upper limit should be ok
            if (payloadLen < 0 || payloadLen > (100 * 1024 * 1024)) {
                throw new IOException("Incorrect value for packet payload : " + payloadLen);
            }

            int pktSize = payloadLen + PKT_HEADER_LEN;

            if (buf.remaining() < pktSize) {
                //we need to read more data
                int toRead = pktSize - buf.remaining();

                // first make sure buf has enough space.        
                int spaceLeft = buf.capacity() - buf.limit();
                if (toRead > spaceLeft && buf.position() > 0) {
                    shiftBufData();
                    spaceLeft = buf.capacity() - buf.limit();
                }
                if (toRead > spaceLeft) {
                    byte oldBuf[] = buf.array();
                    int toCopy = buf.limit();
                    buf = ByteBuffer.allocate(toCopy + toRead);
                    System.arraycopy(oldBuf, 0, buf.array(), 0, toCopy);
                    buf.limit(toCopy);
                }

                //now read:
                while (toRead > 0) {
                    toRead -= readToBuf(toRead);
                }
            }

            if (buf.remaining() > pktSize) {
                buf.limit(buf.position() + pktSize);
            }

            if (pktSize > maxPacketReadLen) {
                maxPacketReadLen = pktSize;
            }

            return payloadLen;
        }

        /** 
         * Receives and processes a packet. It can contain many chunks.
         * Returns the size of the packet.
         */
        private int receivePacket() throws IOException {

            int payloadLen = readNextPacket();

            if (payloadLen <= 0) {
                return payloadLen;
            }

            buf.mark();
            //read the header
            buf.getInt(); // packet length
            offsetInBlock = buf.getLong(); // get offset of packet in block
            long seqno = buf.getLong(); // get seqno
            boolean lastPacketInBlock = (buf.get() != 0);

            int endOfHeader = buf.position();
            buf.reset();

            if (LOG.isDebugEnabled()) {
                LOG.debug("Receiving one packet for block " + block + " of length " + payloadLen + " seqno " + seqno
                        + " offsetInBlock " + offsetInBlock + " lastPacketInBlock " + lastPacketInBlock);
            }

            setBlockPosition(offsetInBlock);

            //First write the packet to the mirror:
            if (mirrorOut != null) {
                try {
                    mirrorOut.write(buf.array(), buf.position(), buf.remaining());
                    mirrorOut.flush();
                } catch (IOException e) {
                    handleMirrorOutError(e);
                }
            }

            buf.position(endOfHeader);
            int len = buf.getInt();

            if (len < 0) {
                throw new IOException("Got wrong length during writeBlock(" + block + ") from " + inAddr
                        + " at offset " + offsetInBlock + ": " + len);
            }

            if (len == 0) {
                LOG.debug("Receiving empty packet for block " + block);
            } else {
                offsetInBlock += len;

                int checksumLen = ((len + bytesPerChecksum - 1) / bytesPerChecksum) * checksumSize;

                if (buf.remaining() != (checksumLen + len)) {
                    throw new IOException(
                            "Data remaining in packet does not match " + "sum of checksumLen and dataLen");
                }
                int checksumOff = buf.position();
                int dataOff = checksumOff + checksumLen;
                byte pktBuf[] = buf.array();

                buf.position(buf.limit()); // move to the end of the data.

                verifyChunks(pktBuf, dataOff, len, pktBuf, checksumOff);

                try {
                    if (!finalized) {
                        //finally write to the disk :
                        out.write(pktBuf, dataOff, len);
                        checksumOut.write(pktBuf, checksumOff, checksumLen);
                        myMetrics.bytesWritten.inc(len);
                    }
                } catch (IOException iex) {
                    checkDiskError(iex);
                    throw iex;
                }
            }

            // flush the entire packet before sending the ack
            flush();

            // put in queue for pending acks
            if (responder != null) {
                ((PacketResponder) responder.getRunnable()).enqueue(seqno, lastPacketInBlock);
            }

            if (throttler != null) { // throttle I/O
                throttler.throttle(payloadLen);
            }

            return payloadLen;
        }

        public void writeChecksumHeader(DataOutputStream mirrorOut) throws IOException {
            checksum.writeHeader(mirrorOut);
        }

        public void receiveBlock(DataOutputStream mirrOut, // output to next datanode
                DataInputStream mirrIn, // input from next datanode
                DataOutputStream replyOut, // output to previous datanode
                String mirrAddr, Throttler throttlerArg, int numTargets) throws IOException {

            mirrorOut = mirrOut;
            mirrorAddr = mirrAddr;
            throttler = throttlerArg;

            try {
                // write data chunk header
                if (!finalized) {
                    BlockMetadataHeader.writeHeader(checksumOut, checksum);
                }
                if (clientName.length() > 0) {
                    responder = new Daemon(threadGroup,
                            new PacketResponder(this, block, mirrIn, replyOut, numTargets, clientName));
                    responder.start(); // start a thread to process responses
                }

                /* 
                 * Receive until packet length is zero.
                 */
                while (receivePacket() > 0) {
                }

                // flush the mirror out
                if (mirrorOut != null) {
                    try {
                        mirrorOut.writeInt(0); // mark the end of the block
                        mirrorOut.flush();
                    } catch (IOException e) {
                        handleMirrorOutError(e);
                    }
                }

                // wait for all outstanding packet responses. And then
                // indicate responder to gracefully shutdown.
                if (responder != null) {
                    ((PacketResponder) responder.getRunnable()).close();
                }

                // if this write is for a replication request (and not
                // from a client), then finalize block. For client-writes, 
                // the block is finalized in the PacketResponder.
                if (clientName.length() == 0) {
                    // close the block/crc files
                    close();

                    // Finalize the block. Does this fsync()?
                    block.setNumBytes(offsetInBlock);
                    data.finalizeBlock(block);
                    myMetrics.blocksWritten.inc();
                }

            } catch (IOException ioe) {
                LOG.info("Exception in receiveBlock for block " + block + " " + ioe);
                IOUtils.closeStream(this);
                if (responder != null) {
                    responder.interrupt();
                }
                removeBlock();
                throw ioe;
            } finally {
                if (responder != null) {
                    try {
                        responder.join();
                    } catch (InterruptedException e) {
                        throw new IOException("Interrupted receiveBlock");
                    }
                    responder = null;
                }
            }
        }

        /** Remove a partial block
         *  if this write was for a replication request (and not from a client).
         */
        private void removeBlock() throws IOException {
            if (clientName.length() == 0) { // not client write
                data.unfinalizeBlock(block);
            }
        }

        /**
         * Sets the file pointer in the local block file to the specified value.
         */
        private void setBlockPosition(long offsetInBlock) throws IOException {
            if (finalized) {
                if (!isRecovery) {
                    throw new IOException("Write to offset " + offsetInBlock + " of block " + block
                            + " that is already finalized.");
                }
                if (offsetInBlock > data.getLength(block)) {
                    throw new IOException("Write to offset " + offsetInBlock + " of block " + block
                            + " that is already finalized and is of size " + data.getLength(block));
                }
                return;
            }

            if (data.getChannelPosition(block, streams) == offsetInBlock) {
                return; // nothing to do 
            }
            if (offsetInBlock % bytesPerChecksum != 0) {
                throw new IOException("setBlockPosition trying to set position to " + offsetInBlock
                        + " which is not a multiple of bytesPerChecksum " + bytesPerChecksum);
            }
            long offsetInChecksum = BlockMetadataHeader.getHeaderSize()
                    + offsetInBlock / bytesPerChecksum * checksumSize;
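            // For example (assuming the common defaults of bytesPerChecksum
            // = 512 and a 4-byte CRC-32 checksum): a block offset of 4096
            // covers 4096/512 = 8 chunks, so the meta-file position is
            // headerSize + 8 * 4 = headerSize + 32.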
            if (out != null) {
                out.flush();
            }
            if (checksumOut != null) {
                checksumOut.flush();
            }
            LOG.info("Changing block file offset of block " + block + " from "
                    + data.getChannelPosition(block, streams) + " to " + offsetInBlock + " meta file offset to "
                    + offsetInChecksum);

            // set the position of the block file
            data.setChannelPosition(block, streams, offsetInBlock, offsetInChecksum);
        }
    }

    /**
     * Used for transferring a block of data.  This class
     * sends a piece of data to another DataNode.
     */
    class DataTransfer implements Runnable {
        DatanodeInfo targets[];
        Block b;

        /**
         * Prepare a transfer of the given block.  When run, this connects
         * to the first target in the list and passes along the remaining
         * targets, the block header, and the block data.
         */
        public DataTransfer(DatanodeInfo targets[], Block b) throws IOException {
            this.targets = targets;
            this.b = b;
        }

        /**
         * Do the deed, write the bytes
         */
        public void run() {
            xmitsInProgress++;
            Socket sock = null;
            DataOutputStream out = null;
            BlockSender blockSender = null;

            try {
                InetSocketAddress curTarget = NetUtils.createSocketAddr(targets[0].getName());
                sock = newSocket();
                sock.connect(curTarget, socketTimeout);
                sock.setSoTimeout(targets.length * socketTimeout);
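                // note: the read and write timeouts scale with the number of
                // targets, presumably to leave time for the block to be
                // forwarded down the rest of the pipeline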

                long writeTimeout = socketWriteTimeout + WRITE_TIMEOUT_EXTENSION * (targets.length - 1);
                OutputStream baseStream = NetUtils.getOutputStream(sock, writeTimeout);
                out = new DataOutputStream(new BufferedOutputStream(baseStream, SMALL_BUFFER_SIZE));

                blockSender = new BlockSender(b, 0, -1, false, false, false);
                DatanodeInfo srcNode = new DatanodeInfo(dnRegistration);

                //
                // Header info
                //
                out.writeShort(DATA_TRANSFER_VERSION);
                out.writeByte(OP_WRITE_BLOCK);
                out.writeLong(b.getBlockId());
                out.writeLong(b.getGenerationStamp());
                out.writeInt(0); // no pipelining
                out.writeBoolean(false); // not part of recovery
                Text.writeString(out, ""); // client
                out.writeBoolean(true); // sending src node information
                srcNode.write(out); // Write src node DatanodeInfo
                // forward the remaining targets; targets[0] is the node we are connected to
                out.writeInt(targets.length - 1);
                for (int i = 1; i < targets.length; i++) {
                    targets[i].write(out);
                }
                // send data & checksum
                blockSender.sendBlock(out, baseStream, null);

                // no response necessary
                LOG.info(dnRegistration + ":Transmitted block " + b + " to " + curTarget);

            } catch (IOException ie) {
                LOG.warn(dnRegistration + ":Failed to transfer " + b + " to " + targets[0].getName() + " got "
                        + StringUtils.stringifyException(ie));
            } finally {
                IOUtils.closeStream(blockSender);
                IOUtils.closeStream(out);
                IOUtils.closeSocket(sock);
                xmitsInProgress--;
            }
        }
    }

    /**
     * No matter what kind of exception we get, keep retrying offerService().
     * That's the loop that connects to the NameNode and provides basic DataNode
     * functionality.
     *
     * Only stop when "shouldRun" is turned off (which can only happen at shutdown).
     */
    public void run() {
        LOG.info(dnRegistration + "In DataNode.run, data = " + data);

        // start dataXceiveServer
        dataXceiveServer.start();

        while (shouldRun) {
            try {
                startDistributedUpgradeIfNeeded();
                offerService();
            } catch (Exception ex) {
                LOG.error("Exception: " + StringUtils.stringifyException(ex));
                if (shouldRun) {
                    try {
                        Thread.sleep(5000);
                    } catch (InterruptedException ie) {
                    }
                }
            }
        }

        // wait for dataXceiveServer to terminate
        try {
            this.dataXceiveServer.join();
        } catch (InterruptedException ie) {
        }

        LOG.info(dnRegistration + ":Finishing DataNode in: " + data);
        shutdown();
    }

    /** Register the datanode with the namenode and start its daemon
     *  thread.  Use {@link #join()} to wait for it to finish.
     */
    static void runDatanodeDaemon(DataNode dn) throws IOException {
        if (dn != null) {
            //register datanode
            dn.register();
            dn.dataNodeThread = new Thread(dn, dnThreadName);
            dn.dataNodeThread.setDaemon(true); // needed for JUnit testing
            dn.dataNodeThread.start();
        }
    }

    /** check if a datanode is up */
    static boolean isDatanodeUp(DataNode dn) {
        return dn.dataNodeThread != null && dn.dataNodeThread.isAlive();
    }

    /** Instantiate a single datanode object.  The instance must then be
     *  started by invoking {@link DataNode#runDatanodeDaemon(DataNode)}.
     */
    static DataNode instantiateDataNode(String args[], Configuration conf) throws IOException {
        if (conf == null)
            conf = new Configuration();
        if (!parseArguments(args, conf)) {
            printUsage();
            return null;
        }
        if (conf.get("dfs.network.script") != null) {
            LOG.error("This configuration for rack identification is not supported"
                    + " anymore. RackID resolution is handled by the NameNode.");
            System.exit(-1);
        }
        String[] dataDirs = conf.getStrings("dfs.data.dir");
        dnThreadName = "DataNode: [" + StringUtils.arrayToString(dataDirs) + "]";
        return makeInstance(dataDirs, conf);
    }

    /** Instantiate and start a single datanode daemon.  Callers that need
     *  to block until shutdown should invoke {@link #join()} on the
     *  returned instance.
     */
    static DataNode createDataNode(String args[], Configuration conf) throws IOException {
        DataNode dn = instantiateDataNode(args, conf);
        runDatanodeDaemon(dn);
        return dn;
    }

    void join() {
        if (dataNodeThread != null) {
            try {
                dataNodeThread.join();
            } catch (InterruptedException e) {
            }
        }
    }

    /**
     * Make an instance of DataNode after ensuring that at least one of the
     * given data directories (and their parent directories, if necessary)
     * can be created.
     * @param dataDirs List of directories, where the new DataNode instance should
     * keep its files.
     * @param conf Configuration instance to use.
     * @return DataNode instance for the given list of data dirs and conf, or
     * null if none of the directories can be created or validated.
     * @throws IOException
     */
    static DataNode makeInstance(String[] dataDirs, Configuration conf) throws IOException {
        ArrayList<File> dirs = new ArrayList<File>();
        for (int i = 0; i < dataDirs.length; i++) {
            File data = new File(dataDirs[i]);
            try {
                DiskChecker.checkDir(data);
                dirs.add(data);
            } catch (DiskErrorException e) {
                LOG.warn("Invalid directory in dfs.data.dir: " + e.getMessage());
            }
        }
        if (dirs.size() > 0)
            return new DataNode(conf, dirs);
        LOG.error("All directories in dfs.data.dir are invalid.");
        return null;
    }

    @Override
    public String toString() {
        return "DataNode{" + "data=" + data + ", localName='" + dnRegistration.getName() + "'" + ", storageID='"
                + dnRegistration.getStorageID() + "'" + ", xmitsInProgress=" + xmitsInProgress + "}";
    }

    private static void printUsage() {
        System.err.println("Usage: java DataNode");
        System.err.println("           [-rollback]");
    }

    /**
     * Parse and verify command line arguments and set configuration parameters.
     *
     * @return false if the passed arguments are incorrect
     */
    private static boolean parseArguments(String args[], Configuration conf) {
        int argsLen = (args == null) ? 0 : args.length;
        StartupOption startOpt = StartupOption.REGULAR;
        for (int i = 0; i < argsLen; i++) {
            String cmd = args[i];
            if ("-r".equalsIgnoreCase(cmd) || "--rack".equalsIgnoreCase(cmd)) {
                LOG.error("-r, --rack arguments are not supported anymore. RackID "
                        + "resolution is handled by the NameNode.");
                System.exit(-1);
            } else if ("-rollback".equalsIgnoreCase(cmd)) {
                startOpt = StartupOption.ROLLBACK;
            } else if ("-regular".equalsIgnoreCase(cmd)) {
                startOpt = StartupOption.REGULAR;
            } else
                return false;
        }
        setStartupOption(conf, startOpt);
        return true;
    }

    private static void setStartupOption(Configuration conf, StartupOption opt) {
        conf.set("dfs.datanode.startup", opt.toString());
    }

    static StartupOption getStartupOption(Configuration conf) {
        return StartupOption.valueOf(conf.get("dfs.datanode.startup", StartupOption.REGULAR.toString()));
    }

    /**
     * This method arranges for the datanode to send a block report, either
     * after a random delay of at most {@code delay} ms, or at the next
     * heartbeat if {@code delay} is zero or negative.
     */
    public void scheduleBlockReport(long delay) {
        if (delay > 0) { // send BR after random delay
            lastBlockReport = System.currentTimeMillis() - (blockReportInterval - R.nextInt((int) (delay)));
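            // i.e. the next report falls due at a uniformly random time
            // within the next 'delay' ms, spreading block reports out so
            // they do not all hit the namenode at once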
        } else { // send at next heartbeat
            lastBlockReport = lastHeartbeat - blockReportInterval;
        }
        resetBlockReportTime = true; // reset future BRs for randomness
    }

    /**
     * This method is used for testing, e.g. for adding and deleting blocks
     * directly.  The most common usage is when the datanode's storage is
     * simulated.
     *
     * @return the fsdataset that stores the blocks
     */
    public FSDatasetInterface getFSDataset() {
        return data;
    }

    /**
     * Command-line entry point: create and start a datanode, then wait
     * for it to finish.
     */
    public static void main(String args[]) {
        try {
            StringUtils.startupShutdownMessage(DataNode.class, args, LOG);
            DataNode datanode = createDataNode(args, null);
            if (datanode != null)
                datanode.join();
        } catch (Throwable e) {
            LOG.error(StringUtils.stringifyException(e));
            System.exit(-1);
        }
    }

    // InterDataNodeProtocol implementation
    /** {@inheritDoc} */
    public BlockMetaDataInfo getBlockMetaDataInfo(Block block) throws IOException {
        if (LOG.isDebugEnabled()) {
            LOG.debug("block=" + block);
        }
        Block stored = data.getStoredBlock(block.blkid);

        if (stored == null) {
            return null;
        }
        BlockMetaDataInfo info = new BlockMetaDataInfo(stored, blockScanner.getLastScanTime(stored));
        if (LOG.isDebugEnabled()) {
            LOG.debug("getBlockMetaDataInfo successful block=" + stored + " length " + stored.getNumBytes()
                    + " genstamp " + stored.getGenerationStamp());
        }

        // paranoia! verify that the contents of the stored block
        // match the block file on disk.
        data.validateBlockMetadata(stored);
        return info;
    }

    Daemon recoverBlocks(final Block[] blocks, final DatanodeInfo[][] targets) {
        Daemon d = new Daemon(threadGroup, new Runnable() {
            public void run() {
                LeaseManager.recoverBlocks(blocks, targets, DataNode.this, namenode, getConf());
            }
        });
        d.start();
        return d;
    }

    /** {@inheritDoc} */
    public void updateBlock(Block oldblock, Block newblock, boolean finalize) throws IOException {
        LOG.info("oldblock=" + oldblock + ", newblock=" + newblock + ", datanode=" + dnRegistration.getName());
        data.updateBlock(oldblock, newblock);
        if (finalize) {
            data.finalizeBlock(newblock);
            myMetrics.blocksWritten.inc();
            notifyNamenodeReceivedBlock(newblock, EMPTY_DEL_HINT);
            LOG.info("Received block " + newblock + " of size " + newblock.getNumBytes()
                    + " as part of lease recovery.");
        }
    }

    /** {@inheritDoc} */
    public long getProtocolVersion(String protocol, long clientVersion) throws IOException {
        if (protocol.equals(InterDatanodeProtocol.class.getName())) {
            return InterDatanodeProtocol.versionID;
        } else if (protocol.equals(ClientDatanodeProtocol.class.getName())) {
            return ClientDatanodeProtocol.versionID;
        }
        throw new IOException("Unknown protocol to " + getClass().getSimpleName() + ": " + protocol);
    }

    // ClientDataNodeProtocol implementation
    /** {@inheritDoc} */
    public Block recoverBlock(Block block, DatanodeInfo[] targets) throws IOException {
        logRecoverBlock("Client", block, targets);
        return LeaseManager.recoverBlock(block, targets, this, namenode, getConf(), false);
    }

    static void logRecoverBlock(String who, Block block, DatanodeID[] targets) {
        StringBuilder msg = new StringBuilder(targets[0].getName());
        for (int i = 1; i < targets.length; i++) {
            msg.append(", " + targets[i].getName());
        }
        LOG.info(who + " calls recoverBlock(block=" + block + ", targets=[" + msg + "])");
    }
}
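
/*
 * Illustrative sketch only -- not part of the original Hadoop source.  It
 * shows one way the package-private factory methods above might be driven
 * when embedding a DataNode, e.g. from a test harness in the same package.
 * The "-regular" startup option and the use of a fresh Configuration are
 * assumptions made for the example.
 */
class DataNodeEmbeddingExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        // instantiate without starting; returns null on bad arguments or
        // when no directory listed in dfs.data.dir is usable
        DataNode dn = DataNode.instantiateDataNode(new String[] { "-regular" }, conf);
        if (dn == null) {
            return;
        }
        // register with the namenode and start the daemon thread
        DataNode.runDatanodeDaemon(dn);
        if (DataNode.isDatanodeUp(dn)) {
            dn.join(); // block until the datanode shuts down
        }
    }
}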