org.apache.hadoop.hdfs.MiniAvatarCluster.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.hadoop.hdfs.MiniAvatarCluster.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hdfs;

import java.io.FileOutputStream;
import java.io.File;
import java.io.IOException;
import java.util.Collection;
import java.util.Enumeration;
import java.util.List;
import java.util.Properties;
import java.util.ArrayList;
import java.util.Random;
import java.net.InetSocketAddress;
import java.net.NetworkInterface;
import java.util.concurrent.atomic.AtomicInteger;

import junit.framework.Assert;

import org.apache.zookeeper.server.NIOServerCnxnFactory;
import org.apache.zookeeper.server.ZooKeeperServer;
import org.apache.zookeeper.server.quorum.QuorumPeerConfig.ConfigException;
import org.apache.zookeeper.server.persistence.FileTxnSnapLog;
import org.apache.zookeeper.server.ServerConfig;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.MiniDFSCluster.ShutdownInterface;
import org.apache.hadoop.hdfs.MiniDFSCluster.ShutDownUtil;
import org.apache.hadoop.hdfs.protocol.AvatarConstants;
import org.apache.hadoop.hdfs.protocol.FSConstants;
import org.apache.hadoop.hdfs.qjournal.MiniJournalCluster;
import org.apache.hadoop.hdfs.server.common.HdfsConstants;
import org.apache.hadoop.hdfs.server.common.HdfsConstants.StartupOption;
import org.apache.hadoop.hdfs.server.namenode.AvatarNode;
import org.apache.hadoop.hdfs.server.namenode.NameNode;
import org.apache.hadoop.hdfs.server.namenode.Standby;
import org.apache.hadoop.hdfs.server.namenode.NNStorageDirectoryRetentionManager;
import org.apache.hadoop.hdfs.server.datanode.AvatarDataNode;
import org.apache.hadoop.hdfs.server.datanode.DataNode;
import org.apache.hadoop.hdfs.server.datanode.SimulatedFSDataset;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.net.StaticMapping;
import org.apache.hadoop.net.NetUtils;
import org.apache.hadoop.net.DNSToSwitchMapping;

/**
 * This class manages an Avatar/HDFS cluster with all nodes running
 * locally.
 * To synchronize the AvatarNodes, it uses a local ZooKeeper
 * server.
 */
public class MiniAvatarCluster {

    /** Prefix combined with a numeric id when this class generates nameservice ids. */
    public static final String NAMESERVICE_ID_PREFIX = "nameserviceId";
    // NOTE(review): presumably the counter behind getNSId(), which is defined outside this view -- confirm.
    public static int currNSId = 0;
    // Static, yet overwritten from the Builder every time a cluster instance is constructed.
    public static int instantiationRetries = 15;
    /** Journal id used when asking the journal cluster for its quorum journal URI. */
    public static final String JID = "test-journal";

    /**
     * Holder that keeps an {@link AvatarDataNode} together with the
     * {@link Configuration} and argument array supplied alongside it.
     */
    public static class DataNodeProperties implements ShutdownInterface {
        public AvatarDataNode datanode;
        public Configuration conf;
        public String[] dnArgs;

        DataNodeProperties(AvatarDataNode node, Configuration conf, String[] args) {
            datanode = node;
            this.conf = conf;
            dnArgs = args;
        }

        /**
         * Shuts down the held datanode; does nothing when none is attached.
         *
         * @throws IOException if the datanode's shutdown fails
         */
        @Override
        public void shutdown() throws IOException {
            final AvatarDataNode node = datanode;
            if (node == null) {
                return;
            }
            node.shutdown();
        }
    }

    /** State value recorded on each {@link AvatarInfo}. */
    public static enum AvatarState {
        ACTIVE,
        STANDBY,
        DEAD
    }

    /**
     * Record describing one avatar node: the node object, its state, the four
     * ports associated with it, and the startup option string it was given.
     */
    public static class AvatarInfo implements ShutdownInterface {
        public AvatarNode avatar;
        AvatarState state;
        int nnPort;
        int nnDnPort;
        int httpPort;
        int rpcPort;
        String startupOption;

        AvatarInfo(AvatarNode avatar, AvatarState state, int nnPort, int nnDnPort, int httpPort, int rpcPort,
                String startupOption) {
            // node and its bookkeeping
            this.avatar = avatar;
            this.state = state;
            this.startupOption = startupOption;

            // ports
            this.nnPort = nnPort;
            this.nnDnPort = nnDnPort;
            this.httpPort = httpPort;
            this.rpcPort = rpcPort;
        }

        /**
         * Shuts down the held avatar node, passing {@code true} to its
         * shutdown call; does nothing when no node is attached.
         *
         * @throws IOException if the avatar node's shutdown fails
         */
        @Override
        public void shutdown() throws IOException {
            final AvatarNode node = avatar;
            if (node == null) {
                return;
            }
            node.shutdown(true);
        }
    }

    private static final Log LOG = LogFactory.getLog(MiniAvatarCluster.class);

    // Fallback used when the "test.build.data" system property is not set.
    private static final String DEFAULT_TEST_DIR = "build/contrib/highavailability/test/data";
    /** Absolute path of the root directory under which all test data is placed. */
    public static final String TEST_DIR = new File(System.getProperty("test.build.data", DEFAULT_TEST_DIR))
            .getAbsolutePath();

    // Source of the numeric cluster id written into the configuration by the Builder-based constructor.
    private static final AtomicInteger ClusterId = new AtomicInteger(1);

    // Locations handed to ZooKeeper through the generated config file.
    private static final String ZK_DATA_DIR = TEST_DIR + "/zk.data";
    private static final String ZK_CONF_FILE = TEST_DIR + "/zk.conf";

    /** ZooKeeper client port; obtained once, when this class is initialized. */
    public static final int zkClientPort = MiniDFSCluster.getFreePort();

    // Both directories are static but are reassigned by every Builder-based construction,
    // so they always reflect the most recently constructed cluster.
    private static String baseAvatarDir;
    private static String dataDir;
    // Per-instance options copied from the Builder.
    private int numDataNodes;
    private boolean format;
    private String[] racks;
    private String[] hosts;
    // Taken from the Builder, then forced to true when the configuration lists more than one nameservice id.
    private boolean federation;
    private NameNodeInfo[] nameNodes;
    private final boolean enableQJM;
    private StartupOption startOpt;
    private final int numJournalNodes;
    // Either supplied through the Builder or created by startJournalCluster(); null when QJM is disabled
    // and no cluster was supplied.
    private MiniJournalCluster journalCluster = null;
    // The caller's Configuration object; it is modified in place during construction.
    private Configuration conf;

    /**
     * Tells whether this cluster holds a journal cluster. Some test cases only
     * work with FileJournalManager and use this to find out whether QJM is in
     * play.
     *
     * @return {@code true} when a journal cluster is attached
     */
    public boolean isUsingJournalCluster() {
        final boolean attached = (this.journalCluster != null);
        return attached;
    }

    /**
     * Returns the journal cluster attached to this cluster.
     *
     * @return the journal cluster, never {@code null}
     * @throws IllegalArgumentException if no journal cluster is attached
     */
    public MiniJournalCluster getJournalCluster() {
        if (journalCluster != null) {
            return journalCluster;
        }
        throw new IllegalArgumentException("MiniAvatarCluster not configured to use journal cluster");
    }

    public class NameNodeInfo {
        // Copy of the cluster configuration; rebuilt by updateAvatarConf().
        Configuration conf;
        public ArrayList<AvatarInfo> avatars = null;
        // Shared image/edits directories, one pair per avatar (suffix 0 / 1).
        private final String fsimage0Dir;
        private final String fsimage1Dir;
        private final String fsedits0Dir;
        private final String fsedits1Dir;

        // Local image/edits directory templates; they end in the name-dir wildcard, which the
        // directory helpers in this class substitute with "zero" or "one".
        private final String fsimagelocalDir;
        private final String fseditslocalDir;

        // Ports: the un-suffixed value is registered in ZooKeeper under "localhost:<port>";
        // the 0 / 1 variants are configured under the keys suffixed "0" / "1".
        private final int nnPort;
        private final int nn0Port;
        private final int nn1Port;
        private final int nnDnPort;
        private final int nnDn0Port;
        private final int nnDn1Port;
        private final int httpPort;
        private final int http0Port;
        private final int http1Port;
        // The rpc* ports are assigned the same values as the corresponding nn* ports in the constructor.
        private final int rpcPort;
        private final int rpc0Port;
        private final int rpc1Port;

        // Built by initClientConf() and updateAvatarConf() respectively; null until those run.
        private Configuration clientConf;
        private Configuration a0Conf;
        private Configuration a1Conf;
        private final String avatarDir;
        // Set by initGeneralConf(); null for a non-federated cluster.
        String nameserviceId;

        NameNodeInfo(int nnIndex) {
            avatarDir = baseAvatarDir;

            fsimagelocalDir = avatarDir + "/fsimagelocal-" + FSConstants.DFS_NAMENODE_NAME_DIR_WILDCARD;
            fseditslocalDir = avatarDir + "/fseditslocal-" + FSConstants.DFS_NAMENODE_NAME_DIR_WILDCARD;

            fsimage0Dir = avatarDir + "/fsimage0";
            fsimage1Dir = avatarDir + "/fsimage1";
            fsedits0Dir = avatarDir + "/fsedits0";
            fsedits1Dir = avatarDir + "/fsedits1";

            rpcPort = nnPort = MiniDFSCluster.getFreePort();
            nnDnPort = MiniDFSCluster.getFreePort();
            httpPort = MiniDFSCluster.getFreePort();
            rpc0Port = nn0Port = MiniDFSCluster.getFreePorts(2);
            nnDn0Port = MiniDFSCluster.getFreePort();
            http0Port = MiniDFSCluster.getFreePort();
            rpc1Port = nn1Port = MiniDFSCluster.getFreePorts(2);
            nnDn1Port = MiniDFSCluster.getFreePort();
            http1Port = MiniDFSCluster.getFreePort();
        }

        /**
         * Replaces the list of avatar records held for this namenode.
         *
         * @param avatars the new list; stored as-is, not copied
         */
        public void setAvatarNodes(ArrayList<AvatarInfo> avatars) {
            final ArrayList<AvatarInfo> replacement = avatars;
            this.avatars = replacement;
        }

        /**
         * Deletes the {@code in_use.lock} files from the local image and edits
         * directories of one avatar instance.
         * <p>
         * The wildcard in the directory templates is substituted literally
         * ({@link String#replace(CharSequence, CharSequence)}), so regex
         * metacharacters in the wildcard constant cannot affect the result.
         *
         * @param instance either {@code "zero"} or {@code "one"}
         * @throws IllegalArgumentException for any other instance name
         */
        void unlockStorageDirectory(String instance) {
            if (!instance.equals("zero") && !instance.equals("one")) {
                throw new IllegalArgumentException("Specify one or zero, invalid argument : " + instance);
            }
            final String wildcard = FSConstants.DFS_NAMENODE_NAME_DIR_WILDCARD;
            // The delete results are deliberately ignored: a missing lock file is not an error here.
            new File(fsimagelocalDir.replace(wildcard, instance), "in_use.lock").delete();
            new File(fseditslocalDir.replace(wildcard, instance), "in_use.lock").delete();
        }

        /**
         * Builds the client-side configuration as a copy of {@code conf} with
         * this namenode's logical and per-avatar addresses filled in.
         *
         * @param conf configuration to copy; it is not modified
         */
        public void initClientConf(Configuration conf) {
            final Configuration client = new Configuration(conf);
            clientConf = client;

            // Address strings: the logical address uses "localhost", the avatars use "127.0.0.1".
            final String logicalNn = "localhost:" + nnPort;
            final String avatar0Nn = "127.0.0.1:" + nn0Port;
            final String avatar1Nn = "127.0.0.1:" + nn1Port;
            final String rpcKey = NameNode.DFS_NAMENODE_RPC_ADDRESS_KEY;

            client.set("fs.default.name", "hdfs://" + logicalNn);
            client.set("fs.default.name0", "hdfs://" + avatar0Nn);
            client.set("fs.default.name1", "hdfs://" + avatar1Nn);

            client.set(rpcKey, logicalNn);
            client.set(rpcKey + "0", avatar0Nn);
            client.set(rpcKey + "1", avatar1Nn);

            client.set("dfs.namenode.dn-address", "localhost:" + nnDnPort);
            client.set("dfs.namenode.dn-address0", "127.0.0.1:" + nnDn0Port);
            client.set("dfs.namenode.dn-address1", "127.0.0.1:" + nnDn1Port);

            client.set("fs.hdfs.impl", "org.apache.hadoop.hdfs.DistributedAvatarFileSystem");
            client.setBoolean("fs.hdfs.impl.disable.cache", true);
            // Fewer connect retries so that connections are closed quickly.
            client.setInt("ipc.client.connect.max.retries", 3);
        }

        /**
         * Writes this namenode's addresses, shared storage locations and
         * unit-test tuning into {@code conf}, and records the nameservice id.
         * <p>
         * The client configuration is snapshotted from {@code conf} first (through
         * {@link #initClientConf(Configuration)}), i.e. before any of the settings
         * below are applied to {@code conf}.
         *
         * @param conf configuration to modify in place
         * @param nameserviceId nameservice id to record and, with federation, to
         *          derive the per-nameservice keys from; the non-federated caller
         *          in this class passes {@code null}
         */
        public void initGeneralConf(Configuration conf, String nameserviceId) {
            // overwrite relevant settings
            initClientConf(conf);
            this.nameserviceId = nameserviceId;
            // avatar nodes
            if (federation) {
                conf.set("dfs.namenode.rpc-address0", "127.0.0.1:" + rpc0Port);
                conf.set("dfs.namenode.rpc-address1", "127.0.0.1:" + rpc1Port);
            } else {
                conf.set("fs.default.name", "hdfs://localhost:" + nnPort);
                conf.set("fs.default.name0", "hdfs://localhost:" + nn0Port);
                conf.set("fs.default.name1", "hdfs://localhost:" + nn1Port);
                conf.set("dfs.namenode.dn-address", "localhost:" + nnDnPort);
                conf.set(NameNode.DFS_NAMENODE_RPC_ADDRESS_KEY, "localhost:" + nnPort);
                conf.set("dfs.http.address", "127.0.0.1:" + httpPort);
            }
            // Enable avatar testing framework for unit tests.
            conf.setFloat("dfs.avatarnode.failover.sample.percent", 1.0f);
            conf.set("dfs.avatarnode.failover.test.data.dir", avatarDir);

            // Per-avatar addresses are set in both the federated and the non-federated case.
            conf.set("dfs.namenode.dn-address0", "127.0.0.1:" + nnDn0Port);
            conf.set("dfs.namenode.dn-address1", "127.0.0.1:" + nnDn1Port);
            conf.set("dfs.http.address0", "127.0.0.1:" + http0Port);
            conf.set("dfs.http.address1", "127.0.0.1:" + http1Port);
            conf.set(NameNode.DFS_NAMENODE_RPC_ADDRESS_KEY + "0", "127.0.0.1:" + nn0Port);
            conf.set(NameNode.DFS_NAMENODE_RPC_ADDRESS_KEY + "1", "127.0.0.1:" + nn1Port);

            // set the shared edits and image dirs.
            if (enableQJM) {
                // With QJM, image and edits of each avatar share one journal URI sub-path.
                String journalURI = journalCluster.getQuorumJournalURI(JID).toString();

                // set the edits dir
                conf.set("dfs.name.edits.dir.shared0", journalURI + "/zero");
                conf.set("dfs.name.edits.dir.shared1", journalURI + "/one");

                // set the image dir
                conf.set("dfs.name.dir.shared0", journalURI + "/zero");
                conf.set("dfs.name.dir.shared1", journalURI + "/one");

                conf.setBoolean("dfs.force.remote.image", true);
            } else {
                // Without QJM, the shared directories are plain local directories under avatarDir.
                conf.set("dfs.name.edits.dir.shared0", fsedits0Dir);
                conf.set("dfs.name.edits.dir.shared1", fsedits1Dir);

                conf.set("dfs.name.dir.shared0", fsimage0Dir);
                conf.set("dfs.name.dir.shared1", fsimage1Dir);
            }

            conf.setInt("dfs.safemode.extension", 1000);
            // These two ipc parameters help RPC connections to shut down quickly in
            // unit tests.
            conf.setInt("ipc.client.connect.max.retries", 3);
            conf.setInt("ipc.client.connect.timeout", 2000);
            // With federation, move every avatar-service-specific setting that has a
            // value to its nameservice-qualified key and blank the generic key, then
            // publish the logical rpc / datanode-protocol / http addresses under
            // nameservice-qualified keys as well.
            // NOTE(review): an earlier comment here spoke of disabling the FileSystem
            // cache, but nothing in this method touches that setting.
            if (federation) {
                for (String key : AvatarNode.AVATARSERVICE_SPECIFIC_KEYS) {
                    String value = conf.get(key);
                    if (value != null) {
                        String newKey = DFSUtil.getNameServiceIdKey(key, nameserviceId);
                        conf.set(newKey, value);
                        conf.set(key, "");
                    }
                }
                String rpcKey = DFSUtil.getNameServiceIdKey(AvatarNode.DFS_NAMENODE_RPC_ADDRESS_KEY, nameserviceId);
                conf.set(rpcKey, "localhost:" + rpcPort);
                String dnKey = DFSUtil.getNameServiceIdKey(NameNode.DATANODE_PROTOCOL_ADDRESS, nameserviceId);
                conf.set(dnKey, "localhost:" + nnDnPort);
                String httpKey = DFSUtil.getNameServiceIdKey(NameNode.DFS_NAMENODE_HTTP_ADDRESS_KEY, nameserviceId);
                conf.set(httpKey, "localhost:" + httpPort);
            }
        }

        /**
         * Rebuilds this namenode's configuration from {@code newConf} and derives
         * the two per-avatar server configurations from it.
         *
         * @param newConf configuration to copy; it is not modified
         */
        public void updateAvatarConf(Configuration newConf) {
            conf = new Configuration(newConf);
            if (federation) {
                conf.set(FSConstants.DFS_FEDERATION_NAMESERVICE_ID, nameserviceId);
            }

            // Both avatars start from the same base and share the local directory
            // templates; only the checkpoint directory differs.
            final Configuration zero = new Configuration(conf);
            final Configuration one = new Configuration(conf);
            a0Conf = zero;
            a1Conf = one;

            zero.set("dfs.name.dir", fsimagelocalDir);
            zero.set("dfs.name.edits.dir", fseditslocalDir);
            zero.set("fs.checkpoint.dir", avatarDir + "/checkpoint0");

            one.set("dfs.name.dir", fsimagelocalDir);
            one.set("dfs.name.edits.dir", fseditslocalDir);
            one.set("fs.checkpoint.dir", avatarDir + "/checkpoint1");
        }

        /**
         * Creates the local ("zero" / "one") and shared image and edits
         * directories of this namenode, including missing parents.
         * <p>
         * The wildcard in the local directory templates is substituted literally
         * ({@link String#replace(CharSequence, CharSequence)}) rather than being
         * interpreted as a regular expression.
         */
        public void createAvatarDirs() {
            final String wildcard = FSConstants.DFS_NAMENODE_NAME_DIR_WILDCARD;
            final String[] dirs = {
                    fsimagelocalDir.replace(wildcard, "zero"),
                    fsimagelocalDir.replace(wildcard, "one"),
                    fsimage0Dir,
                    fsimage1Dir,
                    fseditslocalDir.replace(wildcard, "zero"),
                    fseditslocalDir.replace(wildcard, "one"),
                    fsedits0Dir,
                    fsedits1Dir };
            for (String dir : dirs) {
                // mkdirs() returns false for an already existing directory, so the result is not checked.
                new File(dir).mkdirs();
            }
        }

        /**
         * Recursively deletes the local ("zero" / "one") and shared image and
         * edits directories of this namenode.
         * <p>
         * The wildcard in the local directory templates is substituted literally
         * ({@link String#replace(CharSequence, CharSequence)}) rather than being
         * interpreted as a regular expression.
         *
         * @throws IOException if a deletion fails with an I/O error
         */
        public void cleanupAvatarDirs() throws IOException {
            final String wildcard = FSConstants.DFS_NAMENODE_NAME_DIR_WILDCARD;
            final String[] files = {
                    fsimagelocalDir.replace(wildcard, "zero"),
                    fsimagelocalDir.replace(wildcard, "one"),
                    fsimage0Dir,
                    fsimage1Dir,
                    fseditslocalDir.replace(wildcard, "zero"),
                    fseditslocalDir.replace(wildcard, "one"),
                    fsedits0Dir,
                    fsedits1Dir };
            for (String filename : files) {
                FileUtil.fullyDelete(new File(filename));
            }
        }

        /**
         * @return the nameservice id recorded by {@code initGeneralConf}, or
         *         {@code null} if none was recorded
         */
        public String getNameserviceId() {
            return this.nameserviceId;
        }
    }

    // ZooKeeper server and its connection factory; both are static and are assigned in
    // createAndStartZooKeeper().
    private static ZooKeeperServer zooKeeper;
    private static NIOServerCnxnFactory cnxnFactory;

    // Datanode records of this cluster.
    // NOTE(review): filled outside this view, presumably by startDataNodes() -- confirm.
    private ArrayList<DataNodeProperties> dataNodes = new ArrayList<DataNodeProperties>();

    // Hands DataNode a plain java.util.Random through setSecureRandom when this class is loaded.
    // NOTE(review): presumably to avoid slow secure-random seeding in tests -- confirm.
    static {
        DataNode.setSecureRandom(new Random());
    }

    /**
     * Fluent builder for {@link MiniAvatarCluster}. Every setter returns the
     * builder itself so that calls can be chained; {@link #build()} constructs
     * and starts the cluster.
     * <p>
     * Defaults: one datanode, one namenode, formatting enabled, no federation,
     * QJM enabled with three journal nodes, 15 instantiation retries.
     */
    public static class Builder {
        private final Configuration conf;
        private int numDataNodes = 1;
        private boolean format = true;
        private String[] racks = null;
        private String[] hosts = null;
        private int numNameNodes = 1;
        private boolean federation = false;
        private long[] simulatedCapacities = null;
        private int numJournalNodes = 3;
        private boolean enableQJM = true;
        private MiniJournalCluster journalCluster = null;
        private StartupOption startOpt = null;
        private int instantiationRetries = 15;

        /**
         * @param conf base configuration; the cluster modifies it in place
         */
        public Builder(Configuration conf) {
            this.conf = conf;
        }

        /** Sets the startup option used when the avatar nodes are started. */
        public Builder startOpt(StartupOption startOpt) {
            this.startOpt = startOpt;
            return this;
        }

        /** Sets the number of instantiation retries. */
        public Builder instantiationRetries(int instantiationRetries) {
            this.instantiationRetries = instantiationRetries;
            return this;
        }

        /**
         * Misspelled alias of {@link #instantiationRetries(int)}, kept so that
         * existing callers continue to compile.
         */
        public Builder instantionRetries(int instantionRetries) {
            return instantiationRetries(instantionRetries);
        }

        /** Sets the number of datanodes. */
        public Builder numDataNodes(int numDataNodes) {
            this.numDataNodes = numDataNodes;
            return this;
        }

        /** Sets whether existing storage directories are wiped before startup. */
        public Builder format(boolean format) {
            this.format = format;
            return this;
        }

        /** Sets the datanode racks array. */
        public Builder racks(String[] racks) {
            this.racks = racks;
            return this;
        }

        /** Sets the datanode hosts array. */
        public Builder hosts(String[] hosts) {
            this.hosts = hosts;
            return this;
        }

        /** Sets the number of namenodes; values other than 1 require federation. */
        public Builder numNameNodes(int numNameNodes) {
            this.numNameNodes = numNameNodes;
            return this;
        }

        /** Sets whether the cluster is configured with federation. */
        public Builder federation(boolean federation) {
            this.federation = federation;
            return this;
        }

        /** Sets the capacities array that is handed to datanode startup. */
        public Builder simulatedCapacities(long[] simulatedCapacities) {
            this.simulatedCapacities = simulatedCapacities;
            return this;
        }

        /** Sets the number of journal nodes used when a journal cluster is created. */
        public Builder numJournalNodes(int numJournalNodes) {
            this.numJournalNodes = numJournalNodes;
            return this;
        }

        /** Enables or disables QJM. */
        public Builder enableQJM(boolean enableQJM) {
            this.enableQJM = enableQJM;
            return this;
        }

        /**
         * Supplies an existing journal cluster to use. Note that this also
         * switches QJM on, whatever was set through {@link #enableQJM(boolean)}
         * before.
         */
        public Builder setJournalCluster(MiniJournalCluster journalCluster) {
            this.journalCluster = journalCluster;
            this.enableQJM = true;
            return this;
        }

        /**
         * Constructs the cluster from the current builder state.
         *
         * @return the constructed cluster
         */
        public MiniAvatarCluster build() throws IOException, ConfigException, InterruptedException {
            return new MiniAvatarCluster(this);
        }
    }

    /**
     * Convenience constructor: builds and starts a cluster with the given
     * datanode settings and the Builder's defaults for everything else.
     *
     * @param conf the base configuration; it is modified in place
     * @param numDataNodes number of DataNodes to start
     * @param format if true, existing storage directories are removed first
     * @param racks racks array handed to the Builder
     * @param hosts hosts array handed to the Builder
     */
    public MiniAvatarCluster(Configuration conf, int numDataNodes, boolean format, String[] racks, String[] hosts)
            throws IOException, ConfigException, InterruptedException {
        this(new Builder(conf).numDataNodes(numDataNodes).format(format).racks(racks).hosts(hosts));
    }

    /**
     * Convenience constructor that additionally selects the number of
     * namenodes and whether federation is used.
     *
     * @param conf the base configuration; it is modified in place
     * @param numDataNodes number of DataNodes to start
     * @param format if true, existing storage directories are removed first
     * @param racks racks array handed to the Builder
     * @param hosts hosts array handed to the Builder
     * @param numNameNodes number of NameNodes; must be 1 unless federation is in effect
     * @param federation if true, the cluster is configured with federation
     */
    public MiniAvatarCluster(Configuration conf, int numDataNodes, boolean format, String[] racks, String[] hosts,
            int numNameNodes, boolean federation) throws IOException, ConfigException, InterruptedException {
        this(new Builder(conf).numDataNodes(numDataNodes).format(format).racks(racks).hosts(hosts)
                .numNameNodes(numNameNodes).federation(federation));
    }

    /**
     * Modify the config and start up the servers.  The rpc and info ports for
     * servers are guaranteed to use free ports.
     * <p>
     * NameNode and DataNode directory creation and configuration will be
     * managed by this class.
     *
     * @param conf the base configuration to use in starting the servers.  This
     *          will be modified as necessary.
     * @param numDataNodes Number of DataNodes to start; may be zero
     * @param format if true, format the NameNode and DataNodes before starting up
     * @param racks array of strings indicating the rack that each DataNode is on
     * @param hosts array of strings indicating the hostname of each DataNode
     * @param numNameNodes Number of NameNodes to start; must be 1 unless
     *          federation is in effect
     * @param federation if true, the cluster is started with a federation
     *          configuration
     * @param simulatedCapacities capacities array that is passed on to datanode
     *          startup
     */
    public MiniAvatarCluster(Configuration conf, int numDataNodes, boolean format, String[] racks, String[] hosts,
            int numNameNodes, boolean federation, long[] simulatedCapacities)
            throws IOException, ConfigException, InterruptedException {
        this(new Builder(conf).numDataNodes(numDataNodes).format(format).racks(racks).hosts(hosts)
                .numNameNodes(numNameNodes).federation(federation).simulatedCapacities(simulatedCapacities));
    }

    /**
     * Configures and starts a complete mini avatar cluster from a builder.
     * <p>
     * The builder's {@link Configuration} is modified in place. The constructor
     * optionally starts a journal cluster, prepares the namenode directories,
     * registers the namenodes in ZooKeeper, starts the datanodes and the avatar
     * nodes, and returns only after the wait calls at its end have returned.
     * <p>
     * NOTE(review): no ZooKeeper server is started here; presumably callers run
     * {@link #createAndStartZooKeeper()} beforehand -- confirm.
     *
     * @param b builder carrying the base configuration and the cluster options
     * @throws IOException if more than one namenode is requested without
     *           federation, if the data directory cannot be removed while
     *           formatting, or if one of the startup steps fails
     */
    public MiniAvatarCluster(Builder b) throws IOException, ConfigException, InterruptedException {

        Standby.CHECKPOINT_SLEEP_BEFORE_RETRY = 100;
        this.conf = b.conf;

        // The directory name uses whatever cluster id the caller put into the configuration
        // (empty if none). baseAvatarDir and dataDir are static, so this assignment affects
        // every cluster instance in the JVM.
        // NOTE(review): the id generated further down is written only afterwards, under
        // FSConstants.DFS_CLUSTER_ID -- confirm whether that is the same key as the one read here.
        final String testDir = TEST_DIR + "/" + conf.get(MiniDFSCluster.DFS_CLUSTER_ID, "");
        baseAvatarDir = testDir + "/avatar";
        dataDir = testDir + "/data";

        // instantiationRetries is a static field, written here through the instance.
        this.instantiationRetries = b.instantiationRetries;
        this.numDataNodes = b.numDataNodes;
        this.format = b.format;
        this.racks = b.racks;
        this.hosts = b.hosts;
        this.numJournalNodes = b.numJournalNodes;
        this.enableQJM = b.enableQJM;
        this.startOpt = b.startOpt;
        this.journalCluster = b.journalCluster;

        // Each constructed cluster gets the next id from the shared counter.
        int clusterId = ClusterId.getAndIncrement();
        conf.setInt(FSConstants.DFS_CLUSTER_ID, clusterId);
        conf.set(FSConstants.DFS_CLUSTER_NAME, "MiniAvatarCluster-" + clusterId);

        // ZooKeeper client settings: local quorum on the port chosen at class initialization.
        conf.setInt("dfs.secondary.info.port", 0);
        conf.set("fs.ha.zookeeper.prefix", "/hdfs");
        conf.set("fs.ha.zookeeper.quorum", "localhost:" + zkClientPort);
        conf.setInt("fs.ha.zookeeper.connect.timeout", 30000);
        conf.setInt("fs.ha.zookeeper.timeout", 30000);

        // datanodes

        conf.setInt("dfs.datanode.fullblockreport.delay", 1000);
        // NOTE(review): "internval" looks like a typo; confirm which spelling the DataNode
        // actually reads before changing this key.
        conf.setInt("dfs.datanode.blockreceived.retry.internval", 1000);

        // Port 0 in each datanode address.
        conf.set(FSConstants.DFS_DATANODE_ADDRESS_KEY, "localhost:0");
        conf.set("dfs.datanode.http.address", "localhost:0");
        conf.set("dfs.datanode.ipc.address", "localhost:0");

        // Use the loopback interface as the DNS interface of both datanodes and namenodes.
        String loopBack = getLoopBackInterface();
        LOG.info("LoopBack interface is : " + loopBack);
        conf.set(FSConstants.DFS_DATANODE_DNS_INTERFACE, loopBack);
        conf.set(FSConstants.DFS_NAMENODE_DNS_INTERFACE, loopBack);

        // other settings
        conf.setBoolean("dfs.permissions", false);
        conf.setBoolean("dfs.persist.blocks", true);
        conf.set("fs.hdfs.impl", "org.apache.hadoop.hdfs.DistributedAvatarFileSystem");
        conf.setLong("dfs.blockreport.initialDelay", 0);
        conf.setClass("topology.node.switch.mapping.impl", StaticMapping.class, DNSToSwitchMapping.class);

        // Only applied when the caller has not chosen a value.
        if (conf.get("dfs.ingest.retries") == null) {
            conf.setInt("dfs.ingest.retries", 2);
        }
        conf.setLong("rpc.polling.interval", 10);
        conf.setLong("lease.check.interval", 10);

        conf.set("dfs.secondary.http.address", "0.0.0.0:0");

        // enable checkpoint by default
        if (conf.get("fs.checkpoint.enabled") == null) {
            conf.setBoolean("fs.checkpoint.enabled", true);
        }

        //http image download timeout - 5s
        if (conf.get("dfs.image.transfer.timeout") == null) {
            conf.setInt("dfs.image.transfer.timeout", 5 * 1000);
        }

        // make the standby actions (e.g., checkpoint trigger) quicker
        conf.setInt("hdfs.avatarnode.sleep", 1000);

        // disable standby backup limits
        conf.setInt(NNStorageDirectoryRetentionManager.NN_IMAGE_DAYS_TOKEEP, 0);
        conf.setInt(NNStorageDirectoryRetentionManager.NN_IMAGE_COPIES_TOKEEP, 0);

        // start the JournalCluster.
        // This has to precede the namenode configuration below, which asks the journal
        // cluster for its URI when QJM is enabled.
        if (this.enableQJM) {
            startJournalCluster();
        }

        // Federation is forced on when the configuration already lists several nameservice ids.
        this.federation = b.federation;
        Collection<String> nameserviceIds = DFSUtil.getNameServiceIds(conf);
        if (nameserviceIds.size() > 1)
            this.federation = true;
        if (!federation && b.numNameNodes != 1) {
            throw new IOException("Only 1 namenode is allowed in non-federation cluster.");
        }
        nameNodes = new NameNodeInfo[b.numNameNodes];
        for (int nnIndex = 0; nnIndex < b.numNameNodes; nnIndex++) {
            nameNodes[nnIndex] = new NameNodeInfo(nnIndex);
            if (format)
                nameNodes[nnIndex].cleanupAvatarDirs();
            nameNodes[nnIndex].createAvatarDirs();
        }
        if (!federation) {
            nameNodes[0].initGeneralConf(conf, null);
        } else {
            // No ids configured: generate one per namenode.
            // NOTE(review): this adds to the collection returned by DFSUtil.getNameServiceIds --
            // confirm that the returned collection is mutable.
            if (nameserviceIds.isEmpty()) {
                for (int i = 0; i < nameNodes.length; i++) {
                    nameserviceIds.add(NAMESERVICE_ID_PREFIX + getNSId());
                }
            }
            initFederationConf(conf, nameserviceIds);
        }

        // When formatting, remove the datanode data directory left by earlier runs.
        if (this.format) {
            File data_dir = new File(dataDir);
            if (data_dir.exists() && !FileUtil.fullyDelete(data_dir)) {
                throw new IOException("Cannot remove data directory: " + data_dir);
            }
        }

        // Need to start datanodes before avatarnodes, since the primary starts up
        // in safemode and when the standby starts up, it waits for the primary to
        // exit safemode. So if we start avatarnodes first with non-empty FSImage
        // and FSEdits, the primary avatar would wait for datanode block reports and
        // the standby would wait for the primary to exit safemode and since we
        // wouldn't return from the standby initialization we would never start the
        // datanodes and hence we enter a deadlock.
        registerZooKeeperNodes();
        startDataNodes(b.simulatedCapacities);
        startAvatarNodes();
        waitAvatarNodesActive();

        waitDataNodesActive();

        waitExitSafeMode();
        waitForTheFirstCheckpoint();
    }

    /**
     * Retrieves the name of the loopback interface in a platform independent way.
     * <p>
     * The first interface that reports itself as loopback wins. If there is no
     * such interface, or if the platform reports no interfaces at all (older
     * JDKs return {@code null} from
     * {@link NetworkInterface#getNetworkInterfaces()} in that case), the
     * conventional name {@code "lo"} is returned.
     *
     * @return the loopback interface name, never {@code null}
     * @throws IOException if the interfaces cannot be queried
     */
    private static String getLoopBackInterface() throws IOException {
        final String fallback = "lo";
        Enumeration<NetworkInterface> ifaces = NetworkInterface.getNetworkInterfaces();
        if (ifaces == null) {
            // No interfaces reported: avoid a NullPointerException and use the fallback.
            return fallback;
        }
        while (ifaces.hasMoreElements()) {
            NetworkInterface iface = ifaces.nextElement();
            if (iface.isLoopback()) {
                return iface.getName();
            }
        }
        return fallback;
    }

    /**
     * Creates a journal cluster with the configured number of journal nodes,
     * unless one is already attached (for example supplied through the
     * Builder), in which case nothing happens.
     *
     * @throws IOException if the journal cluster cannot be built
     */
    private void startJournalCluster() throws IOException {
        if (journalCluster != null) {
            return;
        }
        journalCluster = new MiniJournalCluster.Builder(conf)
                .numJournalNodes(numJournalNodes)
                .build();
    }

    /**
     * Applies the general configuration of each namenode for its nameservice
     * id, in iteration order, and publishes the comma separated list of all
     * ids under {@code FSConstants.DFS_FEDERATION_NAMESERVICES}.
     *
     * @param conf configuration to modify in place
     * @param nameserviceIds one id per namenode; the i-th id is assigned to
     *          the i-th namenode
     * @throws IllegalArgumentException if there are more ids than namenodes;
     *           checked up front so that {@code conf} is not left half updated
     */
    private void initFederationConf(Configuration conf, Collection<String> nameserviceIds) {
        if (nameserviceIds.size() > nameNodes.length) {
            throw new IllegalArgumentException("Got " + nameserviceIds.size() + " nameservice ids for only "
                    + nameNodes.length + " namenodes: " + nameserviceIds);
        }
        StringBuilder nameserviceIdList = new StringBuilder();
        int nnIndex = 0;
        for (String nameserviceId : nameserviceIds) {
            // Create comma separated list of nameserviceIds
            if (nameserviceIdList.length() > 0) {
                nameserviceIdList.append(',');
            }
            nameserviceIdList.append(nameserviceId);
            nameNodes[nnIndex].initGeneralConf(conf, nameserviceId);
            nnIndex++;
        }
        conf.set(FSConstants.DFS_FEDERATION_NAMESERVICES, nameserviceIdList.toString());
    }

    /**
     * Writes a fresh ZooKeeper properties file to {@code ZK_CONF_FILE}
     * (replacing any existing one) and parses it into a {@link ServerConfig}.
     *
     * @return the parsed server configuration
     * @throws IOException if the file cannot be created or written
     * @throws ConfigException if ZooKeeper rejects the written file
     */
    private static ServerConfig createZooKeeperConf() throws IOException, ConfigException {

        // create conf file
        File zkConfDir = new File(TEST_DIR);
        zkConfDir.mkdirs();
        File zkConfFile = new File(ZK_CONF_FILE);
        zkConfFile.delete();
        zkConfFile.createNewFile();

        Properties zkConfProps = new Properties();
        zkConfProps.setProperty("tickTime", "2000");
        zkConfProps.setProperty("dataDir", ZK_DATA_DIR);
        zkConfProps.setProperty("clientPort", Integer.toString(zkClientPort));
        zkConfProps.setProperty("maxClientCnxns", "500");

        // Close the stream in all cases so that the file handle is released (and the
        // content flushed) before the file is parsed below.
        FileOutputStream out = new FileOutputStream(zkConfFile);
        try {
            zkConfProps.store(out, "");
        } finally {
            out.close();
        }

        // create config object
        ServerConfig zkConf = new ServerConfig();
        zkConf.parse(ZK_CONF_FILE);

        return zkConf;
    }

    /**
     * Returns the ZooKeeper server configuration: parsed from the existing
     * config file when there is one, freshly created otherwise.
     *
     * @return the server configuration
     * @throws Exception if parsing or creating the configuration fails
     */
    private static ServerConfig getZooKeeperConf() throws Exception {
        final File confFile = new File(ZK_CONF_FILE);
        if (!confFile.exists()) {
            return createZooKeeperConf();
        }
        final ServerConfig parsed = new ServerConfig();
        parsed.parse(ZK_CONF_FILE);
        return parsed;
    }

    /**
     * Deletes ZooKeeper's data-log directory and then its data directory. The
     * data directory is only attempted when the data-log directory was deleted.
     *
     * @return {@code true} if both deletions succeeded
     * @throws Exception if the ZooKeeper configuration cannot be obtained
     */
    public static boolean clearZooKeeperData() throws Exception {
        final ServerConfig zkConf = getZooKeeperConf();
        final File logDirFile = new File(zkConf.getDataLogDir());
        final File dataDirFile = new File(zkConf.getDataDir());
        if (!FileUtil.fullyDelete(logDirFile)) {
            return false;
        }
        return FileUtil.fullyDelete(dataDirFile);
    }

    /**
     * Writes a fresh ZooKeeper configuration, creates the server with the
     * settings read back from it, and starts it behind an NIO connection
     * factory. Server and factory are kept in static fields.
     *
     * @throws IOException if the configuration or the server cannot be set up
     * @throws ConfigException if the written configuration is rejected
     * @throws InterruptedException if startup is interrupted
     */
    public static void createAndStartZooKeeper() throws IOException, ConfigException, InterruptedException {
        logStateChange("Creating zookeeper server");
        AvatarShell.retrySleep = 1000;
        final ServerConfig settings = createZooKeeperConf();

        // server
        final ZooKeeperServer server = new ZooKeeperServer();
        zooKeeper = server;
        final File logDirFile = new File(settings.getDataLogDir());
        final File dataDirFile = new File(settings.getDataDir());
        server.setTxnLogFactory(new FileTxnSnapLog(logDirFile, dataDirFile));
        server.setTickTime(settings.getTickTime());
        server.setMinSessionTimeout(settings.getMinSessionTimeout());
        server.setMaxSessionTimeout(settings.getMaxSessionTimeout());

        // connection factory
        final NIOServerCnxnFactory factory = new NIOServerCnxnFactory();
        cnxnFactory = factory;
        factory.configure(settings.getClientPortAddress(), settings.getMaxClientCnxns());
        factory.startup(server);
        logStateChange("Creating zookeeper server - completed");
    }

    /**
     * Registers the given primary ports for a namenode's logical addresses in
     * ZooKeeper, retrying up to five times with a one second pause after each
     * failed attempt.
     * <p>
     * The ZooKeeper client of every attempt is shut down, whether or not the
     * attempt succeeded. An interrupt during that shutdown restores the
     * thread's interrupt flag and aborts immediately instead of being retried.
     *
     * @param nnPrimaryPort port registered for {@code localhost:<nnPort>}
     * @param nnDnPrimaryPort port registered for {@code localhost:<nnDnPort>}
     * @param httpPrimaryPort port registered for {@code localhost:<httpPort>}
     * @param rpcPrimaryPort port registered for {@code localhost:<rpcPort>}
     * @param nni namenode whose logical addresses are registered
     * @throws IOException if all attempts fail (the last failure is attached
     *           as the cause) or if the client shutdown is interrupted
     */
    private void registerZooKeeperNode(int nnPrimaryPort, int nnDnPrimaryPort, int httpPrimaryPort,
            int rpcPrimaryPort, NameNodeInfo nni) throws IOException {
        final int retries = 5;
        IOException lastFailure = null;
        for (int i = 0; i < retries; i++) {
            AvatarZooKeeperClient zkClient = null;
            boolean registered = false;
            try {
                zkClient = new AvatarZooKeeperClient(nni.conf, null, false);
                zkClient.registerPrimary("localhost:" + nni.nnPort, "127.0.0.1:" + nnPrimaryPort, true);
                zkClient.registerPrimary("localhost:" + nni.nnDnPort, "127.0.0.1:" + nnDnPrimaryPort, true);
                zkClient.registerPrimary("localhost:" + nni.httpPort, "127.0.0.1:" + httpPrimaryPort, true);
                zkClient.registerPrimary("localhost:" + nni.rpcPort, "127.0.0.1:" + rpcPrimaryPort, true);
                registered = true;
            } catch (IOException e) {
                lastFailure = e;
                LOG.info("Got exception when registering to zk, retrying", e);
            } finally {
                // Always release the client so that failed attempts do not leak connections.
                if (zkClient != null) {
                    try {
                        zkClient.shutdown();
                        LOG.info("Closed zk client connection for registerZookeeper");
                    } catch (InterruptedException ie) {
                        Thread.currentThread().interrupt();
                        IOException interrupted = new IOException("zkClient.shutdown() interrupted");
                        interrupted.initCause(ie);
                        throw interrupted;
                    }
                }
            }
            if (registered) {
                return;
            }
            sleep(1000);
        }
        IOException failure = new IOException("Cannot talk to ZK.");
        if (lastFailure != null) {
            failure.initCause(lastFailure);
        }
        throw failure;
    }

    /**
     * Clears the ZooKeeper registrations of one namenode's logical addresses,
     * retrying up to five times with a one second pause after each failed
     * attempt.
     * <p>
     * The ZooKeeper client of every attempt is shut down, whether or not the
     * attempt succeeded. An interrupt during that shutdown restores the
     * thread's interrupt flag and aborts immediately instead of being retried.
     *
     * @param nnIndex index of the namenode in this cluster
     * @throws IOException if all attempts fail (the last failure is attached
     *           as the cause) or if the client shutdown is interrupted
     */
    public void clearZooKeeperNode(int nnIndex) throws IOException {
        final int retries = 5;
        final NameNodeInfo nni = this.nameNodes[nnIndex];
        IOException lastFailure = null;
        for (int i = 0; i < retries; i++) {
            AvatarZooKeeperClient zkClient = null;
            boolean cleared = false;
            try {
                zkClient = new AvatarZooKeeperClient(nni.conf, null, false);
                zkClient.clearPrimary("localhost:" + nni.httpPort);
                zkClient.clearPrimary("localhost:" + nni.nnPort);
                zkClient.clearPrimary("localhost:" + nni.nnDnPort);
                zkClient.clearPrimary("localhost:" + nni.rpcPort);
                cleared = true;
            } catch (IOException e) {
                lastFailure = e;
                LOG.info("Got exception when clearing zk, retrying", e);
            } finally {
                // Always release the client so that failed attempts do not leak connections.
                if (zkClient != null) {
                    try {
                        zkClient.shutdown();
                        LOG.info("Closed zk client connection for clearZKNode");
                    } catch (InterruptedException ie) {
                        Thread.currentThread().interrupt();
                        IOException interrupted = new IOException("zkClient.shutdown() interrupted");
                        interrupted.initCause(ie);
                        throw interrupted;
                    }
                }
            }
            if (cleared) {
                return;
            }
            sleep(1000);
        }
        IOException failure = new IOException("Cannot talk to ZK.");
        if (lastFailure != null) {
            failure.initCause(lastFailure);
        }
        throw failure;
    }

    /**
     * Returns a new copy of the configuration that matches the given avatar
     * startup option: nni.a0Conf for NODEZERO, nni.a1Conf for NODEONE.
     *
     * @param startupOption name of the avatar startup option
     * @param nni           namenode holding the per-avatar configurations
     * @return a fresh Configuration copied from the selected one
     * @throws IllegalArgumentException if the option names neither avatar
     */
    static Configuration getServerConf(String startupOption, NameNodeInfo nni) {
        // namenode should use DFS, not DAFS
        final boolean wantsNodeZero = startupOption.equals(AvatarConstants.StartupOption.NODEZERO.getName());
        if (wantsNodeZero) {
            return new Configuration(nni.a0Conf);
        }
        final boolean wantsNodeOne = startupOption.equals(AvatarConstants.StartupOption.NODEONE.getName());
        if (wantsNodeOne) {
            return new Configuration(nni.a1Conf);
        }
        throw new IllegalArgumentException("invalid avatar");
    }

    /**
     * For every namenode, refreshes its avatar configuration from the cluster
     * configuration and registers its "0" ports (nn0Port, nnDn0Port, http0Port,
     * rpc0Port) in ZooKeeper.
     *
     * @throws IOException if a registration fails
     */
    public void registerZooKeeperNodes() throws IOException {
        final NameNodeInfo[] infos = this.nameNodes;
        for (int idx = 0; idx < infos.length; idx++) {
            final NameNodeInfo info = infos[idx];
            info.updateAvatarConf(this.conf);
            registerZooKeeperNode(info.nn0Port, info.nnDn0Port, info.http0Port, info.rpc0Port, info);
        }
    }

    /**
     * Refreshes the avatar configuration of every namenode from the cluster
     * configuration and starts its avatar nodes with the cluster's startup
     * option.
     *
     * @throws IOException if an avatar node cannot be started
     */
    private void startAvatarNodes() throws IOException {
        final NameNodeInfo[] infos = this.nameNodes;
        for (int idx = 0; idx < infos.length; idx++) {
            final NameNodeInfo info = infos[idx];
            info.updateAvatarConf(this.conf);
            startAvatarNode(info, startOpt);
        }
    }

    /**
     * Starts both avatars of one namenode.
     *
     * The namenode's "0" ports are registered in ZooKeeper first. When the
     * cluster's format flag is set, avatar zero is invoked once with the
     * FORMATFORCE option. Avatar zero is then started and recorded as ACTIVE,
     * and this method waits up to ten seconds for its initialization to finish
     * before avatar one is started and recorded as STANDBY.
     *
     * A null result from node instantiation is reported right away as an
     * IOException, before the node is used.
     *
     * @param nni       namenode whose avatars are started
     * @param operation optional extra startup option for avatar zero; may be null
     * @throws IOException if registration fails, an avatar cannot be created, or
     *                     the active avatar does not finish initializing in time
     */
    private void startAvatarNode(NameNodeInfo nni, StartupOption operation) throws IOException {
        registerZooKeeperNode(nni.nn0Port, nni.nnDn0Port, nni.http0Port, nni.rpc0Port, nni);

        final String nodeZero = AvatarConstants.StartupOption.NODEZERO.getName();
        final String nodeOne = AvatarConstants.StartupOption.NODEONE.getName();

        if (format) {
            LOG.info("formatting");
            List<String> formatArgs = new ArrayList<String>();
            formatArgs.add(nodeZero);
            formatArgs.add(AvatarConstants.StartupOption.FORMATFORCE.getName());
            // The return value of the format invocation is deliberately not used.
            instantiateAvatarNode(toAvatarNodeArgs(formatArgs, nni), getServerConf(nodeZero, nni));
        }

        ArrayList<AvatarInfo> avatars = new ArrayList<AvatarInfo>(2);

        LOG.info("starting avatar 0");
        List<String> a0ArgList = new ArrayList<String>();
        if (operation != null) {
            a0ArgList.add(operation.getName());
        }
        a0ArgList.add(nodeZero);
        AvatarNode a0 = instantiateAvatarNode(toAvatarNodeArgs(a0ArgList, nni), getServerConf(nodeZero, nni));
        if (a0 == null) {
            // Fail here rather than with a NullPointerException in the wait loop below.
            throw new IOException("Cannot create avatar nodes");
        }
        avatars.add(new AvatarInfo(a0, AvatarState.ACTIVE, nni.nn0Port, nni.nnDn0Port, nni.http0Port,
                nni.rpc0Port, nodeZero));

        // wait for up to 10 seconds until the ACTIVE is initialized
        for (int i = 0; i < 10; i++) {
            if (a0.isInitDone()) {
                break;
            }
            LOG.info("Waiting for the ACTIVE to be initialized...");
            sleep(1000);
        }
        if (!a0.isInitDone()) {
            throw new IOException("The ACTIVE cannot be initialized");
        }

        LOG.info("starting avatar 1");
        List<String> a1ArgList = new ArrayList<String>();
        a1ArgList.add(nodeOne);
        a1ArgList.add(AvatarConstants.StartupOption.STANDBY.getName());
        a1ArgList.add(AvatarConstants.StartupOption.REGULAR.getName());
        AvatarNode a1 = instantiateAvatarNode(toAvatarNodeArgs(a1ArgList, nni), getServerConf(nodeOne, nni));
        if (a1 == null) {
            throw new IOException("Cannot create avatar nodes");
        }
        avatars.add(new AvatarInfo(a1, AvatarState.STANDBY, nni.nn1Port, nni.nnDn1Port, nni.http1Port,
                nni.rpc1Port, nodeOne));

        for (AvatarInfo avatar : avatars) {
            Assert.assertTrue(avatar.avatar.getConf().getBoolean("dfs.persist.blocks", false));
        }
        nni.setAvatarNodes(avatars);
        DFSUtil.setGenericConf(nni.conf, nni.nameserviceId, AvatarNode.AVATARSERVICE_SPECIFIC_KEYS);
        nni.updateAvatarConf(nni.conf);
    }

    /**
     * Completes an avatar node argument list: for a federated cluster the
     * SERVICE option and the namenode's nameservice id are appended; the result
     * is returned as an array.
     */
    private String[] toAvatarNodeArgs(List<String> args, NameNodeInfo nni) {
        if (federation) {
            args.add(StartupOption.SERVICE.getName());
            args.add(nni.nameserviceId);
        }
        return args.toArray(new String[args.size()]);
    }

    /**
     * Shuts down all avatar nodes, forgets the recorded avatars of every
     * namenode, and starts them again without formatting. Afterwards waits for
     * the avatar nodes, the datanodes, and safe-mode exit.
     *
     * @throws Exception if shutdown, startup or waiting fails
     */
    public void restartAvatarNodes() throws Exception {
        logStateChange("Restarting avatar nodes");
        shutDownAvatarNodes();

        final NameNodeInfo[] infos = this.nameNodes;
        for (int idx = 0; idx < infos.length; idx++) {
            infos[idx].avatars.clear();
        }

        // A restart must not format the namenodes again.
        this.format = false;
        startAvatarNodes();

        waitAvatarNodesActive();
        waitDataNodesActive();
        waitExitSafeMode();

        logStateChange("Restarting avatar nodes - completed");
    }

    /*
     * Starts one shutdown thread per datanode and adds every started thread
     * to the given collection.
     */
    private void processDatanodesForShutdown(Collection<Thread> threads) {
        int idx = 0;
        while (idx < dataNodes.size()) {
            LOG.info("Shutting down data node " + idx);
            final Thread shutdownThread = new Thread(new ShutDownUtil(dataNodes.get(idx)));
            shutdownThread.start();
            threads.add(shutdownThread);
            idx++;
        }
    }

    /*
     * Starts one shutdown thread for every avatar whose state is ACTIVE or
     * STANDBY and adds every started thread to the given collection.
     */
    private void processNamenodesForShutdown(Collection<Thread> threads) {
        for (NameNodeInfo info : this.nameNodes) {
            for (AvatarInfo candidate : info.avatars) {
                // Avatars in any other state are skipped.
                if (candidate.state != AvatarState.ACTIVE && candidate.state != AvatarState.STANDBY) {
                    continue;
                }
                LOG.info("Shutting down Avatar " + candidate.state);
                final Thread shutdownThread = new Thread(new ShutDownUtil(candidate));
                shutdownThread.start();
                threads.add(shutdownThread);
            }
        }
    }

    /**
     * Shuts down the datanode stored at the given position.
     *
     * @param i index into the cluster's datanode list
     */
    public void shutDownDataNode(int i) throws IOException, InterruptedException {
        final String what = "Shutting down datanode: " + i;
        logStateChange(what);
        final AvatarDataNode target = dataNodes.get(i).datanode;
        target.shutdown();
        logStateChange(what + " - completed");
    }

    /**
     * Shuts down all datanodes, each in its own thread, and joins those
     * threads before returning.
     */
    public void shutDownDataNodes() throws IOException, InterruptedException {
        logStateChange("Shutting down avatar datanodes");
        final List<Thread> shutdownThreads = new ArrayList<Thread>();
        processDatanodesForShutdown(shutdownThreads);
        MiniDFSCluster.joinThreads(shutdownThreads);
        logStateChange("Shutting down avatar datanodes - completed");
    }

    /**
     * Shuts down the journal cluster if one is present; otherwise does nothing.
     */
    private void shutDownJournalCluster() throws IOException {
        if (journalCluster == null) {
            return;
        }
        journalCluster.shutdown();
    }

    /**
     * Shuts down every ACTIVE or STANDBY avatar, each in its own thread, joins
     * those threads, and then pauses for one second before returning.
     */
    public void shutDownAvatarNodes() throws IOException, InterruptedException {
        logStateChange("Shutting down avatar nodes");
        final List<Thread> shutdownThreads = new ArrayList<Thread>();
        processNamenodesForShutdown(shutdownThreads);
        MiniDFSCluster.joinThreads(shutdownThreads);
        try {
            Thread.sleep(1000);
        } catch (InterruptedException ignore) {
            // an interrupted pause is not treated as an error here
        }
        logStateChange("Shutting down avatar nodes - completed");
    }

    /**
     * Shuts down the ZooKeeper connection factory and, if it is still running,
     * the ZooKeeper server.
     *
     * Either component is skipped when it has not been created, so calling this
     * method after a failed or skipped ZooKeeper start does not raise a
     * NullPointerException.
     *
     * @throws IOException          declared for callers; see the invoked shutdown calls
     * @throws InterruptedException if interrupted while joining the connection factory
     */
    public static void shutDownZooKeeper() throws IOException, InterruptedException {
        logStateChange("Shutting down zookeeper server");
        if (cnxnFactory != null) {
            cnxnFactory.shutdown();
            cnxnFactory.join();
            LOG.info("Zookeeper Connection Factory shutdown");
        }
        if (zooKeeper != null && zooKeeper.isRunning()) {
            zooKeeper.shutdown();
        }
        logStateChange("Shutting down zookeeper server - completed");
    }

    /**
     * Shuts down the cluster: all datanodes and all ACTIVE or STANDBY avatars
     * are stopped in parallel threads which are then joined, after which the
     * journal cluster (if any) is shut down.
     */
    public void shutDown() throws IOException, InterruptedException {
        logStateChange("Shutting down Mini Avatar Cluster");
        final List<Thread> shutdownThreads = new ArrayList<Thread>();
        // datanodes first, then namenodes; all of them are joined together
        processDatanodesForShutdown(shutdownThreads);
        processNamenodesForShutdown(shutdownThreads);
        MiniDFSCluster.joinThreads(shutdownThreads);
        shutDownJournalCluster();
        logStateChange("Shutting down Mini Avatar Cluster - completed");
    }

    /**
     * Starts the cluster's configured number of datanodes with the given
     * simulated capacities, using the cluster's racks, hosts and configuration.
     *
     * The racks and hosts are passed in the order the five-argument overload
     * declares them (racks first, then hosts).
     *
     * @param simulatedCapacities per-datanode simulated capacities; may be null
     * @throws IOException if the datanodes cannot be started
     */
    private void startDataNodes(long[] simulatedCapacities) throws IOException {
        startDataNodes(simulatedCapacities, numDataNodes, racks, hosts, conf);
    }

    /**
     * Starts the cluster's configured number of datanodes using the cluster's
     * racks, hosts and configuration.
     *
     * @throws IOException if the datanodes cannot be started
     */
    private void startDataNodes() throws IOException {
        this.startDataNodes(this.numDataNodes, this.racks, this.hosts, this.conf);
    }

    /**
     * Starts datanodes without simulated capacities; delegates to the
     * five-argument overload with a null capacity array.
     *
     * @param numDataNodes number of datanodes to start
     * @param racks        rack names, or null
     * @param hosts        host names, or null
     * @param conf         configuration handed to the delegate
     * @throws IOException if the datanodes cannot be started
     */
    public void startDataNodes(int numDataNodes, String[] racks, String[] hosts, Configuration conf)
            throws IOException {
        final long[] noSimulatedCapacities = null;
        startDataNodes(noSimulatedCapacities, numDataNodes, racks, hosts, conf);
    }

    /**
     * Starts {@code numDataNodes} additional datanodes, one thread per
     * datanode, and waits for all of those threads.
     *
     * The rack and host arrays, when given, must contain at least
     * {@code numDataNodes} entries. After the threads have been joined, the
     * cluster's datanode count is set to the size of the datanode list.
     *
     * NOTE(review): the worker threads are only given the node index, the
     * current datanode count and the capacities; the racks, hosts and conf
     * arguments of this method (including host names generated below) are not
     * passed to them - confirm whether that is intended.
     *
     * @param simulatedCapacities per-datanode simulated capacities; may be null
     * @param numDataNodes        number of datanodes to start
     * @param racks               rack names, or null
     * @param hosts               host names, or null
     * @param conf                configuration argument (see review note)
     * @throws IllegalArgumentException if racks or hosts has fewer entries than
     *                                  numDataNodes
     * @throws IOException              if joining the start threads reports failure
     */
    public void startDataNodes(long[] simulatedCapacities, int numDataNodes, String[] racks, String[] hosts,
            Configuration conf) throws IOException {
        final int curDn = dataNodes.size();

        final boolean tooFewRacks = racks != null && numDataNodes > racks.length;
        if (tooFewRacks) {
            throw new IllegalArgumentException("The length of racks [" + racks.length
                    + "] is less than the number of datanodes [" + numDataNodes + "].");
        }
        final boolean tooFewHosts = hosts != null && numDataNodes > hosts.length;
        if (tooFewHosts) {
            throw new IllegalArgumentException("The length of hosts [" + hosts.length
                    + "] is less than the number of datanodes [" + numDataNodes + "].");
        }

        // Racks were given without hosts: make up host names.
        if (racks != null && hosts == null) {
            LOG.info("Generating host names for datanodes");
            hosts = new String[numDataNodes];
            int slot = 0;
            while (slot < numDataNodes) {
                hosts[slot] = "host" + (curDn + slot) + ".foo.com";
                slot++;
            }
        }

        final ArrayList<Thread> starters = new ArrayList<Thread>();
        for (int node = 0; node < numDataNodes; node++) {
            final Thread starter = new Thread(new StartDatanodeUtil(node, curDn, simulatedCapacities));
            starter.start();
            starters.add(starter);
        }
        final boolean allJoined = MiniDFSCluster.joinThreads(starters);
        if (!allJoined) {
            throw new IOException("Failed to startup the nodes");
        }
        this.numDataNodes = dataNodes.size();
    }

    /**
     * Runnable that creates, starts and records a single datanode.
     *
     * Only the node index, the base datanode count and the simulated capacities
     * are stored in this object. The names conf, startOpt, dataDir, hosts, racks
     * and dataNodes used in run() are not declared here, so they resolve to
     * members of the enclosing class.
     */
    class StartDatanodeUtil implements Runnable {
        // index of this datanode within the batch being started
        private int i;
        // number of datanodes that existed before the batch was started
        private int curDn;
        // per-datanode simulated capacities indexed by i; may be null
        private long[] simulatedCapacities;

        StartDatanodeUtil(int node, int curDn, long[] simulatedCapacities) {
            this.i = node;
            this.curDn = curDn;
            this.simulatedCapacities = simulatedCapacities;
        }

        /**
         * Configures two data directories for the datanode, instantiates it,
         * starts its daemon and appends it to the datanode list. An IOException
         * is logged and not rethrown.
         */
        @Override
        public void run() {
            try {
                // REGULAR unless the startup option is ROLLBACK
                String dnArg = StartupOption.REGULAR.getName();
                if (startOpt != null && startOpt == StartupOption.ROLLBACK) {
                    dnArg = startOpt.getName();
                }
                String[] dnArgs = { dnArg };
                // absolute datanode number: existing count plus index in this batch
                int iN = curDn + i;
                Configuration dnConf = new Configuration(conf);

                if (simulatedCapacities != null) {
                    dnConf.setBoolean("dfs.datanode.simulateddatastorage", true);
                    dnConf.setLong(SimulatedFSDataset.CONFIG_PROPERTY_CAPACITY, simulatedCapacities[i]);
                }

                // each datanode gets two directories: data(2*iN+1) and data(2*iN+2)
                File dir1 = new File(dataDir, "data" + (2 * iN + 1));
                File dir2 = new File(dataDir, "data" + (2 * iN + 2));
                dir1.mkdirs();
                dir2.mkdirs();
                if (!dir1.isDirectory() || !dir2.isDirectory()) {
                    throw new IOException(
                            "Mkdirs failed to create directory for DataNode " + iN + ": " + dir1 + " or " + dir2);
                }
                dnConf.set("dfs.data.dir", dir1.getPath() + "," + dir2.getPath());

                LOG.info("Starting DataNode " + iN + " with dfs.data.dir: " + dnConf.get("dfs.data.dir"));

                if (hosts != null) {
                    dnConf.set(FSConstants.SLAVE_HOST_NAME, hosts[i]);
                    LOG.info("Starting DataNode " + iN + " with hostname set to: "
                            + dnConf.get(FSConstants.SLAVE_HOST_NAME));
                }

                if (racks != null) {
                    // NOTE(review): hosts is dereferenced here without a null check;
                    // confirm hosts is always set whenever racks is.
                    String name = hosts[i];
                    LOG.info("Adding node with hostname : " + name + " to rack " + racks[i]);
                    StaticMapping.addNodeToRack(name, racks[i]);
                }
                Configuration newconf = new Configuration(dnConf); // save config
                AvatarDataNode dn = instantiateDataNode(dnArgs, dnConf);
                // since the HDFS does things based on IP:port, we need to add the
                // mapping
                // for IP:port to rackId

                String ipAddr = dn.getSelfAddr().getAddress().getHostAddress();
                if (racks != null) {
                    int port = dn.getSelfAddr().getPort();
                    System.out
                            .println("Adding node with IP:port : " + ipAddr + ":" + port + " to rack " + racks[i]);
                    StaticMapping.addNodeToRack(ipAddr + ":" + port, racks[i]);
                }
                dn.runDatanodeDaemon();
                // several of these runnables may append at the same time
                synchronized (dataNodes) {
                    dataNodes.add(new DataNodeProperties(dn, newconf, dnArgs));
                }
            } catch (IOException e) {
                LOG.error("Exception when creating datanode", e);
            }
        }
    }

    /**
     * Waits, namenode by namenode, until every avatar reports a datanode
     * address.
     */
    public void waitAvatarNodesActive() {
        int nnIndex = 0;
        while (nnIndex < this.nameNodes.length) {
            waitAvatarNodesActive(nnIndex);
            nnIndex++;
        }
    }

    /**
     * Polls every 200 ms until each avatar of the given namenode returns a
     * non-null value from getNameNodeDNAddress(). Interrupts during the pause
     * are ignored and polling continues.
     *
     * @param nnIndex index into the cluster's namenode array
     */
    public void waitAvatarNodesActive(int nnIndex) {
        final NameNodeInfo info = this.nameNodes[nnIndex];
        for (AvatarInfo current : info.avatars) {
            for (;;) {
                if (current.avatar.getNameNodeDNAddress() != null) {
                    break;
                }
                try {
                    logStateChange("Waiting for avatar");
                    Thread.sleep(200);
                } catch (InterruptedException ignore) {
                    // keep polling
                }
            }
        }
    }

    /*
     * Waits for the datanodes of every namespace, unless the configuration
     * key "fs.datanodes.wait" is set to false.
     */
    public void waitDataNodesActive() throws IOException {
        final boolean shouldWait = conf.getBoolean("fs.datanodes.wait", true);
        if (!shouldWait) {
            LOG.info("Will not wait for datanodes");
            return;
        }
        int nnIndex = 0;
        while (nnIndex < this.nameNodes.length) {
            waitDataNodesActive(nnIndex);
            nnIndex++;
        }
    }

    /*
     * Waits until the number of live datanodes reported through the file
     * system of the given namespace equals the cluster's datanode count.
     *
     * A file system handle is obtained and closed on every polling round. The
     * handle is scoped to a single round so that a round in which obtaining
     * the file system fails does not close the previous round's handle again.
     * Exceptions raised inside a round are logged and the round is repeated.
     */
    public void waitDataNodesActive(int nnIndex) throws IOException {
        logStateChange("Waiting for data nodes");
        int liveDataNodes = 0;
        // make sure all datanodes are alive
        while (liveDataNodes != numDataNodes) {
            DistributedAvatarFileSystem dafs = null;
            try {
                dafs = getFileSystem(nnIndex);
                Thread.sleep(200);
                liveDataNodes = dafs.getLiveDataNodeStats(false).length;
                logStateChange("Waiting for data nodes : live=" + liveDataNodes + ", total=" + numDataNodes);
            } catch (Exception e) {
                LOG.warn("Exception waiting for datanodes : ", e);
            } finally {
                if (dafs != null) {
                    dafs.close();
                }
            }
        }
        logStateChange("Waiting for data nodes - completed");
    }

    /**
     * Verifies that the cluster has exactly one namenode.
     *
     * @throws IllegalArgumentException if the namenode count is not one
     */
    private void checkSingleNameNode() {
        final boolean single = nameNodes.length == 1;
        if (single) {
            return;
        }
        throw new IllegalArgumentException("It's not a single namenode cluster, use index instead.");
    }

    /**
     * Returns the first avatar of the given namenode whose state is ACTIVE, or
     * null if there is none.
     */
    public AvatarInfo getPrimaryAvatar(int nnIndex) {
        final AvatarState wanted = AvatarState.ACTIVE;
        return getAvatarByState(nnIndex, wanted);
    }

    /**
     * Returns the first avatar of the given namenode whose state is STANDBY,
     * or null if there is none.
     */
    public AvatarInfo getStandbyAvatar(int nnIndex) {
        final AvatarState wanted = AvatarState.STANDBY;
        return getAvatarByState(nnIndex, wanted);
    }

    /**
     * Returns the first avatar of the given namenode whose state is DEAD, or
     * null if there is none.
     */
    private AvatarInfo getDeadAvatar(int nnIndex) {
        final AvatarState wanted = AvatarState.DEAD;
        return getAvatarByState(nnIndex, wanted);
    }

    /**
     * Looks up the first avatar of the given namenode that is in the given
     * state.
     *
     * @param nnIndex index into the cluster's namenode array
     * @param state   state to look for
     * @return the matching avatar, or null when no avatar is in that state
     */
    private AvatarInfo getAvatarByState(int nnIndex, AvatarState state) {
        AvatarInfo match = null;
        for (AvatarInfo candidate : this.nameNodes[nnIndex].avatars) {
            if (candidate.state == state) {
                match = candidate;
                break;
            }
        }
        return match;
    }

    /**
     * Polls every 50 ms until the primary avatar of each namenode has been
     * checkpointed. Returns immediately when either "fs.checkpoint.wait" or
     * "fs.checkpoint.enabled" is set to false in the configuration.
     */
    private void waitForTheFirstCheckpoint() {
        final boolean waitingEnabled = conf.getBoolean("fs.checkpoint.wait", true)
                && conf.getBoolean("fs.checkpoint.enabled", true);
        if (!waitingEnabled) {
            logStateChange("Waiting for checkpoint is disabled");
            return;
        }
        logStateChange("Waiting for first checkpoint");
        // wait for the first checkpoint to happen, as we
        // assert txids which depend on the checkpoints
        int nnIndex = 0;
        while (nnIndex < this.nameNodes.length) {
            for (;;) {
                if (isCheckpointed(nnIndex)) {
                    break;
                }
                try {
                    logStateChange("Waiting until avatar0 has been checkpointed");
                    Thread.sleep(50);
                } catch (InterruptedException ignore) {
                    // keep polling
                }
            }
            nnIndex++;
        }
        logStateChange("Waiting for first checkpoint - completed");
    }

    /**
     * Tells whether the given namenode has a primary avatar whose image
     * reports a last checkpoint transaction id greater than -1.
     */
    private boolean isCheckpointed(int nnIndex) {
        final AvatarInfo primary = getPrimaryAvatar(nnIndex);
        if (primary == null) {
            return false;
        }
        final long lastCheckpointTxId = primary.avatar.getFSImage().getLastCheckpointTxId();
        return lastCheckpointTxId > -1;
    }

    /**
     * Tells whether the given namenode has a primary avatar that is out of
     * safe mode and, unless the cluster has no datanodes, whose first stats
     * entry is non-zero.
     */
    private boolean hasLeftSafeMode(int nnIndex) throws IOException {
        final AvatarInfo primary = getPrimaryAvatar(nnIndex);
        if (primary == null || primary.avatar.isInSafeMode()) {
            return false;
        }
        if (this.numDataNodes == 0) {
            return true;
        }
        return primary.avatar.getStats()[0] != 0;
    }

    /**
     * Polls every 50 ms, namenode by namenode, until hasLeftSafeMode reports
     * true. Interrupts during the pause are ignored and polling continues.
     */
    private void waitExitSafeMode() throws IOException {
        int nnIndex = 0;
        while (nnIndex < this.nameNodes.length) {
            for (;;) {
                if (hasLeftSafeMode(nnIndex)) {
                    break;
                }
                try {
                    logStateChange("Waiting until avatar0 has left safe mode");
                    Thread.sleep(50);
                } catch (InterruptedException ignore) {
                    // keep polling
                }
            }
            nnIndex++;
        }
    }

    /**
     * Get DAFS of a single-namenode cluster.
     *
     * @throws IllegalArgumentException if the cluster has more than one namenode
     */
    public DistributedAvatarFileSystem getFileSystem() throws IOException {
        checkSingleNameNode();
        final int onlyNameNode = 0;
        return getFileSystem(onlyNameNode);
    }

    /**
     * Get DAFS for the given namenode, obtained through FileSystem.get with
     * that namenode's client configuration.
     *
     * @throws IOException if the returned file system is not a
     *                     DistributedAvatarFileSystem
     */
    public DistributedAvatarFileSystem getFileSystem(int nnIndex) throws IOException {
        final FileSystem candidate = FileSystem.get(this.nameNodes[nnIndex].clientConf);
        if (candidate instanceof DistributedAvatarFileSystem) {
            return (DistributedAvatarFileSystem) candidate;
        }
        throw new IOException("fs is not avatar fs");
    }

    /**
     * Kill the primary avatar node of a single-namenode cluster, clearing its
     * ZooKeeper entries.
     */
    public void killPrimary() throws IOException {
        checkSingleNameNode();
        final boolean clearZK = true;
        killPrimary(0, clearZK);
    }

    /**
     * Kill the primary avatar node of the given namenode, clearing its
     * ZooKeeper entries.
     */
    public void killPrimary(int nnIndex) throws IOException {
        final boolean clearZK = true;
        killPrimary(nnIndex, clearZK);
    }

    /**
     * Kill the primary avatar node of a single-namenode cluster.
     * @param clearZK clear zookeeper?
     */
    public void killPrimary(boolean clearZK) throws IOException {
        checkSingleNameNode();
        final int onlyNameNode = 0;
        killPrimary(onlyNameNode, clearZK);
    }

    /**
     * Kill the primary avatar node of the given namenode: optionally clear its
     * ZooKeeper entries, shut the avatar down, mark it DEAD and pause for one
     * second.
     * @param nnIndex index into the cluster's namenode array
     * @param clearZK clear zookeeper?
     * @throws IOException if the namenode has no ACTIVE avatar
     */
    public void killPrimary(int nnIndex, boolean clearZK) throws IOException {
        logStateChange("Killing primary avatar: " + nnIndex);
        final AvatarInfo victim = getPrimaryAvatar(nnIndex);
        if (victim == null) {
            throw new IOException("can't kill primary avatar, already dead");
        }

        if (clearZK) {
            clearZooKeeperNode(nnIndex);
        }

        victim.avatar.shutdown(true);
        victim.avatar = null;
        victim.state = AvatarState.DEAD;

        try {
            Thread.sleep(1000);
        } catch (InterruptedException ignore) {
            // an interrupted pause is not treated as an error here
        }
        logStateChange("Killing primary avatar: " + nnIndex + " - completed");
    }

    /**
     * Kill the standby avatar node of a single-namenode cluster.
     */
    public void killStandby() throws IOException {
        checkSingleNameNode();
        final int onlyNameNode = 0;
        killStandby(onlyNameNode);
    }

    /**
     * Kill the standby avatar node of the given namenode: shut it down, mark
     * it DEAD and pause for one second. When there is no STANDBY avatar this
     * is only logged.
     */
    public void killStandby(int nnIndex) throws IOException {
        logStateChange("Killing standby avatar: " + nnIndex);
        final AvatarInfo victim = getStandbyAvatar(nnIndex);
        if (victim == null) {
            logStateChange("Can't kill standby avatar, already dead");
            return;
        }

        victim.avatar.shutdown(true);
        victim.avatar = null;
        victim.state = AvatarState.DEAD;

        try {
            Thread.sleep(1000);
        } catch (InterruptedException ignore) {
            // an interrupted pause is not treated as an error here
        }
        logStateChange("Killing standby avatar: " + nnIndex + " - completed");
    }

    /**
     * Fails over a single-namenode cluster with the force flag set to false.
     */
    public void failOver() throws IOException {
        final boolean force = false;
        failOver(force);
    }

    /**
     * Fails over a single-namenode cluster.
     *
     * @param force passed through to the standby's quiesceForFailover call
     */
    public void failOver(boolean force) throws IOException {
        checkSingleNameNode();
        final int onlyNameNode = 0;
        failOver(onlyNameNode, force);
    }

    /**
     * Make standby avatar the new primary avatar. Kill the old
     * primary avatar first if necessary. The force flag is false.
     */
    public void failOver(int nnIndex) throws IOException {
        final boolean force = false;
        failOver(nnIndex, force);
    }

    /**
     * Makes the standby avatar of the given namenode the new primary.
     *
     * An ACTIVE avatar, if present, is killed first. The standby is then
     * quiesced, and after a five second pause the failover is performed, the
     * avatar is marked ACTIVE and its ports are registered in ZooKeeper.
     *
     * @param nnIndex index into the cluster's namenode array
     * @param force   passed through to the standby's quiesceForFailover call
     * @throws IOException if the namenode has no STANDBY avatar
     */
    public void failOver(int nnIndex, boolean force) throws IOException {
        logStateChange("Failover avatar: " + nnIndex);
        final boolean primaryPresent = getPrimaryAvatar(nnIndex) != null;
        if (primaryPresent) {
            LOG.info("killing primary avatar before failover");
            killPrimary(nnIndex);
        }

        final AvatarInfo promoted = getStandbyAvatar(nnIndex);
        if (promoted == null) {
            throw new IOException("no standby avatar running");
        }

        promoted.avatar.quiesceForFailover(force);
        // Introduce a synthetic delay since this is what will happen in practice.
        // There will be some delay between both calls and this is to make sure
        // there are no locking issues since this was earlier one RPC under a single
        // lock and now its two RPCs which take the lock twice.
        DFSTestUtil.waitNSecond(5);
        promoted.avatar.performFailover();
        promoted.state = AvatarState.ACTIVE;

        final NameNodeInfo owner = this.nameNodes[nnIndex];
        registerZooKeeperNode(promoted.nnPort, promoted.nnDnPort, promoted.httpPort, promoted.rpcPort, owner);
        logStateChange("Failover avatar: " + nnIndex + " : completed");
    }

    /**
     * Restart a dead avatar node as a standby avatar in a single-namenode
     * cluster.
     */
    public void restartStandby() throws IOException {
        checkSingleNameNode();
        final int onlyNameNode = 0;
        restartStandby(onlyNameNode);
    }

    /**
     * Restart a dead avatar node as a standby avatar.
     *
     * The node is started with the dead avatar's own startup option followed by
     * the STANDBY and REGULAR options (plus the SERVICE option and nameservice
     * id for a federated cluster). The dead slot is only updated after the new
     * node has been created, so a failed start leaves it marked DEAD.
     *
     * @param nnIndex index into the cluster's namenode array
     * @throws IOException if the namenode has no ACTIVE avatar, has no DEAD
     *                     avatar, or the new node cannot be created
     */
    public void restartStandby(int nnIndex) throws IOException {
        AvatarInfo dead = getDeadAvatar(nnIndex);
        if (getPrimaryAvatar(nnIndex) == null || dead == null) {
            throw new IOException("cannot start standby avatar: " + "primary or dead avatar not found");
        }
        logStateChange("Restarting " + dead.startupOption + " as standby");
        NameNodeInfo nni = this.nameNodes[nnIndex];

        ArrayList<String> argList = new ArrayList<String>();
        argList.add(dead.startupOption);
        argList.add(AvatarConstants.StartupOption.STANDBY.getName());
        argList.add(AvatarConstants.StartupOption.REGULAR.getName());
        if (federation) {
            argList.add(StartupOption.SERVICE.getName());
            argList.add(nni.nameserviceId);
        }
        String[] args = argList.toArray(new String[argList.size()]);

        AvatarNode restarted = instantiateAvatarNode(args, getServerConf(dead.startupOption, nni));
        if (restarted == null) {
            // Leave the slot DEAD instead of recording a STANDBY without a node.
            throw new IOException("cannot start avatar node");
        }
        dead.avatar = restarted;
        dead.state = AvatarState.STANDBY;
        logStateChange("Restarting " + dead.startupOption + " as standby - completed");
    }

    /**
     * Returns the NameNodeInfo stored at the given position of the namenode
     * array.
     */
    public NameNodeInfo getNameNode(int nnIndex) {
        final NameNodeInfo info = this.nameNodes[nnIndex];
        return info;
    }

    /**
     * Returns the cluster's own datanode list (not a copy).
     */
    public ArrayList<DataNodeProperties> getDataNodeProperties() {
        return this.dataNodes;
    }

    /**
     * Gets a new list holding the datanode of every entry in the cluster's
     * datanode list, in order.  May be empty.
     */
    public ArrayList<AvatarDataNode> getDataNodes() {
        final ArrayList<AvatarDataNode> nodes = new ArrayList<AvatarDataNode>();
        int idx = 0;
        while (idx < dataNodes.size()) {
            nodes.add(dataNodes.get(idx).datanode);
            idx++;
        }
        return nodes;
    }

    /*
     * Returns the length of the namenode array.
     */
    public int getNumNameNodes() {
        final int count = this.nameNodes.length;
        return count;
    }

    /**
     * Add a namenode to cluster and start it. Configuration of datanodes
     * in the cluster is refreshed to register with the new namenode.
     *
     * The new nameservice id is appended to the federation nameservices entry
     * of the given configuration.
     *
     * @param conf configuration that is updated and handed to the datanodes
     * @return newly started namenode
     * @throws IOException if the cluster is not federated or startup fails
     */
    public NameNodeInfo addNameNode(Configuration conf) throws IOException {
        if (!federation) {
            throw new IOException("cannot add namenode to non-federated cluster");
        }

        // Grow the namenode array by one slot and fill the new slot.
        final int nnIndex = nameNodes.length;
        final NameNodeInfo[] grown = new NameNodeInfo[nnIndex + 1];
        System.arraycopy(nameNodes, 0, grown, 0, nnIndex);
        nameNodes = grown;
        final NameNodeInfo nni = new NameNodeInfo(nnIndex);
        nameNodes[nnIndex] = nni;

        nni.createAvatarDirs();
        final String nameserviceId = NAMESERVICE_ID_PREFIX + getNSId();
        // The current list is read before the namenode's general conf is initialized.
        final String nameserviceIds = conf.get(FSConstants.DFS_FEDERATION_NAMESERVICES) + "," + nameserviceId;
        nni.initGeneralConf(conf, nameserviceId);
        conf.set(FSConstants.DFS_FEDERATION_NAMESERVICES, nameserviceIds);

        nni.updateAvatarConf(conf);
        startAvatarNode(nni, null);

        // Refresh datanodes with the newly started namenode
        for (DataNodeProperties props : dataNodes) {
            final DataNode node = props.datanode;
            node.refreshNamenodes(conf);
        }
        // Wait for new namenode to get registrations from all the datanodes
        waitDataNodesActive(nnIndex);
        return nni;
    }

    /**
     * Copies, from srcConf to dstConf, the value of every nameservice-qualified
     * key derived from AvatarNode.AVATARSERVICE_SPECIFIC_KEYS and from
     * NameNode.NAMESERVICE_SPECIFIC_KEYS. Keys without a value in srcConf are
     * left untouched in dstConf.
     *
     * @param dstConf       configuration that receives the values
     * @param srcConf       configuration the values are read from
     * @param nameserviceId nameservice id used to qualify each key
     */
    private void updateAvatarConfWithServiceId(Configuration dstConf, Configuration srcConf, String nameserviceId) {
        for (String avatarKey : AvatarNode.AVATARSERVICE_SPECIFIC_KEYS) {
            final String qualifiedKey = DFSUtil.getNameServiceIdKey(avatarKey, nameserviceId);
            final String found = srcConf.get(qualifiedKey);
            if (found == null) {
                continue;
            }
            dstConf.set(qualifiedKey, found);
        }
        for (String nameNodeKey : NameNode.NAMESERVICE_SPECIFIC_KEYS) {
            final String qualifiedKey = DFSUtil.getNameServiceIdKey(nameNodeKey, nameserviceId);
            final String found = srcConf.get(qualifiedKey);
            if (found == null) {
                continue;
            }
            dstConf.set(qualifiedKey, found);
        }
    }

    /**
     * Add another cluster to current cluster and start it. Configuration of datanodes
     * in the cluster is refreshed to register with the new namenodes.
     *
     * Both clusters are shut down first. The namenodes of the other cluster are
     * appended to this cluster's namenode array, the datanode configurations of
     * this cluster are updated for every namenode, the datanodes are started
     * again, and finally the avatar nodes of every namenode are started.
     *
     * @param cluster the cluster to merge into this one
     * @param format  new value of this cluster's format flag, which is consulted
     *                when the avatar nodes are started
     * @throws IOException if either cluster is not federated, or the other
     *                     cluster has more datanodes than this one
     */
    public void addCluster(MiniAvatarCluster cluster, boolean format) throws IOException, InterruptedException {
        if (!federation || !cluster.federation) {
            throw new IOException("Cannot handle non-federated cluster");
        }
        if (cluster.dataNodes.size() > this.dataNodes.size()) {
            throw new IOException("Cannot merge: new cluster has more datanodes the old one.");
        }
        this.shutDown();
        cluster.shutDown();

        // nnIndex is the position of the first namenode taken over from the other cluster
        int nnIndex = nameNodes.length;
        int numNameNodes = nameNodes.length + cluster.nameNodes.length;
        NameNodeInfo[] newlist = new NameNodeInfo[numNameNodes];
        System.arraycopy(nameNodes, 0, newlist, 0, nameNodes.length);
        System.arraycopy(cluster.nameNodes, 0, newlist, nameNodes.length, cluster.nameNodes.length);
        nameNodes = newlist;
        // combined nameservice list: this cluster's ids followed by the other cluster's
        String newNameserviceIds = cluster.conf.get(FSConstants.DFS_FEDERATION_NAMESERVICES);
        String nameserviceIds = conf.get(FSConstants.DFS_FEDERATION_NAMESERVICES);
        nameserviceIds += "," + newNameserviceIds;
        this.format = format;
        conf.set(FSConstants.DFS_FEDERATION_NAMESERVICES, nameserviceIds);

        int i;
        for (i = 0; i < nameNodes.length; i++) {
            NameNodeInfo nni = nameNodes[i];
            String nameserviceId = nni.nameserviceId;
            nni.initGeneralConf(nni.conf, nni.nameserviceId);
            nni.updateAvatarConf(nni.conf);
            for (int dnIndex = 0; dnIndex < dataNodes.size(); dnIndex++) {
                Configuration dstConf = dataNodes.get(dnIndex).conf;
                if (i >= nnIndex) {
                    // namenode came from the other cluster: record that cluster's data
                    // directories for the datanode at the same position
                    // NOTE(review): cluster.dataNodes may be shorter than dataNodes (only
                    // "larger" is rejected above), in which case this get() would fail - confirm.
                    String dataStr = cluster.dataNodes.get(dnIndex).conf.get("dfs.data.dir");
                    dstConf.set("dfs.merge.data.dir." + nameserviceId, dataStr);
                }
                updateAvatarConfWithServiceId(dstConf, nni.conf, nameserviceId);
            }
        }

        // start this cluster's datanodes again with the combined nameservice list
        for (DataNodeProperties dn : dataNodes) {
            dn.conf.set(FSConstants.DFS_FEDERATION_NAMESERVICES, nameserviceIds);
            dn.datanode = instantiateDataNode(dn.dnArgs, dn.conf);
            dn.datanode.runDatanodeDaemon();
        }

        for (i = 0; i < nameNodes.length; i++) {
            NameNodeInfo nni = nameNodes[i];
            Thread.sleep(2000);
            // namenodes that already belonged to this cluster are started with UPGRADE,
            // the ones taken over from the other cluster without an extra option
            if (i < nnIndex) {
                startAvatarNode(nni, StartupOption.UPGRADE);
            } else {
                startAvatarNode(nni, null);
            }
        }
        waitAvatarNodesActive();
        waitDataNodesActive();
        waitExitSafeMode();
    }

    /**
     * Restart all datanodes and wait for them to become active.
     *
     * @return the result of restartDataNodes(true)
     */
    public synchronized boolean restartDataNodes() throws IOException, InterruptedException {
        final boolean waitActive = true;
        return restartDataNodes(waitActive);
    }

    /**
     * Shuts down the datanode at the given position and starts a replacement
     * configured with the address "localhost:" plus the port of the datanode
     * it replaces.
     *
     * @param waitActive whether to wait until the new datanode is initialized
     * @param index      index into the cluster's datanode list
     */
    public synchronized void restartDataNode(boolean waitActive, int index)
            throws IOException, InterruptedException {
        this.shutDownDataNode(index);
        final DataNodeProperties props = dataNodes.get(index);
        LOG.info("Restart Datanode " + index);

        // Use the same port since dn is identified by host:port.
        final String sameAddress = "localhost:" + props.datanode.getSelfAddr().getPort();
        props.conf.set(FSConstants.DFS_DATANODE_ADDRESS_KEY, sameAddress);

        final AvatarDataNode restarted = instantiateDataNode(props.dnArgs, props.conf);
        props.datanode = restarted;
        restarted.runDatanodeDaemon();

        if (!waitActive) {
            return;
        }
        waitDataNodeInitialized(props.datanode);
    }

    /*
     * Restart all datanodes: shut all of them down, restart them one by one,
     * and optionally wait until they are active. Always returns true.
     */
    public synchronized boolean restartDataNodes(boolean waitActive) throws IOException, InterruptedException {
        logStateChange("Restarting avatar datanodes");
        shutDownDataNodes();

        int idx = 0;
        while (idx < dataNodes.size()) {
            restartDataNode(waitActive, idx);
            idx++;
        }

        if (waitActive) {
            waitDataNodesActive();
        }
        logStateChange("Restarting avatar datanodes - completed");
        return true;
    }

    /**
     * Waits until the given datanode reports itself initialized for every
     * name node of this cluster, polling every 100 ms.
     *
     * <p>For each name node the address checked is {@code localhost} on the
     * {@code nnDnPort} of that name node's first avatar (index 0).
     *
     * <p>If the waiting thread is interrupted, its interrupt status is
     * restored and the wait is aborted with an
     * {@link java.io.InterruptedIOException} instead of silently continuing
     * to poll.
     *
     * @param dn the datanode to wait for; when {@code null} the method
     *           returns immediately
     * @throws IOException when some ServicePair threads are dead, or (as an
     *                     {@code InterruptedIOException}) when the thread is
     *                     interrupted while waiting
     */
    public synchronized void waitDataNodeInitialized(AvatarDataNode dn) throws IOException {
        if (dn == null) {
            return;
        }
        while (true) {
            boolean initialized = true;
            for (int i = 0; i < nameNodes.length; i++) {
                InetSocketAddress nameNodeAddr = new InetSocketAddress("localhost",
                        getNameNode(i).avatars.get(0).nnDnPort);
                if (!dn.initialized(nameNodeAddr)) {
                    initialized = false;
                    break;
                }
            }
            if (initialized) {
                // Nothing left to wait for; do not pay for another poll interval.
                return;
            }
            try {
                Thread.sleep(100);
            } catch (InterruptedException e) {
                // Keep the interrupt visible to callers and stop polling so
                // that a cancelled or timed-out test does not hang here.
                Thread.currentThread().interrupt();
                throw new java.io.InterruptedIOException(
                        "Interrupted while waiting for datanode to initialize");
            }
        }
    }

    /**
     * Returns the namespace ID reported by the first avatar (index 0) of the
     * name node stored at the given position in {@code nameNodes}.
     *
     * @param index position of the name node in {@code nameNodes}
     * @return the value of {@code getNamespaceID()} on that avatar
     */
    public int getNamespaceId(int index) {
        final NameNodeInfo nameNode = this.nameNodes[index];
        return nameNode.avatars.get(0).avatar.getNamespaceID();
    }

    /**
     * Hands out the current value of the class-wide {@code currNSId} counter
     * and advances the counter by one.
     *
     * @return the counter value before the increment
     */
    public static int getNSId() {
        final int assigned = MiniAvatarCluster.currNSId;
        MiniAvatarCluster.currNSId++;
        return assigned;
    }

    /**
     * Instantiates an {@code AvatarDataNode}, retrying on {@link IOException}
     * up to {@code instantiationRetries} times with a one second pause
     * between attempts. Each attempt is given its own copy of {@code conf}.
     *
     * <p>No pause is taken after the final failed attempt, so the failure is
     * reported without an extra delay.
     *
     * @param dnArgs command-line style arguments passed to the datanode
     * @param conf   configuration to copy for each attempt
     * @return the instantiated datanode
     * @throws IOException the failure of the last attempt once all retries are
     *                     exhausted; also thrown if no attempt could be made
     *                     or if the pause between attempts is interrupted
     */
    public static AvatarDataNode instantiateDataNode(String[] dnArgs, Configuration conf) throws IOException {
        IOException lastFailure = null;
        for (int attempt = 0; attempt < instantiationRetries; attempt++) {
            if (attempt > 0) {
                // Back off only between attempts, never after the last one.
                sleep(1000);
            }
            try {
                return AvatarDataNode.instantiateDataNode(dnArgs, new Configuration(conf));
            } catch (IOException ioe) {
                lastFailure = ioe;
                LOG.info("Trying to instantiate datanode... ", ioe);
            }
        }
        if (lastFailure == null) {
            // Reached only when the retry budget is not positive; avoid
            // throwing a null reference (which would surface as an NPE).
            lastFailure = new IOException(
                    "No attempt made to instantiate avatardatanode, instantiationRetries="
                            + instantiationRetries);
        }
        LOG.fatal("Exception when instantiating avatardatanode", lastFailure);
        throw lastFailure;
    }

    /**
     * Creates an {@code AvatarNode}, retrying on {@link IOException} up to
     * {@code instantiationRetries} times with a one second pause between
     * attempts. The same {@code conf} instance is passed to every attempt.
     *
     * <p>No pause is taken after the final failed attempt, so the failure is
     * reported without an extra delay.
     *
     * @param argv command-line style arguments passed to the avatar node
     * @param conf configuration handed to {@code AvatarNode.createAvatarNode}
     * @return the created avatar node
     * @throws IOException the failure of the last attempt once all retries are
     *                     exhausted; also thrown if no attempt could be made
     *                     or if the pause between attempts is interrupted
     */
    public static AvatarNode instantiateAvatarNode(String argv[], Configuration conf) throws IOException {
        IOException lastFailure = null;
        for (int attempt = 0; attempt < instantiationRetries; attempt++) {
            if (attempt > 0) {
                // Back off only between attempts, never after the last one.
                sleep(1000);
            }
            try {
                return AvatarNode.createAvatarNode(argv, conf);
            } catch (IOException ioe) {
                lastFailure = ioe;
                LOG.info("Trying to instantiate avatarnode... ", ioe);
            }
        }
        if (lastFailure == null) {
            // Reached only when the retry budget is not positive; avoid
            // throwing a null reference (which would surface as an NPE).
            lastFailure = new IOException(
                    "No attempt made to instantiate avatarnode, instantiationRetries="
                            + instantiationRetries);
        }
        LOG.fatal("Exception when instantiating avatarnode", lastFailure);
        throw lastFailure;
    }

    /**
     * Best-effort removal of {@code baseAvatarDir} through
     * {@code FileUtil.fullyDelete}. Any exception raised while doing so is
     * logged at WARN level and not propagated to the caller.
     */
    public static void clearAvatarDir() {
        try {
            final File avatarRoot = new File(baseAvatarDir);
            FileUtil.fullyDelete(avatarRoot);
        } catch (Exception e) {
            LOG.warn("Exception when deleting directory " + baseAvatarDir, e);
        }
    }

    /**
     * Sleeps for the given number of milliseconds, converting an interruption
     * into an {@link IOException} so callers that only declare
     * {@code IOException} can use it.
     *
     * <p>On interruption the thread's interrupt status is restored before
     * throwing, and the original {@link InterruptedException} is attached as
     * the cause.
     *
     * @param time time to sleep, in milliseconds
     * @throws IOException if the thread is interrupted while sleeping
     */
    private static void sleep(long time) throws IOException {
        try {
            Thread.sleep(time);
        } catch (InterruptedException e) {
            LOG.fatal("Thread interrupted");
            // Thread.sleep clears the flag when it throws; set it again so
            // code further up the stack can still observe the interruption.
            Thread.currentThread().interrupt();
            throw new IOException(e.toString(), e);
        }
    }

    /**
     * Writes a cluster state transition to the log at INFO level, framed by
     * dashes so it stands out from surrounding output.
     *
     * @param msg description of the state change
     */
    private static void logStateChange(String msg) {
        final String banner = "----- " + msg + " -----";
        LOG.info(banner);
    }
}