org.apache.hadoop.hbase.regionserver.TestRSKilledWhenInitializing.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.hadoop.hbase.regionserver.TestRSKilledWhenInitializing.java

Source

/**
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.regionserver;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;

import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicReference;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.CategoryBasedTimeout;
import org.apache.hadoop.hbase.CoordinatedStateManager;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HBaseTestingUtility;
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.LocalHBaseCluster;
import org.apache.hadoop.hbase.MiniHBaseCluster;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.master.HMaster;
import org.apache.hadoop.hbase.master.ServerListener;
import org.apache.hadoop.hbase.master.ServerManager;
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionServerStartupResponse;
import org.apache.hadoop.hbase.testclassification.MediumTests;
import org.apache.hadoop.hbase.testclassification.RegionServerTests;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread;
import org.apache.hadoop.hbase.util.Threads;
import org.junit.Rule;
import org.junit.Test;
import org.junit.experimental.categories.Category;
import org.junit.rules.TestName;
import org.junit.rules.TestRule;

/**
 * Tests that a regionserver that dies after reporting for duty gets removed
 * from the master's list of online regionservers. See HBASE-9593.
 */
@Category({ RegionServerTests.class, MediumTests.class })
public class TestRSKilledWhenInitializing {
    private static final Log LOG = LogFactory.getLog(TestRSKilledWhenInitializing.class);
    @Rule
    public TestName testName = new TestName();
    @Rule
    public final TestRule timeout = CategoryBasedTimeout.builder().withTimeout(this.getClass())
            .withLookingForStuckThread(true).build();

    // This boolean needs to be globally available. It is set by startMaster() and read by our
    // mocked up regionserver so it knows when it may die.
    private static final AtomicBoolean masterActive = new AtomicBoolean(false);
    // Ditto for this variable. The mocked regionserver that wins the compareAndSet on it records
    // its own ServerName here and then kills itself.
    private static final AtomicReference<ServerName> killedRS = new AtomicReference<ServerName>();

    private static final int NUM_MASTERS = 1;
    private static final int NUM_RS = 2;

    /**
     * Verifies that a region server is removed from the master's online servers list if it went
     * down after registering with the master. There is no explicit failure assertion for the final
     * removal: the test polls until the server disappears, so it will TIMEOUT (via the
     * {@link #timeout} rule) if the master never drops it.
     * @throws Exception if cluster setup, the region move, or cluster teardown fails
     */
    @Test
    public void testRSTerminationAfterRegisteringToMasterBeforeCreatingEphemeralNode() throws Exception {
        // Create config to use for this cluster
        Configuration conf = HBaseConfiguration.create();
        conf.setInt(ServerManager.WAIT_ON_REGIONSERVERS_MINTOSTART, 1);
        // Start the cluster
        final HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility(conf);
        TEST_UTIL.startMiniDFSCluster(3);
        TEST_UTIL.startMiniZKCluster();
        TEST_UTIL.createRootDir();
        final LocalHBaseCluster cluster = new LocalHBaseCluster(conf, NUM_MASTERS, NUM_RS, HMaster.class,
                RegisterAndDieRegionServer.class);
        try {
            // Started inside the try so that the cluster is still torn down if startup fails.
            final MasterThread master = startMaster(cluster.getMasters().get(0));
            // Master is up waiting on RegionServers to check in. Now start RegionServers.
            for (int i = 0; i < NUM_RS; i++) {
                cluster.getRegionServers().get(i).start();
            }
            // Now wait on master to see NUM_RS + 1 servers as being online, thats NUM_RS plus
            // the Master itself (because Master hosts hbase:meta and checks in as though it a RS).
            List<ServerName> onlineServersList = master.getMaster().getServerManager().getOnlineServersList();
            while (onlineServersList.size() < (NUM_RS + 1)) {
                Threads.sleep(1);
                onlineServersList = master.getMaster().getServerManager().getOnlineServersList();
            }
            // Wait until killedRS is set. Means RegionServer is starting to go down.
            while (killedRS.get() == null) {
                Threads.sleep(1);
            }
            // Wait on the RegionServer to fully die. The cluster only ever has NUM_RS regionserver
            // threads, so "one has died" means fewer than NUM_RS are still live.
            while (cluster.getLiveRegionServers().size() >= NUM_RS) {
                Threads.sleep(1);
            }
            // Make sure Master is fully up before progressing. Could take a while if regions
            // being reassigned.
            while (!master.getMaster().isInitialized()) {
                Threads.sleep(1);
            }

            // Now in steady state. How many regions open? Master should have too many regionservers
            // showing still. The downed RegionServer should still be showing as registered.
            assertTrue(master.getMaster().getServerManager().isServerOnline(killedRS.get()));
            // Find non-meta region (namespace?) and assign to the killed server. That'll trigger cleanup.
            Map<HRegionInfo, ServerName> assignments = master.getMaster().getAssignmentManager()
                    .getRegionStates().getRegionAssignments();
            while (assignments == null || assignments.size() < 2) {
                Threads.sleep(1);
                assignments = master.getMaster().getAssignmentManager().getRegionStates().getRegionAssignments();
            }
            HRegionInfo hri = null;
            for (Map.Entry<HRegionInfo, ServerName> e : assignments.entrySet()) {
                if (!e.getKey().isMetaRegion()) {
                    hri = e.getKey();
                    break;
                }
            }
            // Fail with a readable message rather than an NPE below if only meta regions showed up.
            assertTrue("No non-meta region found in assignments " + assignments, hri != null);
            // Try moving region to the killed server. It will fail. As by-product, we will
            // remove the RS from Master online list because no corresponding znode.
            assertEquals(NUM_RS + 1, master.getMaster().getServerManager().getOnlineServersList().size());
            LOG.info("Move " + hri.getEncodedName() + " to " + killedRS.get());
            master.getMaster().move(hri.getEncodedNameAsBytes(), Bytes.toBytes(killedRS.get().toString()));
            // Wait until the RS no longer shows as registered in Master, i.e. until the online list
            // has shrunk from NUM_RS + 1 entries to NUM_RS (the surviving RS(s) plus the Master).
            onlineServersList = master.getMaster().getServerManager().getOnlineServersList();
            while (onlineServersList.size() > NUM_RS) {
                Thread.sleep(100);
                onlineServersList = master.getMaster().getServerManager().getOnlineServersList();
            }
        } finally {
            // Shutdown is messy with complaints about fs being closed. Why? TODO.
            shutdown(cluster, TEST_UTIL);
        }
    }

    /**
     * Tears down the HBase cluster and then the mini DFS/ZK clusters and test dir, in that order.
     * Each step runs even if an earlier one throws, so a failed HBase shutdown does not leak the
     * backing mini clusters.
     */
    private static void shutdown(LocalHBaseCluster cluster, HBaseTestingUtility util) throws Exception {
        try {
            cluster.shutdown();
            cluster.join();
        } finally {
            try {
                util.shutdownMiniDFSCluster();
            } finally {
                try {
                    util.shutdownMiniZKCluster();
                } finally {
                    util.cleanupTestDir();
                }
            }
        }
    }

    /**
     * Start Master. Get as far as the state where Master is waiting on
     * RegionServers to check in, then return.
     * @param master the (not yet started) master thread to start
     * @return the same {@code master} instance, for call chaining
     */
    private MasterThread startMaster(MasterThread master) {
        master.start();
        // It takes a while until ServerManager creation to happen inside Master startup.
        ServerManager serverManager = master.getMaster().getServerManager();
        while (serverManager == null) {
            Threads.sleep(1);
            serverManager = master.getMaster().getServerManager();
        }
        // Set a listener for the waiting-on-RegionServers state. We want to wait
        // until this condition before we leave this method and start regionservers.
        final AtomicBoolean waiting = new AtomicBoolean(false);
        serverManager.registerListener(new ServerListener() {
            @Override
            public void waiting() {
                waiting.set(true);
            }
        });
        // Wait until the Master gets to place where it is waiting on RegionServers to check in.
        while (!waiting.get()) {
            Threads.sleep(1);
        }
        // Set the global master-is-active; gets picked up by regionservers later.
        masterActive.set(true);
        return master;
    }

    /**
     * A RegionServer that reports for duty and then immediately dies if it is the first to receive
     * the response to a reportForDuty; every other instance handles the response normally.
     * NOTE(review): going by the test name and the comments in the test method, the killed server
     * has no ephemeral znode, and the master only drops it from its set of online regionservers
     * once a region move to it is attempted — confirm against HBASE-9593.
     */
    static class RegisterAndDieRegionServer extends MiniHBaseCluster.MiniHBaseClusterRegionServer {
        public RegisterAndDieRegionServer(Configuration conf, CoordinatedStateManager cp)
                throws IOException, InterruptedException {
            super(conf, cp);
        }

        @Override
        protected void handleReportForDutyResponse(RegionServerStartupResponse c) throws IOException {
            // Only the first server to get here claims the killedRS slot; the rest start up normally.
            if (!killedRS.compareAndSet(null, getServerName())) {
                super.handleReportForDutyResponse(c);
                return;
            }
            // Hold off dying until startMaster() has flagged the Master as active.
            while (!masterActive.get()) {
                Threads.sleep(100);
            }
            super.kill();
        }
    }
}