com.alibaba.wasp.fserver.handler.OpenEntityGroupHandler.java Source code

Java tutorial

Introduction

Here is the source code for com.alibaba.wasp.fserver.handler.OpenEntityGroupHandler.java

Source

/**
 * Copyright 2010 The Apache Software Foundation
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.alibaba.wasp.fserver.handler;

import com.alibaba.wasp.EntityGroupInfo;
import com.alibaba.wasp.Server;
import com.alibaba.wasp.executor.EventHandler;
import com.alibaba.wasp.fserver.EntityGroup;
import com.alibaba.wasp.fserver.FServerServices;
import com.alibaba.wasp.meta.FTable;
import com.alibaba.wasp.zookeeper.ZKAssign;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hbase.util.CancelableProgressable;
import org.apache.zookeeper.KeeperException;

import java.io.IOException;
import java.util.concurrent.atomic.AtomicBoolean;

/**
 * Handles opening of an entityGroup on an FServer.
 */
public class OpenEntityGroupHandler extends EventHandler {
    // NOTE on priorities when shutting down. There are none for close. There are
    // some for open. I think that is right. On shutdown, we want the meta to close
    // before root and both to close after the user entityGroups have closed. What
    // about the case where master tells us to shutdown a catalog entityGroup and
    // we have a running queue of user entityGroups to close?
    private static final Log LOG = LogFactory.getLog(OpenEntityGroupHandler.class);

    /** Sentinel znode version meaning "no valid version" / "transition failed". */
    private static final int FAILED = -1;

    /**
     * Package-private and initialised to {@link #FAILED}; it is not read or
     * written anywhere else in this class.
     */
    int expectedVersion = FAILED;

    // We get version of our znode at start of open process and monitor it across
    // the total open. We'll fail the open if someone hijacks our znode; we can
    // tell this has happened if version is not as expected.
    private volatile int version = FAILED;

    /** Services of the hosting FServer (online entityGroup registry, stop state, post-open tasks). */
    private final FServerServices fsServices;

    /** The entityGroup this handler opens. */
    private final EntityGroupInfo entityGroupInfo;

    /** Table descriptor handed to {@code EntityGroup.openEntityGroup}. */
    private final FTable table;

    // version of the offline node that was set by the master
    private volatile int versionOfOfflineNode = -1;

    // NOTE(review): the two remarks below describe an "aborting" flag and an
    // "update zk on close" flag, but this class declares neither. They look like
    // leftovers from a close handler and are kept only for reference.
    //
    // If true, the hosting server is aborting. EntityGroup close process is
    // different when we are aborting.
    //
    // Update zk on closing transitions. Usually true. It's false if cluster
    // is going down. In this case, it's the rs that initiates the entityGroup
    // close -- not the master process so state up in zk will unlikely be
    // CLOSING.
    /**
     * Creates a handler for the {@code M_FSERVER_OPEN_ENTITYGROUP} event with an
     * offline-node version of {@code -1}.
     *
     * @param server
     *          hosting server
     * @param fsServices
     *          services of the hosting FServer
     * @param entityGroupInfo
     *          entityGroup to open
     * @param table
     *          table descriptor of the entityGroup
     */
    public OpenEntityGroupHandler(
            final Server server,
            final FServerServices fsServices,
            final EntityGroupInfo entityGroupInfo,
            final FTable table) {
        this(server, fsServices, entityGroupInfo, table, EventType.M_FSERVER_OPEN_ENTITYGROUP, -1);
    }

    /**
     * Full constructor; the other constructors delegate here.
     *
     * @param server
     *          hosting server, passed to the base {@code EventHandler}
     * @param fsServices
     *          services of the hosting FServer
     * @param entityGroupInfo
     *          entityGroup to open
     * @param table
     *          table descriptor of the entityGroup
     * @param eventType
     *          event type passed to the base {@code EventHandler}
     * @param versionOfOfflineNode
     *          version of the OFFLINE znode as set by the master
     */
    public OpenEntityGroupHandler(
            final Server server,
            final FServerServices fsServices,
            final EntityGroupInfo entityGroupInfo,
            final FTable table,
            final EventType eventType,
            final int versionOfOfflineNode) {
        super(server, eventType);
        this.versionOfOfflineNode = versionOfOfflineNode;
        this.table = table;
        this.entityGroupInfo = entityGroupInfo;
        this.fsServices = fsServices;
    }

    /**
     * Creates a handler for the {@code M_FSERVER_OPEN_ENTITYGROUP} event with an
     * explicit offline-node version.
     *
     * @param server
     *          hosting server
     * @param fsServices
     *          services of the hosting FServer
     * @param entityGroupInfo
     *          entityGroup to open
     * @param ftd
     *          table descriptor of the entityGroup
     * @param versionOfOfflineNode
     *          version of the OFFLINE znode as set by the master
     */
    public OpenEntityGroupHandler(
            final Server server,
            final FServerServices fsServices,
            final EntityGroupInfo entityGroupInfo,
            final FTable ftd,
            final int versionOfOfflineNode) {
        this(server, fsServices, entityGroupInfo, ftd, EventType.M_FSERVER_OPEN_ENTITYGROUP, versionOfOfflineNode);
    }

    /**
     * @return the entityGroup this handler was created to open.
     */
    public EntityGroupInfo getEntityGroupInfo() {
        return this.entityGroupInfo;
    }

    /**
     * Runs the open sequence for {@link #getEntityGroupInfo()}:
     * <ol>
     * <li>bail out if the server is stopped or stopping;</li>
     * <li>transition the znode OFFLINE -> OPENING;</li>
     * <li>open the entityGroup;</li>
     * <li>refresh OPENING and run the post-open deploy tasks;</li>
     * <li>transition the znode to OPENED and register the entityGroup as online.</li>
     * </ol>
     * Whatever the outcome, the entityGroup is removed from the FServer's
     * entityGroups-in-transition collection on exit.
     *
     * @throws IOException
     *           if closing a partially opened entityGroup fails
     */
    @Override
    public void process() throws IOException {
        try {
            final String name = entityGroupInfo.getEntityGroupNameAsString();
            if (this.server.isStopped() || this.fsServices.isStopping()) {
                return;
            }
            final String encodedName = entityGroupInfo.getEncodedName();

            // The result of this lookup used to be assigned and then overwritten
            // without ever being examined. Control flow is deliberately left
            // unchanged (the open still proceeds); the condition is only surfaced
            // in the log so that a duplicate open request is visible.
            if (this.fsServices.getFromOnlineEntityGroups(encodedName) != null) {
                LOG.warn("EntityGroup " + encodedName
                        + " is already online on this server; proceeding with the requested open");
            }

            // If fails, just return. Someone stole the entityGroup from under us.
            // Calling transitionZookeeperOfflineToOpening initializes this.version.
            if (!transitionZookeeperOfflineToOpening(encodedName, versionOfOfflineNode)) {
                LOG.warn("EntityGroup was hijacked? It no longer exists, encodedName=" + encodedName);
                return;
            }

            // Open entityGroup. After a successful open, failures in subsequent
            // processing needs to do a close as part of cleanup.
            final EntityGroup entityGroup = openEntityGroup();
            if (entityGroup == null) {
                tryTransitionToFailedOpen(entityGroupInfo);
                return;
            }
            boolean failed = true;
            if (tickleOpening("post_entitygroup_open")) {
                if (updateMeta(entityGroup)) {
                    failed = false;
                }
            }
            if (failed || this.server.isStopped() || this.fsServices.isStopping()) {
                try {
                    cleanupFailedOpen(entityGroup);
                } finally {
                    // Attempt the FAILED_OPEN transition even when close() throws,
                    // so the znode is not left in OPENING; the exception from
                    // close() still propagates to the caller.
                    tryTransitionToFailedOpen(entityGroupInfo);
                }
                return;
            }

            if (!transitionToOpened(entityGroup)) {
                // If we fail to transition to opened, it's because of one of two cases:
                // (a) we lost our ZK lease
                // OR (b) someone else opened the entityGroup before us
                // In either case, we don't need to transition to FAILED_OPEN state.
                // In case (a), the Master will process us as a dead server. In case
                // (b) the entityGroup is already being handled elsewhere anyway.
                cleanupFailedOpen(entityGroup);
                return;
            }
            // Successful entityGroup open, and add it to OnlineEntityGroups
            this.fsServices.addToOnlineEntityGroups(entityGroup);

            // Done! Successful entityGroup open
            LOG.debug("Opened " + name + " on server:" + this.server.getServerName());
        } finally {
            this.fsServices.getEntityGroupsInTransitionInFS().remove(this.entityGroupInfo.getEncodedNameAsBytes());
        }
    }

    /**
     * Closes an entityGroup whose open did not complete. A {@code null} argument
     * is ignored.
     *
     * @param entityGroup
     *          entityGroup to close, may be {@code null}
     * @throws IOException
     *           if the close fails
     */
    private void cleanupFailedOpen(EntityGroup entityGroup) throws IOException {
        if (entityGroup == null) {
            return;
        }
        entityGroup.close();
    }

    /**
     * Update ZK, ROOT or META. This can take a while if for example the .META. is
     * not available -- if server hosting .META. crashed and we are waiting on it
     * to come back -- so run in a thread and keep updating znode state meantime
     * so master doesn't timeout our entityGroup-in-transition. Caller must
     * cleanup entityGroup if this fails.
     *
     * @param entityGroup
     *          the freshly opened entityGroup to run the post-open tasks for
     * @return {@code true} only if the task thread finished without an
     *         exception, the calling thread was not interrupted, and every
     *         OPENING refresh attempted while waiting succeeded; {@code false}
     *         otherwise, including when the server is stopped or stopping on
     *         entry
     */
    boolean updateMeta(final EntityGroup entityGroup) {
        if (this.server.isStopped() || this.fsServices.isStopping()) {
            return false;
        }
        // Object we do wait/notify on. Make it boolean. If set, we're done.
        // Else, wait.
        final AtomicBoolean signaller = new AtomicBoolean(false);
        PostOpenDeployTasksThread t = new PostOpenDeployTasksThread(entityGroup, this.server, this.fsServices,
                signaller);
        t.start();
        int assignmentTimeout = this.server.getConfiguration()
                .getInt("wasp.master.assignment.timeoutmonitor.period", 10000);
        // Total timeout for meta edit. If we fail adding the edit then close out
        // the entityGroup and let it be assigned elsewhere. Multiply as long so a
        // large configured period cannot overflow int before being widened.
        long timeout = 10L * assignmentTimeout;
        long now = System.currentTimeMillis();
        long endTime = now + timeout;
        // Let our period at which we update OPENING state to be 1/3rd of the
        // entityGroups-in-transition timeout period.
        long period = Math.max(1, assignmentTimeout / 3);
        long lastUpdate = now;
        boolean tickleOpening = true;
        while (!signaller.get() && t.isAlive() && !this.server.isStopped() && !this.fsServices.isStopping()
                && (endTime > now)) {
            long elapsed = now - lastUpdate;
            if (elapsed > period) {
                // Only tickle OPENING if postOpenDeployTasks is taking some time.
                lastUpdate = now;
                tickleOpening = tickleOpening("post_open_deploy");
            }
            synchronized (signaller) {
                try {
                    // Re-check the flag while holding the monitor: the task thread
                    // sets it before notifying under the same monitor, so a
                    // completion that happened after the loop condition was
                    // evaluated is seen here instead of being slept through.
                    if (!signaller.get()) {
                        signaller.wait(period);
                    }
                } catch (InterruptedException e) {
                    // Go to the loop check.
                }
            }
            now = System.currentTimeMillis();
        }
        // Is thread still alive? We may have left above loop because server is
        // stopping or we timed out the edit. If so, interrupt it.
        if (t.isAlive()) {
            if (!signaller.get()) {
                // Thread still running; interrupt
                LOG.debug("Interrupting thread " + t);
                t.interrupt();
            }
            try {
                t.join();
            } catch (InterruptedException ie) {
                LOG.warn("Interrupted joining " + entityGroup.getEntityGroupInfo().getEntityGroupNameAsString(),
                        ie);
                Thread.currentThread().interrupt();
            }
        }

        // Was there an exception opening the entityGroup? This should trigger on
        // InterruptedException too. If so, we failed. Even if tickle opening fails
        // then it is a failure. Note that Thread.interrupted() also clears the
        // calling thread's interrupt flag.
        return ((!Thread.interrupted() && t.getException() == null) && tickleOpening);
    }

    /**
     * Thread to run entityGroup post open tasks. Call {@link #getException()}
     * after the thread finishes to check for exceptions running
     * {@link com.alibaba.wasp.fserver.FServerServices#postOpenDeployTasks(com.alibaba.wasp.fserver.EntityGroup, boolean)} .
     * <p>
     * On completion the thread sets the shared {@code signaller} flag to
     * {@code true} and notifies one waiter on it.
     */
    static class PostOpenDeployTasksThread extends Thread {
        private Exception exception = null;
        // Stored from the constructor; not read by this class.
        private final Server server;
        private final FServerServices services;
        private final EntityGroup entityGroup;
        private final AtomicBoolean signaller;

        /**
         * @param entityGroup
         *          entityGroup to run the post-open tasks for; its encoded name is
         *          used in the thread name
         * @param server
         *          hosting server
         * @param services
         *          FServer services on which {@code postOpenDeployTasks} is invoked
         * @param signaller
         *          flag set to {@code true} (and notified on) when the run ends
         */
        PostOpenDeployTasksThread(final EntityGroup entityGroup, final Server server,
                final FServerServices services, final AtomicBoolean signaller) {
            super("PostOpenDeployTasks:" + entityGroup.getEntityGroupInfo().getEncodedName());
            this.setDaemon(true);
            this.server = server;
            this.services = services;
            this.entityGroup = entityGroup;
            this.signaller = signaller;
        }

        @Override
        public void run() {
            try {
                this.services.postOpenDeployTasks(this.entityGroup, false);
            } catch (Exception e) {
                LOG.warn("Exception running postOpenDeployTasks; entityGroup="
                        + this.entityGroup.getEntityGroupInfo().getEncodedName(), e);
                this.exception = e;
            } finally {
                // We're done. Set flag then wake up anyone waiting on thread to
                // complete. Done in finally so the waiter is signalled even when
                // something other than an Exception escapes the task.
                this.signaller.set(true);
                synchronized (this.signaller) {
                    this.signaller.notify();
                }
            }
        }

        /**
         * @return Null or the run exception; call this method after thread is done.
         */
        Exception getException() {
            return this.exception;
        }
    }

    /**
     * Transition ZK node from OFFLINE to OPENING and record the resulting znode
     * version in {@code this.version}.
     * 
     * @param encodedName
     *          Name of the znode file (EntityGroup encodedName is the znode
     *          name).
     * @param versionOfOfflineNode
     *          - version Of OfflineNode that needs to be compared before changing
     *          the node's state from OFFLINE
     * @return True if successful transition, i.e. the recorded version is not
     *         {@code -1}.
     */
    boolean transitionZookeeperOfflineToOpening(final String encodedName, int versionOfOfflineNode) {
        try {
            // Initialize the znode version.
            this.version = ZKAssign.transitionNode(server.getZooKeeper(), entityGroupInfo, server.getServerName(),
                    EventType.M_ZK_ENTITYGROUP_OFFLINE, EventType.FSERVER_ZK_ENTITYGROUP_OPENING,
                    versionOfOfflineNode);
        } catch (KeeperException e) {
            LOG.error("Error transition from OFFLINE to OPENING for entityGroup=" + encodedName, e);
            // Mark the version as bad explicitly, as tickleOpening does, so the
            // result below does not depend on whatever value was there before.
            this.version = -1;
        }
        boolean b = isGoodVersion();
        if (!b) {
            LOG.warn("Failed transition from OFFLINE to OPENING for entityGroup=" + encodedName);
        }
        return b;
    }

    /**
     * Moves the znode of the given entityGroup to OPENED, using the znode
     * version tracked by this handler as the expected version.
     *
     * @param entityGroup
     *          EntityGroup we're working on.
     * @return {@code true} if the znode was transitioned to OPENED;
     *         {@code false} on a version mismatch or a ZooKeeper error.
     * @throws java.io.IOException
     */
    private boolean transitionToOpened(final EntityGroup entityGroup) throws IOException {
        final EntityGroupInfo info = entityGroup.getEntityGroupInfo();
        final String groupName = info.getEntityGroupNameAsString();
        try {
            // Finally, Transition ZK node to OPENED
            if (ZKAssign.transitionNodeOpened(this.server.getZooKeeper(), info, this.server.getServerName(),
                    this.version) != -1) {
                LOG.debug("entityGroup transitioned to opened in zookeeper: " + entityGroup.getEntityGroupInfo()
                        + ", server: " + this.server.getServerName());
                return true;
            }
            LOG.warn("Completed the OPEN of entityGroup " + groupName + " but when transitioning from "
                    + " OPENING to OPENED got a version mismatch, someone else clashed "
                    + "so now unassigning -- closing entityGroup on server: " + this.server.getServerName());
        } catch (KeeperException e) {
            LOG.error("Failed transitioning node " + groupName + " from OPENING to OPENED -- closing entityGroup",
                    e);
        }
        return false;
    }

    /**
     * Best-effort attempt to move the znode from OPENING to FAILED_OPEN. Not
     * guaranteed to succeed; ZooKeeper errors are logged, not thrown.
     *
     * @param egi
     *          the entityGroup we're working on.
     * @return whether znode is successfully transitioned to FAILED_OPEN state.
     */
    private boolean tryTransitionToFailedOpen(final EntityGroupInfo egi) {
        final String groupName = egi.getEntityGroupNameAsString();
        try {
            LOG.info("Opening of entityGroup " + egi + " failed, marking as FAILED_OPEN in ZK");
            final int newVersion = ZKAssign.transitionNode(this.server.getZooKeeper(), egi,
                    this.server.getServerName(), EventType.FSERVER_ZK_ENTITYGROUP_OPENING,
                    EventType.FSERVER_ZK_ENTITYGROUP_FAILED_OPEN, this.version);
            if (newVersion != -1) {
                return true;
            }
            LOG.warn("Unable to mark entityGroup " + egi + " as FAILED_OPEN. "
                    + "It's likely that the master already timed out this open "
                    + "attempt, and thus another RS already has the entityGroup.");
        } catch (KeeperException e) {
            LOG.error("Failed transitioning node " + groupName + " from OPENING to FAILED_OPEN", e);
        }
        return false;
    }

    /**
     * Opens the entityGroup described by this handler. Any {@link Throwable}
     * raised by the open is logged and reported as a {@code null} return rather
     * than propagated.
     *
     * @return Instance of EntityGroup if successful open else null.
     */
    EntityGroup openEntityGroup() {
        try {
            // Instantiate the entityGroup. This also periodically tickles our zk
            // OPENING state so master doesn't timeout this entityGroup in
            // transition.
            return EntityGroup.openEntityGroup(this.entityGroupInfo, this.table,
                    this.server.getConfiguration(), this.fsServices, new CancelableProgressable() {
                        @Override
                        public boolean progress() {
                            // We may lose the znode ownership during the open. Currently its
                            // too hard interrupting ongoing entityGroup open. Just let it
                            // complete and check we still have the znode after entityGroup
                            // open.
                            return tickleOpening("open_entitygroup_progress");
                        }
                    });
        } catch (Throwable t) {
            // We failed open. Our caller will see the 'null' return value
            // and transition the node back to FAILED_OPEN. If that fails,
            // we rely on the Timeout Monitor in the master to reassign.
            // The message no longer mentions a memstore-size rollback: nothing
            // in this method performs one.
            LOG.error("Failed open of entityGroup=" + this.entityGroupInfo.getEntityGroupNameAsString(), t);
            return null;
        }
    }

    /**
     * Placeholder for "transition ZK node to CLOSED". The current body performs
     * no ZooKeeper operation and ignores both arguments; it unconditionally
     * returns {@code false}. Nothing in this class calls it.
     * 
     * @param expectedVersion
     *          unused
     * @param entityGroupInfo
     *          unused
     * @return always {@code false}
     */
    private boolean setClosedState(final int expectedVersion, final EntityGroupInfo entityGroupInfo) {
        return false;
    }

    /**
     * Refreshes our OPENING state in zookeeper so the master doesn't time out
     * this entityGroup-in-transition. The znode version tracked by this handler
     * is replaced by the version returned from the refresh, or by {@code -1}
     * when ZooKeeper reports an error.
     * 
     * @param context
     *          Some context to add to logs if failure
     * @return True if successful transition; false if the tracked version was
     *         already bad on entry or the refresh failed.
     */
    boolean tickleOpening(final String context) {
        // If previous checks failed... do not try again.
        if (!isGoodVersion()) {
            return false;
        }
        final String znodeName = this.entityGroupInfo.getEncodedName();
        try {
            this.version = ZKAssign.retransitionNodeOpening(server.getZooKeeper(), this.entityGroupInfo,
                    this.server.getServerName(), this.version);
        } catch (KeeperException e) {
            LOG.warn("Exception refreshing OPENING; entityGroup=" + znodeName + ", context=" + context, e);
            this.version = -1;
        }
        if (isGoodVersion()) {
            return true;
        }
        LOG.warn("Failed refreshing OPENING; entityGroup=" + znodeName + ", context=" + context);
        return false;
    }

    /**
     * @return {@code true} unless the tracked znode version is the {@code -1}
     *         failure marker.
     */
    private boolean isGoodVersion() {
        final boolean versionIsBad = (this.version == -1);
        return !versionIsBad;
    }

}