ml.shifu.guagua.master.AsyncMasterCoordinator.java Source code

Java tutorial

Introduction

Here is the source code for ml.shifu.guagua.master.AsyncMasterCoordinator.java

Source

/*
 * Copyright [2013-2014] PayPal Software Foundation
 *  
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *  
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package ml.shifu.guagua.master;

import java.util.List;
import java.util.concurrent.TimeUnit;

import ml.shifu.guagua.GuaguaConstants;
import ml.shifu.guagua.io.Bytable;
import ml.shifu.guagua.util.NumberFormatUtils;
import ml.shifu.guagua.util.ProgressLock;

import org.apache.zookeeper.CreateMode;
import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.WatchedEvent;
import org.apache.zookeeper.Watcher.Event.EventType;
import org.apache.zookeeper.Watcher.Event.KeeperState;
import org.apache.zookeeper.ZooDefs.Ids;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * {@link AsyncMasterCoordinator} is used as a barrier for each iteration.
 * 
 * <p>
 * For each iteration, {@link AsyncMasterCoordinator} will wait until all workers are done.
 * 
 * <p>
 * To start a new iteration, {@link AsyncMasterCoordinator} will write a znode for each iteration like
 * '/_guagua/job_201312041304_189025/master/{currentIteration}' with {@link MasterComputable} result as its data.
 * This is like a signal to notify workers.
 * 
 * <p>
 * Workers are waiting on current master znode, if got current master znode, it will start another iteration.
 * 
 * <p>
 * Instead of polling, this coordinator sets a children watch on the worker base znode and blocks on a
 * {@link ProgressLock} which is signalled from {@link #process(WatchedEvent)}.
 * 
 * @param <MASTER_RESULT>
 *            master result for computation in each iteration.
 * @param <WORKER_RESULT>
 *            worker result for computation in each iteration.
 */
public class AsyncMasterCoordinator<MASTER_RESULT extends Bytable, WORKER_RESULT extends Bytable>
        extends AbstractMasterCoordinator<MASTER_RESULT, WORKER_RESULT> {

    private static final Logger LOG = LoggerFactory.getLogger(AsyncMasterCoordinator.class);

    /**
     * Current iteration. Written through {@link #setCurrentIteration(int)} and read inside the watcher callback
     * {@link #process(WatchedEvent)}; declared volatile because ZooKeeper delivers watcher callbacks on its own
     * event thread, so the latest value must be visible there.
     */
    private volatile int currentIteration;

    /**
     * Current app id. Volatile for the same reason as {@link #currentIteration}.
     */
    private volatile String appId;

    /**
     * Lock is used to check register info from all workers.
     */
    protected ProgressLock workerInitLock = new ProgressLock();

    /**
     * Lock is used to check iteration info from all workers.
     */
    protected ProgressLock workerIterationLock = new ProgressLock();

    /**
     * Watcher callback. Counts down the connection latch on the connection event, and wakes up the waiting
     * barrier when the children of the current iteration's worker base znode change.
     * 
     * @param event
     *            the ZooKeeper event being delivered.
     */
    @Override
    public void process(WatchedEvent event) {
        LOG.debug("DEBUG: process: Got a new event, path = {}, type = {}, state = {}", event.getPath(),
                event.getType(), event.getState());

        if ((event.getPath() == null) && (event.getType() == EventType.None)) {
            if (event.getState() == KeeperState.SyncConnected) {
                LOG.info("process: Asynchronous connection complete.");
                super.getZkConnLatch().countDown();
            } else {
                LOG.warn("process: Got unknown null path event {}", event);
            }
            return;
        }

        /**
         * Check lock signal condition.
         */
        String appWorkerBaseNode = getWorkerBaseNode(getAppId(), getCurrentIteration()).toString();
        // Constant-first comparison: the guard above only filters (null path AND type None), so a null path
        // with any other event type must not trigger a NullPointerException here.
        if (appWorkerBaseNode.equals(event.getPath()) && (event.getType() == EventType.NodeChildrenChanged)) {
            if (getCurrentIteration() == 0) {
                this.workerInitLock.signal();
            } else {
                this.workerIterationLock.signal();
            }
        }
    }

    /**
     * @return the iteration this coordinator currently tracks.
     */
    public int getCurrentIteration() {
        return currentIteration;
    }

    /**
     * @param currentIteration
     *            the iteration whose worker base znode should be matched in {@link #process(WatchedEvent)}.
     */
    public void setCurrentIteration(int currentIteration) {
        this.currentIteration = currentIteration;
    }

    /**
     * @return the application id used to build znode paths.
     */
    public String getAppId() {
        return appId;
    }

    /**
     * @param appId
     *            the application id used to build znode paths.
     */
    public void setAppId(String appId) {
        this.appId = appId;
    }

    /**
     * Initializes the coordinator, runs master election when more than one master is configured, runs the
     * fail-over check and, only when the context is still at the init step, waits for the workers to register
     * and then creates the znodes for the next worker iteration and for the master init step.
     * 
     * @param context
     *            master context providing properties, app id, iteration and worker settings.
     */
    @Override
    public void preApplication(final MasterContext<MASTER_RESULT, WORKER_RESULT> context) {
        initialize(context.getProps());
        this.setAppId(context.getAppId());

        // Master election which is used here to use the same zookeeper instance.
        if (NumberFormatUtils.getInt(context.getProps().getProperty(GuaguaConstants.GUAGUA_MASTER_NUMBER),
                GuaguaConstants.DEFAULT_MASTER_NUMBER) > 1) {
            new MasterElectionCommand(context.getAppId()).execute();
        }

        // Check last successful iteration
        new FailOverCommand(context).execute();

        if (context.getCurrentIteration() != GuaguaConstants.GUAGUA_INIT_STEP) {
            // if not init step, return, because of no need initialize twice for fail-over task
            return;
        }
        new BasicCoordinatorCommand() {
            @Override
            public void doExecute() throws KeeperException, InterruptedException {
                final String appWorkersNode = getWorkerBaseNode(context.getAppId(), context.getCurrentIteration())
                        .toString();

                new RetryCoordinatorCommand(isFixedTime(), getSleepTime()) {
                    @Override
                    public boolean retryExecution() throws KeeperException, InterruptedException {
                        try {
                            // first read without a watch, to avoid re-watching
                            int size = AsyncMasterCoordinator.this.countChildrenOf(appWorkersNode, false);
                            if (isTerminated(size, context.getWorkers(), context.getMinWorkersRatio(),
                                    context.getMinWorkersTimeOut())) {
                                return true;
                            }
                            // not done yet: read again, this time leaving a children watch behind
                            size = AsyncMasterCoordinator.this.countChildrenOf(appWorkersNode, true);
                            if (isTerminated(size, context.getWorkers(), context.getMinWorkersRatio(),
                                    context.getMinWorkersTimeOut())) {
                                return true;
                            }
                            // to avoid log flood
                            if (System.nanoTime() % 20 == 0) {
                                LOG.info("workers already initialized: {}, still {} workers are not synced.", size,
                                        (context.getWorkers() - size));
                            }
                            // block until process(...) signals a children change, then re-arm the lock
                            AsyncMasterCoordinator.this.workerInitLock.waitForever();
                            AsyncMasterCoordinator.this.workerInitLock.reset();
                        } catch (KeeperException.NoNodeException e) {
                            // to avoid log flood
                            if (System.nanoTime() % 10 == 0) {
                                LOG.warn("No such node:{}", appWorkersNode);
                            }
                        }
                        return false;
                    }
                }.execute();

                LOG.info("All workers are initiliazed successfully.");

                // Each znode is created independently so that one already-existing znode does not prevent the
                // remaining ones from being created.
                // create worker znode 1: '/_guagua/<jobId>/workers/1' to avoid re-create znode from workers
                AsyncMasterCoordinator.this.createPersistentZnodeIfAbsent(getWorkerBaseNode(context.getAppId(),
                        context.getCurrentIteration() + 1).toString());
                // create master init znode (base node first, then the node for the current iteration)
                AsyncMasterCoordinator.this.createPersistentZnodeIfAbsent(getMasterBaseNode(context.getAppId())
                        .toString());
                AsyncMasterCoordinator.this.createPersistentZnodeIfAbsent(getCurrentMasterNode(context.getAppId(),
                        context.getCurrentIteration()).toString());
            }
        }.execute();
    }

    /**
     * Records the current iteration, waits for the workers of that iteration and then loads their results
     * into the context through {@code setWorkerResults}.
     * 
     * @param context
     *            master context providing app id, iteration and worker settings.
     */
    @Override
    public void preIteration(final MasterContext<MASTER_RESULT, WORKER_RESULT> context) {
        this.setCurrentIteration(context.getCurrentIteration());

        new BasicCoordinatorCommand() {
            @Override
            public void doExecute() throws KeeperException, InterruptedException {
                // wait All Workers Done
                final int currentIteration = context.getCurrentIteration();
                final int workers = context.getWorkers();
                final String appCurrentWorkersNode = getWorkerBaseNode(context.getAppId(), currentIteration)
                        .toString();

                long start = System.nanoTime();
                // wait to get all workers results.
                new RetryCoordinatorCommand(isFixedTime(), getSleepTime()) {
                    @Override
                    public boolean retryExecution() throws KeeperException, InterruptedException {
                        try {
                            // first read without a watch, to avoid re-watching
                            int size = AsyncMasterCoordinator.this.countChildrenOf(appCurrentWorkersNode, false);
                            if (isTerminated(size, context.getWorkers(), context.getMinWorkersRatio(),
                                    context.getMinWorkersTimeOut())) {
                                return true;
                            }

                            // not done yet: read again, this time leaving a children watch behind
                            size = AsyncMasterCoordinator.this.countChildrenOf(appCurrentWorkersNode, true);
                            if (isTerminated(size, context.getWorkers(), context.getMinWorkersRatio(),
                                    context.getMinWorkersTimeOut())) {
                                return true;
                            }
                            // to avoid log flood
                            if (System.nanoTime() % 20 == 0) {
                                LOG.info("iteration {}, workers compelted: {}, still {} workers are not synced.",
                                        currentIteration, size, (workers - size));
                            }
                            // block until process(...) signals a children change, then re-arm the lock
                            AsyncMasterCoordinator.this.workerIterationLock.waitForever();
                            AsyncMasterCoordinator.this.workerIterationLock.reset();
                        } catch (KeeperException.NoNodeException e) {
                            // to avoid log flood
                            if (System.nanoTime() % 10 == 0) {
                                LOG.warn("No such node:{}", appCurrentWorkersNode);
                            }
                        }
                        return false;
                    }
                }.execute();
                LOG.info("Application {} container {} iteration {} waiting ends with {}ms execution time.",
                        context.getAppId(), context.getContainerId(), context.getCurrentIteration(),
                        TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - start));
                // wait until worker results are set from zookeeper znodes.
                setWorkerResults(context, appCurrentWorkersNode, context.getAppId(), currentIteration);

            }
        }.execute();
    }

    /**
     * Returns the number of children of the given znode, treating a {@code null} children list as empty.
     * 
     * @param znode
     *            path of the znode whose children are counted.
     * @param watch
     *            whether to leave a children watch on the znode.
     * @return number of children, 0 if the children list is {@code null}.
     * @throws KeeperException
     *             propagated from the ZooKeeper call, including {@link KeeperException.NoNodeException}.
     * @throws InterruptedException
     *             propagated from the ZooKeeper call.
     */
    private int countChildrenOf(String znode, boolean watch) throws KeeperException, InterruptedException {
        List<String> children = getZooKeeper().getChildrenExt(znode, watch, false, false);
        return children == null ? 0 : children.size();
    }

    /**
     * Creates a persistent znode without data; an already existing znode is only logged as a warning.
     * 
     * @param znode
     *            path of the znode to create.
     * @throws KeeperException
     *             any ZooKeeper failure other than the znode already existing.
     * @throws InterruptedException
     *             propagated from the ZooKeeper call.
     */
    private void createPersistentZnodeIfAbsent(String znode) throws KeeperException, InterruptedException {
        try {
            getZooKeeper().createExt(znode, null, Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT, false);
        } catch (KeeperException.NodeExistsException e) {
            LOG.warn("Node exists: {}", znode);
        }
    }

}