org.apache.giraph.graph.GraphTaskManager.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.giraph.graph.GraphTaskManager.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.giraph.graph;

import java.io.IOException;
import java.lang.management.GarbageCollectorMXBean;
import java.lang.management.ManagementFactory;
import java.net.URL;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Enumeration;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeUnit;

import com.google.common.collect.Maps;
import com.sun.management.GarbageCollectionNotificationInfo;
import com.yammer.metrics.core.Counter;

import org.apache.commons.lang3.exception.ExceptionUtils;
import org.apache.giraph.bsp.BspService;
import org.apache.giraph.bsp.CentralizedServiceMaster;
import org.apache.giraph.bsp.CentralizedServiceWorker;
import org.apache.giraph.bsp.checkpoints.CheckpointStatus;
import org.apache.giraph.comm.messages.MessageStore;
import org.apache.giraph.comm.messages.SimpleMessageStore;
import org.apache.giraph.conf.ClassConfOption;
import org.apache.giraph.conf.GiraphConstants;
import org.apache.giraph.conf.ImmutableClassesGiraphConfiguration;
import org.apache.giraph.job.JobProgressTracker;
import org.apache.giraph.master.BspServiceMaster;
import org.apache.giraph.master.MasterThread;
import org.apache.giraph.metrics.GiraphMetrics;
import org.apache.giraph.metrics.GiraphMetricsRegistry;
import org.apache.giraph.metrics.GiraphTimer;
import org.apache.giraph.metrics.GiraphTimerContext;
import org.apache.giraph.metrics.ResetSuperstepMetricsObserver;
import org.apache.giraph.metrics.SuperstepMetricsRegistry;
import org.apache.giraph.ooc.OutOfCoreEngine;
import org.apache.giraph.partition.PartitionOwner;
import org.apache.giraph.partition.PartitionStats;
import org.apache.giraph.partition.PartitionStore;
import org.apache.giraph.partition.SimplePartition;
import org.apache.giraph.scripting.ScriptLoader;
import org.apache.giraph.utils.CallableFactory;
import org.apache.giraph.utils.MemoryUtils;
import org.apache.giraph.utils.ProgressableUtils;
import org.apache.giraph.worker.BspServiceWorker;
import org.apache.giraph.worker.InputSplitsCallable;
import org.apache.giraph.worker.WorkerContext;
import org.apache.giraph.worker.WorkerObserver;
import org.apache.giraph.worker.WorkerProgress;
import org.apache.giraph.zk.ZooKeeperManager;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.log4j.Appender;
import org.apache.log4j.Level;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.log4j.PatternLayout;

import javax.management.Notification;
import javax.management.NotificationEmitter;
import javax.management.NotificationListener;
import javax.management.openmbean.CompositeData;

/**
 * The Giraph-specific business logic for a single BSP
 * compute node in whatever underlying type of cluster
 * our Giraph job will run on. Owning object will provide
 * the glue into the underlying cluster framework
 * and will call this object to perform Giraph work.
 *
 * @param <I> Vertex id
 * @param <V> Vertex data
 * @param <E> Edge data
 */
@SuppressWarnings("rawtypes")
public class GraphTaskManager<I extends WritableComparable, V extends Writable, E extends Writable>
        implements ResetSuperstepMetricsObserver {
    /*if_not[PURE_YARN]
      static { // Eliminate this? Even MRv1 tasks should not need it here.
        Configuration.addDefaultResource("giraph-site.xml");
      }
    end[PURE_YARN]*/
    /**
     * Class which checks if an exception on some thread should cause worker
     * to fail
     */
    public static final ClassConfOption<CheckerIfWorkerShouldFailAfterException> CHECKER_IF_WORKER_SHOULD_FAIL_AFTER_EXCEPTION_CLASS = ClassConfOption
            .create("giraph.checkerIfWorkerShouldFailAfterExceptionClass", FailWithEveryExceptionOccurred.class,
                    CheckerIfWorkerShouldFailAfterException.class,
                    "Class which checks if an exception on some thread should cause worker "
                            + "to fail, by default all exceptions cause failure");
    /** Name of metric for superstep time in msec */
    public static final String TIMER_SUPERSTEP_TIME = "superstep-time-ms";
    /** Name of metric for compute on all vertices in msec */
    public static final String TIMER_COMPUTE_ALL = "compute-all-ms";
    /** Name of metric for time from begin compute to first message sent */
    public static final String TIMER_TIME_TO_FIRST_MSG = "time-to-first-message-ms";
    /** Name of metric for time from first message till last message flushed */
    public static final String TIMER_COMMUNICATION_TIME = "communication-time-ms";
    /** Name of metric for time spent doing GC per superstep in msec */
    public static final String TIMER_SUPERSTEP_GC_TIME = "superstep-gc-time-ms";

    /** Class logger */
    private static final Logger LOG = Logger.getLogger(GraphTaskManager.class);
    /** Coordination service worker; only set when this node has the worker role */
    private CentralizedServiceWorker<I, V, E> serviceWorker;
    /** Coordination service master; only set when this node has the master role */
    private CentralizedServiceMaster<I, V, E> serviceMaster;
    /** Thread running the master service loop (started in instantiateBspService) */
    private Thread masterThread = null;
    /** The worker should be run exactly once, or else there is a problem. */
    private boolean alreadyRun = false;
    /** Manages the ZooKeeper servers if necessary (dynamic startup) */
    private ZooKeeperManager zkManager;
    /** Immutable Giraph configuration built from the Hadoop configuration in setup() */
    private ImmutableClassesGiraphConfiguration<I, V, E> conf;
    /** Set when a prior run already finished the computation (see startZooKeeperManager) */
    private boolean done = false;
    /** What kind of functions is this mapper doing? */
    private GraphFunctions graphFunctions = GraphFunctions.UNKNOWN;
    /** Stats from the most recently completed superstep (starts as the empty/input state) */
    private FinishedSuperstepStats finishedSuperstepStats = new FinishedSuperstepStats(0, false, 0, 0, false,
            CheckpointStatus.NONE);
    /** Job progress tracker (real client or no-op, chosen in initializeJobProgressTracker) */
    private JobProgressTrackerClient jobProgressTracker;

    // Per-Job Metrics
    /** Timer for WorkerContext#preApplication() */
    private GiraphTimer wcPreAppTimer;
    /** Timer for WorkerContext#postApplication() */
    private GiraphTimer wcPostAppTimer;

    // Per-Superstep Metrics
    /** Time for how long superstep took */
    private GiraphTimer superstepTimer;
    /** Time for all compute() calls in a superstep */
    private GiraphTimer computeAll;
    /** Time from starting compute to sending first message */
    private GiraphTimer timeToFirstMessage;
    /** Context for timing time to first message above; null once recorded */
    private GiraphTimerContext timeToFirstMessageTimerContext;
    /** Time from first sent message till last message flushed. */
    private GiraphTimer communicationTimer;
    /** Context for timing communication time above; null when not in flight */
    private GiraphTimerContext communicationTimerContext;
    /** Timer for WorkerContext#preSuperstep() */
    private GiraphTimer wcPreSuperstepTimer;
    /** Counter aggregating time spent in GC in a superstep (msec, fed by the GC listener) */
    private Counter gcTimeMetric;
    /** The Hadoop Mapper#Context for this job */
    private final Mapper<?, ?, ?, ?>.Context context;
    /** is this GraphTaskManager the master? */
    private boolean isMaster;
    /** Mapper observers */
    private MapperObserver[] mapperObservers;

    /**
     * Construct the task manager around the cluster framework handle.
     * The master flag defaults to false until {@code setIsMaster} is called.
     *
     * @param context a handle to the underlying cluster framework.
     *                For Hadoop clusters, this is a Mapper#Context.
     */
    public GraphTaskManager(Mapper<?, ?, ?, ?>.Context context) {
        this.isMaster = false;
        this.context = context;
    }

    /**
     * Run the user's input checking code.
     * Delegates to the configured edge/vertex input formats; each check is
     * only performed when the corresponding input format was configured.
     */
    private void checkInput() {
        if (conf.hasEdgeInputFormat()) {
            conf.createWrappedEdgeInputFormat().checkInputSpecs(conf);
        }
        if (conf.hasVertexInputFormat()) {
            conf.createWrappedVertexInputFormat().checkInputSpecs(conf);
        }
    }

    /**
     * In order for job client to know which ZooKeeper the job is using,
     * we create a counter with server:port as its name inside of
     * ZOOKEEPER_SERVER_PORT_COUNTER_GROUP.
     *
     * @param serverPortList Server:port list for ZooKeeper used
     */
    private void createZooKeeperCounter(String serverPortList) {
        // Getting the counter will actually create it; the value is irrelevant,
        // only the counter's name (the server:port list) carries information.
        context.getCounter(GiraphConstants.ZOOKEEPER_SERVER_PORT_COUNTER_GROUP, serverPortList);
    }

    /**
     * Called by owner of this GraphTaskManager on each compute node.
     * Builds the Giraph configuration, wires logging/metrics, validates input,
     * connects (or starts) ZooKeeper, and instantiates the BSP services.
     *
     * @param zkPathList the path to the ZK jars we need to run the job
     * @throws IOException on ZooKeeper or BSP service startup failure
     * @throws InterruptedException if startup is interrupted
     */
    public void setup(Path[] zkPathList) throws IOException, InterruptedException {
        context.setStatus("setup: Beginning worker setup.");
        Configuration hadoopConf = context.getConfiguration();
        conf = new ImmutableClassesGiraphConfiguration<I, V, E>(hadoopConf);
        setupMapperObservers();
        initializeJobProgressTracker();
        // Write user's graph types (I,V,E,M) back to configuration parameters so
        // that they are set for quicker access later. These types are often
        // inferred from the Computation class used.
        conf.getGiraphTypes().writeIfUnset(conf);
        // configure global logging level for Giraph job
        initializeAndConfigureLogging();
        // init the metrics objects
        setupAndInitializeGiraphMetrics();
        // Check input
        checkInput();
        // Load any scripts that were deployed
        ScriptLoader.loadScripts(conf);
        // One time setup for computation factory
        conf.createComputationFactory().initialize(conf);
        // Do some task setup (possibly starting up a Zookeeper service)
        context.setStatus("setup: Initializing Zookeeper services.");
        String serverPortList = conf.getZookeeperList();
        if (serverPortList.isEmpty()) {
            // No external quorum configured: start a Giraph-managed one.
            if (startZooKeeperManager()) {
                return; // ZK startup failed, or a prior run already finished
            }
        } else {
            createZooKeeperCounter(serverPortList);
        }
        if (zkManager != null && zkManager.runsZooKeeper()) {
            if (LOG.isInfoEnabled()) {
                LOG.info("setup: Chosen to run ZooKeeper...");
            }
        }
        context.setStatus("setup: Connected to Zookeeper service " + serverPortList);
        this.graphFunctions = determineGraphFunctions(conf, zkManager);
        try {
            instantiateBspService();
        } catch (IOException e) {
            LOG.error("setup: Caught exception just before end of setup", e);
            if (zkManager != null) {
                // Take the Giraph-owned ZK servers down before failing the task.
                zkManager.offlineZooKeeperServers(ZooKeeperManager.State.FAILED);
            }
            throw new RuntimeException("setup: Offlining servers due to exception...", e);
        }
        context.setStatus(getGraphFunctions().toString() + " starting...");
    }

    /**
     * Create and connect a client to JobProgressTrackerService. Falls back to
     * a no-op implementation when progress tracking is disabled or the
     * connection attempt fails, then reports that this mapper has started.
     */
    private void initializeJobProgressTracker() {
        JobProgressTrackerClient client;
        if (conf.trackJobProgressOnClient()) {
            try {
                client = new RetryableJobProgressTrackerClient(conf);
            } catch (InterruptedException | ExecutionException e) {
                // Progress reporting is best-effort: log and degrade to a no-op.
                LOG.warn("createJobProgressClient: Exception occurred while trying to"
                        + " connect to JobProgressTracker - not reporting progress", e);
                client = new JobProgressTrackerClientNoOp();
            }
        } else {
            client = new JobProgressTrackerClientNoOp();
        }
        jobProgressTracker = client;
        jobProgressTracker.mapperStarted();
    }

    /**
     * Perform the work assigned to this compute node for this job run.
     * 1) Run checkpoint per frequency policy.
     * 2) For every vertex on this mapper, run the compute() function
     * 3) Wait until all messaging is done.
     * 4) Check if all vertices are done.  If not goto 2).
     * 5) Dump output.
     *
     * @throws IOException on errors during the BSP superstep cycle
     * @throws InterruptedException if the superstep loop is interrupted
     */
    public void execute() throws IOException, InterruptedException {
        if (checkTaskState()) {
            return;
        }
        preLoadOnWorkerObservers();
        // The input superstep (loading vertices/edges) is timed like a superstep.
        GiraphTimerContext superstepTimerContext = superstepTimer.time();
        finishedSuperstepStats = serviceWorker.setup();
        superstepTimerContext.stop();
        if (collectInputSuperstepStats(finishedSuperstepStats)) {
            return;
        }
        prepareGraphStateAndWorkerContext();
        List<PartitionStats> partitionStatsList = new ArrayList<PartitionStats>();
        int numComputeThreads = conf.getNumComputeThreads();

        // main superstep processing loop
        while (!finishedSuperstepStats.allVerticesHalted()) {
            final long superstep = serviceWorker.getSuperstep();
            superstepTimerContext = getTimerForThisSuperstep(superstep);
            GraphState graphState = new GraphState(superstep, finishedSuperstepStats.getVertexCount(),
                    finishedSuperstepStats.getEdgeCount(), context);
            Collection<? extends PartitionOwner> masterAssignedPartitionOwners = serviceWorker.startSuperstep();
            if (LOG.isDebugEnabled()) {
                LOG.debug("execute: " + MemoryUtils.getRuntimeMemoryStats());
            }
            context.progress();
            serviceWorker.exchangeVertexPartitions(masterAssignedPartitionOwners);
            context.progress();
            boolean hasBeenRestarted = checkSuperstepRestarted(superstep);

            GlobalStats globalStats = serviceWorker.getGlobalStats();

            if (hasBeenRestarted) {
                // Restarted from a checkpoint: rebuild the graph state.
                graphState = new GraphState(superstep, finishedSuperstepStats.getVertexCount(),
                        finishedSuperstepStats.getEdgeCount(), context);
            } else if (storeCheckpoint(globalStats.getCheckpointStatus())) {
                // Checkpointing decided the job should stop at this superstep.
                break;
            }
            serviceWorker.getServerData().prepareResolveMutations();
            context.progress();
            prepareForSuperstep(graphState);
            context.progress();
            MessageStore<I, Writable> messageStore = serviceWorker.getServerData().getCurrentMessageStore();
            int numPartitions = serviceWorker.getPartitionStore().getNumPartitions();
            // Never use more compute threads than there are partitions.
            int numThreads = Math.min(numComputeThreads, numPartitions);
            if (LOG.isInfoEnabled()) {
                LOG.info("execute: " + numPartitions + " partitions to process with " + numThreads
                        + " compute thread(s), originally " + numComputeThreads + " thread(s) on superstep "
                        + superstep);
            }
            partitionStatsList.clear();
            // execute the current superstep
            if (numPartitions > 0) {
                processGraphPartitions(context, partitionStatsList, graphState, messageStore, numThreads);
            }
            finishedSuperstepStats = completeSuperstepAndCollectStats(partitionStatsList, superstepTimerContext);

            // END of superstep compute loop
        }

        if (LOG.isInfoEnabled()) {
            LOG.info("execute: BSP application done (global vertices marked done)");
        }
        updateSuperstepGraphState();
        postApplication();
    }

    /**
     * Handle post-application callbacks: the WorkerContext hook, the
     * superstep output finalization, and every registered WorkerObserver.
     */
    private void postApplication() throws IOException, InterruptedException {
        // Only the WorkerContext callback is timed; output finalization is not.
        GiraphTimerContext postAppTimer = wcPostAppTimer.time();
        serviceWorker.getWorkerContext().postApplication();
        serviceWorker.getSuperstepOutput().postApplication();
        postAppTimer.stop();
        context.progress();

        // Report liveness after each observer in case a callback is slow.
        for (WorkerObserver observer : serviceWorker.getWorkerObservers()) {
            observer.postApplication();
            context.progress();
        }
    }

    /**
     * Sets the "isMaster" flag for final output commit to happen on master.
     *
     * @param im the boolean input to set isMaster. Applies to "pure YARN only"
     */
    public void setIsMaster(final boolean im) {
        isMaster = im;
    }

    /**
     * Get "isMaster" status flag -- we need to know if we're the master in the
     * "finally" block of our GiraphYarnTask#execute() to commit final job output.
     *
     * @return true if this task IS the master.
     */
    public boolean isMaster() {
        return this.isMaster;
    }

    /**
     * Produce a reference to the "start" superstep timer for the current
     * superstep, resetting the per-superstep metrics first.
     *
     * @param superstep the current superstep count
     * @return a GiraphTimerContext representing the "start" of the superstep
     */
    private GiraphTimerContext getTimerForThisSuperstep(long superstep) {
        // Resetting recreates the per-superstep timers via newSuperstep().
        GiraphMetrics.get().resetSuperstepMetrics(superstep);
        GiraphTimerContext timerContext = superstepTimer.time();
        return timerContext;
    }

    /**
     * Utility to encapsulate Giraph metrics setup calls.
     * Order matters: the registry must be initialized before observers and
     * the job/memory/input-split metrics are registered against it.
     */
    private void setupAndInitializeGiraphMetrics() {
        GiraphMetrics.init(conf);
        // Register this object so newSuperstep() rebuilds per-superstep timers.
        GiraphMetrics.get().addSuperstepResetObserver(this);
        initJobMetrics();
        MemoryUtils.initMetrics();
        InputSplitsCallable.initMetrics();
    }

    /**
     * Instantiate and configure ZooKeeperManager for this job. This will
     * result in a Giraph-owned Zookeeper instance, a connection to an
     * existing quorum as specified in the job configuration, or task failure
     * @return true if this task should terminate
     */
    private boolean startZooKeeperManager() throws IOException, InterruptedException {
        zkManager = new ZooKeeperManager(context, conf);
        context.setStatus("setup: Setting up Zookeeper manager.");
        zkManager.setup();
        if (zkManager.computationDone()) {
            // A previous run already completed the computation; nothing to do.
            done = true;
            return true;
        }
        zkManager.onlineZooKeeperServer();
        // Publish the chosen server list back into the configuration (and to
        // the job client via a counter) so the rest of the job can find it.
        String serverPortList = zkManager.getZooKeeperServerPortString();
        conf.setZookeeperList(serverPortList);
        createZooKeeperCounter(serverPortList);
        return false;
    }

    /**
     * Utility to place a new, updated GraphState object into the serviceWorker,
     * built from the current superstep and the latest finished-superstep stats.
     */
    private void updateSuperstepGraphState() {
        GraphState freshState = new GraphState(serviceWorker.getSuperstep(),
                finishedSuperstepStats.getVertexCount(), finishedSuperstepStats.getEdgeCount(), context);
        serviceWorker.getWorkerContext().setGraphState(freshState);
    }

    /**
     * Boilerplate updates and cleanup done at the end of each superstep
     * processing loop in the <code>execute</code> method.
     *
     * @param partitionStatsList list of stats for each superstep to append to
     * @param superstepTimerContext for job metrics
     * @return the collected stats at the close of the current superstep.
     */
    private FinishedSuperstepStats completeSuperstepAndCollectStats(List<PartitionStats> partitionStatsList,
            GiraphTimerContext superstepTimerContext) {
        // finishSuperstep() stops the superstep timer internally; stopping it
        // here instead would make the metric unavailable at the end of the
        // computation when giraph.metrics.enable=true.
        finishedSuperstepStats = serviceWorker.finishSuperstep(partitionStatsList, superstepTimerContext);
        boolean dumpMetrics = conf.metricsEnabled();
        if (dumpMetrics) {
            GiraphMetrics.get().perSuperstep().printSummary(System.err);
        }
        return finishedSuperstepStats;
    }

    /**
     * Utility function to prepare various objects managing BSP superstep
     * operations for the next superstep. Order matters: the worker is
     * prepared first, then the context callbacks, then the observers.
     *
     * @param graphState graph state metadata object
     */
    private void prepareForSuperstep(GraphState graphState) {
        serviceWorker.prepareSuperstep();

        serviceWorker.getWorkerContext().setGraphState(graphState);
        serviceWorker.getWorkerContext().setupSuperstep(serviceWorker);
        // Only the user's preSuperstep() callback is timed.
        GiraphTimerContext preSuperstepTimer = wcPreSuperstepTimer.time();
        serviceWorker.getWorkerContext().preSuperstep();
        preSuperstepTimer.stop();
        context.progress();

        for (WorkerObserver obs : serviceWorker.getWorkerObservers()) {
            obs.preSuperstep(graphState.getSuperstep());
            context.progress();
        }
    }

    /**
     * Prepare graph state and worker context for superstep cycles:
     * push a fresh GraphState into the worker context, then run the
     * user's pre-application hook.
     */
    private void prepareGraphStateAndWorkerContext() {
        updateSuperstepGraphState();
        workerContextPreApp();
    }

    /**
     * Get the worker function enum.
     *
     * @return an enum detailing the roles assigned to this
     *         compute node for this Giraph job.
     */
    public GraphFunctions getGraphFunctions() {
        return graphFunctions;
    }

    /**
     * Get the WorkerContext of this node's worker service.
     * NOTE(review): serviceWorker is only set on nodes with the worker role
     * (see instantiateBspService), so this presumably must not be called on
     * a master-only node - confirm with callers.
     *
     * @return the WorkerContext for this compute node
     */
    public final WorkerContext getWorkerContext() {
        return serviceWorker.getWorkerContext();
    }

    /**
     * Get the job progress tracker for this task (a real client or a
     * no-op implementation, chosen in initializeJobProgressTracker).
     *
     * @return the JobProgressTracker in use
     */
    public JobProgressTracker getJobProgressTracker() {
        return jobProgressTracker;
    }

    /**
     * Copied from JobConf to get the location of this jar.  Workaround for
     * things like Oozie map-reduce jobs. NOTE: Pure YARN profile cannot
     * make use of this, as the jars are unpacked at each container site.
     *
     * @param myClass Class to search the class loader path for to locate
     *        the relevant jar file
     * @return Location of the jar file containing myClass, or null if the
     *         class was not loaded from a jar
     */
    private static String findContainingJar(Class<?> myClass) {
        ClassLoader classLoader = myClass.getClassLoader();
        String classResource = myClass.getName().replaceAll("\\.", "/") + ".class";
        try {
            Enumeration<?> resources = classLoader.getResources(classResource);
            while (resources.hasMoreElements()) {
                URL resourceUrl = (URL) resources.nextElement();
                if (!"jar".equals(resourceUrl.getProtocol())) {
                    continue;
                }
                String jarPath = resourceUrl.getPath();
                if (jarPath.startsWith("file:")) {
                    jarPath = jarPath.substring("file:".length());
                }
                // Undo URL escaping, then strip the "!/entry" suffix that
                // jar: URLs carry after the archive path.
                jarPath = URLDecoder.decode(jarPath, "UTF-8");
                return jarPath.replaceAll("!.*$", "");
            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        return null;
    }

    /**
     * Figure out what roles this BSP compute node should take on in the job.
     * Basic logic is as follows:
     * 1) If not split master, everyone does the everything and/or running
     *    ZooKeeper.
     * 2) If split master/worker, masters also run ZooKeeper
     *
     * 3) If split master/worker == true and <code>giraph.zkList</code> is
     *    externally provided, the master will not instantiate a ZK instance, but
     *    will assume a quorum is already active on the cluster for Giraph to use.
     *
     * @param conf Configuration to use
     * @param zkManager ZooKeeper manager to help determine whether to run
     *        ZooKeeper.
     * @return Functions that this mapper should do.
     */
    private static GraphFunctions determineGraphFunctions(ImmutableClassesGiraphConfiguration conf,
            ZooKeeperManager zkManager) {
        final boolean splitMasterWorker = conf.getSplitMasterWorker();
        final int taskPartition = conf.getTaskPartition();
        final boolean zkAlreadyProvided = conf.isZookeeperExternal();
        final boolean runsZooKeeper = (zkManager != null) && zkManager.runsZooKeeper();

        if (!splitMasterWorker) {
            // Combined mode: every node is both master and worker, and also
            // hosts ZooKeeper when this task was chosen to run it.
            return runsZooKeeper ? GraphFunctions.ALL : GraphFunctions.ALL_EXCEPT_ZOOKEEPER;
        }
        if (zkAlreadyProvided) {
            // External quorum: task 0 is the master, all others are workers.
            return taskPartition == 0 ? GraphFunctions.MASTER_ONLY : GraphFunctions.WORKER_ONLY;
        }
        // Giraph-managed ZooKeeper: tasks hosting ZK double as masters.
        return runsZooKeeper ? GraphFunctions.MASTER_ZOOKEEPER_ONLY : GraphFunctions.WORKER_ONLY;
    }

    /**
     * Instantiate the appropriate BspService object (Master or Worker)
     * for this compute node. A node may hold both roles; the master runs
     * on its own thread while the worker runs on the calling thread.
     */
    private void instantiateBspService() throws IOException, InterruptedException {
        if (graphFunctions.isMaster()) {
            if (LOG.isInfoEnabled()) {
                LOG.info("setup: Starting up BspServiceMaster " + "(master thread)...");
            }
            serviceMaster = new BspServiceMaster<I, V, E>(context, this);
            masterThread = new MasterThread<I, V, E>(serviceMaster, context);
            masterThread.start();
        }
        if (graphFunctions.isWorker()) {
            if (LOG.isInfoEnabled()) {
                LOG.info("setup: Starting up BspServiceWorker...");
            }
            serviceWorker = new BspServiceWorker<I, V, E>(context, this);
            // GC monitoring needs the worker's server data (out-of-core engine).
            installGCMonitoring();
            if (LOG.isInfoEnabled()) {
                LOG.info("setup: Registering health of this worker...");
            }
        }
    }

    /**
     * Install GC monitoring. This method intercepts all GC, log the gc, and
     * notifies an out-of-core engine (if any is used) about the GC.
     * A listener is attached to every GarbageCollectorMXBean via its JMX
     * NotificationEmitter interface.
     */
    private void installGCMonitoring() {
        List<GarbageCollectorMXBean> mxBeans = ManagementFactory.getGarbageCollectorMXBeans();
        final OutOfCoreEngine oocEngine = serviceWorker.getServerData().getOocEngine();
        for (GarbageCollectorMXBean gcBean : mxBeans) {
            NotificationEmitter emitter = (NotificationEmitter) gcBean;
            NotificationListener listener = new NotificationListener() {
                @Override
                public void handleNotification(Notification notification, Object handle) {
                    // Only GC-completed notifications are of interest here.
                    if (notification.getType()
                            .equals(GarbageCollectionNotificationInfo.GARBAGE_COLLECTION_NOTIFICATION)) {
                        GarbageCollectionNotificationInfo info = GarbageCollectionNotificationInfo
                                .from((CompositeData) notification.getUserData());

                        if (LOG.isInfoEnabled()) {
                            LOG.info("installGCMonitoring: name = " + info.getGcName() + ", action = "
                                    + info.getGcAction() + ", cause = " + info.getGcCause() + ", duration = "
                                    + info.getGcInfo().getDuration() + "ms");
                        }
                        // Aggregate GC duration into the per-superstep counter.
                        gcTimeMetric.inc(info.getGcInfo().getDuration());
                        if (oocEngine != null) {
                            oocEngine.gcCompleted(info);
                        }
                    }
                }
            };
            //Add the listener
            emitter.addNotificationListener(listener, null, null);
        }
    }

    /**
     * Initialize the root logger and appender to the settings in conf:
     * apply the configured log level, optionally switch every appender to a
     * thread-aware pattern layout, and quiet ZooKeeper in local test mode.
     */
    private void initializeAndConfigureLogging() {
        // Apply the configured log level only when it actually differs.
        String logLevel = conf.getLocalLevel();
        Logger rootLogger = Logger.getRootLogger();
        Level desiredLevel = Level.toLevel(logLevel);
        if (rootLogger.getLevel().equals(desiredLevel)) {
            if (LOG.isInfoEnabled()) {
                LOG.info("setup: Log level remains at " + logLevel);
            }
        } else {
            rootLogger.setLevel(desiredLevel);
            if (LOG.isInfoEnabled()) {
                LOG.info("setup: Set log level to " + logLevel);
            }
        }
        // Sets pattern layout for all appenders
        if (conf.useLogThreadLayout()) {
            PatternLayout layout = new PatternLayout("%-7p %d [%t] %c %x - %m%n");
            Enumeration<Appender> appenders = rootLogger.getAllAppenders();
            while (appenders.hasMoreElements()) {
                appenders.nextElement().setLayout(layout);
            }
        }
        // Change ZooKeeper logging level to error (info is quite verbose) for
        // testing only
        if (conf.getLocalTestMode()) {
            LogManager.getLogger(org.apache.zookeeper.server.PrepRequestProcessor.class.getName())
                    .setLevel(Level.ERROR);
        }
    }

    /**
     * Initialize job-level metrics used by this class: the timers for the
     * WorkerContext pre/post-application callbacks.
     */
    private void initJobMetrics() {
        // Job-scoped registry (as opposed to the per-superstep one).
        GiraphMetricsRegistry registry = GiraphMetrics.get().perJobOptional();
        wcPreAppTimer = new GiraphTimer(registry, "worker-context-pre-app", TimeUnit.MILLISECONDS);
        wcPostAppTimer = new GiraphTimer(registry, "worker-context-post-app", TimeUnit.MILLISECONDS);
    }

    // Rebuild all per-superstep timers against the new superstep's registry;
    // invoked by GiraphMetrics on every resetSuperstepMetrics() call.
    @Override
    public void newSuperstep(SuperstepMetricsRegistry superstepMetrics) {
        superstepTimer = new GiraphTimer(superstepMetrics, TIMER_SUPERSTEP_TIME, TimeUnit.MILLISECONDS);
        computeAll = new GiraphTimer(superstepMetrics, TIMER_COMPUTE_ALL, TimeUnit.MILLISECONDS);
        // NOTE(review): the metric name ends in "-ms" but the unit used here
        // is MICROSECONDS - confirm which of the two is intended.
        timeToFirstMessage = new GiraphTimer(superstepMetrics, TIMER_TIME_TO_FIRST_MSG, TimeUnit.MICROSECONDS);
        communicationTimer = new GiraphTimer(superstepMetrics, TIMER_COMMUNICATION_TIME, TimeUnit.MILLISECONDS);
        gcTimeMetric = superstepMetrics.getCounter(TIMER_SUPERSTEP_GC_TIME);
        wcPreSuperstepTimer = new GiraphTimer(superstepMetrics, "worker-context-pre-superstep",
                TimeUnit.MILLISECONDS);
    }

    /**
     * Notification from Vertex that a message has been sent.
     * May be called concurrently from multiple compute threads.
     */
    public void notifySentMessages() {
        // We are tracking the time between when the compute started and the first
        // message get sent. We use null to flag that we have already recorded it.
        // Unsynchronized read first (fast path for all later messages), then a
        // re-check under the lock so exactly one caller stops the first-message
        // timer and starts the communication timer.
        GiraphTimerContext tmp = timeToFirstMessageTimerContext;
        if (tmp != null) {
            synchronized (timeToFirstMessage) {
                if (timeToFirstMessageTimerContext != null) {
                    timeToFirstMessageTimerContext.stop();
                    timeToFirstMessageTimerContext = null;
                    communicationTimerContext = communicationTimer.time();
                }
            }
        }
    }

    /**
     * Notification of last message flushed. Comes when we finish the superstep
     * and are done waiting for all messages to send.
     * No-op when the communication timer was never started (no messages sent).
     */
    public void notifyFinishedCommunication() {
        // Same fast-path/lock re-check pattern as notifySentMessages: only
        // one caller stops the communication timer.
        GiraphTimerContext tmp = communicationTimerContext;
        if (tmp != null) {
            synchronized (communicationTimer) {
                if (communicationTimerContext != null) {
                    communicationTimerContext.stop();
                    communicationTimerContext = null;
                }
            }
        }
    }

    /**
     * Process graph data partitions active in this superstep.
     * Reports progress totals, then runs {@code numThreads} concurrent
     * {@link ComputeCallable}s over the partition store and collects their
     * per-partition statistics.
     *
     * @param context handle to the underlying cluster framework
     * @param partitionStatsList to pick up this superstep's processing stats
     * @param graphState the BSP graph state
     * @param messageStore the messages to be processed in this superstep
     * @param numThreads number of concurrent threads to do processing
     */
    private void processGraphPartitions(final Mapper<?, ?, ?, ?>.Context context,
            List<PartitionStats> partitionStatsList, final GraphState graphState,
            final MessageStore<I, Writable> messageStore, int numThreads) {
        PartitionStore<I, V, E> partitionStore = serviceWorker.getPartitionStore();
        // Total vertex count drives the per-superstep progress reporting.
        long verticesToCompute = 0;
        for (Integer partitionId : partitionStore.getPartitionIds()) {
            verticesToCompute += partitionStore.getPartitionVertexCount(partitionId);
        }
        if (LOG.isInfoEnabled()) {
            LOG.info("Message store class: "
                    + serviceWorker.getServerData().getIncomingMessageStore().getClass());
        }
        WorkerProgress.get().startSuperstep(serviceWorker.getSuperstep(), verticesToCompute,
                serviceWorker.getPartitionStore().getNumPartitions());
        partitionStore.startIteration();

        GiraphTimerContext computeAllTimerContext = computeAll.time();
        // Started here; stopped by notifySentMessages() when the first
        // message of this superstep goes out.
        timeToFirstMessageTimerContext = timeToFirstMessage.time();

        CallableFactory<Collection<PartitionStats>> callableFactory =
                new CallableFactory<Collection<PartitionStats>>() {
                    @Override
                    public Callable<Collection<PartitionStats>> newCallable(int callableId) {
                        return new ComputeCallable<I, V, E, Writable, Writable>(context, graphState,
                                messageStore, conf, serviceWorker);
                    }
                };
        List<Collection<PartitionStats>> results = ProgressableUtils.getResultsWithNCallables(
                callableFactory, numThreads, "compute-%d", context);

        for (Collection<PartitionStats> result : results) {
            partitionStatsList.addAll(result);
        }

        computeAllTimerContext.stop();
    }

    /**
     * Handle the event that this superstep is a restart of a failed one.
     * If so, reloads worker state from the checkpoint and records the
     * resulting vertex/edge counts in {@code finishedSuperstepStats}.
     *
     * @param superstep current superstep
     * @return true if this superstep was restarted from a checkpoint,
     *         false otherwise
     * @throws IOException if loading the checkpoint fails
     */
    private boolean checkSuperstepRestarted(long superstep) throws IOException {
        // Might need to restart from another superstep
        // (manually or automatic), or store a checkpoint
        if (serviceWorker.getRestartedSuperstep() == superstep) {
            if (LOG.isInfoEnabled()) {
                LOG.info("execute: Loading from checkpoint " + superstep);
            }
            VertexEdgeCount vertexEdgeCount = serviceWorker.loadCheckpoint(serviceWorker.getRestartedSuperstep());
            finishedSuperstepStats = new FinishedSuperstepStats(0, false, vertexEdgeCount.getVertexCount(),
                    vertexEdgeCount.getEdgeCount(), false, CheckpointStatus.NONE);
            return true;
        }
        return false;
    }

    /**
     * Check if it's time to checkpoint and actually do the checkpointing
     * if it is.
     *
     * @param checkpointStatus master's decision
     * @return true if we need to stop computation after checkpoint
     * @throws IOException if storing the checkpoint fails
     */
    private boolean storeCheckpoint(CheckpointStatus checkpointStatus) throws IOException {
        if (checkpointStatus == CheckpointStatus.NONE) {
            // Master did not request a checkpoint this superstep.
            return false;
        }
        serviceWorker.storeCheckpoint();
        // CHECKPOINT_AND_HALT means the job stops after persisting state.
        return checkpointStatus == CheckpointStatus.CHECKPOINT_AND_HALT;
    }

    /**
     * Attempt to collect the final statistics on the graph data
     * processed in this superstep by this compute node.
     *
     * @param inputSuperstepStats the final graph data stats object for the
     *                            input superstep
     * @return true if the graph data has no vertices (error?) and
     *         this node should terminate
     */
    private boolean collectInputSuperstepStats(FinishedSuperstepStats inputSuperstepStats) {
        // A vertex count of zero (when we are not about to load a checkpoint)
        // means there is nothing to compute on this node.
        boolean graphIsEmpty = inputSuperstepStats.getVertexCount() == 0
                && !inputSuperstepStats.mustLoadCheckpoint();
        if (graphIsEmpty) {
            LOG.warn("map: No vertices in the graph, exiting.");
            return true;
        }
        if (conf.metricsEnabled()) {
            GiraphMetrics.get().perSuperstep().printSummary(System.err);
        }
        return false;
    }

    /**
     * Did the state of this compute node change?
     * Also resets per-superstep metrics for the input superstep and guards
     * against the map task being executed more than once.
     *
     * @return true if the processing of supersteps should terminate.
     */
    private boolean checkTaskState() {
        if (done) {
            return true;
        }
        GiraphMetrics.get().resetSuperstepMetrics(BspService.INPUT_SUPERSTEP);
        // Non-worker roles (e.g. pure master/ZooKeeper tasks) have no
        // superstep processing to do here.
        if (graphFunctions.isNotAWorker()) {
            if (LOG.isInfoEnabled()) {
                LOG.info("map: No need to do anything when not a worker");
            }
            return true;
        }
        // The BSP map task must run exactly once per attempt.
        if (alreadyRun) {
            throw new RuntimeException(
                    "map: In BSP, map should have only been" + " run exactly once, (already run)");
        }
        alreadyRun = true;
        return false;
    }

    /**
     * Call to the WorkerContext before application begins, followed by the
     * preApplication() hook on each registered worker observer.
     */
    private void workerContextPreApp() {
        GiraphTimerContext timerContext = wcPreAppTimer.time();
        try {
            serviceWorker.getWorkerContext().preApplication();
        } catch (InstantiationException ie) {
            LOG.fatal("execute: preApplication failed in instantiation", ie);
            throw new RuntimeException("execute: preApplication failed in instantiation", ie);
        } catch (IllegalAccessException iae) {
            LOG.fatal("execute: preApplication failed in access", iae);
            throw new RuntimeException("execute: preApplication failed in access", iae);
        }
        timerContext.stop();
        context.progress();

        // Report progress after each observer callback so the framework
        // does not time this task out during slow observers.
        for (WorkerObserver observer : serviceWorker.getWorkerObservers()) {
            observer.preApplication();
            context.progress();
        }
    }

    /**
     * Setup mapper observers: instantiate them from the configuration and
     * invoke each observer's setup() hook.
     */
    public void setupMapperObservers() {
        mapperObservers = conf.createMapperObservers();
        for (MapperObserver observer : mapperObservers) {
            observer.setup();
        }
    }

    /**
     * Executes preLoad() on worker observers.
     */
    private void preLoadOnWorkerObservers() {
        for (WorkerObserver observer : serviceWorker.getWorkerObservers()) {
            observer.preLoad();
            // Report liveness between observer callbacks.
            context.progress();
        }
    }

    /**
     * Executes postSave() on worker observers.
     */
    private void postSaveOnWorkerObservers() {
        for (WorkerObserver observer : serviceWorker.getWorkerObservers()) {
            observer.postSave();
            // Report liveness between observer callbacks.
            context.progress();
        }
    }

    /**
     * Called by owner of this GraphTaskManager object on each compute node.
     * Flushes progress tracking, finalizes the worker service, joins the
     * master thread and offlines any ZooKeeper servers this task manages.
     *
     * @throws IOException if worker cleanup fails
     * @throws InterruptedException if interrupted during cleanup
     */
    public void cleanup() throws IOException, InterruptedException {
        if (LOG.isInfoEnabled()) {
            LOG.info("cleanup: Starting for " + getGraphFunctions());
        }
        jobProgressTracker.cleanup();
        if (done) {
            return;
        }

        if (serviceWorker != null) {
            serviceWorker.cleanup(finishedSuperstepStats);
            postSaveOnWorkerObservers();
        }
        try {
            if (masterThread != null) {
                masterThread.join();
                LOG.info("cleanup: Joined with master thread");
            }
        } catch (InterruptedException e) {
            // cleanup phase -- just log the error, but restore the interrupt
            // status so callers can still observe the interruption
            LOG.error("cleanup: Master thread couldn't join", e);
            Thread.currentThread().interrupt();
        }
        if (zkManager != null) {
            LOG.info("cleanup: Offlining ZooKeeper servers");
            try {
                zkManager.offlineZooKeeperServers(ZooKeeperManager.State.FINISHED);
                // We need this here cause apparently exceptions are eaten by Hadoop
                // when they come from the cleanup lifecycle and it's useful to know
                // if something is wrong.
                //
                // And since it's cleanup nothing too bad should happen if we don't
                // propagate and just allow the job to finish normally.
                // CHECKSTYLE: stop IllegalCatch
            } catch (Throwable e) {
                // CHECKSTYLE: resume IllegalCatch
                LOG.error("cleanup: Error offlining zookeeper", e);
            }
        }

        // Stop tracking metrics
        GiraphMetrics.get().shutdown();
    }

    /**
     * Cleanup a ZooKeeper instance managed by this
     * GiraphWorker upon job run failure.
     */
    public void zooKeeperCleanup() {
        // Only relevant when this task hosts a ZooKeeper server; the manager
        // may be null if ZooKeeper had an issue and never came up.
        if (graphFunctions.isZooKeeper() && zkManager != null) {
            zkManager.cleanup();
        }
    }

    /**
     * Cleanup all of Giraph's framework-agnostic resources
     * regardless of which type of cluster Giraph is running on.
     * Never throws: a secondary failure here is logged so the original
     * exception can still be rethrown by the caller.
     */
    public void workerFailureCleanup() {
        try {
            if (graphFunctions.isWorker()) {
                serviceWorker.failureCleanup();
            }
            // Stop tracking metrics
            GiraphMetrics.get().shutdown();
            // Checkstyle exception due to needing to get the original
            // exception on failure
            // CHECKSTYLE: stop IllegalCatch
        } catch (RuntimeException secondary) {
            // CHECKSTYLE: resume IllegalCatch
            LOG.error("run: Worker failure failed on another RuntimeException, "
                    + "original expection will be rethrown", secondary);
        }
    }

    /**
     * Creates exception handler that will terminate process gracefully in case
     * of any uncaught exception.
     *
     * @return new exception handler object.
     */
    public Thread.UncaughtExceptionHandler createUncaughtExceptionHandler() {
        // The checker implementation is pluggable via configuration.
        CheckerIfWorkerShouldFailAfterException checker =
                CHECKER_IF_WORKER_SHOULD_FAIL_AFTER_EXCEPTION_CLASS.newInstance(getConf());
        return new OverrideExceptionHandler(checker, getJobProgressTracker());
    }

    /**
     * Get the immutable configuration this task runs with.
     *
     * @return the task's Giraph configuration
     */
    public ImmutableClassesGiraphConfiguration<I, V, E> getConf() {
        return conf;
    }

    /**
     * @return Time spent in GC recorded by the GC listener, or 0 if the
     *         metric has not been initialized yet
     */
    public long getSuperstepGCTime() {
        if (gcTimeMetric == null) {
            return 0;
        }
        return gcTimeMetric.count();
    }

    /**
     * Returns a list of zookeeper servers to connect to.
     * If the port is set to 0 and Giraph is starting a single
     * ZooKeeper server, then Zookeeper will pick its own port.
     * Otherwise, the ZooKeeper port set by the user will be used.
     *
     * @return host:port,host:port for each zookeeper
     */
    public String getZookeeperList() {
        if (zkManager == null) {
            // Not managing ZooKeeper ourselves; use the configured list.
            return conf.getZookeeperList();
        }
        return zkManager.getZooKeeperServerPortString();
    }

    /**
     * Default handler for uncaught exceptions.
     * It will do the best to clean up and then will terminate current giraph job.
     */
    class OverrideExceptionHandler implements Thread.UncaughtExceptionHandler {
        /** Decides whether an uncaught exception is fatal for this worker */
        private final CheckerIfWorkerShouldFailAfterException failureChecker;
        /** Tracker used to report the failure before the process exits */
        private final JobProgressTracker progressTracker;

        /**
         * Constructor
         *
         * @param checker Checker if worker should fail after a thread gets an
         *                exception
         * @param jobProgressTracker JobProgressTracker to log problems to
         */
        public OverrideExceptionHandler(CheckerIfWorkerShouldFailAfterException checker,
                JobProgressTracker jobProgressTracker) {
            this.failureChecker = checker;
            this.progressTracker = jobProgressTracker;
        }

        @Override
        public void uncaughtException(final Thread t, final Throwable e) {
            // Some exceptions (as decided by the pluggable checker) are
            // survivable; everything else tears this worker process down.
            if (!failureChecker.checkIfWorkerShouldFail(t, e)) {
                return;
            }
            try {
                LOG.fatal("uncaughtException: OverrideExceptionHandler on thread " + t.getName() + ", msg = "
                        + e.getMessage() + ", exiting...", e);
                progressTracker.logError(ExceptionUtils.getStackTrace(e));

                zooKeeperCleanup();
                workerFailureCleanup();
            } finally {
                // Exit even if the cleanup above threw.
                System.exit(1);
            }
        }
    }

    /**
     * Interface to check if worker should fail after a thread gets an
     * exception. Consulted by {@link OverrideExceptionHandler} before it
     * terminates the worker process.
     */
    public interface CheckerIfWorkerShouldFailAfterException {
        /**
         * Check if worker should fail after a thread gets an exception
         *
         * @param thread Thread which raised the exception
         * @param exception Exception which occurred
         * @return True iff worker should fail after this exception
         */
        boolean checkIfWorkerShouldFail(Thread thread, Throwable exception);
    }

    /**
     * Class to use by default, where each exception causes job failure
     */
    public static class FailWithEveryExceptionOccurred implements CheckerIfWorkerShouldFailAfterException {
        // Treat every uncaught exception as fatal for the worker.
        @Override
        public boolean checkIfWorkerShouldFail(Thread thread, Throwable exception) {
            return true;
        }
    }
}