Java tutorial: RemoteParForSparkWorker (parfor on Spark)
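The following class, RemoteParForSparkWorker, shows how a parallel-for (parfor) loop body is executed remotely on Spark. It is a PairFlatMapFunction over parfor tasks: each Spark task lazily initializes its worker state on first use (because Spark does not expose task IDs at construction time), executes one parfor task, updates accumulators for monitoring, and returns the exported result variables as (workerID, variable) pairs.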
/**
 * (C) Copyright IBM Corp. 2010, 2015
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.ibm.bi.dml.runtime.controlprogram.parfor;

import java.io.IOException;
import java.util.ArrayList;

import org.apache.spark.Accumulator;
import org.apache.spark.TaskContext;
import org.apache.spark.api.java.function.PairFlatMapFunction;

import com.ibm.bi.dml.runtime.DMLRuntimeException;
import com.ibm.bi.dml.runtime.DMLUnsupportedOperationException;
import com.ibm.bi.dml.runtime.controlprogram.caching.CacheableData;
import com.ibm.bi.dml.runtime.controlprogram.parfor.util.IDHandler;
import com.ibm.bi.dml.runtime.util.LocalFileUtils;

import scala.Tuple2;

/**
 * Spark function that executes a single parfor task per call on a remote
 * worker and emits the exported result variables as (workerID, variable) pairs.
 */
public class RemoteParForSparkWorker extends ParWorker
    implements PairFlatMapFunction<Task, Long, String>
{
    private static final long serialVersionUID = -3254950138084272296L;

    private boolean _initialized = false;
    private String  _prog = null;
    private boolean _caching = true;

    private Accumulator<Integer> _aTasks = null;
    private Accumulator<Integer> _aIters = null;

    public RemoteParForSparkWorker(String program, boolean cpCaching,
        Accumulator<Integer> atasks, Accumulator<Integer> aiters)
        throws DMLRuntimeException, DMLUnsupportedOperationException
    {
        //keep inputs (unfortunately, Spark does not expose task IDs and it would be
        //implementation-dependent when this constructor is actually called; hence,
        //we do lazy initialization on task execution)
        _initialized = false;
        _prog = program;
        _caching = cpCaching;

        //setup spark accumulators
        _aTasks = atasks;
        _aIters = aiters;
    }

    @Override
    public Iterable<Tuple2<Long, String>> call(Task arg0)
        throws Exception
    {
        //lazy parworker initialization
        if( !_initialized )
            configureWorker(TaskContext.get().taskAttemptId()); //requires Spark 1.3

        //execute a single task
        long numIter = getExecutedIterations();
        super.executeTask(arg0);

        //maintain accumulators
        _aTasks.add(1);
        _aIters.add((int) (getExecutedIterations() - numIter));

        //write output if required (matrix indexed write)
        //note: this copy is necessary for environments without spark libraries
        ArrayList<Tuple2<Long, String>> ret = new ArrayList<Tuple2<Long, String>>();
        ArrayList<String> tmp = RemoteParForUtils.exportResultVariables(
            _workerID, _ec.getVariables(), _resultVars);
        for( String val : tmp )
            ret.add(new Tuple2<Long, String>(_workerID, val));

        return ret;
    }

    /**
     * Parses the serialized parfor body program and initializes the worker state
     * (child blocks, execution context, result variables, and local caching).
     *
     * @param ID unique worker ID (the Spark task attempt ID)
     * @throws DMLRuntimeException
     * @throws DMLUnsupportedOperationException
     * @throws IOException
     */
    private void configureWorker(long ID)
        throws DMLRuntimeException, DMLUnsupportedOperationException, IOException
    {
        _workerID = ID;

        //parse and setup parfor body program
        ParForBody body = ProgramConverter.parseParForBody(_prog, (int) _workerID);
        _childBlocks = body.getChildBlocks();
        _ec          = body.getEc();
        _resultVars  = body.getResultVarNames();
        _numTasks    = 0;
        _numIters    = 0;

        //init local cache manager
        if( !CacheableData.isCachingActive() ) {
            String uuid = IDHandler.createDistributedUniqueID();
            LocalFileUtils.createWorkingDirectoryWithUUID(uuid);
            CacheableData.initCaching(uuid); //incl activation and cache dir creation
                //(each map task gets its own dir for simplified cleanup)
        }
        if( !CacheableData.cacheEvictionLocalFilePrefix.contains("_") ) { //account for local mode
            CacheableData.cacheEvictionLocalFilePrefix =
                CacheableData.cacheEvictionLocalFilePrefix + "_" + _workerID;
        }

        //ensure that result variable files are not removed
        super.pinResultVariables();

        //enable/disable caching (if required)
        if( !_caching )
            CacheableData.disableCaching();

        //mark the worker as lazily initialized
        _initialized = true;
    }
}
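For context, a driver could wire this function into a Spark job roughly as follows. This is a minimal sketch under stated assumptions: the class name RemoteParForSparkDriverSketch, the method name runParForTasks, the one-partition-per-task parallelization, and the collect-based result handling are illustrative, not taken from the project's actual driver; only the Spark 1.x APIs used (accumulator, parallelize, flatMapToPair, collect) are standard.

//Hypothetical driver-side sketch, placed in the same package as the worker
//so that Task and RemoteParForSparkWorker are visible without extra imports.
package com.ibm.bi.dml.runtime.controlprogram.parfor;

import java.util.List;

import org.apache.spark.Accumulator;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class RemoteParForSparkDriverSketch //hypothetical class name
{
    public static List<Tuple2<Long, String>> runParForTasks(JavaSparkContext sc,
        List<Task> tasks, String program, boolean cpCaching) throws Exception
    {
        //accumulators through which workers report executed tasks/iterations
        Accumulator<Integer> aTasks = sc.accumulator(0);
        Accumulator<Integer> aIters = sc.accumulator(0);

        //distribute the parfor tasks (one partition per task is an assumption
        //made for illustration, not necessarily the project's partitioning)
        JavaRDD<Task> input = sc.parallelize(tasks, tasks.size());

        //flatMapToPair accepts a PairFlatMapFunction<Task, Long, String>, so
        //each Spark task invokes RemoteParForSparkWorker.call(...) once per Task
        JavaPairRDD<Long, String> out = input.flatMapToPair(
            new RemoteParForSparkWorker(program, cpCaching, aTasks, aIters));

        //collect (workerID, serialized result variable) pairs on the driver
        return out.collect();
    }
}

Since collect() forces execution, aTasks.value() and aIters.value() afterwards reflect the totals reported by all workers, which is what makes the accumulator-based monitoring in the worker useful on the driver side.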