org.deeplearning4j.AnimalModelByHdfsSparkCluster.java Source code

Introduction

Here is the source code for org.deeplearning4j.AnimalModelByHdfsSparkCluster.java
Source

/*******************************************************************************
 * Copyright (c) 2015-2019 Skymind, Inc.
 *
 * This program and the accompanying materials are made available under the
 * terms of the Apache License, Version 2.0 which is available at
 * https://www.apache.org/licenses/LICENSE-2.0.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations
 * under the License.
 *
 * SPDX-License-Identifier: Apache-2.0
 ******************************************************************************/

package org.deeplearning4j;

import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.deeplearning4j.api.storage.StatsStorageRouter;
import org.deeplearning4j.api.storage.impl.RemoteUIStatsStorageRouter;
import org.deeplearning4j.datasets.DatasetIteratorFromHdfs;
import org.deeplearning4j.eval.Evaluation;
import org.deeplearning4j.listeners.SparkScoreIterationListener;
import org.deeplearning4j.nn.api.Model;
import org.deeplearning4j.nn.api.OptimizationAlgorithm;
import org.deeplearning4j.nn.conf.MultiLayerConfiguration;
import org.deeplearning4j.nn.conf.NeuralNetConfiguration;
import org.deeplearning4j.nn.conf.inputs.InputType;
import org.deeplearning4j.nn.conf.layers.ConvolutionLayer;
import org.deeplearning4j.nn.conf.layers.DenseLayer;
import org.deeplearning4j.nn.conf.layers.OutputLayer;
import org.deeplearning4j.nn.conf.layers.SubsamplingLayer;
import org.deeplearning4j.nn.graph.ComputationGraph;
import org.deeplearning4j.nn.multilayer.MultiLayerNetwork;
import org.deeplearning4j.nn.weights.WeightInit;
import org.deeplearning4j.spark.api.RDDTrainingApproach;
import org.deeplearning4j.spark.api.TrainingMaster;
import org.deeplearning4j.spark.api.stats.SparkTrainingStats;
import org.deeplearning4j.spark.impl.multilayer.SparkDl4jMultiLayer;
import org.deeplearning4j.spark.impl.paramavg.ParameterAveragingTrainingMaster;
import org.deeplearning4j.spark.parameterserver.training.SharedTrainingMaster;
import org.deeplearning4j.spark.stats.StatsUtils;
import org.deeplearning4j.utils.CommonUtils;
import org.nd4j.linalg.activations.Activation;
import org.nd4j.linalg.dataset.DataSet;
import org.nd4j.linalg.dataset.api.iterator.DataSetIterator;
import org.nd4j.linalg.learning.config.Nesterovs;
import org.nd4j.linalg.lossfunctions.LossFunctions;
import org.nd4j.parameterserver.distributed.conf.VoidConfiguration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;

/**
 * @Description: Distributed training of a model on Spark and Hadoop; the dataset consists of images.
 * The Spark and Hadoop clusters have two nodes: the master node, whose domain name is "cluster1", and the slave node, whose domain name is "cluster2". More nodes can of course be added.
 * Tip: in the pom.xml configuration for the Spark examples, change _1 to _2 to switch between Spark 1 and Spark 2.
 * Ways to run:
 * First way, Spark on YARN:
 * export the module to a jar, then execute the command: spark-submit --class org.deeplearning4j.AnimalModelByHdfsSparkCluster --master yarn --deploy-mode cluster /home/out/dl4j-spark-cluster-1.0.0-beta-bin.jar
 * Second way, standalone:
 * export the module to a jar, then execute the command: spark-submit --class org.deeplearning4j.AnimalModelByHdfsSparkCluster --master spark://cluster1:7077 --deploy-mode cluster /home/out/dl4j-spark-cluster-1.0.0-beta-bin.jar
 * You may encounter the error "NTPTimeSource: Error querying NTP server, attempt 1 of 10"; for reference see https://deeplearning4j.org/spark#sparkstatsntp
 * @author wangfeng
 */
public class AnimalModelByHdfsSparkCluster {
    private static final Logger log = LoggerFactory.getLogger(AnimalModelByHdfsSparkCluster.class);

    // Input dimensions handed to InputType.convolutional(height, width, channels).
    private static final int HEIGHT = 100;
    private static final int WIDTH = 100;
    private static final int CHANNELS = 3;
    // Number of output classes; shared by the output layer and the Evaluation so they cannot drift apart.
    private static final int NUM_CLASSES = 4;
    private static final long SEED = 42;

    // Number of times the full training RDD is fitted.
    private static final int NUM_EPOCHS = 10;
    // Minibatch size passed to doEvaluation(...).
    private static final int EVAL_BATCH_SIZE = 32;

    private static final int AVERAGING_FREQUENCY = 3;
    private static final int BATCH_SIZE_PER_WORKER = 4;
    private static final int EXAMPLES_PER_DATASET_OBJECT = 1;

    // true -> ParameterAveragingTrainingMaster (and HTML stats export); false -> SharedTrainingMaster.
    private static final boolean TRAINING_MODE = true;

    private static final String MODEL_PATH = "/user/hadoop/models/AnimalModelByHdfsSparkClusterModel.bin";
    // Not used by the current flow; kept for optional file-based stats storage
    // (e.g. new FileStatsStorage(new File(TRAIN_MONITOR_LOG)) combined with a StatsListener).
    private static final String TRAIN_MONITOR_LOG = "/user/hadoop/trainlog/AnimalModelByHdfsTrainingStatsSpark.dl4j";
    private static final String TRAIN_PERFORMANCE_LOG = "/user/hadoop/trainlog/AnimalModelByHdfsSparkCluster.html";
    private static final String TRAIN_ITERATOR_SCORE_LOG = "/user/hadoop/trainlog/scoreAnimalModelByHdfsSparkCluster.log";

    /**
     * Trains the LeNet-style network on Spark, optionally exports the training statistics as HTML,
     * writes the model configuration to HDFS and finally logs the evaluation on the validation data.
     *
     * @param args command-line arguments (unused)
     * @throws Exception if Spark, HDFS or DL4J report a failure
     */
    public static void main(String[] args) throws Exception {

        JavaSparkContext sc = CommonUtils.createConf();
        FileSystem fs = FileSystem.get(sc.hadoopConfiguration());

        MultiLayerConfiguration conf = lenetModelConf();
        TrainingMaster tm = buildTrainingMaster();

        SparkDl4jMultiLayer sparkNetwork = new SparkDl4jMultiLayer(sc, conf, tm);
        sparkNetwork.setCollectTrainingStats(true);

        // Remote monitor. Tip from the original author: this port may conflict with Hadoop.
        StatsStorageRouter remoteUIRouter = new RemoteUIStatsStorageRouter("http://cluster2:9000");
        sparkNetwork.setListeners(remoteUIRouter, new SparkScoreIterationListener(1, TRAIN_ITERATOR_SCORE_LOG));

        JavaRDD<DataSet> trainData = toRdd(sc, new DatasetIteratorFromHdfs(BATCH_SIZE_PER_WORKER, true));

        MultiLayerNetwork network = null;
        for (int i = 0; i < NUM_EPOCHS; i++) {
            // Alternative: fit directly from a path, e.g. sparkNetwork.fit(CommonUtils.TRAIN_HDFS_PATH)
            network = sparkNetwork.fit(trainData);
            log.info("Completed Epoch {}", i);
        }
        if (TRAINING_MODE) {
            // Fetch the stats only after training has run, so the export reflects the actual fit calls.
            SparkTrainingStats stats = sparkNetwork.getSparkTrainingStats();
            StatsUtils.exportStatsAsHtml(stats, TRAIN_PERFORMANCE_LOG, sc);
        }
        saveModel(fs, network);

        JavaRDD<DataSet> testData = toRdd(sc, new DatasetIteratorFromHdfs(BATCH_SIZE_PER_WORKER, false));
        // The [0] index is a work-around for a 0.9.1 bug: see https://deeplearning4j.org/releasenotes
        Evaluation evaluation = sparkNetwork.doEvaluation(testData, EVAL_BATCH_SIZE, new Evaluation(NUM_CLASSES))[0];
        log.info(evaluation.stats());
    }

    /**
     * Builds the training master selected by {@code TRAINING_MODE}.
     *
     * @return a {@link ParameterAveragingTrainingMaster} when {@code TRAINING_MODE} is true,
     *         otherwise a {@link SharedTrainingMaster}
     */
    private static TrainingMaster buildTrainingMaster() {
        if (!TRAINING_MODE) {
            // Introduction & configuration for Spark training: https://deeplearning4j.org/distributed
            // SharedTraining
            VoidConfiguration voidConfiguration = VoidConfiguration.builder()
                    .unicastPort(40123) // This can be any port, but it should be open for IN/OUT comms on all Spark nodes
                    // Optional: .networkMask("192.168.122.0/24") - in the author's cluster the master node (cluster1)
                    // is 192.168.122.2 and the slave node (cluster2) is 192.168.122.83
                    .build();
            return new SharedTrainingMaster.Builder(voidConfiguration, BATCH_SIZE_PER_WORKER)
                    .updatesThreshold(1e-3) // encoding threshold. Please check https://deeplearning4j.org/distributed for details
                    .rddTrainingApproach(RDDTrainingApproach.Direct)
                    .batchSizePerWorker(BATCH_SIZE_PER_WORKER) // This controls the minibatch size for each worker.
                    .workersPerNode(1) // this option will enforce exactly 1 worker for each Spark node
                    .build();
        }
        // Introduction & configuration for Spark training: https://deeplearning4j.org/spark
        // ParameterAveragingTraining
        return new ParameterAveragingTrainingMaster.Builder(EXAMPLES_PER_DATASET_OBJECT) // how many examples are in each DataSet object
                .workerPrefetchNumBatches(2) // Asynchronously prefetch up to 2 batches
                .averagingFrequency(AVERAGING_FREQUENCY) // how frequently parameters are averaged and redistributed, in minibatches of size batchSizePerWorker
                .batchSizePerWorker(BATCH_SIZE_PER_WORKER) // This controls the minibatch size for each worker.
                .build();
    }

    /**
     * Drains the iterator into a list on the driver and distributes that list as an RDD.
     *
     * @param sc       Spark context used to parallelize the collected data sets
     * @param iterator source of the data sets; it is consumed completely
     * @return an RDD holding every data set the iterator produced
     */
    private static JavaRDD<DataSet> toRdd(JavaSparkContext sc, DataSetIterator iterator) {
        List<DataSet> batches = new ArrayList<>();
        while (iterator.hasNext()) {
            batches.add(iterator.next());
        }
        return sc.parallelize(batches);
    }

    /**
     * Builds the LeNet-style configuration: two convolution + max-pooling stages, a dense layer
     * and a softmax output layer with {@code NUM_CLASSES} outputs.
     *
     * @return the multi-layer configuration for the configured image input size
     */
    public static MultiLayerConfiguration lenetModelConf() {

        MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder().seed(SEED).l2(0.005)
                .activation(Activation.RELU).weightInit(WeightInit.XAVIER)
                .optimizationAlgo(OptimizationAlgorithm.STOCHASTIC_GRADIENT_DESCENT)
                .updater(new Nesterovs(0.0001, 0.9)).list()
                .layer(0,
                        new ConvolutionLayer.Builder(new int[] { 5, 5 }, new int[] { 1, 1 }, new int[] { 0, 0 })
                                .name("cnn1").nIn(CHANNELS).nOut(50).biasInit(0).build())
                .layer(1,
                        new SubsamplingLayer.Builder(new int[] { 2, 2 }, new int[] { 2, 2 }).name("maxpool1")
                                .build())
                .layer(2,
                        new ConvolutionLayer.Builder(new int[] { 5, 5 }, new int[] { 5, 5 }, new int[] { 1, 1 })
                                .name("cnn2").nOut(100).biasInit(0).build())
                .layer(3,
                        new SubsamplingLayer.Builder(new int[] { 2, 2 }, new int[] { 2, 2 }).name("maxpool2")
                                .build())
                .layer(4, new DenseLayer.Builder().nOut(500).build())
                .layer(5,
                        new OutputLayer.Builder(LossFunctions.LossFunction.NEGATIVELOGLIKELIHOOD).nOut(NUM_CLASSES)
                                .activation(Activation.SOFTMAX).build())
                .setInputType(InputType.convolutional(HEIGHT, WIDTH, CHANNELS)).build();

        return conf;

    }

    /**
     * Writes the JSON configuration of the given model to {@code MODEL_PATH} on the given file system.
     * Only the configuration JSON is written; this method does not write the model parameters.
     *
     * <p>The file system is deliberately left open: the caller owns it and keeps using HDFS
     * after this call. Only the output stream opened here is closed.
     *
     * @param fs    file system to write to; not closed by this method
     * @param model a {@link MultiLayerNetwork} or {@link ComputationGraph}
     * @throws IllegalArgumentException if {@code model} is null or of an unsupported type
     * @throws Exception if creating or writing the file fails
     */
    public static void saveModel(FileSystem fs, Model model) throws Exception {

        final String json;
        if (model instanceof MultiLayerNetwork) {
            json = ((MultiLayerNetwork) model).getLayerWiseConfigurations().toJson();
        } else if (model instanceof ComputationGraph) {
            json = ((ComputationGraph) model).getConfiguration().toJson();
        } else {
            throw new IllegalArgumentException("Unsupported model type: "
                    + (model == null ? "null" : model.getClass().getName()));
        }

        // try-with-resources closes the stream even when write/hsync fails.
        try (FSDataOutputStream out = fs.create(new Path(MODEL_PATH))) {
            // Explicit charset so the written bytes do not depend on the platform default.
            out.write(json.getBytes(StandardCharsets.UTF_8));
            out.hsync();
        }

    }
}