io.seldon.spark.actions.GroupActionsJob.java Source code

Java tutorial

Introduction

Here is the source code for GroupActionsJob.java (class io.seldon.spark.actions.GroupActionsJob).

Source

/*
 * Seldon -- open source prediction engine
 * =======================================
 * Copyright 2011-2015 Seldon Technologies Ltd and Rummble Ltd (http://www.seldon.io/)
 *
 **********************************************************************************************
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at       
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 ********************************************************************************************** 
*/
package io.seldon.spark.actions;

import java.text.ParseException;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.PriorityQueue;
import java.util.Queue;

import org.apache.commons.lang3.builder.ReflectionToStringBuilder;
import org.apache.commons.lang3.builder.ToStringStyle;
import org.apache.hadoop.conf.Configuration;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.broadcast.Broadcast;
import org.apache.spark.storage.StorageLevel;

import scala.Tuple2;

import com.beust.jcommander.JCommander;
import com.beust.jcommander.Parameter;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;

public class GroupActionsJob {

    /**
     * Immutable pairing of a client name with a count of items attributed to that client.
     */
    public static class ClientDetail {
        /** Name of the client this count belongs to. */
        public final String client;
        /** Number of items counted for {@link #client}. */
        public final long itemCount;

        /**
         * Creates a new client/count pair.
         *
         * @param clientName name of the client
         * @param count      number of items counted for that client
         */
        public ClientDetail(String clientName, long count) {
            client = clientName;
            itemCount = count;
        }
    }

    /**
     * Command line options of the job. The fields are bound by JCommander through their
     * {@code @Parameter} annotations and are read directly by the enclosing class.
     */
    public static class CmdLineArgs {
        /** Pattern that, combined with the input date, yields the directory to read from. */
        @Parameter(names = "--input-path-pattern", required = true)
        private String input_path_pattern;

        /** Date of the action logs to process; also converted to the day number used in output paths. */
        @Parameter(names = "--input-date-string", required = true)
        private String input_date_string;

        /** Root directory under which the per-client output is written. */
        @Parameter(names = "--output-path-dir", required = true)
        private String output_path_dir;

        /** When set, the Spark master is forced to {@code local}. */
        @Parameter(names = "--debug-use-local-master")
        private Boolean debug_use_local_master = false;

        /** Optional AWS access key id; when non-empty it is applied to the Hadoop s3n configuration. */
        @Parameter(names = "--aws-access-key-id", required = false)
        private String aws_access_key_id;

        /** AWS secret key that accompanies {@link #aws_access_key_id}. */
        @Parameter(names = "--aws-secret-access-key", required = false)
        private String aws_secret_access_key;

        /** When set, the output text files are gzip compressed. */
        @Parameter(names = "--gzip-output", required = false)
        private boolean gzip_output = false;

        /** When given, only action lines belonging to this client are processed. */
        @Parameter(names = "--single-client", required = false)
        private String single_client;

        /**
         * Returns a reflective dump of the options in short-prefix style.
         * The AWS secret key is deliberately left out so that logging the options cannot leak it.
         */
        @Override
        public String toString() {
            return new ReflectionToStringBuilder(this, ToStringStyle.SHORT_PREFIX_STYLE)
                    .setExcludeFieldNames("aws_secret_access_key")
                    .toString();
        }
    }

    /**
     * Command line entry point: binds {@code args} onto a fresh {@link CmdLineArgs} with
     * JCommander and then starts the job.
     *
     * @param args raw command line arguments
     */
    public static void main(String[] args) {
        final CmdLineArgs parsedOptions = new CmdLineArgs();
        // Constructing the JCommander with the arguments populates parsedOptions.
        new JCommander(parsedOptions, args);
        run(parsedOptions);
    }

    /**
     * Executes the job end to end.
     * <p>
     * Reads the action log lines for the requested date, optionally restricts them to a single
     * client and then, for every distinct client found, writes that client's actions as JSON text
     * to {@link #getOutputPath(String, long, String)}. Actions whose {@code userid} is 0 get it
     * filled in from another action of the same client that carries the same
     * {@code client_userid}; when no such action exists the action is left out of the output.
     * Per-client counts of left-out and written actions are printed at the end, largest first.
     * <p>
     * The Spark context is always stopped, also when the job fails part way through.
     *
     * @param cmdLineArgs parsed command line options
     * @throws IllegalArgumentException if an AWS access key id is supplied without a secret key
     */
    public static void run(CmdLineArgs cmdLineArgs) {
        long unixDays = 0;
        try {
            unixDays = JobUtils.dateToUnixDays(cmdLineArgs.input_date_string);
        } catch (ParseException e) {
            // The fallback to day 0 is kept, but it is reported because the day number becomes
            // part of every output path.
            System.out.println(String.format(
                    "WARNING: could not parse input date [%s] (%s), falling back to unixDays[0]",
                    cmdLineArgs.input_date_string, e.getMessage()));
        }
        System.out.println(String.format("--- started GroupActionsJob date[%s] unixDays[%s] ---",
                cmdLineArgs.input_date_string, unixDays));

        // NOTE(review): dumping the whole environment and all system properties may expose
        // credentials in the job log - confirm this is still wanted.
        System.out.println("Env: " + System.getenv());
        System.out.println("Properties: " + System.getProperties());

        SparkConf sparkConf = new SparkConf().setAppName("GroupActionsJob");

        if (cmdLineArgs.debug_use_local_master) {
            System.out.println("Using 'local' master");
            sparkConf.setMaster("local");
        }

        System.out.println("--- sparkConf ---");
        for (Tuple2<String, String> kvPair : sparkConf.getAll()) {
            System.out.println(String.format("%s:%s", kvPair._1, kvPair._2));
        }
        System.out.println("-----------------");

        JavaSparkContext jsc = new JavaSparkContext(sparkConf);
        try {
            // setup aws access
            Configuration hadoopConf = jsc.hadoopConfiguration();
            hadoopConf.set("fs.s3.impl", "org.apache.hadoop.fs.s3native.NativeS3FileSystem");
            if (cmdLineArgs.aws_access_key_id != null && !"".equals(cmdLineArgs.aws_access_key_id)) {
                if (cmdLineArgs.aws_secret_access_key == null) {
                    // Fail with a clear message instead of handing a null value to the configuration.
                    throw new IllegalArgumentException(
                            "--aws-secret-access-key must be supplied together with --aws-access-key-id");
                }
                hadoopConf.set("fs.s3n.awsAccessKeyId", cmdLineArgs.aws_access_key_id);
                hadoopConf.set("fs.s3n.awsSecretAccessKey", cmdLineArgs.aws_secret_access_key);
            }

            JavaRDD<String> dataSet = jsc.textFile(
                    JobUtils.getSourceDirFromDate(cmdLineArgs.input_path_pattern, cmdLineArgs.input_date_string))
                    .repartition(4);

            final ObjectMapper objectMapper = new ObjectMapper();
            objectMapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);

            final String single_client = cmdLineArgs.single_client;
            if (single_client != null) {
                // Keep only the log lines that belong to the requested client.
                dataSet = dataSet.filter(new Function<String, Boolean>() {

                    @Override
                    public Boolean call(String t) throws Exception {
                        ActionData actionData = JobUtils.getActionDataFromActionLogLine(objectMapper, t);
                        return ((actionData.client != null) && (actionData.client.equals(single_client)));
                    }
                });
            }

            // Key every parsed action by its client; persisted because it is scanned once per client.
            JavaPairRDD<String, ActionData> pairs = dataSet
                    .mapToPair(new PairFunction<String, String, ActionData>() {

                        @Override
                        public Tuple2<String, ActionData> call(String t) throws Exception {
                            ActionData actionData = JobUtils.getActionDataFromActionLogLine(objectMapper, t);
                            return new Tuple2<String, ActionData>(actionData.client, actionData);
                        }

                    }).persist(StorageLevel.MEMORY_AND_DISK());

            List<String> clientList = pairs.keys().distinct().collect();
            Queue<ClientDetail> clientDetailQueue = newDescendingCountQueue();
            Queue<ClientDetail> clientDetailZeroQueue = newDescendingCountQueue();

            System.out.println("Client list " + clientList.toString());
            for (String client : clientList) {
                if (client == null) {
                    System.out.println("Found null client!");
                    continue;
                }
                System.out.println("looking at client " + client);
                String outputPath = getOutputPath(cmdLineArgs.output_path_dir, unixDays, client);
                processClient(jsc, pairs, client, outputPath, cmdLineArgs.gzip_output, clientDetailQueue,
                        clientDetailZeroQueue);
            }

            printAndDrain("- Client Action (Zero Userid) Count -", clientDetailZeroQueue);
            printAndDrain("- Client Action Count -", clientDetailQueue);
        } finally {
            // Always release the Spark context, also when one of the steps above throws.
            jsc.stop();
        }
        System.out.println(String.format("--- finished GroupActionsJob date[%s] unixDays[%s] ---",
                cmdLineArgs.input_date_string, unixDays));

    }

    /**
     * Writes the actions of one client to {@code outputPath} and records its counts.
     * The number of written actions is added to {@code clientDetailQueue}; the number of actions
     * left out because their zero userid could not be resolved is added to
     * {@code clientDetailZeroQueue}.
     */
    private static void processClient(JavaSparkContext jsc, JavaPairRDD<String, ActionData> pairs,
            final String currentClient, String outputPath, boolean gzipOutput,
            Queue<ClientDetail> clientDetailQueue, Queue<ClientDetail> clientDetailZeroQueue) {

        // NOTE(review): the client list is built case-sensitively but this match ignores case, so
        // clients whose names differ only in case each receive the other's actions - confirm intent.
        JavaPairRDD<String, ActionData> filtered_by_client = pairs
                .filter(new Function<Tuple2<String, ActionData>, Boolean>() {

                    @Override
                    public Boolean call(Tuple2<String, ActionData> v1) throws Exception {
                        return Boolean.valueOf(currentClient.equalsIgnoreCase(v1._1));
                    }
                });

        // Lookup "<client>_<client_userid>" -> userid, built from the actions that do carry a userid.
        JavaPairRDD<String, Integer> userIdLookupRDD = filtered_by_client
                .filter(new Function<Tuple2<String, ActionData>, Boolean>() {

                    @Override
                    public Boolean call(Tuple2<String, ActionData> v1) throws Exception {
                        return Boolean.valueOf(v1._2.userid != 0);
                    }
                }).mapToPair(new PairFunction<Tuple2<String, ActionData>, String, Integer>() {

                    @Override
                    public Tuple2<String, Integer> call(Tuple2<String, ActionData> t) throws Exception {
                        String key = currentClient + "_" + t._2.client_userid;
                        return new Tuple2<String, Integer>(key, t._2.userid);
                    }
                });

        // The collected map is copied into a plain HashMap before it is broadcast.
        Map<String, Integer> userIdLookupMap = new HashMap<String, Integer>(userIdLookupRDD.collectAsMap());
        final Broadcast<Map<String, Integer>> broadcastVar = jsc.broadcast(userIdLookupMap);
        try {
            // Unresolvable zero-userid actions are mapped to "" so that they can be counted below.
            JavaRDD<String> json_only_with_zeros = filtered_by_client
                    .map(new Function<Tuple2<String, ActionData>, String>() {

                        @Override
                        public String call(Tuple2<String, ActionData> v1) throws Exception {
                            ActionData actionData = v1._2;
                            if (actionData.userid == 0) {
                                String key = currentClient + "_" + actionData.client_userid;
                                Integer knownUserId = broadcastVar.getValue().get(key);
                                if (knownUserId == null) {
                                    return "";
                                }
                                actionData.userid = knownUserId;
                            }
                            return JobUtils.getJsonFromActionData(actionData);
                        }
                    });

            JavaRDD<String> json_only = json_only_with_zeros.filter(new Function<String, Boolean>() {

                @Override
                public Boolean call(String v1) throws Exception {
                    return Boolean.valueOf(v1.length() != 0);
                }
            });

            if (gzipOutput) {
                json_only.saveAsTextFile(outputPath, org.apache.hadoop.io.compress.GzipCodec.class);
            } else {
                json_only.saveAsTextFile(outputPath);
            }
            long json_only_count = json_only.count();
            clientDetailZeroQueue
                    .add(new ClientDetail(currentClient, json_only_with_zeros.count() - json_only_count));
            clientDetailQueue.add(new ClientDetail(currentClient, json_only_count));
        } finally {
            // One broadcast is created per client; release it once this client's jobs are done.
            broadcastVar.unpersist();
        }
    }

    /** Creates a queue that hands out {@link ClientDetail}s with the largest item count first. */
    private static Queue<ClientDetail> newDescendingCountQueue() {
        return new PriorityQueue<ClientDetail>(30, new Comparator<ClientDetail>() {

            @Override
            public int compare(ClientDetail o1, ClientDetail o2) {
                if (o1.itemCount > o2.itemCount) {
                    return -1;
                } else if (o1.itemCount < o2.itemCount) {
                    return 1;
                }
                return 0;
            }
        });
    }

    /** Prints {@code header} followed by one "client: count" line per entry, emptying the queue. */
    private static void printAndDrain(String header, Queue<ClientDetail> queue) {
        System.out.println(header);
        while (!queue.isEmpty()) {
            ClientDetail clientDetail = queue.remove();
            System.out.println(String.format("%s: %d", clientDetail.client, clientDetail.itemCount));
        }
    }

    /**
     * Builds the output location for one client and day:
     * {@code <output_path_dir>/<client>/actions/<unixDays>}.
     *
     * @param output_path_dir root output directory
     * @param unixDays        day number appended as the last path element
     * @param client          client name used as the first path element below the root
     * @return the assembled path; the parts are joined as given, without normalisation
     */
    public static String getOutputPath(String output_path_dir, long unixDays, String client) {
        final StringBuilder path = new StringBuilder();
        path.append(output_path_dir).append('/');
        path.append(client).append("/actions/");
        path.append(unixDays);
        return path.toString();
    }
}