com.cloudera.oryx.lazarus.batch.ExampleBatchLayerUpdate.java Source code

Java tutorial

Introduction

Here is the source code for com.cloudera.oryx.lazarus.batch.ExampleBatchLayerUpdate.java

Source

/*
 * Copyright (c) 2014, Cloudera, Inc. All Rights Reserved.
 *
 * Cloudera, Inc. licenses this file to you under the Apache License,
 * Version 2.0 (the "License"). You may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * This software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 * CONDITIONS OF ANY KIND, either express or implied. See the License for
 * the specific language governing permissions and limitations under the
 * License.
 */

package com.cloudera.oryx.lazarus.batch;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFlatMapFunction;
import scala.Tuple2;

import com.cloudera.oryx.api.TopicProducer;
import com.cloudera.oryx.api.batch.BatchLayerUpdate;

/**
 * Example batch-layer update that builds a word co-occurrence model.
 *
 * <p>Input keys are ignored. Values are treated as lines of space-separated text. The job
 * counts, for each word, the number of distinct other words that co-occur in some line
 * of text in the input. These are written as a "MODEL" update, where the word-count mapping
 * is written as a JSON string.
 */
public final class ExampleBatchLayerUpdate implements BatchLayerUpdate<String, String, String> {

    /**
     * Recomputes the co-occurrence counts over all available data and publishes them.
     *
     * @param sparkContext Spark context; not used by this implementation
     * @param timestamp timestamp of this update; not used by this implementation
     * @param newData data that arrived in the current interval
     * @param pastData previously seen data, or {@code null} if there is none, in which case
     *  only {@code newData} is used
     * @param modelDirString model directory; not used by this implementation
     * @param modelUpdateTopic topic that receives one message with key {@code "MODEL"} and
     *  the JSON-serialized word-count mapping as its value
     * @throws IOException if the word-count mapping cannot be serialized to JSON
     */
    @Override
    public void runUpdate(JavaSparkContext sparkContext, long timestamp, JavaPairRDD<String, String> newData,
            JavaPairRDD<String, String> pastData, String modelDirString,
            TopicProducer<String, String> modelUpdateTopic) throws IOException {
        JavaPairRDD<String, String> allData = pastData == null ? newData : newData.union(pastData);
        String modelString;
        try {
            modelString = new ObjectMapper().writeValueAsString(countDistinctOtherWords(allData));
        } catch (JsonProcessingException jpe) {
            throw new IOException(jpe);
        }
        modelUpdateTopic.send("MODEL", modelString);
    }

    /**
     * Counts, for each word, how many distinct other words share at least one line with it.
     *
     * <p>Keys of {@code data} are ignored; each value is split on single spaces. Empty tokens
     * (produced by leading or repeated spaces) are not treated as words. A word that never
     * shares a line with a different word does not appear in the result.
     *
     * @param data pairs whose values are lines of space-separated text
     * @return mapping from word to the number of distinct other words it co-occurs with
     */
    public static Map<String, Integer> countDistinctOtherWords(JavaPairRDD<String, String> data) {
        // After distinct(), each (word, otherWord) pair occurs exactly once, so the number of
        // pairs per key is the number of distinct co-occurring words. countByKey() yields that
        // directly, without first gathering every value per key as groupByKey() does.
        Map<String, ?> pairsPerWord = data.values().flatMapToPair(
                new PairFlatMapFunction<String, String, String>() {
            @Override
            public Iterable<Tuple2<String, String>> call(String line) {
                Set<String> distinctTokens = new HashSet<>(Arrays.asList(line.split(" ")));
                // Leading or repeated spaces make split() emit empty strings; those are not words.
                distinctTokens.remove("");
                List<Tuple2<String, String>> result = new ArrayList<>();
                for (String a : distinctTokens) {
                    for (String b : distinctTokens) {
                        if (!a.equals(b)) {
                            result.add(new Tuple2<>(a, b));
                        }
                    }
                }
                return result;
            }
        }).distinct().countByKey();

        // The declared value type of countByKey() differs between Spark releases (Object vs.
        // Long), so values are read as Number and narrowed to the Integer this method returns.
        Map<String, Integer> counts = new HashMap<>(pairsPerWord.size());
        for (Map.Entry<String, ?> entry : pairsPerWord.entrySet()) {
            counts.put(entry.getKey(), ((Number) entry.getValue()).intValue());
        }
        return counts;
    }

}