edu.indiana.soic.ts.streaming.dataflow.StockAnalysisPipeline1.java Source code

Java tutorial

Introduction

Here is the source code for edu.indiana.soic.ts.streaming.dataflow.StockAnalysisPipeline1.java

Source

/*
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 *
*/
package edu.indiana.soic.ts.streaming.dataflow;

import com.google.cloud.dataflow.sdk.Pipeline;
import com.google.cloud.dataflow.sdk.io.TextIO;
import com.google.cloud.dataflow.sdk.options.Default;
import com.google.cloud.dataflow.sdk.options.Description;
import com.google.cloud.dataflow.sdk.options.PipelineOptions;
import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
import com.google.cloud.dataflow.sdk.transforms.Combine;
import com.google.cloud.dataflow.sdk.transforms.DoFn;
import com.google.cloud.dataflow.sdk.transforms.GroupByKey;
import com.google.cloud.dataflow.sdk.transforms.ParDo;
import com.google.cloud.dataflow.sdk.transforms.windowing.SlidingWindows;
import com.google.cloud.dataflow.sdk.transforms.windowing.Window;
import com.google.cloud.dataflow.sdk.values.KV;
import com.google.cloud.dataflow.sdk.values.PCollection;
import com.google.cloud.dataflow.sdk.values.PCollectionView;
import edu.indiana.soic.ts.streaming.dataflow.utils.DistanceMatrix;
import edu.indiana.soic.ts.streaming.dataflow.utils.StockPricePoint;
import edu.indiana.soic.ts.streaming.dataflow.utils.SymbolEncoder;
import org.joda.time.Duration;
import org.joda.time.Instant;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.*;

/**
 * Dataflow pipeline that reads comma-separated stock price records from a text file, assigns each
 * record an event timestamp taken from its date field, windows the records, groups them per company
 * and emits one distance entry for every pair of companies seen in a window. The entries are combined
 * into {@link DistanceMatrix} values which are written out as text.
 *
 * <p>The distance computation itself is still a TODO: every pair is currently emitted with 0.0.
 */
public class StockAnalysisPipeline1 {
    private static final Logger logger = LoggerFactory.getLogger(StockAnalysisPipeline1.class);

    /** Length of each window, in days (passed to {@code Duration.standardDays}). */
    public static int WINDOW_LENGTH = 360;

    /**
     * Period between the starts of consecutive windows, in days. With the default values this equals
     * {@link #WINDOW_LENGTH}, so consecutive windows start exactly one window length apart.
     */
    public static int SLIDING_INTERVAL = 360;

    /** Command-line options understood by this pipeline. */
    public interface StockAnalysisPipelineOptions extends PipelineOptions {
        /** Returns the location the stock price records are read from; defaults to the empty string. */
        @Description("Path to input file")
        @Default.String("")
        String getInputFilePath();

        void setInputFilePath(String value);

        /** Returns the location the distance matrices are written to; defaults to the empty string. */
        @Description("Output file path")
        @Default.String("")
        String getOutputFilePath();

        void setOutputFilePath(String value);
    }

    /**
     * Builds the pipeline from the command-line options, runs it and then terminates the JVM with
     * exit status 0.
     *
     * @param args command-line arguments, parsed into {@link StockAnalysisPipelineOptions}
     * @throws IOException if set-up fails with an I/O error (presumably from creating the
     *                     {@link SymbolEncoder} - confirm against that class)
     */
    public static void main(String[] args) throws IOException {
        final SymbolEncoder symbolEncoder = new SymbolEncoder();

        StockAnalysisPipelineOptions options = PipelineOptionsFactory.fromArgs(args)
                .as(StockAnalysisPipelineOptions.class);
        Pipeline pipeline = Pipeline.create(options);

        //Reading and time stamping the stock prices
        PCollection<KV<Integer, StockPricePoint>> stockPrices = pipeline
                .apply(TextIO.Read.named("Reading Input File").from(options.getInputFilePath()))
                .apply(ParDo.named("Timestamping").of(new DoFn<String, KV<Integer, StockPricePoint>>() {
                    @Override
                    public void processElement(ProcessContext c) throws Exception {
                        String line = c.element();
                        try {
                            // Columns used: 0 = id, 1 = date (yyyyMMdd), 2 = symbol, 5 = price, 6 = cap
                            String[] fields = line.split(",");
                            StockPricePoint stockPoint = new StockPricePoint();
                            stockPoint.setId(fields[0]);
                            // SimpleDateFormat is not thread-safe, so a fresh instance is created per record
                            SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMdd");
                            stockPoint.setDate(sdf.parse(fields[1].trim()));
                            stockPoint.setSymbol(fields[2]);
                            stockPoint.setPrice(Double.parseDouble(fields[5].trim()));
                            stockPoint.setCap(Double.parseDouble(fields[6].trim()));
                            // The record's date becomes its event timestamp, which drives the windowing below
                            Instant instant = new Instant(stockPoint.getDate().getTime());

                            //debugging - we cannot handle large amounts of data when using local runner
                            //int index = symbolEncoder.getSymbolIndex(stockPoint.getSymbol());
                            //if(index > 1000 && index < 1100)
                            c.outputWithTimestamp(
                                    KV.of(symbolEncoder.getSymbolIndex(stockPoint.getSymbol()), stockPoint),
                                    instant);
                        } catch (Exception ex) {
                            // Input format issue: the record is still skipped so one bad line cannot fail
                            // the whole run, but it is logged so dropped data is visible
                            logger.warn("Skipping malformed input record '{}': {}", line, ex.toString());
                        }
                    }
                }));

        //creating the sliding windows
        PCollection<KV<Integer, StockPricePoint>> slidingWindowStockPrices = stockPrices
                .apply(Window.named("Windowing").<KV<Integer, StockPricePoint>>into(SlidingWindows
                        .of(Duration.standardDays(WINDOW_LENGTH)).every(Duration.standardDays(SLIDING_INTERVAL))));

        //combining stock prices per company per window
        PCollection<KV<Integer, List<StockPricePoint>>> stockPricesPerCompanyPerWindow = slidingWindowStockPrices
                .apply(GroupByKey.create()).apply(ParDo.named("Combining By Company")
                        .of(new DoFn<KV<Integer, Iterable<StockPricePoint>>, KV<Integer, List<StockPricePoint>>>() {
                            @Override
                            public void processElement(ProcessContext c) throws Exception {
                                // Copy the grouped values into a concrete list
                                List<StockPricePoint> stockPricePoints = new ArrayList<>();
                                for (StockPricePoint stockPricePoint : c.element().getValue()) {
                                    stockPricePoints.add(stockPricePoint);
                                }
                                c.output(KV.of(c.element().getKey(), stockPricePoints));
                            }
                        }));

        //accumulating companies per window
        PCollectionView<Set<Integer>> companiesPerWindow = slidingWindowStockPrices.apply(
                Combine.globally(new Combine.CombineFn<KV<Integer, StockPricePoint>, Set<Integer>, Set<Integer>>() {
                    @Override
                    public Set<Integer> createAccumulator() {
                        return new HashSet<>();
                    }

                    @Override
                    public Set<Integer> addInput(Set<Integer> indices,
                            KV<Integer, StockPricePoint> integerStockPricePointKV) {
                        indices.add(integerStockPricePointKV.getKey());
                        return indices;
                    }

                    @Override
                    public Set<Integer> mergeAccumulators(Iterable<Set<Integer>> iterable) {
                        // Union of all partial sets
                        Set<Integer> merged = new HashSet<>();
                        for (Set<Integer> partial : iterable) {
                            merged.addAll(partial);
                        }
                        return merged;
                    }

                    @Override
                    public Set<Integer> extractOutput(Set<Integer> indices) {
                        return indices;
                    }
                }).named("Combine By Window").asSingletonView());

        //duplicate the company entries in each window to create distance matrix entries
        PCollection<KV<String, List<StockPricePoint>>> explodedEntries = stockPricesPerCompanyPerWindow
                .apply(ParDo.named("Duplicating Entries").withSideInputs(companiesPerWindow)
                        .of(new DoFn<KV<Integer, List<StockPricePoint>>, KV<String, List<StockPricePoint>>>() {
                            @Override
                            public void processElement(ProcessContext c) throws Exception {
                                Set<Integer> indices = c.sideInput(companiesPerWindow);
                                Integer key = c.element().getKey();
                                List<StockPricePoint> stockPricePoints = c.element().getValue();
                                for (Integer other : indices) {
                                    // we generate only the lower half. The distance matrix is symmetric,
                                    // so the pair key is always "<larger index>_<smaller index>" and the
                                    // company is never paired with itself
                                    if (key > other) {
                                        c.output(KV.of(key + "_" + other, stockPricePoints));
                                    } else if (other > key) {
                                        c.output(KV.of(other + "_" + key, stockPricePoints));
                                    }
                                }
                            }
                        }));

        //grouping two entries to create a distance entry in the matrix and calculating the distance
        PCollection<KV<String, Double>> distances = explodedEntries.apply(GroupByKey.create())
                .apply(ParDo.named("Calculate Distances")
                        .of(new DoFn<KV<String, Iterable<List<StockPricePoint>>>, KV<String, Double>>() {
                            @Override
                            public void processElement(ProcessContext processContext) throws Exception {
                                // Split the "<x>_<y>" pair key once and reuse both halves
                                String[] pair = processContext.element().getKey().split("_");
                                int keyX = Integer.parseInt(pair[0]);
                                int keyY = Integer.parseInt(pair[1]);
                                Iterator<List<StockPricePoint>> iterator = processContext.element().getValue()
                                        .iterator();
                                // NOTE(review): nothing here ties the first/second series to keyX/keyY;
                                // harmless for a symmetric distance, but confirm when implementing the TODO
                                List<StockPricePoint> stockPricesX = iterator.next();
                                List<StockPricePoint> stockPricesY = iterator.next();
                                //TODO calculate distance
                                processContext.output(KV.of(keyX + "_" + keyY, 0.0));
                            }
                        }));

        //formulate the distance matrix
        PCollection<DistanceMatrix> distanceMatrix = distances.apply(
                Combine.globally(new Combine.CombineFn<KV<String, Double>, DistanceMatrix, DistanceMatrix>() {
                    @Override
                    public DistanceMatrix createAccumulator() {
                        return new DistanceMatrix();
                    }

                    @Override
                    public DistanceMatrix addInput(DistanceMatrix distanceMatrix,
                            KV<String, Double> stringDoubleKV) {
                        // Split the "<x>_<y>" pair key once and reuse both halves
                        String[] pair = stringDoubleKV.getKey().split("_");
                        distanceMatrix.addPoint(Integer.parseInt(pair[0]), Integer.parseInt(pair[1]),
                                stringDoubleKV.getValue());
                        return distanceMatrix;
                    }

                    @Override
                    public DistanceMatrix mergeAccumulators(Iterable<DistanceMatrix> iterable) {
                        DistanceMatrix merged = new DistanceMatrix();
                        for (DistanceMatrix partial : iterable) {
                            merged.merge(partial);
                        }
                        return merged;
                    }

                    @Override
                    public DistanceMatrix extractOutput(DistanceMatrix distanceMatrix) {
                        return distanceMatrix;
                    }
                }).named("Combine Distance Matrix").withoutDefaults());

        //write to file
        distanceMatrix.apply(ParDo.named("Matrix to String").of(new DoFn<DistanceMatrix, String>() {
            @Override
            public void processElement(ProcessContext processContext) throws Exception {
                // One text block per matrix: timestamp, distance values, rows, columns
                DistanceMatrix matrix = processContext.element();
                StringBuilder entry = new StringBuilder("<distance-matrix-entry>\n");
                entry.append(processContext.timestamp()).append("\n");
                entry.append(matrix.getDistanceValues().toString()).append("\n");
                entry.append(matrix.getRow().toString()).append("\n");
                entry.append(matrix.getColumn().toString()).append("\n");
                entry.append("</distance-matrix-entry>\n");
                processContext.output(entry.toString());
            }
        })).apply(TextIO.Write.named("Writing Output File").to(options.getOutputFilePath()));

        pipeline.run();
        System.exit(0);
    }
}