edu.indiana.soic.ts.crunch.CrunchDataReader.java Source code

Introduction

Here is the source code for edu.indiana.soic.ts.crunch.CrunchDataReader.java. The class builds an Apache Crunch MRPipeline that scans daily stock values from an HBase table, fits a simple linear regression (price against day index) for each row, and writes the resulting intercept/slope pairs back to a second HBase table as Puts.
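Since the class is a Hadoop Tool, it is normally launched through ToolRunner. Below is a minimal launcher sketch; the first array element is a placeholder (the class only reads the start and end dates from args[1] and args[2]), and the launcher class name is an assumption for illustration.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ToolRunner;

public class CrunchDataReaderLauncher {
    public static void main(String[] args) throws Exception {
        // args[0] is unused by CrunchDataReader; the dates come from args[1] and args[2]
        int exit = ToolRunner.run(new Configuration(),
                new edu.indiana.soic.ts.crunch.CrunchDataReader(),
                new String[]{"unused", "20040102", "20141231"});
        System.exit(exit);
    }
}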

Source

/*
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 *
*/

package edu.indiana.soic.ts.crunch;

import com.google.protobuf.ServiceException;
import edu.indiana.soic.ts.utils.Constants;
import org.apache.commons.math.stat.regression.SimpleRegression;
import org.apache.crunch.*;
import org.apache.crunch.fn.Aggregators;
import org.apache.crunch.impl.mr.MRPipeline;
import org.apache.crunch.io.hbase.HBaseSourceTarget;
import org.apache.crunch.io.hbase.HBaseTarget;
import org.apache.crunch.io.hbase.HBaseTypes;
import org.apache.crunch.types.writable.Writables;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.hbase.*;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.io.Serializable;
import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.*;

public class CrunchDataReader extends Configured implements Tool, Serializable {
    private static final Logger log = LoggerFactory.getLogger(CrunchDataReader.class);
    private static String startDate;
    private static String endDate;

    public static void main(final String[] args) throws Exception {
        final int res = ToolRunner.run(new Configuration(), new CrunchDataReader(), args);
        System.exit(res);
    }

    @Override
    public int run(final String[] args) throws Exception {
        try {
            // Guard against missing arguments so that the defaults below are reachable
            startDate = args.length > 1 ? args[1] : null;
            endDate = args.length > 2 ? args[2] : null;
            if (startDate == null || startDate.isEmpty()) {
                // set 1st starting date
                startDate = "20040102";
            }
            if (endDate == null || endDate.isEmpty()) {
                endDate = "20141231";
            }
            System.out.println("Start Date : " + startDate);
            System.out.println("End Date : " + endDate);
            Configuration config = HBaseConfiguration.create();
            Pipeline pipeline = new MRPipeline(CrunchDataReader.class, config);

            Scan scan = new Scan();
            scan.setCaching(500); // 1 is the default in Scan, which will be bad for MapReduce jobs
            scan.setCacheBlocks(false); // don't set to true for MR jobs
            List<String> suitableDates = getDates();
            if (suitableDates != null && !suitableDates.isEmpty()) {
                for (String date : suitableDates) {
                    scan.addColumn(Constants.STOCK_TABLE_CF_BYTES, date.getBytes());
                }
            }
            createTable();
            // Our HBase source
            HBaseSourceTarget source = new HBaseSourceTarget(Constants.STOCK_TABLE_NAME, scan);

            // Our source, in a format which can be used by Crunch
            PTable<ImmutableBytesWritable, Result> rawText = pipeline.read(source);
            PTable<String, String> stringStringPTable = extractText(rawText);
            PTable<String, String> result1 = stringStringPTable.groupByKey()
                    .combineValues(Aggregators.STRING_CONCAT(" ", true));
            // We create the collection of puts from the concatenated data
            PCollection<Put> resultPut = createPut(result1);

            // We write the puts in hbase, in the target table
            pipeline.write(resultPut, new HBaseTarget(Constants.REGRESSION_TABLE_NAME));

            PipelineResult result = pipeline.done();
            return result.succeeded() ? 0 : 1;
        } catch (ParseException e) {
            log.error("Error while parsing date", e);
            throw new RuntimeException("Error while parsing date", e);
        }
    }

    private static void createTable() throws Exception {
        try {
            // HBaseConfiguration.create() already loads hbase-default.xml and hbase-site.xml
            Configuration configuration = HBaseConfiguration.create();
            HBaseAdmin.checkHBaseAvailable(configuration);
            Connection connection = ConnectionFactory.createConnection(configuration);

            // Instantiating the Admin class
            Admin admin = connection.getAdmin();

            // Instantiating the table descriptor class
            HTableDescriptor stockTableDesc = new HTableDescriptor(
                    TableName.valueOf(Constants.REGRESSION_TABLE_NAME));

            // Adding column families to the table descriptor
            HColumnDescriptor stock_0414 = new HColumnDescriptor(Constants.REGRESSION_TABLE_CF);
            stockTableDesc.addFamily(stock_0414);

            // Create the table through the admin, unless it already exists
            if (!admin.tableExists(stockTableDesc.getTableName())) {
                admin.createTable(stockTableDesc);
                System.out.println("Regression table created !!!");
            }

            // Release the admin and the connection once the table exists
            admin.close();
            connection.close();
        } catch (ServiceException e) {
            log.error("Error occurred while creating HBase tables", e);
            throw new Exception("Error occurred while creating HBase tables", e);
        }
    }

    public PCollection<Put> createPut(PTable<String, String> extractedText) {
        return extractedText.parallelDo("Convert to puts", new DoFn<Pair<String, String>, Put>() {
            @Override
            public void process(Pair<String, String> input, Emitter<Put> emitter) {
                Put put = new Put(Bytes.toBytes(input.first()));
                put.add(Constants.REGRESSION_TABLE_CF.getBytes(), Constants.REGRESSION_TABLE_QUALIFIER.getBytes(),
                        Bytes.toBytes(input.second()));
                emitter.emit(put);
            }
        }, HBaseTypes.puts());
    }

    private void getRows(Scan scan, List<String> suitableDates) throws ServiceException, IOException {
        Configuration configuration = HBaseConfiguration.create();
        HBaseConfiguration.addHbaseResources(configuration);
        HBaseAdmin.checkHBaseAvailable(configuration);
        Connection connection = ConnectionFactory.createConnection(configuration);
        // Instantiating the Admin class
        Admin admin = connection.getAdmin();

        HTableDescriptor[] tableDescriptor = admin.listTables();

        for (HTableDescriptor aTableDescriptor : tableDescriptor) {
            if (aTableDescriptor.getTableName().getNameAsString().equals(Constants.STOCK_TABLE_NAME)) {
                Table table = connection.getTable(aTableDescriptor.getTableName());
                ResultScanner scanner = table.getScanner(scan);
                printRows(scanner, suitableDates);
                // Close the scanner and table once the rows have been printed
                scanner.close();
                table.close();
            }
        }
        admin.close();
        connection.close();
    }

    public static void printRows(ResultScanner resultScanner, List<String> allDates) {
        for (Result aResultScanner : resultScanner) {
            printRow(aResultScanner, allDates);
        }
    }

    public static void printRow(Result result, List<String> allDates) {
        try {
            String rowName = Bytes.toString(result.getRow());
            //if you want to get the entire row
            for (String date : allDates) {
                byte[] value = result.getValue(Constants.STOCK_TABLE_CF_BYTES, date.getBytes());
                if (value != null) {
                    System.out.println("Row Name : " + rowName + " : values : " + new String(value));
                }
            }

        } catch (Exception e) {
            log.error("Error while printing row", e);
        }
    }

    public static List<String> getDates() throws ParseException {
        List<String> allDates = new ArrayList<String>();
        Date startDate = getDate(CrunchDataReader.startDate);
        Date endDate = getDate(CrunchDataReader.endDate);
        ResultScanner scannerForDateTable = getScannerForDateTable();
        if (scannerForDateTable == null) {
            // The dates table could not be read; return an empty list rather than NPE
            return allDates;
        }
        for (Result aResultScanner : scannerForDateTable) {
            String date = new String(aResultScanner.getRow());
            Date rowDate = getDate(date);
            // The product is positive only when both comparisons have the same sign,
            // i.e. when rowDate falls strictly between startDate and endDate
            if (startDate.compareTo(rowDate) * rowDate.compareTo(endDate) > 0) {
                allDates.add(date);
            }
        }
        return allDates;
    }

    public static Date getDate(String date) throws ParseException {
        DateFormat df = new SimpleDateFormat("yyyyMMdd");
        return df.parse(date);
    }

    private static ResultScanner getScannerForDateTable() {
        try {
            Configuration configuration = HBaseConfiguration.create();
            HBaseConfiguration.addHbaseResources(configuration);
            HBaseAdmin.checkHBaseAvailable(configuration);
            Connection connection = ConnectionFactory.createConnection(configuration);
            // Instantiating the Admin class
            Admin admin = connection.getAdmin();

            HTableDescriptor[] tableDescriptor = admin.listTables();
            // printing all the table names.
            for (HTableDescriptor aTableDescriptor : tableDescriptor) {
                if (aTableDescriptor.getTableName().getNameAsString().equals(Constants.STOCK_DATES_TABLE)) {
                    Table table = connection.getTable(aTableDescriptor.getTableName());
                    Scan scan = new Scan();
                    scan.setCaching(20);
                    scan.addFamily(Constants.STOCK_DATES_CF_BYTES);
                    // Leave the connection open: the returned scanner still depends on it
                    return table.getScanner(scan);
                }
            }
        } catch (ServiceException | IOException e) {
            // MasterNotRunningException and ZooKeeperConnectionException are IOException subclasses
            log.error("Error while reading Stock Dates table", e);
        }
        return null;
    }

    public PTable<String, String> extractText(PTable<ImmutableBytesWritable, Result> tableContent) {
        return tableContent.parallelDo("Read data",
                new DoFn<Pair<ImmutableBytesWritable, Result>, Pair<String, String>>() {
                    @Override
                    public void process(Pair<ImmutableBytesWritable, Result> row,
                            Emitter<Pair<String, String>> emitter) {
                        SimpleRegression regression;
                        NavigableMap<byte[], NavigableMap<byte[], NavigableMap<Long, byte[]>>> map = row.second()
                                .getMap();
                        System.out.println("Column family count : " + map.size());
                        for (Map.Entry<byte[], NavigableMap<byte[], NavigableMap<Long, byte[]>>> columnFamilyMap : map
                                .entrySet()) {
                            // One regression per column family: x = day index, y = price
                            regression = new SimpleRegression();
                            int count = 1;
                            for (Map.Entry<byte[], NavigableMap<Long, byte[]>> entryVersion : columnFamilyMap
                                    .getValue().entrySet()) {
                                for (Map.Entry<Long, byte[]> entry : entryVersion.getValue().entrySet()) {
                                    String rowKey = Bytes.toString(row.second().getRow());
                                    String column = Bytes.toString(entryVersion.getKey());
                                    byte[] val = entry.getValue();
                                    String valOfColumn = new String(val);
                                    System.out.println("RowKey : " + rowKey + " Column Key : " + column
                                            + " Column Val : " + valOfColumn);
                                    if (!valOfColumn.isEmpty()) {
                                        String[] priceAndCap = valOfColumn.split("_");
                                        if (priceAndCap.length > 1) {
                                            String pr = priceAndCap[0];
                                            if (pr != null && !pr.equals("null")) {
                                                double price = Double.valueOf(pr);
                                                if (price < 0) {
                                                    // flip negative prices to their absolute value
                                                    price = -price;
                                                }
                                                System.out.println("Price : " + price + " count : " + count);
                                                regression.addData(count, price);
                                            }
                                        }
                                    }
                                }
                                count++;
                            }
                            // displays intercept of regression line
                            System.out.println("Intercept : " + regression.getIntercept());

                            // displays slope of regression line
                            System.out.println("Slope : " + regression.getSlope());

                            // displays slope standard error
                            System.out.println("Slope STD Error : " + regression.getSlopeStdErr());
                            // Emit the (intercept, slope) pair as strings
                            emitter.emit(new Pair<String, String>(String.valueOf(regression.getIntercept()),
                                    String.valueOf(regression.getSlope())));
                        }
                    }
                }, Writables.tableOf(Writables.strings(), Writables.strings()));
    }

}
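To see the regression step in isolation, here is a minimal, self-contained sketch using the same commons-math SimpleRegression class that extractText relies on. The price series is invented for illustration; x is the day index and y the price, exactly as the pipeline feeds addData().

import org.apache.commons.math.stat.regression.SimpleRegression;

public class RegressionSketch {
    public static void main(String[] args) {
        SimpleRegression regression = new SimpleRegression();
        // Invented prices for illustration; the pipeline reads these from HBase cells
        double[] prices = {10.5, 10.9, 11.2, 11.0, 11.6};
        for (int day = 1; day <= prices.length; day++) {
            regression.addData(day, prices[day - 1]);
        }
        System.out.println("Intercept : " + regression.getIntercept());
        System.out.println("Slope : " + regression.getSlope());
        System.out.println("Slope STD Error : " + regression.getSlopeStdErr());
    }
}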