com.linkedin.thirdeye.hadoop.ThirdEyeJob.java Source code

Here is the source code for com.linkedin.thirdeye.hadoop.ThirdEyeJob.java.

Source

/**
 * Copyright (C) 2014-2015 LinkedIn Corp. (pinot-core@linkedin.com)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.linkedin.thirdeye.hadoop;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.lang.reflect.Constructor;
import java.lang.reflect.Method;
import java.util.Properties;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobStatus;
import org.joda.time.DateTime;
import org.joda.time.format.ISODateTimeFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.linkedin.thirdeye.hadoop.aggregation.AggregationPhaseConstants;
import com.linkedin.thirdeye.hadoop.aggregation.AggregationPhaseJob;
import com.linkedin.thirdeye.hadoop.backfill.BackfillPhaseConstants;
import com.linkedin.thirdeye.hadoop.backfill.BackfillPhaseJob;
import com.linkedin.thirdeye.hadoop.config.ThirdEyeConstants;
import com.linkedin.thirdeye.hadoop.derivedcolumn.transformation.DerivedColumnTransformationPhaseConstants;
import com.linkedin.thirdeye.hadoop.derivedcolumn.transformation.DerivedColumnTransformationPhaseJob;
import com.linkedin.thirdeye.hadoop.join.JoinPhaseJob;
import com.linkedin.thirdeye.hadoop.push.SegmentPushPhase;
import com.linkedin.thirdeye.hadoop.push.SegmentPushPhaseConstants;
import com.linkedin.thirdeye.hadoop.segment.creation.SegmentCreationPhaseConstants;
import com.linkedin.thirdeye.hadoop.segment.creation.SegmentCreationPhaseJob;
import com.linkedin.thirdeye.hadoop.topk.TopKPhaseConstants;
import com.linkedin.thirdeye.hadoop.topk.TopKPhaseJob;
import com.linkedin.thirdeye.hadoop.transform.TransformPhaseJob;
import com.linkedin.thirdeye.hadoop.wait.WaitPhaseJob;

/**
 * Wrapper that manages the segment creation and segment push jobs for ThirdEye
 */
public class ThirdEyeJob {
    private static final Logger LOGGER = LoggerFactory.getLogger(ThirdEyeJob.class);

    private static final String USAGE = "usage: phase_name job.properties";

    private final String phaseName;
    private final Properties inputConfig;

    public ThirdEyeJob(String jobName, Properties config) {
        String phaseFromConfig = config.getProperty(ThirdEyeJobProperties.THIRDEYE_PHASE.getName());
        if (phaseFromConfig != null) {
            this.phaseName = phaseFromConfig;
        } else {
            this.phaseName = jobName;
        }
        this.inputConfig = config;
    }

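    /**
     * The phases of the ThirdEye pipeline. Each phase knows the job class that
     * implements it, a short description for the usage message, and how to
     * build its job properties from the input configuration.
     */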
    private enum PhaseSpec {

        BACKFILL {
            @Override
            Class<?> getKlazz() {
                return BackfillPhaseJob.class;
            }

            @Override
            String getDescription() {
                return "Backfills older pinot segments with star tree index and topk information";
            }

            @Override
            Properties getJobProperties(Properties inputConfig, String root, String collection, DateTime minTime,
                    DateTime maxTime, String inputPaths) throws Exception {
                Properties config = new Properties();

                config.setProperty(BackfillPhaseConstants.BACKFILL_PHASE_CONTROLLER_HOST.toString(),
                        inputConfig.getProperty(ThirdEyeJobProperties.THIRDEYE_PINOT_CONTROLLER_HOSTS.getName()));
                config.setProperty(BackfillPhaseConstants.BACKFILL_PHASE_CONTROLLER_PORT.toString(),
                        inputConfig.getProperty(ThirdEyeJobProperties.THIRDEYE_PINOT_CONTROLLER_PORT.getName()));

                config.setProperty(BackfillPhaseConstants.BACKFILL_PHASE_START_TIME.toString(),
                        inputConfig.getProperty(ThirdEyeJobProperties.THIRDEYE_BACKFILL_START_TIME.getName()));
                config.setProperty(BackfillPhaseConstants.BACKFILL_PHASE_END_TIME.toString(),
                        inputConfig.getProperty(ThirdEyeJobProperties.THIRDEYE_BACKFILL_END_TIME.getName()));

                config.setProperty(BackfillPhaseConstants.BACKFILL_PHASE_OUTPUT_PATH.toString(),
                        getIndexDir(root, collection, minTime, maxTime) + File.separator + BACKFILL.getName());
                config.setProperty(BackfillPhaseConstants.BACKFILL_PHASE_TABLE_NAME.toString(), collection);

                return config;
            }
        },
        WAIT {
            @Override
            Class<?> getKlazz() {
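                // WAIT is handled directly in run(), so it has no standalone job class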
                return null;
            }

            @Override
            String getDescription() {
                return "Polls a pre-determined amount of time for the existence of input paths";
            }

            @Override
            Properties getJobProperties(Properties inputConfig, String root, String collection, DateTime minTime,
                    DateTime maxTime, String inputPaths) throws Exception {
                return null;
            }
        },
        JOIN {
            @Override
            Class<?> getKlazz() {
                return JoinPhaseJob.class;
            }

            @Override
            String getDescription() {
                return "Joins multiple data sets based on join key";
            }

            @Override
            Properties getJobProperties(Properties inputConfig, String root, String collection, DateTime minTime,
                    DateTime maxTime, String inputPaths) {
                return inputConfig;
            }
        },
        TRANSFORM {
            @Override
            Class<?> getKlazz() {
                return TransformPhaseJob.class;
            }

            @Override
            String getDescription() {
                return "Transforms avro record";
            }

            @Override
            Properties getJobProperties(Properties inputConfig, String root, String collection, DateTime minTime,
                    DateTime maxTime, String inputPaths) {
                return inputConfig;
            }
        },
        AGGREGATION {
            @Override
            Class<?> getKlazz() {
                return AggregationPhaseJob.class;
            }

            @Override
            String getDescription() {
                return "Aggregates input avro data to another time granularity";
            }

            @Override
            Properties getJobProperties(Properties inputConfig, String root, String collection, DateTime minTime,
                    DateTime maxTime, String inputPaths) throws Exception {
                Properties config = new Properties();

                config.setProperty(AggregationPhaseConstants.AGG_PHASE_INPUT_PATH.toString(), inputPaths);
                config.setProperty(AggregationPhaseConstants.AGG_PHASE_OUTPUT_PATH.toString(),
                        getIndexDir(root, collection, minTime, maxTime) + File.separator + AGGREGATION.getName());

                return config;
            }
        },
        TOPK {
            @Override
            Class<?> getKlazz() {
                return TopKPhaseJob.class;
            }

            @Override
            String getDescription() {
                return "Topk";
            }

            @Override
            Properties getJobProperties(Properties inputConfig, String root, String collection, DateTime minTime,
                    DateTime maxTime, String inputPaths) throws Exception {
                Properties config = new Properties();

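                // If the aggregation phase has already run, use its output instead of the raw input paths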
                Path aggOutputPath = new Path(
                        getIndexDir(root, collection, minTime, maxTime) + File.separator + AGGREGATION.getName());
                FileSystem fs = FileSystem.get(new Configuration());
                if (fs.exists(aggOutputPath)) {
                    inputPaths = aggOutputPath.toString();
                }
                config.setProperty(TopKPhaseConstants.TOPK_PHASE_INPUT_PATH.toString(), inputPaths);
                config.setProperty(TopKPhaseConstants.TOPK_PHASE_OUTPUT_PATH.toString(),
                        getIndexDir(root, collection, minTime, maxTime) + File.separator + TOPK.getName());

                return config;
            }
        },
        DERIVED_COLUMN_TRANSFORMATION {
            @Override
            Class<?> getKlazz() {
                return DerivedColumnTransformationPhaseJob.class;
            }

            @Override
            String getDescription() {
                return "Adds new columns for dimensions with topk or whitelist";
            }

            @Override
            Properties getJobProperties(Properties inputConfig, String root, String collection, DateTime minTime,
                    DateTime maxTime, String inputPaths) throws Exception {
                Properties config = new Properties();

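                // As in the topk phase, prefer the aggregation phase's output when it exists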
                Path aggOutputPath = new Path(
                        getIndexDir(root, collection, minTime, maxTime) + File.separator + AGGREGATION.getName());
                FileSystem fs = FileSystem.get(new Configuration());
                if (fs.exists(aggOutputPath)) {
                    inputPaths = aggOutputPath.toString();
                }
                config.setProperty(
                        DerivedColumnTransformationPhaseConstants.DERIVED_COLUMN_TRANSFORMATION_PHASE_INPUT_PATH
                                .toString(),
                        inputPaths);
                config.setProperty(
                        DerivedColumnTransformationPhaseConstants.DERIVED_COLUMN_TRANSFORMATION_PHASE_OUTPUT_SCHEMA
                                .toString(),
                        getIndexDir(root, collection, minTime, maxTime));
                config.setProperty(
                        DerivedColumnTransformationPhaseConstants.DERIVED_COLUMN_TRANSFORMATION_PHASE_OUTPUT_PATH
                                .toString(),
                        getIndexDir(root, collection, minTime, maxTime) + File.separator
                                + DERIVED_COLUMN_TRANSFORMATION.getName());
                config.setProperty(
                        DerivedColumnTransformationPhaseConstants.DERIVED_COLUMN_TRANSFORMATION_PHASE_TOPK_PATH
                                .toString(),
                        getIndexDir(root, collection, minTime, maxTime) + File.separator + TOPK.getName());

                return config;
            }
        },
        SEGMENT_CREATION {
            @Override
            Class<?> getKlazz() {
                return SegmentCreationPhaseJob.class;
            }

            @Override
            String getDescription() {
                return "Generates pinot segments";
            }

            @Override
            Properties getJobProperties(Properties inputConfig, String root, String collection, DateTime minTime,
                    DateTime maxTime, String inputPaths) throws Exception {
                Properties config = new Properties();

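                // Pick the most processed input available: derived column output first,
                // then aggregation output, then the raw input paths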
                Path derivedOutputPath = new Path(getIndexDir(root, collection, minTime, maxTime) + File.separator
                        + DERIVED_COLUMN_TRANSFORMATION.getName());
                Path aggregationOutputPath = new Path(
                        getIndexDir(root, collection, minTime, maxTime) + File.separator + AGGREGATION.getName());
                FileSystem fs = FileSystem.get(new Configuration());
                if (fs.exists(derivedOutputPath)) {
                    inputPaths = derivedOutputPath.toString();
                } else if (fs.exists(aggregationOutputPath)) {
                    inputPaths = aggregationOutputPath.toString();
                }

                config.setProperty(SegmentCreationPhaseConstants.SEGMENT_CREATION_INPUT_PATH.toString(),
                        inputPaths);
                config.setProperty(SegmentCreationPhaseConstants.SEGMENT_CREATION_OUTPUT_PATH.toString(),
                        getIndexDir(root, collection, minTime, maxTime) + File.separator
                                + SEGMENT_CREATION.getName());
                config.setProperty(SegmentCreationPhaseConstants.SEGMENT_CREATION_WALLCLOCK_START_TIME.toString(),
                        String.valueOf(minTime.getMillis()));
                config.setProperty(SegmentCreationPhaseConstants.SEGMENT_CREATION_WALLCLOCK_END_TIME.toString(),
                        String.valueOf(maxTime.getMillis()));

                String schedule = inputConfig.getProperty(ThirdEyeJobProperties.THIRDEYE_FLOW_SCHEDULE.getName());
                config.setProperty(SegmentCreationPhaseConstants.SEGMENT_CREATION_SCHEDULE.toString(), schedule);
                return config;
            }
        },
        SEGMENT_PUSH {
            @Override
            Class<?> getKlazz() {
                return SegmentPushPhase.class;
            }

            @Override
            String getDescription() {
                return "Pushes pinot segments to pinot controller";
            }

            @Override
            Properties getJobProperties(Properties inputConfig, String root, String collection, DateTime minTime,
                    DateTime maxTime, String inputPaths) throws Exception {
                Properties config = new Properties();

                config.setProperty(SegmentPushPhaseConstants.SEGMENT_PUSH_INPUT_PATH.toString(),
                        getIndexDir(root, collection, minTime, maxTime) + File.separator
                                + SEGMENT_CREATION.getName());
                config.setProperty(SegmentPushPhaseConstants.SEGMENT_PUSH_CONTROLLER_HOSTS.toString(),
                        inputConfig.getProperty(ThirdEyeJobProperties.THIRDEYE_PINOT_CONTROLLER_HOSTS.getName()));
                config.setProperty(SegmentPushPhaseConstants.SEGMENT_PUSH_CONTROLLER_PORT.toString(),
                        inputConfig.getProperty(ThirdEyeJobProperties.THIRDEYE_PINOT_CONTROLLER_PORT.getName()));
                return config;
            }
        };

        abstract Class<?> getKlazz();

        abstract String getDescription();

        abstract Properties getJobProperties(Properties inputConfig, String root, String collection,
                DateTime minTime, DateTime maxTime, String inputPaths) throws Exception;

        String getName() {
            return this.name().toLowerCase();
        }

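        // Index directories are laid out as <root>/<collection>/data_<minTime>_<maxTime>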
        String getIndexDir(String root, String collection, DateTime minTime, DateTime maxTime) throws IOException {
            return getCollectionDir(root, collection) + File.separator + "data_"
                    + ThirdEyeConstants.DATE_TIME_FORMATTER.print(minTime) + "_"
                    + ThirdEyeConstants.DATE_TIME_FORMATTER.print(maxTime);
        }

    }

    private static void usage() {
        System.err.println(USAGE);
        for (PhaseSpec phase : PhaseSpec.values()) {
            System.err.printf("%-30s : %s%n", phase.getName(), phase.getDescription());
        }
    }

    private static String getAndCheck(String name, Properties properties) {
        String value = properties.getProperty(name);
        if (value == null) {
            throw new IllegalArgumentException("Must provide " + name);
        }
        return value;
    }

    private static String getCollectionDir(String root, String collection) {
        return root == null ? collection : root + File.separator + collection;
    }

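    /**
     * Applies extra MapReduce settings from the input config, given as
     * comma-separated key=value pairs, e.g.
     * "mapreduce.map.memory.mb=2048,mapreduce.reduce.memory.mb=4096".
     */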
    private void setMapreduceConfig(Configuration configuration) {
        String mapreduceConfig = inputConfig.getProperty(ThirdEyeJobProperties.THIRDEYE_MR_CONF.getName());
        if (mapreduceConfig != null && !mapreduceConfig.isEmpty()) {
            String[] options = mapreduceConfig.split(",");
            for (String option : options) {
                String[] configs = option.split("=", 2);
                if (configs.length == 2) {
                    LOGGER.info("Setting job configuration {} to {}", configs[0], configs[1]);
                    configuration.set(configs[0], configs[1]);
                }
            }
        }
    }

    @SuppressWarnings("unchecked")
    public void run() throws Exception {
        LOGGER.info("Input config:{}", inputConfig);
        PhaseSpec phaseSpec;
        try {
            phaseSpec = PhaseSpec.valueOf(phaseName.toUpperCase());
        } catch (Exception e) {
            usage();
            throw e;
        }

        if (PhaseSpec.TRANSFORM.equals(phaseSpec)) {
            TransformPhaseJob job = new TransformPhaseJob("Transform Job", inputConfig);
            job.run();
            return;

        } else if (PhaseSpec.JOIN.equals(phaseSpec)) {
            JoinPhaseJob job = new JoinPhaseJob("Join Job", inputConfig);
            job.run();
            return;

        } else if (PhaseSpec.WAIT.equals(phaseSpec)) {
            WaitPhaseJob job = new WaitPhaseJob("Wait for inputs", inputConfig);
            job.run();
            return;
        }

        // Get root, collection, input paths
        String root = getAndCheck(ThirdEyeJobProperties.THIRDEYE_ROOT.getName(), inputConfig);
        String collection = getAndCheck(ThirdEyeJobProperties.THIRDEYE_COLLECTION.getName(), inputConfig);
        String inputPaths = getAndCheck(ThirdEyeJobProperties.INPUT_PATHS.getName(), inputConfig);

        // Get min / max time (parsed as ISO-8601 timestamps)
        String minTimeProp = inputConfig.getProperty(ThirdEyeJobProperties.THIRDEYE_TIME_MIN.getName());
        String maxTimeProp = inputConfig.getProperty(ThirdEyeJobProperties.THIRDEYE_TIME_MAX.getName());
        DateTime minTime = ISODateTimeFormat.dateTimeParser().parseDateTime(minTimeProp);
        DateTime maxTime = ISODateTimeFormat.dateTimeParser().parseDateTime(maxTimeProp);

        Properties jobProperties = phaseSpec.getJobProperties(inputConfig, root, collection, minTime, maxTime,
                inputPaths);
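        // Overlay the raw input config on top of the phase-specific properties
        // (input config values win on key collisions)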
        for (Object key : inputConfig.keySet()) {
            jobProperties.setProperty(key.toString(), inputConfig.getProperty(key.toString()));
        }

        // Instantiate the job
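        // Every phase job class is expected to expose a (String, Properties)
        // constructor and a run() method that returns the Hadoop Job (or null).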
        Constructor<Configured> constructor = (Constructor<Configured>) phaseSpec.getKlazz()
                .getConstructor(String.class, Properties.class);
        Configured instance = constructor.newInstance(phaseSpec.getName(), jobProperties);
        setMapreduceConfig(instance.getConf());

        // Run the job
        Method runMethod = instance.getClass().getMethod("run");
        Job job = (Job) runMethod.invoke(instance);
        if (job != null) {
            JobStatus status = job.getStatus();
            if (status.getState() != JobStatus.State.SUCCEEDED) {
                throw new RuntimeException(
                        "Job " + job.getJobName() + " failed to execute. Ran with config: " + jobProperties);
            }
        }
    }

    public static void main(String[] args) throws Exception {
        if (args.length != 2) {
            usage();
            System.exit(1);
        }

        String phaseName = args[0];
        Properties config = new Properties();
        // Use try-with-resources so the stream is closed even if load() throws
        try (FileInputStream fis = new FileInputStream(args[1])) {
            config.load(fis);
        }
        new ThirdEyeJob(phaseName, config).run();
    }
}
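
Example usage

A minimal, hypothetical sketch of driving the job programmatically instead of through main(). The property values are placeholders, and the snippet assumes it lives in the same package as ThirdEyeJob so that the key names can be resolved through the ThirdEyeJobProperties enum the source already uses:

package com.linkedin.thirdeye.hadoop;

import java.util.Properties;

public class ThirdEyeJobExample {
    public static void main(String[] args) throws Exception {
        Properties config = new Properties();
        // Required root / collection / input settings (placeholder values)
        config.setProperty(ThirdEyeJobProperties.THIRDEYE_ROOT.getName(), "/data/thirdeye");
        config.setProperty(ThirdEyeJobProperties.THIRDEYE_COLLECTION.getName(), "myCollection");
        config.setProperty(ThirdEyeJobProperties.INPUT_PATHS.getName(), "/data/input/avro");
        // Min / max times are parsed with ISODateTimeFormat, so ISO-8601 is expected
        config.setProperty(ThirdEyeJobProperties.THIRDEYE_TIME_MIN.getName(), "2015-01-01T00:00:00Z");
        config.setProperty(ThirdEyeJobProperties.THIRDEYE_TIME_MAX.getName(), "2015-01-02T00:00:00Z");
        // Run the aggregation phase; any PhaseSpec name (lowercased) works here
        new ThirdEyeJob("aggregation", config).run();
    }
}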