com.linkedin.hadoop.jobs.HdfsWaitJob.java Source code

Java tutorial

Introduction

Here is the source code for com.linkedin.hadoop.jobs.HdfsWaitJob.java

Source

/*
 * Copyright 2015 LinkedIn Corp.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
package com.linkedin.hadoop.jobs;

import java.io.IOException;
import java.util.concurrent.TimeUnit;
import java.util.Properties;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import org.apache.log4j.Logger;

/**
 * Hadoop Java job that is passed arguments from an hdfsWaitJob with type=hadoopJava.
 * This job will quit after it runs longer than the limit set by timeout, and will
 * either succeed or fail depending on the value of failOnTimeout passed in
 * from the Gradle file. Before timeout, if we can find a folder in the directory
 * specified by dirPath that is fresh enough, we cause the job to succeed.
 */
public class HdfsWaitJob extends Configured {

    private static final Logger log = Logger.getLogger(HdfsWaitJob.class);
    private String _name;
    private Properties _properties;

    /**
     * Constructor for HdfsWaitJob
     *
     * @param name The name of the job
     * @param properties The properties specified for this job
     */
    public HdfsWaitJob(String name, Properties properties) {
        super(new Configuration());
        _name = name;
        _properties = properties;
    }

    /**
     * Method run that keeps track of the time this program has been running. If
     * it surpasses the timeout limit, it causes the job to either fail or succeed,
     * depending on the value of 'failOnTimeout' specified in the DSL file. This
     * method continuously calls checkDirectory while it hasn't timed out, using
     * sleepTime as a buffer. If checkDirectory ever returns true on any of the
     * calls, the job succeeds.
     *
     * @throws Exception If there is an exception during the parameter setup
     */
    public void run() throws Exception {
        String dirPath = _properties.getProperty("pathToDirectory");
        long freshness = parseTime(_properties.getProperty("freshness"));
        long timeout = parseTime(_properties.getProperty("timeout"));
        long endTime = System.currentTimeMillis() + timeout;
        long sleepTime = parseTime(_properties.getProperty("sleepInterval", "1M"));
        boolean failOnTimeout = Boolean.valueOf(_properties.getProperty("forceJobToFail"));
        boolean checkExactPath = Boolean.valueOf(_properties.getProperty("exactPath", "false"));
        boolean folderFound = false;

        log.info("STATUS: Job started. Checking the directory at " + dirPath
                + " for fresh folders with a sleep interval of " + _properties.getProperty("sleepInterval", "1M"));

        while (System.currentTimeMillis() < endTime && !folderFound) {
            folderFound = checkDirectory(dirPath, freshness, checkExactPath);
            if (!folderFound) {
                log.info("STATUS: Now sleeping for " + _properties.getProperty("sleepInterval", "1M")
                        + " before polling again.");
                log.info("REMINDER: Job will time out " + TimeUnit.MILLISECONDS.toMinutes(timeout)
                        + " minutes after instantiation.");
                Thread.sleep(sleepTime);
            }
        }

        if (!folderFound) {
            log.info("WARNING: There were no folders found in " + dirPath
                    + " that were fresh enough before reaching timeout.");
            log.info("RESULT: Job timing out with parameter failOnTimeout = " + failOnTimeout);
            if (failOnTimeout) {
                throw new Exception("Forcing job to fail after timeout since failOnTimeout = true");
            }
        }
    }

    /**
     * Method parseTime that takes in a string, either timeout, freshness, or sleepTime.
     * It splits the string up using whitespace as the delimiter. Then each time unit
     * and corresponding integer is converted to its millisecond representation. The total
     * millisecond value is then returned.
     *
     * @param prop The string value of either freshness, timeout or sleepTime
     * @throws NumberFormatException If the specification is invalid
     * @return The amount of milliseconds prop corresponds to as a long
     */
    public long parseTime(String prop) throws NumberFormatException {
        long totalTime = 0;
        String[] strArray = prop.split("\\s+");

        for (int i = 0; i < strArray.length; i++) {
            String str = strArray[i];
            long time = Long.valueOf(str.substring(0, str.length() - 1));
            char unit = str.charAt(str.length() - 1);

            if (unit == 'S') {
                totalTime += TimeUnit.SECONDS.toMillis(time);
            } else if (unit == 'M') {
                totalTime += TimeUnit.MINUTES.toMillis(time);
            } else if (unit == 'H') {
                totalTime += TimeUnit.HOURS.toMillis(time);
            } else if (unit == 'D') {
                totalTime += TimeUnit.DAYS.toMillis(time);
            } else {
                String errMessage = "ERROR: Invalid time specification: " + prop
                        + " does not have units in seconds (S), minutes (M), hours (H), or days (D).";
                log.info(errMessage);
                throw new NumberFormatException(errMessage);
            }
        }
        return totalTime;
    }

    /**
     * Method checkDirectory loops through the folders pointed to by dirPath, and will
     * cause the job to succeed if any of the folders are fresh enough. However, if the
     * parameter checkExactPath is true, this method only checks for the existence of
     * dirPath in HDFS.
     *
     * @param dirPath The path to the directory we are searching for fresh folders
     * @param freshness The timeframe in which the folder has to have been modified by
     * @param checkExactPath The boolean that decides if we only check for the existence of dirPath in HDFS
     * @throws IOException If there is an HDFS exception
     * @return A boolean value corresponding to whether a fresh folder was found
     */
    public boolean checkDirectory(String dirPath, long freshness, boolean checkExactPath)
            throws IOException, NullPointerException {
        FileSystem fileSys = FileSystem.get(getConf());

        if (fileSys == null) {
            String errMessage = "ERROR: The file system trying to be accessed does not exist. JOB TERMINATED.";
            log.info(errMessage);
            throw new NullPointerException(errMessage);
        }

        if (checkExactPath) {
            if (fileSys.exists(new Path(dirPath))) {
                log.info("SUCCESS: The exact path: " + dirPath + " was found in HDFS. Program now quitting.");
                return true;
            }
            log.info("STATUS: The exact path: " + dirPath + " was not found during latest polling.");
            return false;
        }

        FileStatus[] status = fileSys.listStatus(new Path(dirPath));

        if (status == null) {
            String errMessage = "ERROR: dirPath -> " + dirPath + " is empty or does not exist. JOB TERMINATED.";
            log.info(errMessage);
            throw new IOException(errMessage);
        }

        for (FileStatus file : status) {
            if (file.isDirectory()) {
                long timeModified = file.getModificationTime();
                if ((System.currentTimeMillis() - timeModified) <= freshness) {
                    String fileName = file.getPath().toString();
                    log.info("We found this fresh folder in the filePath: "
                            + fileName.substring(fileName.lastIndexOf("/") + 1));
                    log.info("SUCCESS: Program now quitting after successfully finding a fresh folder.");
                    return true;
                }
            }
        }
        log.info("STATUS: No fresh folders found during latest polling.");
        return false;
    }
}