Example usage for com.amazonaws.services.elasticmapreduce AmazonElasticMapReduceClient runJobFlow

Introduction

In this page you can find the example usage for com.amazonaws.services.elasticmapreduce AmazonElasticMapReduceClient runJobFlow.

Prototype

@Override
public RunJobFlowResult runJobFlow(RunJobFlowRequest request)

Source Link

Document

RunJobFlow creates and starts running a new cluster (job flow).

Usage

From source file:org.finra.dm.dao.impl.EmrOperationsImpl.java

License:Apache License

/**
 * Run Job Flow to AmazonElasticMapReduceClient
 *///  ww w  . j a v a2 s. c o m
@Override
public String runEmrJobFlow(AmazonElasticMapReduceClient emrClient, RunJobFlowRequest jobFlowRequest) {
    return emrClient.runJobFlow(jobFlowRequest).getJobFlowId();
}

From source file:org.pentaho.amazon.emr.job.AmazonElasticMapReduceJobExecutor.java

License:Apache License

public Result execute(Result result, int arg1) throws KettleException {
    Log4jFileAppender appender = null;/*from  www  . j  av a2s .  c  om*/
    String logFileName = "pdi-" + this.getName(); //$NON-NLS-1$
    try {
        appender = LogWriter.createFileAppender(logFileName, true, false);
        LogWriter.getInstance().addAppender(appender);
        log.setLogLevel(parentJob.getLogLevel());
    } catch (Exception e) {
        logError(BaseMessages.getString(PKG, "AmazonElasticMapReduceJobExecutor.FailedToOpenLogFile", //$NON-NLS-1$
                logFileName, e.toString()));
        logError(Const.getStackTracker(e));
    }

    try {
        // create/connect aws service
        AmazonElasticMapReduceClient emrClient = new AmazonElasticMapReduceClient(awsCredentials);

        // pull down jar from vfs
        FileObject jarFile = KettleVFS.getFileObject(buildFilename(jarUrl));
        File tmpFile = File.createTempFile("customEMR", "jar");
        tmpFile.deleteOnExit();
        FileOutputStream tmpFileOut = new FileOutputStream(tmpFile);
        IOUtils.copy(jarFile.getContent().getInputStream(), tmpFileOut);
        URL localJarUrl = tmpFile.toURI().toURL();

        // find main class in jar
        String mainClass = getMainClass(localJarUrl);

        // create staging bucket
        AmazonS3 s3Client = new AmazonS3Client(awsCredentials);

        FileSystemOptions opts = new FileSystemOptions();
        DefaultFileSystemConfigBuilder.getInstance().setUserAuthenticator(opts, new StaticUserAuthenticator(
                null, awsCredentials.getAWSAccessKeyId(), awsCredentials.getAWSSecretKey()));
        FileObject stagingDirFileObject = KettleVFS.getFileObject(stagingDir, getVariables(), opts);

        String stagingBucketName = stagingDirFileObject.getName().getBaseName();
        if (!s3Client.doesBucketExist(stagingBucketName)) {
            s3Client.createBucket(stagingBucketName);
        }

        // delete old jar if needed
        try {
            s3Client.deleteObject(stagingBucketName, jarFile.getName().getBaseName());
        } catch (Exception ex) {
            logError(Const.getStackTracker(ex));
        }

        // put jar in s3 staging bucket
        s3Client.putObject(new PutObjectRequest(stagingBucketName, jarFile.getName().getBaseName(), tmpFile));
        // create non-vfs s3 url to jar
        String stagingS3JarUrl = "s3://" + stagingBucketName + "/" + jarFile.getName().getBaseName();
        String stagingS3BucketUrl = "s3://" + stagingBucketName;

        RunJobFlowRequest runJobFlowRequest = null;
        RunJobFlowResult runJobFlowResult = null;
        if (StringUtil.isEmpty(hadoopJobFlowId)) {
            // create EMR job flow
            runJobFlowRequest = createJobFlow(stagingS3BucketUrl, stagingS3JarUrl, mainClass);
            // start EMR job
            runJobFlowResult = emrClient.runJobFlow(runJobFlowRequest);
        } else {
            List<String> jarStepArgs = new ArrayList<String>();
            if (!StringUtil.isEmpty(cmdLineArgs)) {
                StringTokenizer st = new StringTokenizer(cmdLineArgs, " ");
                while (st.hasMoreTokens()) {
                    String token = st.nextToken();
                    logBasic("adding args: " + token);
                    jarStepArgs.add(token);
                }
            }

            HadoopJarStepConfig hadoopJarStep = new HadoopJarStepConfig();
            hadoopJarStep.setJar(stagingS3JarUrl);
            hadoopJarStep.setMainClass(mainClass);
            hadoopJarStep.setArgs(jarStepArgs);

            StepConfig stepConfig = new StepConfig();
            stepConfig.setName("custom jar: " + jarUrl);
            stepConfig.setHadoopJarStep(hadoopJarStep);

            List<StepConfig> steps = new ArrayList<StepConfig>();
            steps.add(stepConfig);

            AddJobFlowStepsRequest addJobFlowStepsRequest = new AddJobFlowStepsRequest();
            addJobFlowStepsRequest.setJobFlowId(hadoopJobFlowId);
            addJobFlowStepsRequest.setSteps(steps);

            emrClient.addJobFlowSteps(addJobFlowStepsRequest);
        }

        String loggingIntervalS = environmentSubstitute(loggingInterval);
        int logIntv = 60;
        try {
            logIntv = Integer.parseInt(loggingIntervalS);
        } catch (NumberFormatException ex) {
            logError("Unable to parse logging interval '" + loggingIntervalS + "' - using " + "default of 60");
        }

        // monitor it / blocking / logging if desired
        if (blocking) {
            try {
                if (log.isBasic()) {

                    String executionState = "RUNNING";

                    List<String> jobFlowIds = new ArrayList<String>();
                    String id = hadoopJobFlowId;
                    if (StringUtil.isEmpty(hadoopJobFlowId)) {
                        id = runJobFlowResult.getJobFlowId();
                        jobFlowIds.add(id);
                    }

                    while (isRunning(executionState)) {
                        DescribeJobFlowsRequest describeJobFlowsRequest = new DescribeJobFlowsRequest();
                        describeJobFlowsRequest.setJobFlowIds(jobFlowIds);

                        DescribeJobFlowsResult describeJobFlowsResult = emrClient
                                .describeJobFlows(describeJobFlowsRequest);
                        boolean found = false;
                        for (JobFlowDetail jobFlowDetail : describeJobFlowsResult.getJobFlows()) {
                            if (jobFlowDetail.getJobFlowId().equals(id)) {
                                executionState = jobFlowDetail.getExecutionStatusDetail().getState();
                                found = true;
                            }
                        }

                        if (!found) {
                            break;
                        }
                        // logBasic(BaseMessages.getString(PKG, "AmazonElasticMapReduceJobExecutor.RunningPercent", setupPercent,
                        // mapPercent, reducePercent));
                        logBasic(hadoopJobName + " execution status: " + executionState);
                        try {
                            if (isRunning(executionState)) {
                                Thread.sleep(logIntv * 1000);
                            }
                        } catch (InterruptedException ie) {
                            // Ignore
                        }
                    }

                    if ("FAILED".equalsIgnoreCase(executionState)) {
                        result.setStopped(true);
                        result.setNrErrors(1);
                        result.setResult(false);

                        S3Object outObject = s3Client.getObject(stagingBucketName, id + "/steps/1/stdout");
                        ByteArrayOutputStream outStream = new ByteArrayOutputStream();
                        IOUtils.copy(outObject.getObjectContent(), outStream);
                        logError(outStream.toString());

                        S3Object errorObject = s3Client.getObject(stagingBucketName, id + "/steps/1/stderr");
                        ByteArrayOutputStream errorStream = new ByteArrayOutputStream();
                        IOUtils.copy(errorObject.getObjectContent(), errorStream);
                        logError(errorStream.toString());
                    }
                }
            } catch (Exception e) {
                logError(e.getMessage(), e);
            }
        }

    } catch (Throwable t) {
        t.printStackTrace();
        result.setStopped(true);
        result.setNrErrors(1);
        result.setResult(false);
        logError(t.getMessage(), t);
    }

    if (appender != null) {
        LogWriter.getInstance().removeAppender(appender);
        appender.close();

        ResultFile resultFile = new ResultFile(ResultFile.FILE_TYPE_LOG, appender.getFile(),
                parentJob.getJobname(), getName());
        result.getResultFiles().put(resultFile.getFile().toString(), resultFile);
    }

    return result;
}

From source file:org.pentaho.amazon.hive.job.AmazonHiveJobExecutor.java

License:Apache License

/**
 * Executes a Hive job into the AWS Elastic MapReduce service.
 *///from  w w w.  jav  a  2s . c o m
public Result execute(Result result, int arg1) throws KettleException {

    // Setup a log file.
    Log4jFileAppender appender = null;
    String logFileName = "pdi-" + this.getName(); //$NON-NLS-1$
    try {
        appender = LogWriter.createFileAppender(logFileName, true, false);
        LogWriter.getInstance().addAppender(appender);
        log.setLogLevel(parentJob.getLogLevel());
    } catch (Exception e) {
        logError(BaseMessages.getString(PKG, "AmazonElasticMapReduceJobExecutor.FailedToOpenLogFile", //$NON-NLS-1$
                logFileName, e.toString()));
        logError(Const.getStackTracker(e));
    }

    try {
        // Create and connect an AWS service.
        AmazonElasticMapReduceClient emrClient = new AmazonElasticMapReduceClient(awsCredentials);
        AmazonS3 s3Client = new AmazonS3Client(awsCredentials);

        // Get bucket name and S3 URL.
        String stagingBucketName = GetBucketName(stagingDir);
        String stagingS3BucketUrl = "s3://" + stagingBucketName; //$NON-NLS-1$

        // Prepare staging S3 URL for Hive script file.
        String stagingS3qUrl = "";
        if (qUrl.startsWith(S3FileProvider.SCHEME + "://")) { //$NON-NLS-1$

            // If the .q file is in S3, its staging S3 URL is s3://{bucketname}/{path}
            if (qUrl.indexOf("@s3") > 0) { //$NON-NLS-1$
                stagingS3qUrl = S3FileProvider.SCHEME + "://" + qUrl.substring(qUrl.indexOf("@s3") + 4); //$NON-NLS-1$
            } else {
                stagingS3qUrl = qUrl;
            }

        } else {
            // A local filename is given for the Hive script file. It should be copied to the S3 Log Directory.
            // First, check for the correct protocol.
            if (!qUrl.startsWith("file:")) { //$NON-NLS-1$
                if (log.isBasic()) {
                    logBasic(BaseMessages.getString(PKG,
                            "AmazonElasticMapReduceJobExecutor.HiveScriptFilename.Error") + qUrl); //$NON-NLS-1$
                }
            }
            // pull down .q file from VSF
            FileObject qFile = KettleVFS.getFileObject(buildFilename(qUrl));
            File tmpFile = File.createTempFile("customEMR", "q"); //$NON-NLS-1$
            tmpFile.deleteOnExit();
            FileOutputStream tmpFileOut = new FileOutputStream(tmpFile);
            IOUtils.copy(qFile.getContent().getInputStream(), tmpFileOut);
            // Get key name for the script file S3 destination. Key is defined as path name after {bucket}/
            String key = GetKeyFromS3Url(stagingDir);
            if (key == null) {
                key = qFile.getName().getBaseName();
            } else {
                key += "/" + qFile.getName().getBaseName(); //$NON-NLS-1$
            }

            // delete the previous .q file in S3
            try {
                s3Client.deleteObject(stagingBucketName, key);
            } catch (Exception ex) {
                logError(Const.getStackTracker(ex));
            }

            // Put .q file in S3 Log Directory.
            s3Client.putObject(new PutObjectRequest(stagingBucketName, key, tmpFile));
            stagingS3qUrl = stagingS3BucketUrl + "/" + key; //$NON-NLS-1$
        }

        // AWS provides script-runner.jar (in its public bucket), which should be used as a MapReduce jar for Hive EMR
        // job.
        jarUrl = "s3://elasticmapreduce/libs/script-runner/script-runner.jar"; //$NON-NLS-1$

        RunJobFlowRequest runJobFlowRequest = null;
        RunJobFlowResult runJobFlowResult = null;
        if (StringUtil.isEmpty(hadoopJobFlowId)) {
            // create an EMR job flow, start a step to setup Hive and get the job flow ID.
            runJobFlowRequest = createJobFlow();
            runJobFlowResult = emrClient.runJobFlow(runJobFlowRequest);
            hadoopJobFlowId = runJobFlowResult.getJobFlowId();
        }

        // Now EMR job flow is ready to accept a Run Hive Script step.
        // First, prepare a Job Flow ID list.
        List<String> jobFlowIds = new ArrayList<String>();
        jobFlowIds.add(hadoopJobFlowId);

        // Configure a HadoopJarStep.
        String args = "s3://elasticmapreduce/libs/hive/hive-script "
                + "--base-path s3://elasticmapreduce/libs/hive/ --hive-version 0.7 --run-hive-script --args -f "
                + environmentSubstitute(stagingS3qUrl) + " " + environmentSubstitute(cmdLineArgs); //$NON-NLS-1$
        List<StepConfig> steps = ConfigHadoopJarStep(hadoopJobName, jarUrl, args);

        // Add a Run Hive Script step to the existing job flow.
        AddJobFlowStepsRequest addJobFlowStepsRequest = new AddJobFlowStepsRequest();
        addJobFlowStepsRequest.setJobFlowId(hadoopJobFlowId);
        addJobFlowStepsRequest.setSteps(steps);
        emrClient.addJobFlowSteps(addJobFlowStepsRequest);

        // Set a logging interval.
        String loggingIntervalS = environmentSubstitute(loggingInterval);
        int logIntv = 10;
        try {
            logIntv = Integer.parseInt(loggingIntervalS);
        } catch (NumberFormatException ex) {
            logError(BaseMessages.getString(PKG, "AmazonElasticMapReduceJobExecutor.LoggingInterval.Error", //$NON-NLS-1$
                    loggingIntervalS));
        }

        // monitor and log if intended.
        if (blocking) {
            try {
                if (log.isBasic()) {

                    String executionState = "RUNNING"; //$NON-NLS-1$

                    while (isRunning(executionState)) {
                        DescribeJobFlowsRequest describeJobFlowsRequest = new DescribeJobFlowsRequest();
                        describeJobFlowsRequest.setJobFlowIds(jobFlowIds);

                        DescribeJobFlowsResult describeJobFlowsResult = emrClient
                                .describeJobFlows(describeJobFlowsRequest);
                        boolean found = false;
                        for (JobFlowDetail jobFlowDetail : describeJobFlowsResult.getJobFlows()) {
                            if (jobFlowDetail.getJobFlowId().equals(hadoopJobFlowId)) {
                                executionState = jobFlowDetail.getExecutionStatusDetail().getState();
                                found = true;
                            }
                        }

                        if (!found) {
                            break;
                        }
                        logBasic(hadoopJobName + " " + BaseMessages.getString(PKG, //$NON-NLS-1$
                                "AmazonElasticMapReduceJobExecutor.JobFlowExecutionStatus", hadoopJobFlowId)
                                + executionState);

                        if (parentJob.isStopped()) {
                            if (!alive) {
                                TerminateJobFlowsRequest terminateJobFlowsRequest = new TerminateJobFlowsRequest();
                                terminateJobFlowsRequest.withJobFlowIds(hadoopJobFlowId);
                                emrClient.terminateJobFlows(terminateJobFlowsRequest);
                            }
                            break;
                        }

                        try {
                            if (isRunning(executionState)) {
                                Thread.sleep(logIntv * 1000);
                            }
                        } catch (InterruptedException ie) {
                            logError(Const.getStackTracker(ie));
                        }
                    }

                    if ("FAILED".equalsIgnoreCase(executionState)) { //$NON-NLS-1$
                        result.setStopped(true);
                        result.setNrErrors(1);
                        result.setResult(false);

                        S3Object outObject = s3Client.getObject(stagingBucketName,
                                hadoopJobFlowId + "/steps/1/stdout"); //$NON-NLS-1$
                        ByteArrayOutputStream outStream = new ByteArrayOutputStream();
                        IOUtils.copy(outObject.getObjectContent(), outStream);
                        logError(outStream.toString());

                        S3Object errorObject = s3Client.getObject(stagingBucketName,
                                hadoopJobFlowId + "/steps/1/stderr"); //$NON-NLS-1$
                        ByteArrayOutputStream errorStream = new ByteArrayOutputStream();
                        IOUtils.copy(errorObject.getObjectContent(), errorStream);
                        logError(errorStream.toString());
                    }
                }
            } catch (Exception e) {
                logError(e.getMessage(), e);
            }
        }

    } catch (Throwable t) {
        t.printStackTrace();
        result.setStopped(true);
        result.setNrErrors(1);
        result.setResult(false);
        logError(t.getMessage(), t);
    }

    if (appender != null) {
        LogWriter.getInstance().removeAppender(appender);
        appender.close();

        ResultFile resultFile = new ResultFile(ResultFile.FILE_TYPE_LOG, appender.getFile(),
                parentJob.getJobname(), getName());
        result.getResultFiles().put(resultFile.getFile().toString(), resultFile);
    }

    return result;
}