org.pentaho.amazon.emr.job.AmazonElasticMapReduceJobExecutor.java Source code

Introduction

Here is the source code for org.pentaho.amazon.emr.job.AmazonElasticMapReduceJobExecutor.java, the Pentaho Data Integration job entry that stages a user-supplied JAR and runs it as a custom step on an Amazon Elastic MapReduce (EMR) cluster.

Source

/*! ******************************************************************************
 *
 * Pentaho Big Data
 *
 * Copyright (C) 2002-2018 by Hitachi Vantara : http://www.pentaho.com
 *
 *******************************************************************************
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 ******************************************************************************/

package org.pentaho.amazon.emr.job;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.List;

import org.apache.commons.io.IOUtils;
import org.apache.commons.vfs2.FileObject;
import org.pentaho.amazon.AbstractAmazonJobExecutor;
import org.pentaho.di.cluster.SlaveServer;
import org.pentaho.di.core.annotations.JobEntry;
import org.pentaho.di.core.database.DatabaseMeta;
import org.pentaho.di.core.encryption.Encr;
import org.pentaho.di.core.exception.KettleException;
import org.pentaho.di.core.exception.KettleXMLException;
import org.pentaho.di.core.hadoop.HadoopConfigurationBootstrap;
import org.pentaho.di.core.vfs.KettleVFS;
import org.pentaho.di.core.xml.XMLHandler;
import org.pentaho.di.i18n.BaseMessages;
import org.pentaho.di.job.entries.hadoopjobexecutor.JarUtility;
import org.pentaho.di.repository.ObjectId;
import org.pentaho.di.repository.Repository;
import org.pentaho.hadoop.shim.spi.HadoopShim;
import org.pentaho.metastore.api.IMetaStore;
import org.w3c.dom.Node;

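/**
 * Job entry that stages a user-supplied JAR and runs it as a custom step on an Amazon EMR
 * cluster, either on a newly provisioned cluster or on an existing job flow.
 */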
@JobEntry(id = "EMRJobExecutorPlugin", image = "EMR.svg", name = "EMRJobExecutorPlugin.Name",
        description = "EMRJobExecutorPlugin.Description",
        categoryDescription = "i18n:org.pentaho.di.job:JobCategory.Category.BigData",
        documentationUrl = "Products/Data_Integration/Job_Entry_Reference/Amazon_EMR_Job_Executor",
        i18nPackageName = "org.pentaho.amazon.emr.job")
public class AmazonElasticMapReduceJobExecutor extends AbstractAmazonJobExecutor {

    private static Class<?> PKG = AmazonElasticMapReduceJobExecutor.class; // i18n bundle reference for BaseMessages
    private static final String STEP_EMR = "emr";
    private URL localFileUrl; // URL of the locally staged copy of the user JAR

    protected String jarUrl = "";

    public String getJarUrl() {
        return jarUrl;
    }

    public void setJarUrl(String jarUrl) {
        this.jarUrl = jarUrl;
    }

    private JarUtility util = new JarUtility();

    public AmazonElasticMapReduceJobExecutor() {
    }

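    /**
     * Resolve the entry point of the user-supplied JAR: first from the manifest's Main-Class
     * attribute, otherwise by scanning the JAR for classes that declare a main method, using
     * the active Hadoop shim's class loader.
     */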
    public String getMainClass(URL localJarUrl) throws Exception {
        HadoopShim shim = HadoopConfigurationBootstrap.getHadoopConfigurationProvider().getActiveConfiguration()
                .getHadoopShim();

        final Class<?> mainClass = util.getMainClassFromManifest(localJarUrl, shim.getClass().getClassLoader());
        if (mainClass != null) {
            return mainClass.getName();
        } else {
            List<Class<?>> classesWithMains = util.getClassesInJarWithMain(localJarUrl.toExternalForm(),
                    shim.getClass().getClassLoader());
            if (!classesWithMains.isEmpty()) {
                return classesWithMains.get(0).getName();
            }
        }
        throw new RuntimeException("Could not find main class in: " + localJarUrl.toExternalForm());
    }

    public boolean isAlive() {
        return alive;
    }

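    /**
     * Copy the JAR referenced by jarUrl from VFS into a local temp file, remember its URL for
     * the later main-class lookup, and derive the S3 bucket key from the source file.
     */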
    @Override
    public File createStagingFile() throws IOException, KettleException {
        // pull down the .jar file from VFS into a local temp file
        FileObject jarFile = KettleVFS.getFileObject(buildFilename(jarUrl));
        File tmpFile = File.createTempFile("customEMR", "jar");
        tmpFile.deleteOnExit();
        // close both streams even if the copy fails, so no file handles are leaked
        try (InputStream jarIn = jarFile.getContent().getInputStream();
                FileOutputStream tmpFileOut = new FileOutputStream(tmpFile)) {
            IOUtils.copy(jarIn, tmpFileOut);
        }
        localFileUrl = tmpFile.toURI().toURL();
        setS3BucketKey(jarFile);
        return tmpFile;
    }

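    /** This executor defines no step-level bootstrap actions. */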
    @Override
    public String getStepBootstrapActions() {
        return null;
    }

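    /** Resolve the main class of the locally staged copy of the JAR. */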
    @Override
    public String getMainClass() throws Exception {
        return getMainClass(localFileUrl);
    }

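    /** Report the step type handled by this executor ("emr"). */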
    public String getStepType() {
        return STEP_EMR;
    }

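    /** Restore the job entry's configuration from its XML node; the AWS credentials are stored optionally encrypted. */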
    @Override
    public void loadXML(Node entrynode, List<DatabaseMeta> databases, List<SlaveServer> slaveServers,
            Repository rep, IMetaStore metaStore) throws KettleXMLException {
        super.loadXML(entrynode, databases, slaveServers);
        hadoopJobName = XMLHandler.getTagValue(entrynode, "hadoop_job_name");
        hadoopJobFlowId = XMLHandler.getTagValue(entrynode, "hadoop_job_flow_id");
        jarUrl = XMLHandler.getTagValue(entrynode, "jar_url");
        accessKey = Encr.decryptPasswordOptionallyEncrypted(XMLHandler.getTagValue(entrynode, "access_key"));
        secretKey = Encr.decryptPasswordOptionallyEncrypted(XMLHandler.getTagValue(entrynode, "secret_key"));
        stagingDir = XMLHandler.getTagValue(entrynode, "staging_dir");
        region = XMLHandler.getTagValue(entrynode, "region");
        ec2Role = XMLHandler.getTagValue(entrynode, "ec2_role");
        emrRole = XMLHandler.getTagValue(entrynode, "emr_role");
        masterInstanceType = XMLHandler.getTagValue(entrynode, "master_instance_type");
        slaveInstanceType = XMLHandler.getTagValue(entrynode, "slave_instance_type");
        numInstances = XMLHandler.getTagValue(entrynode, "num_instances");
        emrRelease = XMLHandler.getTagValue(entrynode, "emr_release");
        cmdLineArgs = XMLHandler.getTagValue(entrynode, "command_line_args");
        alive = "Y".equalsIgnoreCase(XMLHandler.getTagValue(entrynode, "alive"));
        runOnNewCluster = "Y".equalsIgnoreCase(XMLHandler.getTagValue(entrynode, "runOnNewCluster"));
        blocking = "Y".equalsIgnoreCase(XMLHandler.getTagValue(entrynode, "blocking"));
        loggingInterval = XMLHandler.getTagValue(entrynode, "logging_interval");
    }

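    /** Serialize the job entry's configuration to XML, encrypting the AWS credentials unless they are supplied as variables. */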
    @Override
    public String getXML() {
        StringBuffer retval = new StringBuffer(1024);
        retval.append(super.getXML());
        retval.append("      ").append(XMLHandler.addTagValue("hadoop_job_name", hadoopJobName));
        retval.append("      ").append(XMLHandler.addTagValue("hadoop_job_flow_id", hadoopJobFlowId));
        retval.append("      ").append(XMLHandler.addTagValue("jar_url", jarUrl));
        retval.append("      ")
                .append(XMLHandler.addTagValue("access_key", Encr.encryptPasswordIfNotUsingVariables(accessKey)));
        retval.append("      ")
                .append(XMLHandler.addTagValue("secret_key", Encr.encryptPasswordIfNotUsingVariables(secretKey)));
        retval.append("      ").append(XMLHandler.addTagValue("region", region));
        retval.append("      ").append(XMLHandler.addTagValue("ec2_role", ec2Role));
        retval.append("      ").append(XMLHandler.addTagValue("emr_role", emrRole));
        retval.append("      ").append(XMLHandler.addTagValue("master_instance_type", masterInstanceType));
        retval.append("      ").append(XMLHandler.addTagValue("slave_instance_type", slaveInstanceType));
        retval.append("      ").append(XMLHandler.addTagValue("emr_release", emrRelease));
        retval.append("      ").append(XMLHandler.addTagValue("num_instances", numInstances));
        retval.append("      ").append(XMLHandler.addTagValue("staging_dir", stagingDir));
        retval.append("      ").append(XMLHandler.addTagValue("command_line_args", cmdLineArgs));
        retval.append("      ").append(XMLHandler.addTagValue("alive", alive));
        retval.append("      ").append(XMLHandler.addTagValue("runOnNewCluster", runOnNewCluster));
        retval.append("      ").append(XMLHandler.addTagValue("blocking", blocking));
        retval.append("      ").append(XMLHandler.addTagValue("logging_interval", loggingInterval));

        return retval.toString();
    }

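    /** Load the job entry's configuration from a PDI repository. */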
    @Override
    public void loadRep(Repository rep, IMetaStore metaStore, ObjectId id_jobentry, List<DatabaseMeta> databases,
            List<SlaveServer> slaveServers) throws KettleException {
        if (rep != null) {
            super.loadRep(rep, metaStore, id_jobentry, databases, slaveServers);

            setHadoopJobName(rep.getJobEntryAttributeString(id_jobentry, "hadoop_job_name"));
            setHadoopJobFlowId(rep.getJobEntryAttributeString(id_jobentry, "hadoop_job_flow_id"));
            setJarUrl(rep.getJobEntryAttributeString(id_jobentry, "jar_url"));
            setAccessKey(Encr
                    .decryptPasswordOptionallyEncrypted(rep.getJobEntryAttributeString(id_jobentry, "access_key")));
            setSecretKey(Encr
                    .decryptPasswordOptionallyEncrypted(rep.getJobEntryAttributeString(id_jobentry, "secret_key")));
            setStagingDir(rep.getJobEntryAttributeString(id_jobentry, "staging_dir"));
            setRegion(rep.getJobEntryAttributeString(id_jobentry, "region"));
            setEc2Role(rep.getJobEntryAttributeString(id_jobentry, "ec2_role"));
            setEmrRole(rep.getJobEntryAttributeString(id_jobentry, "emr_role"));
            setMasterInstanceType(rep.getJobEntryAttributeString(id_jobentry, "master_instance_type"));
            setSlaveInstanceType(rep.getJobEntryAttributeString(id_jobentry, "slave_instance_type"));
            setEmrRelease(rep.getJobEntryAttributeString(id_jobentry, "emr_release"));
            setNumInstances(rep.getJobEntryAttributeString(id_jobentry, "num_instances"));
            setCmdLineArgs(rep.getJobEntryAttributeString(id_jobentry, "command_line_args"));
            setAlive(rep.getJobEntryAttributeBoolean(id_jobentry, "alive"));
            setRunOnNewCluster(rep.getJobEntryAttributeBoolean(id_jobentry, "runOnNewCluster"));
            setBlocking(rep.getJobEntryAttributeBoolean(id_jobentry, "blocking"));
            setLoggingInterval(rep.getJobEntryAttributeString(id_jobentry, "logging_interval"));

        } else {
            throw new KettleException(
                    BaseMessages.getString(PKG, "AmazonElasticMapReduceJobExecutor.LoadFromRepository.Error"));
        }
    }

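    /** Save the job entry's configuration to a PDI repository, encrypting the AWS credentials unless they are supplied as variables. */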
    @Override
    public void saveRep(Repository rep, IMetaStore metaStore, ObjectId id_job) throws KettleException {
        if (rep != null) {
            super.saveRep(rep, metaStore, id_job);

            rep.saveJobEntryAttribute(id_job, getObjectId(), "hadoop_job_name", hadoopJobName);
            rep.saveJobEntryAttribute(id_job, getObjectId(), "hadoop_job_flow_id", hadoopJobFlowId);
            rep.saveJobEntryAttribute(id_job, getObjectId(), "jar_url", jarUrl);
            rep.saveJobEntryAttribute(id_job, getObjectId(), "secret_key",
                    Encr.encryptPasswordIfNotUsingVariables(secretKey));
            rep.saveJobEntryAttribute(id_job, getObjectId(), "access_key",
                    Encr.encryptPasswordIfNotUsingVariables(accessKey));
            rep.saveJobEntryAttribute(id_job, getObjectId(), "staging_dir", stagingDir);
            rep.saveJobEntryAttribute(id_job, getObjectId(), "region", region);
            rep.saveJobEntryAttribute(id_job, getObjectId(), "ec2_role", ec2Role);
            rep.saveJobEntryAttribute(id_job, getObjectId(), "emr_role", emrRole);
            rep.saveJobEntryAttribute(id_job, getObjectId(), "master_instance_type", masterInstanceType);
            rep.saveJobEntryAttribute(id_job, getObjectId(), "slave_instance_type", slaveInstanceType);
            rep.saveJobEntryAttribute(id_job, getObjectId(), "emr_release", emrRelease);
            rep.saveJobEntryAttribute(id_job, getObjectId(), "num_instances", numInstances);
            rep.saveJobEntryAttribute(id_job, getObjectId(), "command_line_args", cmdLineArgs);
            rep.saveJobEntryAttribute(id_job, getObjectId(), "alive", alive);
            rep.saveJobEntryAttribute(id_job, getObjectId(), "runOnNewCluster", runOnNewCluster);
            rep.saveJobEntryAttribute(id_job, getObjectId(), "blocking", blocking);
            rep.saveJobEntryAttribute(id_job, getObjectId(), "logging_interval", loggingInterval);

        } else {
            throw new KettleException(
                    BaseMessages.getString(PKG, "AmazonElasticMapReduceJobExecutor.SaveToRepository.Error"));
        }
    }

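    /** Resolve any Kettle variables in the configured JAR location. */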
    public String buildFilename(String filename) {
        filename = environmentSubstitute(filename);
        return filename;
    }

    @Override
    public boolean evaluates() {
        return true;
    }

    @Override
    public boolean isUnconditional() {
        return true;
    }

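    /** Derive the settings dialog class name by swapping the ".job." package segment for ".ui." and appending "Dialog". */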
    @Override
    public String getDialogClassName() {
        String className = getClass().getCanonicalName();
        className = className.replaceFirst("\\.job\\.", ".ui.");
        className += "Dialog";
        return className;
    }
}
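
As a side note on the dialog lookup at the end of the listing, getDialogClassName() derives the UI class purely by string manipulation on this class's canonical name. Below is a minimal standalone sketch of that transformation; the DialogNameDemo class is hypothetical and written only for illustration.

// Hypothetical demo class illustrating the name transformation performed by getDialogClassName().
public class DialogNameDemo {
    public static void main(String[] args) {
        String className = "org.pentaho.amazon.emr.job.AmazonElasticMapReduceJobExecutor";
        // Swap the ".job." package segment for ".ui." and append "Dialog", as in the method above.
        className = className.replaceFirst("\\.job\\.", ".ui.") + "Dialog";
        System.out.println(className);
        // Prints: org.pentaho.amazon.emr.ui.AmazonElasticMapReduceJobExecutorDialog
    }
}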