org.pentaho.hadoop.mapreduce.MRUtil.java Source code

Introduction

Here is the source code for org.pentaho.hadoop.mapreduce.MRUtil.java, a utility class from the Pentaho Big Data MapReduce integration.

Source

/*******************************************************************************
 *
 * Pentaho Big Data
 *
 * Copyright (C) 2002-2017 by Hitachi Vantara : http://www.pentaho.com
 *
 *******************************************************************************
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 ******************************************************************************/

package org.pentaho.hadoop.mapreduce;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapred.JobConf;
import org.pentaho.di.core.Const;
import org.pentaho.di.core.KettleEnvironment;
import org.pentaho.di.core.exception.KettleException;
import org.pentaho.di.core.logging.LoggingObjectType;
import org.pentaho.di.core.logging.SimpleLoggingObject;
import org.pentaho.di.core.variables.VariableSpace;
import org.pentaho.di.i18n.BaseMessages;
import org.pentaho.di.trans.Trans;
import org.pentaho.di.trans.TransConfiguration;
import org.pentaho.di.trans.TransExecutionConfiguration;
import org.pentaho.di.trans.TransMeta;
import org.pentaho.di.trans.TransMeta.TransformationType;

import java.io.FileOutputStream;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.util.UUID;

public class MRUtil {
    /**
     * Path to the directory to load plugins from. This must be accessible from all TaskTracker nodes.
     */
    public static final String PROPERTY_PENTAHO_KETTLE_PLUGINS_DIR = "pentaho.kettle.plugins.dir";

    /**
     * Path to the directory to load the metastore from. This must be accessible from all TaskTracker nodes.
     */
    public static final String PROPERTY_PENTAHO_KETTLE_METASTORE_DIR = "pentaho.kettle.metastore.dir";

    /**
     * Hadoop Configuration property for setting KETTLE_HOME. See {@link Const#getKettleDirectory()} for usage.
     */
    public static final String PROPERTY_PENTAHO_KETTLE_HOME = "pentaho.kettle.home";
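
    /*
     * A minimal usage sketch (not part of the original class) of how a client
     * might set these three properties on a Hadoop Configuration before
     * submitting a job. The paths below are hypothetical examples.
     */
    private static Configuration exampleKettleConfiguration() {
        Configuration conf = new Configuration();
        conf.set(PROPERTY_PENTAHO_KETTLE_HOME, "/opt/pentaho/kettle"); // hypothetical path
        conf.set(PROPERTY_PENTAHO_KETTLE_PLUGINS_DIR, "/opt/pentaho/kettle/plugins"); // hypothetical path
        conf.set(PROPERTY_PENTAHO_KETTLE_METASTORE_DIR, "/opt/pentaho/kettle/metastore"); // hypothetical path
        return conf;
    }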

    public static Trans getTrans(final Configuration conf, final String transXml, boolean singleThreaded)
            throws KettleException {
        initKettleEnvironment(conf);

        TransConfiguration transConfiguration = TransConfiguration.fromXML(transXml);
        TransMeta transMeta = transConfiguration.getTransMeta();
        String carteObjectId = UUID.randomUUID().toString();
        SimpleLoggingObject servletLoggingObject = new SimpleLoggingObject("HADOOP_MAPPER", LoggingObjectType.CARTE, //$NON-NLS-1$
                null);
        servletLoggingObject.setContainerObjectId(carteObjectId);
        TransExecutionConfiguration executionConfiguration = transConfiguration.getTransExecutionConfiguration();
        servletLoggingObject.setLogLevel(executionConfiguration.getLogLevel());

        if (singleThreaded) {
            // Set the type to single threaded in case the user forgot...
            //
            transMeta.setTransformationType(TransformationType.SingleThreaded);

            // Disable thread priority management as it will slow things down needlessly.
            // The single threaded engine doesn't use threads and doesn't need row locking.
            //
            transMeta.setUsingThreadPriorityManagment(false);
        } else {
            transMeta.setTransformationType(TransformationType.Normal);
        }

        return new Trans(transMeta, servletLoggingObject);
    }
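
    /*
     * Usage sketch (an assumption, not part of the original class): obtain a
     * single-threaded Trans from the XML produced by TransConfiguration.getXML()
     * on the client side, then prepare it for execution. The method name and
     * the null argument list are illustrative only.
     */
    private static Trans exampleGetTrans(Configuration conf, String transXml) throws KettleException {
        Trans trans = getTrans(conf, transXml, true);
        trans.prepareExecution(null); // allocates steps and row sets before any rows are processed
        return trans;
    }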

    /**
     * Initialize the Kettle environment with settings from the provided configuration
     *
     * @param conf
     *          Configuration to configure Kettle environment with
     */
    private static void initKettleEnvironment(Configuration conf) throws KettleException {
        if (!KettleEnvironment.isInitialized()) {
            String kettleHome = getKettleHomeProperty(conf);
            String pluginDir = getPluginDirProperty(conf);
            String metaStoreDir = getMetastoreDirProperty(conf);
            System.setProperty("KETTLE_HOME", kettleHome);
            System.setProperty(Const.PLUGIN_BASE_FOLDERS_PROP, pluginDir);
            System.setProperty(Const.PENTAHO_METASTORE_FOLDER, metaStoreDir);

            System.out.println(BaseMessages.getString(MRUtil.class, "KettleHome.Info", kettleHome));
            System.out.println(BaseMessages.getString(MRUtil.class, "PluginDirectory.Info", pluginDir));
            System.out.println(BaseMessages.getString(MRUtil.class, "MetasStoreDirectory.Info", metaStoreDir));

            KettleEnvironment.init();
        }
    }
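
    /*
     * For illustration (values are the hypothetical ones from
     * exampleKettleConfiguration above): after initKettleEnvironment runs,
     * the JVM carries system properties such as
     *   KETTLE_HOME                    = /opt/pentaho/kettle
     *   Const.PLUGIN_BASE_FOLDERS_PROP = /opt/pentaho/kettle/plugins
     *   Const.PENTAHO_METASTORE_FOLDER = /opt/pentaho/kettle/metastore
     * which KettleEnvironment.init() then picks up.
     */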

    /**
     * Copies Hadoop job information (map/reduce task counts, task id, and the derived node number) into the
     * transformation's variable space.
     *
     * @param variableSpace
     *          Variable space to receive the Hadoop job information
     * @param job
     *          Job configuration to read the information from
     */
    public static void passInformationToTransformation(final VariableSpace variableSpace, final JobConf job) {
        if (variableSpace != null && job != null) {
            variableSpace.setVariable("Internal.Hadoop.NumMapTasks", Integer.toString(job.getNumMapTasks()));
            variableSpace.setVariable("Internal.Hadoop.NumReduceTasks", Integer.toString(job.getNumReduceTasks()));
            String taskId = job.get("mapred.task.id");
            variableSpace.setVariable("Internal.Hadoop.TaskId", taskId);
            // TODO: Verify if the string range holds true for all Hadoop distributions
            // Extract the node number from the task ID.
            // The consensus currently is that it's the part after the last underscore.
            //
            // Examples:
            // job_201208090841_9999
            // job_201208090841_10000
            //
            String nodeNumber;
            if (Const.isEmpty(taskId)) {
                nodeNumber = "0";
            } else {
                int lastUnderscoreIndex = taskId.lastIndexOf("_");
                if (lastUnderscoreIndex >= 0) {
                    nodeNumber = taskId.substring(lastUnderscoreIndex + 1);
                } else {
                    nodeNumber = "0";
                }
            }
            // Strip any leading zeroes by parsing the number.
            //
            variableSpace.setVariable("Internal.Hadoop.NodeNumber", Integer.toString(Integer.parseInt(nodeNumber)));
        }
    }
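
    /*
     * Illustration (not in the original source) of what the method derives from
     * a made-up task id. Variables is Kettle's standard VariableSpace
     * implementation.
     */
    private static void examplePassInformation() {
        JobConf job = new JobConf();
        job.set("mapred.task.id", "job_201208090841_0042"); // made-up id
        VariableSpace space = new org.pentaho.di.core.variables.Variables();
        passInformationToTransformation(space, job);
        // space.getVariable("Internal.Hadoop.TaskId")     -> "job_201208090841_0042"
        // space.getVariable("Internal.Hadoop.NodeNumber") -> "42"
        //   (the text after the last underscore, with leading zeroes stripped)
    }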

    /**
     * @return the current working directory for this JVM.
     */
    public static String getWorkingDir() {
        return System.getProperty("user.dir");
    }

    /**
     * Determines the Kettle Home property to use for this invocation from the configuration provided. If it is not
     * set, the current working directory will be used.
     *
     * @param conf
     *          Configuration to check for Kettle Home to be set in.
     * @return The Kettle Home directory to use
     */
    public static String getKettleHomeProperty(Configuration conf) {
        String kettleHome = conf.get(PROPERTY_PENTAHO_KETTLE_HOME);
        if (StringUtils.isEmpty(kettleHome)) {
            return getWorkingDir();
        }
        return kettleHome;
    }

    /**
     * Determines the metastore directory to use for this invocation from the configuration provided. If it is not
     * set, the current working directory will be used.
     *
     * @param conf
     *          Configuration to check for the metastore directory to be set in.
     * @return The metastore directory to use
     */
    public static String getMetastoreDirProperty(Configuration conf) {
        String metaStoreDir = conf.get(PROPERTY_PENTAHO_KETTLE_METASTORE_DIR);
        if (StringUtils.isEmpty(metaStoreDir)) {
            return getWorkingDir();
        }
        return metaStoreDir;
    }

    /**
     * Builds a comma-separated list of paths to load Kettle plugins from. To be used as the value for the System
     * property {@link Const#PLUGIN_BASE_FOLDERS_PROP}.
     *
     * @param conf
     *          Configuration to retrieve properties from
     * @return Comma-separated list of paths to look for Kettle plugins in
     */
    public static String getPluginDirProperty(final Configuration conf) throws KettleException {
        // Load plugins from the directory specified in the configuration
        String kettlePluginDir = conf.get(PROPERTY_PENTAHO_KETTLE_PLUGINS_DIR);
        if (StringUtils.isEmpty(kettlePluginDir)) {
            kettlePluginDir = getDefaultPluginDirs();
        }
        return kettlePluginDir;
    }

    /**
     * Returns a comma-separated list of default paths to load Kettle plugins from.
     *
     * @return Comma-separated list of default Kettle plugin directories
     */
    private static String getDefaultPluginDirs() {
        return new StringBuilder().append(Const.DEFAULT_PLUGIN_BASE_FOLDERS).append(",").append(getWorkingDir())
                .append(Const.FILE_SEPARATOR).append("plugins").toString();
    }
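
    /*
     * For illustration (values are assumptions): with a working directory of
     * /home/user/job, getDefaultPluginDirs() would return something like
     *   Const.DEFAULT_PLUGIN_BASE_FOLDERS + ",/home/user/job/plugins"
     * i.e. the built-in Kettle plugin folders plus a "plugins" folder under
     * the current working directory.
     */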

    /**
     * Create a fresh {@code Trans} execution instance from the metadata and parent of {@code trans}.
     */
    public static Trans recreateTrans(Trans trans) {
        return new Trans(trans.getTransMeta(), trans.getParent());
    }

    public static String getStackTrace(Throwable t) {
        StringWriter stringWriter = new StringWriter();
        PrintWriter printWriter = new PrintWriter(stringWriter, true);
        t.printStackTrace(printWriter);
        printWriter.flush();
        stringWriter.flush();
        return stringWriter.toString();
    }

    public static void logMessage(String message) {
        logMessage(Thread.currentThread().hashCode(), message);
    }

    public static void logMessage(Throwable t) {
        logMessage(Thread.currentThread().hashCode(), getStackTrace(t));
    }

    public static void logMessage(String message, Throwable t) {
        logMessage(Thread.currentThread().hashCode(), message);
        logMessage(Thread.currentThread().hashCode(), getStackTrace(t));
    }

    public static void logMessage(int id, String message) {
        logMessage(Integer.toString(id), message);
    }

    public static void logMessage(int id, Throwable t) {
        logMessage(Integer.toString(id), getStackTrace(t));
    }

    public static void logMessage(int id, String message, Throwable t) {
        logMessage(Integer.toString(id), message);
        logMessage(Integer.toString(id), getStackTrace(t));
    }

    public static void logMessage(String id, String message) {
        // Append to a fixed log file; failures are deliberately swallowed so that
        // logging can never break the task itself.
        try (FileOutputStream fos = new FileOutputStream("/tmp/PDIMapReduce.log", true)) { //$NON-NLS-1$
            if (id != null) {
                fos.write((id + ": ").getBytes()); //$NON-NLS-1$
            }
            fos.write(message.getBytes());
            fos.write(System.getProperty("line.separator").getBytes()); //$NON-NLS-1$
        } catch (Throwable t) {
            // ignore
        }
    }
}