org.eobjects.datacleaner.monitor.pentaho.PentahoJobEngine.java Source code

Java tutorial

Introduction

Here is the source code for org.eobjects.datacleaner.monitor.pentaho.PentahoJobEngine.java

Source

/**
 * DataCleaner (community edition)
 * Copyright (C) 2014 Neopost - Customer Information Management
 *
 * This copyrighted material is made available to anyone wishing to use, modify,
 * copy, or redistribute it subject to the terms and conditions of the GNU
 * Lesser General Public License, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 * for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this distribution; if not, write to:
 * Free Software Foundation, Inc.
 * 51 Franklin Street, Fifth Floor
 * Boston, MA  02110-1301  USA
 */
package org.eobjects.datacleaner.monitor.pentaho;

import java.io.StringWriter;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Map;

import javax.xml.transform.OutputKeys;
import javax.xml.transform.Source;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import org.apache.http.HttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.util.EntityUtils;
import org.eobjects.analyzer.data.InputColumn;
import org.eobjects.analyzer.data.MockInputColumn;
import org.eobjects.analyzer.job.ComponentJob;
import org.eobjects.analyzer.result.AnalysisResult;
import org.eobjects.analyzer.util.StringUtils;
import org.eobjects.datacleaner.monitor.configuration.ResultContext;
import org.eobjects.datacleaner.monitor.configuration.TenantContext;
import org.eobjects.datacleaner.monitor.job.ExecutionLogger;
import org.eobjects.datacleaner.monitor.job.JobEngine;
import org.eobjects.datacleaner.monitor.job.MetricJobContext;
import org.eobjects.datacleaner.monitor.job.MetricJobEngine;
import org.eobjects.datacleaner.monitor.job.MetricValues;
import org.eobjects.datacleaner.monitor.pentaho.jaxb.PentahoJobType;
import org.eobjects.datacleaner.monitor.scheduling.model.ExecutionLog;
import org.eobjects.datacleaner.monitor.server.DefaultMetricValues;
import org.eobjects.datacleaner.monitor.server.job.AbstractJobEngine;
import org.eobjects.datacleaner.monitor.shared.model.MetricIdentifier;
import org.eobjects.datacleaner.repository.RepositoryFile;
import org.apache.metamodel.util.CollectionUtils;
import org.apache.metamodel.util.Func;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Component;
import org.springframework.util.xml.DomUtils;
import org.w3c.dom.Document;
import org.w3c.dom.Element;

/**
 * A {@link JobEngine} for running Pentaho Data Integration (aka. Kettle) jobs
 * from within DataCleaner.
 *
 * The engine talks to a Carte server through a {@link PentahoCarteClient}: it
 * resolves the transformation to run, fires a 'startTrans' request and then
 * polls 'transStatus' until the transformation is no longer reported as
 * running.
 */
@Component
public class PentahoJobEngine extends AbstractJobEngine<PentahoJobContext>
        implements MetricJobEngine<PentahoJobContext> {

    private static final Logger logger = LoggerFactory.getLogger(PentahoJobEngine.class);

    /**
     * File extension of the job files handled by this engine.
     */
    public static final String EXTENSION = ".pentaho.job.xml";

    /**
     * Defines the interval duration when polling for updates from the
     * transformation
     */
    private static final int POLL_INTERVAL_MILLIS = 800;

    /**
     * Defines the duration between adding a progress update in the execution
     * log.
     */
    private static final int PROGRESS_UPDATE_INTERVAL_MILLIS = 10000;

    /**
     * Creates the engine and registers {@link #EXTENSION} as its job file
     * extension.
     */
    public PentahoJobEngine() {
        super(EXTENSION);
    }

    /**
     * @return the job type identifier of this engine, "PentahoJob".
     */
    @Override
    public String getJobType() {
        return "PentahoJob";
    }

    /**
     * Executes a Pentaho transformation on the Carte server and blocks until
     * it has finished or failed. The outcome is reported through the
     * {@link ExecutionLogger}; this method itself returns normally in both
     * cases.
     * 
     * @param tenantContext
     *            the tenant that owns the job
     * @param execution
     *            the execution log identifying the job to run
     * @param executionLogger
     *            receives log messages and the final status
     * @param variables
     *            not used by this engine
     * @throws Exception
     *             if communication with Carte fails or the polling thread is
     *             interrupted
     */
    @Override
    public void executeJob(TenantContext tenantContext, ExecutionLog execution, ExecutionLogger executionLogger,
            Map<String, String> variables) throws Exception {
        final PentahoJobContext jobContext = getJobContext(tenantContext, execution.getJob());
        final PentahoJobType pentahoJobType = jobContext.getPentahoJobType();

        final PentahoCarteClient carteClient = new PentahoCarteClient(pentahoJobType);

        if (!fillMissingDetails(carteClient, pentahoJobType, executionLogger)) {
            return;
        }

        if (!startTrans(carteClient, pentahoJobType, executionLogger)) {
            return;
        }

        long lastStatusUpdateTime = System.currentTimeMillis();
        boolean running = true;
        while (running) {
            Thread.sleep(POLL_INTERVAL_MILLIS);

            // only write a progress entry to the execution log every
            // PROGRESS_UPDATE_INTERVAL_MILLIS, even though we poll more often
            final long now = System.currentTimeMillis();
            final boolean progressUpdate = now - lastStatusUpdateTime > PROGRESS_UPDATE_INTERVAL_MILLIS;
            if (progressUpdate) {
                lastStatusUpdateTime = now;
            }

            running = transStatus(carteClient, pentahoJobType, executionLogger, tenantContext, execution,
                    progressUpdate);
        }
    }

    /**
     * Fills in any missing details of the request. If either the id or the
     * name of the transformation is missing, the transformations available on
     * the Carte server are queried and the first match is used to complete the
     * {@link PentahoJobType}.
     * 
     * @param carteClient
     *            the client used for querying available transformations
     * @param pentahoJobType
     *            the job definition to complete
     * @param executionLogger
     *            receives the identification message or the failure status
     * @return true if the job should continue, or false if it has been
     *         aborted/failed.
     * @throws Exception
     *             if querying the Carte server fails
     */
    private boolean fillMissingDetails(PentahoCarteClient carteClient, PentahoJobType pentahoJobType,
            ExecutionLogger executionLogger) throws Exception {
        final String queriedTransformationId = pentahoJobType.getTransformationId();
        final String queriedTransformationName = pentahoJobType.getTransformationName();
        if (!StringUtils.isNullOrEmpty(queriedTransformationId)
                && !StringUtils.isNullOrEmpty(queriedTransformationName)) {
            // both 'id' and 'name' of transformation is filled already.
            return true;
        }

        final List<PentahoTransformation> availableTransformations = carteClient.getAvailableTransformations();
        for (PentahoTransformation candidate : availableTransformations) {
            if (candidate.matches(queriedTransformationId, queriedTransformationName)) {
                pentahoJobType.setTransformationId(candidate.getId());
                pentahoJobType.setTransformationName(candidate.getName());
                executionLogger.log(
                        "Identified transformation: name=" + candidate.getName() + ", id=" + candidate.getId());
                return true;
            }
        }

        executionLogger.setStatusFailed(null, null,
                new PentahoJobException("Carte did not present any transformations with id='"
                        + queriedTransformationId + "' or name='" + queriedTransformationName + "'"));
        return false;
    }

    /**
     * Fires the HTTP request to the Carte server to get the updated status of
     * the execution
     * 
     * @param carteClient
     *            the client used for the request
     * @param pentahoJobType
     *            the job definition being executed
     * @param executionLogger
     *            receives progress messages and the final status
     * @param tenantContext
     *            not used by this method
     * @param execution
     *            not used by this method
     * @param progressUpdate
     *            whether a progress entry should be logged if the
     *            transformation is still running
     * @return true if the transformation is still running, false if it has
     *         finished or failed (in which case the status has been set on the
     *         execution logger)
     * @throws Exception
     *             if the request or the parsing of the response fails
     */
    private boolean transStatus(PentahoCarteClient carteClient, PentahoJobType pentahoJobType,
            ExecutionLogger executionLogger, TenantContext tenantContext, ExecutionLog execution,
            boolean progressUpdate) throws Exception {
        final String transStatusUrl = carteClient.getUrl("transStatus");
        final HttpGet request = new HttpGet(transStatusUrl);
        try {
            final HttpResponse response = carteClient.execute(request);
            final int statusCode = response.getStatusLine().getStatusCode();
            if (statusCode != 200) {
                logResponseBody(response, executionLogger);
                executionLogger.setStatusFailed(null, transStatusUrl, new PentahoJobException(
                        "Unexpected response status when updating transformation status: " + statusCode));
                return false;
            }

            final Document doc = carteClient.parse(response.getEntity());
            final Element webresultElement = doc.getDocumentElement();

            // null-safe: a missing 'status_desc' element yields null and ends
            // up in the "unexpected status_desc" branch below
            final String statusDescription = DomUtils.getChildElementValueByTagName(webresultElement,
                    "status_desc");

            if ("Running".equalsIgnoreCase(statusDescription)) {
                // the job is still running
                if (progressUpdate) {
                    logTransStatus("progress", pentahoJobType, executionLogger, doc);
                }
                return true;
            }

            if ("Waiting".equalsIgnoreCase(statusDescription)) {
                // the job has finished - serialize and return successfully
                logTransStatus("finished", pentahoJobType, executionLogger, doc);

                final String documentString = createDocumentString(doc);
                final PentahoJobResult result = new PentahoJobResult(documentString);

                executionLogger.setStatusSuccess(result);
                return false;
            }

            if ("Paused".equalsIgnoreCase(statusDescription)) {
                executionLogger.setStatusFailed(null, transStatusUrl,
                        new PentahoJobException("The transformation was paused by a third-party actor"));
                return false;
            }

            executionLogger.setStatusFailed(null, transStatusUrl, new PentahoJobException(
                    "Encountered unexpected status_desc from Carte when updating transformation status: "
                            + statusDescription));
            return false;
        } finally {
            request.releaseConnection();
        }
    }

    /**
     * Writes the body of an (unexpected) HTTP response to the execution log.
     * Does nothing if the response carries no entity.
     * 
     * @param response
     *            the response whose body should be logged
     * @param executionLogger
     *            the logger to write to
     * @throws Exception
     *             if the response body cannot be read
     */
    private void logResponseBody(HttpResponse response, ExecutionLogger executionLogger) throws Exception {
        if (response.getEntity() == null) {
            return;
        }
        final String responseString = EntityUtils.toString(response.getEntity());
        executionLogger.log(responseString);
    }

    /**
     * Serializes a DOM document to an XML string using the transformer from
     * {@link #getTransformer()}.
     * 
     * @param doc
     *            the document to serialize
     * @return the XML string
     * @throws PentahoJobException
     *             if the transformation to a string fails
     */
    private String createDocumentString(Document doc) {
        final Transformer transformer = getTransformer();
        final Source source = new DOMSource(doc);
        final StringWriter outText = new StringWriter();
        final StreamResult target = new StreamResult(outText);
        try {
            transformer.transform(source, target);
            return outText.toString();
        } catch (TransformerException e) {
            throw new PentahoJobException("Failed to build XML string", e);
        }
    }

    /**
     * Logs the progress of a job in Carte based on the XML response of a
     * 'transStatus' call. One log line is written per step found in the
     * 'stepstatuslist' element, after which the log is flushed.
     * 
     * @param statusType
     *            the type of status update - expecting a word to put into an
     *            update sentence like 'progress' or 'finished'.
     * @param pentahoJobType
     *            not used by this method
     * @param executionLogger
     *            the logger to write the step updates to
     * @param document
     *            the parsed 'transStatus' response
     */
    private void logTransStatus(String statusType, PentahoJobType pentahoJobType, ExecutionLogger executionLogger,
            Document document) {
        final Element transstatusElement = document.getDocumentElement();
        final Element stepstatuslistElement = DomUtils.getChildElementByTagName(transstatusElement,
                "stepstatuslist");

        // DomUtils.getChildElements(..) rejects null, so guard against a
        // response without any step status list
        if (stepstatuslistElement != null) {
            final List<Element> stepstatusElements = DomUtils.getChildElements(stepstatuslistElement);
            for (Element stepstatusElement : stepstatusElements) {
                final String stepName = DomUtils.getChildElementValueByTagName(stepstatusElement, "stepname");
                final String statusDescription = DomUtils.getChildElementValueByTagName(stepstatusElement,
                        "statusDescription");

                final StringBuilder update = new StringBuilder();
                update.append("Step '");
                update.append(stepName);
                update.append("' ");
                update.append(statusType);
                update.append(": status='");
                update.append(statusDescription);
                update.append("'");

                appendLineCount(update, "linesRead",
                        DomUtils.getChildElementValueByTagName(stepstatusElement, "linesRead"));
                appendLineCount(update, "linesWritten",
                        DomUtils.getChildElementValueByTagName(stepstatusElement, "linesWritten"));
                appendLineCount(update, "linesInput",
                        DomUtils.getChildElementValueByTagName(stepstatusElement, "linesInput"));
                appendLineCount(update, "linesOutput",
                        DomUtils.getChildElementValueByTagName(stepstatusElement, "linesOutput"));

                executionLogger.log(update.toString());
            }
        }
        executionLogger.flushLog();
    }

    /**
     * Appends ", label=value" to a step update unless the value is missing or
     * "0".
     * 
     * @param update
     *            the message being built
     * @param label
     *            the counter name to print
     * @param value
     *            the counter value as reported by Carte, or null if absent
     */
    private static void appendLineCount(StringBuilder update, String label, String value) {
        if (value == null || "0".equals(value)) {
            return;
        }
        update.append(", ");
        update.append(label);
        update.append("=");
        update.append(value);
    }

    /**
     * Creates the {@link Transformer} used for serializing Carte responses.
     * The transformer is configured to indent its output.
     * 
     * @return a new transformer
     * @throws IllegalStateException
     *             if the transformer cannot be created
     */
    protected Transformer getTransformer() {
        try {
            final TransformerFactory transformerFactory = TransformerFactory.newInstance();
            final Transformer transformer = transformerFactory.newTransformer();
            transformer.setOutputProperty(OutputKeys.INDENT, "yes");
            return transformer;
        } catch (TransformerConfigurationException e) {
            throw new IllegalStateException(e);
        }
    }

    /**
     * Fires the HTTP request to Carte to start processing the transformation.
     * 
     * @param carteClient
     *            the client used for the request
     * @param pentahoJobType
     *            the job definition being executed
     * @param executionLogger
     *            receives the message from Carte and the resulting status
     * @return true if Carte reported an 'OK' result and the execution has been
     *         marked as running, false if the start failed (in which case the
     *         failure status has been set on the execution logger)
     * @throws Exception
     *             if the request or the parsing of the response fails
     */
    private boolean startTrans(PentahoCarteClient carteClient, PentahoJobType pentahoJobType,
            ExecutionLogger executionLogger) throws Exception {
        final String startTransUrl = carteClient.getUrl("startTrans");
        final HttpGet request = new HttpGet(startTransUrl);
        try {
            final HttpResponse response = carteClient.execute(request);
            final int statusCode = response.getStatusLine().getStatusCode();
            if (statusCode != 200) {
                logResponseBody(response, executionLogger);
                executionLogger.setStatusFailed(null, startTransUrl, new PentahoJobException(
                        "Unexpected response status when starting transformation: " + statusCode));
                return false;
            }

            final Document doc = carteClient.parse(response.getEntity());
            final Element webresultElement = doc.getDocumentElement();

            // null-safe lookups: missing elements yield null instead of
            // failing with a NullPointerException
            final String message = DomUtils.getChildElementValueByTagName(webresultElement, "message");
            if (!StringUtils.isNullOrEmpty(message)) {
                executionLogger.log(message);
            }

            final String result = DomUtils.getChildElementValueByTagName(webresultElement, "result");
            if ("OK".equalsIgnoreCase(result)) {
                executionLogger.setStatusRunning();
                executionLogger.flushLog();
                return true;
            }

            if ("ERROR".equalsIgnoreCase(result)) {
                executionLogger.setStatusFailed(null, startTransUrl, new PentahoJobException(
                        "The Carte service reported an 'ERROR' result when starting transformation"));
                return false;
            }

            executionLogger.setStatusFailed(null, startTransUrl, new PentahoJobException(
                    "Encountered unexpected result from Carte when starting transformation: " + result));
            return false;
        } finally {
            request.releaseConnection();
        }
    }

    /**
     * Creates the {@link PentahoJobContext} for a job file of the given
     * tenant.
     */
    @Override
    protected PentahoJobContext getJobContext(TenantContext tenantContext, RepositoryFile file) {
        return new PentahoJobContext(tenantContext, this, file);
    }

    /**
     * Creates the metric values for a result, backed by
     * {@link DefaultMetricValues}.
     */
    @Override
    public MetricValues getMetricValues(MetricJobContext job, ResultContext result,
            List<MetricIdentifier> metricIdentifiers) {
        final AnalysisResult analysisResult = result.getAnalysisResult();
        return new DefaultMetricValues(this, job, metricIdentifiers, analysisResult);
    }

    /**
     * @return always an empty collection, since this engine has no string
     *         parameterized metrics.
     */
    @Override
    public Collection<String> getMetricParameterSuggestions(MetricJobContext job, ResultContext result,
            MetricIdentifier metricIdentifier) {
        // no string parameterized metrics available
        return Collections.emptyList();
    }

    /**
     * Gets the step names of the transformation, wrapped as
     * {@link InputColumn}s. The step names are read from the latest result of
     * the job if there is one, and otherwise from a 'transStatus' request to
     * the Carte server.
     * 
     * @param job
     *            the job to get step names for
     * @param component
     *            not used by this engine
     * @return the step names as input columns, or an empty collection if they
     *         could not be determined because of a {@link RuntimeException}
     */
    @Override
    public Collection<InputColumn<?>> getMetricParameterColumns(MetricJobContext job, ComponentJob component) {
        try {
            final ResultContext result = job.getTenantContext().getLatestResult(job);

            final PentahoJobResult pentahoJobResult;
            if (result == null) {
                // no step names available from previous results - we'll have to
                // query the Carte server.
                final PentahoJobContext pentahoJobContext = (PentahoJobContext) job;
                final PentahoJobType pentahoJobType = pentahoJobContext.getPentahoJobType();
                final PentahoCarteClient client = new PentahoCarteClient(pentahoJobType);
                final HttpGet request = new HttpGet(client.getUrl("transStatus"));
                final String documentString;
                try {
                    final HttpResponse response = client.execute(request);
                    final Document document = client.parse(response.getEntity());
                    documentString = createDocumentString(document);
                } finally {
                    // always hand the connection back, also when the request
                    // or the parsing fails
                    request.releaseConnection();
                }
                pentahoJobResult = new PentahoJobResult(documentString);
            } else {
                // we'll fetch the step names locally from the result file
                final AnalysisResult analysisResult = result.getAnalysisResult();
                pentahoJobResult = (PentahoJobResult) analysisResult.getResults().get(0);
            }

            final Collection<String> stepNames = pentahoJobResult.getStepNames();
            final List<InputColumn<?>> inputColumns = CollectionUtils.map(stepNames,
                    new Func<String, InputColumn<?>>() {
                        @Override
                        public InputColumn<?> eval(String stepName) {
                            return new MockInputColumn<String>(stepName);
                        }
                    });
            return inputColumns;
        } catch (RuntimeException e) {
            logger.warn("Failed to get step names as InputColumn parameter list", e);
            return Collections.emptyList();
        }
    }

    /**
     * Cancellation is not implemented by this engine.
     * 
     * @return always false
     */
    @Override
    public boolean cancelJob(TenantContext tenantContext, ExecutionLog executionLog) {
        return false;
    }
}