/*******************************************************************************
 *
 * Pentaho Big Data
 *
 * Copyright (C) 2002-2017 by Hitachi Vantara : http://www.pentaho.com
 *
 *******************************************************************************
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 ******************************************************************************/

package org.pentaho.hadoop.mapreduce;

import com.thoughtworks.xstream.XStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapRunnable;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.pentaho.di.core.Const;
import org.pentaho.di.core.exception.KettleException;
import org.pentaho.di.core.logging.KettleLogStore;
import org.pentaho.di.core.logging.KettleLoggingEvent;
import org.pentaho.di.core.logging.LogLevel;
import org.pentaho.di.core.row.RowMeta;
import org.pentaho.di.core.variables.VariableSpace;
import org.pentaho.di.core.variables.Variables;
import org.pentaho.di.trans.RowProducer;
import org.pentaho.di.trans.Trans;
import org.pentaho.di.trans.step.BaseStepMeta;
import org.pentaho.di.trans.step.StepInterface;
import org.pentaho.di.trans.step.StepMeta;
import org.pentaho.di.trans.step.StepMetaInterface;
import org.pentaho.di.trans.steps.missing.MissingTrans;
import org.pentaho.hadoop.mapreduce.converter.TypeConverterFactory;
import org.pentaho.hadoop.mapreduce.converter.spi.ITypeConverter;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.Map.Entry;
import java.util.UUID;
import java.util.concurrent.TimeUnit;

/**
 * Map runner that uses the normal Kettle execution engine to process all input data during one single run.<p>
 * This relies on newly un-@Deprecated interfaces ({@link MapRunnable}, {@link JobConf}) in Hadoop 0.21.0.
 */
public class PentahoMapRunnable<K1, V1, K2, V2> implements MapRunnable<K1, V1, K2, V2> {

  public static final String KETTLE_PMR_PLUGIN_TIMEOUT = "KETTLE_PMR_PLUGIN_TIMEOUT";

  private long pluginWaitTimeout;

  protected static enum Counter {
    INPUT_RECORDS, OUTPUT_RECORDS, OUT_RECORD_WITH_NULL_KEY, OUT_RECORD_WITH_NULL_VALUE
  }

  protected String transMapXml;
  protected String transReduceXml;
  protected String mapInputStepName;
  protected String reduceInputStepName;
  protected String mapOutputStepName;
  protected String reduceOutputStepName;
  protected Class<K2> outClassK;
  protected Class<V2> outClassV;
  protected String id = UUID.randomUUID().toString();
  protected boolean debug = false;

  // the transformation that will be used as a mapper or reducer
  protected Trans trans;
  protected VariableSpace variableSpace = null;
  protected LogLevel logLevel;
  protected OutputCollectorRowListener<K2, V2> rowCollector;

  private final String ENVIRONMENT_VARIABLE_PREFIX = "java.system.";
  private final String KETTLE_VARIABLE_PREFIX = "KETTLE_";

  public PentahoMapRunnable() throws KettleException {
  }

  public void configure(JobConf job) {
    pluginWaitTimeout = TimeUnit.MINUTES.toMillis(5);

    debug = "true".equalsIgnoreCase(job.get("debug")); //$NON-NLS-1$

    transMapXml = job.get("transformation-map-xml");
    transReduceXml = job.get("transformation-reduce-xml");
    mapInputStepName = job.get("transformation-map-input-stepname");
    mapOutputStepName = job.get("transformation-map-output-stepname");
    reduceInputStepName = job.get("transformation-reduce-input-stepname");
    reduceOutputStepName = job.get("transformation-reduce-output-stepname");
    String xmlVariableSpace = job.get("variableSpace");

    outClassK = (Class<K2>) job.getMapOutputKeyClass();
    outClassV = (Class<V2>) job.getMapOutputValueClass();
    if (!Const.isEmpty(xmlVariableSpace)) {
      setDebugStatus("PentahoMapRunnable(): variableSpace was retrieved from the job. The contents: ");
      setDebugStatus(xmlVariableSpace);

      // deserialize from xml to variable space
      XStream xStream = new XStream();

      setDebugStatus("PentahoMapRunnable(): Setting classes variableSpace property.: ");
      variableSpace = (VariableSpace) xStream.fromXML(xmlVariableSpace);

      for (String variableName : variableSpace.listVariables()) {
        if (variableName.startsWith(KETTLE_VARIABLE_PREFIX)) {
          System.setProperty(variableName, variableSpace.getVariable(variableName));
        }
        if (KETTLE_PMR_PLUGIN_TIMEOUT.equals(variableName)) {
          try {
            pluginWaitTimeout = Long.parseLong(variableSpace.getVariable(variableName));
          } catch (Exception e) {
            System.out.println("Unable to parse plugin wait timeout, defaulting to 5 minutes");
          }
        }
      }
    } else {
      setDebugStatus("PentahoMapRunnable(): The PDI Job's variable space was not sent.");
      variableSpace = new Variables();
    }

    // Check for environment variables in the userDefined variables
    Iterator<Entry<String, String>> iter = job.iterator();
    while (iter.hasNext()) {
      Entry<String, String> entry = iter.next();
      if (entry.getKey().startsWith(ENVIRONMENT_VARIABLE_PREFIX)) {
        System.setProperty(entry.getKey().substring(ENVIRONMENT_VARIABLE_PREFIX.length()), entry.getValue());
      } else if (entry.getKey().startsWith(KETTLE_VARIABLE_PREFIX)) {
        System.setProperty(entry.getKey(), entry.getValue());
      }
    }

    MRUtil.passInformationToTransformation(variableSpace, job);

    setDebugStatus("Job configuration");
    setDebugStatus("Output key class: " + outClassK.getName());
    setDebugStatus("Output value class: " + outClassV.getName());

    // set the log level to what the level of the job is
    String stringLogLevel = job.get("logLevel");
    if (!Const.isEmpty(stringLogLevel)) {
      logLevel = LogLevel.valueOf(stringLogLevel);
      setDebugStatus("Log level set to " + stringLogLevel);
    } else {
      System.out.println("Could not retrieve the log level from the job configuration. logLevel will not be set.");
    }
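
    // The loop below repeatedly rebuilds the transformation until none of its steps resolve to
    // MissingTrans, because step plugins may still be registering when configure() first runs.
    // How long we poll is governed by the KETTLE_PMR_PLUGIN_TIMEOUT variable (milliseconds);
    // for example, putting KETTLE_PMR_PLUGIN_TIMEOUT=600000 in the PDI job's variable space
    // would extend the wait to ten minutes instead of the five-minute default. (That value is
    // illustrative, not taken from the original source.)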
    long deadline = 0;
    boolean first = true;
    while (true) {
      createTrans(job);

      if (first) {
        deadline = pluginWaitTimeout + System.currentTimeMillis();
        System.out.println(
          PentahoMapRunnable.class + ": Trans creation checking starting now " + new Date().toString());
        first = false;
      }

      List<MissingTrans> missingTranses = new ArrayList<MissingTrans>();
      for (StepMeta stepMeta : trans.getTransMeta().getSteps()) {
        StepMetaInterface stepMetaInterface = stepMeta.getStepMetaInterface();
        if (stepMetaInterface instanceof MissingTrans) {
          MissingTrans missingTrans = (MissingTrans) stepMetaInterface;
          System.out.println(MissingTrans.class + "{stepName: " + missingTrans.getStepName()
            + ", missingPluginId: " + missingTrans.getMissingPluginId() + "}");
          missingTranses.add(missingTrans);
        }
      }

      if (missingTranses.size() == 0) {
        System.out.println(PentahoMapRunnable.class + ": Done waiting on plugins now " + new Date().toString());
        break;
      } else {
        if (System.currentTimeMillis() > deadline) {
          StringBuilder stringBuilder = new StringBuilder("Failed to initialize plugins: ");
          for (MissingTrans missingTrans : missingTranses) {
            stringBuilder.append(missingTrans.getMissingPluginId());
            stringBuilder.append(" on step ").append(missingTrans.getStepName());
            stringBuilder.append(", ");
          }
          stringBuilder.setLength(stringBuilder.length() - 2);
          throw new RuntimeException(stringBuilder.toString());
        } else {
          try {
            Thread.sleep(Math.min(100, deadline - System.currentTimeMillis()));
          } catch (InterruptedException e) {
            throw new RuntimeException(e);
          }
        }
      }
    }
  }

  public void injectValue(Object key, ITypeConverter inConverterK, Object value, ITypeConverter inConverterV,
                          RowMeta injectorRowMeta, RowProducer rowProducer, Reporter reporter) throws Exception {
    injectValue(key, 0, inConverterK, value, 1, inConverterV, injectorRowMeta, rowProducer, reporter);
  }

  public void injectValue(Object key, int keyOrdinal, ITypeConverter inConverterK, Object value, int valueOrdinal,
                          ITypeConverter inConverterV, RowMeta injectorRowMeta, RowProducer rowProducer,
                          Reporter reporter) throws Exception {
    Object[] row = new Object[injectorRowMeta.size()];
    row[keyOrdinal] = inConverterK != null
      ? inConverterK.convert(injectorRowMeta.getValueMeta(keyOrdinal), key) : key;
    row[valueOrdinal] = inConverterV != null
      ? inConverterV.convert(injectorRowMeta.getValueMeta(valueOrdinal), value) : value;

    if (debug) {
      setDebugStatus(reporter, "Injecting input record [" + row[keyOrdinal] + "] - [" + row[valueOrdinal] + "]");
    }

    rowProducer.putRow(injectorRowMeta, row);
  }
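
  /**
   * Creates the mapper transformation from the serialized transformation XML
   * ({@code transMapXml}) carried in the job configuration.
   */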
  protected void createTrans(final Configuration conf) {
    try {
      setDebugStatus("Creating a transformation for a map.");
      trans = MRUtil.getTrans(conf, transMapXml, false);
    } catch (KettleException ke) {
      throw new RuntimeException("Error loading transformation", ke); //$NON-NLS-1$
    }
  }

  public String getTransMapXml() {
    return transMapXml;
  }

  public void setTransMapXml(String transMapXml) {
    this.transMapXml = transMapXml;
  }

  public String getTransReduceXml() {
    return transReduceXml;
  }

  public void setTransReduceXml(String transReduceXml) {
    this.transReduceXml = transReduceXml;
  }

  public String getMapInputStepName() {
    return mapInputStepName;
  }

  public void setMapInputStepName(String mapInputStepName) {
    this.mapInputStepName = mapInputStepName;
  }

  public String getMapOutputStepName() {
    return mapOutputStepName;
  }

  public void setMapOutputStepName(String mapOutputStepName) {
    this.mapOutputStepName = mapOutputStepName;
  }

  public String getReduceInputStepName() {
    return reduceInputStepName;
  }

  public void setReduceInputStepName(String reduceInputStepName) {
    this.reduceInputStepName = reduceInputStepName;
  }

  public String getReduceOutputStepName() {
    return reduceOutputStepName;
  }

  public void setReduceOutputStepName(String reduceOutputStepName) {
    this.reduceOutputStepName = reduceOutputStepName;
  }

  public Class<?> getOutClassK() {
    return outClassK;
  }

  public void setOutClassK(Class<K2> outClassK) {
    this.outClassK = outClassK;
  }

  public Class<?> getOutClassV() {
    return outClassV;
  }

  public void setOutClassV(Class<V2> outClassV) {
    this.outClassV = outClassV;
  }

  public Trans getTrans() {
    return trans;
  }

  public void setTrans(Trans trans) {
    this.trans = trans;
  }

  public String getId() {
    return id;
  }

  public void setId(String id) {
    this.id = id;
  }

  public Exception getException() {
    return rowCollector != null ? rowCollector.getException() : null;
  }

  public void setDebugStatus(Reporter reporter, String message) {
    if (debug) {
      System.out.println(message);
      reporter.setStatus(message);
    }
  }

  private void setDebugStatus(String message) {
    if (debug) {
      System.out.println(message);
    }
  }
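
  /**
   * Runs the mapper: recreates the transformation, injects each input key/value pair into the
   * configured input step, and forwards rows emitted by the output step to Hadoop's
   * {@link OutputCollector} via an {@link OutputCollectorRowListener}.
   */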
  public void run(RecordReader<K1, V1> input, final OutputCollector<K2, V2> output, final Reporter reporter)
    throws IOException {
    try {
      if (trans == null) {
        throw new RuntimeException("Error initializing transformation. See error log."); //$NON-NLS-1$
      } else {
        // Clean up old logging
        KettleLogStore.discardLines(trans.getLogChannelId(), true);
      }

      // Create a copy of trans so we don't continue to add new TransListeners and run into a
      // ConcurrentModificationException when this mapper is reused "quickly"
      trans = MRUtil.recreateTrans(trans);

      String logLinePrefix = getClass().getName() + ".run: ";
      setDebugStatus(logLinePrefix + " The transformation was just recreated.");

      // share the variables from the PDI job.
      // we do this here instead of in createTrans() as MRUtil.recreateTrans() will not
      // copy "execution" trans information.
      if (variableSpace != null) {
        setDebugStatus("Sharing the VariableSpace from the PDI job.");
        trans.shareVariablesWith(variableSpace);

        if (debug) {
          // list the variables
          List<String> variables = Arrays.asList(trans.listVariables());
          Collections.sort(variables);

          if (variables != null) {
            setDebugStatus("Variables: ");
            for (String variable : variables) {
              setDebugStatus("  " + variable + " = " + trans.getVariable(variable));
            }
          }
        }
      } else {
        setDebugStatus(reporter, "variableSpace is null. We are not going to share it with the trans.");
      }

      // set the trans' log level if we have ours set
      if (logLevel != null) {
        setDebugStatus("Setting the trans.logLevel to " + logLevel.toString());
        trans.setLogLevel(logLevel);
      } else {
        setDebugStatus("logLevel is null. The trans log level will not be set.");
      }

      // allocate key & value instances that are re-used for all entries
      K1 key = input.createKey();
      V1 value = input.createValue();

      setDebugStatus(reporter, "Preparing transformation for execution");
      trans.prepareExecution(null);

      try {
        setDebugStatus(reporter, "Locating output step: " + mapOutputStepName);
        StepInterface outputStep = trans.findRunThread(mapOutputStepName);
        if (outputStep != null) {
          rowCollector = new OutputCollectorRowListener(output, outClassK, outClassV, reporter, debug);
          // rowCollector = OutputCollectorRowListener.build(output, outputRowMeta, outClassK, outClassV,
          // reporter, debug);
          outputStep.addRowListener(rowCollector);

          RowMeta injectorRowMeta = new RowMeta();
          RowProducer rowProducer = null;
          TypeConverterFactory typeConverterFactory = new TypeConverterFactory();
          ITypeConverter inConverterK = null;
          ITypeConverter inConverterV = null;

          setDebugStatus(reporter, "Locating input step: " + mapInputStepName);
          if (mapInputStepName != null) {
            // Setup row injection
            rowProducer = trans.addRowProducer(mapInputStepName, 0);
            StepInterface inputStep = rowProducer.getStepInterface();
            StepMetaInterface inputStepMeta = inputStep.getStepMeta().getStepMetaInterface();
            InKeyValueOrdinals inOrdinals = null;
            if (inputStepMeta instanceof BaseStepMeta) {
              setDebugStatus(reporter,
                "Generating converters from RowMeta for injection into the mapper transformation");

              // Use getFields(...) to get the row meta and therefore the expected input types
              inputStepMeta.getFields(injectorRowMeta, null, null, null, null);

              inOrdinals = new InKeyValueOrdinals(injectorRowMeta);
              if (inOrdinals.getKeyOrdinal() < 0 || inOrdinals.getValueOrdinal() < 0) {
                throw new KettleException("key or value is not defined in transformation injector step");
              }

              // Get a converter for the Key if the value meta has a concrete Java class we can use.
              // If no converter can be found here we won't do any type conversion.
              if (injectorRowMeta.getValueMeta(inOrdinals.getKeyOrdinal()) != null) {
                inConverterK = typeConverterFactory.getConverter(key.getClass(),
                  injectorRowMeta.getValueMeta(inOrdinals.getKeyOrdinal()));
              }

              // Get a converter for the Value if the value meta has a concrete Java class we can use.
              // If no converter can be found here we won't do any type conversion.
              if (injectorRowMeta.getValueMeta(inOrdinals.getValueOrdinal()) != null) {
                inConverterV = typeConverterFactory.getConverter(value.getClass(),
                  injectorRowMeta.getValueMeta(inOrdinals.getValueOrdinal()));
              }
            }

            trans.startThreads();
            if (rowProducer != null) {

              while (input.next(key, value)) {
                if (inOrdinals != null) {
                  injectValue(key, inOrdinals.getKeyOrdinal(), inConverterK, value, inOrdinals.getValueOrdinal(),
                    inConverterV, injectorRowMeta, rowProducer, reporter);
                } else {
                  injectValue(key, inConverterK, value, inConverterV, injectorRowMeta, rowProducer, reporter);
                }
              }

              rowProducer.finished();
            }

            trans.waitUntilFinished();
            setDebugStatus(reporter, "Mapper transformation has finished");
            if (trans.getErrors() > 0) {
              setDebugStatus("Errors detected for mapper transformation");
              List<KettleLoggingEvent> logList = KettleLogStore
                .getLogBufferFromTo(trans.getLogChannelId(), false, 0, KettleLogStore.getLastBufferLineNr());

              StringBuffer buff = new StringBuffer();
              for (KettleLoggingEvent le : logList) {
                if (le.getLevel() == LogLevel.ERROR) {
                  buff.append(le.getMessage().toString()).append("\n");
                }
              }
              throw new Exception("Errors were detected for mapper transformation:\n\n" + buff.toString());
            }

          } else {
            setDebugStatus(reporter, "No input stepname was defined");
          }

          if (getException() != null) {
            setDebugStatus(reporter, "An exception was generated by the mapper transformation");
            // Bubble the exception from within Kettle to Hadoop
            throw getException();
          }

        } else {
          if (mapOutputStepName != null) {
            setDebugStatus(reporter, "Output step [" + mapOutputStepName + "] could not be found");
            throw new KettleException("Output step not defined in transformation");
          } else {
            setDebugStatus(reporter, "Output step name not specified");
          }
        }
      } finally {
        try {
          trans.stopAll();
        } catch (Exception ex) {
          ex.printStackTrace();
        }
        try {
          trans.cleanup();
        } catch (Exception ex) {
          ex.printStackTrace();
        }
      }
    } catch (Exception e) {
      e.printStackTrace(System.err);
      setDebugStatus(reporter, "An exception was generated by the mapper task");
      throw new IOException(e);
    }
    reporter.setStatus("Completed processing record");
  }
}
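
/**
 * A minimal driver sketch, not part of the original source: it shows how a job might select
 * {@link PentahoMapRunnable} and supply the configuration keys that {@code configure(JobConf)}
 * reads above. The class itself, the {@code mapTransformationXml} parameter, and the step names
 * are hypothetical placeholders; only the configuration keys come from this file.
 */
class PentahoMapRunnableUsageExample {

  static JobConf buildExampleJobConf(String mapTransformationXml) {
    JobConf job = new JobConf();

    // Run the Kettle engine over the whole input split via this MapRunnable.
    job.setMapRunnerClass(PentahoMapRunnable.class);

    // Serialized Kettle transformation XML plus the step names acting as the mapper's
    // input and output; these are the keys configure() reads.
    job.set("transformation-map-xml", mapTransformationXml);
    job.set("transformation-map-input-stepname", "Injector"); // hypothetical step name
    job.set("transformation-map-output-stepname", "Output"); // hypothetical step name

    job.set("debug", "true"); // enables the verbose setDebugStatus() logging
    job.set("logLevel", "BASIC"); // parsed with LogLevel.valueOf() in configure()
    return job;
  }
}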