// Java tutorial
/* * Copyright 2017 StreamSets Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.streamsets.pipeline.stage.destination.mapreduce; import com.google.common.annotations.VisibleForTesting; import com.streamsets.pipeline.api.Batch; import com.streamsets.pipeline.api.Record; import com.streamsets.pipeline.api.Stage; import com.streamsets.pipeline.api.StageException; import com.streamsets.pipeline.api.base.BaseExecutor; import com.streamsets.pipeline.api.base.OnRecordErrorException; import com.streamsets.pipeline.api.el.ELEval; import com.streamsets.pipeline.api.el.ELEvalException; import com.streamsets.pipeline.api.el.ELVars; import com.streamsets.pipeline.lib.el.RecordEL; import com.streamsets.pipeline.stage.common.DefaultErrorRecordHandler; import com.streamsets.pipeline.stage.common.ErrorRecordHandler; import com.streamsets.pipeline.stage.destination.mapreduce.config.JobConfig; import com.streamsets.pipeline.stage.destination.mapreduce.config.MapReduceConfig; import com.streamsets.pipeline.stage.destination.mapreduce.jobtype.avroconvert.AvroConversionCommonConstants; import com.streamsets.pipeline.stage.destination.mapreduce.jobtype.avroorc.AvroOrcConstants; import com.streamsets.pipeline.lib.converter.AvroParquetConstants; import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.util.ReflectionUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import 
java.io.IOException; import java.security.PrivilegedExceptionAction; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.concurrent.Callable; public class MapReduceExecutor extends BaseExecutor { private static final Logger LOG = LoggerFactory.getLogger(MapReduceExecutor.class); private final MapReduceConfig mapReduceConfig; private final JobConfig jobConfig; private ErrorRecordHandler errorRecordHandler; @VisibleForTesting public boolean waitForCompletition; public MapReduceExecutor(MapReduceConfig mapReduceConfig, JobConfig jobConfig) { this.mapReduceConfig = mapReduceConfig; this.jobConfig = jobConfig; this.waitForCompletition = false; } @Override public List<ConfigIssue> init() { List<ConfigIssue> issues = super.init(); issues.addAll(mapReduceConfig.init(getContext(), "mapReduceConfig")); issues.addAll(jobConfig.init(getContext(), "jobConfig")); errorRecordHandler = new DefaultErrorRecordHandler(getContext()); return issues; } /** * Handy class to keep track of various ELs with the shared variables object. 
*/ private static class EvalContext { private ELVars variables; private Map<String, ELEval> evals; private Stage.Context context; public EvalContext(Stage.Context context) { this.context = context; this.variables = context.createELVars(); this.evals = new HashMap<>(); } public void setRecord(Record record) { RecordEL.setRecordInContext(variables, record); } public String evaluateToString(String name, String expr, boolean failOnEmptyString) throws ELEvalException { String evaluated = evaluate(name, expr, String.class); if (failOnEmptyString && StringUtils.isEmpty(evaluated)) { throw new ELEvalException(MapReduceErrors.MAPREDUCE_0007, expr, name); } return evaluated; } public <T> T evaluate(String name, String expr, Class<T> klass) throws ELEvalException { return getEval(name).eval(variables, expr, klass); } public ELEval getEval(String name) { if (evals.containsKey(name)) { return evals.get(name); } ELEval eval = context.createELEval(name); evals.put(name, eval); return eval; } } @Override public void write(Batch batch) throws StageException { EvalContext eval = new EvalContext(getContext()); Iterator<Record> it = batch.getRecords(); while (it.hasNext()) { final Record record = it.next(); eval.setRecord(record); Job job = null; try { // Job configuration object is a clone of the original one that we're keeping in mapReduceConfig class final Configuration jobConfiguration = new Configuration(mapReduceConfig.getConfiguration()); // Evaluate all dynamic properties and store them in the configuration job for (Map.Entry<String, String> entry : jobConfig.jobConfigs.entrySet()) { String key = eval.evaluateToString("jobConfigs", entry.getKey(), true); String value = eval.evaluateToString("jobConfigs", entry.getValue(), false); jobConfiguration.set(key, value); } // For build-in job creators, evaluate their properties and persist them in the MR config switch (jobConfig.jobType) { case AVRO_PARQUET: jobConfiguration.set(AvroConversionCommonConstants.INPUT_FILE, eval 
.evaluateToString("inputFile", jobConfig.avroConversionCommonConfig.inputFile, true)); jobConfiguration.set(AvroConversionCommonConstants.OUTPUT_DIR, eval.evaluateToString( "outputDirectory", jobConfig.avroConversionCommonConfig.outputDirectory, true)); jobConfiguration.setBoolean(AvroConversionCommonConstants.KEEP_INPUT_FILE, jobConfig.avroConversionCommonConfig.keepInputFile); jobConfiguration.set(AvroParquetConstants.COMPRESSION_CODEC_NAME, eval.evaluateToString( "compressionCodec", jobConfig.avroParquetConfig.compressionCodec, false)); jobConfiguration.setInt(AvroParquetConstants.ROW_GROUP_SIZE, jobConfig.avroParquetConfig.rowGroupSize); jobConfiguration.setInt(AvroParquetConstants.PAGE_SIZE, jobConfig.avroParquetConfig.pageSize); jobConfiguration.setInt(AvroParquetConstants.DICTIONARY_PAGE_SIZE, jobConfig.avroParquetConfig.dictionaryPageSize); jobConfiguration.setInt(AvroParquetConstants.MAX_PADDING_SIZE, jobConfig.avroParquetConfig.maxPaddingSize); jobConfiguration.setBoolean(AvroConversionCommonConstants.OVERWRITE_TMP_FILE, jobConfig.avroConversionCommonConfig.overwriteTmpFile); break; case AVRO_ORC: jobConfiguration.set(AvroConversionCommonConstants.INPUT_FILE, eval .evaluateToString("inputFile", jobConfig.avroConversionCommonConfig.inputFile, true)); jobConfiguration.set(AvroConversionCommonConstants.OUTPUT_DIR, eval.evaluateToString( "outputDirectory", jobConfig.avroConversionCommonConfig.outputDirectory, true)); jobConfiguration.setBoolean(AvroConversionCommonConstants.KEEP_INPUT_FILE, jobConfig.avroConversionCommonConfig.keepInputFile); jobConfiguration.setBoolean(AvroConversionCommonConstants.OVERWRITE_TMP_FILE, jobConfig.avroConversionCommonConfig.overwriteTmpFile); jobConfiguration.setInt(AvroOrcConstants.ORC_BATCH_SIZE, jobConfig.avroOrcConfig.orcBatchSize); break; case CUSTOM: // Nothing because custom is generic one that have no special config properties break; default: throw new UnsupportedOperationException("Unsupported JobType: " + 
jobConfig.jobType); } job = createAndSubmitJob(jobConfiguration); } catch (IOException | InterruptedException | ELEvalException e) { LOG.error("Can't submit mapreduce job", e); errorRecordHandler.onError( new OnRecordErrorException(record, MapReduceErrors.MAPREDUCE_0005, e.getMessage(), e)); } if (job != null) { MapReduceExecutorEvents.JOB_CREATED.create(getContext()).with("tracking-url", job.getTrackingURL()) .with("job-id", job.getJobID().toString()).createAndSend(); } } } private Job createAndSubmitJob(final Configuration configuration) throws IOException, InterruptedException { return mapReduceConfig.getUGI().doAs((PrivilegedExceptionAction<Job>) () -> { // Create new mapreduce job object Callable<Job> jobCreator = ReflectionUtils.newInstance(jobConfig.getJobCreator(), configuration); Job job = jobCreator.call(); // In trace mode, dump all the configuration that we're using for the job if (LOG.isTraceEnabled()) { LOG.trace("Using the following configuration object for mapreduce job."); for (Map.Entry<String, String> entry : configuration) { LOG.trace(" Config: {}={}", entry.getKey(), entry.getValue()); } } // Submit it for processing. Blocking mode is only for testing. job.submit(); if (waitForCompletition) { job.waitForCompletion(true); } return job; }); } }