List of usage examples for org.apache.hadoop.mapreduce OutputCommitter setupTask
public abstract void setupTask(TaskAttemptContext taskContext) throws IOException;
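Before the examples, here is a minimal, self-contained sketch of the task-commit protocol around setupTask(). It is not taken from any of the source files below; it uses TextOutputFormat (whose committer is a FileOutputCommitter), and the output directory, the class name SetupTaskSketch, and the attempt ID string are placeholders chosen for illustration.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;

public class SetupTaskSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Placeholder output directory; any writable path works for this sketch.
        conf.set("mapreduce.output.fileoutputformat.outputdir", "file:///tmp/setup-task-demo");
        // Placeholder attempt ID in the standard attempt_<jt>_<job>_<m|r>_<task>_<attempt> form.
        TaskAttemptID attemptId = TaskAttemptID.forName("attempt_200707121733_0001_m_000000_0");
        TaskAttemptContext context = new TaskAttemptContextImpl(conf, attemptId);

        OutputFormat<NullWritable, Text> format = new TextOutputFormat<NullWritable, Text>();
        OutputCommitter committer = format.getOutputCommitter(context);

        committer.setupJob(context);  // once per job, before any task runs
        committer.setupTask(context); // once per task attempt, before its RecordWriter is used
        boolean succeeded = false;
        try {
            RecordWriter<NullWritable, Text> writer = format.getRecordWriter(context);
            try {
                writer.write(NullWritable.get(), new Text("hello"));
            } finally {
                writer.close(context);
            }
            if (committer.needsTaskCommit(context)) {
                committer.commitTask(context); // promote this attempt's output
            }
            succeeded = true;
        } finally {
            if (!succeeded) {
                committer.abortTask(context);  // discard this attempt's output
            }
        }
        committer.commitJob(context); // finalize the job output
    }
}

The examples below follow this same shape: obtain a committer from the OutputFormat, call setupTask() before writing, then commitTask() on success or abortTask() on failure.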
From source file:com.asakusafw.runtime.mapreduce.simple.SimpleJobRunner.java
License:Apache License
@SuppressWarnings({ "rawtypes", "unchecked" }) private void runMap(Job job, KeyValueSorter<?, ?> sorter) throws IOException, InterruptedException, ClassNotFoundException { Configuration conf = job.getConfiguration(); InputFormat<?, ?> input = ReflectionUtils.newInstance(job.getInputFormatClass(), conf); List<InputSplit> splits = input.getSplits(job); int serial = 1; for (InputSplit split : splits) { TaskAttemptID id = newTaskAttemptId(newMapTaskId(job.getJobID(), serial++), 0); Mapper<?, ?, ?, ?> mapper = ReflectionUtils.newInstance(job.getMapperClass(), conf); if (LOG.isDebugEnabled()) { LOG.debug(MessageFormat.format("starting mapper: {0}@{1} ({2}bytes)", //$NON-NLS-1$ mapper.getClass().getName(), id, split.getLength())); }/*from www . j a v a 2 s .co m*/ TaskAttemptContext context = newTaskAttemptContext(conf, id); // we always obtain a new OutputFormat object / OutputFormat.getOutputCommiter() may be cached OutputFormat<?, ?> output = ReflectionUtils.newInstance(job.getOutputFormatClass(), conf); OutputCommitter committer = output.getOutputCommitter(context); committer.setupTask(context); boolean succeed = false; try (RecordReader<?, ?> reader = input.createRecordReader(split, newTaskAttemptContext(conf, id))) { RecordWriter<?, ?> writer; if (sorter != null) { writer = new ShuffleWriter(sorter); } else { writer = output.getRecordWriter(newTaskAttemptContext(conf, id)); } try { Mapper.Context c = newMapperContext(conf, id, reader, writer, committer, split); reader.initialize(split, c); mapper.run(c); } finally { writer.close(newTaskAttemptContext(conf, id)); } doCommitTask(context, committer); succeed = true; } finally { if (succeed == false) { doAbortTask(context, committer); } } } }
From source file:com.asakusafw.runtime.mapreduce.simple.SimpleJobRunner.java
License:Apache License
@SuppressWarnings({ "unchecked", "rawtypes" }) private void runReduce(Job job, KeyValueSorter<?, ?> sorter) throws ClassNotFoundException, IOException, InterruptedException { Configuration conf = job.getConfiguration(); OutputFormat<?, ?> output = ReflectionUtils.newInstance(job.getOutputFormatClass(), conf); TaskAttemptID id = newTaskAttemptId(newReduceTaskId(job.getJobID(), 1), 0); Reducer<?, ?, ?, ?> reducer = ReflectionUtils.newInstance(job.getReducerClass(), conf); if (LOG.isDebugEnabled()) { LOG.debug(MessageFormat.format("starting reducer: {0}@{1} ({2}records, {3}bytes)", //$NON-NLS-1$ reducer.getClass().getName(), id, sorter.getRecordCount(), sorter.getSizeInBytes())); }// ww w .ja v a 2s.c o m TaskAttemptContext context = newTaskAttemptContext(conf, id); OutputCommitter committer = output.getOutputCommitter(context); committer.setupTask(context); boolean succeed = false; try { ShuffleReader reader = new ShuffleReader(sorter, new Progress()); try { RecordWriter<?, ?> writer = output.getRecordWriter(newTaskAttemptContext(conf, id)); try { Reducer.Context c = newReducerContext(conf, id, reader, sorter.getKeyClass(), sorter.getValueClass(), writer, committer, (RawComparator) job.getGroupingComparator()); reducer.run(c); } finally { writer.close(newTaskAttemptContext(conf, id)); } } finally { try { reader.close(); } catch (IOException e) { LOG.warn(MessageFormat.format("error occurred while reducer mapper input: {0} ({1})", id, job.getJobName()), e); } } doCommitTask(context, committer); succeed = true; } finally { if (succeed == false) { doAbortTask(context, committer); } } }
From source file:com.scaleoutsoftware.soss.hserver.hadoop.MapperWrapperMapreduce.java
License:Apache License
/**
 * Runs mapper for the single split.
 *
 * @param mapOutputAccumulator mapOutputAccumulator to use
 * @param split split to run on
 */
@Override
@SuppressWarnings("unchecked")
public void runSplit(MapOutputAccumulator<OUTKEY, OUTVALUE> mapOutputAccumulator, Object split, int splitIndex)
        throws IOException, ClassNotFoundException, InterruptedException {
    TaskAttemptID taskAttemptId = hadoopVersionSpecificCode.createTaskAttemptId(jobId, true, splitIndex);
    //Setup task ID info
    TaskAttemptContext taskContext = hadoopVersionSpecificCode.createTaskAttemptContext(configuration, taskAttemptId);
    InputFormat inputFormat = ReflectionUtils.newInstance(jobContext.getInputFormatClass(), configuration);
    //Create RecordReader
    org.apache.hadoop.mapreduce.RecordReader<INKEY, INVALUE> input = inputFormat
            .createRecordReader((InputSplit) split, taskContext);
    //Make a mapper
    org.apache.hadoop.mapreduce.Mapper<INKEY, INVALUE, OUTKEY, OUTVALUE> mapper;
    try {
        mapper = (org.apache.hadoop.mapreduce.Mapper<INKEY, INVALUE, OUTKEY, OUTVALUE>) mapperConstructor
                .newInstance();
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
    org.apache.hadoop.mapreduce.RecordWriter output;
    OutputCommitter committer = null;
    if (mapOnlyJob) {
        OutputFormat outputFormat = ReflectionUtils.newInstance(jobContext.getOutputFormatClass(), configuration);
        output = (org.apache.hadoop.mapreduce.RecordWriter<OUTKEY, OUTVALUE>) outputFormat
                .getRecordWriter(taskContext);
        committer = outputFormat.getOutputCommitter(taskContext);
        committer.setupTask(taskContext);
    } else {
        output = new MapOutputCollector<OUTKEY, OUTVALUE>(mapOutputAccumulator);
    }
    input.initialize((InputSplit) split, taskContext);
    org.apache.hadoop.mapreduce.Mapper<INKEY, INVALUE, OUTKEY, OUTVALUE>.Context mapperContext = hadoopVersionSpecificCode
            .getMapperContext(configuration, taskAttemptId, input, output);
    mapper.run(mapperContext);
    input.close();
    output.close(mapperContext);
    if (mapOnlyJob && committer != null) {
        committer.commitTask(taskContext);
    }
}
From source file:org.apache.giraph.io.internal.WrappedEdgeOutputFormat.java
License:Apache License
@Override
public OutputCommitter getOutputCommitter(TaskAttemptContext context) throws IOException, InterruptedException {
    final OutputCommitter outputCommitter = originalOutputFormat
            .getOutputCommitter(HadoopUtils.makeTaskAttemptContext(getConf(), context));
    return new OutputCommitter() {
        @Override
        public void setupJob(JobContext context) throws IOException {
            outputCommitter.setupJob(HadoopUtils.makeJobContext(getConf(), context));
        }

        @Override
        public void setupTask(TaskAttemptContext context) throws IOException {
            outputCommitter.setupTask(HadoopUtils.makeTaskAttemptContext(getConf(), context));
        }

        @Override
        public boolean needsTaskCommit(TaskAttemptContext context) throws IOException {
            return outputCommitter.needsTaskCommit(HadoopUtils.makeTaskAttemptContext(getConf(), context));
        }

        @Override
        public void commitTask(TaskAttemptContext context) throws IOException {
            outputCommitter.commitTask(HadoopUtils.makeTaskAttemptContext(getConf(), context));
        }

        @Override
        public void abortTask(TaskAttemptContext context) throws IOException {
            outputCommitter.abortTask(HadoopUtils.makeTaskAttemptContext(getConf(), context));
        }

        @Override
        public void cleanupJob(JobContext context) throws IOException {
            outputCommitter.cleanupJob(HadoopUtils.makeJobContext(getConf(), context));
        }

        /*if_not[HADOOP_NON_COMMIT_JOB]*/
        @Override
        public void commitJob(JobContext context) throws IOException {
            outputCommitter.commitJob(HadoopUtils.makeJobContext(getConf(), context));
        }

        @Override
        public void abortJob(JobContext context, JobStatus.State state) throws IOException {
            outputCommitter.abortJob(HadoopUtils.makeJobContext(getConf(), context), state);
        }
        /*end[HADOOP_NON_COMMIT_JOB]*/
    };
}
From source file:org.apache.giraph.io.internal.WrappedVertexOutputFormat.java
License:Apache License
@Override
public OutputCommitter getOutputCommitter(TaskAttemptContext context) throws IOException, InterruptedException {
    final OutputCommitter outputCommitter = originalOutputFormat
            .getOutputCommitter(HadoopUtils.makeTaskAttemptContext(getConf(), context));
    return new OutputCommitter() {
        @Override
        public void setupJob(JobContext context) throws IOException {
            outputCommitter.setupJob(HadoopUtils.makeJobContext(getConf(), context));
        }

        @Override
        public void setupTask(TaskAttemptContext context) throws IOException {
            outputCommitter.setupTask(HadoopUtils.makeTaskAttemptContext(getConf(), context));
        }

        @Override
        public boolean needsTaskCommit(TaskAttemptContext context) throws IOException {
            return outputCommitter.needsTaskCommit(HadoopUtils.makeTaskAttemptContext(getConf(), context));
        }

        @Override
        public void commitTask(TaskAttemptContext context) throws IOException {
            outputCommitter.commitTask(HadoopUtils.makeTaskAttemptContext(getConf(), context));
        }

        @Override
        public void abortTask(TaskAttemptContext context) throws IOException {
            outputCommitter.abortTask(HadoopUtils.makeTaskAttemptContext(getConf(), context));
        }

        @Override
        public void cleanupJob(JobContext context) throws IOException {
            outputCommitter.cleanupJob(HadoopUtils.makeJobContext(getConf(), context));
        }

        /*if_not[HADOOP_NON_COMMIT_JOB]*/
        @Override
        public void commitJob(JobContext context) throws IOException {
            outputCommitter.commitJob(HadoopUtils.makeJobContext(getConf(), context));
        }

        @Override
        public void abortJob(JobContext context, JobStatus.State state) throws IOException {
            outputCommitter.abortJob(HadoopUtils.makeJobContext(getConf(), context), state);
        }
        /*end[HADOOP_NON_COMMIT_JOB]*/
    };
}
From source file:org.apache.hcatalog.data.transfer.impl.HCatOutputFormatWriter.java
License:Apache License
@Override
public void write(Iterator<HCatRecord> recordItr) throws HCatException {
    int id = sp.getId();
    setVarsInConf(id);
    HCatOutputFormat outFormat = new HCatOutputFormat();
    TaskAttemptContext cntxt = HCatHadoopShims.Instance.get().createTaskAttemptContext(conf,
            new TaskAttemptID(HCatHadoopShims.Instance.get().createTaskID(), id));
    OutputCommitter committer = null;
    RecordWriter<WritableComparable<?>, HCatRecord> writer;
    try {
        committer = outFormat.getOutputCommitter(cntxt);
        committer.setupTask(cntxt);
        writer = outFormat.getRecordWriter(cntxt);
        while (recordItr.hasNext()) {
            HCatRecord rec = recordItr.next();
            writer.write(null, rec);
        }
        writer.close(cntxt);
        if (committer.needsTaskCommit(cntxt)) {
            committer.commitTask(cntxt);
        }
    } catch (IOException e) {
        if (null != committer) {
            try {
                committer.abortTask(cntxt);
            } catch (IOException e1) {
                throw new HCatException(ErrorType.ERROR_INTERNAL_EXCEPTION, e1);
            }
        }
        throw new HCatException("Failed while writing", e);
    } catch (InterruptedException e) {
        if (null != committer) {
            try {
                committer.abortTask(cntxt);
            } catch (IOException e1) {
                throw new HCatException(ErrorType.ERROR_INTERNAL_EXCEPTION, e1);
            }
        }
        throw new HCatException("Failed while writing", e);
    }
}
From source file:org.apache.hcatalog.mapreduce.FileRecordWriterContainer.java
License:Apache License
@Override
public void write(WritableComparable<?> key, HCatRecord value) throws IOException, InterruptedException {
    org.apache.hadoop.mapred.RecordWriter localWriter;
    ObjectInspector localObjectInspector;
    SerDe localSerDe;
    OutputJobInfo localJobInfo = null;
    if (dynamicPartitioningUsed) {
        // calculate which writer to use from the remaining values - this needs to be done before we delete cols
        List<String> dynamicPartValues = new ArrayList<String>();
        for (Integer colToAppend : dynamicPartCols) {
            dynamicPartValues.add(value.get(colToAppend).toString());
        }
        String dynKey = dynamicPartValues.toString();
        if (!baseDynamicWriters.containsKey(dynKey)) {
            if ((maxDynamicPartitions != -1) && (baseDynamicWriters.size() > maxDynamicPartitions)) {
                throw new HCatException(ErrorType.ERROR_TOO_MANY_DYNAMIC_PTNS,
                        "Number of dynamic partitions being created "
                                + "exceeds configured max allowable partitions[" + maxDynamicPartitions
                                + "], increase parameter [" + HiveConf.ConfVars.DYNAMICPARTITIONMAXPARTS.varname
                                + "] if needed.");
            }
            org.apache.hadoop.mapred.TaskAttemptContext currTaskContext = HCatMapRedUtil
                    .createTaskAttemptContext(context);
            configureDynamicStorageHandler(currTaskContext, dynamicPartValues);
            localJobInfo = HCatBaseOutputFormat.getJobInfo(currTaskContext);
            //setup serDe
            SerDe currSerDe = ReflectionUtils.newInstance(storageHandler.getSerDeClass(),
                    currTaskContext.getJobConf());
            try {
                InternalUtil.initializeOutputSerDe(currSerDe, currTaskContext.getConfiguration(), localJobInfo);
            } catch (SerDeException e) {
                throw new IOException("Failed to initialize SerDe", e);
            }
            //create base OutputFormat
            org.apache.hadoop.mapred.OutputFormat baseOF = ReflectionUtils
                    .newInstance(storageHandler.getOutputFormatClass(), currTaskContext.getJobConf());
            //We are skipping calling checkOutputSpecs() for each partition
            //As it can throw a FileAlreadyExistsException when more than one mapper is writing to a partition
            //See HCATALOG-490, also to avoid contacting the namenode for each new FileOutputFormat instance
            //In general this should be ok for most FileOutputFormat implementations
            //but may become an issue for cases when the method is used to perform other setup tasks
            //get Output Committer
            org.apache.hadoop.mapred.OutputCommitter baseOutputCommitter = currTaskContext.getJobConf()
                    .getOutputCommitter();
            //create currJobContext the latest so it gets all the config changes
            org.apache.hadoop.mapred.JobContext currJobContext = HCatMapRedUtil
                    .createJobContext(currTaskContext);
            //setupJob()
            baseOutputCommitter.setupJob(currJobContext);
            //recreate to refresh jobConf of currTask context
            currTaskContext = HCatMapRedUtil.createTaskAttemptContext(currJobContext.getJobConf(),
                    currTaskContext.getTaskAttemptID(), currTaskContext.getProgressible());
            //set temp location
            currTaskContext.getConfiguration().set("mapred.work.output.dir",
                    new FileOutputCommitter(new Path(localJobInfo.getLocation()), currTaskContext).getWorkPath()
                            .toString());
            //setupTask()
            baseOutputCommitter.setupTask(currTaskContext);
            Path parentDir = new Path(currTaskContext.getConfiguration().get("mapred.work.output.dir"));
            Path childPath = new Path(parentDir, FileOutputFormat.getUniqueFile(currTaskContext, "part", ""));
            org.apache.hadoop.mapred.RecordWriter baseRecordWriter = baseOF.getRecordWriter(
                    parentDir.getFileSystem(currTaskContext.getConfiguration()), currTaskContext.getJobConf(),
                    childPath.toString(), InternalUtil.createReporter(currTaskContext));
            baseDynamicWriters.put(dynKey, baseRecordWriter);
            baseDynamicSerDe.put(dynKey, currSerDe);
            baseDynamicCommitters.put(dynKey, baseOutputCommitter);
            dynamicContexts.put(dynKey, currTaskContext);
            dynamicObjectInspectors.put(dynKey, InternalUtil.createStructObjectInspector(jobInfo.getOutputSchema()));
            dynamicOutputJobInfo.put(dynKey, HCatOutputFormat.getJobInfo(dynamicContexts.get(dynKey)));
        }
        localJobInfo = dynamicOutputJobInfo.get(dynKey);
        localWriter = baseDynamicWriters.get(dynKey);
        localSerDe = baseDynamicSerDe.get(dynKey);
        localObjectInspector = dynamicObjectInspectors.get(dynKey);
    } else {
        localJobInfo = jobInfo;
        localWriter = getBaseRecordWriter();
        localSerDe = serDe;
        localObjectInspector = objectInspector;
    }
    for (Integer colToDel : partColsToDel) {
        value.remove(colToDel);
    }
    //The key given by user is ignored
    try {
        localWriter.write(NullWritable.get(), localSerDe.serialize(value.getAll(), localObjectInspector));
    } catch (SerDeException e) {
        throw new IOException("Failed to serialize object", e);
    }
}
From source file:org.apache.hcatalog.pig.TestE2EScenarios.java
License:Apache License
private void copyTable(String in, String out) throws IOException, InterruptedException {
    Job ijob = new Job();
    Job ojob = new Job();
    HCatInputFormat inpy = new HCatInputFormat();
    inpy.setInput(ijob, null, in);
    HCatOutputFormat oupy = new HCatOutputFormat();
    oupy.setOutput(ojob, OutputJobInfo.create(null, out, new HashMap<String, String>()));

    // Test HCatContext
    System.err.println("HCatContext INSTANCE is present : " + HCatContext.INSTANCE.getConf().isPresent());
    if (HCatContext.INSTANCE.getConf().isPresent()) {
        System.err.println("HCatContext tinyint->int promotion says " + HCatContext.INSTANCE.getConf().get()
                .getBoolean(HCatConstants.HCAT_DATA_TINY_SMALL_INT_PROMOTION,
                        HCatConstants.HCAT_DATA_TINY_SMALL_INT_PROMOTION_DEFAULT));
    }

    HCatSchema tableSchema = inpy.getTableSchema(ijob.getConfiguration());
    System.err.println("Copying from [" + in + "] to [" + out + "] with schema : " + tableSchema.toString());
    oupy.setSchema(ojob, tableSchema);
    oupy.checkOutputSpecs(ojob);
    OutputCommitter oc = oupy.getOutputCommitter(createTaskAttemptContext(ojob.getConfiguration()));
    oc.setupJob(ojob);

    for (InputSplit split : inpy.getSplits(ijob)) {
        TaskAttemptContext rtaskContext = createTaskAttemptContext(ijob.getConfiguration());
        TaskAttemptContext wtaskContext = createTaskAttemptContext(ojob.getConfiguration());

        RecordReader<WritableComparable, HCatRecord> rr = inpy.createRecordReader(split, rtaskContext);
        rr.initialize(split, rtaskContext);

        OutputCommitter taskOc = oupy.getOutputCommitter(wtaskContext);
        taskOc.setupTask(wtaskContext);
        RecordWriter<WritableComparable<?>, HCatRecord> rw = oupy.getRecordWriter(wtaskContext);

        while (rr.nextKeyValue()) {
            rw.write(rr.getCurrentKey(), rr.getCurrentValue());
        }
        rw.close(wtaskContext);
        taskOc.commitTask(wtaskContext);
        rr.close();
    }
    oc.commitJob(ojob);
}
From source file:org.apache.hive.hcatalog.data.transfer.impl.HCatOutputFormatWriter.java
License:Apache License
@Override
public void write(Iterator<HCatRecord> recordItr) throws HCatException {
    int id = sp.getId();
    setVarsInConf(id);
    HCatOutputFormat outFormat = new HCatOutputFormat();
    TaskAttemptContext cntxt = ShimLoader.getHadoopShims().getHCatShim().createTaskAttemptContext(conf,
            new TaskAttemptID(ShimLoader.getHadoopShims().getHCatShim().createTaskID(), id));
    OutputCommitter committer = null;
    RecordWriter<WritableComparable<?>, HCatRecord> writer;
    try {
        committer = outFormat.getOutputCommitter(cntxt);
        committer.setupTask(cntxt);
        writer = outFormat.getRecordWriter(cntxt);
        while (recordItr.hasNext()) {
            HCatRecord rec = recordItr.next();
            writer.write(null, rec);
        }
        writer.close(cntxt);
        if (committer.needsTaskCommit(cntxt)) {
            committer.commitTask(cntxt);
        }
    } catch (IOException e) {
        if (null != committer) {
            try {
                committer.abortTask(cntxt);
            } catch (IOException e1) {
                throw new HCatException(ErrorType.ERROR_INTERNAL_EXCEPTION, e1);
            }
        }
        throw new HCatException("Failed while writing", e);
    } catch (InterruptedException e) {
        if (null != committer) {
            try {
                committer.abortTask(cntxt);
            } catch (IOException e1) {
                throw new HCatException(ErrorType.ERROR_INTERNAL_EXCEPTION, e1);
            }
        }
        throw new HCatException("Failed while writing", e);
    }
}
From source file:org.apache.hive.hcatalog.mapreduce.DynamicPartitionFileRecordWriterContainer.java
License:Apache License
@Override
protected LocalFileWriter getLocalFileWriter(HCatRecord value) throws IOException, HCatException {
    OutputJobInfo localJobInfo = null;
    // Calculate which writer to use from the remaining values - this needs to
    // be done before we delete cols.
    List<String> dynamicPartValues = new ArrayList<String>();
    for (Integer colToAppend : dynamicPartCols) {
        Object partitionValue = value.get(colToAppend);
        dynamicPartValues
                .add(partitionValue == null ? HIVE_DEFAULT_PARTITION_VALUE : partitionValue.toString());
    }
    String dynKey = dynamicPartValues.toString();
    if (!baseDynamicWriters.containsKey(dynKey)) {
        if ((maxDynamicPartitions != -1) && (baseDynamicWriters.size() > maxDynamicPartitions)) {
            throw new HCatException(ErrorType.ERROR_TOO_MANY_DYNAMIC_PTNS,
                    "Number of dynamic partitions being created "
                            + "exceeds configured max allowable partitions[" + maxDynamicPartitions
                            + "], increase parameter [" + HiveConf.ConfVars.DYNAMICPARTITIONMAXPARTS.varname
                            + "] if needed.");
        }
        org.apache.hadoop.mapred.TaskAttemptContext currTaskContext = HCatMapRedUtil
                .createTaskAttemptContext(context);
        configureDynamicStorageHandler(currTaskContext, dynamicPartValues);
        localJobInfo = HCatBaseOutputFormat.getJobInfo(currTaskContext.getConfiguration());
        // Setup serDe.
        SerDe currSerDe = ReflectionUtils.newInstance(storageHandler.getSerDeClass(),
                currTaskContext.getJobConf());
        try {
            InternalUtil.initializeOutputSerDe(currSerDe, currTaskContext.getConfiguration(), localJobInfo);
        } catch (SerDeException e) {
            throw new IOException("Failed to initialize SerDe", e);
        }
        // create base OutputFormat
        org.apache.hadoop.mapred.OutputFormat baseOF = ReflectionUtils
                .newInstance(storageHandler.getOutputFormatClass(), currTaskContext.getJobConf());
        // We are skipping calling checkOutputSpecs() for each partition
        // As it can throw a FileAlreadyExistsException when more than one
        // mapper is writing to a partition.
        // See HCATALOG-490, also to avoid contacting the namenode for each new
        // FileOutputFormat instance.
        // In general this should be ok for most FileOutputFormat implementations
        // but may become an issue for cases when the method is used to perform
        // other setup tasks.
        // Get Output Committer
        org.apache.hadoop.mapred.OutputCommitter baseOutputCommitter = currTaskContext.getJobConf()
                .getOutputCommitter();
        // Create currJobContext the latest so it gets all the config changes
        org.apache.hadoop.mapred.JobContext currJobContext = HCatMapRedUtil.createJobContext(currTaskContext);
        // Set up job.
        baseOutputCommitter.setupJob(currJobContext);
        // Recreate to refresh jobConf of currTask context.
        currTaskContext = HCatMapRedUtil.createTaskAttemptContext(currJobContext.getJobConf(),
                currTaskContext.getTaskAttemptID(), currTaskContext.getProgressible());
        // Set temp location.
        currTaskContext.getConfiguration().set("mapred.work.output.dir",
                new FileOutputCommitter(new Path(localJobInfo.getLocation()), currTaskContext).getWorkPath()
                        .toString());
        // Set up task.
        baseOutputCommitter.setupTask(currTaskContext);
        Path parentDir = new Path(currTaskContext.getConfiguration().get("mapred.work.output.dir"));
        Path childPath = new Path(parentDir, FileOutputFormat.getUniqueFile(currTaskContext,
                currTaskContext.getConfiguration().get("mapreduce.output.basename", "part"), ""));
        RecordWriter baseRecordWriter = baseOF.getRecordWriter(
                parentDir.getFileSystem(currTaskContext.getConfiguration()), currTaskContext.getJobConf(),
                childPath.toString(), InternalUtil.createReporter(currTaskContext));
        baseDynamicWriters.put(dynKey, baseRecordWriter);
        baseDynamicSerDe.put(dynKey, currSerDe);
        baseDynamicCommitters.put(dynKey, baseOutputCommitter);
        dynamicContexts.put(dynKey, currTaskContext);
        dynamicObjectInspectors.put(dynKey, InternalUtil.createStructObjectInspector(jobInfo.getOutputSchema()));
        dynamicOutputJobInfo.put(dynKey,
                HCatOutputFormat.getJobInfo(dynamicContexts.get(dynKey).getConfiguration()));
    }
    return new LocalFileWriter(baseDynamicWriters.get(dynKey), dynamicObjectInspectors.get(dynKey),
            baseDynamicSerDe.get(dynKey), dynamicOutputJobInfo.get(dynKey));
}