List of usage examples for org.apache.hadoop.mapred JobConf getOutputCommitter
public OutputCommitter getOutputCommitter()
From source file:com.facebook.hiveio.output.HiveApiOutputFormat.java
License:Apache License
@Override public HiveApiOutputCommitter getOutputCommitter(TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException { HadoopUtils.setWorkOutputDir(taskAttemptContext); Configuration conf = taskAttemptContext.getConfiguration(); JobConf jobConf = new JobConf(conf); OutputCommitter baseCommitter = jobConf.getOutputCommitter(); LOG.info("Getting output committer with base output committer {}", baseCommitter.getClass().getSimpleName()); return new HiveApiOutputCommitter(new HackOutputCommitter(baseCommitter, jobConf), myProfileId); }
From source file:com.scaleoutsoftware.soss.hserver.hadoop.MapperWrapperMapred.java
License:Apache License
/** * Runs mapper for the single split.//from www. j ava2s . co m * * @param mapOutputAccumulator mapOutputAccumulator to use * @param split split ot run on */ @Override @SuppressWarnings("unchecked") public void runSplit(final MapOutputAccumulator<OUTKEY, OUTVALUE> mapOutputAccumulator, Object split, int splitIndex) throws IOException, ClassNotFoundException, InterruptedException { JobConf jobConf = new JobConf(this.jobConf); //Clone JobConf to prevent unexpected task interaction TaskAttemptID taskAttemptID = TaskAttemptID .downgrade(hadoopVersionSpecificCode.createTaskAttemptId(jobId, true, splitIndex)); ReducerWrapperMapred.updateJobConf(jobConf, taskAttemptID, splitIndex); updateJobWithSplit(jobConf, split); InputFormat inputFormat = jobConf.getInputFormat(); Reporter reporter = Reporter.NULL; //Create RecordReader org.apache.hadoop.mapred.RecordReader<INKEY, INVALUE> recordReader = inputFormat .getRecordReader((InputSplit) split, jobConf, reporter); //Make a mapper org.apache.hadoop.mapred.Mapper<INKEY, INVALUE, OUTKEY, OUTVALUE> mapper; try { mapper = (org.apache.hadoop.mapred.Mapper<INKEY, INVALUE, OUTKEY, OUTVALUE>) mapperConstructor .newInstance(); mapper.configure(jobConf); } catch (Exception e) { throw new RuntimeException("Cannot instantiate mapper " + mapperConstructor.getDeclaringClass(), e); } //These are to support map only jobs which write output directly to HDFS. final RecordWriter outputRecordWriter; OutputCommitter outputCommitter = null; TaskAttemptContext taskAttemptContext = null; if (mapOnlyJob) { taskAttemptContext = hadoopVersionSpecificCode.createTaskAttemptContextMapred(jobConf, taskAttemptID); OutputFormat outputFormat = jobConf.getOutputFormat(); FileSystem fs = FileSystem.get(jobConf); outputRecordWriter = (org.apache.hadoop.mapred.RecordWriter<OUTKEY, OUTVALUE>) outputFormat .getRecordWriter(fs, jobConf, ReducerWrapperMapred.getOutputName(splitIndex), Reporter.NULL); outputCommitter = jobConf.getOutputCommitter(); //Create task object so it can handle file format initialization //The MapTask is private in the Hadoop 1.x so we have to go through reflection. try { Class reduceTask = Class.forName("org.apache.hadoop.mapred.MapTask"); Constructor reduceTaskConstructor = reduceTask.getDeclaredConstructor(String.class, TaskAttemptID.class, int.class, JobSplit.TaskSplitIndex.class, int.class); reduceTaskConstructor.setAccessible(true); Task task = (Task) reduceTaskConstructor.newInstance(null, taskAttemptID, splitIndex, new JobSplit.TaskSplitIndex(), 0); task.setConf(jobConf); task.initialize(jobConf, jobId, Reporter.NULL, false); } catch (Exception e) { throw new IOException("Cannot initialize MapTask", e); } outputCommitter.setupTask(taskAttemptContext); } else { outputRecordWriter = null; } OutputCollector<OUTKEY, OUTVALUE> outputCollector; if (!mapOnlyJob) { outputCollector = new OutputCollector<OUTKEY, OUTVALUE>() { @Override public void collect(OUTKEY outkey, OUTVALUE outvalue) throws IOException { try { mapOutputAccumulator.combine(outkey, outvalue); } catch (InterruptedException e) { Thread.currentThread().interrupt(); } } }; } else { outputCollector = new OutputCollector<OUTKEY, OUTVALUE>() { @Override public void collect(OUTKEY outkey, OUTVALUE outvalue) throws IOException { outputRecordWriter.write(outkey, outvalue); } }; } INKEY key = recordReader.createKey(); INVALUE value = recordReader.createValue(); while (recordReader.next(key, value)) { mapper.map(key, value, outputCollector, reporter); } mapper.close(); recordReader.close(); if (mapOnlyJob) { outputRecordWriter.close(Reporter.NULL); outputCommitter.commitTask(taskAttemptContext); } }
From source file:com.scaleoutsoftware.soss.hserver.hadoop.ReducerWrapperMapred.java
License:Apache License
public ReducerWrapperMapred(HServerInvocationParameters invocationParameters, int hadoopPartition, int appId, int region, boolean sort) throws IOException, ClassNotFoundException, InterruptedException { this.invocationParameters = invocationParameters; JobConf jobConf = new JobConf((Configuration) invocationParameters.getConfiguration()); //Clone JobConf, so the temporary settings do not pollute other tasks LOG.info("Starting reducer:" + HadoopInvocationParameters.dumpConfiguration(jobConf)); JobID jobID = (JobID) invocationParameters.getJobId(); this.hadoopPartition = hadoopPartition; hadoopVersionSpecificCode = HadoopVersionSpecificCode.getInstance(invocationParameters.getHadoopVersion(), jobConf);//from www . j a v a 2 s . co m TaskAttemptID taskAttemptID = TaskAttemptID .downgrade(hadoopVersionSpecificCode.createTaskAttemptId(jobID, false, hadoopPartition)); updateJobConf(jobConf, taskAttemptID, region); context = hadoopVersionSpecificCode.createTaskAttemptContextMapred(jobConf, taskAttemptID); reducer = (org.apache.hadoop.mapred.Reducer<INKEY, INVALUE, OUTKEY, OUTVALUE>) ReflectionUtils .newInstance(jobConf.getReducerClass(), jobConf); reducer.configure(jobConf); OutputFormat outputFormat = jobConf.getOutputFormat(); FileSystem fs = FileSystem.get(jobConf); recordWriter = (org.apache.hadoop.mapred.RecordWriter<OUTKEY, OUTVALUE>) outputFormat.getRecordWriter(fs, jobConf, getOutputName(hadoopPartition), Reporter.NULL); committer = jobConf.getOutputCommitter(); //Create task object so it can handle file format initialization //The ReduceTask is private in the Hadoop 1.x so we have to go through reflection. try { Class reduceTask = Class.forName("org.apache.hadoop.mapred.ReduceTask"); Constructor reduceTaskConstructor = reduceTask.getDeclaredConstructor(String.class, TaskAttemptID.class, int.class, int.class, int.class); reduceTaskConstructor.setAccessible(true); Task task = (Task) reduceTaskConstructor.newInstance(null, taskAttemptID, hadoopPartition, 0, 0); task.setConf(jobConf); task.initialize(jobConf, jobID, Reporter.NULL, false); } catch (Exception e) { throw new IOException("Cannot initialize ReduceTask", e); } committer.setupTask(context); Class<INKEY> keyClass = (Class<INKEY>) jobConf.getMapOutputKeyClass(); WritableSerializerDeserializer<INKEY> firstKeySerializer = new WritableSerializerDeserializer<INKEY>( keyClass, null); WritableSerializerDeserializer<INKEY> secondKeySerializer = new WritableSerializerDeserializer<INKEY>( keyClass, null); Class<INVALUE> valueClass = (Class<INVALUE>) jobConf.getMapOutputValueClass(); WritableSerializerDeserializer<INVALUE> valueSerializer = new WritableSerializerDeserializer<INVALUE>( valueClass, null); DataGridReaderParameters<INKEY, INVALUE> params = new DataGridReaderParameters<INKEY, INVALUE>(region, appId, HServerParameters.getSetting(REDUCE_USEMEMORYMAPPEDFILES, jobConf) > 0, firstKeySerializer, valueSerializer, invocationParameters.getSerializationMode(), secondKeySerializer, keyClass, valueClass, sort, HServerParameters.getSetting(REDUCE_CHUNKSTOREADAHEAD, jobConf), 1024 * HServerParameters.getSetting(REDUCE_INPUTCHUNKSIZE_KB, jobConf), HServerParameters.getSetting(REDUCE_CHUNKREADTIMEOUT, jobConf)); transport = DataGridChunkedCollectionReader.getGridReader(params); outputCollector = new OutputCollector<OUTKEY, OUTVALUE>() { @Override public void collect(OUTKEY outkey, OUTVALUE outvalue) throws IOException { recordWriter.write(outkey, outvalue); } }; }
From source file:com.scaleoutsoftware.soss.hserver.JobScheduler.java
License:Apache License
/** * Runs the map-reduce job on ScaleOut hServer.* * * @param job the job to run/* w ww. j a v a 2 s. c om*/ * @param jobId the id of the job * @param sortEnabled if key sorting is enabled * @param jobParameter user defined parameter object for the job * @param grid the invocation grid to run the job * @throws IOException if errors occurred during the job * @throws InterruptedException if the processing thread is interrupted * @throws ClassNotFoundException if the invocation grid does not contain the dependency class */ @SuppressWarnings("unchecked") public void runOldApiJob(JobConf job, org.apache.hadoop.mapred.JobID jobId, boolean sortEnabled, Object jobParameter, InvocationGrid grid) throws IOException, InterruptedException, ClassNotFoundException { //Initialize user credential in advance int jobAppId = 0xFFFFFFF & BitConverter.hashStringOneInt(jobId.toString()); String hadoopVersion = VersionInfo.getVersion(); long time = System.currentTimeMillis(); CreateUserCredentials.run(grid); try { //Check output specs before running the job job.getOutputFormat().checkOutputSpecs(FileSystem.get(job), job); JobContext jContext = HadoopVersionSpecificCode.getInstance(hadoopVersion, job).createJobContext(job, jobId); org.apache.hadoop.mapred.OutputCommitter outputCommitter = job.getOutputCommitter(); outputCommitter.setupJob(jContext); //clear all temporary objects DataAccessor.clearObjects(jobAppId); //Calculating the partition layout com.scaleoutsoftware.soss.client.util.HostToPartitionsMapping hostNameToPartition = com.scaleoutsoftware.soss.client.util.HostToPartitionsMapping .getCurrent(); List<InetAddress> hostAddresses = new ArrayList<InetAddress>(hostNameToPartition.getHosts()); //Generating mapping of Hadoop partitions to SOSS partitions, so they are equally distributed across hosts int numHosts = hostAddresses.size(); int numberOfSlotsPerNode = Math .max(grid != null ? grid.getMaxNumberOfCores() : Runtime.getRuntime().availableProcessors(), 1); //Set the number of splits to the number of cores if (NamedMapInputFormatMapred.class.isAssignableFrom(job.getInputFormat().getClass())) { int numberOfSplits = HServerParameters.getSetting(MAP_SPLITS_PER_CORE, job) * numHosts * numberOfSlotsPerNode; job.setNumMapTasks(Math.min(numberOfSplits, HServerConstants.MAX_MAP_REDUCE_TASKS)); } //Generating split to hostname map org.apache.hadoop.mapred.InputFormat inputFormat = job.getInputFormat(); List<org.apache.hadoop.mapred.InputSplit> splitList = Arrays .asList(inputFormat.getSplits(job, job.getNumMapTasks())); Map<InetAddress, List<Integer>> splitToHostAddress = assignSplitsToHost(splitList, hostAddresses, null); //Choose the optimal number of reducers for GridOutputFormat if (job.getOutputFormat() instanceof NamedMapOutputFormatMapred) { job.setNumReduceTasks(numHosts * numberOfSlotsPerNode); sortEnabled = false; } int[] partitionMapping = hostNameToPartition.generateEvenItemDistribution(job.getNumReduceTasks()); //Generating invocation parameters Class<? extends org.apache.hadoop.mapred.InputSplit> splitType = splitList.size() > 0 ? splitList.get(0).getClass() : null; HadoopInvocationParameters hadoopParameters = new HadoopInvocationParameters(job, jobId, true); HServerInvocationParameters<org.apache.hadoop.mapred.InputSplit> parameters = new HServerInvocationParameters<org.apache.hadoop.mapred.InputSplit>( hadoopParameters, jobAppId, partitionMapping, hostNameToPartition, numberOfSlotsPerNode, splitType, splitList, splitToHostAddress, false, sortEnabled, hadoopVersion, jobParameter, SerializationMode.DEFAULT); StringBuilder stringBuilder = new StringBuilder(); stringBuilder.append("Splits created:\n"); for (InetAddress address : splitToHostAddress.keySet()) { stringBuilder.append("Host "); stringBuilder.append(address); stringBuilder.append(" has "); stringBuilder.append(splitToHostAddress.get(address).size()); stringBuilder.append(" splits.\n"); } System.out.println(stringBuilder.toString()); System.out.println("Job initialization completed in " + (System.currentTimeMillis() - time) + " ms."); time = System.currentTimeMillis(); InvokeResult<MapperResult> mapInvokeResult = MessagingHelper.invoke(grid, RunMapper.MapperInvokable.class, parameters, TimeSpan.INFINITE_TIMEOUT.getSeconds()); if (mapInvokeResult.getErrors() != null && mapInvokeResult.getErrors().size() > 0) { throw new IOException("Map invocation failed.", mapInvokeResult.getErrors().get(0)); } System.out.println("Map invocation done in " + (System.currentTimeMillis() - time) + " ms."); time = System.currentTimeMillis(); MapperResult resultObject = mapInvokeResult.getResult(); if (resultObject == null || mapInvokeResult.getNumFailed() != 0) { throw new IOException("Mapper invocation failed. Num failed = " + mapInvokeResult.getNumFailed()); } if (resultObject.getNumberOfSplitsProcessed() != splitList.size()) { throw new IOException("Number of splits does not match the number of invocations. Nsplits = " + splitList.size() + ", Ninvokes =" + resultObject.getNumberOfSplitsProcessed()); } if (partitionMapping.length > 0) { //Running the reduce step InvokeResult<Integer> reduceInvokeResult = MessagingHelper.invoke(grid, ReduceInvokable.class, jobAppId, TimeSpan.INFINITE_TIMEOUT.getSeconds()); System.out.println("Reduce invocation done in " + (System.currentTimeMillis() - time) + " ms."); DataAccessor.clearObjects(jobAppId); //clear all temporary objects if (reduceInvokeResult.getErrors() != null && reduceInvokeResult.getErrors().size() > 0) { throw new IOException("Reduce invocation failed.", reduceInvokeResult.getErrors().get(0)); } if (reduceInvokeResult.getNumFailed() != 0) { throw new IOException("Reduce invocation failed."); } if (reduceInvokeResult.getResult() != partitionMapping.length) { throw new IOException("Not all partitions were reduced. Expected = " + partitionMapping.length + " Actual = " + reduceInvokeResult.getResult()); } } outputCommitter.commitJob(jContext); } catch (StateServerException e) { throw new IOException("ScaleOut hServer access error.", e); } }
From source file:org.apache.flink.api.java.hadoop.mapred.HadoopOutputFormatTest.java
License:Apache License
@Test public void testOpen() throws Exception { OutputFormat<String, Long> dummyOutputFormat = mock(DummyOutputFormat.class); DummyOutputCommitter outputCommitter = mock(DummyOutputCommitter.class); JobConf jobConf = spy(new JobConf()); when(jobConf.getOutputCommitter()).thenReturn(outputCommitter); HadoopOutputFormat<String, Long> outputFormat = new HadoopOutputFormat<>(dummyOutputFormat, jobConf); outputFormat.open(1, 1);//from w ww.j a v a 2 s . c om verify(jobConf, times(2)).getOutputCommitter(); verify(outputCommitter, times(1)).setupJob(any(JobContext.class)); verify(dummyOutputFormat, times(1)).getRecordWriter(any(FileSystem.class), any(JobConf.class), anyString(), any(Progressable.class)); }
From source file:org.apache.flink.api.java.hadoop.mapred.HadoopOutputFormatTest.java
License:Apache License
@Test public void testFinalizeGlobal() throws Exception { OutputFormat<String, Long> dummyOutputFormat = mock(DummyOutputFormat.class); DummyOutputCommitter outputCommitter = mock(DummyOutputCommitter.class); JobConf jobConf = spy(new JobConf()); when(jobConf.getOutputCommitter()).thenReturn(outputCommitter); HadoopOutputFormat<String, Long> outputFormat = new HadoopOutputFormat<>(dummyOutputFormat, jobConf); outputFormat.finalizeGlobal(1);// ww w .ja va 2 s . com verify(outputCommitter, times(1)).commitJob(any(JobContext.class)); }
From source file:org.apache.flink.batch.connectors.hive.HiveTableOutputFormat.java
License:Apache License
private HivePartitionWriter writerForLocation(String location) throws IOException { JobConf clonedConf = new JobConf(jobConf); clonedConf.set(OUTDIR, location);//from w w w . j a v a2 s .c om OutputFormat outputFormat; try { StorageDescriptor sd = hiveTablePartition.getStorageDescriptor(); Class outputFormatClz = Class.forName(sd.getOutputFormat(), true, Thread.currentThread().getContextClassLoader()); outputFormatClz = HiveFileFormatUtils.getOutputFormatSubstitute(outputFormatClz); outputFormat = (OutputFormat) outputFormatClz.newInstance(); } catch (InstantiationException | IllegalAccessException | ClassNotFoundException e) { throw new FlinkRuntimeException("Unable to instantiate the hadoop output format", e); } ReflectionUtils.setConf(outputFormat, clonedConf); OutputCommitter outputCommitter = clonedConf.getOutputCommitter(); JobContext jobContext = new JobContextImpl(clonedConf, new JobID()); outputCommitter.setupJob(jobContext); final boolean isCompressed = clonedConf.getBoolean(HiveConf.ConfVars.COMPRESSRESULT.varname, false); if (isCompressed) { String codecStr = clonedConf.get(HiveConf.ConfVars.COMPRESSINTERMEDIATECODEC.varname); if (!StringUtils.isNullOrWhitespaceOnly(codecStr)) { try { Class<? extends CompressionCodec> codec = (Class<? extends CompressionCodec>) Class .forName(codecStr, true, Thread.currentThread().getContextClassLoader()); FileOutputFormat.setOutputCompressorClass(clonedConf, codec); } catch (ClassNotFoundException e) { throw new RuntimeException(e); } } String typeStr = clonedConf.get(HiveConf.ConfVars.COMPRESSINTERMEDIATETYPE.varname); if (!StringUtils.isNullOrWhitespaceOnly(typeStr)) { SequenceFile.CompressionType style = SequenceFile.CompressionType.valueOf(typeStr); SequenceFileOutputFormat.setOutputCompressionType(clonedConf, style); } } String taskPartition = String.valueOf(clonedConf.getInt("mapreduce.task.partition", -1)); Path taskPath = FileOutputFormat.getTaskOutputPath(clonedConf, taskPartition); FileSinkOperator.RecordWriter recordWriter; try { recordWriter = HiveFileFormatUtils.getRecordWriter(clonedConf, outputFormat, outputClass, isCompressed, tblProperties, taskPath, Reporter.NULL); } catch (HiveException e) { throw new IOException(e); } return new HivePartitionWriter(clonedConf, outputFormat, recordWriter, outputCommitter); }
From source file:org.apache.tez.mapreduce.output.MROutput.java
License:Apache License
public void initCommitter(JobConf job, boolean useNewApi) throws IOException, InterruptedException { if (useNewApi) { if (LOG.isDebugEnabled()) { LOG.debug("using new api for output committer"); }//from ww w . j av a 2s . c o m this.committer = newOutputFormat.getOutputCommitter(newApiTaskAttemptContext); } else { this.committer = job.getOutputCommitter(); } Path outputPath = FileOutputFormat.getOutputPath(job); if (outputPath != null) { if ((this.committer instanceof FileOutputCommitter)) { FileOutputFormat.setWorkOutputPath(job, ((FileOutputCommitter) this.committer).getTaskAttemptPath(oldApiTaskAttemptContext)); } else { FileOutputFormat.setWorkOutputPath(job, outputPath); } } if (useNewApi) { this.committer.setupTask(newApiTaskAttemptContext); } else { this.committer.setupTask(oldApiTaskAttemptContext); } }