List of usage examples for org.apache.hadoop.mapreduce OutputCommitter commitJob
public void commitJob(JobContext jobContext) throws IOException
From source file:com.scaleoutsoftware.soss.hserver.JobScheduler.java
License:Apache License
/** * Runs the map-reduce job on ScaleOut hServer. * * @param jobID the id of the job * @param jobConf the job to run//from ww w . j av a 2 s .c o m * @param isNewApi if the job uses the new MapReduce APIs * @param splitType the type of the split * @param inputSplits the list of input splits * @param splitLocations the locations of the splits * @param grid the invocation grid to run the job * @throws IOException if errors occurred during the job * @throws InterruptedException if the processing thread is interrupted * @throws ClassNotFoundException if the invocation grid does not contain the dependency class */ @SuppressWarnings("unchecked") public void runPredefinedJob(JobID jobID, JobConf jobConf, boolean isNewApi, Class splitType, List<?> inputSplits, Map<Object, String[]> splitLocations, InvocationGrid grid) throws IOException, InterruptedException, ClassNotFoundException { //Initialize user credential in advance long time = System.currentTimeMillis(); CreateUserCredentials.run(grid); String hadoopVersion = VersionInfo.getVersion(); int appID = 0xFFFFFFF & BitConverter.hashStringOneInt(jobID.toString()); try { org.apache.hadoop.mapreduce.OutputCommitter outputCommitter = createOutputCommitter(isNewApi, jobID, jobConf); HadoopVersionSpecificCode hadoopVersionSpecificCode = HadoopVersionSpecificCode .getInstance(hadoopVersion, jobConf); org.apache.hadoop.mapred.JobContext jobContext = hadoopVersionSpecificCode.createJobContext(jobConf, jobID); outputCommitter.setupJob(jobContext); //clear all temporary objects DataAccessor.clearObjects(appID); //Calculating the partition layout com.scaleoutsoftware.soss.client.util.HostToPartitionsMapping hostNameToPartition = com.scaleoutsoftware.soss.client.util.HostToPartitionsMapping .getCurrent(); List<InetAddress> hostAddresses = new ArrayList<InetAddress>(hostNameToPartition.getHosts()); //Generating mapping of Hadoop partitions to SOSS partitions, so they are equally distributed across hosts int numHosts = 
hostAddresses.size(); int numberOfSlotsPerNode = Math .max(grid != null ? grid.getMaxNumberOfCores() : Runtime.getRuntime().availableProcessors(), 1); //Generating split to hostname map Map<InetAddress, List<Integer>> splitToHostAddress = assignSplitsToHost(inputSplits, hostAddresses, splitLocations); int[] partitionMapping = hostNameToPartition.generateEvenItemDistribution(jobConf.getNumReduceTasks()); HadoopInvocationParameters hadoopParameters = new HadoopInvocationParameters(jobConf, jobID, !isNewApi); HServerInvocationParameters parameters = new HServerInvocationParameters(hadoopParameters, appID, partitionMapping, hostNameToPartition, numberOfSlotsPerNode, splitType, inputSplits, splitToHostAddress, false, HServerParameters.getBooleanSetting(HServerParameters.SORT_KEYS, jobConf), hadoopVersion, null, SerializationMode.DEFAULT); StringBuilder stringBuilder = new StringBuilder(); stringBuilder.append("Splits created:\n"); for (InetAddress address : splitToHostAddress.keySet()) { stringBuilder.append("Host "); stringBuilder.append(address); stringBuilder.append(" has "); stringBuilder.append(splitToHostAddress.get(address).size()); stringBuilder.append(" splits.\n"); } System.out.println(stringBuilder.toString()); System.out.println("Job initialization completed in " + (System.currentTimeMillis() - time) + " ms."); time = System.currentTimeMillis(); InvokeResult<MapperResult> mapInvokeResult = MessagingHelper.invoke(grid, RunMapper.MapperInvokable.class, parameters, TimeSpan.INFINITE_TIMEOUT.getSeconds()); if (mapInvokeResult.getErrors() != null && mapInvokeResult.getErrors().size() > 0) { throw new IOException("Map invocation failed.", mapInvokeResult.getErrors().get(0)); } System.out.println("Map invocation done in " + (System.currentTimeMillis() - time) + " ms."); time = System.currentTimeMillis(); MapperResult resultObject = mapInvokeResult.getResult(); if (resultObject == null || mapInvokeResult.getNumFailed() != 0) { throw new IOException("Mapper 
invocation failed. Num failed = " + mapInvokeResult.getNumFailed()); } if (resultObject.getNumberOfSplitsProcessed() != inputSplits.size()) { throw new IOException("Number of splits does not match the number of invocations. Nsplits = " + inputSplits.size() + ", Ninvokes =" + resultObject.getNumberOfSplitsProcessed()); } if (partitionMapping.length > 0) { //Running the reduce step InvokeResult<Integer> reduceInvokeResult = MessagingHelper.invoke(grid, ReduceInvokable.class, appID, TimeSpan.INFINITE_TIMEOUT.getSeconds()); System.out.println("Reduce invocation done in " + (System.currentTimeMillis() - time) + " ms."); DataAccessor.clearObjects(appID); //clear all temporary objects if (reduceInvokeResult.getErrors() != null && reduceInvokeResult.getErrors().size() > 0) { throw new IOException("Reduce invocation failed.", reduceInvokeResult.getErrors().get(0)); } if (reduceInvokeResult.getNumFailed() != 0) { throw new IOException("Reduce invocation failed."); } if (reduceInvokeResult.getResult() != partitionMapping.length) { throw new IOException("Not all partitions were reduced. Expected = " + partitionMapping.length + " Actual = " + reduceInvokeResult.getResult()); } } outputCommitter.commitJob(jobContext); } catch (StateServerException e) { throw new IOException("ScaleOut hServer access error.", e); } }
From source file:com.scaleoutsoftware.soss.hserver.JobScheduler.java
License:Apache License
/** * Runs the map-reduce job on ScaleOut hServer.* * * @param job the job to run//from w w w .j a v a 2s . c o m * @param jobId the id of the job * @param sortEnabled if key sorting is enabled * @param jobParameter user defined parameter object for the job * @param grid the invocation grid to run the job * @throws IOException if errors occurred during the job * @throws InterruptedException if the processing thread is interrupted * @throws ClassNotFoundException if the invocation grid does not contain the dependency class */ @SuppressWarnings("unchecked") public void runOldApiJob(JobConf job, org.apache.hadoop.mapred.JobID jobId, boolean sortEnabled, Object jobParameter, InvocationGrid grid) throws IOException, InterruptedException, ClassNotFoundException { //Initialize user credential in advance int jobAppId = 0xFFFFFFF & BitConverter.hashStringOneInt(jobId.toString()); String hadoopVersion = VersionInfo.getVersion(); long time = System.currentTimeMillis(); CreateUserCredentials.run(grid); try { //Check output specs before running the job job.getOutputFormat().checkOutputSpecs(FileSystem.get(job), job); JobContext jContext = HadoopVersionSpecificCode.getInstance(hadoopVersion, job).createJobContext(job, jobId); org.apache.hadoop.mapred.OutputCommitter outputCommitter = job.getOutputCommitter(); outputCommitter.setupJob(jContext); //clear all temporary objects DataAccessor.clearObjects(jobAppId); //Calculating the partition layout com.scaleoutsoftware.soss.client.util.HostToPartitionsMapping hostNameToPartition = com.scaleoutsoftware.soss.client.util.HostToPartitionsMapping .getCurrent(); List<InetAddress> hostAddresses = new ArrayList<InetAddress>(hostNameToPartition.getHosts()); //Generating mapping of Hadoop partitions to SOSS partitions, so they are equally distributed across hosts int numHosts = hostAddresses.size(); int numberOfSlotsPerNode = Math .max(grid != null ? 
grid.getMaxNumberOfCores() : Runtime.getRuntime().availableProcessors(), 1); //Set the number of splits to the number of cores if (NamedMapInputFormatMapred.class.isAssignableFrom(job.getInputFormat().getClass())) { int numberOfSplits = HServerParameters.getSetting(MAP_SPLITS_PER_CORE, job) * numHosts * numberOfSlotsPerNode; job.setNumMapTasks(Math.min(numberOfSplits, HServerConstants.MAX_MAP_REDUCE_TASKS)); } //Generating split to hostname map org.apache.hadoop.mapred.InputFormat inputFormat = job.getInputFormat(); List<org.apache.hadoop.mapred.InputSplit> splitList = Arrays .asList(inputFormat.getSplits(job, job.getNumMapTasks())); Map<InetAddress, List<Integer>> splitToHostAddress = assignSplitsToHost(splitList, hostAddresses, null); //Choose the optimal number of reducers for GridOutputFormat if (job.getOutputFormat() instanceof NamedMapOutputFormatMapred) { job.setNumReduceTasks(numHosts * numberOfSlotsPerNode); sortEnabled = false; } int[] partitionMapping = hostNameToPartition.generateEvenItemDistribution(job.getNumReduceTasks()); //Generating invocation parameters Class<? extends org.apache.hadoop.mapred.InputSplit> splitType = splitList.size() > 0 ? 
splitList.get(0).getClass() : null; HadoopInvocationParameters hadoopParameters = new HadoopInvocationParameters(job, jobId, true); HServerInvocationParameters<org.apache.hadoop.mapred.InputSplit> parameters = new HServerInvocationParameters<org.apache.hadoop.mapred.InputSplit>( hadoopParameters, jobAppId, partitionMapping, hostNameToPartition, numberOfSlotsPerNode, splitType, splitList, splitToHostAddress, false, sortEnabled, hadoopVersion, jobParameter, SerializationMode.DEFAULT); StringBuilder stringBuilder = new StringBuilder(); stringBuilder.append("Splits created:\n"); for (InetAddress address : splitToHostAddress.keySet()) { stringBuilder.append("Host "); stringBuilder.append(address); stringBuilder.append(" has "); stringBuilder.append(splitToHostAddress.get(address).size()); stringBuilder.append(" splits.\n"); } System.out.println(stringBuilder.toString()); System.out.println("Job initialization completed in " + (System.currentTimeMillis() - time) + " ms."); time = System.currentTimeMillis(); InvokeResult<MapperResult> mapInvokeResult = MessagingHelper.invoke(grid, RunMapper.MapperInvokable.class, parameters, TimeSpan.INFINITE_TIMEOUT.getSeconds()); if (mapInvokeResult.getErrors() != null && mapInvokeResult.getErrors().size() > 0) { throw new IOException("Map invocation failed.", mapInvokeResult.getErrors().get(0)); } System.out.println("Map invocation done in " + (System.currentTimeMillis() - time) + " ms."); time = System.currentTimeMillis(); MapperResult resultObject = mapInvokeResult.getResult(); if (resultObject == null || mapInvokeResult.getNumFailed() != 0) { throw new IOException("Mapper invocation failed. Num failed = " + mapInvokeResult.getNumFailed()); } if (resultObject.getNumberOfSplitsProcessed() != splitList.size()) { throw new IOException("Number of splits does not match the number of invocations. 
Nsplits = " + splitList.size() + ", Ninvokes =" + resultObject.getNumberOfSplitsProcessed()); } if (partitionMapping.length > 0) { //Running the reduce step InvokeResult<Integer> reduceInvokeResult = MessagingHelper.invoke(grid, ReduceInvokable.class, jobAppId, TimeSpan.INFINITE_TIMEOUT.getSeconds()); System.out.println("Reduce invocation done in " + (System.currentTimeMillis() - time) + " ms."); DataAccessor.clearObjects(jobAppId); //clear all temporary objects if (reduceInvokeResult.getErrors() != null && reduceInvokeResult.getErrors().size() > 0) { throw new IOException("Reduce invocation failed.", reduceInvokeResult.getErrors().get(0)); } if (reduceInvokeResult.getNumFailed() != 0) { throw new IOException("Reduce invocation failed."); } if (reduceInvokeResult.getResult() != partitionMapping.length) { throw new IOException("Not all partitions were reduced. Expected = " + partitionMapping.length + " Actual = " + reduceInvokeResult.getResult()); } } outputCommitter.commitJob(jContext); } catch (StateServerException e) { throw new IOException("ScaleOut hServer access error.", e); } }
From source file:cz.seznam.euphoria.hadoop.output.HadoopSink.java
License:Apache License
@Override @SneakyThrows//from ww w . jav a 2s . c o m public void commit() throws IOException { OutputCommitter committer = getHadoopFormatInstance() .getOutputCommitter(HadoopUtils.createTaskContext(conf.getWritable(), 0)); committer.commitJob(HadoopUtils.createJobContext(conf.getWritable())); }
From source file:cz.seznam.euphoria.hadoop.output.TestDataSinkOutputFormat.java
License:Apache License
@Test @SuppressWarnings("unchecked") /**/*from w ww . ja va 2 s .co m*/ * Test that {@code ListDataSink} can be used in place of hadoop {@code OutputFormat}. **/ public void testDataSink() throws Exception { DummySink sink = new DummySink(); Configuration conf = new Configuration(); DataSinkOutputFormat.configure(conf, sink); // mock the instances we will need TaskAttemptContext first = mockContext(conf, 0); TaskAttemptContext second = mockContext(conf, 1); // instantiate the output format DataSinkOutputFormat<Long> format = DataSinkOutputFormat.class.newInstance(); // validate format.checkOutputSpecs(first); // create record writer for the first partition RecordWriter<NullWritable, Long> writer = format.getRecordWriter(first); writer.write(NullWritable.get(), 2L); writer.close(first); format.getOutputCommitter(first).commitTask(first); // now the second partition, we need to create new instance of output format format = DataSinkOutputFormat.class.newInstance(); // validate format.checkOutputSpecs(second); // create record writer for the second partition writer = format.getRecordWriter(second); writer.write(NullWritable.get(), 4L); writer.close(second); OutputCommitter committer = format.getOutputCommitter(second); committer.commitTask(second); // and now validate what was written assertFalse(DummySink.isCommitted); committer.commitJob(second); assertTrue(DummySink.isCommitted); assertTrue(DummySink.outputs.isEmpty()); assertEquals(2, DummySink.committed.size()); assertEquals(Arrays.asList(2L), DummySink.committed.get(0)); assertEquals(Arrays.asList(4L), DummySink.committed.get(1)); }
From source file:org.apache.giraph.hive.Helpers.java
License:Apache License
/**
 * Commits the job output using the output committer of the wrapped vertex
 * output format created from the given configuration.
 *
 * @param conf the Giraph configuration describing the job
 * @throws IOException          if obtaining the committer or committing fails
 * @throws InterruptedException if obtaining the committer is interrupted
 */
public static void commitJob(GiraphConfiguration conf) throws IOException, InterruptedException {
    ImmutableClassesGiraphConfiguration immutableConf = new ImmutableClassesGiraphConfiguration(conf);
    WrappedVertexOutputFormat vertexOutputFormat = immutableConf.createWrappedVertexOutputFormat();

    // Build the task attempt context the output format needs to hand out its committer.
    JobConf hadoopJobConf = new JobConf(conf);
    TaskAttemptContext attemptContext = new HackTaskAttemptContext(hadoopJobConf, new TaskAttemptID());
    OutputCommitter committer = vertexOutputFormat.getOutputCommitter(attemptContext);

    // The job context shares the job id of the task attempt context created above.
    JobContext hadoopJobContext = new HackJobContext(hadoopJobConf, attemptContext.getJobID());
    committer.commitJob(hadoopJobContext);
}
From source file:org.apache.giraph.io.internal.WrappedEdgeOutputFormat.java
License:Apache License
/**
 * Returns an output committer that forwards every call to the committer of
 * the original output format.
 *
 * <p>Each forwarded call first rebuilds its context argument from
 * {@code getConf()} through {@code HadoopUtils}, so the wrapped committer
 * receives a context built from this format's configuration.
 *
 * @param context the task attempt context used to obtain the original committer
 * @return the delegating output committer
 * @throws IOException          if the original committer cannot be obtained
 * @throws InterruptedException if obtaining the original committer is interrupted
 */
@Override
public OutputCommitter getOutputCommitter(TaskAttemptContext context)
        throws IOException, InterruptedException {
    final OutputCommitter delegate = originalOutputFormat.getOutputCommitter(
            HadoopUtils.makeTaskAttemptContext(getConf(), context));

    return new OutputCommitter() {
        // ---- task-level callbacks ----

        @Override
        public void setupTask(TaskAttemptContext taskContext) throws IOException {
            delegate.setupTask(HadoopUtils.makeTaskAttemptContext(getConf(), taskContext));
        }

        @Override
        public boolean needsTaskCommit(TaskAttemptContext taskContext) throws IOException {
            return delegate.needsTaskCommit(HadoopUtils.makeTaskAttemptContext(getConf(), taskContext));
        }

        @Override
        public void commitTask(TaskAttemptContext taskContext) throws IOException {
            delegate.commitTask(HadoopUtils.makeTaskAttemptContext(getConf(), taskContext));
        }

        @Override
        public void abortTask(TaskAttemptContext taskContext) throws IOException {
            delegate.abortTask(HadoopUtils.makeTaskAttemptContext(getConf(), taskContext));
        }

        // ---- job-level callbacks ----

        @Override
        public void setupJob(JobContext jobContext) throws IOException {
            delegate.setupJob(HadoopUtils.makeJobContext(getConf(), jobContext));
        }

        @Override
        public void cleanupJob(JobContext jobContext) throws IOException {
            delegate.cleanupJob(HadoopUtils.makeJobContext(getConf(), jobContext));
        }

        // NOTE(review): the two marker comments below look like build-time preprocessor
        // directives; they are kept verbatim and must keep enclosing commitJob/abortJob.
        /*if_not[HADOOP_NON_COMMIT_JOB]*/
        @Override
        public void commitJob(JobContext jobContext) throws IOException {
            delegate.commitJob(HadoopUtils.makeJobContext(getConf(), jobContext));
        }

        @Override
        public void abortJob(JobContext jobContext, JobStatus.State state) throws IOException {
            delegate.abortJob(HadoopUtils.makeJobContext(getConf(), jobContext), state);
        }
        /*end[HADOOP_NON_COMMIT_JOB]*/
    };
}
From source file:org.apache.giraph.io.internal.WrappedVertexOutputFormat.java
License:Apache License
/**
 * Returns an output committer that forwards every call to the committer of
 * the original output format.
 *
 * <p>Each forwarded call first rebuilds its context argument from
 * {@code getConf()} through {@code HadoopUtils}, so the wrapped committer
 * receives a context built from this format's configuration.
 *
 * @param context the task attempt context used to obtain the original committer
 * @return the delegating output committer
 * @throws IOException          if the original committer cannot be obtained
 * @throws InterruptedException if obtaining the original committer is interrupted
 */
@Override
public OutputCommitter getOutputCommitter(TaskAttemptContext context)
        throws IOException, InterruptedException {
    final OutputCommitter wrapped = originalOutputFormat.getOutputCommitter(
            HadoopUtils.makeTaskAttemptContext(getConf(), context));

    return new OutputCommitter() {
        // ---- task-level callbacks ----

        @Override
        public void setupTask(TaskAttemptContext attempt) throws IOException {
            wrapped.setupTask(HadoopUtils.makeTaskAttemptContext(getConf(), attempt));
        }

        @Override
        public boolean needsTaskCommit(TaskAttemptContext attempt) throws IOException {
            return wrapped.needsTaskCommit(HadoopUtils.makeTaskAttemptContext(getConf(), attempt));
        }

        @Override
        public void commitTask(TaskAttemptContext attempt) throws IOException {
            wrapped.commitTask(HadoopUtils.makeTaskAttemptContext(getConf(), attempt));
        }

        @Override
        public void abortTask(TaskAttemptContext attempt) throws IOException {
            wrapped.abortTask(HadoopUtils.makeTaskAttemptContext(getConf(), attempt));
        }

        // ---- job-level callbacks ----

        @Override
        public void setupJob(JobContext job) throws IOException {
            wrapped.setupJob(HadoopUtils.makeJobContext(getConf(), job));
        }

        @Override
        public void cleanupJob(JobContext job) throws IOException {
            wrapped.cleanupJob(HadoopUtils.makeJobContext(getConf(), job));
        }

        // NOTE(review): the two marker comments below look like build-time preprocessor
        // directives; they are kept verbatim and must keep enclosing commitJob/abortJob.
        /*if_not[HADOOP_NON_COMMIT_JOB]*/
        @Override
        public void commitJob(JobContext job) throws IOException {
            wrapped.commitJob(HadoopUtils.makeJobContext(getConf(), job));
        }

        @Override
        public void abortJob(JobContext job, JobStatus.State state) throws IOException {
            wrapped.abortJob(HadoopUtils.makeJobContext(getConf(), job), state);
        }
        /*end[HADOOP_NON_COMMIT_JOB]*/
    };
}
From source file:org.apache.giraph.yarn.GiraphYarnTask.java
License:Apache License
/** * Without Hadoop MR to finish the consolidation of all the task output from * each HDFS task tmp dir, it won't get done. YARN has some job finalization * it must do "for us." -- AND must delete "jar cache" in HDFS too! */// w ww.ja v a 2 s . c o m private void finalizeYarnJob() { if (conf.isPureYarnJob() && graphTaskManager.isMaster() && conf.getVertexOutputFormatClass() != null) { try { LOG.info("Master is ready to commit final job output data."); VertexOutputFormat vertexOutputFormat = conf.createWrappedVertexOutputFormat(); OutputCommitter outputCommitter = vertexOutputFormat.getOutputCommitter(proxy); // now we will have our output in OUTDIR if all went well... outputCommitter.commitJob(proxy); LOG.info("Master has committed the final job output data."); } catch (InterruptedException ie) { LOG.error("Interrupted while attempting to obtain " + "OutputCommitter.", ie); } catch (IOException ioe) { LOG.error("Master task's attempt to commit output has " + "FAILED.", ioe); } } }
From source file:org.apache.hcatalog.mapreduce.TestHCatOutputFormat.java
License:Apache License
public void publishTest(Job job) throws Exception { OutputCommitter committer = new FileOutputCommitterContainer(job, null); committer.commitJob(job); Partition part = client.getPartition(dbName, tblName, Arrays.asList("p1")); assertNotNull(part);//w ww. ja va 2 s . co m StorerInfo storer = InternalUtil.extractStorerInfo(part.getSd(), part.getParameters()); assertEquals(storer.getProperties().get("hcat.testarg"), "testArgValue"); assertTrue(part.getSd().getLocation().indexOf("p1") != -1); }
From source file:org.apache.hcatalog.pig.TestE2EScenarios.java
License:Apache License
private void copyTable(String in, String out) throws IOException, InterruptedException { Job ijob = new Job(); Job ojob = new Job(); HCatInputFormat inpy = new HCatInputFormat(); inpy.setInput(ijob, null, in);/*from w ww. ja va 2s . c o m*/ HCatOutputFormat oupy = new HCatOutputFormat(); oupy.setOutput(ojob, OutputJobInfo.create(null, out, new HashMap<String, String>())); // Test HCatContext System.err.println("HCatContext INSTANCE is present : " + HCatContext.INSTANCE.getConf().isPresent()); if (HCatContext.INSTANCE.getConf().isPresent()) { System.err.println("HCatContext tinyint->int promotion says " + HCatContext.INSTANCE.getConf().get() .getBoolean(HCatConstants.HCAT_DATA_TINY_SMALL_INT_PROMOTION, HCatConstants.HCAT_DATA_TINY_SMALL_INT_PROMOTION_DEFAULT)); } HCatSchema tableSchema = inpy.getTableSchema(ijob.getConfiguration()); System.err.println("Copying from [" + in + "] to [" + out + "] with schema : " + tableSchema.toString()); oupy.setSchema(ojob, tableSchema); oupy.checkOutputSpecs(ojob); OutputCommitter oc = oupy.getOutputCommitter(createTaskAttemptContext(ojob.getConfiguration())); oc.setupJob(ojob); for (InputSplit split : inpy.getSplits(ijob)) { TaskAttemptContext rtaskContext = createTaskAttemptContext(ijob.getConfiguration()); TaskAttemptContext wtaskContext = createTaskAttemptContext(ojob.getConfiguration()); RecordReader<WritableComparable, HCatRecord> rr = inpy.createRecordReader(split, rtaskContext); rr.initialize(split, rtaskContext); OutputCommitter taskOc = oupy.getOutputCommitter(wtaskContext); taskOc.setupTask(wtaskContext); RecordWriter<WritableComparable<?>, HCatRecord> rw = oupy.getRecordWriter(wtaskContext); while (rr.nextKeyValue()) { rw.write(rr.getCurrentKey(), rr.getCurrentValue()); } rw.close(wtaskContext); taskOc.commitTask(wtaskContext); rr.close(); } oc.commitJob(ojob); }