Example usage for org.apache.hadoop.mapreduce OutputCommitter commitJob

Introduction

On this page you can find example usages of org.apache.hadoop.mapreduce.OutputCommitter.commitJob.

Prototype

public void commitJob(JobContext jobContext) throws IOException 

Document

For committing job's output after successful job completion.
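Before the source-file examples below, here is a minimal, self-contained sketch of the typical lifecycle around commitJob. It is not taken from any of the examples; the TextOutputFormat, output path, and job name are illustrative assumptions, and in a normal MapReduce job the framework performs these calls itself. Calling commitJob directly only matters when you drive the output format yourself, as the examples below do.

// A minimal sketch (not from the examples above): obtain a committer from a
// TextOutputFormat, set up the job, and commit it. The output path, job name
// and single-process flow are illustrative assumptions; in a real MapReduce
// job the framework drives these calls.
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;

public class CommitJobSketch {
    public static void main(String[] args) throws IOException, InterruptedException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "commitjob-sketch");
        FileOutputFormat.setOutputPath(job, new Path("/tmp/commitjob-sketch-out"));

        // The committer is obtained from the output format via a task attempt context.
        OutputFormat<Object, Object> outputFormat = new TextOutputFormat<>();
        TaskAttemptContext taskContext = new TaskAttemptContextImpl(job.getConfiguration(),
                new TaskAttemptID());
        OutputCommitter committer = outputFormat.getOutputCommitter(taskContext);

        committer.setupJob(job);   // creates the temporary output area
        // ... tasks write output and call commitTask(...) here ...
        committer.commitJob(job);  // promotes task output and marks the job as successful
    }
}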

Usage

From source file:com.scaleoutsoftware.soss.hserver.JobScheduler.java

License:Apache License

/**
 * Runs the map-reduce job on ScaleOut hServer.
 *
 * @param jobID          the id of the job
 * @param jobConf        the job to run
 * @param isNewApi       if the job uses the new MapReduce APIs
 * @param splitType      the type of the split
 * @param inputSplits    the list of input splits
 * @param splitLocations the locations of the splits
 * @param grid           the invocation grid to run the job
 * @throws IOException            if errors occurred during the job
 * @throws InterruptedException   if the processing thread is interrupted
 * @throws ClassNotFoundException if the invocation grid does not contain the dependency class
 */
@SuppressWarnings("unchecked")
public void runPredefinedJob(JobID jobID, JobConf jobConf, boolean isNewApi, Class splitType,
        List<?> inputSplits, Map<Object, String[]> splitLocations, InvocationGrid grid)
        throws IOException, InterruptedException, ClassNotFoundException {

    //Initialize user credential in advance
    long time = System.currentTimeMillis();
    CreateUserCredentials.run(grid);
    String hadoopVersion = VersionInfo.getVersion();

    int appID = 0xFFFFFFF & BitConverter.hashStringOneInt(jobID.toString());

    try {

        org.apache.hadoop.mapreduce.OutputCommitter outputCommitter = createOutputCommitter(isNewApi, jobID,
                jobConf);

        HadoopVersionSpecificCode hadoopVersionSpecificCode = HadoopVersionSpecificCode
                .getInstance(hadoopVersion, jobConf);

        org.apache.hadoop.mapred.JobContext jobContext = hadoopVersionSpecificCode.createJobContext(jobConf,
                jobID);
        outputCommitter.setupJob(jobContext);

        //clear all temporary objects
        DataAccessor.clearObjects(appID);

        //Calculating the partition layout
        com.scaleoutsoftware.soss.client.util.HostToPartitionsMapping hostNameToPartition = com.scaleoutsoftware.soss.client.util.HostToPartitionsMapping
                .getCurrent();
        List<InetAddress> hostAddresses = new ArrayList<InetAddress>(hostNameToPartition.getHosts());

        //Generating mapping of Hadoop partitions to SOSS partitions, so they are equally distributed across hosts
        int numHosts = hostAddresses.size();
        int numberOfSlotsPerNode = Math
                .max(grid != null ? grid.getMaxNumberOfCores() : Runtime.getRuntime().availableProcessors(), 1);

        //Generating split to hostname map
        Map<InetAddress, List<Integer>> splitToHostAddress = assignSplitsToHost(inputSplits, hostAddresses,
                splitLocations);

        int[] partitionMapping = hostNameToPartition.generateEvenItemDistribution(jobConf.getNumReduceTasks());

        HadoopInvocationParameters hadoopParameters = new HadoopInvocationParameters(jobConf, jobID, !isNewApi);
        HServerInvocationParameters parameters = new HServerInvocationParameters(hadoopParameters, appID,
                partitionMapping, hostNameToPartition, numberOfSlotsPerNode, splitType, inputSplits,
                splitToHostAddress, false,
                HServerParameters.getBooleanSetting(HServerParameters.SORT_KEYS, jobConf), hadoopVersion, null,
                SerializationMode.DEFAULT);

        StringBuilder stringBuilder = new StringBuilder();
        stringBuilder.append("Splits created:\n");
        for (InetAddress address : splitToHostAddress.keySet()) {
            stringBuilder.append("Host ");
            stringBuilder.append(address);
            stringBuilder.append(" has ");
            stringBuilder.append(splitToHostAddress.get(address).size());
            stringBuilder.append(" splits.\n");
        }
        System.out.println(stringBuilder.toString());

        System.out.println("Job initialization completed in " + (System.currentTimeMillis() - time) + " ms.");

        time = System.currentTimeMillis();

        InvokeResult<MapperResult> mapInvokeResult = MessagingHelper.invoke(grid,
                RunMapper.MapperInvokable.class, parameters, TimeSpan.INFINITE_TIMEOUT.getSeconds());

        if (mapInvokeResult.getErrors() != null && mapInvokeResult.getErrors().size() > 0) {
            throw new IOException("Map invocation failed.", mapInvokeResult.getErrors().get(0));
        }

        System.out.println("Map invocation done in " + (System.currentTimeMillis() - time) + " ms.");
        time = System.currentTimeMillis();

        MapperResult resultObject = mapInvokeResult.getResult();

        if (resultObject == null || mapInvokeResult.getNumFailed() != 0) {
            throw new IOException("Mapper invocation failed. Num failed = " + mapInvokeResult.getNumFailed());
        }

        if (resultObject.getNumberOfSplitsProcessed() != inputSplits.size()) {
            throw new IOException("Number of splits does not match the number of invocations. Nsplits = "
                    + inputSplits.size() + ", Ninvokes =" + resultObject.getNumberOfSplitsProcessed());
        }

        if (partitionMapping.length > 0) {
            //Running the reduce step
            InvokeResult<Integer> reduceInvokeResult = MessagingHelper.invoke(grid, ReduceInvokable.class,
                    appID, TimeSpan.INFINITE_TIMEOUT.getSeconds());

            System.out.println("Reduce invocation done in " + (System.currentTimeMillis() - time) + " ms.");

            DataAccessor.clearObjects(appID); //clear all temporary objects

            if (reduceInvokeResult.getErrors() != null && reduceInvokeResult.getErrors().size() > 0) {
                throw new IOException("Reduce invocation failed.", reduceInvokeResult.getErrors().get(0));
            }
            if (reduceInvokeResult.getNumFailed() != 0) {
                throw new IOException("Reduce invocation failed.");
            }
            if (reduceInvokeResult.getResult() != partitionMapping.length) {
                throw new IOException("Not all partitions were reduced. Expected = " + partitionMapping.length
                        + " Actual = " + reduceInvokeResult.getResult());
            }
        }
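        // Map and reduce phases completed successfully; finalize the job output.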
        outputCommitter.commitJob(jobContext);
    } catch (StateServerException e) {
        throw new IOException("ScaleOut hServer access error.", e);
    }

}

From source file:com.scaleoutsoftware.soss.hserver.JobScheduler.java

License:Apache License

/**
 * Runs the map-reduce job on ScaleOut hServer.
 *
 * @param job          the job to run
 * @param jobId        the id of the job
 * @param sortEnabled  if key sorting is enabled
 * @param jobParameter user defined parameter object for the job
 * @param grid         the invocation grid to run the job
 * @throws IOException            if errors occurred during the job
 * @throws InterruptedException   if the processing thread is interrupted
 * @throws ClassNotFoundException if the invocation grid does not contain the dependency class
 */
@SuppressWarnings("unchecked")
public void runOldApiJob(JobConf job, org.apache.hadoop.mapred.JobID jobId, boolean sortEnabled,
        Object jobParameter, InvocationGrid grid)
        throws IOException, InterruptedException, ClassNotFoundException {
    //Initialize user credential in advance
    int jobAppId = 0xFFFFFFF & BitConverter.hashStringOneInt(jobId.toString());
    String hadoopVersion = VersionInfo.getVersion();
    long time = System.currentTimeMillis();
    CreateUserCredentials.run(grid);

    try {
        //Check output specs before running the job
        job.getOutputFormat().checkOutputSpecs(FileSystem.get(job), job);

        JobContext jContext = HadoopVersionSpecificCode.getInstance(hadoopVersion, job).createJobContext(job,
                jobId);

        org.apache.hadoop.mapred.OutputCommitter outputCommitter = job.getOutputCommitter();
        outputCommitter.setupJob(jContext);

        //clear all temporary objects
        DataAccessor.clearObjects(jobAppId);

        //Calculating the partition layout
        com.scaleoutsoftware.soss.client.util.HostToPartitionsMapping hostNameToPartition = com.scaleoutsoftware.soss.client.util.HostToPartitionsMapping
                .getCurrent();
        List<InetAddress> hostAddresses = new ArrayList<InetAddress>(hostNameToPartition.getHosts());

        //Generating mapping of Hadoop partitions to SOSS partitions, so they are equally distributed across hosts
        int numHosts = hostAddresses.size();
        int numberOfSlotsPerNode = Math
                .max(grid != null ? grid.getMaxNumberOfCores() : Runtime.getRuntime().availableProcessors(), 1);

        //Set the number of splits to the number of cores
        if (NamedMapInputFormatMapred.class.isAssignableFrom(job.getInputFormat().getClass())) {
            int numberOfSplits = HServerParameters.getSetting(MAP_SPLITS_PER_CORE, job) * numHosts
                    * numberOfSlotsPerNode;
            job.setNumMapTasks(Math.min(numberOfSplits, HServerConstants.MAX_MAP_REDUCE_TASKS));
        }

        //Generating split to hostname map
        org.apache.hadoop.mapred.InputFormat inputFormat = job.getInputFormat();
        List<org.apache.hadoop.mapred.InputSplit> splitList = Arrays
                .asList(inputFormat.getSplits(job, job.getNumMapTasks()));
        Map<InetAddress, List<Integer>> splitToHostAddress = assignSplitsToHost(splitList, hostAddresses, null);

        //Choose the optimal number of reducers for GridOutputFormat
        if (job.getOutputFormat() instanceof NamedMapOutputFormatMapred) {
            job.setNumReduceTasks(numHosts * numberOfSlotsPerNode);
            sortEnabled = false;
        }

        int[] partitionMapping = hostNameToPartition.generateEvenItemDistribution(job.getNumReduceTasks());

        //Generating invocation parameters
        Class<? extends org.apache.hadoop.mapred.InputSplit> splitType = splitList.size() > 0
                ? splitList.get(0).getClass()
                : null;

        HadoopInvocationParameters hadoopParameters = new HadoopInvocationParameters(job, jobId, true);

        HServerInvocationParameters<org.apache.hadoop.mapred.InputSplit> parameters = new HServerInvocationParameters<org.apache.hadoop.mapred.InputSplit>(
                hadoopParameters, jobAppId, partitionMapping, hostNameToPartition, numberOfSlotsPerNode,
                splitType, splitList, splitToHostAddress, false, sortEnabled, hadoopVersion, jobParameter,
                SerializationMode.DEFAULT);

        StringBuilder stringBuilder = new StringBuilder();
        stringBuilder.append("Splits created:\n");
        for (InetAddress address : splitToHostAddress.keySet()) {
            stringBuilder.append("Host ");
            stringBuilder.append(address);
            stringBuilder.append(" has ");
            stringBuilder.append(splitToHostAddress.get(address).size());
            stringBuilder.append(" splits.\n");
        }
        System.out.println(stringBuilder.toString());

        System.out.println("Job initialization completed in " + (System.currentTimeMillis() - time) + " ms.");
        time = System.currentTimeMillis();

        InvokeResult<MapperResult> mapInvokeResult = MessagingHelper.invoke(grid,
                RunMapper.MapperInvokable.class, parameters, TimeSpan.INFINITE_TIMEOUT.getSeconds());

        if (mapInvokeResult.getErrors() != null && mapInvokeResult.getErrors().size() > 0) {
            throw new IOException("Map invocation failed.", mapInvokeResult.getErrors().get(0));
        }

        System.out.println("Map invocation done in " + (System.currentTimeMillis() - time) + " ms.");
        time = System.currentTimeMillis();

        MapperResult resultObject = mapInvokeResult.getResult();

        if (resultObject == null || mapInvokeResult.getNumFailed() != 0) {
            throw new IOException("Mapper invocation failed. Num failed = " + mapInvokeResult.getNumFailed());
        }

        if (resultObject.getNumberOfSplitsProcessed() != splitList.size()) {
            throw new IOException("Number of splits does not match the number of invocations. Nsplits = "
                    + splitList.size() + ", Ninvokes =" + resultObject.getNumberOfSplitsProcessed());
        }

        if (partitionMapping.length > 0) {
            //Running the reduce step
            InvokeResult<Integer> reduceInvokeResult = MessagingHelper.invoke(grid, ReduceInvokable.class,
                    jobAppId, TimeSpan.INFINITE_TIMEOUT.getSeconds());

            System.out.println("Reduce invocation done in " + (System.currentTimeMillis() - time) + " ms.");

            DataAccessor.clearObjects(jobAppId); //clear all temporary objects

            if (reduceInvokeResult.getErrors() != null && reduceInvokeResult.getErrors().size() > 0) {
                throw new IOException("Reduce invocation failed.", reduceInvokeResult.getErrors().get(0));
            }
            if (reduceInvokeResult.getNumFailed() != 0) {
                throw new IOException("Reduce invocation failed.");
            }
            if (reduceInvokeResult.getResult() != partitionMapping.length) {
                throw new IOException("Not all partitions were reduced. Expected = " + partitionMapping.length
                        + " Actual = " + reduceInvokeResult.getResult());
            }
        }
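        // Map and reduce phases completed successfully; finalize the job output.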
        outputCommitter.commitJob(jContext);
    } catch (StateServerException e) {
        throw new IOException("ScaleOut hServer access error.", e);
    }

}

From source file:cz.seznam.euphoria.hadoop.output.HadoopSink.java

License:Apache License

@Override
@SneakyThrows
public void commit() throws IOException {
    OutputCommitter committer = getHadoopFormatInstance()
            .getOutputCommitter(HadoopUtils.createTaskContext(conf.getWritable(), 0));

    committer.commitJob(HadoopUtils.createJobContext(conf.getWritable()));
}

From source file:cz.seznam.euphoria.hadoop.output.TestDataSinkOutputFormat.java

License:Apache License

@Test
@SuppressWarnings("unchecked")
/**
 * Test that {@code ListDataSink} can be used in place of hadoop {@code OutputFormat}.
 **/
public void testDataSink() throws Exception {
    DummySink sink = new DummySink();
    Configuration conf = new Configuration();
    DataSinkOutputFormat.configure(conf, sink);

    // mock the instances we will need
    TaskAttemptContext first = mockContext(conf, 0);
    TaskAttemptContext second = mockContext(conf, 1);

    // instantiate the output format
    DataSinkOutputFormat<Long> format = DataSinkOutputFormat.class.newInstance();

    // validate
    format.checkOutputSpecs(first);

    // create record writer for the first partition
    RecordWriter<NullWritable, Long> writer = format.getRecordWriter(first);
    writer.write(NullWritable.get(), 2L);
    writer.close(first);
    format.getOutputCommitter(first).commitTask(first);

    // now the second partition, we need to create new instance of output format
    format = DataSinkOutputFormat.class.newInstance();
    // validate
    format.checkOutputSpecs(second);

    // create record writer for the second partition
    writer = format.getRecordWriter(second);
    writer.write(NullWritable.get(), 4L);
    writer.close(second);
    OutputCommitter committer = format.getOutputCommitter(second);
    committer.commitTask(second);

    // and now validate what was written
    assertFalse(DummySink.isCommitted);

    committer.commitJob(second);
    assertTrue(DummySink.isCommitted);

    assertTrue(DummySink.outputs.isEmpty());
    assertEquals(2, DummySink.committed.size());

    assertEquals(Arrays.asList(2L), DummySink.committed.get(0));
    assertEquals(Arrays.asList(4L), DummySink.committed.get(1));
}

From source file:org.apache.giraph.hive.Helpers.java

License:Apache License

public static void commitJob(GiraphConfiguration conf) throws IOException, InterruptedException {
    ImmutableClassesGiraphConfiguration iconf = new ImmutableClassesGiraphConfiguration(conf);
    WrappedVertexOutputFormat outputFormat = iconf.createWrappedVertexOutputFormat();
    JobConf jobConf = new JobConf(conf);
    TaskAttemptContext taskContext = new HackTaskAttemptContext(jobConf, new TaskAttemptID());
    OutputCommitter outputCommitter = outputFormat.getOutputCommitter(taskContext);
    JobContext jobContext = new HackJobContext(jobConf, taskContext.getJobID());
    outputCommitter.commitJob(jobContext);
}

From source file:org.apache.giraph.io.internal.WrappedEdgeOutputFormat.java

License:Apache License

@Override
public OutputCommitter getOutputCommitter(TaskAttemptContext context) throws IOException, InterruptedException {

    final OutputCommitter outputCommitter = originalOutputFormat
            .getOutputCommitter(HadoopUtils.makeTaskAttemptContext(getConf(), context));

    return new OutputCommitter() {
        @Override
        public void setupJob(JobContext context) throws IOException {
            outputCommitter.setupJob(HadoopUtils.makeJobContext(getConf(), context));
        }

        @Override
        public void setupTask(TaskAttemptContext context) throws IOException {
            outputCommitter.setupTask(HadoopUtils.makeTaskAttemptContext(getConf(), context));
        }

        @Override
        public boolean needsTaskCommit(TaskAttemptContext context) throws IOException {
            return outputCommitter.needsTaskCommit(HadoopUtils.makeTaskAttemptContext(getConf(), context));
        }

        @Override
        public void commitTask(TaskAttemptContext context) throws IOException {
            outputCommitter.commitTask(HadoopUtils.makeTaskAttemptContext(getConf(), context));
        }

        @Override
        public void abortTask(TaskAttemptContext context) throws IOException {
            outputCommitter.abortTask(HadoopUtils.makeTaskAttemptContext(getConf(), context));
        }

        @Override
        public void cleanupJob(JobContext context) throws IOException {
            outputCommitter.cleanupJob(HadoopUtils.makeJobContext(getConf(), context));
        }

        /*if_not[HADOOP_NON_COMMIT_JOB]*/
        @Override
        public void commitJob(JobContext context) throws IOException {
            outputCommitter.commitJob(HadoopUtils.makeJobContext(getConf(), context));
        }

        @Override
        public void abortJob(JobContext context, JobStatus.State state) throws IOException {
            outputCommitter.abortJob(HadoopUtils.makeJobContext(getConf(), context), state);
        }
        /*end[HADOOP_NON_COMMIT_JOB]*/
    };
}

From source file:org.apache.giraph.io.internal.WrappedVertexOutputFormat.java

License:Apache License

@Override
public OutputCommitter getOutputCommitter(TaskAttemptContext context) throws IOException, InterruptedException {
    final OutputCommitter outputCommitter = originalOutputFormat
            .getOutputCommitter(HadoopUtils.makeTaskAttemptContext(getConf(), context));
    return new OutputCommitter() {
        @Override
        public void setupJob(JobContext context) throws IOException {
            outputCommitter.setupJob(HadoopUtils.makeJobContext(getConf(), context));
        }

        @Override
        public void setupTask(TaskAttemptContext context) throws IOException {
            outputCommitter.setupTask(HadoopUtils.makeTaskAttemptContext(getConf(), context));
        }

        @Override
        public boolean needsTaskCommit(TaskAttemptContext context) throws IOException {
            return outputCommitter.needsTaskCommit(HadoopUtils.makeTaskAttemptContext(getConf(), context));
        }

        @Override
        public void commitTask(TaskAttemptContext context) throws IOException {
            outputCommitter.commitTask(HadoopUtils.makeTaskAttemptContext(getConf(), context));
        }

        @Override
        public void abortTask(TaskAttemptContext context) throws IOException {
            outputCommitter.abortTask(HadoopUtils.makeTaskAttemptContext(getConf(), context));
        }

        @Override
        public void cleanupJob(JobContext context) throws IOException {
            outputCommitter.cleanupJob(HadoopUtils.makeJobContext(getConf(), context));
        }

        /*if_not[HADOOP_NON_COMMIT_JOB]*/
        @Override
        public void commitJob(JobContext context) throws IOException {
            outputCommitter.commitJob(HadoopUtils.makeJobContext(getConf(), context));
        }

        @Override
        public void abortJob(JobContext context, JobStatus.State state) throws IOException {
            outputCommitter.abortJob(HadoopUtils.makeJobContext(getConf(), context), state);
        }
        /*end[HADOOP_NON_COMMIT_JOB]*/
    };
}

From source file:org.apache.giraph.yarn.GiraphYarnTask.java

License:Apache License

/**
 * Without Hadoop MR to finish the consolidation of all the task output from
 * each HDFS task tmp dir, it won't get done. YARN has some job finalization
 * it must do "for us." -- AND must delete "jar cache" in HDFS too!
 */
private void finalizeYarnJob() {
    if (conf.isPureYarnJob() && graphTaskManager.isMaster() && conf.getVertexOutputFormatClass() != null) {
        try {
            LOG.info("Master is ready to commit final job output data.");
            VertexOutputFormat vertexOutputFormat = conf.createWrappedVertexOutputFormat();
            OutputCommitter outputCommitter = vertexOutputFormat.getOutputCommitter(proxy);
            // now we will have our output in OUTDIR if all went well...
            outputCommitter.commitJob(proxy);
            LOG.info("Master has committed the final job output data.");
        } catch (InterruptedException ie) {
            LOG.error("Interrupted while attempting to obtain " + "OutputCommitter.", ie);
        } catch (IOException ioe) {
            LOG.error("Master task's attempt to commit output has " + "FAILED.", ioe);
        }
    }
}

From source file:org.apache.hcatalog.mapreduce.TestHCatOutputFormat.java

License:Apache License

public void publishTest(Job job) throws Exception {
    OutputCommitter committer = new FileOutputCommitterContainer(job, null);
    committer.commitJob(job);

    Partition part = client.getPartition(dbName, tblName, Arrays.asList("p1"));
    assertNotNull(part);

    StorerInfo storer = InternalUtil.extractStorerInfo(part.getSd(), part.getParameters());
    assertEquals(storer.getProperties().get("hcat.testarg"), "testArgValue");
    assertTrue(part.getSd().getLocation().indexOf("p1") != -1);
}

From source file:org.apache.hcatalog.pig.TestE2EScenarios.java

License:Apache License

private void copyTable(String in, String out) throws IOException, InterruptedException {
    Job ijob = new Job();
    Job ojob = new Job();
    HCatInputFormat inpy = new HCatInputFormat();
    inpy.setInput(ijob, null, in);
    HCatOutputFormat oupy = new HCatOutputFormat();
    oupy.setOutput(ojob, OutputJobInfo.create(null, out, new HashMap<String, String>()));

    // Test HCatContext

    System.err.println("HCatContext INSTANCE is present : " + HCatContext.INSTANCE.getConf().isPresent());
    if (HCatContext.INSTANCE.getConf().isPresent()) {
        System.err.println("HCatContext tinyint->int promotion says " + HCatContext.INSTANCE.getConf().get()
                .getBoolean(HCatConstants.HCAT_DATA_TINY_SMALL_INT_PROMOTION,
                        HCatConstants.HCAT_DATA_TINY_SMALL_INT_PROMOTION_DEFAULT));
    }

    HCatSchema tableSchema = inpy.getTableSchema(ijob.getConfiguration());
    System.err.println("Copying from [" + in + "] to [" + out + "] with schema : " + tableSchema.toString());
    oupy.setSchema(ojob, tableSchema);
    oupy.checkOutputSpecs(ojob);
    OutputCommitter oc = oupy.getOutputCommitter(createTaskAttemptContext(ojob.getConfiguration()));
    oc.setupJob(ojob);

    for (InputSplit split : inpy.getSplits(ijob)) {

        TaskAttemptContext rtaskContext = createTaskAttemptContext(ijob.getConfiguration());
        TaskAttemptContext wtaskContext = createTaskAttemptContext(ojob.getConfiguration());

        RecordReader<WritableComparable, HCatRecord> rr = inpy.createRecordReader(split, rtaskContext);
        rr.initialize(split, rtaskContext);

        OutputCommitter taskOc = oupy.getOutputCommitter(wtaskContext);
        taskOc.setupTask(wtaskContext);
        RecordWriter<WritableComparable<?>, HCatRecord> rw = oupy.getRecordWriter(wtaskContext);

        while (rr.nextKeyValue()) {
            rw.write(rr.getCurrentKey(), rr.getCurrentValue());
        }
        rw.close(wtaskContext);
        taskOc.commitTask(wtaskContext);
        rr.close();
    }

    oc.commitJob(ojob);
}