Example usage for org.apache.hadoop.mapreduce OutputCommitter setupJob

Introduction

On this page you can find example usages of org.apache.hadoop.mapreduce OutputCommitter setupJob.

Prototype

public abstract void setupJob(JobContext jobContext) throws IOException;

Document

For the framework to set up the job output during initialization.
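
Before looking at the project examples below, the following minimal sketch shows the lifecycle around setupJob when an OutputFormat is driven by hand, as those examples do. It assumes the Hadoop 2.x mapreduce APIs; the class name SetupJobSketch, the method name runLocally, the output path, and the task-attempt ID string are illustrative assumptions rather than code from any of the projects listed here. In a normal cluster run, the framework itself calls setupJob.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobStatus;
import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;
import org.apache.hadoop.util.ReflectionUtils;

public class SetupJobSketch {

    // Illustrative driver: obtains the committer from the job's OutputFormat
    // and walks through the usual lifecycle around setupJob().
    public static void runLocally(Job job) throws Exception {
        Configuration conf = job.getConfiguration();

        // Assume a file-based output for this sketch; the output path is illustrative.
        job.setOutputFormatClass(TextOutputFormat.class);
        FileOutputFormat.setOutputPath(job, new Path("/tmp/setupjob-demo"));

        OutputFormat<?, ?> outputFormat = ReflectionUtils.newInstance(job.getOutputFormatClass(), conf);

        // A TaskAttemptContext is needed to obtain the committer; the attempt id is a dummy value.
        TaskAttemptContext attemptContext = new TaskAttemptContextImpl(conf,
                TaskAttemptID.forName("attempt_200707121733_0001_m_000000_0"));
        OutputCommitter committer = outputFormat.getOutputCommitter(attemptContext);

        committer.setupJob(job); // job-level output initialization (Job implements JobContext)
        try {
            // ... run the map and reduce tasks here ...
            committer.commitJob(job); // finalize the job output on success
        } catch (Exception e) {
            committer.abortJob(job, JobStatus.State.FAILED); // clean up on failure
            throw e;
        }
    }
}

The pattern is the same one SimpleJobRunner follows below: obtain the committer from the OutputFormat through a TaskAttemptContext, call setupJob before any task runs, then commitJob on success or abortJob on failure.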

Usage

From source file:com.asakusafw.runtime.mapreduce.simple.SimpleJobRunner.java

License:Apache License

private void runJob(Job job) throws ClassNotFoundException, IOException, InterruptedException {
    assert job.getJobID() != null;
    TaskID taskId = newMapTaskId(job.getJobID(), 0);
    Configuration conf = job.getConfiguration();
    OutputFormat<?, ?> output = ReflectionUtils.newInstance(job.getOutputFormatClass(), conf);
    OutputCommitter committer = output
            .getOutputCommitter(newTaskAttemptContext(conf, newTaskAttemptId(taskId, 0)));
    boolean succeed = false;
    committer.setupJob(job);
    try {
        if (job.getNumReduceTasks() == 0) {
            runMap(job, null);
        } else {
            try (KeyValueSorter<?, ?> sorter = createSorter(job, job.getMapOutputKeyClass(),
                    job.getMapOutputValueClass())) {
                runMap(job, sorter);
                runReduce(job, sorter);
            }
        }
        committer.commitJob(job);
        succeed = true;
    } finally {
        if (succeed == false) {
            try {
                committer.abortJob(job, State.FAILED);
            } catch (IOException e) {
                LOG.error(MessageFormat.format("error occurred while aborting job: {0} ({1})", job.getJobID(),
                        job.getJobName()), e);
            }
        }
    }
}

From source file:com.scaleoutsoftware.soss.hserver.JobScheduler.java

License:Apache License

/**
 * Runs the map-reduce job on ScaleOut hServer.
 *
 * @param jobID          the id of the job
 * @param jobConf        the job to run
 * @param isNewApi       if the job uses the new MapReduce APIs
 * @param splitType      the type of the split
 * @param inputSplits    the list of input splits
 * @param splitLocations the locations of the splits
 * @param grid           the invocation grid to run the job
 * @throws IOException            if errors occurred during the job
 * @throws InterruptedException   if the processing thread is interrupted
 * @throws ClassNotFoundException if the invocation grid does not contain the dependency class
 */
@SuppressWarnings("unchecked")
public void runPredefinedJob(JobID jobID, JobConf jobConf, boolean isNewApi, Class splitType,
        List<?> inputSplits, Map<Object, String[]> splitLocations, InvocationGrid grid)
        throws IOException, InterruptedException, ClassNotFoundException {

    //Initialize user credential in advance
    long time = System.currentTimeMillis();
    CreateUserCredentials.run(grid);
    String hadoopVersion = VersionInfo.getVersion();

    int appID = 0xFFFFFFF & BitConverter.hashStringOneInt(jobID.toString());

    try {

        org.apache.hadoop.mapreduce.OutputCommitter outputCommitter = createOutputCommitter(isNewApi, jobID,
                jobConf);

        HadoopVersionSpecificCode hadoopVersionSpecificCode = HadoopVersionSpecificCode
                .getInstance(hadoopVersion, jobConf);

        org.apache.hadoop.mapred.JobContext jobContext = hadoopVersionSpecificCode.createJobContext(jobConf,
                jobID);
        outputCommitter.setupJob(jobContext);

        //clear all temporary objects
        DataAccessor.clearObjects(appID);

        //Calculating the partition layout
        com.scaleoutsoftware.soss.client.util.HostToPartitionsMapping hostNameToPartition = com.scaleoutsoftware.soss.client.util.HostToPartitionsMapping
                .getCurrent();
        List<InetAddress> hostAddresses = new ArrayList<InetAddress>(hostNameToPartition.getHosts());

        //Generating mapping of Hadoop partitions to SOSS partitions, so they are equally distributed across hosts
        int numHosts = hostAddresses.size();
        int numberOfSlotsPerNode = Math
                .max(grid != null ? grid.getMaxNumberOfCores() : Runtime.getRuntime().availableProcessors(), 1);

        //Generating split to hostname map
        Map<InetAddress, List<Integer>> splitToHostAddress = assignSplitsToHost(inputSplits, hostAddresses,
                splitLocations);

        int[] partitionMapping = hostNameToPartition.generateEvenItemDistribution(jobConf.getNumReduceTasks());

        HadoopInvocationParameters hadoopParameters = new HadoopInvocationParameters(jobConf, jobID, !isNewApi);
        HServerInvocationParameters parameters = new HServerInvocationParameters(hadoopParameters, appID,
                partitionMapping, hostNameToPartition, numberOfSlotsPerNode, splitType, inputSplits,
                splitToHostAddress, false,
                HServerParameters.getBooleanSetting(HServerParameters.SORT_KEYS, jobConf), hadoopVersion, null,
                SerializationMode.DEFAULT);

        StringBuilder stringBuilder = new StringBuilder();
        stringBuilder.append("Splits created:\n");
        for (InetAddress address : splitToHostAddress.keySet()) {
            stringBuilder.append("Host ");
            stringBuilder.append(address);
            stringBuilder.append(" has ");
            stringBuilder.append(splitToHostAddress.get(address).size());
            stringBuilder.append(" splits.\n");
        }
        System.out.println(stringBuilder.toString());

        System.out.println("Job initialization completed in " + (System.currentTimeMillis() - time) + " ms.");

        time = System.currentTimeMillis();

        InvokeResult<MapperResult> mapInvokeResult = MessagingHelper.invoke(grid,
                RunMapper.MapperInvokable.class, parameters, TimeSpan.INFINITE_TIMEOUT.getSeconds());

        if (mapInvokeResult.getErrors() != null && mapInvokeResult.getErrors().size() > 0) {
            throw new IOException("Map invocation failed.", mapInvokeResult.getErrors().get(0));
        }

        System.out.println("Map invocation done in " + (System.currentTimeMillis() - time) + " ms.");
        time = System.currentTimeMillis();

        MapperResult resultObject = mapInvokeResult.getResult();

        if (resultObject == null || mapInvokeResult.getNumFailed() != 0) {
            throw new IOException("Mapper invocation failed. Num failed = " + mapInvokeResult.getNumFailed());
        }

        if (resultObject.getNumberOfSplitsProcessed() != inputSplits.size()) {
            throw new IOException("Number of splits does not match the number of invocations. Nsplits = "
                    + inputSplits.size() + ", Ninvokes =" + resultObject.getNumberOfSplitsProcessed());
        }

        if (partitionMapping.length > 0) {
            //Running the reduce step
            InvokeResult<Integer> reduceInvokeResult = MessagingHelper.invoke(grid, ReduceInvokable.class,
                    appID, TimeSpan.INFINITE_TIMEOUT.getSeconds());

            System.out.println("Reduce invocation done in " + (System.currentTimeMillis() - time) + " ms.");

            DataAccessor.clearObjects(appID); //clear all temporary objects

            if (reduceInvokeResult.getErrors() != null && reduceInvokeResult.getErrors().size() > 0) {
                throw new IOException("Reduce invocation failed.", reduceInvokeResult.getErrors().get(0));
            }
            if (reduceInvokeResult.getNumFailed() != 0) {
                throw new IOException("Reduce invocation failed.");
            }
            if (reduceInvokeResult.getResult() != partitionMapping.length) {
                throw new IOException("Not all partitions were reduced. Expected = " + partitionMapping.length
                        + " Actual = " + reduceInvokeResult.getResult());
            }
        }
        outputCommitter.commitJob(jobContext);
    } catch (StateServerException e) {
        throw new IOException("ScaleOut hServer access error.", e);
    }

}

From source file:com.scaleoutsoftware.soss.hserver.JobScheduler.java

License:Apache License

/**
 * Runs the map-reduce job on ScaleOut hServer.
 *
 * @param job          the job to run
 * @param jobId        the id of the job
 * @param sortEnabled  if key sorting is enabled
 * @param jobParameter user defined parameter object for the job
 * @param grid         the invocation grid to run the job
 * @throws IOException            if errors occurred during the job
 * @throws InterruptedException   if the processing thread is interrupted
 * @throws ClassNotFoundException if the invocation grid does not contain the dependency class
 */
@SuppressWarnings("unchecked")
public void runOldApiJob(JobConf job, org.apache.hadoop.mapred.JobID jobId, boolean sortEnabled,
        Object jobParameter, InvocationGrid grid)
        throws IOException, InterruptedException, ClassNotFoundException {
    //Initialize user credential in advance
    int jobAppId = 0xFFFFFFF & BitConverter.hashStringOneInt(jobId.toString());
    String hadoopVersion = VersionInfo.getVersion();
    long time = System.currentTimeMillis();
    CreateUserCredentials.run(grid);

    try {
        //Check output specs before running the job
        job.getOutputFormat().checkOutputSpecs(FileSystem.get(job), job);

        JobContext jContext = HadoopVersionSpecificCode.getInstance(hadoopVersion, job).createJobContext(job,
                jobId);

        org.apache.hadoop.mapred.OutputCommitter outputCommitter = job.getOutputCommitter();
        outputCommitter.setupJob(jContext);

        //clear all temporary objects
        DataAccessor.clearObjects(jobAppId);

        //Calculating the partition layout
        com.scaleoutsoftware.soss.client.util.HostToPartitionsMapping hostNameToPartition = com.scaleoutsoftware.soss.client.util.HostToPartitionsMapping
                .getCurrent();
        List<InetAddress> hostAddresses = new ArrayList<InetAddress>(hostNameToPartition.getHosts());

        //Generating mapping of Hadoop partitions to SOSS partitions, so they are equally distributed across hosts
        int numHosts = hostAddresses.size();
        int numberOfSlotsPerNode = Math
                .max(grid != null ? grid.getMaxNumberOfCores() : Runtime.getRuntime().availableProcessors(), 1);

        //Set the number of splits to the number of cores
        if (NamedMapInputFormatMapred.class.isAssignableFrom(job.getInputFormat().getClass())) {
            int numberOfSplits = HServerParameters.getSetting(MAP_SPLITS_PER_CORE, job) * numHosts
                    * numberOfSlotsPerNode;
            job.setNumMapTasks(Math.min(numberOfSplits, HServerConstants.MAX_MAP_REDUCE_TASKS));
        }

        //Generating split to hostname map
        org.apache.hadoop.mapred.InputFormat inputFormat = job.getInputFormat();
        List<org.apache.hadoop.mapred.InputSplit> splitList = Arrays
                .asList(inputFormat.getSplits(job, job.getNumMapTasks()));
        Map<InetAddress, List<Integer>> splitToHostAddress = assignSplitsToHost(splitList, hostAddresses, null);

        //Choose the optimal number of reducers for GridOutputFormat
        if (job.getOutputFormat() instanceof NamedMapOutputFormatMapred) {
            job.setNumReduceTasks(numHosts * numberOfSlotsPerNode);
            sortEnabled = false;
        }

        int[] partitionMapping = hostNameToPartition.generateEvenItemDistribution(job.getNumReduceTasks());

        //Generating invocation parameters
        Class<? extends org.apache.hadoop.mapred.InputSplit> splitType = splitList.size() > 0
                ? splitList.get(0).getClass()
                : null;

        HadoopInvocationParameters hadoopParameters = new HadoopInvocationParameters(job, jobId, true);

        HServerInvocationParameters<org.apache.hadoop.mapred.InputSplit> parameters = new HServerInvocationParameters<org.apache.hadoop.mapred.InputSplit>(
                hadoopParameters, jobAppId, partitionMapping, hostNameToPartition, numberOfSlotsPerNode,
                splitType, splitList, splitToHostAddress, false, sortEnabled, hadoopVersion, jobParameter,
                SerializationMode.DEFAULT);

        StringBuilder stringBuilder = new StringBuilder();
        stringBuilder.append("Splits created:\n");
        for (InetAddress address : splitToHostAddress.keySet()) {
            stringBuilder.append("Host ");
            stringBuilder.append(address);
            stringBuilder.append(" has ");
            stringBuilder.append(splitToHostAddress.get(address).size());
            stringBuilder.append(" splits.\n");
        }
        System.out.println(stringBuilder.toString());

        System.out.println("Job initialization completed in " + (System.currentTimeMillis() - time) + " ms.");
        time = System.currentTimeMillis();

        InvokeResult<MapperResult> mapInvokeResult = MessagingHelper.invoke(grid,
                RunMapper.MapperInvokable.class, parameters, TimeSpan.INFINITE_TIMEOUT.getSeconds());

        if (mapInvokeResult.getErrors() != null && mapInvokeResult.getErrors().size() > 0) {
            throw new IOException("Map invocation failed.", mapInvokeResult.getErrors().get(0));
        }

        System.out.println("Map invocation done in " + (System.currentTimeMillis() - time) + " ms.");
        time = System.currentTimeMillis();

        MapperResult resultObject = mapInvokeResult.getResult();

        if (resultObject == null || mapInvokeResult.getNumFailed() != 0) {
            throw new IOException("Mapper invocation failed. Num failed = " + mapInvokeResult.getNumFailed());
        }

        if (resultObject.getNumberOfSplitsProcessed() != splitList.size()) {
            throw new IOException("Number of splits does not match the number of invocations. Nsplits = "
                    + splitList.size() + ", Ninvokes =" + resultObject.getNumberOfSplitsProcessed());
        }

        if (partitionMapping.length > 0) {
            //Running the reduce step
            InvokeResult<Integer> reduceInvokeResult = MessagingHelper.invoke(grid, ReduceInvokable.class,
                    jobAppId, TimeSpan.INFINITE_TIMEOUT.getSeconds());

            System.out.println("Reduce invocation done in " + (System.currentTimeMillis() - time) + " ms.");

            DataAccessor.clearObjects(jobAppId); //clear all temporary objects

            if (reduceInvokeResult.getErrors() != null && reduceInvokeResult.getErrors().size() > 0) {
                throw new IOException("Reduce invocation failed.", reduceInvokeResult.getErrors().get(0));
            }
            if (reduceInvokeResult.getNumFailed() != 0) {
                throw new IOException("Reduce invocation failed.");
            }
            if (reduceInvokeResult.getResult() != partitionMapping.length) {
                throw new IOException("Not all partitions were reduced. Expected = " + partitionMapping.length
                        + " Actual = " + reduceInvokeResult.getResult());
            }
        }
        outputCommitter.commitJob(jContext);
    } catch (StateServerException e) {
        throw new IOException("ScaleOut hServer access error.", e);
    }

}

From source file:cz.seznam.euphoria.hadoop.output.HadoopSink.java

License:Apache License

@Override
@SneakyThrows
public void initialize() {
    OutputCommitter committer = getHadoopFormatInstance()
            .getOutputCommitter(HadoopUtils.createTaskContext(conf.getWritable(), 0));

    committer.setupJob(HadoopUtils.createJobContext(conf.getWritable()));
}

From source file:org.apache.giraph.io.internal.WrappedEdgeOutputFormat.java

License:Apache License

@Override
public OutputCommitter getOutputCommitter(TaskAttemptContext context) throws IOException, InterruptedException {

    final OutputCommitter outputCommitter = originalOutputFormat
            .getOutputCommitter(HadoopUtils.makeTaskAttemptContext(getConf(), context));

    return new OutputCommitter() {
        @Override
        public void setupJob(JobContext context) throws IOException {
            outputCommitter.setupJob(HadoopUtils.makeJobContext(getConf(), context));
        }

        @Override
        public void setupTask(TaskAttemptContext context) throws IOException {
            outputCommitter.setupTask(HadoopUtils.makeTaskAttemptContext(getConf(), context));
        }

        @Override
        public boolean needsTaskCommit(TaskAttemptContext context) throws IOException {
            return outputCommitter.needsTaskCommit(HadoopUtils.makeTaskAttemptContext(getConf(), context));
        }

        @Override
        public void commitTask(TaskAttemptContext context) throws IOException {
            outputCommitter.commitTask(HadoopUtils.makeTaskAttemptContext(getConf(), context));
        }

        @Override
        public void abortTask(TaskAttemptContext context) throws IOException {
            outputCommitter.abortTask(HadoopUtils.makeTaskAttemptContext(getConf(), context));
        }

        @Override
        public void cleanupJob(JobContext context) throws IOException {
            outputCommitter.cleanupJob(HadoopUtils.makeJobContext(getConf(), context));
        }

        /*if_not[HADOOP_NON_COMMIT_JOB]*/
        @Override
        public void commitJob(JobContext context) throws IOException {
            outputCommitter.commitJob(HadoopUtils.makeJobContext(getConf(), context));
        }

        @Override
        public void abortJob(JobContext context, JobStatus.State state) throws IOException {
            outputCommitter.abortJob(HadoopUtils.makeJobContext(getConf(), context), state);
        }
        /*end[HADOOP_NON_COMMIT_JOB]*/
    };
}

From source file:org.apache.giraph.io.internal.WrappedVertexOutputFormat.java

License:Apache License

@Override
public OutputCommitter getOutputCommitter(TaskAttemptContext context) throws IOException, InterruptedException {
    final OutputCommitter outputCommitter = originalOutputFormat
            .getOutputCommitter(HadoopUtils.makeTaskAttemptContext(getConf(), context));
    return new OutputCommitter() {
        @Override
        public void setupJob(JobContext context) throws IOException {
            outputCommitter.setupJob(HadoopUtils.makeJobContext(getConf(), context));
        }

        @Override
        public void setupTask(TaskAttemptContext context) throws IOException {
            outputCommitter.setupTask(HadoopUtils.makeTaskAttemptContext(getConf(), context));
        }

        @Override
        public boolean needsTaskCommit(TaskAttemptContext context) throws IOException {
            return outputCommitter.needsTaskCommit(HadoopUtils.makeTaskAttemptContext(getConf(), context));
        }

        @Override
        public void commitTask(TaskAttemptContext context) throws IOException {
            outputCommitter.commitTask(HadoopUtils.makeTaskAttemptContext(getConf(), context));
        }

        @Override
        public void abortTask(TaskAttemptContext context) throws IOException {
            outputCommitter.abortTask(HadoopUtils.makeTaskAttemptContext(getConf(), context));
        }

        @Override
        public void cleanupJob(JobContext context) throws IOException {
            outputCommitter.cleanupJob(HadoopUtils.makeJobContext(getConf(), context));
        }

        /*if_not[HADOOP_NON_COMMIT_JOB]*/
        @Override
        public void commitJob(JobContext context) throws IOException {
            outputCommitter.commitJob(HadoopUtils.makeJobContext(getConf(), context));
        }

        @Override
        public void abortJob(JobContext context, JobStatus.State state) throws IOException {
            outputCommitter.abortJob(HadoopUtils.makeJobContext(getConf(), context), state);
        }
        /*end[HADOOP_NON_COMMIT_JOB]*/
    };
}

From source file:org.apache.hcatalog.mapreduce.FileRecordWriterContainer.java

License:Apache License

@Override
public void write(WritableComparable<?> key, HCatRecord value) throws IOException, InterruptedException {

    org.apache.hadoop.mapred.RecordWriter localWriter;
    ObjectInspector localObjectInspector;
    SerDe localSerDe;
    OutputJobInfo localJobInfo = null;

    if (dynamicPartitioningUsed) {
        // calculate which writer to use from the remaining values - this needs to be done before we delete cols
        List<String> dynamicPartValues = new ArrayList<String>();
        for (Integer colToAppend : dynamicPartCols) {
            dynamicPartValues.add(value.get(colToAppend).toString());
        }

        String dynKey = dynamicPartValues.toString();
        if (!baseDynamicWriters.containsKey(dynKey)) {
            if ((maxDynamicPartitions != -1) && (baseDynamicWriters.size() > maxDynamicPartitions)) {
                throw new HCatException(ErrorType.ERROR_TOO_MANY_DYNAMIC_PTNS,
                        "Number of dynamic partitions being created "
                                + "exceeds configured max allowable partitions[" + maxDynamicPartitions
                                + "], increase parameter [" + HiveConf.ConfVars.DYNAMICPARTITIONMAXPARTS.varname
                                + "] if needed.");
            }

            org.apache.hadoop.mapred.TaskAttemptContext currTaskContext = HCatMapRedUtil
                    .createTaskAttemptContext(context);
            configureDynamicStorageHandler(currTaskContext, dynamicPartValues);
            localJobInfo = HCatBaseOutputFormat.getJobInfo(currTaskContext);

            //setup serDe
            SerDe currSerDe = ReflectionUtils.newInstance(storageHandler.getSerDeClass(),
                    currTaskContext.getJobConf());
            try {
                InternalUtil.initializeOutputSerDe(currSerDe, currTaskContext.getConfiguration(), localJobInfo);
            } catch (SerDeException e) {
                throw new IOException("Failed to initialize SerDe", e);
            }

            //create base OutputFormat
            org.apache.hadoop.mapred.OutputFormat baseOF = ReflectionUtils
                    .newInstance(storageHandler.getOutputFormatClass(), currTaskContext.getJobConf());

            //We are skipping calling checkOutputSpecs() for each partition
            //As it can throw a FileAlreadyExistsException when more than one mapper is writing to a partition
            //See HCATALOG-490, also to avoid contacting the namenode for each new FileOutputFormat instance
            //In general this should be ok for most FileOutputFormat implementations
            //but may become an issue for cases when the method is used to perform other setup tasks

            //get Output Committer
            org.apache.hadoop.mapred.OutputCommitter baseOutputCommitter = currTaskContext.getJobConf()
                    .getOutputCommitter();
            //create currJobContext the latest so it gets all the config changes
            org.apache.hadoop.mapred.JobContext currJobContext = HCatMapRedUtil
                    .createJobContext(currTaskContext);
            //setupJob()
            baseOutputCommitter.setupJob(currJobContext);
            //recreate to refresh jobConf of currTask context
            currTaskContext = HCatMapRedUtil.createTaskAttemptContext(currJobContext.getJobConf(),
                    currTaskContext.getTaskAttemptID(), currTaskContext.getProgressible());
            //set temp location
            currTaskContext.getConfiguration().set("mapred.work.output.dir",
                    new FileOutputCommitter(new Path(localJobInfo.getLocation()), currTaskContext).getWorkPath()
                            .toString());
            //setupTask()
            baseOutputCommitter.setupTask(currTaskContext);

            Path parentDir = new Path(currTaskContext.getConfiguration().get("mapred.work.output.dir"));
            Path childPath = new Path(parentDir, FileOutputFormat.getUniqueFile(currTaskContext, "part", ""));

            org.apache.hadoop.mapred.RecordWriter baseRecordWriter = baseOF.getRecordWriter(
                    parentDir.getFileSystem(currTaskContext.getConfiguration()), currTaskContext.getJobConf(),
                    childPath.toString(), InternalUtil.createReporter(currTaskContext));

            baseDynamicWriters.put(dynKey, baseRecordWriter);
            baseDynamicSerDe.put(dynKey, currSerDe);
            baseDynamicCommitters.put(dynKey, baseOutputCommitter);
            dynamicContexts.put(dynKey, currTaskContext);
            dynamicObjectInspectors.put(dynKey,
                    InternalUtil.createStructObjectInspector(jobInfo.getOutputSchema()));
            dynamicOutputJobInfo.put(dynKey, HCatOutputFormat.getJobInfo(dynamicContexts.get(dynKey)));
        }

        localJobInfo = dynamicOutputJobInfo.get(dynKey);
        localWriter = baseDynamicWriters.get(dynKey);
        localSerDe = baseDynamicSerDe.get(dynKey);
        localObjectInspector = dynamicObjectInspectors.get(dynKey);
    } else {
        localJobInfo = jobInfo;
        localWriter = getBaseRecordWriter();
        localSerDe = serDe;
        localObjectInspector = objectInspector;
    }

    for (Integer colToDel : partColsToDel) {
        value.remove(colToDel);
    }

    //The key given by user is ignored
    try {
        localWriter.write(NullWritable.get(), localSerDe.serialize(value.getAll(), localObjectInspector));
    } catch (SerDeException e) {
        throw new IOException("Failed to serialize object", e);
    }
}

From source file:org.apache.hcatalog.pig.TestE2EScenarios.java

License:Apache License

private void copyTable(String in, String out) throws IOException, InterruptedException {
    Job ijob = new Job();
    Job ojob = new Job();
    HCatInputFormat inpy = new HCatInputFormat();
    inpy.setInput(ijob, null, in);
    HCatOutputFormat oupy = new HCatOutputFormat();
    oupy.setOutput(ojob, OutputJobInfo.create(null, out, new HashMap<String, String>()));

    // Test HCatContext

    System.err.println("HCatContext INSTANCE is present : " + HCatContext.INSTANCE.getConf().isPresent());
    if (HCatContext.INSTANCE.getConf().isPresent()) {
        System.err.println("HCatContext tinyint->int promotion says " + HCatContext.INSTANCE.getConf().get()
                .getBoolean(HCatConstants.HCAT_DATA_TINY_SMALL_INT_PROMOTION,
                        HCatConstants.HCAT_DATA_TINY_SMALL_INT_PROMOTION_DEFAULT));
    }

    HCatSchema tableSchema = inpy.getTableSchema(ijob.getConfiguration());
    System.err.println("Copying from [" + in + "] to [" + out + "] with schema : " + tableSchema.toString());
    oupy.setSchema(ojob, tableSchema);
    oupy.checkOutputSpecs(ojob);
    OutputCommitter oc = oupy.getOutputCommitter(createTaskAttemptContext(ojob.getConfiguration()));
    oc.setupJob(ojob);

    for (InputSplit split : inpy.getSplits(ijob)) {

        TaskAttemptContext rtaskContext = createTaskAttemptContext(ijob.getConfiguration());
        TaskAttemptContext wtaskContext = createTaskAttemptContext(ojob.getConfiguration());

        RecordReader<WritableComparable, HCatRecord> rr = inpy.createRecordReader(split, rtaskContext);
        rr.initialize(split, rtaskContext);

        OutputCommitter taskOc = oupy.getOutputCommitter(wtaskContext);
        taskOc.setupTask(wtaskContext);
        RecordWriter<WritableComparable<?>, HCatRecord> rw = oupy.getRecordWriter(wtaskContext);

        while (rr.nextKeyValue()) {
            rw.write(rr.getCurrentKey(), rr.getCurrentValue());
        }
        rw.close(wtaskContext);
        taskOc.commitTask(wtaskContext);
        rr.close();
    }

    oc.commitJob(ojob);
}

From source file:org.apache.hive.hcatalog.mapreduce.DynamicPartitionFileRecordWriterContainer.java

License:Apache License

@Override
protected LocalFileWriter getLocalFileWriter(HCatRecord value) throws IOException, HCatException {
    OutputJobInfo localJobInfo = null;
    // Calculate which writer to use from the remaining values - this needs to
    // be done before we delete cols.
    List<String> dynamicPartValues = new ArrayList<String>();
    for (Integer colToAppend : dynamicPartCols) {
        Object partitionValue = value.get(colToAppend);
        dynamicPartValues
                .add(partitionValue == null ? HIVE_DEFAULT_PARTITION_VALUE : partitionValue.toString());
    }

    String dynKey = dynamicPartValues.toString();
    if (!baseDynamicWriters.containsKey(dynKey)) {
        if ((maxDynamicPartitions != -1) && (baseDynamicWriters.size() > maxDynamicPartitions)) {
            throw new HCatException(ErrorType.ERROR_TOO_MANY_DYNAMIC_PTNS,
                    "Number of dynamic partitions being created "
                            + "exceeds configured max allowable partitions[" + maxDynamicPartitions
                            + "], increase parameter [" + HiveConf.ConfVars.DYNAMICPARTITIONMAXPARTS.varname
                            + "] if needed.");
        }

        org.apache.hadoop.mapred.TaskAttemptContext currTaskContext = HCatMapRedUtil
                .createTaskAttemptContext(context);
        configureDynamicStorageHandler(currTaskContext, dynamicPartValues);
        localJobInfo = HCatBaseOutputFormat.getJobInfo(currTaskContext.getConfiguration());

        // Setup serDe.
        SerDe currSerDe = ReflectionUtils.newInstance(storageHandler.getSerDeClass(),
                currTaskContext.getJobConf());
        try {
            InternalUtil.initializeOutputSerDe(currSerDe, currTaskContext.getConfiguration(), localJobInfo);
        } catch (SerDeException e) {
            throw new IOException("Failed to initialize SerDe", e);
        }

        // create base OutputFormat
        org.apache.hadoop.mapred.OutputFormat baseOF = ReflectionUtils
                .newInstance(storageHandler.getOutputFormatClass(), currTaskContext.getJobConf());

        // We are skipping calling checkOutputSpecs() for each partition
        // As it can throw a FileAlreadyExistsException when more than one
        // mapper is writing to a partition.
        // See HCATALOG-490, also to avoid contacting the namenode for each new
        // FileOutputFormat instance.
        // In general this should be ok for most FileOutputFormat implementations
        // but may become an issue for cases when the method is used to perform
        // other setup tasks.

        // Get Output Committer
        org.apache.hadoop.mapred.OutputCommitter baseOutputCommitter = currTaskContext.getJobConf()
                .getOutputCommitter();

        // Create currJobContext the latest so it gets all the config changes
        org.apache.hadoop.mapred.JobContext currJobContext = HCatMapRedUtil.createJobContext(currTaskContext);

        // Set up job.
        baseOutputCommitter.setupJob(currJobContext);

        // Recreate to refresh jobConf of currTask context.
        currTaskContext = HCatMapRedUtil.createTaskAttemptContext(currJobContext.getJobConf(),
                currTaskContext.getTaskAttemptID(), currTaskContext.getProgressible());

        // Set temp location.
        currTaskContext.getConfiguration().set("mapred.work.output.dir",
                new FileOutputCommitter(new Path(localJobInfo.getLocation()), currTaskContext).getWorkPath()
                        .toString());

        // Set up task.
        baseOutputCommitter.setupTask(currTaskContext);

        Path parentDir = new Path(currTaskContext.getConfiguration().get("mapred.work.output.dir"));
        Path childPath = new Path(parentDir, FileOutputFormat.getUniqueFile(currTaskContext,
                currTaskContext.getConfiguration().get("mapreduce.output.basename", "part"), ""));

        RecordWriter baseRecordWriter = baseOF.getRecordWriter(
                parentDir.getFileSystem(currTaskContext.getConfiguration()), currTaskContext.getJobConf(),
                childPath.toString(), InternalUtil.createReporter(currTaskContext));

        baseDynamicWriters.put(dynKey, baseRecordWriter);
        baseDynamicSerDe.put(dynKey, currSerDe);
        baseDynamicCommitters.put(dynKey, baseOutputCommitter);
        dynamicContexts.put(dynKey, currTaskContext);
        dynamicObjectInspectors.put(dynKey,
                InternalUtil.createStructObjectInspector(jobInfo.getOutputSchema()));
        dynamicOutputJobInfo.put(dynKey,
                HCatOutputFormat.getJobInfo(dynamicContexts.get(dynKey).getConfiguration()));
    }

    return new LocalFileWriter(baseDynamicWriters.get(dynKey), dynamicObjectInspectors.get(dynKey),
            baseDynamicSerDe.get(dynKey), dynamicOutputJobInfo.get(dynKey));
}

From source file:org.apache.hive.hcatalog.mapreduce.TestHCatOutputFormat.java

License:Apache License

public void publishTest(Job job) throws Exception {
    HCatOutputFormat hcof = new HCatOutputFormat();
    TaskAttemptContext tac = ShimLoader.getHadoopShims().getHCatShim().createTaskAttemptContext(
            job.getConfiguration(), ShimLoader.getHadoopShims().getHCatShim().createTaskAttemptID());
    OutputCommitter committer = hcof.getOutputCommitter(tac);
    committer.setupJob(job);
    committer.setupTask(tac);
    committer.commitTask(tac);
    committer.commitJob(job);

    Partition part = client.getPartition(dbName, tblName, Arrays.asList("p1"));
    assertNotNull(part);

    StorerInfo storer = InternalUtil.extractStorerInfo(part.getSd(), part.getParameters());
    assertEquals(storer.getProperties().get("hcat.testarg"), "testArgValue");
    assertTrue(part.getSd().getLocation().indexOf("p1") != -1);
}