Example usage for org.apache.hadoop.mapreduce OutputCommitter setupJob

Introduction

On this page you can find example usages of org.apache.hadoop.mapreduce OutputCommitter setupJob.

Prototype

public abstract void setupJob(JobContext jobContext) throws IOException;

Document

For the framework to set up the job output during initialization.
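
Before looking at the project examples below, the following minimal sketch shows the lifecycle around setupJob when an OutputFormat is driven by hand, as those examples do. It assumes the Hadoop 2.x mapreduce APIs; the class name SetupJobSketch, the method name runLocally, the output path, and the task-attempt ID string are illustrative assumptions rather than code from any of the projects listed here. In a normal cluster run, the framework itself calls setupJob.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobStatus;
import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;
import org.apache.hadoop.util.ReflectionUtils;

public class SetupJobSketch {

    // Illustrative driver: obtains the committer from the job's OutputFormat
    // and walks through the usual lifecycle around setupJob().
    public static void runLocally(Job job) throws Exception {
        Configuration conf = job.getConfiguration();

        // Assume a file-based output for this sketch; the output path is illustrative.
        job.setOutputFormatClass(TextOutputFormat.class);
        FileOutputFormat.setOutputPath(job, new Path("/tmp/setupjob-demo"));

        OutputFormat<?, ?> outputFormat = ReflectionUtils.newInstance(job.getOutputFormatClass(), conf);

        // A TaskAttemptContext is needed to obtain the committer; the attempt id is a dummy value.
        TaskAttemptContext attemptContext = new TaskAttemptContextImpl(conf,
                TaskAttemptID.forName("attempt_200707121733_0001_m_000000_0"));
        OutputCommitter committer = outputFormat.getOutputCommitter(attemptContext);

        committer.setupJob(job); // job-level output initialization (Job implements JobContext)
        try {
            // ... run the map and reduce tasks here ...
            committer.commitJob(job); // finalize the job output on success
        } catch (Exception e) {
            committer.abortJob(job, JobStatus.State.FAILED); // clean up on failure
            throw e;
        }
    }
}

The pattern is the same one SimpleJobRunner follows below: obtain the committer from the OutputFormat through a TaskAttemptContext, call setupJob before any task runs, then commitJob on success or abortJob on failure.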

Usage

From source file:com.asakusafw.runtime.mapreduce.simple.SimpleJobRunner.java

License:Apache License

private void runJob(Job job) throws ClassNotFoundException, IOException, InterruptedException {
    assert job.getJobID() != null;
    TaskID taskId = newMapTaskId(job.getJobID(), 0);
    Configuration conf = job.getConfiguration();
    OutputFormat<?, ?> output = ReflectionUtils.newInstance(job.getOutputFormatClass(), conf);
    OutputCommitter committer = output
            .getOutputCommitter(newTaskAttemptContext(conf, newTaskAttemptId(taskId, 0)));
    boolean succeed = false;
    committer.setupJob(job);
    try {
        if (job.getNumReduceTasks() == 0) {
            runMap(job, null);
        } else {
            try (KeyValueSorter<?, ?> sorter = createSorter(job, job.getMapOutputKeyClass(),
                    job.getMapOutputValueClass())) {
                runMap(job, sorter);
                runReduce(job, sorter);
            }
        }
        committer.commitJob(job);
        succeed = true;
    } finally {
        if (succeed == false) {
            try {
                committer.abortJob(job, State.FAILED);
            } catch (IOException e) {
                LOG.error(MessageFormat.format("error occurred while aborting job: {0} ({1})", job.getJobID(),
                        job.getJobName()), e);
            }
        }
    }
}

From source file:com.scaleoutsoftware.soss.hserver.JobScheduler.java

License:Apache License

/**
 * Runs the map-reduce job on ScaleOut hServer.
 *
 * @param jobID          the id of the job
 * @param jobConf        the job to run
 * @param isNewApi       if the job uses the new MapReduce APIs
 * @param splitType      the type of the split
 * @param inputSplits    the list of input splits
 * @param splitLocations the locations of the splits
 * @param grid           the invocation grid to run the job
 * @throws IOException            if errors occurred during the job
 * @throws InterruptedException   if the processing thread is interrupted
 * @throws ClassNotFoundException if the invocation grid does not contain the dependency class
 */
@SuppressWarnings("unchecked")
public void runPredefinedJob(JobID jobID, JobConf jobConf, boolean isNewApi, Class splitType,
        List<?> inputSplits, Map<Object, String[]> splitLocations, InvocationGrid grid)
        throws IOException, InterruptedException, ClassNotFoundException {

    //Initialize user credential in advance
    long time = System.currentTimeMillis();
    CreateUserCredentials.run(grid);
    String hadoopVersion = VersionInfo.getVersion();

    int appID = 0xFFFFFFF & BitConverter.hashStringOneInt(jobID.toString());

    try {

        org.apache.hadoop.mapreduce.OutputCommitter outputCommitter = createOutputCommitter(isNewApi, jobID,
                jobConf);

        HadoopVersionSpecificCode hadoopVersionSpecificCode = HadoopVersionSpecificCode
                .getInstance(hadoopVersion, jobConf);

        org.apache.hadoop.mapred.JobContext jobContext = hadoopVersionSpecificCode.createJobContext(jobConf,
                jobID);
        outputCommitter.setupJob(jobContext);

        //clear all temporary objects
        DataAccessor.clearObjects(appID);

        //Calculating the partition layout
        com.scaleoutsoftware.soss.client.util.HostToPartitionsMapping hostNameToPartition = com.scaleoutsoftware.soss.client.util.HostToPartitionsMapping
                .getCurrent();
        List<InetAddress> hostAddresses = new ArrayList<InetAddress>(hostNameToPartition.getHosts());

        //Generating mapping of Hadoop partitions to SOSS partitions, so they are equally distributed across hosts
        int numHosts = hostAddresses.size();
        int numberOfSlotsPerNode = Math
                .max(grid != null ? grid.getMaxNumberOfCores() : Runtime.getRuntime().availableProcessors(), 1);

        //Generating split to hostname map
        Map<InetAddress, List<Integer>> splitToHostAddress = assignSplitsToHost(inputSplits, hostAddresses,
                splitLocations);

        int[] partitionMapping = hostNameToPartition.generateEvenItemDistribution(jobConf.getNumReduceTasks());

        HadoopInvocationParameters hadoopParameters = new HadoopInvocationParameters(jobConf, jobID, !isNewApi);
        HServerInvocationParameters parameters = new HServerInvocationParameters(hadoopParameters, appID,
                partitionMapping, hostNameToPartition, numberOfSlotsPerNode, splitType, inputSplits,
                splitToHostAddress, false,
                HServerParameters.getBooleanSetting(HServerParameters.SORT_KEYS, jobConf), hadoopVersion, null,
                SerializationMode.DEFAULT);

        StringBuilder stringBuilder = new StringBuilder();
        stringBuilder.append("Splits created:\n");
        for (InetAddress address : splitToHostAddress.keySet()) {
            stringBuilder.append("Host ");
            stringBuilder.append(address);
            stringBuilder.append(" has ");
            stringBuilder.append(splitToHostAddress.get(address).size());
            stringBuilder.append(" splits.\n");
        }
        System.out.println(stringBuilder.toString());

        System.out.println("Job initialization completed in " + (System.currentTimeMillis() - time) + " ms.");

        time = System.currentTimeMillis();

        InvokeResult<MapperResult> mapInvokeResult = MessagingHelper.invoke(grid,
                RunMapper.MapperInvokable.class, parameters, TimeSpan.INFINITE_TIMEOUT.getSeconds());

        if (mapInvokeResult.getErrors() != null && mapInvokeResult.getErrors().size() > 0) {
            throw new IOException("Map invocation failed.", mapInvokeResult.getErrors().get(0));
        }

        System.out.println("Map invocation done in " + (System.currentTimeMillis() - time) + " ms.");
        time = System.currentTimeMillis();

        MapperResult resultObject = mapInvokeResult.getResult();

        if (resultObject == null || mapInvokeResult.getNumFailed() != 0) {
            throw new IOException("Mapper invocation failed. Num failed = " + mapInvokeResult.getNumFailed());
        }

        if (resultObject.getNumberOfSplitsProcessed() != inputSplits.size()) {
            throw new IOException("Number of splits does not match the number of invocations. Nsplits = "
                    + inputSplits.size() + ", Ninvokes =" + resultObject.getNumberOfSplitsProcessed());
        }

        if (partitionMapping.length > 0) {
            //Running the reduce step
            InvokeResult<Integer> reduceInvokeResult = MessagingHelper.invoke(grid, ReduceInvokable.class,
                    appID, TimeSpan.INFINITE_TIMEOUT.getSeconds());

            System.out.println("Reduce invocation done in " + (System.currentTimeMillis() - time) + " ms.");

            DataAccessor.clearObjects(appID); //clear all temporary objects

            if (reduceInvokeResult.getErrors() != null && reduceInvokeResult.getErrors().size() > 0) {
                throw new IOException("Reduce invocation failed.", reduceInvokeResult.getErrors().get(0));
            }
            if (reduceInvokeResult.getNumFailed() != 0) {
                throw new IOException("Reduce invocation failed.");
            }
            if (reduceInvokeResult.getResult() != partitionMapping.length) {
                throw new IOException("Not all partitions were reduced. Expected = " + partitionMapping.length
                        + " Actual = " + reduceInvokeResult.getResult());
            }
        }
        outputCommitter.commitJob(jobContext);
    } catch (StateServerException e) {
        throw new IOException("ScaleOut hServer access error.", e);
    }

}

From source file:com.scaleoutsoftware.soss.hserver.JobScheduler.java

License:Apache License

/**
 * Runs the map-reduce job on ScaleOut hServer.
 *
 * @param job          the job to run
 * @param jobId        the id of the job
 * @param sortEnabled  if key sorting is enabled
 * @param jobParameter user defined parameter object for the job
 * @param grid         the invocation grid to run the job
 * @throws IOException            if errors occurred during the job
 * @throws InterruptedException   if the processing thread is interrupted
 * @throws ClassNotFoundException if the invocation grid does not contain the dependency class
 */
@SuppressWarnings("unchecked")
public void runOldApiJob(JobConf job, org.apache.hadoop.mapred.JobID jobId, boolean sortEnabled,
        Object jobParameter, InvocationGrid grid)
        throws IOException, InterruptedException, ClassNotFoundException {
    //Initialize user credential in advance
    int jobAppId = 0xFFFFFFF & BitConverter.hashStringOneInt(jobId.toString());
    String hadoopVersion = VersionInfo.getVersion();
    long time = System.currentTimeMillis();
    CreateUserCredentials.run(grid);

    try {
        //Check output specs before running the job
        job.getOutputFormat().checkOutputSpecs(FileSystem.get(job), job);

        JobContext jContext = HadoopVersionSpecificCode.getInstance(hadoopVersion, job).createJobContext(job,
                jobId);

        org.apache.hadoop.mapred.OutputCommitter outputCommitter = job.getOutputCommitter();
        outputCommitter.setupJob(jContext);

        //clear all temporary objects
        DataAccessor.clearObjects(jobAppId);

        //Calculating the partition layout
        com.scaleoutsoftware.soss.client.util.HostToPartitionsMapping hostNameToPartition = com.scaleoutsoftware.soss.client.util.HostToPartitionsMapping
                .getCurrent();
        List<InetAddress> hostAddresses = new ArrayList<InetAddress>(hostNameToPartition.getHosts());

        //Generating mapping of Hadoop partitions to SOSS partitions, so they are equally distributed across hosts
        int numHosts = hostAddresses.size();
        int numberOfSlotsPerNode = Math
                .max(grid != null ? grid.getMaxNumberOfCores() : Runtime.getRuntime().availableProcessors(), 1);

        //Set the number of splits to the number of cores
        if (NamedMapInputFormatMapred.class.isAssignableFrom(job.getInputFormat().getClass())) {
            int numberOfSplits = HServerParameters.getSetting(MAP_SPLITS_PER_CORE, job) * numHosts
                    * numberOfSlotsPerNode;
            job.setNumMapTasks(Math.min(numberOfSplits, HServerConstants.MAX_MAP_REDUCE_TASKS));
        }

        //Generating split to hostname map
        org.apache.hadoop.mapred.InputFormat inputFormat = job.getInputFormat();
        List<org.apache.hadoop.mapred.InputSplit> splitList = Arrays
                .asList(inputFormat.getSplits(job, job.getNumMapTasks()));
        Map<InetAddress, List<Integer>> splitToHostAddress = assignSplitsToHost(splitList, hostAddresses, null);

        //Choose the optimal number of reducers for GridOutputFormat
        if (job.getOutputFormat() instanceof NamedMapOutputFormatMapred) {
            job.setNumReduceTasks(numHosts * numberOfSlotsPerNode);
            sortEnabled = false;
        }

        int[] partitionMapping = hostNameToPartition.generateEvenItemDistribution(job.getNumReduceTasks());

        //Generating invocation parameters
        Class<? extends org.apache.hadoop.mapred.InputSplit> splitType = splitList.size() > 0
                ? splitList.get(0).getClass()
                : null;

        HadoopInvocationParameters hadoopParameters = new HadoopInvocationParameters(job, jobId, true);

        HServerInvocationParameters<org.apache.hadoop.mapred.InputSplit> parameters = new HServerInvocationParameters<org.apache.hadoop.mapred.InputSplit>(
                hadoopParameters, jobAppId, partitionMapping, hostNameToPartition, numberOfSlotsPerNode,
                splitType, splitList, splitToHostAddress, false, sortEnabled, hadoopVersion, jobParameter,
                SerializationMode.DEFAULT);

        StringBuilder stringBuilder = new StringBuilder();
        stringBuilder.append("Splits created:\n");
        for (InetAddress address : splitToHostAddress.keySet()) {
            stringBuilder.append("Host ");
            stringBuilder.append(address);
            stringBuilder.append(" has ");
            stringBuilder.append(splitToHostAddress.get(address).size());
            stringBuilder.append(" splits.\n");
        }
        System.out.println(stringBuilder.toString());

        System.out.println("Job initialization completed in " + (System.currentTimeMillis() - time) + " ms.");
        time = System.currentTimeMillis();

        InvokeResult<MapperResult> mapInvokeResult = MessagingHelper.invoke(grid,
                RunMapper.MapperInvokable.class, parameters, TimeSpan.INFINITE_TIMEOUT.getSeconds());

        if (mapInvokeResult.getErrors() != null && mapInvokeResult.getErrors().size() > 0) {
            throw new IOException("Map invocation failed.", mapInvokeResult.getErrors().get(0));
        }

        System.out.println("Map invocation done in " + (System.currentTimeMillis() - time) + " ms.");
        time = System.currentTimeMillis();

        MapperResult resultObject = mapInvokeResult.getResult();

        if (resultObject == null || mapInvokeResult.getNumFailed() != 0) {
            throw new IOException("Mapper invocation failed. Num failed = " + mapInvokeResult.getNumFailed());
        }

        if (resultObject.getNumberOfSplitsProcessed() != splitList.size()) {
            throw new IOException("Number of splits does not match the number of invocations. Nsplits = "
                    + splitList.size() + ", Ninvokes =" + resultObject.getNumberOfSplitsProcessed());
        }

        if (partitionMapping.length > 0) {
            //Running the reduce step
            InvokeResult<Integer> reduceInvokeResult = MessagingHelper.invoke(grid, ReduceInvokable.class,
                    jobAppId, TimeSpan.INFINITE_TIMEOUT.getSeconds());

            System.out.println("Reduce invocation done in " + (System.currentTimeMillis() - time) + " ms.");

            DataAccessor.clearObjects(jobAppId); //clear all temporary objects

            if (reduceInvokeResult.getErrors() != null && reduceInvokeResult.getErrors().size() > 0) {
                throw new IOException("Reduce invocation failed.", reduceInvokeResult.getErrors().get(0));
            }
            if (reduceInvokeResult.getNumFailed() != 0) {
                throw new IOException("Reduce invocation failed.");
            }
            if (reduceInvokeResult.getResult() != partitionMapping.length) {
                throw new IOException("Not all partitions were reduced. Expected = " + partitionMapping.length
                        + " Actual = " + reduceInvokeResult.getResult());
            }
        }
        outputCommitter.commitJob(jContext);
    } catch (StateServerException e) {
        throw new IOException("ScaleOut hServer access error.", e);
    }

}

From source file:cz.seznam.euphoria.hadoop.output.HadoopSink.java

License:Apache License

@Override
@SneakyThrows
public void initialize() {
    OutputCommitter committer = getHadoopFormatInstance()
            .getOutputCommitter(HadoopUtils.createTaskContext(conf.getWritable(), 0));

    committer.setupJob(HadoopUtils.createJobContext(conf.getWritable()));
}

From source file:org.apache.giraph.io.internal.WrappedEdgeOutputFormat.java

License:Apache License

@Override
public OutputCommitter getOutputCommitter(TaskAttemptContext context) throws IOException, InterruptedException {

    final OutputCommitter outputCommitter = originalOutputFormat
            .getOutputCommitter(HadoopUtils.makeTaskAttemptContext(getConf(), context));

    return new OutputCommitter() {
        @Override
        public void setupJob(JobContext context) throws IOException {
            outputCommitter.setupJob(HadoopUtils.makeJobContext(getConf(), context));
        }

        @Override
        public void setupTask(TaskAttemptContext context) throws IOException {
            outputCommitter.setupTask(HadoopUtils.makeTaskAttemptContext(getConf(), context));
        }

        @Override
        public boolean needsTaskCommit(TaskAttemptContext context) throws IOException {
            return outputCommitter.needsTaskCommit(HadoopUtils.makeTaskAttemptContext(getConf(), context));
        }

        @Override
        public void commitTask(TaskAttemptContext context) throws IOException {
            outputCommitter.commitTask(HadoopUtils.makeTaskAttemptContext(getConf(), context));
        }

        @Override
        public void abortTask(TaskAttemptContext context) throws IOException {
            outputCommitter.abortTask(HadoopUtils.makeTaskAttemptContext(getConf(), context));
        }

        @Override
        public void cleanupJob(JobContext context) throws IOException {
            outputCommitter.cleanupJob(HadoopUtils.makeJobContext(getConf(), context));
        }

        /*if_not[HADOOP_NON_COMMIT_JOB]*/
        @Override
        public void commitJob(JobContext context) throws IOException {
            outputCommitter.commitJob(HadoopUtils.makeJobContext(getConf(), context));
        }

        @Override
        public void abortJob(JobContext context, JobStatus.State state) throws IOException {
            outputCommitter.abortJob(HadoopUtils.makeJobContext(getConf(), context), state);
        }
        /*end[HADOOP_NON_COMMIT_JOB]*/
    };
}

From source file:org.apache.giraph.io.internal.WrappedVertexOutputFormat.java

License:Apache License

@Override
public OutputCommitter getOutputCommitter(TaskAttemptContext context) throws IOException, InterruptedException {
    final OutputCommitter outputCommitter = originalOutputFormat
            .getOutputCommitter(HadoopUtils.makeTaskAttemptContext(getConf(), context));
    return new OutputCommitter() {
        @Override
        public void setupJob(JobContext context) throws IOException {
            outputCommitter.setupJob(HadoopUtils.makeJobContext(getConf(), context));
        }

        @Override
        public void setupTask(TaskAttemptContext context) throws IOException {
            outputCommitter.setupTask(HadoopUtils.makeTaskAttemptContext(getConf(), context));
        }

        @Override
        public boolean needsTaskCommit(TaskAttemptContext context) throws IOException {
            return outputCommitter.needsTaskCommit(HadoopUtils.makeTaskAttemptContext(getConf(), context));
        }

        @Override
        public void commitTask(TaskAttemptContext context) throws IOException {
            outputCommitter.commitTask(HadoopUtils.makeTaskAttemptContext(getConf(), context));
        }

        @Override
        public void abortTask(TaskAttemptContext context) throws IOException {
            outputCommitter.abortTask(HadoopUtils.makeTaskAttemptContext(getConf(), context));
        }

        @Override
        public void cleanupJob(JobContext context) throws IOException {
            outputCommitter.cleanupJob(HadoopUtils.makeJobContext(getConf(), context));
        }

        /*if_not[HADOOP_NON_COMMIT_JOB]*/
        @Override
        public void commitJob(JobContext context) throws IOException {
            outputCommitter.commitJob(HadoopUtils.makeJobContext(getConf(), context));
        }

        @Override
        public void abortJob(JobContext context, JobStatus.State state) throws IOException {
            outputCommitter.abortJob(HadoopUtils.makeJobContext(getConf(), context), state);
        }
        /*end[HADOOP_NON_COMMIT_JOB]*/
    };
}

From source file:org.apache.hcatalog.mapreduce.FileRecordWriterContainer.java

License:Apache License

@Override
public void write(WritableComparable<?> key, HCatRecord value) throws IOException, InterruptedException {

    org.apache.hadoop.mapred.RecordWriter localWriter;
    ObjectInspector localObjectInspector;
    SerDe localSerDe;
    OutputJobInfo localJobInfo = null;

    if (dynamicPartitioningUsed) {
        // calculate which writer to use from the remaining values - this needs to be done before we delete cols
        List<String> dynamicPartValues = new ArrayList<String>();
        for (Integer colToAppend : dynamicPartCols) {
            dynamicPartValues.add(value.get(colToAppend).toString());
        }

        String dynKey = dynamicPartValues.toString();
        if (!baseDynamicWriters.containsKey(dynKey)) {
            if ((maxDynamicPartitions != -1) && (baseDynamicWriters.size() > maxDynamicPartitions)) {
                throw new HCatException(ErrorType.ERROR_TOO_MANY_DYNAMIC_PTNS,
                        "Number of dynamic partitions being created "
                                + "exceeds configured max allowable partitions[" + maxDynamicPartitions
                                + "], increase parameter [" + HiveConf.ConfVars.DYNAMICPARTITIONMAXPARTS.varname
                                + "] if needed.");
            }

            org.apache.hadoop.mapred.TaskAttemptContext currTaskContext = HCatMapRedUtil
                    .createTaskAttemptContext(context);
            configureDynamicStorageHandler(currTaskContext, dynamicPartValues);
            localJobInfo = HCatBaseOutputFormat.getJobInfo(currTaskContext);

            //setup serDe
            SerDe currSerDe = ReflectionUtils.newInstance(storageHandler.getSerDeClass(),
                    currTaskContext.getJobConf());
            try {
                InternalUtil.initializeOutputSerDe(currSerDe, currTaskContext.getConfiguration(), localJobInfo);
            } catch (SerDeException e) {
                throw new IOException("Failed to initialize SerDe", e);
            }

            //create base OutputFormat
            org.apache.hadoop.mapred.OutputFormat baseOF = ReflectionUtils
                    .newInstance(storageHandler.getOutputFormatClass(), currTaskContext.getJobConf());

            //We are skipping calling checkOutputSpecs() for each partition
            //As it can throw a FileAlreadyExistsException when more than one mapper is writing to a partition
            //See HCATALOG-490, also to avoid contacting the namenode for each new FileOutputFormat instance
            //In general this should be ok for most FileOutputFormat implementations
            //but may become an issue for cases when the method is used to perform other setup tasks

            //get Output Committer
            org.apache.hadoop.mapred.OutputCommitter baseOutputCommitter = currTaskContext.getJobConf()
                    .getOutputCommitter();
            //create currJobContext the latest so it gets all the config changes
            org.apache.hadoop.mapred.JobContext currJobContext = HCatMapRedUtil
                    .createJobContext(currTaskContext);
            //setupJob()
            baseOutputCommitter.setupJob(currJobContext);
            //recreate to refresh jobConf of currTask context
            currTaskContext = HCatMapRedUtil.createTaskAttemptContext(currJobContext.getJobConf(),
                    currTaskContext.getTaskAttemptID(), currTaskContext.getProgressible());
            //set temp location
            currTaskContext.getConfiguration().set("mapred.work.output.dir",
                    new FileOutputCommitter(new Path(localJobInfo.getLocation()), currTaskContext).getWorkPath()
                            .toString());
            //setupTask()
            baseOutputCommitter.setupTask(currTaskContext);

            Path parentDir = new Path(currTaskContext.getConfiguration().get("mapred.work.output.dir"));
            Path childPath = new Path(parentDir, FileOutputFormat.getUniqueFile(currTaskContext, "part", ""));

            org.apache.hadoop.mapred.RecordWriter baseRecordWriter = baseOF.getRecordWriter(
                    parentDir.getFileSystem(currTaskContext.getConfiguration()), currTaskContext.getJobConf(),
                    childPath.toString(), InternalUtil.createReporter(currTaskContext));

            baseDynamicWriters.put(dynKey, baseRecordWriter);
            baseDynamicSerDe.put(dynKey, currSerDe);
            baseDynamicCommitters.put(dynKey, baseOutputCommitter);
            dynamicContexts.put(dynKey, currTaskContext);
            dynamicObjectInspectors.put(dynKey,
                    InternalUtil.createStructObjectInspector(jobInfo.getOutputSchema()));
            dynamicOutputJobInfo.put(dynKey, HCatOutputFormat.getJobInfo(dynamicContexts.get(dynKey)));
        }

        localJobInfo = dynamicOutputJobInfo.get(dynKey);
        localWriter = baseDynamicWriters.get(dynKey);
        localSerDe = baseDynamicSerDe.get(dynKey);
        localObjectInspector = dynamicObjectInspectors.get(dynKey);
    } else {
        localJobInfo = jobInfo;
        localWriter = getBaseRecordWriter();
        localSerDe = serDe;
        localObjectInspector = objectInspector;
    }

    for (Integer colToDel : partColsToDel) {
        value.remove(colToDel);
    }

    //The key given by user is ignored
    try {
        localWriter.write(NullWritable.get(), localSerDe.serialize(value.getAll(), localObjectInspector));
    } catch (SerDeException e) {
        throw new IOException("Failed to serialize object", e);
    }
}

From source file:org.apache.hcatalog.pig.TestE2EScenarios.java

License:Apache License

private void copyTable(String in, String out) throws IOException, InterruptedException {
    Job ijob = new Job();
    Job ojob = new Job();
    HCatInputFormat inpy = new HCatInputFormat();
    inpy.setInput(ijob, null, in);
    HCatOutputFormat oupy = new HCatOutputFormat();
    oupy.setOutput(ojob, OutputJobInfo.create(null, out, new HashMap<String, String>()));

    // Test HCatContext

    System.err.println("HCatContext INSTANCE is present : " + HCatContext.INSTANCE.getConf().isPresent());
    if (HCatContext.INSTANCE.getConf().isPresent()) {
        System.err.println("HCatContext tinyint->int promotion says " + HCatContext.INSTANCE.getConf().get()
                .getBoolean(HCatConstants.HCAT_DATA_TINY_SMALL_INT_PROMOTION,
                        HCatConstants.HCAT_DATA_TINY_SMALL_INT_PROMOTION_DEFAULT));
    }

    HCatSchema tableSchema = inpy.getTableSchema(ijob.getConfiguration());
    System.err.println("Copying from [" + in + "] to [" + out + "] with schema : " + tableSchema.toString());
    oupy.setSchema(ojob, tableSchema);
    oupy.checkOutputSpecs(ojob);
    OutputCommitter oc = oupy.getOutputCommitter(createTaskAttemptContext(ojob.getConfiguration()));
    oc.setupJob(ojob);

    for (InputSplit split : inpy.getSplits(ijob)) {

        TaskAttemptContext rtaskContext = createTaskAttemptContext(ijob.getConfiguration());
        TaskAttemptContext wtaskContext = createTaskAttemptContext(ojob.getConfiguration());

        RecordReader<WritableComparable, HCatRecord> rr = inpy.createRecordReader(split, rtaskContext);
        rr.initialize(split, rtaskContext);

        OutputCommitter taskOc = oupy.getOutputCommitter(wtaskContext);
        taskOc.setupTask(wtaskContext);
        RecordWriter<WritableComparable<?>, HCatRecord> rw = oupy.getRecordWriter(wtaskContext);

        while (rr.nextKeyValue()) {
            rw.write(rr.getCurrentKey(), rr.getCurrentValue());
        }
        rw.close(wtaskContext);
        taskOc.commitTask(wtaskContext);
        rr.close();
    }

    oc.commitJob(ojob);
}

From source file:org.apache.hive.hcatalog.mapreduce.DynamicPartitionFileRecordWriterContainer.java

License:Apache License

@Override
protected LocalFileWriter getLocalFileWriter(HCatRecord value) throws IOException, HCatException {
    OutputJobInfo localJobInfo = null;
    // Calculate which writer to use from the remaining values - this needs to
    // be done before we delete cols.
    List<String> dynamicPartValues = new ArrayList<String>();
    for (Integer colToAppend : dynamicPartCols) {
        Object partitionValue = value.get(colToAppend);
        dynamicPartValues
                .add(partitionValue == null ? HIVE_DEFAULT_PARTITION_VALUE : partitionValue.toString());
    }

    String dynKey = dynamicPartValues.toString();
    if (!baseDynamicWriters.containsKey(dynKey)) {
        if ((maxDynamicPartitions != -1) && (baseDynamicWriters.size() > maxDynamicPartitions)) {
            throw new HCatException(ErrorType.ERROR_TOO_MANY_DYNAMIC_PTNS,
                    "Number of dynamic partitions being created "
                            + "exceeds configured max allowable partitions[" + maxDynamicPartitions
                            + "], increase parameter [" + HiveConf.ConfVars.DYNAMICPARTITIONMAXPARTS.varname
                            + "] if needed.");
        }

        org.apache.hadoop.mapred.TaskAttemptContext currTaskContext = HCatMapRedUtil
                .createTaskAttemptContext(context);
        configureDynamicStorageHandler(currTaskContext, dynamicPartValues);
        localJobInfo = HCatBaseOutputFormat.getJobInfo(currTaskContext.getConfiguration());

        // Setup serDe.
        SerDe currSerDe = ReflectionUtils.newInstance(storageHandler.getSerDeClass(),
                currTaskContext.getJobConf());
        try {
            InternalUtil.initializeOutputSerDe(currSerDe, currTaskContext.getConfiguration(), localJobInfo);
        } catch (SerDeException e) {
            throw new IOException("Failed to initialize SerDe", e);
        }

        // create base OutputFormat
        org.apache.hadoop.mapred.OutputFormat baseOF = ReflectionUtils
                .newInstance(storageHandler.getOutputFormatClass(), currTaskContext.getJobConf());

        // We are skipping calling checkOutputSpecs() for each partition
        // As it can throw a FileAlreadyExistsException when more than one
        // mapper is writing to a partition.
        // See HCATALOG-490, also to avoid contacting the namenode for each new
        // FileOutputFormat instance.
        // In general this should be ok for most FileOutputFormat implementations
        // but may become an issue for cases when the method is used to perform
        // other setup tasks.

        // Get Output Committer
        org.apache.hadoop.mapred.OutputCommitter baseOutputCommitter = currTaskContext.getJobConf()
                .getOutputCommitter();

        // Create currJobContext the latest so it gets all the config changes
        org.apache.hadoop.mapred.JobContext currJobContext = HCatMapRedUtil.createJobContext(currTaskContext);

        // Set up job.
        baseOutputCommitter.setupJob(currJobContext);

        // Recreate to refresh jobConf of currTask context.
        currTaskContext = HCatMapRedUtil.createTaskAttemptContext(currJobContext.getJobConf(),
                currTaskContext.getTaskAttemptID(), currTaskContext.getProgressible());

        // Set temp location.
        currTaskContext.getConfiguration().set("mapred.work.output.dir",
                new FileOutputCommitter(new Path(localJobInfo.getLocation()), currTaskContext).getWorkPath()
                        .toString());

        // Set up task.
        baseOutputCommitter.setupTask(currTaskContext);

        Path parentDir = new Path(currTaskContext.getConfiguration().get("mapred.work.output.dir"));
        Path childPath = new Path(parentDir, FileOutputFormat.getUniqueFile(currTaskContext,
                currTaskContext.getConfiguration().get("mapreduce.output.basename", "part"), ""));

        RecordWriter baseRecordWriter = baseOF.getRecordWriter(
                parentDir.getFileSystem(currTaskContext.getConfiguration()), currTaskContext.getJobConf(),
                childPath.toString(), InternalUtil.createReporter(currTaskContext));

        baseDynamicWriters.put(dynKey, baseRecordWriter);
        baseDynamicSerDe.put(dynKey, currSerDe);
        baseDynamicCommitters.put(dynKey, baseOutputCommitter);
        dynamicContexts.put(dynKey, currTaskContext);
        dynamicObjectInspectors.put(dynKey,
                InternalUtil.createStructObjectInspector(jobInfo.getOutputSchema()));
        dynamicOutputJobInfo.put(dynKey,
                HCatOutputFormat.getJobInfo(dynamicContexts.get(dynKey).getConfiguration()));
    }

    return new LocalFileWriter(baseDynamicWriters.get(dynKey), dynamicObjectInspectors.get(dynKey),
            baseDynamicSerDe.get(dynKey), dynamicOutputJobInfo.get(dynKey));
}

From source file:org.apache.hive.hcatalog.mapreduce.TestHCatOutputFormat.java

License:Apache License

public void publishTest(Job job) throws Exception {
    HCatOutputFormat hcof = new HCatOutputFormat();
    TaskAttemptContext tac = ShimLoader.getHadoopShims().getHCatShim().createTaskAttemptContext(
            job.getConfiguration(), ShimLoader.getHadoopShims().getHCatShim().createTaskAttemptID());
    OutputCommitter committer = hcof.getOutputCommitter(tac);
    committer.setupJob(job);
    committer.setupTask(tac);
    committer.commitTask(tac);
    committer.commitJob(job);

    Partition part = client.getPartition(dbName, tblName, Arrays.asList("p1"));
    assertNotNull(part);

    StorerInfo storer = InternalUtil.extractStorerInfo(part.getSd(), part.getParameters());
    assertEquals(storer.getProperties().get("hcat.testarg"), "testArgValue");
    assertTrue(part.getSd().getLocation().indexOf("p1") != -1);
}