Example usage for org.apache.hadoop.mapred JobConf getOutputCommitter

List of usage examples for org.apache.hadoop.mapred JobConf getOutputCommitter

Introduction

On this page you can find example usages of org.apache.hadoop.mapred.JobConf#getOutputCommitter().

Prototype

public OutputCommitter getOutputCommitter() 

Document

Get the OutputCommitter implementation for the map-reduce job; defaults to FileOutputCommitter if not specified explicitly.
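
For orientation, here is a minimal sketch of how the committer is typically configured and read back through JobConf. The class name OutputCommitterExample is an illustrative assumption; JobConf#setOutputCommitter and JobConf#getOutputCommitter are part of the org.apache.hadoop.mapred API.

import org.apache.hadoop.mapred.FileOutputCommitter;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCommitter;

public class OutputCommitterExample {
    public static void main(String[] args) {
        JobConf jobConf = new JobConf();

        // Nothing set explicitly, so getOutputCommitter() falls back to FileOutputCommitter.
        OutputCommitter defaultCommitter = jobConf.getOutputCommitter();
        System.out.println(defaultCommitter.getClass().getName());

        // Register a committer class explicitly; getOutputCommitter() then
        // instantiates that class reflectively on each call.
        jobConf.setOutputCommitter(FileOutputCommitter.class);
        OutputCommitter explicitCommitter = jobConf.getOutputCommitter();
        System.out.println(explicitCommitter.getClass().getName());
    }
}

Most of the examples below follow the same pattern: clone or construct a JobConf, call getOutputCommitter(), and drive the committer's setupJob/setupTask/commitTask lifecycle around the record writer.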

Usage

From source file:com.facebook.hiveio.output.HiveApiOutputFormat.java

License:Apache License

@Override
public HiveApiOutputCommitter getOutputCommitter(TaskAttemptContext taskAttemptContext)
        throws IOException, InterruptedException {
    HadoopUtils.setWorkOutputDir(taskAttemptContext);
    Configuration conf = taskAttemptContext.getConfiguration();
    JobConf jobConf = new JobConf(conf);
    OutputCommitter baseCommitter = jobConf.getOutputCommitter();
    LOG.info("Getting output committer with base output committer {}",
            baseCommitter.getClass().getSimpleName());
    return new HiveApiOutputCommitter(new HackOutputCommitter(baseCommitter, jobConf), myProfileId);
}

From source file:com.scaleoutsoftware.soss.hserver.hadoop.MapperWrapperMapred.java

License:Apache License

/**
 * Runs the mapper for a single split.
 *
 * @param mapOutputAccumulator mapOutputAccumulator to use
 * @param split                split to run on
 */

@Override
@SuppressWarnings("unchecked")
public void runSplit(final MapOutputAccumulator<OUTKEY, OUTVALUE> mapOutputAccumulator, Object split,
        int splitIndex) throws IOException, ClassNotFoundException, InterruptedException {
    JobConf jobConf = new JobConf(this.jobConf); //Clone JobConf to prevent unexpected task interaction

    TaskAttemptID taskAttemptID = TaskAttemptID
            .downgrade(hadoopVersionSpecificCode.createTaskAttemptId(jobId, true, splitIndex));

    ReducerWrapperMapred.updateJobConf(jobConf, taskAttemptID, splitIndex);
    updateJobWithSplit(jobConf, split);

    InputFormat inputFormat = jobConf.getInputFormat();

    Reporter reporter = Reporter.NULL;

    //Create RecordReader
    org.apache.hadoop.mapred.RecordReader<INKEY, INVALUE> recordReader = inputFormat
            .getRecordReader((InputSplit) split, jobConf, reporter);

    //Make a mapper
    org.apache.hadoop.mapred.Mapper<INKEY, INVALUE, OUTKEY, OUTVALUE> mapper;
    try {
        mapper = (org.apache.hadoop.mapred.Mapper<INKEY, INVALUE, OUTKEY, OUTVALUE>) mapperConstructor
                .newInstance();
        mapper.configure(jobConf);
    } catch (Exception e) {
        throw new RuntimeException("Cannot instantiate mapper " + mapperConstructor.getDeclaringClass(), e);
    }

    //These are to support map only jobs which write output directly to HDFS.
    final RecordWriter outputRecordWriter;
    OutputCommitter outputCommitter = null;
    TaskAttemptContext taskAttemptContext = null;

    if (mapOnlyJob) {

        taskAttemptContext = hadoopVersionSpecificCode.createTaskAttemptContextMapred(jobConf, taskAttemptID);
        OutputFormat outputFormat = jobConf.getOutputFormat();
        FileSystem fs = FileSystem.get(jobConf);
        outputRecordWriter = (org.apache.hadoop.mapred.RecordWriter<OUTKEY, OUTVALUE>) outputFormat
                .getRecordWriter(fs, jobConf, ReducerWrapperMapred.getOutputName(splitIndex), Reporter.NULL);
        outputCommitter = jobConf.getOutputCommitter();

        //Create task object so it can handle file format initialization
        //MapTask is private in Hadoop 1.x, so we have to go through reflection.
        try {
            Class mapTask = Class.forName("org.apache.hadoop.mapred.MapTask");
            Constructor mapTaskConstructor = mapTask.getDeclaredConstructor(String.class,
                    TaskAttemptID.class, int.class, JobSplit.TaskSplitIndex.class, int.class);
            mapTaskConstructor.setAccessible(true);
            Task task = (Task) mapTaskConstructor.newInstance(null, taskAttemptID, splitIndex,
                    new JobSplit.TaskSplitIndex(), 0);
            task.setConf(jobConf);
            task.initialize(jobConf, jobId, Reporter.NULL, false);
        } catch (Exception e) {
            throw new IOException("Cannot initialize MapTask", e);
        }
        outputCommitter.setupTask(taskAttemptContext);
    } else {
        outputRecordWriter = null;
    }

    OutputCollector<OUTKEY, OUTVALUE> outputCollector;

    if (!mapOnlyJob) {
        outputCollector = new OutputCollector<OUTKEY, OUTVALUE>() {
            @Override
            public void collect(OUTKEY outkey, OUTVALUE outvalue) throws IOException {
                try {
                    mapOutputAccumulator.combine(outkey, outvalue);
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                }
            }
        };
    } else {
        outputCollector = new OutputCollector<OUTKEY, OUTVALUE>() {
            @Override
            public void collect(OUTKEY outkey, OUTVALUE outvalue) throws IOException {
                outputRecordWriter.write(outkey, outvalue);
            }
        };
    }

    INKEY key = recordReader.createKey();
    INVALUE value = recordReader.createValue();

    while (recordReader.next(key, value)) {
        mapper.map(key, value, outputCollector, reporter);
    }
    mapper.close();

    recordReader.close();

    if (mapOnlyJob) {
        outputRecordWriter.close(Reporter.NULL);
        outputCommitter.commitTask(taskAttemptContext);
    }

}

From source file:com.scaleoutsoftware.soss.hserver.hadoop.ReducerWrapperMapred.java

License:Apache License

public ReducerWrapperMapred(HServerInvocationParameters invocationParameters, int hadoopPartition, int appId,
        int region, boolean sort) throws IOException, ClassNotFoundException, InterruptedException {
    this.invocationParameters = invocationParameters;
    JobConf jobConf = new JobConf((Configuration) invocationParameters.getConfiguration()); //Clone JobConf, so the temporary settings do not pollute other tasks

    LOG.info("Starting reducer:" + HadoopInvocationParameters.dumpConfiguration(jobConf));

    JobID jobID = (JobID) invocationParameters.getJobId();
    this.hadoopPartition = hadoopPartition;
    hadoopVersionSpecificCode = HadoopVersionSpecificCode.getInstance(invocationParameters.getHadoopVersion(),
            jobConf);

    TaskAttemptID taskAttemptID = TaskAttemptID
            .downgrade(hadoopVersionSpecificCode.createTaskAttemptId(jobID, false, hadoopPartition));

    updateJobConf(jobConf, taskAttemptID, region);

    context = hadoopVersionSpecificCode.createTaskAttemptContextMapred(jobConf, taskAttemptID);

    reducer = (org.apache.hadoop.mapred.Reducer<INKEY, INVALUE, OUTKEY, OUTVALUE>) ReflectionUtils
            .newInstance(jobConf.getReducerClass(), jobConf);

    reducer.configure(jobConf);

    OutputFormat outputFormat = jobConf.getOutputFormat();

    FileSystem fs = FileSystem.get(jobConf);
    recordWriter = (org.apache.hadoop.mapred.RecordWriter<OUTKEY, OUTVALUE>) outputFormat.getRecordWriter(fs,
            jobConf, getOutputName(hadoopPartition), Reporter.NULL);

    committer = jobConf.getOutputCommitter();

    //Create task object so it can handle file format initialization
    //ReduceTask is private in Hadoop 1.x, so we have to go through reflection.
    try {
        Class reduceTask = Class.forName("org.apache.hadoop.mapred.ReduceTask");
        Constructor reduceTaskConstructor = reduceTask.getDeclaredConstructor(String.class, TaskAttemptID.class,
                int.class, int.class, int.class);
        reduceTaskConstructor.setAccessible(true);
        Task task = (Task) reduceTaskConstructor.newInstance(null, taskAttemptID, hadoopPartition, 0, 0);
        task.setConf(jobConf);
        task.initialize(jobConf, jobID, Reporter.NULL, false);
    } catch (Exception e) {
        throw new IOException("Cannot initialize ReduceTask", e);
    }

    committer.setupTask(context);

    Class<INKEY> keyClass = (Class<INKEY>) jobConf.getMapOutputKeyClass();
    WritableSerializerDeserializer<INKEY> firstKeySerializer = new WritableSerializerDeserializer<INKEY>(
            keyClass, null);
    WritableSerializerDeserializer<INKEY> secondKeySerializer = new WritableSerializerDeserializer<INKEY>(
            keyClass, null);
    Class<INVALUE> valueClass = (Class<INVALUE>) jobConf.getMapOutputValueClass();
    WritableSerializerDeserializer<INVALUE> valueSerializer = new WritableSerializerDeserializer<INVALUE>(
            valueClass, null);

    DataGridReaderParameters<INKEY, INVALUE> params = new DataGridReaderParameters<INKEY, INVALUE>(region,
            appId, HServerParameters.getSetting(REDUCE_USEMEMORYMAPPEDFILES, jobConf) > 0, firstKeySerializer,
            valueSerializer, invocationParameters.getSerializationMode(), secondKeySerializer, keyClass,
            valueClass, sort, HServerParameters.getSetting(REDUCE_CHUNKSTOREADAHEAD, jobConf),
            1024 * HServerParameters.getSetting(REDUCE_INPUTCHUNKSIZE_KB, jobConf),
            HServerParameters.getSetting(REDUCE_CHUNKREADTIMEOUT, jobConf));
    transport = DataGridChunkedCollectionReader.getGridReader(params);
    outputCollector = new OutputCollector<OUTKEY, OUTVALUE>() {
        @Override
        public void collect(OUTKEY outkey, OUTVALUE outvalue) throws IOException {
            recordWriter.write(outkey, outvalue);
        }
    };
}

From source file:com.scaleoutsoftware.soss.hserver.JobScheduler.java

License:Apache License

/**
 * Runs the map-reduce job on ScaleOut hServer.
 *
 * @param job          the job to run
 * @param jobId        the id of the job
 * @param sortEnabled  if key sorting is enabled
 * @param jobParameter user defined parameter object for the job
 * @param grid         the invocation grid to run the job
 * @throws IOException            if errors occurred during the job
 * @throws InterruptedException   if the processing thread is interrupted
 * @throws ClassNotFoundException if the invocation grid does not contain the dependency class
 */
@SuppressWarnings("unchecked")
public void runOldApiJob(JobConf job, org.apache.hadoop.mapred.JobID jobId, boolean sortEnabled,
        Object jobParameter, InvocationGrid grid)
        throws IOException, InterruptedException, ClassNotFoundException {
    //Initialize user credential in advance
    int jobAppId = 0xFFFFFFF & BitConverter.hashStringOneInt(jobId.toString());
    String hadoopVersion = VersionInfo.getVersion();
    long time = System.currentTimeMillis();
    CreateUserCredentials.run(grid);

    try {
        //Check output specs before running the job
        job.getOutputFormat().checkOutputSpecs(FileSystem.get(job), job);

        JobContext jContext = HadoopVersionSpecificCode.getInstance(hadoopVersion, job).createJobContext(job,
                jobId);

        org.apache.hadoop.mapred.OutputCommitter outputCommitter = job.getOutputCommitter();
        outputCommitter.setupJob(jContext);

        //clear all temporary objects
        DataAccessor.clearObjects(jobAppId);

        //Calculating the partition layout
        com.scaleoutsoftware.soss.client.util.HostToPartitionsMapping hostNameToPartition = com.scaleoutsoftware.soss.client.util.HostToPartitionsMapping
                .getCurrent();
        List<InetAddress> hostAddresses = new ArrayList<InetAddress>(hostNameToPartition.getHosts());

        //Generating mapping of Hadoop partitions to SOSS partitions, so they are equally distributed across hosts
        int numHosts = hostAddresses.size();
        int numberOfSlotsPerNode = Math
                .max(grid != null ? grid.getMaxNumberOfCores() : Runtime.getRuntime().availableProcessors(), 1);

        //Set the number of splits to the number of cores
        if (NamedMapInputFormatMapred.class.isAssignableFrom(job.getInputFormat().getClass())) {
            int numberOfSplits = HServerParameters.getSetting(MAP_SPLITS_PER_CORE, job) * numHosts
                    * numberOfSlotsPerNode;
            job.setNumMapTasks(Math.min(numberOfSplits, HServerConstants.MAX_MAP_REDUCE_TASKS));
        }

        //Generating split to hostname map
        org.apache.hadoop.mapred.InputFormat inputFormat = job.getInputFormat();
        List<org.apache.hadoop.mapred.InputSplit> splitList = Arrays
                .asList(inputFormat.getSplits(job, job.getNumMapTasks()));
        Map<InetAddress, List<Integer>> splitToHostAddress = assignSplitsToHost(splitList, hostAddresses, null);

        //Choose the optimal number of reducers for GridOutputFormat
        if (job.getOutputFormat() instanceof NamedMapOutputFormatMapred) {
            job.setNumReduceTasks(numHosts * numberOfSlotsPerNode);
            sortEnabled = false;
        }

        int[] partitionMapping = hostNameToPartition.generateEvenItemDistribution(job.getNumReduceTasks());

        //Generating invocation parameters
        Class<? extends org.apache.hadoop.mapred.InputSplit> splitType = splitList.size() > 0
                ? splitList.get(0).getClass()
                : null;

        HadoopInvocationParameters hadoopParameters = new HadoopInvocationParameters(job, jobId, true);

        HServerInvocationParameters<org.apache.hadoop.mapred.InputSplit> parameters = new HServerInvocationParameters<org.apache.hadoop.mapred.InputSplit>(
                hadoopParameters, jobAppId, partitionMapping, hostNameToPartition, numberOfSlotsPerNode,
                splitType, splitList, splitToHostAddress, false, sortEnabled, hadoopVersion, jobParameter,
                SerializationMode.DEFAULT);

        StringBuilder stringBuilder = new StringBuilder();
        stringBuilder.append("Splits created:\n");
        for (InetAddress address : splitToHostAddress.keySet()) {
            stringBuilder.append("Host ");
            stringBuilder.append(address);
            stringBuilder.append(" has ");
            stringBuilder.append(splitToHostAddress.get(address).size());
            stringBuilder.append(" splits.\n");
        }
        System.out.println(stringBuilder.toString());

        System.out.println("Job initialization completed in " + (System.currentTimeMillis() - time) + " ms.");
        time = System.currentTimeMillis();

        InvokeResult<MapperResult> mapInvokeResult = MessagingHelper.invoke(grid,
                RunMapper.MapperInvokable.class, parameters, TimeSpan.INFINITE_TIMEOUT.getSeconds());

        if (mapInvokeResult.getErrors() != null && mapInvokeResult.getErrors().size() > 0) {
            throw new IOException("Map invocation failed.", mapInvokeResult.getErrors().get(0));
        }

        System.out.println("Map invocation done in " + (System.currentTimeMillis() - time) + " ms.");
        time = System.currentTimeMillis();

        MapperResult resultObject = mapInvokeResult.getResult();

        if (resultObject == null || mapInvokeResult.getNumFailed() != 0) {
            throw new IOException("Mapper invocation failed. Num failed = " + mapInvokeResult.getNumFailed());
        }

        if (resultObject.getNumberOfSplitsProcessed() != splitList.size()) {
            throw new IOException("Number of splits does not match the number of invocations. Nsplits = "
                    + splitList.size() + ", Ninvokes =" + resultObject.getNumberOfSplitsProcessed());
        }

        if (partitionMapping.length > 0) {
            //Running the reduce step
            InvokeResult<Integer> reduceInvokeResult = MessagingHelper.invoke(grid, ReduceInvokable.class,
                    jobAppId, TimeSpan.INFINITE_TIMEOUT.getSeconds());

            System.out.println("Reduce invocation done in " + (System.currentTimeMillis() - time) + " ms.");

            DataAccessor.clearObjects(jobAppId); //clear all temporary objects

            if (reduceInvokeResult.getErrors() != null && reduceInvokeResult.getErrors().size() > 0) {
                throw new IOException("Reduce invocation failed.", reduceInvokeResult.getErrors().get(0));
            }
            if (reduceInvokeResult.getNumFailed() != 0) {
                throw new IOException("Reduce invocation failed.");
            }
            if (reduceInvokeResult.getResult() != partitionMapping.length) {
                throw new IOException("Not all partitions were reduced. Expected = " + partitionMapping.length
                        + " Actual = " + reduceInvokeResult.getResult());
            }
        }
        outputCommitter.commitJob(jContext);
    } catch (StateServerException e) {
        throw new IOException("ScaleOut hServer access error.", e);
    }

}

From source file:org.apache.flink.api.java.hadoop.mapred.HadoopOutputFormatTest.java

License:Apache License

@Test
public void testOpen() throws Exception {

    OutputFormat<String, Long> dummyOutputFormat = mock(DummyOutputFormat.class);
    DummyOutputCommitter outputCommitter = mock(DummyOutputCommitter.class);
    JobConf jobConf = spy(new JobConf());
    when(jobConf.getOutputCommitter()).thenReturn(outputCommitter);

    HadoopOutputFormat<String, Long> outputFormat = new HadoopOutputFormat<>(dummyOutputFormat, jobConf);

    outputFormat.open(1, 1);

    verify(jobConf, times(2)).getOutputCommitter();
    verify(outputCommitter, times(1)).setupJob(any(JobContext.class));
    verify(dummyOutputFormat, times(1)).getRecordWriter(any(FileSystem.class), any(JobConf.class), anyString(),
            any(Progressable.class));
}

From source file:org.apache.flink.api.java.hadoop.mapred.HadoopOutputFormatTest.java

License:Apache License

@Test
public void testFinalizeGlobal() throws Exception {
    OutputFormat<String, Long> dummyOutputFormat = mock(DummyOutputFormat.class);
    DummyOutputCommitter outputCommitter = mock(DummyOutputCommitter.class);
    JobConf jobConf = spy(new JobConf());
    when(jobConf.getOutputCommitter()).thenReturn(outputCommitter);

    HadoopOutputFormat<String, Long> outputFormat = new HadoopOutputFormat<>(dummyOutputFormat, jobConf);

    outputFormat.finalizeGlobal(1);

    verify(outputCommitter, times(1)).commitJob(any(JobContext.class));
}

From source file:org.apache.flink.batch.connectors.hive.HiveTableOutputFormat.java

License:Apache License

private HivePartitionWriter writerForLocation(String location) throws IOException {
    JobConf clonedConf = new JobConf(jobConf);
    clonedConf.set(OUTDIR, location);
    OutputFormat outputFormat;
    try {
        StorageDescriptor sd = hiveTablePartition.getStorageDescriptor();
        Class outputFormatClz = Class.forName(sd.getOutputFormat(), true,
                Thread.currentThread().getContextClassLoader());
        outputFormatClz = HiveFileFormatUtils.getOutputFormatSubstitute(outputFormatClz);
        outputFormat = (OutputFormat) outputFormatClz.newInstance();
    } catch (InstantiationException | IllegalAccessException | ClassNotFoundException e) {
        throw new FlinkRuntimeException("Unable to instantiate the hadoop output format", e);
    }
    ReflectionUtils.setConf(outputFormat, clonedConf);
    OutputCommitter outputCommitter = clonedConf.getOutputCommitter();
    JobContext jobContext = new JobContextImpl(clonedConf, new JobID());
    outputCommitter.setupJob(jobContext);
    final boolean isCompressed = clonedConf.getBoolean(HiveConf.ConfVars.COMPRESSRESULT.varname, false);
    if (isCompressed) {
        String codecStr = clonedConf.get(HiveConf.ConfVars.COMPRESSINTERMEDIATECODEC.varname);
        if (!StringUtils.isNullOrWhitespaceOnly(codecStr)) {
            try {
                Class<? extends CompressionCodec> codec = (Class<? extends CompressionCodec>) Class
                        .forName(codecStr, true, Thread.currentThread().getContextClassLoader());
                FileOutputFormat.setOutputCompressorClass(clonedConf, codec);
            } catch (ClassNotFoundException e) {
                throw new RuntimeException(e);
            }
        }
        String typeStr = clonedConf.get(HiveConf.ConfVars.COMPRESSINTERMEDIATETYPE.varname);
        if (!StringUtils.isNullOrWhitespaceOnly(typeStr)) {
            SequenceFile.CompressionType style = SequenceFile.CompressionType.valueOf(typeStr);
            SequenceFileOutputFormat.setOutputCompressionType(clonedConf, style);
        }
    }
    String taskPartition = String.valueOf(clonedConf.getInt("mapreduce.task.partition", -1));
    Path taskPath = FileOutputFormat.getTaskOutputPath(clonedConf, taskPartition);
    FileSinkOperator.RecordWriter recordWriter;
    try {
        recordWriter = HiveFileFormatUtils.getRecordWriter(clonedConf, outputFormat, outputClass, isCompressed,
                tblProperties, taskPath, Reporter.NULL);
    } catch (HiveException e) {
        throw new IOException(e);
    }
    return new HivePartitionWriter(clonedConf, outputFormat, recordWriter, outputCommitter);
}

From source file:org.apache.tez.mapreduce.output.MROutput.java

License:Apache License

public void initCommitter(JobConf job, boolean useNewApi) throws IOException, InterruptedException {

    if (useNewApi) {
        if (LOG.isDebugEnabled()) {
            LOG.debug("using new api for output committer");
        }

        this.committer = newOutputFormat.getOutputCommitter(newApiTaskAttemptContext);
    } else {
        this.committer = job.getOutputCommitter();
    }

    Path outputPath = FileOutputFormat.getOutputPath(job);
    if (outputPath != null) {
        if ((this.committer instanceof FileOutputCommitter)) {
            FileOutputFormat.setWorkOutputPath(job,
                    ((FileOutputCommitter) this.committer).getTaskAttemptPath(oldApiTaskAttemptContext));
        } else {
            FileOutputFormat.setWorkOutputPath(job, outputPath);
        }
    }
    if (useNewApi) {
        this.committer.setupTask(newApiTaskAttemptContext);
    } else {
        this.committer.setupTask(oldApiTaskAttemptContext);
    }
}