Example usage for org.apache.hadoop.mapred JobConf setOutputCommitter

Introduction

On this page you can find example usage for org.apache.hadoop.mapred JobConf setOutputCommitter.

Prototype

public void setOutputCommitter(Class<? extends OutputCommitter> theClass) 

Document

Set the OutputCommitter implementation for the map-reduce job.
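
Before the real-world sources below, here is a minimal, self-contained sketch of where the call typically fits in old-API (org.apache.hadoop.mapred) job setup. The class name SetOutputCommitterExample and the identity map/reduce wiring are illustrative assumptions, not taken from any of the projects listed; the relevant line is the setOutputCommitter call, shown here with the stock FileOutputCommitter.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputCommitter;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.IdentityMapper;
import org.apache.hadoop.mapred.lib.IdentityReducer;

// Hypothetical example class, for illustration only.
public class SetOutputCommitterExample {
    public static void main(String[] args) throws Exception {
        JobConf job = new JobConf(SetOutputCommitterExample.class);
        job.setJobName("setOutputCommitter example");

        // A pass-through job: identity map and reduce over text lines.
        job.setInputFormat(TextInputFormat.class);
        job.setOutputFormat(TextOutputFormat.class);
        job.setMapperClass(IdentityMapper.class);
        job.setReducerClass(IdentityReducer.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Register the committer that sets up, commits, or aborts the job's output.
        // FileOutputCommitter is the stock implementation; a custom subclass of
        // org.apache.hadoop.mapred.OutputCommitter could be passed here instead.
        job.setOutputCommitter(FileOutputCommitter.class);

        JobClient.runJob(job);
    }
}

FileOutputCommitter is what FileOutputFormat-based jobs use by default, so the call matters mainly when a custom committer is needed, which is exactly what the examples below do.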

Usage

From source file: edu.umn.cs.spatialHadoop.temporal.RepartitionTemporal.java

License: Apache License

public static void repartitionMapReduce(Path[] inputPaths, Path outputPath, Shape stockShape, long blockSize,
        CellInfo[] cellInfos, String sindex, boolean overwrite) throws IOException {

    JobConf job = new JobConf(Repartition.class);

    job.setJobName("RepartitionTemporal");
    FileSystem outFs = outputPath.getFileSystem(job);

    // Overwrite output file
    if (outFs.exists(outputPath)) {
        if (overwrite)
            outFs.delete(outputPath, true);
        else
            throw new RuntimeException(
                    "Output file '" + outputPath + "' already exists and overwrite flag is not set");
    }

    // Decide which map function to use depending on the type of global
    // index
    if (sindex.equals("rtree") || sindex.equals("str")) {
        // Repartition without replication
        job.setMapperClass(RepartitionMapNoReplication.class);
    } else {
        // Repartition with replication (grid and r+tree)
        job.setMapperClass(RepartitionMap.class);
    }
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(stockShape.getClass());
    CombinedSpatialInputFormat.setInputPaths(job, inputPaths);
    job.setInputFormat(CombinedSpatialInputFormat.class);

    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setNumMapTasks(10 * Math.max(1, clusterStatus.getMaxMapTasks()));

    FileOutputFormat.setOutputPath(job, outputPath);
    if (sindex.equals("grid") || sindex.equals("str") || sindex.equals("str+")) {
        job.setOutputFormat(GridOutputFormat.class);
    } else if (sindex.equals("rtree") || sindex.equals("r+tree")) {
        // For now, the two types of local index are the same
        job.setOutputFormat(RTreeGridOutputFormat.class);
    } else {
        throw new RuntimeException("Unsupported spatial index: " + sindex);
    }

    SpatialSite.setCells(job, cellInfos);
    job.setBoolean(SpatialSite.OVERWRITE, overwrite);

    // Set reduce function
    job.setReducerClass(RepartitionReduce.class);
    job.setNumReduceTasks(
            Math.max(1, Math.min(cellInfos.length, (clusterStatus.getMaxReduceTasks() * 9 + 5) / 10)));

    // Set output committer that combines output files together
    job.setOutputCommitter(RepartitionOutputCommitter.class);

    if (blockSize != 0) {
        job.setLong("dfs.block.size", blockSize);
        job.setLong("fs.local.block.size", blockSize);
    }

    JobClient.runJob(job);
}

From source file: org.apache.blur.hive.BlurHiveStorageHandler.java

License: Apache License

@Override
public void configureJobConf(TableDesc tableDesc, JobConf jobConf) {
    if (BlurSerDe.shouldUseMRWorkingPath(jobConf)) {
        String loadId = UUID.randomUUID().toString();
        jobConf.set(BlurSerDe.BLUR_MR_LOAD_ID, loadId);
        jobConf.setOutputCommitter(BlurHiveMRLoaderOutputCommitter.class);
    } else {
        try {
            String bulkId = UUID.randomUUID().toString();
            String connectionStr = jobConf.get(BlurSerDe.BLUR_CONTROLLER_CONNECTION_STR);
            Iface client = BlurClient.getClient(connectionStr);
            client.bulkMutateStart(bulkId);
            BlurHiveOutputFormat.setBulkId(jobConf, bulkId);
            jobConf.setOutputCommitter(BlurHiveOutputCommitter.class);
        } catch (BlurException e) {
            throw new RuntimeException(e);
        } catch (TException e) {
            throw new RuntimeException(e);
        }
    }
}

From source file: org.apache.hcatalog.hbase.TestHBaseBulkOutputFormat.java

License: Apache License

@Test
public void hbaseBulkOutputFormatTest() throws IOException, ClassNotFoundException, InterruptedException {
    String testName = "hbaseBulkOutputFormatTest";
    Path methodTestDir = new Path(getTestDir(), testName);
    LOG.info("starting: " + testName);

    String tableName = newTableName(testName).toLowerCase();
    String familyName = "my_family";
    byte[] familyNameBytes = Bytes.toBytes(familyName);

    //include hbase config in conf file
    Configuration conf = new Configuration(allConf);

    //create table
    conf.set(HBaseConstants.PROPERTY_OUTPUT_TABLE_NAME_KEY, tableName);
    conf.set("yarn.scheduler.capacity.root.queues", "default");
    conf.set("yarn.scheduler.capacity.root.default.capacity", "100");
    createTable(tableName, new String[] { familyName });

    String data[] = { "1,english:one,spanish:uno", "2,english:two,spanish:dos",
            "3,english:three,spanish:tres" };

    // input/output settings
    Path inputPath = new Path(methodTestDir, "mr_input");
    FSDataOutputStream os = getFileSystem().create(new Path(inputPath, "inputFile.txt"));
    for (String line : data)
        os.write(Bytes.toBytes(line + "\n"));
    os.close();
    Path interPath = new Path(methodTestDir, "inter");
    //create job
    JobConf job = new JobConf(conf);
    job.setWorkingDirectory(new Path(methodTestDir, "mr_work"));
    job.setJarByClass(this.getClass());
    job.setMapperClass(MapWriteOldMapper.class);

    job.setInputFormat(org.apache.hadoop.mapred.TextInputFormat.class);
    org.apache.hadoop.mapred.TextInputFormat.setInputPaths(job, inputPath);

    job.setOutputFormat(HBaseBulkOutputFormat.class);
    org.apache.hadoop.mapred.SequenceFileOutputFormat.setOutputPath(job, interPath);
    job.setOutputCommitter(HBaseBulkOutputCommitter.class);

    //manually create transaction
    RevisionManager rm = HBaseRevisionManagerUtil.getOpenedRevisionManager(conf);
    try {
        OutputJobInfo outputJobInfo = OutputJobInfo.create("default", tableName, null);
        Transaction txn = rm.beginWriteTransaction(tableName, Arrays.asList(familyName));
        outputJobInfo.getProperties().setProperty(HBaseConstants.PROPERTY_WRITE_TXN_KEY,
                HCatUtil.serialize(txn));
        job.set(HCatConstants.HCAT_KEY_OUTPUT_INFO, HCatUtil.serialize(outputJobInfo));
    } finally {
        rm.close();
    }

    job.setMapOutputKeyClass(ImmutableBytesWritable.class);
    job.setMapOutputValueClass(HCatRecord.class);

    job.setOutputKeyClass(ImmutableBytesWritable.class);
    job.setOutputValueClass(HCatRecord.class);

    job.setNumReduceTasks(0);

    RunningJob runJob = JobClient.runJob(job);
    runJob.waitForCompletion();
    assertTrue(runJob.isSuccessful());

    //verify
    HTable table = new HTable(conf, tableName);
    Scan scan = new Scan();
    scan.addFamily(familyNameBytes);
    ResultScanner scanner = table.getScanner(scan);
    int index = 0;
    for (Result result : scanner) {
        String vals[] = data[index].toString().split(",");
        for (int i = 1; i < vals.length; i++) {
            String pair[] = vals[i].split(":");
            assertTrue(result.containsColumn(familyNameBytes, Bytes.toBytes(pair[0])));
            assertEquals(pair[1], Bytes.toString(result.getValue(familyNameBytes, Bytes.toBytes(pair[0]))));
        }
        index++;
    }
    //test if load count is the same
    assertEquals(data.length, index);
    //test if scratch directory was erased
    assertFalse(FileSystem.get(job).exists(interPath));
}

From source file: org.apache.parquet.hadoop.mapred.DeprecatedParquetOutputFormat.java

License: Apache License

public static void setAsOutputFormat(JobConf jobConf) {
    jobConf.setOutputFormat(DeprecatedParquetOutputFormat.class);
    jobConf.setOutputCommitter(MapredParquetOutputCommitter.class);
}

From source file: org.apache.sysml.runtime.matrix.mapred.MRJobConfiguration.java

License: Apache License

public static void setUpMultipleOutputs(JobConf job, byte[] resultIndexes, byte[] resultDimsUnknown,
        String[] outputs, OutputInfo[] outputInfos, boolean inBlockRepresentation, boolean mayContainCtable)
        throws Exception {
    if (resultIndexes.length != outputs.length)
        throw new Exception("number of outputs and result indexes does not match");
    if (outputs.length != outputInfos.length)
        throw new Exception("number of outputs and outputInfos indexes does not match");

    job.set(RESULT_INDEXES_CONFIG, MRJobConfiguration.getIndexesString(resultIndexes));
    job.set(RESULT_DIMS_UNKNOWN_CONFIG, MRJobConfiguration.getIndexesString(resultDimsUnknown));
    job.setStrings(OUTPUT_MATRICES_DIRS_CONFIG, outputs);
    job.setOutputCommitter(MultipleOutputCommitter.class);

    for (int i = 0; i < outputs.length; i++) {
        MapReduceTool.deleteFileIfExistOnHDFS(new Path(outputs[i]), job);
        if (mayContainCtable && resultDimsUnknown[i] == (byte) 1) {
            setOutputInfo(job, i, outputInfos[i], false);
        } else {
            setOutputInfo(job, i, outputInfos[i], inBlockRepresentation);
        }
        MultipleOutputs.addNamedOutput(job, Integer.toString(i), outputInfos[i].outputFormatClass,
                outputInfos[i].outputKeyClass, outputInfos[i].outputValueClass);
    }
    job.setOutputFormat(NullOutputFormat.class);

    // configure temp output
    Path tempOutputPath = new Path(constructTempOutputFilename());
    FileOutputFormat.setOutputPath(job, tempOutputPath);
    MapReduceTool.deleteFileIfExistOnHDFS(tempOutputPath, job);
}

From source file: org.commoncrawl.mapred.ec2.parser.EC2ParserTask.java

License: Open Source License

private static void parse(FileSystem fs, Configuration conf, ImmutableList<Path> paths) throws IOException {
    LOG.info("Need to Parse:" + paths.toString());
    // create output path 
    long segmentId = System.currentTimeMillis();
    Path outputPath = new Path(S3N_BUCKET_PREFIX + SEGMENTS_PATH + Long.toString(segmentId));
    LOG.info("Starting Map-Reduce Job. SegmentId:" + segmentId + " OutputPath:" + outputPath);
    // run job...
    JobConf jobConf = new JobBuilder("parse job", conf)
            .inputs(paths).inputFormat(SequenceFileInputFormat.class).keyValue(Text.class, ParseOutput.class)
            .mapper(ParserMapper.class).maxMapAttempts(3).maxMapTaskFailures(100).speculativeExecution(true)
            .numReducers(0).outputFormat(ParserOutputFormat.class).output(outputPath)
            .minSplitSize(134217728 * 2).build();

    jobConf.set("fs.default.name", S3N_BUCKET_PREFIX);
    jobConf.setOutputCommitter(OutputCommitter.class);

    JobClient.runJob(jobConf);

    LOG.info("Job Finished. Writing Segments Manifest File");
    writeSegmentManifestFile(fs, segmentId, paths);
}

From source file: tachyon.client.keyvalue.hadoop.KeyValueOutputFormat.java

License: Apache License

/**
 * {@inheritDoc}
 * <p>
 * {@link KeyValueOutputCommitter} is forced to be used.
 * <p>
 * NOTE: This method is called immediately when job is submitted, so that modifications to the
 * {@link JobConf} are reflected in the whole job.
 */
@Override
public void checkOutputSpecs(FileSystem ignored, JobConf conf)
        throws FileAlreadyExistsException, InvalidJobConfException, IOException {
    super.checkOutputSpecs(ignored, conf);
    conf.setOutputCommitter(KeyValueOutputCommitter.class);
}

From source file: tachyon.examples.keyvalue.hadoop.CloneKeyValueStoreMapReduce.java

License: Apache License

/**
 * @param args two parameters, the first is the input key-value store path, the second is the
 *    output key-value store path
 * @throws Exception if any exception happens
 */
public static void main(String[] args) throws Exception {
    JobConf conf = new JobConf(CloneKeyValueStoreMapReduce.class);
    conf.setJobName("clone key-value store");

    conf.setOutputKeyClass(BytesWritable.class);
    conf.setOutputValueClass(BytesWritable.class);

    conf.setMapperClass(Map.class);
    conf.setReducerClass(Reduce.class);

    conf.setInputFormat(KeyValueInputFormat.class);
    conf.setOutputFormat(KeyValueOutputFormat.class);
    conf.setOutputCommitter(KeyValueOutputCommitter.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    JobClient.runJob(conf);
}