Example usage for org.apache.hadoop.mapred JobConf setOutputCommitter

Introduction

On this page you can find example usage for org.apache.hadoop.mapred JobConf setOutputCommitter.

Prototype

public void setOutputCommitter(Class<? extends OutputCommitter> theClass) 

Document

Set the OutputCommitter implementation for the map-reduce job.
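
Before the real-world sources below, here is a minimal, self-contained sketch of where the call typically fits in old-API (org.apache.hadoop.mapred) job setup. The class name SetOutputCommitterExample and the identity map/reduce wiring are illustrative assumptions, not taken from any of the projects listed; the relevant line is the setOutputCommitter call, shown here with the stock FileOutputCommitter.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputCommitter;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.IdentityMapper;
import org.apache.hadoop.mapred.lib.IdentityReducer;

// Hypothetical example class, for illustration only.
public class SetOutputCommitterExample {
    public static void main(String[] args) throws Exception {
        JobConf job = new JobConf(SetOutputCommitterExample.class);
        job.setJobName("setOutputCommitter example");

        // A pass-through job: identity map and reduce over text lines.
        job.setInputFormat(TextInputFormat.class);
        job.setOutputFormat(TextOutputFormat.class);
        job.setMapperClass(IdentityMapper.class);
        job.setReducerClass(IdentityReducer.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Register the committer that sets up, commits, or aborts the job's output.
        // FileOutputCommitter is the stock implementation; a custom subclass of
        // org.apache.hadoop.mapred.OutputCommitter could be passed here instead.
        job.setOutputCommitter(FileOutputCommitter.class);

        JobClient.runJob(job);
    }
}

FileOutputCommitter is what FileOutputFormat-based jobs use by default, so the call matters mainly when a custom committer is needed, which is exactly what the examples below do.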

Usage

From source file: edu.umn.cs.spatialHadoop.temporal.RepartitionTemporal.java

License: Apache License

public static void repartitionMapReduce(Path[] inputPaths, Path outputPath, Shape stockShape, long blockSize,
        CellInfo[] cellInfos, String sindex, boolean overwrite) throws IOException {

    JobConf job = new JobConf(Repartition.class);

    job.setJobName("RepartitionTemporal");
    FileSystem outFs = outputPath.getFileSystem(job);

    // Overwrite output file
    if (outFs.exists(outputPath)) {
        if (overwrite)
            outFs.delete(outputPath, true);
        else
            throw new RuntimeException(
                    "Output file '" + outputPath + "' already exists and overwrite flag is not set");
    }

    // Decide which map function to use depending on the type of global
    // index
    if (sindex.equals("rtree") || sindex.equals("str")) {
        // Repartition without replication
        job.setMapperClass(RepartitionMapNoReplication.class);
    } else {
        // Repartition with replication (grid and r+tree)
        job.setMapperClass(RepartitionMap.class);
    }
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(stockShape.getClass());
    CombinedSpatialInputFormat.setInputPaths(job, inputPaths);
    job.setInputFormat(CombinedSpatialInputFormat.class);

    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setNumMapTasks(10 * Math.max(1, clusterStatus.getMaxMapTasks()));

    FileOutputFormat.setOutputPath(job, outputPath);
    if (sindex.equals("grid") || sindex.equals("str") || sindex.equals("str+")) {
        job.setOutputFormat(GridOutputFormat.class);
    } else if (sindex.equals("rtree") || sindex.equals("r+tree")) {
        // For now, the two types of local index are the same
        job.setOutputFormat(RTreeGridOutputFormat.class);
    } else {
        throw new RuntimeException("Unsupported spatial index: " + sindex);
    }

    SpatialSite.setCells(job, cellInfos);
    job.setBoolean(SpatialSite.OVERWRITE, overwrite);

    // Set reduce function
    job.setReducerClass(RepartitionReduce.class);
    job.setNumReduceTasks(
            Math.max(1, Math.min(cellInfos.length, (clusterStatus.getMaxReduceTasks() * 9 + 5) / 10)));

    // Set output committer that combines output files together
    job.setOutputCommitter(RepartitionOutputCommitter.class);

    if (blockSize != 0) {
        job.setLong("dfs.block.size", blockSize);
        job.setLong("fs.local.block.size", blockSize);
    }

    JobClient.runJob(job);
}

From source file: org.apache.blur.hive.BlurHiveStorageHandler.java

License: Apache License

@Override
public void configureJobConf(TableDesc tableDesc, JobConf jobConf) {
    if (BlurSerDe.shouldUseMRWorkingPath(jobConf)) {
        String loadId = UUID.randomUUID().toString();
        jobConf.set(BlurSerDe.BLUR_MR_LOAD_ID, loadId);
        jobConf.setOutputCommitter(BlurHiveMRLoaderOutputCommitter.class);
    } else {
        try {
            String bulkId = UUID.randomUUID().toString();
            String connectionStr = jobConf.get(BlurSerDe.BLUR_CONTROLLER_CONNECTION_STR);
            Iface client = BlurClient.getClient(connectionStr);
            client.bulkMutateStart(bulkId);
            BlurHiveOutputFormat.setBulkId(jobConf, bulkId);
            jobConf.setOutputCommitter(BlurHiveOutputCommitter.class);
        } catch (BlurException e) {
            throw new RuntimeException(e);
        } catch (TException e) {
            throw new RuntimeException(e);
        }
    }
}

From source file: org.apache.hcatalog.hbase.TestHBaseBulkOutputFormat.java

License: Apache License

@Test
public void hbaseBulkOutputFormatTest() throws IOException, ClassNotFoundException, InterruptedException {
    String testName = "hbaseBulkOutputFormatTest";
    Path methodTestDir = new Path(getTestDir(), testName);
    LOG.info("starting: " + testName);

    String tableName = newTableName(testName).toLowerCase();
    String familyName = "my_family";
    byte[] familyNameBytes = Bytes.toBytes(familyName);

    //include hbase config in conf file
    Configuration conf = new Configuration(allConf);

    //create table
    conf.set(HBaseConstants.PROPERTY_OUTPUT_TABLE_NAME_KEY, tableName);
    conf.set("yarn.scheduler.capacity.root.queues", "default");
    conf.set("yarn.scheduler.capacity.root.default.capacity", "100");
    createTable(tableName, new String[] { familyName });

    String data[] = { "1,english:one,spanish:uno", "2,english:two,spanish:dos",
            "3,english:three,spanish:tres" };

    // input/output settings
    Path inputPath = new Path(methodTestDir, "mr_input");
    FSDataOutputStream os = getFileSystem().create(new Path(inputPath, "inputFile.txt"));
    for (String line : data)
        os.write(Bytes.toBytes(line + "\n"));
    os.close();
    Path interPath = new Path(methodTestDir, "inter");
    //create job
    JobConf job = new JobConf(conf);
    job.setWorkingDirectory(new Path(methodTestDir, "mr_work"));
    job.setJarByClass(this.getClass());
    job.setMapperClass(MapWriteOldMapper.class);

    job.setInputFormat(org.apache.hadoop.mapred.TextInputFormat.class);
    org.apache.hadoop.mapred.TextInputFormat.setInputPaths(job, inputPath);

    job.setOutputFormat(HBaseBulkOutputFormat.class);
    org.apache.hadoop.mapred.SequenceFileOutputFormat.setOutputPath(job, interPath);
    job.setOutputCommitter(HBaseBulkOutputCommitter.class);

    //manually create transaction
    RevisionManager rm = HBaseRevisionManagerUtil.getOpenedRevisionManager(conf);
    try {
        OutputJobInfo outputJobInfo = OutputJobInfo.create("default", tableName, null);
        Transaction txn = rm.beginWriteTransaction(tableName, Arrays.asList(familyName));
        outputJobInfo.getProperties().setProperty(HBaseConstants.PROPERTY_WRITE_TXN_KEY,
                HCatUtil.serialize(txn));
        job.set(HCatConstants.HCAT_KEY_OUTPUT_INFO, HCatUtil.serialize(outputJobInfo));
    } finally {
        rm.close();
    }

    job.setMapOutputKeyClass(ImmutableBytesWritable.class);
    job.setMapOutputValueClass(HCatRecord.class);

    job.setOutputKeyClass(ImmutableBytesWritable.class);
    job.setOutputValueClass(HCatRecord.class);

    job.setNumReduceTasks(0);

    RunningJob runJob = JobClient.runJob(job);
    runJob.waitForCompletion();
    assertTrue(runJob.isSuccessful());

    //verify
    HTable table = new HTable(conf, tableName);
    Scan scan = new Scan();
    scan.addFamily(familyNameBytes);
    ResultScanner scanner = table.getScanner(scan);
    int index = 0;
    for (Result result : scanner) {
        String vals[] = data[index].toString().split(",");
        for (int i = 1; i < vals.length; i++) {
            String pair[] = vals[i].split(":");
            assertTrue(result.containsColumn(familyNameBytes, Bytes.toBytes(pair[0])));
            assertEquals(pair[1], Bytes.toString(result.getValue(familyNameBytes, Bytes.toBytes(pair[0]))));
        }
        index++;
    }
    //test if load count is the same
    assertEquals(data.length, index);
    //test if scratch directory was erased
    assertFalse(FileSystem.get(job).exists(interPath));
}

From source file: org.apache.parquet.hadoop.mapred.DeprecatedParquetOutputFormat.java

License: Apache License

public static void setAsOutputFormat(JobConf jobConf) {
    jobConf.setOutputFormat(DeprecatedParquetOutputFormat.class);
    jobConf.setOutputCommitter(MapredParquetOutputCommitter.class);
}

From source file: org.apache.sysml.runtime.matrix.mapred.MRJobConfiguration.java

License: Apache License

public static void setUpMultipleOutputs(JobConf job, byte[] resultIndexes, byte[] resultDimsUnknown,
        String[] outputs, OutputInfo[] outputInfos, boolean inBlockRepresentation, boolean mayContainCtable)
        throws Exception {
    if (resultIndexes.length != outputs.length)
        throw new Exception("number of outputs and result indexes does not match");
    if (outputs.length != outputInfos.length)
        throw new Exception("number of outputs and outputInfos indexes does not match");

    job.set(RESULT_INDEXES_CONFIG, MRJobConfiguration.getIndexesString(resultIndexes));
    job.set(RESULT_DIMS_UNKNOWN_CONFIG, MRJobConfiguration.getIndexesString(resultDimsUnknown));
    job.setStrings(OUTPUT_MATRICES_DIRS_CONFIG, outputs);
    job.setOutputCommitter(MultipleOutputCommitter.class);

    for (int i = 0; i < outputs.length; i++) {
        MapReduceTool.deleteFileIfExistOnHDFS(new Path(outputs[i]), job);
        if (mayContainCtable && resultDimsUnknown[i] == (byte) 1) {
            setOutputInfo(job, i, outputInfos[i], false);
        } else {
            setOutputInfo(job, i, outputInfos[i], inBlockRepresentation);
        }
        MultipleOutputs.addNamedOutput(job, Integer.toString(i), outputInfos[i].outputFormatClass,
                outputInfos[i].outputKeyClass, outputInfos[i].outputValueClass);
    }
    job.setOutputFormat(NullOutputFormat.class);

    // configure temp output
    Path tempOutputPath = new Path(constructTempOutputFilename());
    FileOutputFormat.setOutputPath(job, tempOutputPath);
    MapReduceTool.deleteFileIfExistOnHDFS(tempOutputPath, job);
}

From source file: org.commoncrawl.mapred.ec2.parser.EC2ParserTask.java

License: Open Source License

private static void parse(FileSystem fs, Configuration conf, ImmutableList<Path> paths) throws IOException {
    LOG.info("Need to Parse:" + paths.toString());
    // create output path 
    long segmentId = System.currentTimeMillis();
    Path outputPath = new Path(S3N_BUCKET_PREFIX + SEGMENTS_PATH + Long.toString(segmentId));
    LOG.info("Starting Map-Reduce Job. SegmentId:" + segmentId + " OutputPath:" + outputPath);
    // run job...
    JobConf jobConf = new JobBuilder("parse job", conf)
            .inputs(paths).inputFormat(SequenceFileInputFormat.class).keyValue(Text.class, ParseOutput.class)
            .mapper(ParserMapper.class).maxMapAttempts(3).maxMapTaskFailures(100).speculativeExecution(true)
            .numReducers(0).outputFormat(ParserOutputFormat.class).output(outputPath)
            .minSplitSize(134217728 * 2).build();

    jobConf.set("fs.default.name", S3N_BUCKET_PREFIX);
    jobConf.setOutputCommitter(OutputCommitter.class);

    JobClient.runJob(jobConf);

    LOG.info("Job Finished. Writing Segments Manifest File");
    writeSegmentManifestFile(fs, segmentId, paths);
}

From source file: tachyon.client.keyvalue.hadoop.KeyValueOutputFormat.java

License: Apache License

/**
 * {@inheritDoc}
 * <p>
 * {@link KeyValueOutputCommitter} is forced to be used.
 * <p>
 * NOTE: This method is called immediately when job is submitted, so that modifications to the
 * {@link JobConf} are reflected in the whole job.
 */
@Override
public void checkOutputSpecs(FileSystem ignored, JobConf conf)
        throws FileAlreadyExistsException, InvalidJobConfException, IOException {
    super.checkOutputSpecs(ignored, conf);
    conf.setOutputCommitter(KeyValueOutputCommitter.class);
}

From source file: tachyon.examples.keyvalue.hadoop.CloneKeyValueStoreMapReduce.java

License: Apache License

/**
 * @param args two parameters, the first is the input key-value store path, the second is the
 *    output key-value store path
 * @throws Exception if any exception happens
 */
public static void main(String[] args) throws Exception {
    JobConf conf = new JobConf(CloneKeyValueStoreMapReduce.class);
    conf.setJobName("clone key-value store");

    conf.setOutputKeyClass(BytesWritable.class);
    conf.setOutputValueClass(BytesWritable.class);

    conf.setMapperClass(Map.class);
    conf.setReducerClass(Reduce.class);

    conf.setInputFormat(KeyValueInputFormat.class);
    conf.setOutputFormat(KeyValueOutputFormat.class);
    conf.setOutputCommitter(KeyValueOutputCommitter.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    JobClient.runJob(conf);
}