Example usage for org.apache.hadoop.mapred JobConf setOutputFormat

List of usage examples for org.apache.hadoop.mapred JobConf setOutputFormat

Introduction

On this page you can find example usage for org.apache.hadoop.mapred JobConf setOutputFormat.

Prototype

public void setOutputFormat(Class<? extends OutputFormat> theClass) 

Document

Set the OutputFormat implementation for the map-reduce job.
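
Before the collected usage examples, here is a minimal, self-contained sketch (not taken from the sources below; the class name and input/output paths are illustrative) showing where setOutputFormat fits in a JobConf-based driver:

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.IdentityMapper;
import org.apache.hadoop.mapred.lib.IdentityReducer;

public class SetOutputFormatExample {
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(SetOutputFormatExample.class);
        conf.setJobName("setoutputformat-example");

        // Identity mapper/reducer: TextInputFormat produces <LongWritable, Text>
        // pairs, which pass through unchanged to the output.
        conf.setMapperClass(IdentityMapper.class);
        conf.setReducerClass(IdentityReducer.class);
        conf.setOutputKeyClass(LongWritable.class);
        conf.setOutputValueClass(Text.class);

        conf.setInputFormat(TextInputFormat.class);
        // The method documented on this page: choose the OutputFormat
        // implementation that writes the job's output records.
        conf.setOutputFormat(TextOutputFormat.class);

        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        JobClient.runJob(conf);
    }
}

If no OutputFormat is set, the job falls back to TextOutputFormat; the examples below show NullOutputFormat, SequenceFileOutputFormat, TeraOutputFormat, AvroOutputFormat and TextOutputFormat being plugged in the same way.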

Usage

From source file:com.benchmark.mapred.SleepJob.java

License:Apache License

public JobConf setupJobConf(int numMapper, int numReducer, long mapSleepTime, int mapSleepCount,
        long reduceSleepTime, int reduceSleepCount) {
    JobConf job = new JobConf(getConf(), SleepJob.class);
    job.setNumMapTasks(numMapper);
    job.setNumReduceTasks(numReducer);
    job.setMapperClass(SleepJob.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(NullWritable.class);
    job.setReducerClass(SleepJob.class);
    job.setOutputFormat(NullOutputFormat.class);
    job.setInputFormat(SleepInputFormat.class);
    job.setPartitionerClass(SleepJob.class);
    job.setSpeculativeExecution(false);
    job.setJobName("Sleep job");
    FileInputFormat.addInputPath(job, new Path("ignored"));
    job.setLong("sleep.job.map.sleep.time", mapSleepTime);
    job.setLong("sleep.job.reduce.sleep.time", reduceSleepTime);
    job.setInt("sleep.job.map.sleep.count", mapSleepCount);
    job.setInt("sleep.job.reduce.sleep.count", reduceSleepCount);
    return job;
}

From source file:com.benchmark.mapred.Sort.java

License:Apache License

/**
 * The main driver for the sort program.
 * Invoke this method to submit the map/reduce job.
 * @throws IOException When there are communication problems with the
 *                     job tracker.
 */
public int run(String[] args) throws Exception {

    JobConf jobConf = new JobConf(getConf(), Sort.class);
    jobConf.setJobName("sorter");

    jobConf.setMapperClass(IdentityMapper.class);
    jobConf.setReducerClass(IdentityReducer.class);

    JobClient client = new JobClient(jobConf);
    ClusterStatus cluster = client.getClusterStatus();
    int num_reduces = (int) (cluster.getMaxReduceTasks() * 0.9);
    String sort_reduces = jobConf.get("test.sort.reduces_per_host");
    if (sort_reduces != null) {
        num_reduces = cluster.getTaskTrackers() * Integer.parseInt(sort_reduces);
    }
    Class<? extends InputFormat> inputFormatClass = SequenceFileInputFormat.class;
    Class<? extends OutputFormat> outputFormatClass = SequenceFileOutputFormat.class;
    Class<? extends WritableComparable> outputKeyClass = BytesWritable.class;
    Class<? extends Writable> outputValueClass = BytesWritable.class;
    List<String> otherArgs = new ArrayList<String>();
    InputSampler.Sampler<K, V> sampler = null;
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-m".equals(args[i])) {
                jobConf.setNumMapTasks(Integer.parseInt(args[++i]));
            } else if ("-r".equals(args[i])) {
                num_reduces = Integer.parseInt(args[++i]);
            } else if ("-inFormat".equals(args[i])) {
                inputFormatClass = Class.forName(args[++i]).asSubclass(InputFormat.class);
            } else if ("-outFormat".equals(args[i])) {
                outputFormatClass = Class.forName(args[++i]).asSubclass(OutputFormat.class);
            } else if ("-outKey".equals(args[i])) {
                outputKeyClass = Class.forName(args[++i]).asSubclass(WritableComparable.class);
            } else if ("-outValue".equals(args[i])) {
                outputValueClass = Class.forName(args[++i]).asSubclass(Writable.class);
            } else if ("-totalOrder".equals(args[i])) {
                double pcnt = Double.parseDouble(args[++i]);
                int numSamples = Integer.parseInt(args[++i]);
                int maxSplits = Integer.parseInt(args[++i]);
                if (0 >= maxSplits)
                    maxSplits = Integer.MAX_VALUE;
                sampler = new InputSampler.RandomSampler<K, V>(pcnt, numSamples, maxSplits);
            } else {
                otherArgs.add(args[i]);
            }
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage(); // exits
        }
    }

    // Set user-supplied (possibly default) job configs
    jobConf.setNumReduceTasks(num_reduces);

    jobConf.setInputFormat(inputFormatClass);
    jobConf.setOutputFormat(outputFormatClass);

    jobConf.setOutputKeyClass(outputKeyClass);
    jobConf.setOutputValueClass(outputValueClass);

    // Make sure there are exactly 2 parameters left.
    if (otherArgs.size() != 2) {
        System.out.println("ERROR: Wrong number of parameters: " + otherArgs.size() + " instead of 2.");
        return printUsage();
    }
    FileInputFormat.setInputPaths(jobConf, otherArgs.get(0));
    FileOutputFormat.setOutputPath(jobConf, new Path(otherArgs.get(1)));

    if (sampler != null) {
        System.out.println("Sampling input to effect total-order sort...");
        jobConf.setPartitionerClass(TotalOrderPartitioner.class);
        Path inputDir = FileInputFormat.getInputPaths(jobConf)[0];
        inputDir = inputDir.makeQualified(inputDir.getFileSystem(jobConf));
        Path partitionFile = new Path(inputDir, "_sortPartitioning");
        TotalOrderPartitioner.setPartitionFile(jobConf, partitionFile);
        InputSampler.<K, V>writePartitionFile(jobConf, sampler);
        URI partitionUri = new URI(partitionFile.toString() + "#" + "_sortPartitioning");
        DistributedCache.addCacheFile(partitionUri, jobConf);
        DistributedCache.createSymlink(jobConf);
    }

    System.out.println("Running on " + cluster.getTaskTrackers() + " nodes to sort from "
            + FileInputFormat.getInputPaths(jobConf)[0] + " into " + FileOutputFormat.getOutputPath(jobConf)
            + " with " + num_reduces + " reduces.");
    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    jobResult = JobClient.runJob(jobConf);
    Date end_time = new Date();
    System.out.println("Job ended: " + end_time);
    System.out.println("The job took " + (end_time.getTime() - startTime.getTime()) / 1000 + " seconds.");
    return 0;
}

From source file:com.benchmark.mapred.terasort.TeraGen.java

License:Apache License

/**
 * @param args the cli arguments
 */
public int run(String[] args) throws IOException {
    JobConf job = (JobConf) getConf();
    setNumberOfRows(job, Long.parseLong(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    job.setJobName("TeraGen");
    job.setJarByClass(TeraGen.class);
    job.setMapperClass(SortGenMapper.class);
    job.setNumReduceTasks(0);
    job.setNumMapTasks(50);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setInputFormat(RangeInputFormat.class);
    job.setOutputFormat(TeraOutputFormat.class);
    JobClient.runJob(job);
    return 0;
}

From source file:com.benchmark.mapred.terasort.TeraSort.java

License:Apache License

public int run(String[] args) throws Exception {
    LOG.info("starting");
    JobConf job = (JobConf) getConf();
    Path inputDir = new Path(args[0]);
    if (args.length != 3) {
        System.out.println("ERROR: Wrong number of parameters: " + args.length + " instead of 3.");
    }
    inputDir = inputDir.makeQualified(inputDir.getFileSystem(job));
    Path partitionFile = new Path(inputDir, TeraInputFormat.PARTITION_FILENAME);
    URI partitionUri = new URI(partitionFile.toString() + "#" + TeraInputFormat.PARTITION_FILENAME);
    TeraInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    job.setJobName("TeraSort");
    job.setJarByClass(TeraSort.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setInputFormat(TeraInputFormat.class);
    job.setOutputFormat(TeraOutputFormat.class);
    job.setPartitionerClass(TotalOrderPartitioner.class);
    job.setNumReduceTasks(Integer.parseInt(args[2]));
    TeraInputFormat.writePartitionFile(job, partitionFile);
    DistributedCache.addCacheFile(partitionUri, job);
    DistributedCache.createSymlink(job);
    job.setInt("dfs.replication", 1);
    TeraOutputFormat.setFinalSync(job, true);
    Date startIteration = new Date();
    JobClient.runJob(job);
    Date endIteration = new Date();
    System.out.println(
            "The iteration took " + (endIteration.getTime() - startIteration.getTime()) / 1000 + " seconds.");
    LOG.info("done");
    return 0;
}

From source file:com.bixolabs.cascading.avro.AvroScheme.java

License:Apache License

@SuppressWarnings({ "deprecation" })
@Override
public void sinkInit(Tap tap, JobConf conf) {
    conf.set(AvroJob.OUTPUT_SCHEMA, getSchema().toString());
    conf.setOutputFormat(AvroOutputFormat.class);

    // Since we're outputting to Avro, we need to set up output values.
    // TODO KKr - why don't we need to set the OutputValueClass?
    // TODO KKr - why do we need to set the OutputKeyComparatorClass?
    conf.setOutputKeyClass(NullWritable.class);
    conf.setOutputValueClass(AvroWrapper.class);
    conf.setOutputKeyComparatorClass(AvroKeyComparator.class);
    //        conf.setMapOutputKeyClass(AvroKey.class);
    //        conf.setMapOutputValueClass(AvroValue.class);

    // add AvroSerialization to io.serializations
    //        Collection<String> serializations = conf.getStringCollection("io.serializations");
    //        if (!serializations.contains(AvroSerialization.class.getName())) {
    //            serializations.add(AvroSerialization.class.getName());
    //            conf.setStrings("io.serializations", serializations.toArray(new String[0]));
    //        }

    // Class<? extends Mapper> mapClass = conf.getMapperClass();
    // Class<? extends Reducer> reduceClass = conf.getReducerClass();
    // AvroJob.setOutputSchema(conf, getSchema());
    // conf.setMapperClass(mapClass);
    // conf.setReducerClass(reduceClass);

    LOGGER.info(String.format("Initializing Avro scheme for sink tap - scheme fields: %s", _schemeFields));
}

From source file:com.chriscx.mapred.Driver.java

public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf(), Driver.class);
    conf.setJobName("wordcount");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(Map.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    List<String> other_args = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        if ("-skip".equals(args[i])) {
            DistributedCache.addCacheFile(new Path(args[++i]).toUri(), conf);
            conf.setBoolean("wordcount.skip.patterns", true);
        } else {
            other_args.add(args[i]);
        }
    }

    FileInputFormat.setInputPaths(conf, new Path(other_args.get(0)));
    FileOutputFormat.setOutputPath(conf, new Path(other_args.get(1)));

    JobClient.runJob(conf);
    return 0;
}

From source file:com.chriscx.matching.Driver.java

public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf(), com.chriscx.mapred.Driver.class);
    conf.setJobName("Matching");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(Map.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    List<String> other_args = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        if ("-skip".equals(args[i])) {
            DistributedCache.addCacheFile(new Path(args[++i]).toUri(), conf);
            conf.setBoolean("wordcount.skip.patterns", true);
        } else {
            other_args.add(args[i]);
        }
    }

    FileInputFormat.setInputPaths(conf, new Path(other_args.get(0)));
    FileOutputFormat.setOutputPath(conf, new Path(other_args.get(1)));

    JobClient.runJob(conf);
    return 0;
}

From source file:com.cloudera.hbase.OldWordCount.java

License:Open Source License

public static void main(String[] args) throws Exception {
    JobConf conf = new JobConf(WordCount.class);
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();

    if (otherArgs.length != 2) {
        System.err.println("Usage: wordcount <in> <out>");
        System.exit(2);
    }
    conf.setJobName("word count");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(Map.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(conf, new Path(otherArgs[1]));
    JobClient.runJob(conf);
    return;
}

From source file:com.cloudera.recordservice.examples.mapreduce.WordCount.java

License:Apache License

public void run(String[] args) throws Exception {
    boolean useRecordService = true;
    if (args.length == 3) {
        useRecordService = Boolean.parseBoolean(args[2]);
    } else if (args.length != 2) {
        System.err.println("Usage: WordCount <input path> <output path>");
        System.exit(-1);
    }
    String input = args[0].trim();
    String output = args[1];

    JobConf conf = new JobConf(WordCount.class);
    conf.setJobName("wordcount-" + (useRecordService ? "with" : "without") + "-RecordService");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(Map.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

    if (useRecordService) {
        conf.setInputFormat(com.cloudera.recordservice.mapred.TextInputFormat.class);
        RecordServiceConfig.setInput(conf, input);
    } else {
        conf.setInputFormat(TextInputFormat.class);
        FileInputFormat.setInputPaths(conf, new Path(input));
    }

    FileSystem fs = FileSystem.get(conf);
    Path outputPath = new Path(output);
    if (fs.exists(outputPath))
        fs.delete(outputPath, true);
    conf.setOutputFormat(TextOutputFormat.class);
    FileOutputFormat.setOutputPath(conf, outputPath);

    JobClient.runJob(conf);
    System.out.println("Done");
}

From source file:com.cloudera.recordservice.mapreduce.testapps.RecordCount.java

License:Apache License

public static long countRecords(String path) throws IOException {
    String output = TestUtil.getTempDirectory();
    Path inputPath = new Path(path);
    Path outputPath = new Path(output);

    JobConf conf = new JobConf(RecordCount.class);
    conf.setJobName("recordcount");

    conf.setOutputKeyClass(NullWritable.class);
    conf.setOutputValueClass(LongWritable.class);

    conf.setInt("mapreduce.job.reduces", 1);
    conf.setMapperClass(Map.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

    conf.setInputFormat(com.cloudera.recordservice.mapred.TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, outputPath);

    JobClient.runJob(conf);

    // Read the result and return it. Since we set the number of reducers to 1,
    // there is always just one file containing the value.
    FileSystem fs = outputPath.getFileSystem(conf);
    FSDataInputStream resultStream = fs.open(new Path(output + "/part-00000"));
    byte[] bytes = new byte[16];
    int length = resultStream.read(bytes);
    String result = new String(bytes, 0, length).trim();
    return Long.parseLong(result);
}