Example usage for org.apache.hadoop.mapred.JobConf.setOutputFormat

Introduction

This page shows example usages of org.apache.hadoop.mapred.JobConf.setOutputFormat.

Prototype

public void setOutputFormat(Class<? extends OutputFormat> theClass) 

Document

Set the OutputFormat implementation for the map-reduce job.
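Before the project examples below, here is a minimal, self-contained sketch of where setOutputFormat fits in an old-API (org.apache.hadoop.mapred) job: a pass-through job that reads and writes plain text. The class name, job name, and command-line arguments are placeholders, not taken from any example on this page.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.IdentityMapper;
import org.apache.hadoop.mapred.lib.IdentityReducer;

public class SetOutputFormatExample {
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(SetOutputFormatExample.class);
        conf.setJobName("set-output-format-example"); // placeholder job name

        // Identity mapper/reducer simply pass records through.
        conf.setMapperClass(IdentityMapper.class);
        conf.setReducerClass(IdentityReducer.class);

        // TextInputFormat produces LongWritable keys and Text values,
        // and the identity classes preserve those types.
        conf.setOutputKeyClass(LongWritable.class);
        conf.setOutputValueClass(Text.class);

        conf.setInputFormat(TextInputFormat.class);
        // The method this page documents: pick the OutputFormat used to write job output.
        conf.setOutputFormat(TextOutputFormat.class);

        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        JobClient.runJob(conf);
    }
}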

Usage

From source file:Brush.TransitiveReduction.java

License:Apache License

public RunningJob run(String inputPath, String outputPath) throws Exception {
    sLogger.info("Tool name: TransitiveReduction");
    sLogger.info(" - input: " + inputPath);
    sLogger.info(" - output: " + outputPath);

    //JobConf conf = new JobConf(Stats.class);
    JobConf conf = new JobConf(TransitiveReduction.class);
    conf.setJobName("TransitiveReduction " + inputPath);

    BrushConfig.initializeConfiguration(conf);

    FileInputFormat.addInputPath(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(Text.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    //conf.setBoolean("mapred.output.compress", true);

    conf.setMapperClass(TransitiveReductionMapper.class);
    conf.setReducerClass(TransitiveReductionReducer.class);

    //delete the output directory if it exists already
    FileSystem.get(conf).delete(new Path(outputPath), true);

    return JobClient.runJob(conf);
}

From source file:Brush.VerifyOverlap.java

License:Apache License

public RunningJob run(String inputPath, String outputPath) throws Exception {
    sLogger.info("Tool name: VerifyOverlap");
    sLogger.info(" - input: " + inputPath);
    sLogger.info(" - output: " + outputPath);

    //JobConf conf = new JobConf(Stats.class);
    JobConf conf = new JobConf(VerifyOverlap.class);
    conf.setJobName("VerifyOverlap " + inputPath);

    BrushConfig.initializeConfiguration(conf);

    FileInputFormat.addInputPath(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(Text.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    //conf.setBoolean("mapred.output.compress", true);
    //conf.setClass("mapred.output.compression.codec", GzipCodec.class,CompressionCodec.class);

    conf.setMapperClass(VerifyOverlapMapper.class);
    conf.setReducerClass(VerifyOverlapReducer.class);

    //delete the output directory if it exists already
    FileSystem.get(conf).delete(new Path(outputPath), true);

    return JobClient.runJob(conf);
}

From source file:buildtestproject.MyFirstMapReduce.java

public static void main(String[] args) throws Exception {
    //Configuration conf = new Configuration();
    JobConf conf = new JobConf(MyFirstMapReduce.class);
    //Job job = Job.getInstance(conf, "word-count-one");
    conf.setJobName("word-count-one");

    conf.setMapperClass(TokenizerMapper.class);
    conf.setCombinerClass(IntSumReducer.class);
    conf.setReducerClass(IntSumReducer.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    //        job.setJarByClass(MyFirstMapReduce.class);
    //        job.setMapperClass(TokenizerMapper.class);
    //        job.setCombinerClass(IntSumReducer.class);
    //        job.setReducerClass(IntSumReducer.class);
    //        job.setOutputKeyClass(Text.class);
    //        job.setOutputValueClass(IntWritable.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    //        FileInputFormat.addInputPath(job, new Path(args[0]));
    //        FileOutputFormat.setOutputPath(job, new Path(args[1]));

    JobClient.runJob(conf);

    //        System.exit(job.waitForCompletion(true) ? 0 : 1);
}

From source file:Business.DataJoin.java

@Override
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    JobConf job = new JobConf(conf, DataJoin.class);

    final File f = new File(MapReduceOne.class.getProtectionDomain().getCodeSource().getLocation().getPath());
    String inFiles = f.getAbsolutePath().replace("/build/classes", "") + "/src/inFiles/";
    String outFiles = f.getAbsolutePath().replace("/build/classes", "") + "/src/outFiles/OutputOne";
    //use the arguments instead if provided.
    if (args.length > 1) {
        inFiles = args[1];
        outFiles = args[2];
    }
    Path in = new Path(inFiles);
    Path out = new Path(outFiles);
    FileInputFormat.setInputPaths(job, in);
    FileOutputFormat.setOutputPath(job, out);

    job.setJobName("Data Join");
    job.setMapperClass(MapClass.class);
    job.setReducerClass(ReduceClass.class);

    job.setInputFormat(TextInputFormat.class);
    job.setOutputFormat(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(TaggedWritable.class);
    job.set("mapred.textoutputformat.separator", ",");

    JobClient.runJob(job);
    return 0;
}

From source file:ca.etsmtl.lasi.hbasewikipedialoader.HBaseWikipediaLoader.java

License:Apache License

/**
 * Sets up the actual job.
 * 
 * @param conf
 *          The current configuration.
 * @param args
 *          The command line parameters.
 * @return The newly created job.
 * @throws IOException
 *           When setting up the job fails.
 */
public static JobConf createSubmittableJob(HBaseConfiguration conf, String[] args) throws IOException {
    JobConf jobConf = new JobConf(conf, HBaseWikipediaLoader.class);
    jobConf.setJobName(NAME);

    // Stream stuff
    jobConf.set("stream.recordreader.class", "org.apache.hadoop.streaming.StreamXmlRecordReader");
    jobConf.set("stream.recordreader.begin", "<page>");
    jobConf.set("stream.recordreader.end", "</page>");

    jobConf.setSpeculativeExecution(false);

    jobConf.setMapOutputKeyClass(ImmutableBytesWritable.class);
    jobConf.setMapOutputValueClass(BatchUpdate.class);

    jobConf.setMapperClass(Map.class);

    jobConf.setNumReduceTasks(0);

    jobConf.setInputFormat(StreamInputFormat.class);
    jobConf.setOutputFormat(TableOutputFormat.class);
    jobConf.set(TableOutputFormat.OUTPUT_TABLE, TABLE);
    jobConf.setOutputKeyClass(ImmutableBytesWritable.class);
    jobConf.setOutputValueClass(BatchUpdate.class);

    StreamInputFormat.setInputPaths(jobConf, new Path(args[0]));
    FileOutputFormat.setOutputPath(jobConf, new Path("/tmp/" + NAME + "-" + System.currentTimeMillis()));

    return jobConf;

}

From source file:cascading.avro.AvroScheme.java

License:Apache License

/**
 * sinkConfInit is called by cascading to set up the sinks. This happens on the client side before the
 * job is distributed.
 * There is a check for the presence of a schema and an exception is thrown if none has been provided.
 * After the schema check the conf object is given the options that Avro needs.
 *
 * @param flowProcess The cascading FlowProcess object. Should be passed in by cascading automatically.
 * @param tap         The cascading Tap object. Should be passed in by cascading automatically.
 * @param conf        The Hadoop JobConf object. This is passed in by cascading automatically.
 * @throws RuntimeException If no schema is present this halts the entire process.
 */
@Override
public void sinkConfInit(FlowProcess<JobConf> flowProcess, Tap<JobConf, RecordReader, OutputCollector> tap,
        JobConf conf) {

    if (schema == null) {
        throw new RuntimeException("Must provide sink schema");
    }
    // Set the output schema and output format class
    conf.set(AvroJob.OUTPUT_SCHEMA, schema.toString());
    conf.setOutputFormat(AvroOutputFormat.class);

    // add AvroSerialization to io.serializations
    addAvroSerializations(conf);
}

From source file:cascading.flow.hadoop.MapReduceFlowPlatformTest.java

License:Open Source License

@Test
public void testFlow() throws IOException {
    getPlatform().copyFromLocal(inputFileApache);

    JobConf defaultConf = (JobConf) ((BaseHadoopPlatform) getPlatform()).getConfiguration();

    JobConf conf = new JobConf(defaultConf);
    conf.setJobName("mrflow");

    conf.setOutputKeyClass(LongWritable.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(IdentityMapper.class);
    conf.setReducerClass(IdentityReducer.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(inputFileApache));

    String outputPath = getOutputPath("flowTest");
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));

    Flow flow = new MapReduceFlow("mrflow", conf, true);

    validateLength(new Hfs(new TextLine(), inputFileApache).openForRead(new HadoopFlowProcess(defaultConf)),
            10);

    flow.complete();

    validateLength(new Hfs(new TextLine(), outputPath).openForRead(new HadoopFlowProcess(defaultConf)), 10);
}

From source file:cascading.flow.hadoop.MapReduceFlowPlatformTest.java

License:Open Source License

@Test
public void testCascade() throws IOException {
    getPlatform().copyFromLocal(inputFileApache);

    // Setup two standard cascading flows that will generate the input for the first MapReduceFlow
    Tap source1 = new Hfs(new TextLine(new Fields("offset", "line")), remove(inputFileApache, false));
    String sinkPath4 = getOutputPath("flow4");
    Tap sink1 = new Hfs(new TextLine(new Fields("offset", "line")), remove(sinkPath4, true), SinkMode.REPLACE);
    Flow firstFlow = getPlatform().getFlowConnector(getProperties()).connect(source1, sink1,
            new Pipe("first-flow"));

    String sinkPath5 = getOutputPath("flow5");
    Tap sink2 = new Hfs(new TextLine(new Fields("offset", "line")), remove(sinkPath5, true), SinkMode.REPLACE);
    Flow secondFlow = getPlatform().getFlowConnector(getProperties()).connect(sink1, sink2,
            new Pipe("second-flow"));

    JobConf defaultConf = HadoopPlanner.createJobConf(getProperties());

    JobConf firstConf = new JobConf(defaultConf);
    firstConf.setJobName("first-mr");

    firstConf.setOutputKeyClass(LongWritable.class);
    firstConf.setOutputValueClass(Text.class);

    firstConf.setMapperClass(IdentityMapper.class);
    firstConf.setReducerClass(IdentityReducer.class);

    firstConf.setInputFormat(TextInputFormat.class);
    firstConf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(firstConf, new Path(remove(sinkPath5, true)));
    String sinkPath1 = getOutputPath("flow1");
    FileOutputFormat.setOutputPath(firstConf, new Path(remove(sinkPath1, true)));

    Flow firstMR = new MapReduceFlow(firstConf, true);

    JobConf secondConf = new JobConf(defaultConf);
    secondConf.setJobName("second-mr");

    secondConf.setOutputKeyClass(LongWritable.class);
    secondConf.setOutputValueClass(Text.class);

    secondConf.setMapperClass(IdentityMapper.class);
    secondConf.setReducerClass(IdentityReducer.class);

    secondConf.setInputFormat(TextInputFormat.class);
    secondConf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(secondConf, new Path(remove(sinkPath1, true)));
    String sinkPath2 = getOutputPath("flow2");
    FileOutputFormat.setOutputPath(secondConf, new Path(remove(sinkPath2, true)));

    Flow secondMR = new MapReduceFlow(secondConf, true);

    Job job = new Job(defaultConf);
    job.setJobName("third-mr");

    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(org.apache.hadoop.mapreduce.Mapper.class);
    job.setReducerClass(org.apache.hadoop.mapreduce.Reducer.class);

    job.setInputFormatClass(org.apache.hadoop.mapreduce.lib.input.TextInputFormat.class);
    job.setOutputFormatClass(org.apache.hadoop.mapreduce.lib.output.TextOutputFormat.class);
    job.getConfiguration().set("mapred.mapper.new-api", "true");
    job.getConfiguration().set("mapred.reducer.new-api", "true");

    org.apache.hadoop.mapreduce.lib.input.FileInputFormat.addInputPath(job, new Path(remove(sinkPath2, true)));
    String sinkPath3 = getOutputPath("flow3");
    org.apache.hadoop.mapreduce.lib.output.FileOutputFormat.setOutputPath(job,
            new Path(remove(sinkPath3, true)));

    Flow thirdMR = new MapReduceFlow(new JobConf(job.getConfiguration()), true);

    CascadeConnector cascadeConnector = new CascadeConnector();

    // pass out of order
    Cascade cascade = cascadeConnector.connect(firstFlow, secondFlow, thirdMR, firstMR, secondMR);

    cascade.complete();

    validateLength(new Hfs(new TextLine(), sinkPath3).openForRead(new HadoopFlowProcess(defaultConf)), 10);
}

From source file:cascading.flow.MapReduceFlowTest.java

License:Open Source License

public void testFlow() throws IOException {
    if (!new File(inputFileApache).exists())
        fail("data file not found");

    copyFromLocal(inputFileApache);

    JobConf defaultConf = MultiMapReducePlanner.getJobConf(getProperties());

    JobConf conf = new JobConf(defaultConf);
    conf.setJobName("mrflow");

    conf.setOutputKeyClass(LongWritable.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(IdentityMapper.class);
    conf.setReducerClass(IdentityReducer.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(inputFileApache));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath1));

    Flow flow = new MapReduceFlow("mrflow", conf, true);

    validateLength(flow.openSource(), 10);

    flow.complete();

    validateLength(flow.openSink(), 10);
}

From source file:cascading.flow.MapReduceFlowTest.java

License:Open Source License

public void testCascade() throws IOException {
    if (!new File(inputFileApache).exists())
        fail("data file not found");

    copyFromLocal(inputFileApache);

    // Setup two standard cascading flows that will generate the input for the first MapReduceFlow
    Tap source1 = new Hfs(new TextLine(new Fields("offset", "line")), remove(inputFileApache, false));
    Tap sink1 = new Hfs(new TextLine(new Fields("offset", "line")), remove(outputPath4, true), true);
    Flow firstFlow = new FlowConnector(getProperties()).connect(source1, sink1, new Pipe("first-flow"));

    Tap sink2 = new Hfs(new TextLine(new Fields("offset", "line")), remove(outputPath5, true), true);
    Flow secondFlow = new FlowConnector(getProperties()).connect(sink1, sink2, new Pipe("second-flow"));

    JobConf defaultConf = MultiMapReducePlanner.getJobConf(getProperties());

    JobConf firstConf = new JobConf(defaultConf);
    firstConf.setJobName("first-mr");

    firstConf.setOutputKeyClass(LongWritable.class);
    firstConf.setOutputValueClass(Text.class);

    firstConf.setMapperClass(IdentityMapper.class);
    firstConf.setReducerClass(IdentityReducer.class);

    firstConf.setInputFormat(TextInputFormat.class);
    firstConf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(firstConf, new Path(remove(outputPath5, true)));
    FileOutputFormat.setOutputPath(firstConf, new Path(remove(outputPath1, true)));

    Flow firstMR = new MapReduceFlow(firstConf, true);

    JobConf secondConf = new JobConf(defaultConf);
    secondConf.setJobName("second-mr");

    secondConf.setOutputKeyClass(LongWritable.class);
    secondConf.setOutputValueClass(Text.class);

    secondConf.setMapperClass(IdentityMapper.class);
    secondConf.setReducerClass(IdentityReducer.class);

    secondConf.setInputFormat(TextInputFormat.class);
    secondConf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(secondConf, new Path(remove(outputPath1, true)));
    FileOutputFormat.setOutputPath(secondConf, new Path(remove(outputPath2, true)));

    Flow secondMR = new MapReduceFlow(secondConf, true);

    JobConf thirdConf = new JobConf(defaultConf);
    thirdConf.setJobName("third-mr");

    thirdConf.setOutputKeyClass(LongWritable.class);
    thirdConf.setOutputValueClass(Text.class);

    thirdConf.setMapperClass(IdentityMapper.class);
    thirdConf.setReducerClass(IdentityReducer.class);

    thirdConf.setInputFormat(TextInputFormat.class);
    thirdConf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(thirdConf, new Path(remove(outputPath2, true)));
    FileOutputFormat.setOutputPath(thirdConf, new Path(remove(outputPath3, true)));

    Flow thirdMR = new MapReduceFlow(thirdConf, true);

    CascadeConnector cascadeConnector = new CascadeConnector();

    // pass out of order
    Cascade cascade = cascadeConnector.connect(firstFlow, secondFlow, thirdMR, firstMR, secondMR);

    //    cascade.writeDOT( "mrcascade.dot" );

    cascade.complete();

    validateLength(thirdMR.openSink(), 10);
}