List of usage examples for org.apache.hadoop.mapred JobConf setOutputFormat
public void setOutputFormat(Class<? extends OutputFormat> theClass)
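setOutputFormat belongs to the old mapred API: it registers the OutputFormat class that writes the job's results, and is typically set alongside the output key/value classes and an output path. A minimal sketch of that call pattern (the driver class name and output path below are hypothetical placeholders, not taken from the examples that follow):

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextOutputFormat;

public class SetOutputFormatSketch {
    public static JobConf configure() {
        // hypothetical driver class, used only to locate the job jar
        JobConf conf = new JobConf(SetOutputFormatSketch.class);
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(IntWritable.class);
        // write results as plain "key<TAB>value" text lines
        conf.setOutputFormat(TextOutputFormat.class);
        // hypothetical output directory
        FileOutputFormat.setOutputPath(conf, new Path("/tmp/example-out"));
        return conf;
    }
}

The examples below show the same call with TextOutputFormat, SequenceFileOutputFormat, and custom formats such as CsvOutputFormat and EsOutputFormat.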
From source file:com.cloudera.recordservice.tests.TestMiniClusterController.java
License:Apache License
public static void fillInWordCountMRJobConf(JobConf conf) {
    String input = "select n_comment from tpch.nation";

    conf.setJobName("samplejob-wordcount");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(Map.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

    conf.setInputFormat(com.cloudera.recordservice.mapred.TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    com.cloudera.recordservice.mr.RecordServiceConfig.setInputQuery(conf, input);
    setRandomOutputDir(conf);
}
From source file:com.datascience.cascading.scheme.CsvScheme.java
License:Apache License
@Override
public void sinkConfInit(FlowProcess<JobConf> flowProcess, Tap<JobConf, RecordReader, OutputCollector> tap,
        JobConf conf) {
    conf.setOutputKeyClass(LongWritable.class);
    conf.setOutputValueClass(ListWritable.class);
    conf.setOutputFormat(CsvOutputFormat.class);
    configureWriterFormat(format, conf);
}
From source file:com.datatorrent.demos.mroperator.LogCountsPerHour.java
License:Open Source License
public int run(String[] args) throws Exception {
    // Create a configuration
    Configuration conf = getConf();

    // Create a job from the default configuration that will use the LogCountsPerHour class
    JobConf job = new JobConf(conf, LogCountsPerHour.class);

    // Define our input path as the first command line argument and our output path as the second
    Path in = new Path(args[0]);
    Path out = new Path(args[1]);

    // Create File Input/Output formats for these paths (in the job)
    FileInputFormat.setInputPaths(job, in);
    FileOutputFormat.setOutputPath(job, out);

    // Configure the job: name, mapper, reducer, and combiner
    job.setJobName("LogAveragePerHour");
    job.setMapperClass(LogMapClass.class);
    job.setReducerClass(LogReduce.class);
    job.setCombinerClass(LogReduce.class);

    // Configure the output
    job.setOutputFormat(TextOutputFormat.class);
    job.setOutputKeyClass(DateWritable.class);
    job.setOutputValueClass(IntWritable.class);

    // Run the job
    JobClient.runJob(job);
    return 0;
}
From source file:com.datatorrent.demos.mroperator.WordCount.java
License:Open Source License
public void run(String[] args) throws Exception {
    JobConf conf = new JobConf(this.getClass());
    conf.setJobName("wordcount");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(Map.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    JobClient.runJob(conf);
}
From source file:com.digitalpebble.behemoth.ClassifierJob.java
License:Apache License
public int run(String[] args) throws Exception {
    Options options = new Options();
    // automatically generate the help statement
    HelpFormatter formatter = new HelpFormatter();
    // create the parser
    CommandLineParser parser = new GnuParser();

    options.addOption("h", "help", false, "print this message");
    options.addOption("i", "input", true, "input Behemoth corpus");
    options.addOption("o", "output", true, "output Behemoth corpus");
    options.addOption("m", "model", true, "location of the model");

    // parse the command line arguments
    CommandLine line = null;
    try {
        line = parser.parse(options, args);
        String input = line.getOptionValue("i");
        String output = line.getOptionValue("o");
        String model = line.getOptionValue("m");
        if (line.hasOption("help")) {
            formatter.printHelp("ClassifierJob", options);
            return 0;
        }
        if (model == null || input == null || output == null) {
            formatter.printHelp("ClassifierJob", options);
            return -1;
        }
    } catch (ParseException e) {
        formatter.printHelp("ClassifierJob", options);
        // without this return, line would be null when dereferenced below
        return -1;
    }

    final FileSystem fs = FileSystem.get(getConf());

    Path inputPath = new Path(line.getOptionValue("i"));
    Path outputPath = new Path(line.getOptionValue("o"));
    String modelPath = line.getOptionValue("m");

    JobConf job = new JobConf(getConf());
    // push the model file to the DistributedCache
    DistributedCache.addCacheArchive(new URI(modelPath), job);

    job.setJarByClass(this.getClass());
    job.setJobName("ClassifierJob : " + inputPath.toString());

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(BehemothDocument.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(BehemothDocument.class);

    job.setMapperClass(TextClassifierMapper.class);
    job.setNumReduceTasks(0);

    FileInputFormat.addInputPath(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    job.set(modelNameParam, modelPath);

    try {
        JobClient.runJob(job);
    } catch (Exception e) {
        e.printStackTrace();
    }
    return 0;
}
From source file:com.digitalpebble.behemoth.commoncrawl.CorpusMerger.java
License:Apache License
public int run(String[] args) throws Exception {
    Options options = new Options();
    // automatically generate the help statement
    HelpFormatter formatter = new HelpFormatter();
    // create the parser
    CommandLineParser parser = new GnuParser();

    options.addOption("h", "help", false, "print this message");
    options.addOption("i", "input", true, "input Behemoth corpus");
    options.addOption("o", "output", true, "output Behemoth corpus");

    // parse the command line arguments
    CommandLine line = null;
    try {
        line = parser.parse(options, args);
        String input = line.getOptionValue("i");
        if (line.hasOption("help")) {
            formatter.printHelp("CorpusMerger", options);
            return 0;
        }
        if (input == null) {
            formatter.printHelp("CorpusMerger", options);
            return -1;
        }
    } catch (ParseException e) {
        formatter.printHelp("CorpusMerger", options);
        return -1;
    }

    Path outputPath = new Path(line.getOptionValue("o"));
    String[] paths = line.getOptionValues("i");

    JobConf job = new JobConf(getConf());
    // MUST not forget the line below
    job.setJarByClass(this.getClass());

    job.setJobName("CorpusMerger");

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(BehemothDocument.class);
    // job.setMapperClass(IdentityMapper.class);
    job.setReducerClass(MergerReducer.class);

    for (String in : paths)
        FileInputFormat.addInputPath(job, new Path(in));

    FileOutputFormat.setOutputPath(job, outputPath);

    try {
        long start = System.currentTimeMillis();
        JobClient.runJob(job);
        long finish = System.currentTimeMillis();
        if (LOG.isInfoEnabled()) {
            LOG.info("CorpusMerger completed. Timing: " + (finish - start) + " ms");
        }
    } catch (Exception e) {
        LOG.error("Exception caught", e);
        // fs.delete(outputPath, true);
    }
    return 0;
}
From source file:com.digitalpebble.behemoth.es.ESIndexerJob.java
License:Apache License
public int run(String[] args) throws Exception {
    if (args.length != 1) {
        String syntax = "com.digitalpebble.behemoth.ESIndexerJob input";
        System.err.println(syntax);
        return -1;
    }

    Path inputPath = new Path(args[0]);

    JobConf job = new JobConf(getConf());
    job.setJarByClass(this.getClass());

    job.setJobName("Indexing " + inputPath + " into ElasticSearch");

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapOutputValueClass(MapWritable.class);
    job.setMapperClass(BehemothToESMapper.class);

    // disable speculative execution when writing to ES
    job.setSpeculativeExecution(false);
    // job.set("es.resource", "radio/artists"); // index used for storing data

    // use dedicated output format
    job.setOutputFormat(EsOutputFormat.class);

    FileInputFormat.addInputPath(job, inputPath);

    // no reducer : send straight to elasticsearch at end of mapping
    job.setNumReduceTasks(0);

    try {
        long start = System.currentTimeMillis();
        JobClient.runJob(job);
        long finish = System.currentTimeMillis();
        if (LOG.isInfoEnabled()) {
            LOG.info("ESIndexerJob completed. Timing: " + (finish - start) + " ms");
        }
    } catch (Exception e) {
        LOG.error("Exception while running job", e);
        return -1;
    }
    return 0;
}
From source file:com.digitalpebble.behemoth.gate.GATEDriver.java
License:Apache License
public int run(String[] args) throws Exception {
    final FileSystem fs = FileSystem.get(getConf());

    if (args.length < 3 || args.length > 4) {
        String syntax = "com.digitalpebble.behemoth.gate.GATEDriver in out path_gate_file [-XML]";
        System.err.println(syntax);
        return -1;
    }

    boolean dumpGATEXML = false;
    for (String arg : args) {
        if (arg.equalsIgnoreCase("-xml"))
            dumpGATEXML = true;
    }

    Path inputPath = new Path(args[0]);
    Path outputPath = new Path(args[1]);
    String zip_application_path = args[2];

    // check that the GATE application has been stored on HDFS
    Path zap = new Path(zip_application_path);
    if (fs.exists(zap) == false) {
        System.err.println("The GATE application " + zip_application_path + " can't be found on HDFS - aborting");
        return -1;
    }

    JobConf job = new JobConf(getConf());
    // MUST not forget the line below
    job.setJarByClass(this.getClass());

    job.setJobName("Processing " + args[0] + " with GATE application from " + zip_application_path);

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    job.setOutputKeyClass(Text.class);
    if (dumpGATEXML) {
        job.setOutputValueClass(Text.class);
        job.setMapperClass(GATEXMLMapper.class);
    } else {
        job.setOutputValueClass(BehemothDocument.class);
        job.setMapperClass(GATEMapper.class);
    }

    // detect if any filters have been defined
    // and activate the reducer accordingly
    boolean isFilterRequired = BehemothReducer.isRequired(job);
    if (isFilterRequired)
        job.setReducerClass(BehemothReducer.class);
    else {
        job.setNumReduceTasks(0);
    }

    FileInputFormat.addInputPath(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    // push the zipped_gate_application onto the DistributedCache
    DistributedCache.addCacheArchive(new URI(zip_application_path), job);

    job.set("gate.application.path", zip_application_path);

    try {
        long start = System.currentTimeMillis();
        JobClient.runJob(job);
        long finish = System.currentTimeMillis();
        if (LOG.isInfoEnabled()) {
            LOG.info("GATEDriver completed. Timing: " + (finish - start) + " ms");
        }
    } catch (Exception e) {
        LOG.error("Exception caught", e);
        // leave even partial output
        // fs.delete(outputPath, true);
    }
    return 0;
}
From source file:com.digitalpebble.behemoth.io.nutch.NutchSegmentConverterJob.java
License:Apache License
public void convert(List<Path> list, Path output) throws IOException {
    JobConf job = new JobConf(getConf());
    job.setJobName("Converting Nutch segments");
    job.setJarByClass(this.getClass());

    for (Path p : list) {
        FileInputFormat.addInputPath(job, new Path(p, Content.DIR_NAME));
    }

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(NutchSegmentConverterJob.class);
    // no reducers
    job.setNumReduceTasks(0);

    FileOutputFormat.setOutputPath(job, output);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(BehemothDocument.class);

    long start = System.currentTimeMillis();
    JobClient.runJob(job);
    long finish = System.currentTimeMillis();
    if (LOG.isInfoEnabled()) {
        LOG.info("NutchSegmentConverter completed. Timing: " + (finish - start) + " ms");
    }
}
From source file:com.digitalpebble.behemoth.io.warc.WARCConverterJob.java
License:Apache License
public void convert(Path warcpath, Path output) throws IOException {
    JobConf job = new JobConf(getConf());
    job.setJobName("Convert WARC " + warcpath);
    job.setJarByClass(this.getClass());

    FileInputFormat.addInputPath(job, warcpath);
    job.setInputFormat(WarcFileInputFormat.class);

    job.setMapperClass(WARCConverterJob.class);
    // no reducers
    job.setNumReduceTasks(0);

    FileOutputFormat.setOutputPath(job, output);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(BehemothDocument.class);

    long start = System.currentTimeMillis();
    JobClient.runJob(job);
    long finish = System.currentTimeMillis();
    if (LOG.isInfoEnabled()) {
        LOG.info("WARCConverterJob completed. Timing: " + (finish - start) + " ms");
    }
}