Example usage for org.apache.hadoop.mapred JobConf setInputFormat

List of usage examples for org.apache.hadoop.mapred JobConf setInputFormat

Introduction

On this page you can find example usage for org.apache.hadoop.mapred JobConf setInputFormat.

Prototype

public void setInputFormat(Class<? extends InputFormat> theClass) 

Document

Set the InputFormat implementation for the map-reduce job.
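
For orientation, here is a minimal, self-contained driver sketch built around this call. It uses the old mapred API; the class name MinimalDriver and the job name are hypothetical, and the job relies on the framework's default identity mapper and reducer:

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;

public class MinimalDriver {
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(MinimalDriver.class);
        conf.setJobName("set-input-format-demo");

        // The call this page documents: tell the job how to split and read
        // its input. TextInputFormat yields <byte offset, line> records.
        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);

        // With the default identity mapper and reducer, the job emits the
        // TextInputFormat record types unchanged.
        conf.setOutputKeyClass(LongWritable.class);
        conf.setOutputValueClass(Text.class);

        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        JobClient.runJob(conf);
    }
}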

Usage

From source file:com.chriscx.matching.Driver.java

public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf(), com.chriscx.mapred.Driver.class);
    conf.setJobName("Matching");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(Map.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    List<String> other_args = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        if ("-skip".equals(args[i])) {
            DistributedCache.addCacheFile(new Path(args[++i]).toUri(), conf);
            conf.setBoolean("wordcount.skip.patterns", true);
        } else {
            other_args.add(args[i]);
        }
    }

    FileInputFormat.setInputPaths(conf, new Path(other_args.get(0)));
    FileOutputFormat.setOutputPath(conf, new Path(other_args.get(1)));

    JobClient.runJob(conf);
    return 0;
}

From source file:com.cloudera.avro.AvroWordCount.java

License:Apache License

public int run(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.println("Usage: AvroWordCount <input path> <output path>");
        return -1;
    }

    JobConf conf = new JobConf(AvroWordCount.class);
    conf.setJobName("wordcount");

    // We call setOutputSchema first so we can override the configuration
    // parameters it sets
    AvroJob.setOutputSchema(conf, Pair.getPairSchema(Schema.create(Type.STRING), Schema.create(Type.INT)));

    conf.setMapperClass(Map.class);
    conf.setReducerClass(Reduce.class);

    conf.setInputFormat(TextInputFormat.class);

    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(IntWritable.class);
    conf.setOutputKeyComparatorClass(Text.Comparator.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    JobClient.runJob(conf);
    return 0;
}

From source file:com.cloudera.hbase.OldWordCount.java

License:Open Source License

public static void main(String[] args) throws Exception {
    JobConf conf = new JobConf(WordCount.class);
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();

    if (otherArgs.length != 2) {
        System.err.println("Usage: wordcount <in> <out>");
        System.exit(2);
    }
    conf.setJobName("word count");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(Map.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(conf, new Path(otherArgs[1]));
    JobClient.runJob(conf);
}

From source file:com.cloudera.recordservice.avro.AvroJob.java

License:Apache License

public static void setInputFormat(org.apache.hadoop.mapred.JobConf job,
        Class<? extends org.apache.hadoop.mapred.InputFormat> c) {
    if (job.getBoolean(USE_RECORD_SERVICE_INPUT_FORMAT_CONF_KEY, false)) {
        if (c.getName().equals(org.apache.avro.mapred.AvroInputFormat.class.getName())) {
            c = com.cloudera.recordservice.avro.mapred.AvroInputFormat.class;
        } else {
            throw new RuntimeException("Class '" + c.getName() + "' is not supported "
                    + "by the RecordService. Use AvroInputFormat or disable RecordService.");
        }
    }
    LOG.debug("Using input format: " + c.getName());
    job.setInputFormat(c);
}
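
A hypothetical caller of this wrapper might look as follows; it assumes the USE_RECORD_SERVICE_INPUT_FORMAT_CONF_KEY constant is visible to the caller, and MyAvroDriver is a placeholder class:

JobConf job = new JobConf(MyAvroDriver.class); // MyAvroDriver is a placeholder
// With the flag set, the wrapper substitutes the RecordService-backed
// AvroInputFormat before calling job.setInputFormat.
job.setBoolean(USE_RECORD_SERVICE_INPUT_FORMAT_CONF_KEY, true);
AvroJob.setInputFormat(job, org.apache.avro.mapred.AvroInputFormat.class);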

From source file:com.cloudera.recordservice.examples.mapreduce.WordCount.java

License:Apache License

public void run(String[] args) throws Exception {
    boolean useRecordService = true;
    if (args.length == 3) {
        useRecordService = Boolean.parseBoolean(args[2]);
    } else if (args.length != 2) {
        System.err.println("Usage: WordCount <input path> <output path>");
        System.exit(-1);
    }
    String input = args[0].trim();
    String output = args[1];

    JobConf conf = new JobConf(WordCount.class);
    conf.setJobName("wordcount-" + (useRecordService ? "with" : "without") + "-RecordService");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(Map.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

    if (useRecordService) {
        conf.setInputFormat(com.cloudera.recordservice.mapred.TextInputFormat.class);
        RecordServiceConfig.setInput(conf, input);
    } else {
        conf.setInputFormat(TextInputFormat.class);
        FileInputFormat.setInputPaths(conf, new Path(input));
    }

    FileSystem fs = FileSystem.get(conf);
    Path outputPath = new Path(output);
    if (fs.exists(outputPath))
        fs.delete(outputPath, true);
    conf.setOutputFormat(TextOutputFormat.class);
    FileOutputFormat.setOutputPath(conf, outputPath);

    JobClient.runJob(conf);
    System.out.println("Done");
}

From source file:com.cloudera.recordservice.hive.RecordServiceHiveInputFormat.java

License:Apache License

private void addSplitsForGroup(List<Path> dirs, TableScanOperator tableScan, JobConf conf,
        InputFormat inputFormat, Class<? extends InputFormat> inputFormatClass, int splits, TableDesc table,
        List<InputSplit> result) throws IOException {
    Utilities.copyTableJobPropertiesToConf(table, conf);

    // The table and database name to scan.
    // TODO: This is commented out until we have a pluggable way to configure the
    // SerDe. Until then, create a separate table and set the job conf properties.
    // String fqTblName[] = table.getTableName().split("\\.");
    // conf.set("recordservice.table.name", table.getTableName());

    if (tableScan != null) {
        pushFilters(conf, tableScan);
        // Set the projected column and table info for the RecordServiceRecordReader.
        conf.set("recordservice.col.names", Joiner.on(",").join(tableScan.getNeededColumns()));
    }
    // Unset the file config; we're only reading from the table.
    conf.unset(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR);
    conf.setInputFormat(inputFormat.getClass());

    // Generate the input splits and log the path of any file-based splits.
    InputSplit[] iss = inputFormat.getSplits(conf, splits);
    for (InputSplit is : iss) {
        if (is instanceof FileSplit) {
            FileSplit fileSplit = (FileSplit) is;
            LOG.info("INPUT SPLIT: " + fileSplit.getPath().toString());
        }
    }

    // Wrap the InputSplits in HiveInputSplits. We use a modified version of the
    // HiveInputSplit to work around some issues with the base one.
    // TODO: Get changes incorporated into Hive.
    for (InputSplit is : iss) {
        result.add(new HiveInputSplitShim(dirs.get(0), is, inputFormatClass.getName()));
    }
}

From source file:com.cloudera.recordservice.mapreduce.testapps.RecordCount.java

License:Apache License

public static long countRecords(String path) throws IOException {
    String output = TestUtil.getTempDirectory();
    Path inputPath = new Path(path);
    Path outputPath = new Path(output);

    JobConf conf = new JobConf(RecordCount.class);
    conf.setJobName("recordcount");

    conf.setOutputKeyClass(NullWritable.class);
    conf.setOutputValueClass(LongWritable.class);

    conf.setInt("mapreduce.job.reduces", 1);
    conf.setMapperClass(Map.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

    conf.setInputFormat(com.cloudera.recordservice.mapred.TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, outputPath);

    JobClient.runJob(conf);

    // Read the result and return it. Since we set the number of reducers to 1,
    // there is always just one file containing the value.
    FileSystem fs = outputPath.getFileSystem(conf);
    FSDataInputStream resultStream = fs.open(new Path(output + "/part-00000"));
    byte[] bytes = new byte[16];
    int length = resultStream.read(bytes);
    String result = new String(bytes, 0, length).trim();
    return Long.parseLong(result);
}

From source file:com.cloudera.recordservice.tests.TestMiniClusterController.java

License:Apache License

public static void fillInWordCountMRJobConf(JobConf conf) {
    String input = "select n_comment from tpch.nation";

    conf.setJobName("samplejob-wordcount");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(Map.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

    conf.setInputFormat(com.cloudera.recordservice.mapred.TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);
    com.cloudera.recordservice.mr.RecordServiceConfig.setInputQuery(conf, input);
    setRandomOutputDir(conf);
}
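
A test would presumably construct a fresh JobConf, hand it to this helper, and then submit it; a minimal sketch (the driver class passed to JobConf is assumed):

JobConf conf = new JobConf(TestMiniClusterController.class); // class name assumed
fillInWordCountMRJobConf(conf);
JobClient.runJob(conf);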

From source file:com.csiro.hadoop.UFORecord.java

@Override
public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf(), getClass());
    conf.setJobName("UFO count");

    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: avro UFO counter <in> <out>");
        System.exit(2);
    }

    org.apache.hadoop.mapred.FileInputFormat.addInputPath(conf, new Path(otherArgs[0]));
    Path outputPath = new Path(otherArgs[1]);
    org.apache.hadoop.mapred.FileOutputFormat.setOutputPath(conf, outputPath);
    outputPath.getFileSystem(conf).delete(outputPath, true);
    Schema input_schema = Schema.parse(getClass().getResourceAsStream("ufo.avsc"));
    AvroJob.setInputSchema(conf, input_schema);
    AvroJob.setMapOutputSchema(conf,
            Pair.getPairSchema(Schema.create(Schema.Type.STRING), Schema.create(Schema.Type.LONG)));

    AvroJob.setOutputSchema(conf, OUTPUT_SCHEMA);
    AvroJob.setMapperClass(conf, AvroRecordMapper.class);
    AvroJob.setReducerClass(conf, AvroRecordReducer.class);
    conf.setInputFormat(AvroInputFormat.class);
    JobClient.runJob(conf);

    return 0;
}

From source file:com.datascience.cascading.scheme.CsvScheme.java

License:Apache License

@Override
public void sourceConfInit(FlowProcess<JobConf> flowProcess, Tap<JobConf, RecordReader, OutputCollector> tap,
        JobConf conf) {
    conf.setInputFormat(CsvInputFormat.class);
    configureReaderFormat(format, conf);
}