Example usage for org.apache.hadoop.mapreduce Job setInputFormatClass

List of usage examples for org.apache.hadoop.mapreduce Job setInputFormatClass

Introduction

In this page you can find the example usage for org.apache.hadoop.mapreduce Job setInputFormatClass.

Prototype

public void setInputFormatClass(Class<? extends InputFormat> cls) throws IllegalStateException 

Source Link

Document

Set the InputFormat for the job.

Usage

From source file:com.talis.hadoop.rdf.merge.IndexMerge.java

License:Apache License

public int run(String[] args) throws Exception {

    Configuration configuration = getConf();

    boolean useCompression = configuration.getBoolean(Constants.OPTION_USE_COMPRESSION,
            Constants.OPTION_USE_COMPRESSION_DEFAULT);
    if (useCompression) {
        configuration.setBoolean("mapred.compress.map.output", true);
        configuration.set("mapred.output.compression.type", "BLOCK");
        configuration.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");
    }//w  ww. j a va2s  . c  om

    boolean overrideOutput = configuration.getBoolean(Constants.OPTION_OVERRIDE_OUTPUT,
            Constants.OPTION_OVERRIDE_OUTPUT_DEFAULT);
    FileSystem fs = FileSystem.get(new Path(args[1]).toUri(), configuration);
    if (overrideOutput) {
        fs.delete(new Path(args[1]), true);
    }

    Job job = new Job(configuration);
    job.setJobName(JOB_NAME);
    job.setJarByClass(getClass());

    Path input = new Path(args[0]);
    Path output = new Path(args[1]);
    FileInputFormat.addInputPath(job, input);
    FileOutputFormat.setOutputPath(job, output);

    job.setMapperClass(Mapper.class);
    job.setReducerClass(IndexMergeReducer.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Text.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setNumReduceTasks(1);

    if (LOG.isDebugEnabled())
        Utils.log(job, LOG);

    return job.waitForCompletion(true) ? 0 : -1;
}

From source file:com.talis.hadoop.rdf.solr.QuadsIndexer.java

License:Apache License

public int run(String[] args) throws Exception {

    Configuration configuration = getConf();

    boolean useCompression = configuration.getBoolean(Constants.OPTION_USE_COMPRESSION,
            Constants.OPTION_USE_COMPRESSION_DEFAULT);
    if (useCompression) {
        configuration.setBoolean("mapred.compress.map.output", true);
        configuration.set("mapred.output.compression.type", "BLOCK");
        configuration.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");
    }//from   w  ww . j  a v a 2  s.c o  m

    boolean overrideOutput = configuration.getBoolean(Constants.OPTION_OVERRIDE_OUTPUT,
            Constants.OPTION_OVERRIDE_OUTPUT_DEFAULT);
    FileSystem outputFs = FileSystem.get(new Path(args[1]).toUri(), configuration);
    if (overrideOutput) {
        outputFs.delete(new Path(args[1]), true);
    }

    Job job = new Job(configuration);
    job.setJobName(JOB_NAME);
    job.setJarByClass(getClass());

    int shards = -1;
    boolean compressOutput = false;

    Path input = new Path(args[0]);
    Path output = new Path(args[1]);
    Path solrConfig = new Path(args[2]);
    FileInputFormat.addInputPath(job, input);
    FileOutputFormat.setOutputPath(job, output);

    if (shards > 0) {
        job.setNumReduceTasks(shards);
    }

    job.setMapperClass(Mapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(QuadArrayWritable.class);

    job.setReducerClass(SolrReducer.class);
    SolrDocumentConverter.setSolrDocumentConverter(LiteralsIndexer.class, job.getConfiguration());

    job.setOutputFormatClass(SolrOutputFormat.class);

    String zipName = "solr.zip";
    FileSystem solrConfigFs = FileSystem.get(solrConfig.toUri(), configuration);
    final URI baseZipUrl = solrConfigFs.getUri().resolve(solrConfig.toString() + '#' + zipName);
    DistributedCache.addCacheArchive(baseZipUrl, job.getConfiguration());
    job.getConfiguration().set(SolrOutputFormat.SETUP_OK, solrConfig.toString());
    SolrOutputFormat.setOutputZipFormat(compressOutput, job.getConfiguration());

    if (LOG.isDebugEnabled())
        Utils.log(job, LOG);

    return job.waitForCompletion(true) ? 0 : -1;
}

From source file:com.talis.mapreduce.lib.input.TestDriver.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.printf("Usage: %s [generic options] <input> <output>\n", getClass().getName());
        ToolRunner.printGenericCommandUsage(System.err);
        return -1;
    }//w  w  w  . j  ava2s  . c o m

    Job job = new Job(getConf(), "test");
    job.setJarByClass(getClass());

    job.setInputFormatClass(NQuadsInputFormat.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(TestMapper.class);
    job.setNumReduceTasks(0);

    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Text.class);

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:com.telefonica.iot.tidoop.apiext.utils.CKANMapReduceExample.java

License:Open Source License

@Override
public int run(String[] args) throws Exception {
    // check the number of arguments, show the usage if it is wrong
    if (args.length != 7) {
        showUsage();//from  www . j  a  va2 s. c om
        return -1;
    } // if

    // get the arguments
    String ckanHost = args[0];
    String ckanPort = args[1];
    boolean sslEnabled = args[2].equals("true");
    String ckanAPIKey = args[3];
    String ckanInputs = args[4];
    String ckanOutput = args[5];
    String splitsLength = args[6];

    // create and configure a MapReduce job
    Configuration conf = this.getConf();
    Job job = Job.getInstance(conf, "CKAN MapReduce test");
    job.setJarByClass(CKANMapReduceExample.class);
    job.setMapperClass(RecordSizeGetter.class);
    job.setCombinerClass(RecordSizeAdder.class);
    job.setReducerClass(RecordSizeAdder.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    job.setInputFormatClass(CKANInputFormat.class);
    CKANInputFormat.setInput(job, ckanInputs);
    CKANInputFormat.setEnvironment(job, ckanHost, ckanPort, sslEnabled, ckanAPIKey);
    CKANInputFormat.setSplitsLength(job, splitsLength);
    job.setOutputFormatClass(CKANOutputFormat.class);
    CKANOutputFormat.setEnvironment(job, ckanHost, ckanPort, sslEnabled, ckanAPIKey);
    CKANOutputFormat.setOutputPkg(job, ckanOutput);

    // run the MapReduce job
    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:com.teradata.benchto.generator.HiveTypesGenerator.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(/*from   w ww.  ja  v a 2s  .  c om*/
            Option.builder("format").required().hasArg().desc("file format (orc, parquet or text)").build());
    options.addOption(Option.builder("type").required().hasArg().desc(
            "hive type to be generated (bigint, int, boolean, double, binary, date, timestamp, string, decimal or varchar)")
            .build());
    options.addOption(Option.builder("rows").required().hasArg().desc("total row count").build());
    options.addOption(Option.builder("mappers").required().hasArg().desc("total mappers count").build());
    options.addOption(Option.builder("path").hasArg()
            .desc("base path for generating files, default is: /benchmarks/benchto/types").build());
    options.addOption(Option.builder("regex").numberOfArgs(3)
            .desc("generate varchars from regex pattern, arguments are: pattern, min length, max length")
            .build());

    CommandLine line;
    String format;
    String hiveType;
    long numberOfRows;
    long numberOfFiles;
    String basePath;
    Optional<String> regexPattern = Optional.absent();
    Optional<Integer> regexMinLength = Optional.absent();
    Optional<Integer> regexMaxLength = Optional.absent();
    try {
        line = new DefaultParser().parse(options, args);
        format = line.getOptionValue("format");
        hiveType = line.getOptionValue("type");
        numberOfRows = parseLong(line.getOptionValue("rows"));
        numberOfFiles = parseLong(line.getOptionValue("mappers"));
        basePath = line.getOptionValue("path", "/benchmarks/benchto/types");
        if (line.hasOption("regex")) {
            String[] values = line.getOptionValues("regex");
            regexPattern = Optional.of(values[0]);
            regexMinLength = Optional.of(parseInt(values[1]));
            regexMaxLength = Optional.of(parseInt(values[2]));
        }
    } catch (Exception e) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp("benchto-generator", options);
        throw e;
    }

    String jobName = format("GenerateData-%s-%s-%d", format, hiveType, numberOfRows);
    Path outputDir = new Path(format("%s/%s-%s/%d", basePath, format, hiveType, numberOfRows));
    Class<? extends OutputFormat> outputFormatClass = getOutputFormatClass(format);

    LOG.info("Generating " + numberOfRows + " " + hiveType + "s, directory: " + outputDir
            + ", number of files: " + numberOfFiles);

    Configuration configuration = new Configuration();
    configuration.set(FORMAT_PROPERTY_NAME, format);
    configuration.set(HIVE_TYPE_PROPERTY_NAME, hiveType);
    configuration.setLong(NUM_ROWS_PROPERTY_NAME, numberOfRows);
    configuration.setLong(NUM_MAPS, numberOfFiles);
    if (regexPattern.isPresent()) {
        configuration.set(REGEX_PATTERN, regexPattern.get());
        configuration.setInt(REGEX_MIN_LENGTH, regexMinLength.get());
        configuration.setInt(REGEX_MAX_LENGTH, regexMaxLength.get());
    }

    Job generatorJob = Job.getInstance(configuration, jobName);
    FileOutputFormat.setOutputPath(generatorJob, outputDir);
    ParquetOutputFormat.setWriteSupportClass(generatorJob, DataWritableWriteSupport.class);
    generatorJob.setJarByClass(HiveTypesGenerator.class);
    generatorJob.setMapperClass(HiveTypesMapper.class);
    generatorJob.setNumReduceTasks(0);
    generatorJob.setOutputKeyClass(NullWritable.class);
    generatorJob.setOutputValueClass(Writable.class);
    generatorJob.setInputFormatClass(CounterInputFormat.class);
    generatorJob.setOutputFormatClass(outputFormatClass);

    return generatorJob.waitForCompletion(true) ? 0 : 1;
}

From source file:com.teradata.compaction.mapreduce.MergeParquetFilesMR.java

License:Apache License

public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = new Job(conf, "MergeParquet");

    if (args.length != 2) {
        System.err.println("Usage: java -jar MergeParquetFilesMR path_to_input_folder path_to_output_folder ");
        System.exit(0);//from   ww w  . j  a  v a  2  s . co  m
    }

    final Path inputPath = new Path(args[0]);
    final Path out = new Path(args[1]);

    Schema schemaParquetFile = getBaseSchema(inputPath, conf);
    job.setJarByClass(MergeParquetFilesMR.class);
    job.setMapperClass(SampleParquetMapper.class);
    job.setReducerClass(SampleParquetReducer.class);
    job.setInputFormatClass(AvroParquetInputFormat.class);
    job.setOutputFormatClass(AvroParquetOutputFormat.class);
    job.setMapOutputKeyClass(NullWritable.class);

    AvroJob.setMapOutputValueSchema(job, schemaParquetFile);
    AvroParquetOutputFormat.setSchema(job, schemaParquetFile);
    FileInputFormat.addInputPath(job, inputPath);
    AvroParquetOutputFormat.setOutputPath(job, out);
    job.setNumReduceTasks(1);
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}

From source file:com.tfm.utad.reducerdata.ReducerDataPig.java

public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {

    SimpleDateFormat sdf = new SimpleDateFormat("YYYY-MM-dd-HH-mm-ss");
    Date date = new Date();

    Path inputPath = new Path("/home/jab/camus/reducer-data-pig");
    Path outputDir = new Path("/home/jab/camus/pigdata/" + sdf.format(date));

    // Create configuration
    Configuration conf = new Configuration(true);
    conf.set(FS_DEFAULT_FS, HDFS_LOCALHOST_LOCALDOMAIN);
    FileSystem fs = FileSystem.get(conf);
    Path filesPath = new Path(inputPath + "/*");
    FileStatus[] files = fs.globStatus(filesPath);

    // Create job
    Job job = new Job(conf, "ReducerDataPig");
    job.setJarByClass(ReducerDataPig.class);

    // Setup MapReduce
    job.setMapperClass(ReducerDataPigMapper.class);
    job.setReducerClass(ReducerDataPigReducer.class);
    job.setNumReduceTasks(1);//from  ww w  .j  a v a2s  .  c om

    // Specify key / value
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(ReducerPigKey.class);

    // Input
    FileInputFormat.addInputPath(job, inputPath);
    job.setInputFormatClass(SequenceFileInputFormat.class);

    // Output
    FileOutputFormat.setOutputPath(job, outputDir);
    job.setOutputFormatClass(TextOutputFormat.class);

    // Delete output if exists
    if (fs.exists(outputDir)) {
        fs.delete(outputDir, true);
    }

    // Execute job
    int code = job.waitForCompletion(true) ? 0 : 1;
    if (code == 0) {
        Counters counters = job.getCounters();
        Counter malformedCounter = counters.findCounter(ReducerDataEnum.MALFORMED_DATA);
        LOG.info("Counter malformed data: " + malformedCounter.getValue());
        for (FileStatus fStatus : files) {
            LOG.info("File name:" + fStatus.getPath());
            if (fStatus.isFile()) {
                LOG.info("Removing file in path:" + fStatus.getPath());
                fs.delete(fStatus.getPath(), false);
            }
        }
    }
}

From source file:com.tfm.utad.reducerdata.ReducerDataVertica.java

public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {

    SimpleDateFormat sdf = new SimpleDateFormat("YYYY-MM-dd-HH-mm-ss");
    Date date = new Date();

    Path inputPath = new Path("/home/jab/camus/reducer-data-vertica");
    Path outputDir = new Path("/home/jab/camus/verticadb/" + sdf.format(date));

    // Create configuration
    Configuration conf = new Configuration(true);
    conf.set(FS_DEFAULT_FS, HDFS_LOCALHOST_LOCALDOMAIN);
    FileSystem fs = FileSystem.get(conf);
    Path filesPath = new Path(inputPath + "/*");
    FileStatus[] files = fs.globStatus(filesPath);

    // Create job
    Job job = new Job(conf, "ReducerDataVertica");
    job.setJarByClass(ReducerDataVertica.class);

    // Setup MapReduce
    job.setMapperClass(ReducerDataVerticaMapper.class);
    job.setReducerClass(ReducerDataVerticaReducer.class);
    job.setNumReduceTasks(1);/*ww w.  j  a v a  2  s .  c  o m*/

    // Specify key / value
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(ReducerVerticaValue.class);

    // Input
    FileInputFormat.addInputPath(job, inputPath);
    job.setInputFormatClass(SequenceFileInputFormat.class);

    // Output
    FileOutputFormat.setOutputPath(job, outputDir);
    job.setOutputFormatClass(TextOutputFormat.class);

    // Delete output if exists
    if (fs.exists(outputDir)) {
        fs.delete(outputDir, true);
    }

    // Execute job
    int code = job.waitForCompletion(true) ? 0 : 1;
    if (code == 0) {
        Counters counters = job.getCounters();
        Counter malformedCounter = counters.findCounter(ReducerDataEnum.MALFORMED_DATA);
        LOG.info("Counter malformed data: " + malformedCounter.getValue());
        for (FileStatus fStatus : files) {
            LOG.info("File name:" + fStatus.getPath());
            if (fStatus.isFile()) {
                LOG.info("Removing file in path:" + fStatus.getPath());
                fs.delete(fStatus.getPath(), false);
            }
        }
    }
}

From source file:com.tomslabs.grid.avro.AvroWordCount.java

License:Apache License

public static Job createSubmitableJob(final Configuration conf, final Path inputPath, final Path outputPath)
        throws IOException {

    conf.set(AvroFileOutputFormat.OUTPUT_SCHEMA, WordCountSchema.getSchema().toString());

    conf.setInt("mapred.max.split.size", 1024000);
    conf.setInt("mapred.reduce.tasks", 10);
    conf.setBoolean("mapred.reduce.tasks.speculative.execution", true);
    final Job job = new Job(conf, "Word Count");
    job.setJarByClass(AvroWordCount.class);

    job.setInputFormatClass(AvroFileInputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setMapperClass(WordCountMapper.class);

    job.setReducerClass(WordCountReducer.class);

    job.setOutputKeyClass(GenericRecord.class);
    job.setOutputValueClass(NullWritable.class);
    job.setOutputFormatClass(AvroFileOutputFormat.class);
    AvroFileOutputFormat.setDeflateLevel(job, 3);

    FileInputFormat.addInputPath(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    return job;/*from  w w  w  .j a  v a  2 s  . com*/
}

From source file:com.toshiba.mwcloud.gs.hadoop.mapreduce.examples.GSWordCount.java

License:Apache License

/**
 * <div lang="ja">// w w w  . j a v  a  2s.  co  m
 * WordCount?MapReduce???
 * @param args 
 * @return ???0????????1
 * @throws Exception ??????
 * </div><div lang="en">
 * Run a MapReduce job of WordCount.
 * @param args command argument
 * @return 0 for normal termination of the job and 1 otherwise
 * @throws Exception processing failed.
 * </div>
 */
public int run(String[] args) throws Exception {
    GSConf gsConf = new GSConf();
    gsConf.parseArg(args);

    Configuration conf = getConf();
    gsConf.setup(conf);

    Job job = Job.getInstance(conf, APP_NAME);
    job.setJarByClass(GSWordCount.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(GSRowWritable.class);

    job.setMapperClass(Map.class);
    job.setReducerClass(Reduce.class);

    job.setInputFormatClass(GSRowInputFormat.class);
    job.setOutputFormatClass(GSRowOutputFormat.class);

    int res = job.waitForCompletion(true) ? 0 : 1;

    if (res == 0) {
        printResult(gsConf);
    }

    return res;
}