Example usage for org.apache.hadoop.mapreduce Job setInputFormatClass

Introduction

On this page you can find example usages of the setInputFormatClass method of org.apache.hadoop.mapreduce.Job.

Prototype

public void setInputFormatClass(Class<? extends InputFormat> cls) throws IllegalStateException

Source Link

Document

Set the InputFormat for the job.

Usage

From source file:com.talis.hadoop.rdf.merge.IndexMerge.java

License:Apache License

/**
 * Configures and runs the index-merge MapReduce job.
 *
 * <p>The job reads text input from {@code args[0]} and writes text output to {@code args[1]},
 * using {@code Mapper} as the mapper class, {@code IndexMergeReducer} as the reducer and a
 * single reduce task. Map-output compression and deletion of a pre-existing output path are
 * controlled by the {@code Constants.OPTION_USE_COMPRESSION} and
 * {@code Constants.OPTION_OVERRIDE_OUTPUT} configuration options.
 *
 * @param args {@code args[0]} is the input path, {@code args[1]} is the output path
 * @return 0 if the job completed successfully, -1 if it failed or the arguments are missing
 * @throws Exception if the job cannot be configured, submitted or monitored
 */
public int run(String[] args) throws Exception {
    // Report bad usage instead of failing with an ArrayIndexOutOfBoundsException below.
    if (args == null || args.length < 2) {
        System.err.printf("Usage: %s [generic options] <input> <output>%n", getClass().getName());
        return -1;
    }

    Configuration configuration = getConf();

    boolean useCompression = configuration.getBoolean(Constants.OPTION_USE_COMPRESSION,
            Constants.OPTION_USE_COMPRESSION_DEFAULT);
    if (useCompression) {
        configuration.setBoolean("mapred.compress.map.output", true);
        configuration.set("mapred.output.compression.type", "BLOCK");
        configuration.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");
    }

    // Build each path once and reuse it for both the clean-up and the job set-up.
    Path input = new Path(args[0]);
    Path output = new Path(args[1]);

    boolean overrideOutput = configuration.getBoolean(Constants.OPTION_OVERRIDE_OUTPUT,
            Constants.OPTION_OVERRIDE_OUTPUT_DEFAULT);
    if (overrideOutput) {
        // The file system is only needed when the old output actually has to be removed.
        FileSystem fs = FileSystem.get(output.toUri(), configuration);
        fs.delete(output, true);
    }

    Job job = new Job(configuration);
    job.setJobName(JOB_NAME);
    job.setJarByClass(getClass());

    FileInputFormat.addInputPath(job, input);
    FileOutputFormat.setOutputPath(job, output);

    job.setMapperClass(Mapper.class);
    job.setReducerClass(IndexMergeReducer.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Text.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setNumReduceTasks(1);

    if (LOG.isDebugEnabled()) {
        Utils.log(job, LOG);
    }

    return job.waitForCompletion(true) ? 0 : -1;
}

From source file:com.talis.hadoop.rdf.solr.QuadsIndexer.java

License:Apache License

/**
 * Configures and runs a MapReduce job that reads sequence-file input and writes its
 * output through {@code SolrOutputFormat}.
 *
 * <p>The Solr configuration at {@code args[2]} is registered in the distributed cache under
 * the name {@code solr.zip}. Map-output compression and deletion of a pre-existing output
 * path are controlled by the {@code Constants.OPTION_USE_COMPRESSION} and
 * {@code Constants.OPTION_OVERRIDE_OUTPUT} configuration options.
 *
 * @param args {@code args[0]} is the input path, {@code args[1]} is the output path,
 *             {@code args[2]} is the path of the Solr configuration
 * @return 0 if the job completed successfully, -1 if it failed or the arguments are missing
 * @throws Exception if the job cannot be configured, submitted or monitored
 */
public int run(String[] args) throws Exception {
    // Validate up front: previously a missing third argument was only noticed after the
    // existing output had already been deleted.
    if (args == null || args.length < 3) {
        System.err.printf("Usage: %s [generic options] <input> <output> <solr-config>%n",
                getClass().getName());
        return -1;
    }

    Configuration configuration = getConf();

    boolean useCompression = configuration.getBoolean(Constants.OPTION_USE_COMPRESSION,
            Constants.OPTION_USE_COMPRESSION_DEFAULT);
    if (useCompression) {
        configuration.setBoolean("mapred.compress.map.output", true);
        configuration.set("mapred.output.compression.type", "BLOCK");
        configuration.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");
    }

    // Build each path once and reuse it for both the clean-up and the job set-up.
    Path input = new Path(args[0]);
    Path output = new Path(args[1]);
    Path solrConfig = new Path(args[2]);

    boolean overrideOutput = configuration.getBoolean(Constants.OPTION_OVERRIDE_OUTPUT,
            Constants.OPTION_OVERRIDE_OUTPUT_DEFAULT);
    if (overrideOutput) {
        FileSystem outputFs = FileSystem.get(output.toUri(), configuration);
        outputFs.delete(output, true);
    }

    Job job = new Job(configuration);
    job.setJobName(JOB_NAME);
    job.setJarByClass(getClass());

    FileInputFormat.addInputPath(job, input);
    FileOutputFormat.setOutputPath(job, output);

    // The number of reduce tasks is intentionally not set here; the former
    // "shards" override was hard-wired to -1 and therefore never applied.
    job.setMapperClass(Mapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(QuadArrayWritable.class);

    job.setReducerClass(SolrReducer.class);
    SolrDocumentConverter.setSolrDocumentConverter(LiteralsIndexer.class, job.getConfiguration());

    job.setOutputFormatClass(SolrOutputFormat.class);

    // Ship the Solr configuration to the tasks as a cache archive named "solr.zip".
    String zipName = "solr.zip";
    FileSystem solrConfigFs = FileSystem.get(solrConfig.toUri(), configuration);
    final URI baseZipUrl = solrConfigFs.getUri().resolve(solrConfig.toString() + '#' + zipName);
    DistributedCache.addCacheArchive(baseZipUrl, job.getConfiguration());
    job.getConfiguration().set(SolrOutputFormat.SETUP_OK, solrConfig.toString());

    final boolean compressOutput = false;
    SolrOutputFormat.setOutputZipFormat(compressOutput, job.getConfiguration());

    if (LOG.isDebugEnabled()) {
        Utils.log(job, LOG);
    }

    return job.waitForCompletion(true) ? 0 : -1;
}

From source file:com.talis.mapreduce.lib.input.TestDriver.java

License:Apache License

/**
 * Runs a map-only job named "test" that reads its input through {@code NQuadsInputFormat}
 * and maps it with {@code TestMapper}.
 *
 * @param args exactly two entries: the input path followed by the output path
 * @return -1 on wrong usage, 0 if the job succeeded, 1 if it failed
 * @throws Exception if the job cannot be configured, submitted or monitored
 */
@Override
public int run(String[] args) throws Exception {
    final boolean usageOk = args.length == 2;
    if (!usageOk) {
        System.err.printf("Usage: %s [generic options] <input> <output>\n", getClass().getName());
        ToolRunner.printGenericCommandUsage(System.err);
        return -1;
    }

    final Job testJob = new Job(getConf(), "test");
    testJob.setJarByClass(getClass());

    // Input side.
    testJob.setInputFormatClass(NQuadsInputFormat.class);
    final Path inputPath = new Path(args[0]);
    FileInputFormat.addInputPath(testJob, inputPath);

    // Output side.
    final Path outputPath = new Path(args[1]);
    FileOutputFormat.setOutputPath(testJob, outputPath);

    // Map-only: no reducers are scheduled.
    testJob.setMapperClass(TestMapper.class);
    testJob.setNumReduceTasks(0);

    testJob.setOutputKeyClass(LongWritable.class);
    testJob.setOutputValueClass(Text.class);

    final boolean succeeded = testJob.waitForCompletion(true);
    if (succeeded) {
        return 0;
    }
    return 1;
}

From source file:com.telefonica.iot.tidoop.apiext.utils.CKANMapReduceExample.java

License:Open Source License

/**
 * Configures and runs the "CKAN MapReduce test" job, which reads through
 * {@code CKANInputFormat} and writes through {@code CKANOutputFormat}.
 *
 * @param args exactly seven entries, in order: CKAN host, CKAN port, SSL flag
 *             (enabled only when the value equals {@code "true"}), CKAN API key,
 *             CKAN inputs, CKAN output and splits length
 * @return -1 on wrong usage, 0 if the job succeeded, 1 if it failed
 * @throws Exception if the job cannot be configured, submitted or monitored
 */
@Override
public int run(String[] args) throws Exception {
    // Wrong argument count: print the usage and bail out.
    if (args.length != 7) {
        showUsage();
        return -1;
    }

    // Unpack the positional arguments.
    final String host = args[0];
    final String port = args[1];
    final boolean useSsl = args[2].equals("true");
    final String apiKey = args[3];
    final String inputs = args[4];
    final String outputPkg = args[5];
    final String splitLength = args[6];

    // Build the job.
    final Job ckanJob = Job.getInstance(this.getConf(), "CKAN MapReduce test");
    ckanJob.setJarByClass(CKANMapReduceExample.class);

    // Processing classes and their output types.
    ckanJob.setMapperClass(RecordSizeGetter.class);
    ckanJob.setCombinerClass(RecordSizeAdder.class);
    ckanJob.setReducerClass(RecordSizeAdder.class);
    ckanJob.setOutputKeyClass(Text.class);
    ckanJob.setOutputValueClass(IntWritable.class);

    // CKAN as the data source.
    ckanJob.setInputFormatClass(CKANInputFormat.class);
    CKANInputFormat.setInput(ckanJob, inputs);
    CKANInputFormat.setEnvironment(ckanJob, host, port, useSsl, apiKey);
    CKANInputFormat.setSplitsLength(ckanJob, splitLength);

    // CKAN as the data sink.
    ckanJob.setOutputFormatClass(CKANOutputFormat.class);
    CKANOutputFormat.setEnvironment(ckanJob, host, port, useSsl, apiKey);
    CKANOutputFormat.setOutputPkg(ckanJob, outputPkg);

    // Block until the job finishes and translate the outcome to an exit code.
    final boolean succeeded = ckanJob.waitForCompletion(true);
    if (succeeded) {
        return 0;
    }
    return 1;
}

From source file:com.teradata.benchto.generator.HiveTypesGenerator.java

License:Apache License

/**
 * Parses the generator's command-line options and runs a map-only job that writes
 * generated data of one Hive type.
 *
 * <p>Required options are {@code format}, {@code type}, {@code rows} and {@code mappers};
 * {@code path} and {@code regex} (pattern, min length, max length) are optional. The output
 * goes to {@code <path>/<format>-<type>/<rows>}.
 *
 * @param args the command-line arguments to parse
 * @return 0 if the job succeeded, 1 if it failed
 * @throws Exception if option parsing fails (help is printed first) or the job cannot be run
 */
@Override
public int run(String[] args) throws Exception {
    // Declare the supported command-line options.
    Options cliOptions = new Options();
    cliOptions.addOption(Option.builder("format")
            .required()
            .hasArg()
            .desc("file format (orc, parquet or text)")
            .build());
    cliOptions.addOption(Option.builder("type")
            .required()
            .hasArg()
            .desc("hive type to be generated (bigint, int, boolean, double, binary, date, timestamp, string, decimal or varchar)")
            .build());
    cliOptions.addOption(Option.builder("rows")
            .required()
            .hasArg()
            .desc("total row count")
            .build());
    cliOptions.addOption(Option.builder("mappers")
            .required()
            .hasArg()
            .desc("total mappers count")
            .build());
    cliOptions.addOption(Option.builder("path")
            .hasArg()
            .desc("base path for generating files, default is: /benchmarks/benchto/types")
            .build());
    cliOptions.addOption(Option.builder("regex")
            .numberOfArgs(3)
            .desc("generate varchars from regex pattern, arguments are: pattern, min length, max length")
            .build());

    // Parsed values; all are assigned inside the try block or the method throws.
    CommandLine commandLine;
    String fileFormat;
    String hiveType;
    long rowCount;
    long mapperCount;
    String baseDirectory;
    Optional<String> pattern = Optional.absent();
    Optional<Integer> minLength = Optional.absent();
    Optional<Integer> maxLength = Optional.absent();
    try {
        commandLine = new DefaultParser().parse(cliOptions, args);
        fileFormat = commandLine.getOptionValue("format");
        hiveType = commandLine.getOptionValue("type");
        rowCount = parseLong(commandLine.getOptionValue("rows"));
        mapperCount = parseLong(commandLine.getOptionValue("mappers"));
        baseDirectory = commandLine.getOptionValue("path", "/benchmarks/benchto/types");
        if (commandLine.hasOption("regex")) {
            String[] regexArgs = commandLine.getOptionValues("regex");
            pattern = Optional.of(regexArgs[0]);
            minLength = Optional.of(parseInt(regexArgs[1]));
            maxLength = Optional.of(parseInt(regexArgs[2]));
        }
    } catch (Exception e) {
        // Show the help text, then let the caller see the original failure.
        new HelpFormatter().printHelp("benchto-generator", cliOptions);
        throw e;
    }

    String jobName = format("GenerateData-%s-%s-%d", fileFormat, hiveType, rowCount);
    String outputLocation = format("%s/%s-%s/%d", baseDirectory, fileFormat, hiveType, rowCount);
    Path outputDir = new Path(outputLocation);
    Class<? extends OutputFormat> outputFormatClass = getOutputFormatClass(fileFormat);

    LOG.info("Generating " + rowCount + " " + hiveType + "s, directory: " + outputDir
            + ", number of files: " + mapperCount);

    // Hand the parsed settings to the tasks through the job configuration.
    Configuration jobConf = new Configuration();
    jobConf.set(FORMAT_PROPERTY_NAME, fileFormat);
    jobConf.set(HIVE_TYPE_PROPERTY_NAME, hiveType);
    jobConf.setLong(NUM_ROWS_PROPERTY_NAME, rowCount);
    jobConf.setLong(NUM_MAPS, mapperCount);
    if (pattern.isPresent()) {
        jobConf.set(REGEX_PATTERN, pattern.get());
        jobConf.setInt(REGEX_MIN_LENGTH, minLength.get());
        jobConf.setInt(REGEX_MAX_LENGTH, maxLength.get());
    }

    Job job = Job.getInstance(jobConf, jobName);
    FileOutputFormat.setOutputPath(job, outputDir);
    ParquetOutputFormat.setWriteSupportClass(job, DataWritableWriteSupport.class);
    job.setJarByClass(HiveTypesGenerator.class);
    job.setInputFormatClass(CounterInputFormat.class);
    job.setMapperClass(HiveTypesMapper.class);
    job.setNumReduceTasks(0);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Writable.class);
    job.setOutputFormatClass(outputFormatClass);

    boolean succeeded = job.waitForCompletion(true);
    if (succeeded) {
        return 0;
    }
    return 1;
}

From source file:com.teradata.compaction.mapreduce.MergeParquetFilesMR.java

License:Apache License

/**
 * Entry point: merges the Parquet files under an input folder into an output folder
 * using a single-reducer MapReduce job.
 *
 * <p>The process exits with status 0 if the job succeeded, 1 if it failed and 2 if the
 * command line is invalid.
 *
 * @param args exactly two entries: the input folder followed by the output folder
 * @throws Exception if the schema cannot be read or the job cannot be run
 */
public static void main(String[] args) throws Exception {
    // Check usage before doing any work, and signal the error with a non-zero status
    // (the previous exit code 0 made a usage error look like success to callers).
    if (args.length != 2) {
        System.err.println("Usage: java -jar MergeParquetFilesMR path_to_input_folder path_to_output_folder ");
        System.exit(2);
    }

    Configuration conf = new Configuration();
    Job job = new Job(conf, "MergeParquet");

    final Path inputPath = new Path(args[0]);
    final Path out = new Path(args[1]);

    // The schema obtained from the input is used for both the map output and the final output.
    Schema schemaParquetFile = getBaseSchema(inputPath, conf);
    job.setJarByClass(MergeParquetFilesMR.class);
    job.setMapperClass(SampleParquetMapper.class);
    job.setReducerClass(SampleParquetReducer.class);
    job.setInputFormatClass(AvroParquetInputFormat.class);
    job.setOutputFormatClass(AvroParquetOutputFormat.class);
    job.setMapOutputKeyClass(NullWritable.class);

    AvroJob.setMapOutputValueSchema(job, schemaParquetFile);
    AvroParquetOutputFormat.setSchema(job, schemaParquetFile);
    FileInputFormat.addInputPath(job, inputPath);
    AvroParquetOutputFormat.setOutputPath(job, out);
    job.setNumReduceTasks(1);
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}

From source file:com.tfm.utad.reducerdata.ReducerDataPig.java

/**
 * Entry point: runs the "ReducerDataPig" job over a fixed input directory and writes text
 * output to a timestamped directory. When the job succeeds, the malformed-data counter is
 * logged and the input files that were present before the job started are deleted.
 *
 * @param args ignored
 * @throws IOException if a file-system operation or the job submission fails
 * @throws InterruptedException if waiting for the job is interrupted
 * @throws ClassNotFoundException if a job class cannot be found
 */
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {

    // "yyyy" is the calendar year; the previous "YYYY" is the week-based year, which yields
    // the wrong year for dates in the first or last week of a year.
    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd-HH-mm-ss");
    Date date = new Date();

    Path inputPath = new Path("/home/jab/camus/reducer-data-pig");
    Path outputDir = new Path("/home/jab/camus/pigdata/" + sdf.format(date));

    // Create configuration
    Configuration conf = new Configuration(true);
    conf.set(FS_DEFAULT_FS, HDFS_LOCALHOST_LOCALDOMAIN);
    FileSystem fs = FileSystem.get(conf);

    // Snapshot the input files now so that only files seen before the run are removed later.
    Path filesPath = new Path(inputPath + "/*");
    FileStatus[] files = fs.globStatus(filesPath);

    // Create job
    Job job = new Job(conf, "ReducerDataPig");
    job.setJarByClass(ReducerDataPig.class);

    // Setup MapReduce
    job.setMapperClass(ReducerDataPigMapper.class);
    job.setReducerClass(ReducerDataPigReducer.class);
    job.setNumReduceTasks(1);

    // Specify key / value
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(ReducerPigKey.class);

    // Input
    FileInputFormat.addInputPath(job, inputPath);
    job.setInputFormatClass(SequenceFileInputFormat.class);

    // Output
    FileOutputFormat.setOutputPath(job, outputDir);
    job.setOutputFormatClass(TextOutputFormat.class);

    // Delete output if exists
    if (fs.exists(outputDir)) {
        fs.delete(outputDir, true);
    }

    // Execute job
    boolean succeeded = job.waitForCompletion(true);
    if (succeeded) {
        Counters counters = job.getCounters();
        Counter malformedCounter = counters.findCounter(ReducerDataEnum.MALFORMED_DATA);
        LOG.info("Counter malformed data: " + malformedCounter.getValue());

        // globStatus may return null when nothing matched; treat that as "no files".
        if (files != null) {
            for (FileStatus fStatus : files) {
                LOG.info("File name:" + fStatus.getPath());
                if (fStatus.isFile()) {
                    LOG.info("Removing file in path:" + fStatus.getPath());
                    fs.delete(fStatus.getPath(), false);
                }
            }
        }
    }
}

From source file:com.tfm.utad.reducerdata.ReducerDataVertica.java

/**
 * Entry point: runs the "ReducerDataVertica" job over a fixed input directory and writes
 * text output to a timestamped directory. When the job succeeds, the malformed-data counter
 * is logged and the input files that were present before the job started are deleted.
 *
 * @param args ignored
 * @throws IOException if a file-system operation or the job submission fails
 * @throws InterruptedException if waiting for the job is interrupted
 * @throws ClassNotFoundException if a job class cannot be found
 */
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {

    // "yyyy" is the calendar year; the previous "YYYY" is the week-based year, which yields
    // the wrong year for dates in the first or last week of a year.
    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd-HH-mm-ss");
    Date date = new Date();

    Path inputPath = new Path("/home/jab/camus/reducer-data-vertica");
    Path outputDir = new Path("/home/jab/camus/verticadb/" + sdf.format(date));

    // Create configuration
    Configuration conf = new Configuration(true);
    conf.set(FS_DEFAULT_FS, HDFS_LOCALHOST_LOCALDOMAIN);
    FileSystem fs = FileSystem.get(conf);

    // Snapshot the input files now so that only files seen before the run are removed later.
    Path filesPath = new Path(inputPath + "/*");
    FileStatus[] files = fs.globStatus(filesPath);

    // Create job
    Job job = new Job(conf, "ReducerDataVertica");
    job.setJarByClass(ReducerDataVertica.class);

    // Setup MapReduce
    job.setMapperClass(ReducerDataVerticaMapper.class);
    job.setReducerClass(ReducerDataVerticaReducer.class);
    job.setNumReduceTasks(1);

    // Specify key / value
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(ReducerVerticaValue.class);

    // Input
    FileInputFormat.addInputPath(job, inputPath);
    job.setInputFormatClass(SequenceFileInputFormat.class);

    // Output
    FileOutputFormat.setOutputPath(job, outputDir);
    job.setOutputFormatClass(TextOutputFormat.class);

    // Delete output if exists
    if (fs.exists(outputDir)) {
        fs.delete(outputDir, true);
    }

    // Execute job
    boolean succeeded = job.waitForCompletion(true);
    if (succeeded) {
        Counters counters = job.getCounters();
        Counter malformedCounter = counters.findCounter(ReducerDataEnum.MALFORMED_DATA);
        LOG.info("Counter malformed data: " + malformedCounter.getValue());

        // globStatus may return null when nothing matched; treat that as "no files".
        if (files != null) {
            for (FileStatus fStatus : files) {
                LOG.info("File name:" + fStatus.getPath());
                if (fStatus.isFile()) {
                    LOG.info("Removing file in path:" + fStatus.getPath());
                    fs.delete(fStatus.getPath(), false);
                }
            }
        }
    }
}

From source file:com.tomslabs.grid.avro.AvroWordCount.java

License:Apache License

/**
 * Builds, but does not submit, the "Word Count" job that reads through
 * {@code AvroFileInputFormat} and writes through {@code AvroFileOutputFormat}.
 *
 * <p>Note that the supplied configuration is modified: the output schema, the maximum
 * split size, the reduce-task count and speculative reduce execution are set on it before
 * the job is created.
 *
 * @param conf       configuration to update and build the job from
 * @param inputPath  path added as the job's input
 * @param outputPath path set as the job's output
 * @return the configured job, ready to be submitted by the caller
 * @throws IOException if the job cannot be created or its paths cannot be set
 */
public static Job createSubmitableJob(final Configuration conf, final Path inputPath, final Path outputPath)
        throws IOException {

    // These settings are applied to the configuration before the Job is constructed from it.
    final String outputSchema = WordCountSchema.getSchema().toString();
    conf.set(AvroFileOutputFormat.OUTPUT_SCHEMA, outputSchema);
    conf.setInt("mapred.max.split.size", 1024000);
    conf.setInt("mapred.reduce.tasks", 10);
    conf.setBoolean("mapred.reduce.tasks.speculative.execution", true);

    final Job wordCountJob = new Job(conf, "Word Count");
    wordCountJob.setJarByClass(AvroWordCount.class);

    // Map side.
    wordCountJob.setInputFormatClass(AvroFileInputFormat.class);
    wordCountJob.setMapperClass(WordCountMapper.class);
    wordCountJob.setMapOutputKeyClass(Text.class);
    wordCountJob.setMapOutputValueClass(IntWritable.class);

    // Reduce side.
    wordCountJob.setReducerClass(WordCountReducer.class);
    wordCountJob.setOutputKeyClass(GenericRecord.class);
    wordCountJob.setOutputValueClass(NullWritable.class);
    wordCountJob.setOutputFormatClass(AvroFileOutputFormat.class);
    AvroFileOutputFormat.setDeflateLevel(wordCountJob, 3);

    // Locations.
    FileInputFormat.addInputPath(wordCountJob, inputPath);
    FileOutputFormat.setOutputPath(wordCountJob, outputPath);

    return wordCountJob;
}

From source file:com.toshiba.mwcloud.gs.hadoop.mapreduce.examples.GSWordCount.java

License:Apache License

/**
 * Runs the WordCount MapReduce job, reading through {@code GSRowInputFormat} and writing
 * through {@code GSRowOutputFormat}, and prints the result when the job succeeds.
 *
 * @param args command arguments, parsed by {@code GSConf}
 * @return 0 for normal termination of the job and 1 otherwise
 * @throws Exception if processing failed
 */
public int run(String[] args) throws Exception {
    // Parse the command arguments and apply them to the Hadoop configuration.
    final GSConf gridConf = new GSConf();
    gridConf.parseArg(args);

    final Configuration hadoopConf = getConf();
    gridConf.setup(hadoopConf);

    final Job wordCount = Job.getInstance(hadoopConf, APP_NAME);
    wordCount.setJarByClass(GSWordCount.class);

    // Input and output formats.
    wordCount.setInputFormatClass(GSRowInputFormat.class);
    wordCount.setOutputFormatClass(GSRowOutputFormat.class);

    // Map stage.
    wordCount.setMapperClass(Map.class);
    wordCount.setMapOutputKeyClass(Text.class);
    wordCount.setMapOutputValueClass(IntWritable.class);

    // Reduce stage.
    wordCount.setReducerClass(Reduce.class);
    wordCount.setOutputKeyClass(NullWritable.class);
    wordCount.setOutputValueClass(GSRowWritable.class);

    final boolean succeeded = wordCount.waitForCompletion(true);
    if (!succeeded) {
        return 1;
    }

    printResult(gsConfOf(gridConf));
    return 0;
}

/** Returns the given configuration unchanged; keeps the success path of {@code run} on one line. */
private static GSConf gsConfOf(GSConf conf) {
    return conf;
}