List of usage examples for org.apache.hadoop.mapreduce.Job#setInputFormatClass
public void setInputFormatClass(Class<? extends InputFormat> cls) throws IllegalStateException
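Before the project examples below, here is a minimal, self-contained sketch of the call in a typical driver. The class and path names are illustrative placeholders, not taken from any example on this page; the point is only that setInputFormatClass must be called while the job is still in its DEFINE state, which is why the signature declares IllegalStateException.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class SetInputFormatExample {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "set-input-format-example");
        job.setJarByClass(SetInputFormatExample.class);
        // Must be called while the job is still being defined;
        // after submission the call throws IllegalStateException.
        job.setInputFormatClass(TextInputFormat.class);
        job.setMapperClass(Mapper.class); // identity mapper: passes (offset, line) through
        job.setNumReduceTasks(0);         // map-only job
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}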
From source file:com.talis.hadoop.rdf.merge.IndexMerge.java
License:Apache License
public int run(String[] args) throws Exception {
    Configuration configuration = getConf();
    boolean useCompression = configuration.getBoolean(Constants.OPTION_USE_COMPRESSION,
            Constants.OPTION_USE_COMPRESSION_DEFAULT);
    if (useCompression) {
        configuration.setBoolean("mapred.compress.map.output", true);
        configuration.set("mapred.output.compression.type", "BLOCK");
        configuration.set("mapred.map.output.compression.codec",
                "org.apache.hadoop.io.compress.GzipCodec");
    }

    boolean overrideOutput = configuration.getBoolean(Constants.OPTION_OVERRIDE_OUTPUT,
            Constants.OPTION_OVERRIDE_OUTPUT_DEFAULT);
    FileSystem fs = FileSystem.get(new Path(args[1]).toUri(), configuration);
    if (overrideOutput) {
        fs.delete(new Path(args[1]), true);
    }

    Job job = new Job(configuration);
    job.setJobName(JOB_NAME);
    job.setJarByClass(getClass());

    Path input = new Path(args[0]);
    Path output = new Path(args[1]);
    FileInputFormat.addInputPath(job, input);
    FileOutputFormat.setOutputPath(job, output);

    job.setMapperClass(Mapper.class);
    job.setReducerClass(IndexMergeReducer.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Text.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setNumReduceTasks(1);

    if (LOG.isDebugEnabled())
        Utils.log(job, LOG);

    return job.waitForCompletion(true) ? 0 : -1;
}
From source file:com.talis.hadoop.rdf.solr.QuadsIndexer.java
License:Apache License
public int run(String[] args) throws Exception {
    Configuration configuration = getConf();
    boolean useCompression = configuration.getBoolean(Constants.OPTION_USE_COMPRESSION,
            Constants.OPTION_USE_COMPRESSION_DEFAULT);
    if (useCompression) {
        configuration.setBoolean("mapred.compress.map.output", true);
        configuration.set("mapred.output.compression.type", "BLOCK");
        configuration.set("mapred.map.output.compression.codec",
                "org.apache.hadoop.io.compress.GzipCodec");
    }

    boolean overrideOutput = configuration.getBoolean(Constants.OPTION_OVERRIDE_OUTPUT,
            Constants.OPTION_OVERRIDE_OUTPUT_DEFAULT);
    FileSystem outputFs = FileSystem.get(new Path(args[1]).toUri(), configuration);
    if (overrideOutput) {
        outputFs.delete(new Path(args[1]), true);
    }

    Job job = new Job(configuration);
    job.setJobName(JOB_NAME);
    job.setJarByClass(getClass());

    int shards = -1;
    boolean compressOutput = false;

    Path input = new Path(args[0]);
    Path output = new Path(args[1]);
    Path solrConfig = new Path(args[2]);
    FileInputFormat.addInputPath(job, input);
    FileOutputFormat.setOutputPath(job, output);

    if (shards > 0) {
        job.setNumReduceTasks(shards);
    }

    job.setMapperClass(Mapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(QuadArrayWritable.class);

    job.setReducerClass(SolrReducer.class);
    SolrDocumentConverter.setSolrDocumentConverter(LiteralsIndexer.class, job.getConfiguration());
    job.setOutputFormatClass(SolrOutputFormat.class);

    String zipName = "solr.zip";
    FileSystem solrConfigFs = FileSystem.get(solrConfig.toUri(), configuration);
    final URI baseZipUrl = solrConfigFs.getUri().resolve(solrConfig.toString() + '#' + zipName);
    DistributedCache.addCacheArchive(baseZipUrl, job.getConfiguration());

    job.getConfiguration().set(SolrOutputFormat.SETUP_OK, solrConfig.toString());
    SolrOutputFormat.setOutputZipFormat(compressOutput, job.getConfiguration());

    if (LOG.isDebugEnabled())
        Utils.log(job, LOG);

    return job.waitForCompletion(true) ? 0 : -1;
}
From source file:com.talis.mapreduce.lib.input.TestDriver.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.printf("Usage: %s [generic options] <input> <output>\n", getClass().getName());
        ToolRunner.printGenericCommandUsage(System.err);
        return -1;
    }

    Job job = new Job(getConf(), "test");
    job.setJarByClass(getClass());

    job.setInputFormatClass(NQuadsInputFormat.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(TestMapper.class);
    job.setNumReduceTasks(0);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Text.class);

    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:com.telefonica.iot.tidoop.apiext.utils.CKANMapReduceExample.java
License:Open Source License
@Override
public int run(String[] args) throws Exception {
    // check the number of arguments, show the usage if it is wrong
    if (args.length != 7) {
        showUsage();
        return -1;
    } // if

    // get the arguments
    String ckanHost = args[0];
    String ckanPort = args[1];
    boolean sslEnabled = args[2].equals("true");
    String ckanAPIKey = args[3];
    String ckanInputs = args[4];
    String ckanOutput = args[5];
    String splitsLength = args[6];

    // create and configure a MapReduce job
    Configuration conf = this.getConf();
    Job job = Job.getInstance(conf, "CKAN MapReduce test");
    job.setJarByClass(CKANMapReduceExample.class);
    job.setMapperClass(RecordSizeGetter.class);
    job.setCombinerClass(RecordSizeAdder.class);
    job.setReducerClass(RecordSizeAdder.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    job.setInputFormatClass(CKANInputFormat.class);
    CKANInputFormat.setInput(job, ckanInputs);
    CKANInputFormat.setEnvironment(job, ckanHost, ckanPort, sslEnabled, ckanAPIKey);
    CKANInputFormat.setSplitsLength(job, splitsLength);
    job.setOutputFormatClass(CKANOutputFormat.class);
    CKANOutputFormat.setEnvironment(job, ckanHost, ckanPort, sslEnabled, ckanAPIKey);
    CKANOutputFormat.setOutputPkg(job, ckanOutput);

    // run the MapReduce job
    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:com.teradata.benchto.generator.HiveTypesGenerator.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(
            Option.builder("format").required().hasArg().desc("file format (orc, parquet or text)").build());
    options.addOption(Option.builder("type").required().hasArg().desc(
            "hive type to be generated (bigint, int, boolean, double, binary, date, timestamp, string, decimal or varchar)")
            .build());
    options.addOption(Option.builder("rows").required().hasArg().desc("total row count").build());
    options.addOption(Option.builder("mappers").required().hasArg().desc("total mappers count").build());
    options.addOption(Option.builder("path").hasArg()
            .desc("base path for generating files, default is: /benchmarks/benchto/types").build());
    options.addOption(Option.builder("regex").numberOfArgs(3)
            .desc("generate varchars from regex pattern, arguments are: pattern, min length, max length")
            .build());

    CommandLine line;
    String format;
    String hiveType;
    long numberOfRows;
    long numberOfFiles;
    String basePath;
    Optional<String> regexPattern = Optional.absent();
    Optional<Integer> regexMinLength = Optional.absent();
    Optional<Integer> regexMaxLength = Optional.absent();

    try {
        line = new DefaultParser().parse(options, args);
        format = line.getOptionValue("format");
        hiveType = line.getOptionValue("type");
        numberOfRows = parseLong(line.getOptionValue("rows"));
        numberOfFiles = parseLong(line.getOptionValue("mappers"));
        basePath = line.getOptionValue("path", "/benchmarks/benchto/types");
        if (line.hasOption("regex")) {
            String[] values = line.getOptionValues("regex");
            regexPattern = Optional.of(values[0]);
            regexMinLength = Optional.of(parseInt(values[1]));
            regexMaxLength = Optional.of(parseInt(values[2]));
        }
    } catch (Exception e) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp("benchto-generator", options);
        throw e;
    }

    String jobName = format("GenerateData-%s-%s-%d", format, hiveType, numberOfRows);
    Path outputDir = new Path(format("%s/%s-%s/%d", basePath, format, hiveType, numberOfRows));
    Class<? extends OutputFormat> outputFormatClass = getOutputFormatClass(format);

    LOG.info("Generating " + numberOfRows + " " + hiveType + "s, directory: " + outputDir
            + ", number of files: " + numberOfFiles);

    Configuration configuration = new Configuration();
    configuration.set(FORMAT_PROPERTY_NAME, format);
    configuration.set(HIVE_TYPE_PROPERTY_NAME, hiveType);
    configuration.setLong(NUM_ROWS_PROPERTY_NAME, numberOfRows);
    configuration.setLong(NUM_MAPS, numberOfFiles);
    if (regexPattern.isPresent()) {
        configuration.set(REGEX_PATTERN, regexPattern.get());
        configuration.setInt(REGEX_MIN_LENGTH, regexMinLength.get());
        configuration.setInt(REGEX_MAX_LENGTH, regexMaxLength.get());
    }

    Job generatorJob = Job.getInstance(configuration, jobName);
    FileOutputFormat.setOutputPath(generatorJob, outputDir);
    ParquetOutputFormat.setWriteSupportClass(generatorJob, DataWritableWriteSupport.class);
    generatorJob.setJarByClass(HiveTypesGenerator.class);
    generatorJob.setMapperClass(HiveTypesMapper.class);
    generatorJob.setNumReduceTasks(0);
    generatorJob.setOutputKeyClass(NullWritable.class);
    generatorJob.setOutputValueClass(Writable.class);
    generatorJob.setInputFormatClass(CounterInputFormat.class);
    generatorJob.setOutputFormatClass(outputFormatClass);

    return generatorJob.waitForCompletion(true) ? 0 : 1;
}
From source file:com.teradata.compaction.mapreduce.MergeParquetFilesMR.java
License:Apache License
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = new Job(conf, "MergeParquet");

    if (args.length != 2) {
        System.err.println("Usage: java -jar MergeParquetFilesMR path_to_input_folder path_to_output_folder");
        System.exit(0);
    }

    final Path inputPath = new Path(args[0]);
    final Path out = new Path(args[1]);
    Schema schemaParquetFile = getBaseSchema(inputPath, conf);

    job.setJarByClass(MergeParquetFilesMR.class);
    job.setMapperClass(SampleParquetMapper.class);
    job.setReducerClass(SampleParquetReducer.class);
    job.setInputFormatClass(AvroParquetInputFormat.class);
    job.setOutputFormatClass(AvroParquetOutputFormat.class);
    job.setMapOutputKeyClass(NullWritable.class);

    AvroJob.setMapOutputValueSchema(job, schemaParquetFile);
    AvroParquetOutputFormat.setSchema(job, schemaParquetFile);
    FileInputFormat.addInputPath(job, inputPath);
    AvroParquetOutputFormat.setOutputPath(job, out);

    job.setNumReduceTasks(1);

    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
From source file:com.tfm.utad.reducerdata.ReducerDataPig.java
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
    // lowercase yyyy: uppercase YYYY is the week-based year and gives wrong dates around year boundaries
    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd-HH-mm-ss");
    Date date = new Date();
    Path inputPath = new Path("/home/jab/camus/reducer-data-pig");
    Path outputDir = new Path("/home/jab/camus/pigdata/" + sdf.format(date));

    // Create configuration
    Configuration conf = new Configuration(true);
    conf.set(FS_DEFAULT_FS, HDFS_LOCALHOST_LOCALDOMAIN);
    FileSystem fs = FileSystem.get(conf);
    Path filesPath = new Path(inputPath + "/*");
    FileStatus[] files = fs.globStatus(filesPath);

    // Create job
    Job job = new Job(conf, "ReducerDataPig");
    job.setJarByClass(ReducerDataPig.class);

    // Setup MapReduce
    job.setMapperClass(ReducerDataPigMapper.class);
    job.setReducerClass(ReducerDataPigReducer.class);
    job.setNumReduceTasks(1);

    // Specify key / value
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(ReducerPigKey.class);

    // Input
    FileInputFormat.addInputPath(job, inputPath);
    job.setInputFormatClass(SequenceFileInputFormat.class);

    // Output
    FileOutputFormat.setOutputPath(job, outputDir);
    job.setOutputFormatClass(TextOutputFormat.class);

    // Delete output if exists
    if (fs.exists(outputDir)) {
        fs.delete(outputDir, true);
    }

    // Execute job
    int code = job.waitForCompletion(true) ? 0 : 1;
    if (code == 0) {
        Counters counters = job.getCounters();
        Counter malformedCounter = counters.findCounter(ReducerDataEnum.MALFORMED_DATA);
        LOG.info("Counter malformed data: " + malformedCounter.getValue());
        for (FileStatus fStatus : files) {
            LOG.info("File name:" + fStatus.getPath());
            if (fStatus.isFile()) {
                LOG.info("Removing file in path:" + fStatus.getPath());
                fs.delete(fStatus.getPath(), false);
            }
        }
    }
}
From source file:com.tfm.utad.reducerdata.ReducerDataVertica.java
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
    // lowercase yyyy: uppercase YYYY is the week-based year and gives wrong dates around year boundaries
    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd-HH-mm-ss");
    Date date = new Date();
    Path inputPath = new Path("/home/jab/camus/reducer-data-vertica");
    Path outputDir = new Path("/home/jab/camus/verticadb/" + sdf.format(date));

    // Create configuration
    Configuration conf = new Configuration(true);
    conf.set(FS_DEFAULT_FS, HDFS_LOCALHOST_LOCALDOMAIN);
    FileSystem fs = FileSystem.get(conf);
    Path filesPath = new Path(inputPath + "/*");
    FileStatus[] files = fs.globStatus(filesPath);

    // Create job
    Job job = new Job(conf, "ReducerDataVertica");
    job.setJarByClass(ReducerDataVertica.class);

    // Setup MapReduce
    job.setMapperClass(ReducerDataVerticaMapper.class);
    job.setReducerClass(ReducerDataVerticaReducer.class);
    job.setNumReduceTasks(1);

    // Specify key / value
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(ReducerVerticaValue.class);

    // Input
    FileInputFormat.addInputPath(job, inputPath);
    job.setInputFormatClass(SequenceFileInputFormat.class);

    // Output
    FileOutputFormat.setOutputPath(job, outputDir);
    job.setOutputFormatClass(TextOutputFormat.class);

    // Delete output if exists
    if (fs.exists(outputDir)) {
        fs.delete(outputDir, true);
    }

    // Execute job
    int code = job.waitForCompletion(true) ? 0 : 1;
    if (code == 0) {
        Counters counters = job.getCounters();
        Counter malformedCounter = counters.findCounter(ReducerDataEnum.MALFORMED_DATA);
        LOG.info("Counter malformed data: " + malformedCounter.getValue());
        for (FileStatus fStatus : files) {
            LOG.info("File name:" + fStatus.getPath());
            if (fStatus.isFile()) {
                LOG.info("Removing file in path:" + fStatus.getPath());
                fs.delete(fStatus.getPath(), false);
            }
        }
    }
}
From source file:com.tomslabs.grid.avro.AvroWordCount.java
License:Apache License
public static Job createSubmitableJob(final Configuration conf, final Path inputPath, final Path outputPath)
        throws IOException {
    conf.set(AvroFileOutputFormat.OUTPUT_SCHEMA, WordCountSchema.getSchema().toString());
    conf.setInt("mapred.max.split.size", 1024000);
    conf.setInt("mapred.reduce.tasks", 10);
    conf.setBoolean("mapred.reduce.tasks.speculative.execution", true);

    final Job job = new Job(conf, "Word Count");
    job.setJarByClass(AvroWordCount.class);

    job.setInputFormatClass(AvroFileInputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setMapperClass(WordCountMapper.class);
    job.setReducerClass(WordCountReducer.class);
    job.setOutputKeyClass(GenericRecord.class);
    job.setOutputValueClass(NullWritable.class);
    job.setOutputFormatClass(AvroFileOutputFormat.class);
    AvroFileOutputFormat.setDeflateLevel(job, 3);

    FileInputFormat.addInputPath(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    return job;
}
From source file:com.toshiba.mwcloud.gs.hadoop.mapreduce.examples.GSWordCount.java
License:Apache License
/**
 * Run the WordCount MapReduce job.
 * @param args command arguments
 * @return 0 for normal termination of the job and 1 otherwise
 * @throws Exception if processing failed
 */
public int run(String[] args) throws Exception {
    GSConf gsConf = new GSConf();
    gsConf.parseArg(args);

    Configuration conf = getConf();
    gsConf.setup(conf);

    Job job = Job.getInstance(conf, APP_NAME);
    job.setJarByClass(GSWordCount.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(GSRowWritable.class);

    job.setMapperClass(Map.class);
    job.setReducerClass(Reduce.class);

    job.setInputFormatClass(GSRowInputFormat.class);
    job.setOutputFormatClass(GSRowOutputFormat.class);

    int res = job.waitForCompletion(true) ? 0 : 1;
    if (res == 0) {
        printResult(gsConf);
    }
    return res;
}