List of usage examples for org.apache.hadoop.mapreduce.Job.setOutputFormatClass
public void setOutputFormatClass(Class<? extends OutputFormat> cls) throws IllegalStateException
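setOutputFormatClass must be called while the job is still being defined; once the job has been submitted, it throws IllegalStateException. Before the real-world examples below, here is a minimal sketch of the typical call sequence (the class name and output path are illustrative, not taken from any of the source files):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class OutputFormatExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "output-format-example");
        job.setJarByClass(OutputFormatExample.class);

        // Must happen before submission, i.e. while the job is in the DEFINE state;
        // calling it afterwards throws IllegalStateException.
        job.setOutputFormatClass(TextOutputFormat.class);

        // The output key/value classes must match what the chosen OutputFormat expects.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileOutputFormat.setOutputPath(job, new Path(args[0]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}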
From source file:com.synerzip.analytics.commoncrawl.googleads.counter.GoogleAdsCounterJob.java
License:Apache License
/**
 * Configures and submits the MapReduce job to Hadoop.
 */
public int run(String[] args) throws Exception {
    String inputPath = null;
    String outputPath = null;
    boolean overwrite = false;
    String s3AccessKey = null;
    String s3SecretKey = null;

    // Read the command line arguments. We're not using GenericOptionsParser
    // to prevent having to include commons.cli as a dependency.
    for (int index = 0; index < args.length; index++) {
        try {
            if (ARGNAME_INPATH.equals(args[index])) {
                inputPath = args[++index];
            } else if (ARGNAME_OUTPATH.equals(args[index])) {
                outputPath = args[++index];
            } else if (ARGNAME_S3ACCESSKEY.equals(args[index])) {
                s3AccessKey = args[++index];
            } else if (ARGNAME_S3SECRETKEY.equals(args[index])) {
                s3SecretKey = args[++index];
            } else if (ARGNAME_MAXFILES.equals(args[index])) {
                // FIXME - No use of static methods
                WarcFileFilter.setMax(Long.parseLong(args[++index]));
            } else if (ARGNAME_OVERWRITE.equals(args[index])) {
                overwrite = true;
            } else {
                LOG.warn("Unsupported argument: " + args[index]);
            }
        } catch (ArrayIndexOutOfBoundsException e) {
            usage();
            throw new IllegalArgumentException();
        }
    }
    if (inputPath == null || outputPath == null) {
        usage();
        throw new IllegalArgumentException();
    }
    if (inputPath.contains("s3n") && (s3AccessKey == null || s3SecretKey == null)) {
        usage();
        LOG.info("Please specify Access Key and Secret Key to access data on AWS S3 storage ");
        throw new IllegalArgumentException();
    }

    // Create the Hadoop job.
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf);
    job.setJarByClass(GoogleAdsCounterJob.class);
    if (inputPath.contains("s3n") && (s3AccessKey != null && s3SecretKey != null)) {
        conf.set("AWS_ACCESS_KEY_ID", s3AccessKey);
        conf.set("AWS_SECRET_ACCESS_KEY", s3SecretKey);
    }

    // Scan the provided input path for WARC files.
    LOG.info("setting input path to '" + inputPath + "'");
    WarcFileFilter.setFilter(FILEFILTER);
    FileInputFormat.addInputPath(job, new Path(inputPath));
    // FIXME - I see the problem that you want to give a dynamic number to a
    // static class. My question is, Is this really required, if we just
    // point to a file in s3 that should solve our problem
    FileInputFormat.setInputPathFilter(job, WarcFileFilter.class);

    // Delete the output path directory if it already exists and user wants
    // to overwrite it.
    if (overwrite) {
        LOG.info("clearing the output path at '" + outputPath + "'");
        FileSystem fs = FileSystem.get(new URI(outputPath), conf);
        if (fs.exists(new Path(outputPath))) {
            fs.delete(new Path(outputPath), true);
        }
    }

    // Set the path where final output 'part' files will be saved.
    LOG.info("setting output path to '" + outputPath + "'");
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    /*
     * // Defines additional single text based output 'GoogleAdClient' for the job
     * MultipleOutputs.addNamedOutput(job, "GoogleAdClient", TextOutputFormat.class,
     *         Text.class, LongWritable.class);
     *
     * // Defines additional text based output 'GoogleAdType' for the job
     * MultipleOutputs.addNamedOutput(job, "GoogleAdType", TextOutputFormat.class,
     *         Text.class, LongWritable.class);
     */

    // Set which InputFormat class to use.
    job.setInputFormatClass(WARCInputFormat.class);

    // Set which OutputFormat class to use.
    job.setOutputFormatClass(TextOutputFormat.class);

    /*
     * Using MultipleOutputs creates zero-sized default output, e.g. part-r-00000.
     * To prevent this, use LazyOutputFormat instead of
     * job.setOutputFormatClass(TextOutputFormat.class) in the Hadoop job configuration.
     */
    // LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);

    // job.setPartitionerClass(GoogleAdsCounterPartitioner.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    // job.setNumReduceTasks(4);

    // Set the output data types.
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    // Set which Mapper and Reducer classes to use.
    job.setMapperClass(GoogleAdsCounterMapper.class);
    // job.setMapperClass(CrawlMapper_AdStatsDetails.class);
    job.setReducerClass(GoogleAdsCounterReducer.class);

    // Set combiner.
    // job.setCombinerClass(GoogleAdsCounterReducer.class);

    // Set job name.
    job.setJobName("CommonCrawl Data Processing : Counting Google Ads");

    long startTime = System.currentTimeMillis();
    if (job.waitForCompletion(true)) {
        LOG.info("Job completion status : " + job.waitForCompletion(true));
        long endTime = System.currentTimeMillis();
        long difference = endTime - startTime;
        LOG.info("Elapsed milliseconds: " + difference);

        Counter totalResponsePagesCounter = job.getCounters().findCounter(TestCounters.TOTALRESPONSEPAGES);
        LOG.info("totalResponsePagesCounter = " + totalResponsePagesCounter.getValue());

        Counter totalGoogleAdPagesCounter = job.getCounters().findCounter(TestCounters.TOTALGOOGLEADSPAGES);
        LOG.info("totalGoogleAdPagesCounter = " + totalGoogleAdPagesCounter.getValue());

        return 0;
    } else {
        return 1;
    }
}
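The commented-out LazyOutputFormat line in the example above is the usual way to avoid empty part-r-NNNNN files when all records are written through MultipleOutputs named outputs. A minimal sketch of that variant, assuming the same TextOutputFormat; the named output, key/value classes, and class name here are illustrative only:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.LazyOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class LazyOutputExample {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "lazy-output-example");
        job.setJarByClass(LazyOutputExample.class);

        // The default output file is only created when the first record is written,
        // so no zero-sized part-r-00000 appears if everything goes to named outputs.
        LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);

        // Named output written from the reducer via MultipleOutputs.write(...).
        MultipleOutputs.addNamedOutput(job, "GoogleAdClient", TextOutputFormat.class,
                Text.class, LongWritable.class);

        // Input path, mapper and reducer setup omitted in this sketch.
        FileOutputFormat.setOutputPath(job, new Path(args[0]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}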
From source file:com.talis.hadoop.rdf.collation.QuadsCollater.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Configuration configuration = getConf();

    boolean useCompression = configuration.getBoolean(Constants.OPTION_USE_COMPRESSION,
            Constants.OPTION_USE_COMPRESSION_DEFAULT);
    if (useCompression) {
        configuration.setBoolean("mapred.compress.map.output", true);
        configuration.set("mapred.output.compression.type", "BLOCK");
        configuration.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");
    }

    boolean overrideOutput = configuration.getBoolean(Constants.OPTION_OVERRIDE_OUTPUT,
            Constants.OPTION_OVERRIDE_OUTPUT_DEFAULT);
    FileSystem fs = FileSystem.get(new Path(args[1]).toUri(), configuration);
    if (overrideOutput) {
        fs.delete(new Path(args[1]), true);
    }

    Job job = new Job(configuration);
    job.setJobName(JOB_NAME);
    job.setJarByClass(getClass());

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    FileOutputFormat.setCompressOutput(job, true);

    job.setInputFormatClass(NQuadsInputFormat.class);
    job.setMapperClass(CollationMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(QuadWritable.class);

    job.setReducerClass(CollationReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(QuadArrayWritable.class);

    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    if (LOG.isDebugEnabled())
        Utils.log(job, LOG);

    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:com.talis.hadoop.rdf.merge.IndexMerge.java
License:Apache License
public int run(String[] args) throws Exception {
    Configuration configuration = getConf();

    boolean useCompression = configuration.getBoolean(Constants.OPTION_USE_COMPRESSION,
            Constants.OPTION_USE_COMPRESSION_DEFAULT);
    if (useCompression) {
        configuration.setBoolean("mapred.compress.map.output", true);
        configuration.set("mapred.output.compression.type", "BLOCK");
        configuration.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");
    }

    boolean overrideOutput = configuration.getBoolean(Constants.OPTION_OVERRIDE_OUTPUT,
            Constants.OPTION_OVERRIDE_OUTPUT_DEFAULT);
    FileSystem fs = FileSystem.get(new Path(args[1]).toUri(), configuration);
    if (overrideOutput) {
        fs.delete(new Path(args[1]), true);
    }

    Job job = new Job(configuration);
    job.setJobName(JOB_NAME);
    job.setJarByClass(getClass());

    Path input = new Path(args[0]);
    Path output = new Path(args[1]);

    FileInputFormat.addInputPath(job, input);
    FileOutputFormat.setOutputPath(job, output);

    job.setMapperClass(Mapper.class);
    job.setReducerClass(IndexMergeReducer.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Text.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setNumReduceTasks(1);

    if (LOG.isDebugEnabled())
        Utils.log(job, LOG);

    return job.waitForCompletion(true) ? 0 : -1;
}
From source file:com.talis.hadoop.rdf.solr.QuadsIndexer.java
License:Apache License
public int run(String[] args) throws Exception {
    Configuration configuration = getConf();

    boolean useCompression = configuration.getBoolean(Constants.OPTION_USE_COMPRESSION,
            Constants.OPTION_USE_COMPRESSION_DEFAULT);
    if (useCompression) {
        configuration.setBoolean("mapred.compress.map.output", true);
        configuration.set("mapred.output.compression.type", "BLOCK");
        configuration.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");
    }

    boolean overrideOutput = configuration.getBoolean(Constants.OPTION_OVERRIDE_OUTPUT,
            Constants.OPTION_OVERRIDE_OUTPUT_DEFAULT);
    FileSystem outputFs = FileSystem.get(new Path(args[1]).toUri(), configuration);
    if (overrideOutput) {
        outputFs.delete(new Path(args[1]), true);
    }

    Job job = new Job(configuration);
    job.setJobName(JOB_NAME);
    job.setJarByClass(getClass());

    int shards = -1;
    boolean compressOutput = false;

    Path input = new Path(args[0]);
    Path output = new Path(args[1]);
    Path solrConfig = new Path(args[2]);

    FileInputFormat.addInputPath(job, input);
    FileOutputFormat.setOutputPath(job, output);

    if (shards > 0) {
        job.setNumReduceTasks(shards);
    }

    job.setMapperClass(Mapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(QuadArrayWritable.class);

    job.setReducerClass(SolrReducer.class);
    SolrDocumentConverter.setSolrDocumentConverter(LiteralsIndexer.class, job.getConfiguration());

    job.setOutputFormatClass(SolrOutputFormat.class);

    String zipName = "solr.zip";
    FileSystem solrConfigFs = FileSystem.get(solrConfig.toUri(), configuration);
    final URI baseZipUrl = solrConfigFs.getUri().resolve(solrConfig.toString() + '#' + zipName);
    DistributedCache.addCacheArchive(baseZipUrl, job.getConfiguration());
    job.getConfiguration().set(SolrOutputFormat.SETUP_OK, solrConfig.toString());
    SolrOutputFormat.setOutputZipFormat(compressOutput, job.getConfiguration());

    if (LOG.isDebugEnabled())
        Utils.log(job, LOG);

    return job.waitForCompletion(true) ? 0 : -1;
}
From source file:com.telefonica.iot.tidoop.apiext.utils.CKANMapReduceExample.java
License:Open Source License
@Override
public int run(String[] args) throws Exception {
    // check the number of arguments, show the usage if it is wrong
    if (args.length != 7) {
        showUsage();
        return -1;
    } // if

    // get the arguments
    String ckanHost = args[0];
    String ckanPort = args[1];
    boolean sslEnabled = args[2].equals("true");
    String ckanAPIKey = args[3];
    String ckanInputs = args[4];
    String ckanOutput = args[5];
    String splitsLength = args[6];

    // create and configure a MapReduce job
    Configuration conf = this.getConf();
    Job job = Job.getInstance(conf, "CKAN MapReduce test");
    job.setJarByClass(CKANMapReduceExample.class);
    job.setMapperClass(RecordSizeGetter.class);
    job.setCombinerClass(RecordSizeAdder.class);
    job.setReducerClass(RecordSizeAdder.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    job.setInputFormatClass(CKANInputFormat.class);
    CKANInputFormat.setInput(job, ckanInputs);
    CKANInputFormat.setEnvironment(job, ckanHost, ckanPort, sslEnabled, ckanAPIKey);
    CKANInputFormat.setSplitsLength(job, splitsLength);
    job.setOutputFormatClass(CKANOutputFormat.class);
    CKANOutputFormat.setEnvironment(job, ckanHost, ckanPort, sslEnabled, ckanAPIKey);
    CKANOutputFormat.setOutputPkg(job, ckanOutput);

    // run the MapReduce job
    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:com.teradata.benchto.generator.HiveTypesGenerator.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(
            Option.builder("format").required().hasArg().desc("file format (orc, parquet or text)").build());
    options.addOption(Option.builder("type").required().hasArg().desc(
            "hive type to be generated (bigint, int, boolean, double, binary, date, timestamp, string, decimal or varchar)")
            .build());
    options.addOption(Option.builder("rows").required().hasArg().desc("total row count").build());
    options.addOption(Option.builder("mappers").required().hasArg().desc("total mappers count").build());
    options.addOption(Option.builder("path").hasArg()
            .desc("base path for generating files, default is: /benchmarks/benchto/types").build());
    options.addOption(Option.builder("regex").numberOfArgs(3)
            .desc("generate varchars from regex pattern, arguments are: pattern, min length, max length")
            .build());

    CommandLine line;
    String format;
    String hiveType;
    long numberOfRows;
    long numberOfFiles;
    String basePath;
    Optional<String> regexPattern = Optional.absent();
    Optional<Integer> regexMinLength = Optional.absent();
    Optional<Integer> regexMaxLength = Optional.absent();

    try {
        line = new DefaultParser().parse(options, args);
        format = line.getOptionValue("format");
        hiveType = line.getOptionValue("type");
        numberOfRows = parseLong(line.getOptionValue("rows"));
        numberOfFiles = parseLong(line.getOptionValue("mappers"));
        basePath = line.getOptionValue("path", "/benchmarks/benchto/types");
        if (line.hasOption("regex")) {
            String[] values = line.getOptionValues("regex");
            regexPattern = Optional.of(values[0]);
            regexMinLength = Optional.of(parseInt(values[1]));
            regexMaxLength = Optional.of(parseInt(values[2]));
        }
    } catch (Exception e) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp("benchto-generator", options);
        throw e;
    }

    String jobName = format("GenerateData-%s-%s-%d", format, hiveType, numberOfRows);
    Path outputDir = new Path(format("%s/%s-%s/%d", basePath, format, hiveType, numberOfRows));
    Class<? extends OutputFormat> outputFormatClass = getOutputFormatClass(format);

    LOG.info("Generating " + numberOfRows + " " + hiveType + "s, directory: " + outputDir
            + ", number of files: " + numberOfFiles);

    Configuration configuration = new Configuration();
    configuration.set(FORMAT_PROPERTY_NAME, format);
    configuration.set(HIVE_TYPE_PROPERTY_NAME, hiveType);
    configuration.setLong(NUM_ROWS_PROPERTY_NAME, numberOfRows);
    configuration.setLong(NUM_MAPS, numberOfFiles);
    if (regexPattern.isPresent()) {
        configuration.set(REGEX_PATTERN, regexPattern.get());
        configuration.setInt(REGEX_MIN_LENGTH, regexMinLength.get());
        configuration.setInt(REGEX_MAX_LENGTH, regexMaxLength.get());
    }

    Job generatorJob = Job.getInstance(configuration, jobName);
    FileOutputFormat.setOutputPath(generatorJob, outputDir);
    ParquetOutputFormat.setWriteSupportClass(generatorJob, DataWritableWriteSupport.class);
    generatorJob.setJarByClass(HiveTypesGenerator.class);
    generatorJob.setMapperClass(HiveTypesMapper.class);
    generatorJob.setNumReduceTasks(0);
    generatorJob.setOutputKeyClass(NullWritable.class);
    generatorJob.setOutputValueClass(Writable.class);
    generatorJob.setInputFormatClass(CounterInputFormat.class);
    generatorJob.setOutputFormatClass(outputFormatClass);

    return generatorJob.waitForCompletion(true) ? 0 : 1;
}
From source file:com.teradata.compaction.mapreduce.MergeParquetFilesMR.java
License:Apache License
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = new Job(conf, "MergeParquet");

    if (args.length != 2) {
        System.err.println("Usage: java -jar MergeParquetFilesMR path_to_input_folder path_to_output_folder ");
        System.exit(0);
    }

    final Path inputPath = new Path(args[0]);
    final Path out = new Path(args[1]);

    Schema schemaParquetFile = getBaseSchema(inputPath, conf);
    job.setJarByClass(MergeParquetFilesMR.class);
    job.setMapperClass(SampleParquetMapper.class);
    job.setReducerClass(SampleParquetReducer.class);
    job.setInputFormatClass(AvroParquetInputFormat.class);
    job.setOutputFormatClass(AvroParquetOutputFormat.class);
    job.setMapOutputKeyClass(NullWritable.class);
    AvroJob.setMapOutputValueSchema(job, schemaParquetFile);
    AvroParquetOutputFormat.setSchema(job, schemaParquetFile);
    FileInputFormat.addInputPath(job, inputPath);
    AvroParquetOutputFormat.setOutputPath(job, out);
    job.setNumReduceTasks(1);

    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
From source file:com.tfm.utad.reducerdata.ReducerDataPig.java
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
    SimpleDateFormat sdf = new SimpleDateFormat("YYYY-MM-dd-HH-mm-ss");
    Date date = new Date();
    Path inputPath = new Path("/home/jab/camus/reducer-data-pig");
    Path outputDir = new Path("/home/jab/camus/pigdata/" + sdf.format(date));

    // Create configuration
    Configuration conf = new Configuration(true);
    conf.set(FS_DEFAULT_FS, HDFS_LOCALHOST_LOCALDOMAIN);
    FileSystem fs = FileSystem.get(conf);
    Path filesPath = new Path(inputPath + "/*");
    FileStatus[] files = fs.globStatus(filesPath);

    // Create job
    Job job = new Job(conf, "ReducerDataPig");
    job.setJarByClass(ReducerDataPig.class);

    // Setup MapReduce
    job.setMapperClass(ReducerDataPigMapper.class);
    job.setReducerClass(ReducerDataPigReducer.class);
    job.setNumReduceTasks(1);

    // Specify key / value
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(ReducerPigKey.class);

    // Input
    FileInputFormat.addInputPath(job, inputPath);
    job.setInputFormatClass(SequenceFileInputFormat.class);

    // Output
    FileOutputFormat.setOutputPath(job, outputDir);
    job.setOutputFormatClass(TextOutputFormat.class);

    // Delete output if exists
    if (fs.exists(outputDir)) {
        fs.delete(outputDir, true);
    }

    // Execute job
    int code = job.waitForCompletion(true) ? 0 : 1;
    if (code == 0) {
        Counters counters = job.getCounters();
        Counter malformedCounter = counters.findCounter(ReducerDataEnum.MALFORMED_DATA);
        LOG.info("Counter malformed data: " + malformedCounter.getValue());
        for (FileStatus fStatus : files) {
            LOG.info("File name:" + fStatus.getPath());
            if (fStatus.isFile()) {
                LOG.info("Removing file in path:" + fStatus.getPath());
                fs.delete(fStatus.getPath(), false);
            }
        }
    }
}
From source file:com.tfm.utad.reducerdata.ReducerDataVertica.java
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
    SimpleDateFormat sdf = new SimpleDateFormat("YYYY-MM-dd-HH-mm-ss");
    Date date = new Date();
    Path inputPath = new Path("/home/jab/camus/reducer-data-vertica");
    Path outputDir = new Path("/home/jab/camus/verticadb/" + sdf.format(date));

    // Create configuration
    Configuration conf = new Configuration(true);
    conf.set(FS_DEFAULT_FS, HDFS_LOCALHOST_LOCALDOMAIN);
    FileSystem fs = FileSystem.get(conf);
    Path filesPath = new Path(inputPath + "/*");
    FileStatus[] files = fs.globStatus(filesPath);

    // Create job
    Job job = new Job(conf, "ReducerDataVertica");
    job.setJarByClass(ReducerDataVertica.class);

    // Setup MapReduce
    job.setMapperClass(ReducerDataVerticaMapper.class);
    job.setReducerClass(ReducerDataVerticaReducer.class);
    job.setNumReduceTasks(1);

    // Specify key / value
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(ReducerVerticaValue.class);

    // Input
    FileInputFormat.addInputPath(job, inputPath);
    job.setInputFormatClass(SequenceFileInputFormat.class);

    // Output
    FileOutputFormat.setOutputPath(job, outputDir);
    job.setOutputFormatClass(TextOutputFormat.class);

    // Delete output if exists
    if (fs.exists(outputDir)) {
        fs.delete(outputDir, true);
    }

    // Execute job
    int code = job.waitForCompletion(true) ? 0 : 1;
    if (code == 0) {
        Counters counters = job.getCounters();
        Counter malformedCounter = counters.findCounter(ReducerDataEnum.MALFORMED_DATA);
        LOG.info("Counter malformed data: " + malformedCounter.getValue());
        for (FileStatus fStatus : files) {
            LOG.info("File name:" + fStatus.getPath());
            if (fStatus.isFile()) {
                LOG.info("Removing file in path:" + fStatus.getPath());
                fs.delete(fStatus.getPath(), false);
            }
        }
    }
}
From source file:com.toddbodnar.simpleHadoop.distributedHadoopDriver.java
/**
 * Runs a job.
 *
 * @param theJob the MapReduceJob to be run
 * @param verbose if true, output progress information
 */
public static void run(MapReduceJob theJob, boolean verbose)
        throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = GetConfiguration.get();
    Job job = Job.getInstance(conf, theJob.toString());
    job.setJarByClass(distributedHadoopDriver.class);

    job.setMapperClass(theJob.getMapper().getClass());
    job.setReducerClass(theJob.getReducer().getClass());

    job.setMapOutputKeyClass(theJob.getKeyType());
    job.setMapOutputValueClass(theJob.getValueType());

    theJob.writeConfig(job.getConfiguration());

    hdfsFile input = hdfsFile.transferToHDFS(theJob.getInput().getFile());
    if (!input.equals(theJob.getInput().getFile())) {
        garbage_collector.noteCreated(input);
    }

    if (theJob.getClass().equals(join.class)) {
        join jobLeftJoin = (join) theJob;

        hdfsFile input2 = hdfsFile.transferToHDFS(jobLeftJoin.getOtherInput().getFile());
        if (!input2.equals(jobLeftJoin.getOtherInput().getFile())) {
            garbage_collector.noteCreated(input2);
        }

        Mapper maps[] = jobLeftJoin.getMapperPairs();

        MultipleInputs.addInputPath(job, input.getPath(), TextInputFormat.class, maps[0].getClass());
        MultipleInputs.addInputPath(job, input2.getPath(), TextInputFormat.class, maps[1].getClass());
    } else {
        MultipleInputs.addInputPath(job, input.getPath(), TextInputFormat.class);
    }

    job.getConfiguration().set(TextOutputFormat.SEPERATOR, "");
    job.setOutputFormatClass(TextOutputFormat.class);

    // FileInputFormat.setInputPaths(job, new Path(theJob.getInput().getFile().getLocation()));
    Path out = new Path(settings.hdfs_prefix + "/TMP_TABLE_" + theJob.hashCode());
    FileOutputFormat.setOutputPath(job, out);

    boolean success = job.waitForCompletion(true);
    if (!success) {
        System.err.println("Error processing " + theJob);
        return;
    }

    FileSystem fs = FileSystem.get(GetConfiguration.get());
    fs.delete(new Path(out, "_SUCCESS"), false);

    table output = new table(new hdfsFile(out), theJob.getOutput().getColNames());
    output.setSeperator(theJob.getOutput().getSeperator());
    theJob.setOutput(output);

    garbage_collector.noteCreated(output.getFile());
}
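A side note on the last example: TextOutputFormat separates each key from its value with a tab by default, and the driver above overrides that separator with an empty string so key and value are written back-to-back. A hedged fragment showing the same knob with a different delimiter (assumes a Job named job is already configured, as in the example; the comma is illustrative):

// Fragment from a driver method; `job` is an org.apache.hadoop.mapreduce.Job.
// TextOutputFormat uses a tab between key and value by default; the constant
// below (misspelled in Hadoop itself) overrides that separator.
job.getConfiguration().set(TextOutputFormat.SEPERATOR, ",");
job.setOutputFormatClass(TextOutputFormat.class);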