List of usage examples for org.apache.hadoop.mapreduce Job setNumReduceTasks
public void setNumReduceTasks(int tasks) throws IllegalStateException
From source file:com.telefonica.iot.tidoop.mrlib.jobs.MapOnly.java
License:Open Source License
@Override public int run(String[] args) throws Exception { // check the number of arguments, show the usage if it is wrong if (args.length != 3) { showUsage();/*from w ww .j a v a 2 s. c o m*/ return -1; } // if // get the arguments String input = args[0]; String output = args[1]; String mapFunction = args[2]; // create and configure a MapReduce job Configuration conf = this.getConf(); conf.set(Constants.PARAM_FUNCTION, mapFunction); Job job = Job.getInstance(conf, "tidoop-mr-lib-maponly"); job.setNumReduceTasks(0); job.setJarByClass(MapOnly.class); job.setMapperClass(CustomMapper.class); job.setMapOutputKeyClass(NullWritable.class); job.setMapOutputValueClass(Text.class); job.setOutputKeyClass(NullWritable.class); job.setOutputValueClass(Text.class); FileInputFormat.addInputPath(job, new Path(input)); FileOutputFormat.setOutputPath(job, new Path(output)); // run the MapReduce job return job.waitForCompletion(true) ? 0 : 1; }
From source file:com.teradata.benchto.generator.HiveTypesGenerator.java
License:Apache License
@Override public int run(String[] args) throws Exception { Options options = new Options(); options.addOption(//from w ww .j a v a 2s. c o m Option.builder("format").required().hasArg().desc("file format (orc, parquet or text)").build()); options.addOption(Option.builder("type").required().hasArg().desc( "hive type to be generated (bigint, int, boolean, double, binary, date, timestamp, string, decimal or varchar)") .build()); options.addOption(Option.builder("rows").required().hasArg().desc("total row count").build()); options.addOption(Option.builder("mappers").required().hasArg().desc("total mappers count").build()); options.addOption(Option.builder("path").hasArg() .desc("base path for generating files, default is: /benchmarks/benchto/types").build()); options.addOption(Option.builder("regex").numberOfArgs(3) .desc("generate varchars from regex pattern, arguments are: pattern, min length, max length") .build()); CommandLine line; String format; String hiveType; long numberOfRows; long numberOfFiles; String basePath; Optional<String> regexPattern = Optional.absent(); Optional<Integer> regexMinLength = Optional.absent(); Optional<Integer> regexMaxLength = Optional.absent(); try { line = new DefaultParser().parse(options, args); format = line.getOptionValue("format"); hiveType = line.getOptionValue("type"); numberOfRows = parseLong(line.getOptionValue("rows")); numberOfFiles = parseLong(line.getOptionValue("mappers")); basePath = line.getOptionValue("path", "/benchmarks/benchto/types"); if (line.hasOption("regex")) { String[] values = line.getOptionValues("regex"); regexPattern = Optional.of(values[0]); regexMinLength = Optional.of(parseInt(values[1])); regexMaxLength = Optional.of(parseInt(values[2])); } } catch (Exception e) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp("benchto-generator", options); throw e; } String jobName = format("GenerateData-%s-%s-%d", format, hiveType, numberOfRows); Path outputDir = new Path(format("%s/%s-%s/%d", 
basePath, format, hiveType, numberOfRows)); Class<? extends OutputFormat> outputFormatClass = getOutputFormatClass(format); LOG.info("Generating " + numberOfRows + " " + hiveType + "s, directory: " + outputDir + ", number of files: " + numberOfFiles); Configuration configuration = new Configuration(); configuration.set(FORMAT_PROPERTY_NAME, format); configuration.set(HIVE_TYPE_PROPERTY_NAME, hiveType); configuration.setLong(NUM_ROWS_PROPERTY_NAME, numberOfRows); configuration.setLong(NUM_MAPS, numberOfFiles); if (regexPattern.isPresent()) { configuration.set(REGEX_PATTERN, regexPattern.get()); configuration.setInt(REGEX_MIN_LENGTH, regexMinLength.get()); configuration.setInt(REGEX_MAX_LENGTH, regexMaxLength.get()); } Job generatorJob = Job.getInstance(configuration, jobName); FileOutputFormat.setOutputPath(generatorJob, outputDir); ParquetOutputFormat.setWriteSupportClass(generatorJob, DataWritableWriteSupport.class); generatorJob.setJarByClass(HiveTypesGenerator.class); generatorJob.setMapperClass(HiveTypesMapper.class); generatorJob.setNumReduceTasks(0); generatorJob.setOutputKeyClass(NullWritable.class); generatorJob.setOutputValueClass(Writable.class); generatorJob.setInputFormatClass(CounterInputFormat.class); generatorJob.setOutputFormatClass(outputFormatClass); return generatorJob.waitForCompletion(true) ? 0 : 1; }
From source file:com.teradata.compaction.mapreduce.MergeParquetFilesMR.java
License:Apache License
/**
 * Command-line entry point: configures and runs the "MergeParquet" job, which reads
 * Avro/Parquet input from one folder and writes it to another using a single reduce task.
 *
 * <p>The process exits with 0 if the job succeeds, 1 if the job fails, and 2 if the
 * wrong number of arguments is supplied.
 *
 * @param args exactly two values: the input folder and the output folder
 * @throws Exception if reading the schema or configuring/running the job fails
 */
public static void main(String[] args) throws Exception {
    // Validate the arguments before any Hadoop setup, and report a usage error with a
    // non-zero status so callers and scripts do not mistake it for success.
    if (args.length != 2) {
        System.err.println("Usage: java -jar MergeParquetFilesMR path_to_input_folder path_to_output_folder ");
        System.exit(2);
    }

    final Configuration conf = new Configuration();
    // Job.getInstance replaces the deprecated Job(Configuration, String) constructor.
    final Job job = Job.getInstance(conf, "MergeParquet");

    final Path inputPath = new Path(args[0]);
    final Path out = new Path(args[1]);

    // The same schema is used for the map output values and for the final Parquet output.
    final Schema schemaParquetFile = getBaseSchema(inputPath, conf);

    job.setJarByClass(MergeParquetFilesMR.class);
    job.setMapperClass(SampleParquetMapper.class);
    job.setReducerClass(SampleParquetReducer.class);
    job.setInputFormatClass(AvroParquetInputFormat.class);
    job.setOutputFormatClass(AvroParquetOutputFormat.class);
    job.setMapOutputKeyClass(NullWritable.class);
    AvroJob.setMapOutputValueSchema(job, schemaParquetFile);
    AvroParquetOutputFormat.setSchema(job, schemaParquetFile);
    FileInputFormat.addInputPath(job, inputPath);
    AvroParquetOutputFormat.setOutputPath(job, out);

    // A single reduce task; presumably so that all records end up in one merged output file.
    job.setNumReduceTasks(1);

    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
From source file:com.tfm.utad.reducerdata.ReducerDataPig.java
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException { SimpleDateFormat sdf = new SimpleDateFormat("YYYY-MM-dd-HH-mm-ss"); Date date = new Date(); Path inputPath = new Path("/home/jab/camus/reducer-data-pig"); Path outputDir = new Path("/home/jab/camus/pigdata/" + sdf.format(date)); // Create configuration Configuration conf = new Configuration(true); conf.set(FS_DEFAULT_FS, HDFS_LOCALHOST_LOCALDOMAIN); FileSystem fs = FileSystem.get(conf); Path filesPath = new Path(inputPath + "/*"); FileStatus[] files = fs.globStatus(filesPath); // Create job Job job = new Job(conf, "ReducerDataPig"); job.setJarByClass(ReducerDataPig.class); // Setup MapReduce job.setMapperClass(ReducerDataPigMapper.class); job.setReducerClass(ReducerDataPigReducer.class); job.setNumReduceTasks(1); // Specify key / value job.setOutputKeyClass(LongWritable.class); job.setOutputValueClass(ReducerPigKey.class); // Input/*from w ww .j a v a 2 s . com*/ FileInputFormat.addInputPath(job, inputPath); job.setInputFormatClass(SequenceFileInputFormat.class); // Output FileOutputFormat.setOutputPath(job, outputDir); job.setOutputFormatClass(TextOutputFormat.class); // Delete output if exists if (fs.exists(outputDir)) { fs.delete(outputDir, true); } // Execute job int code = job.waitForCompletion(true) ? 0 : 1; if (code == 0) { Counters counters = job.getCounters(); Counter malformedCounter = counters.findCounter(ReducerDataEnum.MALFORMED_DATA); LOG.info("Counter malformed data: " + malformedCounter.getValue()); for (FileStatus fStatus : files) { LOG.info("File name:" + fStatus.getPath()); if (fStatus.isFile()) { LOG.info("Removing file in path:" + fStatus.getPath()); fs.delete(fStatus.getPath(), false); } } } }
From source file:com.tfm.utad.reducerdata.ReducerDataVertica.java
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException { SimpleDateFormat sdf = new SimpleDateFormat("YYYY-MM-dd-HH-mm-ss"); Date date = new Date(); Path inputPath = new Path("/home/jab/camus/reducer-data-vertica"); Path outputDir = new Path("/home/jab/camus/verticadb/" + sdf.format(date)); // Create configuration Configuration conf = new Configuration(true); conf.set(FS_DEFAULT_FS, HDFS_LOCALHOST_LOCALDOMAIN); FileSystem fs = FileSystem.get(conf); Path filesPath = new Path(inputPath + "/*"); FileStatus[] files = fs.globStatus(filesPath); // Create job Job job = new Job(conf, "ReducerDataVertica"); job.setJarByClass(ReducerDataVertica.class); // Setup MapReduce job.setMapperClass(ReducerDataVerticaMapper.class); job.setReducerClass(ReducerDataVerticaReducer.class); job.setNumReduceTasks(1); // Specify key / value job.setOutputKeyClass(Text.class); job.setOutputValueClass(ReducerVerticaValue.class); // Input/*www. j a v a2 s . c o m*/ FileInputFormat.addInputPath(job, inputPath); job.setInputFormatClass(SequenceFileInputFormat.class); // Output FileOutputFormat.setOutputPath(job, outputDir); job.setOutputFormatClass(TextOutputFormat.class); // Delete output if exists if (fs.exists(outputDir)) { fs.delete(outputDir, true); } // Execute job int code = job.waitForCompletion(true) ? 0 : 1; if (code == 0) { Counters counters = job.getCounters(); Counter malformedCounter = counters.findCounter(ReducerDataEnum.MALFORMED_DATA); LOG.info("Counter malformed data: " + malformedCounter.getValue()); for (FileStatus fStatus : files) { LOG.info("File name:" + fStatus.getPath()); if (fStatus.isFile()) { LOG.info("Removing file in path:" + fStatus.getPath()); fs.delete(fStatus.getPath(), false); } } } }
From source file:com.twitter.algebra.matrix.format.Sequence2MatrixFormatJob.java
License:Apache License
public void run(Configuration conf, Path matrixInputPath, Path matrixOutputPath) throws IOException, InterruptedException, ClassNotFoundException { FileSystem fs = FileSystem.get(matrixInputPath.toUri(), conf); NMFCommon.setNumberOfMapSlots(conf, fs, matrixInputPath, "seq2mtx"); @SuppressWarnings("deprecation") Job job = new Job(conf); job.setJarByClass(Sequence2MatrixFormatJob.class); job.setJobName(Sequence2MatrixFormatJob.class.getSimpleName()); matrixInputPath = fs.makeQualified(matrixInputPath); matrixOutputPath = fs.makeQualified(matrixOutputPath); FileInputFormat.addInputPath(job, matrixInputPath); job.setInputFormatClass(SequenceFileInputFormat.class); FileOutputFormat.setOutputPath(job, matrixOutputPath); job.setNumReduceTasks(0); job.setOutputFormatClass(MatrixOutputFormat.class); job.setOutputKeyClass(IntWritable.class); job.setOutputValueClass(VectorWritable.class); job.submit();//from w w w .ja v a2 s .c o m boolean res = job.waitForCompletion(true); if (!res) throw new IOException("Job failed!"); }
From source file:com.twitter.algebra.matrix.multiply.ABInnerHDFSBroadcastOfB.java
License:Apache License
/** * Perform A x B, where A and B refer to the paths that contain matrices in * {@link SequenceFileInputFormat} Refer to {@link ABInnerHDFSBroadcastOfB} * for further details./*w w w .ja va2 s .c o m*/ * * @param conf the initial configuration * @param matrixInputPath path to matrix A * @param inMemMatrixDir path to matrix B (must be small enough to fit into * memory) * @param matrixOutputPath path to which AxB will be written * @param inMemMatrixNumRows B rows * @param inMemMatrixNumCols B cols * @throws IOException * @throws InterruptedException * @throws ClassNotFoundException */ public void run(Configuration conf, Path matrixInputPath, String inMemMatrixDir, Path matrixOutputPath, int inMemMatrixNumRows, int inMemMatrixNumCols) throws IOException, InterruptedException, ClassNotFoundException { conf = new Configuration(conf); FileSystem fs = FileSystem.get(matrixInputPath.toUri(), conf); NMFCommon.setNumberOfMapSlots(conf, fs, matrixInputPath, "axbinner"); conf.set(MATRIXINMEMORY, inMemMatrixDir); conf.setInt(MATRIXINMEMORYROWS, inMemMatrixNumRows); conf.setInt(MATRIXINMEMORYCOLS, inMemMatrixNumCols); @SuppressWarnings("deprecation") Job job = new Job(conf); job.setJarByClass(ABInnerHDFSBroadcastOfB.class); job.setJobName(ABInnerHDFSBroadcastOfB.class.getSimpleName()); matrixInputPath = fs.makeQualified(matrixInputPath); matrixOutputPath = fs.makeQualified(matrixOutputPath); FileInputFormat.addInputPath(job, matrixInputPath); job.setInputFormatClass(SequenceFileInputFormat.class); FileOutputFormat.setOutputPath(job, matrixOutputPath); job.setMapperClass(MyMapper.class); job.setNumReduceTasks(0); job.setOutputFormatClass(MatrixOutputFormat.class); job.setOutputKeyClass(IntWritable.class); job.setOutputValueClass(VectorWritable.class); // since we do not use reducer, to get total order, the map output files has // to be renamed after this function returns: {@link // AlgebraCommon#fixPartitioningProblem} job.submit(); boolean res = job.waitForCompletion(true); if 
(!res) throw new IOException("Job failed!"); }
From source file:com.twitter.algebra.matrix.multiply.AtBOuterStaticMapsideJoinJob.java
License:Apache License
public void run(Configuration conf, Path atPath, Path bPath, Path outPath, int outCardinality) throws IOException, InterruptedException, ClassNotFoundException { conf.setInt(OUT_CARD, outCardinality); @SuppressWarnings("deprecation") Job job = new Job(conf); job.setJobName(AtBOuterStaticMapsideJoinJob.class.getSimpleName()); job.setJarByClass(AtBOuterStaticMapsideJoinJob.class); FileSystem fs = FileSystem.get(atPath.toUri(), conf); atPath = fs.makeQualified(atPath);//from w w w.j av a2 s . c om bPath = fs.makeQualified(bPath); job.setInputFormatClass(CompositeInputFormat.class); //mapside join expression job.getConfiguration().set(CompositeInputFormat.JOIN_EXPR, CompositeInputFormat.compose("inner", SequenceFileInputFormat.class, atPath, bPath)); job.setOutputFormatClass(MatrixOutputFormat.class); outPath = fs.makeQualified(outPath); FileOutputFormat.setOutputPath(job, outPath); job.setMapperClass(MyMapper.class); job.setMapOutputKeyClass(IntWritable.class); job.setMapOutputValueClass(VectorWritable.class); job.setCombinerClass(MyReducer.class); int numReducers = conf.getInt("algebra.reduceslots.multiply", 10); job.setNumReduceTasks(numReducers); job.setReducerClass(MyReducer.class); job.setOutputKeyClass(IntWritable.class); job.setOutputValueClass(VectorWritable.class); job.submit(); boolean res = job.waitForCompletion(true); if (!res) throw new IOException("Job failed"); }
From source file:com.twitter.algebra.matrix.multiply.AtB_DMJ.java
License:Apache License
/** * Perform A x B, where At and B refer to the paths that contain matrices in * {@link SequenceFileInputFormat}. One of At and B must also conform with * {@link MapDir} format. Refer to {@link AtB_DMJ} for further details. * /*from w w w . jav a2s .c o m*/ * @param conf the initial configuration * @param mapDirPath path to the matrix in {@link MapDir} format * @param matrixInputPaths the list of paths to matrix input partitions over * which we iterate * @param matrixOutputPath path to which AxB will be written * @param atCols number of columns of At (rows of A) * @param bCols * @param colsPerPartition cols per partition of the input matrix (whether At or B) * @param aIsMapDir is A chosen to be loaded as MapDir * @param useInMemCombiner * @param numberOfJobs the hint for the desired number of parallel jobs * @return the running job * @throws IOException * @throws InterruptedException * @throws ClassNotFoundException */ public Job run(Configuration conf, Path mapDirPath, Path matrixInputPaths, Path matrixOutputPath, int atCols, int bCols, int colsPerPartition, boolean aIsMapDir, boolean useInMemCombiner) throws IOException, InterruptedException, ClassNotFoundException { conf = new Configuration(conf); conf.set(MATRIXINMEMORY, mapDirPath.toString()); conf.setBoolean(AISMAPDIR, aIsMapDir); conf.setBoolean(USEINMEMCOMBINER, useInMemCombiner); conf.setInt(RESULTROWS, atCols); conf.setInt(RESULTCOLS, bCols); conf.setInt(PARTITIONCOLS, colsPerPartition); FileSystem fs = FileSystem.get(matrixOutputPath.toUri(), conf); NMFCommon.setNumberOfMapSlots(conf, fs, matrixInputPaths, "dmj"); if (useInMemCombiner) { Configuration newConf = new Configuration(conf); newConf.set("mapreduce.task.io.sort.mb", "1"); conf = newConf; } @SuppressWarnings("deprecation") Job job = new Job(conf); job.setJarByClass(AtB_DMJ.class); job.setJobName(AtB_DMJ.class.getSimpleName()); matrixOutputPath = fs.makeQualified(matrixOutputPath); matrixInputPaths = fs.makeQualified(matrixInputPaths); 
MultipleInputs.addInputPath(job, matrixInputPaths, SequenceFileInputFormat.class); FileOutputFormat.setOutputPath(job, matrixOutputPath); job.setMapperClass(MyMapper.class); job.setMapOutputKeyClass(IntWritable.class); job.setMapOutputValueClass(VectorWritable.class); if (!useInMemCombiner) job.setCombinerClass(AtBOuterStaticMapsideJoinJob.MyReducer.class); int numReducers = NMFCommon.getNumberOfReduceSlots(conf, "dmj"); job.setNumReduceTasks(numReducers); // ensures total order (when used with {@link MatrixOutputFormat}), RowPartitioner.setPartitioner(job, RowPartitioner.IntRowPartitioner.class, atCols); job.setReducerClass(EpsilonReducer.class); job.setOutputFormatClass(MatrixOutputFormat.class); job.setOutputKeyClass(IntWritable.class); job.setOutputValueClass(VectorWritable.class); job.submit(); return job; }
From source file:com.twitter.algebra.matrix.multiply.PartitionerJob.java
License:Apache License
public void run(Configuration conf, Path matrixInputPath, Path matrixOutputPath, int aRows, int partitions) throws IOException, InterruptedException, ClassNotFoundException { @SuppressWarnings("deprecation") Job job = new Job(conf); job.setJarByClass(PartitionerJob.class); job.setJobName(PartitionerJob.class.getSimpleName() + "-" + matrixOutputPath.getName()); FileSystem fs = FileSystem.get(matrixInputPath.toUri(), conf); matrixInputPath = fs.makeQualified(matrixInputPath); matrixOutputPath = fs.makeQualified(matrixOutputPath); FileInputFormat.addInputPath(job, matrixInputPath); job.setInputFormatClass(SequenceFileInputFormat.class); FileOutputFormat.setOutputPath(job, matrixOutputPath); job.setNumReduceTasks(partitions); job.setOutputFormatClass(SequenceFileOutputFormat.class); job.setOutputKeyClass(IntWritable.class); job.setOutputValueClass(VectorWritable.class); job.setMapperClass(IdMapper.class); job.setReducerClass(IdReducer.class); RowPartitioner.setPartitioner(job, RowPartitioner.IntRowPartitioner.class, aRows); job.submit();//from www. ja v a2 s . co m boolean res = job.waitForCompletion(true); if (!res) throw new IOException("Job failed!"); }