Usage examples for org.apache.hadoop.mapreduce.Job#setCombinerClass
public void setCombinerClass(Class<? extends Reducer> cls) throws IllegalStateException
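Sets the combiner class for the job. The combiner must extend Reducer, and because it may run zero or more times on map outputs, its input and output key/value types must both match the map output types. The call throws IllegalStateException if the job has already been submitted. Before the real-world examples below, here is a minimal self-contained sketch of the common pattern of reusing one sum Reducer as both combiner and reducer; all class names in it are illustrative, not taken from the examples that follow.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class SumCombinerSketch {

    // Emits (token, 1) for every whitespace-separated token in the input.
    public static class TokenMapper extends Mapper<Object, Text, Text, IntWritable> {
        private static final IntWritable ONE = new IntWritable(1);
        private final Text word = new Text();

        @Override
        protected void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            for (String token : value.toString().split("\\s+")) {
                if (!token.isEmpty()) {
                    word.set(token);
                    context.write(word, ONE);
                }
            }
        }
    }

    // Safe to use as a combiner: input types (Text, IntWritable) equal the
    // map output types, and summation is associative and commutative.
    public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        private final IntWritable result = new IntWritable();

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable value : values) {
                sum += value.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "sum-with-combiner");
        job.setJarByClass(SumCombinerSketch.class);
        job.setMapperClass(TokenMapper.class);
        // Must be called while the job is still being defined; throws
        // IllegalStateException once the job is running.
        job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

Note that a combiner is only an optimization hint: Hadoop may invoke it on any subset of the map output, so the examples below all pick reducers whose logic tolerates repeated partial aggregation.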
From source file: com.talis.labs.pagerank.mapreduce.CountPages.java
License: Apache License

@Override
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.println("Usage: CountPages <input path> <output path>");
        return -1;
    }
    FileSystem.get(getConf()).delete(new Path(args[1]), true);
    Job job = new Job(getConf(), "CountPages");
    job.setJarByClass(getClass());
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    job.setMapperClass(CountPagesMapper.class);
    job.setCombinerClass(CountPagesReducer.class);
    job.setReducerClass(CountPagesReducer.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(LongWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);
    job.setNumReduceTasks(1);
    return job.waitForCompletion(true) ? 0 : 1;
}
From source file: com.talis.labs.pagerank.mapreduce.DanglingPages.java
License: Apache License

@Override
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.println("Usage: DanglingPages <input path> <output path>");
        return -1;
    }
    FileSystem.get(getConf()).delete(new Path(args[1]), true);
    Job job = new Job(getConf(), "DanglingPages");
    job.setJarByClass(getClass());
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    job.setMapperClass(DanglingPagesMapper.class);
    job.setCombinerClass(DanglingPagesReducer.class);
    job.setReducerClass(DanglingPagesReducer.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(DoubleWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(DoubleWritable.class);
    job.setNumReduceTasks(1);
    return job.waitForCompletion(true) ? 0 : 1;
}
From source file: com.talis.mapreduce.wordcount.newapi.WordCount.java
License: Apache License

@Override
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.printf("Usage: %s [generic options] <input> <output>\n", getClass().getSimpleName());
        ToolRunner.printGenericCommandUsage(System.err);
        return -1;
    }
    Job job = new Job(getConf(), getClass().getSimpleName());
    job.setJarByClass(getClass());
    job.setMapperClass(WordCountMapper.class);
    job.setCombinerClass(WordCountReducer.class);
    job.setReducerClass(WordCountReducer.class);
    // job.setPartitionerClass(HashPartitioner.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    return job.waitForCompletion(true) ? 0 : 1;
}
From source file: com.tdunning.plume.local.lazy.MapRedExecutor.java
License: Apache License

/**
 * This method returns a Job instance out of a {@link MSCR} entity. It puts the class of
 * the {@link PlumeWorkflow} argument and the MSCR id in the Hadoop configuration.
 *
 * @param mscr The MSCR to convert
 * @param workFlow The workflow whose class will be instantiated by Hadoop mappers/reducers
 * @param workFlowOutputPath The temporary output path of the workflow, stored under TEMP_OUTPUT_PATH
 * @param outputPath The output path of the MapRed job
 * @return A hadoop-executable MapRed Job
 * @throws IOException
 */
static Job getMapRed(final MSCR mscr, PlumeWorkflow workFlow, String workFlowOutputPath, String outputPath)
        throws IOException {
    Configuration conf = new Configuration();
    conf.set(WORKFLOW_NAME, workFlow.getClass().getName());
    conf.setInt(MSCR_ID, mscr.getId());
    conf.set(TEMP_OUTPUT_PATH, workFlowOutputPath);
    Job job = new Job(conf, "MSCR"); // TODO deprecation
    job.setMapOutputKeyClass(PlumeObject.class);
    job.setMapOutputValueClass(PlumeObject.class);
    job.setJarByClass(MapRedExecutor.class);
    // Define multiple inputs
    for (PCollection<?> input : mscr.getInputs()) {
        if (!(input instanceof LazyCollection)) {
            throw new IllegalArgumentException("Can't create MapRed from MSCR whose inputs are not LazyTable");
        }
        LazyCollection<Text> l = (LazyCollection<Text>) input;
        if (!(l.isMaterialized() && l.getFile() != null)) {
            // Collections have plume ID only if they are intermediate results - TODO better naming for this
            if (l.getPlumeId().length() < 1) {
                throw new IllegalArgumentException(
                        "Can't create MapRed from MSCR inputs that are not materialized to a file");
            }
        }
        PCollectionType<?> rType = l.getType();
        Class<? extends InputFormat> format = SequenceFileInputFormat.class;
        if (rType instanceof PTableType) {
            PTableType<?, ?> tType = (PTableType<?, ?>) rType;
            if (tType.valueType() instanceof StringType && tType.keyType() instanceof StringType) {
                format = KeyValueTextInputFormat.class;
            }
            MultipleInputs.addInputPath(job, new Path(l.getFile()), format, MSCRMapper.class);
        } else {
            if (rType.elementType() instanceof StringType) {
                format = TextInputFormat.class;
            }
            MultipleInputs.addInputPath(job, new Path(l.getFile()), format, MSCRMapper.class);
        }
    }
    // Define multiple outputs
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    for (Map.Entry<PCollection<?>, Integer> entry : mscr.getNumberedChannels().entrySet()) {
        PCollectionType<?> rType = ((LazyCollection<?>) mscr.getOutputChannels().get(entry.getKey()).output)
                .getType();
        if (rType instanceof PTableType) {
            PTableType<?, ?> tType = (PTableType<?, ?>) rType;
            Class<? extends OutputFormat> outputFormat = SequenceFileOutputFormat.class;
            if (tType.keyType() instanceof StringType && tType.valueType() instanceof StringType) {
                outputFormat = TextOutputFormat.class;
            }
            MultipleOutputs.addNamedOutput(job, entry.getValue() + "", outputFormat,
                    getHadoopType(tType.keyType()), getHadoopType(tType.valueType()));
        } else {
            Class<? extends OutputFormat> outputFormat = SequenceFileOutputFormat.class;
            if (rType.elementType() instanceof StringType) {
                outputFormat = TextOutputFormat.class;
            }
            MultipleOutputs.addNamedOutput(job, entry.getValue() + "", outputFormat,
                    NullWritable.class, getHadoopType(rType.elementType()));
        }
    }
    // Define Reducer & Combiner
    job.setCombinerClass(MSCRCombiner.class);
    job.setReducerClass(MSCRReducer.class);
    job.setNumReduceTasks(1);
    return job;
}
From source file: com.telefonica.iot.tidoop.apiext.utils.CKANMapReduceExample.java
License: Open Source License

@Override
public int run(String[] args) throws Exception {
    // check the number of arguments, show the usage if it is wrong
    if (args.length != 7) {
        showUsage();
        return -1;
    } // if

    // get the arguments
    String ckanHost = args[0];
    String ckanPort = args[1];
    boolean sslEnabled = args[2].equals("true");
    String ckanAPIKey = args[3];
    String ckanInputs = args[4];
    String ckanOutput = args[5];
    String splitsLength = args[6];

    // create and configure a MapReduce job
    Configuration conf = this.getConf();
    Job job = Job.getInstance(conf, "CKAN MapReduce test");
    job.setJarByClass(CKANMapReduceExample.class);
    job.setMapperClass(RecordSizeGetter.class);
    job.setCombinerClass(RecordSizeAdder.class);
    job.setReducerClass(RecordSizeAdder.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    job.setInputFormatClass(CKANInputFormat.class);
    CKANInputFormat.setInput(job, ckanInputs);
    CKANInputFormat.setEnvironment(job, ckanHost, ckanPort, sslEnabled, ckanAPIKey);
    CKANInputFormat.setSplitsLength(job, splitsLength);
    job.setOutputFormatClass(CKANOutputFormat.class);
    CKANOutputFormat.setEnvironment(job, ckanHost, ckanPort, sslEnabled, ckanAPIKey);
    CKANOutputFormat.setOutputPkg(job, ckanOutput);

    // run the MapReduce job
    return job.waitForCompletion(true) ? 0 : 1;
}
From source file: com.telefonica.iot.tidoop.mrlib.jobs.Filter.java
License: Open Source License

@Override
public int run(String[] args) throws Exception {
    // check the number of arguments, show the usage if it is wrong
    if (args.length != 3) {
        showUsage();
        return -1;
    } // if

    // get the arguments
    String input = args[0];
    String output = args[1];
    String regex = args[2];

    // create and configure a MapReduce job
    Configuration conf = this.getConf();
    conf.set(Constants.PARAM_REGEX, regex);
    Job job = Job.getInstance(conf, "tidoop-mr-lib-filter");
    job.setNumReduceTasks(1);
    job.setJarByClass(Filter.class);
    job.setMapperClass(LineFilter.class);
    job.setCombinerClass(LinesCombiner.class);
    job.setReducerClass(LinesJoiner.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);
    FileInputFormat.addInputPath(job, new Path(input));
    FileOutputFormat.setOutputPath(job, new Path(output));

    // run the MapReduce job
    return job.waitForCompletion(true) ? 0 : 1;
}
From source file: com.trexinhca.TrexinHCATest.java
License: Apache License

public static void main(String[] args) throws Exception {
    ks = KieServices.Factory.get();
    kContainer = ks.getKieClasspathContainer();
    ksession = TrexinHCATest.kContainer.newKieSession("MapReduceKS");
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length < 2) {
        System.err.println("Usage: TrexinHCATest <in> [<in>...] <out>");
        System.exit(2);
    }
    Job job = Job.getInstance(conf);
    job.setJobName("HCATest");
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(TrexinHCAReducer.class);
    job.setReducerClass(TrexinHCAReducer.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    // The original source passed TextOutputFormat.class here, which is an
    // output format rather than a key type; Text.class matches the reducer output.
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
    job.setJarByClass(TrexinHCATest.class);
    job.waitForCompletion(true);
}
From source file: com.twitter.algebra.matrix.multiply.ABOuterHDFSBroadcastOfA.java
License: Apache License

/**
 * Perform A x B, where A and B refer to the paths that contain matrices in
 * {@link SequenceFileInputFormat}. Refer to {@link ABOuterHDFSBroadcastOfA}
 * for further details.
 *
 * @param conf the initial configuration
 * @param matrixInputPath path to matrix A
 * @param inMemMatrixDir path to matrix B (must be small enough to fit into memory)
 * @param matrixOutputPath path to which AxB will be written
 * @param inMemMatrixNumRows B rows
 * @param inMemMatrixNumCols B cols
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public void run(Configuration conf, String inMemMatrixDir, Path matrixInputPath, Path matrixOutputPath,
        int inMemMatrixNumRows, int inMemMatrixNumCols)
        throws IOException, InterruptedException, ClassNotFoundException {
    conf.set(MATRIXINMEMORY, inMemMatrixDir);
    conf.setInt(MATRIXINMEMORYROWS, inMemMatrixNumRows);
    conf.setInt(MATRIXINMEMORYCOLS, inMemMatrixNumCols);
    @SuppressWarnings("deprecation")
    Job job = new Job(conf);
    job.setJarByClass(ABOuterHDFSBroadcastOfA.class);
    job.setJobName(ABOuterHDFSBroadcastOfA.class.getSimpleName());
    FileSystem fs = FileSystem.get(matrixInputPath.toUri(), conf);
    matrixInputPath = fs.makeQualified(matrixInputPath);
    matrixOutputPath = fs.makeQualified(matrixOutputPath);
    FileInputFormat.addInputPath(job, matrixInputPath);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    FileOutputFormat.setOutputPath(job, matrixOutputPath);
    job.setMapperClass(MyMapper.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(VectorWritable.class);
    // ensures total order (when used with {@link MatrixOutputFormat})
    RowPartitioner.setPartitioner(job, RowPartitioner.IntRowPartitioner.class, inMemMatrixNumRows);
    job.setCombinerClass(AtBOuterStaticMapsideJoinJob.MyReducer.class);
    job.setReducerClass(AtBOuterStaticMapsideJoinJob.MyReducer.class);
    job.setOutputFormatClass(MatrixOutputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);
    job.submit();
    boolean res = job.waitForCompletion(true);
    if (!res)
        throw new IOException("Job failed!");
}
From source file: com.twitter.algebra.matrix.multiply.AtBOuterStaticMapsideJoinJob.java
License: Apache License

public void run(Configuration conf, Path atPath, Path bPath, Path outPath, int outCardinality)
        throws IOException, InterruptedException, ClassNotFoundException {
    conf.setInt(OUT_CARD, outCardinality);
    @SuppressWarnings("deprecation")
    Job job = new Job(conf);
    job.setJobName(AtBOuterStaticMapsideJoinJob.class.getSimpleName());
    job.setJarByClass(AtBOuterStaticMapsideJoinJob.class);
    FileSystem fs = FileSystem.get(atPath.toUri(), conf);
    atPath = fs.makeQualified(atPath);
    bPath = fs.makeQualified(bPath);
    job.setInputFormatClass(CompositeInputFormat.class);
    // mapside join expression
    job.getConfiguration().set(CompositeInputFormat.JOIN_EXPR,
            CompositeInputFormat.compose("inner", SequenceFileInputFormat.class, atPath, bPath));
    job.setOutputFormatClass(MatrixOutputFormat.class);
    outPath = fs.makeQualified(outPath);
    FileOutputFormat.setOutputPath(job, outPath);
    job.setMapperClass(MyMapper.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(VectorWritable.class);
    job.setCombinerClass(MyReducer.class);
    int numReducers = conf.getInt("algebra.reduceslots.multiply", 10);
    job.setNumReduceTasks(numReducers);
    job.setReducerClass(MyReducer.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);
    job.submit();
    boolean res = job.waitForCompletion(true);
    if (!res)
        throw new IOException("Job failed");
}
From source file: com.twitter.algebra.matrix.multiply.AtB_DMJ.java
License: Apache License

/**
 * Perform A x B, where At and B refer to the paths that contain matrices in
 * {@link SequenceFileInputFormat}. One of At and B must also conform with
 * {@link MapDir} format. Refer to {@link AtB_DMJ} for further details.
 *
 * @param conf the initial configuration
 * @param mapDirPath path to the matrix in {@link MapDir} format
 * @param matrixInputPaths the list of paths to matrix input partitions over which we iterate
 * @param matrixOutputPath path to which AxB will be written
 * @param atCols number of columns of At (rows of A)
 * @param bCols number of columns of B
 * @param colsPerPartition cols per partition of the input matrix (whether At or B)
 * @param aIsMapDir is A chosen to be loaded as MapDir
 * @param useInMemCombiner
 * @return the running job
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public Job run(Configuration conf, Path mapDirPath, Path matrixInputPaths, Path matrixOutputPath, int atCols,
        int bCols, int colsPerPartition, boolean aIsMapDir, boolean useInMemCombiner)
        throws IOException, InterruptedException, ClassNotFoundException {
    conf = new Configuration(conf);
    conf.set(MATRIXINMEMORY, mapDirPath.toString());
    conf.setBoolean(AISMAPDIR, aIsMapDir);
    conf.setBoolean(USEINMEMCOMBINER, useInMemCombiner);
    conf.setInt(RESULTROWS, atCols);
    conf.setInt(RESULTCOLS, bCols);
    conf.setInt(PARTITIONCOLS, colsPerPartition);
    FileSystem fs = FileSystem.get(matrixOutputPath.toUri(), conf);
    NMFCommon.setNumberOfMapSlots(conf, fs, matrixInputPaths, "dmj");
    if (useInMemCombiner) {
        Configuration newConf = new Configuration(conf);
        newConf.set("mapreduce.task.io.sort.mb", "1");
        conf = newConf;
    }
    @SuppressWarnings("deprecation")
    Job job = new Job(conf);
    job.setJarByClass(AtB_DMJ.class);
    job.setJobName(AtB_DMJ.class.getSimpleName());
    matrixOutputPath = fs.makeQualified(matrixOutputPath);
    matrixInputPaths = fs.makeQualified(matrixInputPaths);
    MultipleInputs.addInputPath(job, matrixInputPaths, SequenceFileInputFormat.class);
    FileOutputFormat.setOutputPath(job, matrixOutputPath);
    job.setMapperClass(MyMapper.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(VectorWritable.class);
    if (!useInMemCombiner)
        job.setCombinerClass(AtBOuterStaticMapsideJoinJob.MyReducer.class);
    int numReducers = NMFCommon.getNumberOfReduceSlots(conf, "dmj");
    job.setNumReduceTasks(numReducers);
    // ensures total order (when used with {@link MatrixOutputFormat})
    RowPartitioner.setPartitioner(job, RowPartitioner.IntRowPartitioner.class, atCols);
    job.setReducerClass(EpsilonReducer.class);
    job.setOutputFormatClass(MatrixOutputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);
    job.submit();
    return job;
}