List of usage examples for org.apache.hadoop.mapreduce.Job#setPartitionerClass
public void setPartitionerClass(Class<? extends Partitioner> cls) throws IllegalStateException
From source file:com.twitter.algebra.matrix.format.RowPartitioner.java
License:Apache License
/**
 * Registers a {@link RowPartitioner} implementation on the given job and records
 * the number of keys in the job configuration.
 *
 * @param job       the job to configure
 * @param pClass    the partitioner class; must inherit from RowPartitioner
 * @param totalKeys total number of rows, written to the configuration under TOTAL_KEYS
 */
@SuppressWarnings("rawtypes")
public static void setPartitioner(
    Job job,
    Class<? extends RowPartitioner> pClass,
    int totalKeys) {
  // Install the partitioner first, then publish the key count through the configuration.
  job.setPartitionerClass(pClass);
  job.getConfiguration().setInt(TOTAL_KEYS, totalKeys);
}
From source file:com.twitter.algebra.nmf.RowColPartitioner.java
License:Apache License
/** * Configure a job to use {@link RowColPartitioner} * /*from www.j a va 2 s .co m*/ * @param job the job * @param pClass the class that inherits RowPartitioner * @param totalRows total number of rows */ @SuppressWarnings("rawtypes") public static void setPartitioner(Job job, Class<? extends RowColPartitioner> pClass, int totalRows, int totalCols, int totalColPartitions) { job.setPartitionerClass(pClass); job.getConfiguration().setInt(TOTAL_ROWS, totalRows); job.getConfiguration().setInt(TOTAL_COLS, totalCols); job.getConfiguration().setInt(TOTAL_COL_PARTITIONS, totalColPartitions); }
From source file:com.twitter.elephanttwin.indexing.AbstractBlockIndexingJob.java
License:Open Source License
/** * Sets up various job properites required for the indexing job. * If your implementation needs to mess with the conf, you can do so by overriding * this method (remember to call super.setupJob()!) or in setMapper(). * @param conf/*from w ww . ja v a 2s.c o m*/ * @return * @throws IOException */ protected Job setupJob(Configuration conf) throws IOException { Job job = new Job(new Configuration(conf)); job.setJarByClass(getClass()); job.setInputFormatClass(BlockIndexedFileInputFormat.class); job.setReducerClass(MapFileIndexingReducer.class); job.setMapOutputKeyClass(TextLongPairWritable.class); job.setMapOutputValueClass(LongPairWritable.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(ListLongPair.class); job.setPartitionerClass(TextLongPairWritable.Parititioner.class); job.setSortComparatorClass(TextLongPairWritable.PairComparator.class); job.setGroupingComparatorClass(TextLongPairWritable.KeyOnlyComparator.class); job.setOutputFormatClass(MapFileOutputFormat.class); job.setNumReduceTasks(getNumPartitions()); BlockIndexedFileInputFormat.setIndexOptions(job, getInputFormat(), getValueClass(), getIndex(), getColumnName()); return job; }
From source file:com.veera.secondarysort.demo2.SsJob.java
License:Apache License
/**
 * Configures and runs the "secondary sort" job.
 *
 * @param args command-line arguments; not read by this method
 * @return 0 if the job completed successfully, 1 otherwise
 * @throws Exception if job setup or execution fails
 */
@Override
public int run(String[] args) throws Exception {
  Configuration conf = getConf();
  Job job = new Job(conf, "secondary sort");
  job.setJarByClass(SsJob.class);

  // Secondary-sort wiring: partitioner, grouping comparator and sort comparator.
  job.setPartitionerClass(NaturalKeyPartitioner.class);
  job.setGroupingComparatorClass(NaturalKeyGroupingComparator.class);
  job.setSortComparatorClass(CompositeKeyComparator.class);

  job.setMapOutputKeyClass(StockKey.class);
  job.setMapOutputValueClass(DoubleWritable.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);

  job.setInputFormatClass(TextInputFormat.class);
  job.setOutputFormatClass(TextOutputFormat.class);

  job.setMapperClass(SsMapper.class);
  job.setReducerClass(SsReducer.class);

  // NOTE(review): no input/output paths are set here; presumably they come from the
  // configuration supplied via getConf() — confirm.

  // Propagate the job outcome instead of always reporting success.
  return job.waitForCompletion(true) ? 0 : 1;
}
From source file:com.yahoo.glimmer.indexing.generator.TripleIndexGenerator.java
License:Open Source License
public int run(String[] args) throws Exception { SimpleJSAP jsap = new SimpleJSAP(TripleIndexGenerator.class.getName(), "Generates a keyword index from RDF data.", new Parameter[] { new Switch(NO_CONTEXTS_ARG, 'C', "withoutContexts", "Don't process the contexts for each tuple."), new FlaggedOption(METHOD_ARG, JSAP.STRING_PARSER, "horizontal", JSAP.REQUIRED, 'm', METHOD_ARG, "horizontal or vertical."), new FlaggedOption(PREDICATES_ARG, JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'p', PREDICATES_ARG, "Subset of the properties to be indexed."), new FlaggedOption(RESOURCE_PREFIX_ARG, JSAP.STRING_PARSER, "@", JSAP.NOT_REQUIRED, 'r', RESOURCE_PREFIX_ARG, "Prefix to add to object resource hash values when indexing. Stops queries for numbers matching resource hash values. Default is '@'"), new UnflaggedOption("input", JSAP.STRING_PARSER, JSAP.REQUIRED, "HDFS location for the input data."), new UnflaggedOption(NUMBER_OF_DOCS_ARG, JSAP.LONG_PARSER, JSAP.REQUIRED, "Number of documents to index"), new UnflaggedOption("output", JSAP.STRING_PARSER, JSAP.REQUIRED, "HDFS location for the output."), new UnflaggedOption(RESOURCES_HASH_ARG, JSAP.STRING_PARSER, JSAP.REQUIRED, "HDFS location of the resources hash file."), }); JSAPResult jsapResult = jsap.parse(args); // check whether the command line was valid, and if it wasn't, // display usage information and exit. if (!jsapResult.success()) { System.err.println();//from w w w. 
java 2 s.co m System.err.println("Usage: java " + TripleIndexGenerator.class.getName()); System.err.println(" " + jsap.getUsage()); System.err.println(); System.exit(1); } Job job = Job.getInstance(getConf()); job.setJarByClass(TripleIndexGenerator.class); job.setJobName("TripleIndexGenerator" + System.currentTimeMillis()); FileInputFormat.setInputPaths(job, new Path(jsapResult.getString("input"))); job.setInputFormatClass(TextInputFormat.class); job.setMapperClass(DocumentMapper.class); job.setMapOutputKeyClass(TermKey.class); job.setMapOutputValueClass(TermValue.class); job.setPartitionerClass(TermKey.FirstPartitioner.class); job.setGroupingComparatorClass(TermKey.FirstGroupingComparator.class); job.setReducerClass(TermReduce.class); job.setOutputKeyClass(IntWritable.class); job.setOutputValueClass(IndexRecordWriterValue.class); job.setOutputFormatClass(IndexRecordWriter.OutputFormat.class); FileOutputFormat.setOutputPath(job, new Path(jsapResult.getString("output"))); Configuration conf = job.getConfiguration(); conf.setClass("mapred.output.key.comparator.class", TermKey.Comparator.class, WritableComparator.class); conf.set("mapreduce.user.classpath.first", "true"); long numDocs = jsapResult.getLong(NUMBER_OF_DOCS_ARG); conf.setLong(NUMBER_OF_DOCUMENTS, numDocs); // Set this in a attempt to get around the 2GB of ram task limit on our cluster. 
// Setting this in the hope of fixing Direct buffer memory errors conf.setInt(INDEX_WRITER_CACHE_SIZE, 1024 * 1024); conf.set(OUTPUT_DIR, jsapResult.getString("output")); boolean withContexts = !jsapResult.getBoolean(NO_CONTEXTS_ARG, false); if (jsapResult.getString(METHOD_ARG).equalsIgnoreCase(METHOD_ARG_VALUE_HORIZONTAL)) { HorizontalDocumentFactory.setupConf(conf, withContexts, jsapResult.getString(RESOURCES_HASH_ARG), jsapResult.getString(RESOURCE_PREFIX_ARG)); } else if (jsapResult.getString(METHOD_ARG).equalsIgnoreCase(METHOD_ARG_VALUE_VERTICAL)) { if (!jsapResult.contains(PREDICATES_ARG)) { throw new IllegalArgumentException("When '" + METHOD_ARG + "' is '" + METHOD_ARG_VALUE_VERTICAL + "' you have to give a predicates file too."); } VerticalDocumentFactory.setupConf(conf, withContexts, jsapResult.getString(RESOURCES_HASH_ARG), jsapResult.getString(RESOURCE_PREFIX_ARG), jsapResult.getString(PREDICATES_ARG)); } else { throw new IllegalArgumentException(METHOD_ARG + " should be '" + METHOD_ARG_VALUE_HORIZONTAL + "' or '" + METHOD_ARG_VALUE_VERTICAL + "'"); } conf.setInt("mapreduce.input.linerecordreader.line.maxlength", 1024 * 1024); boolean success = job.waitForCompletion(true); return success ? 0 : 1; }
From source file:com.yahoo.ycsb.bulk.hbase.BulkDataGeneratorJob.java
License:Apache License
/** * Parameters for bulk loader specified through the config file: * * - prefix for the row keys/* ww w . j a va 2s . c o m*/ * - range start * - range end (inclusive) * - num splits (or number of partitions). * - user * - password * - table * * For the accepted default options * @see org.apache.hadoop.util.Tool#run(java.lang.String[]) */ public int run(String[] args) throws Exception { Configuration conf = this.getConf(); Util.printArgs("run", args, System.err); printKeyValues(conf, ARG_KEYS, System.err); if (args.length > 1 || (args.length == 1 && "-help".compareToIgnoreCase(args[0]) == 0)) { System.err.println("Usage: " + this.getClass().getName() + "input_path [generic options] [input_paths...] ouptut_path"); GenericOptionsParser.printGenericCommandUsage(System.err); return 1; } // Time run long startTime = System.currentTimeMillis(); String workdir; if (args.length == 1) { /* override workdir in the config if it is specified in the * command line */ conf.set(ARG_KEY_OUTDIR, args[0]); workdir = args[0]; } workdir = conf.get(ARG_KEY_OUTDIR); if (workdir == null) { System.err.println("No output directory specified"); return 1; } /* Initialize job, check parameters and decide which mapper to use */ Job job = new Job(conf, conf.get(ARG_KEY_JOBNAME, "YCSB KV data generator")); /* these settings are the same (i.e., fixed) independent of the * parameters */ job.setJarByClass(this.getClass()); // job.setInputFormatClass(TextInputFormat.class); job.setInputFormatClass(NLineInputFormat.class); /* these settings should depend on the type of output file */ job.setOutputFormatClass(HFileOutputFormat.class); /* not sure the next two are needed */ job.setOutputKeyClass(ImmutableBytesWritable.class); job.setOutputValueClass(KeyValue.class); this.createInputFile(job, workdir); HFileOutputFormat.setOutputPath(job, new Path(workdir + "/files")); /* depending on whether the keys need to be sorted and hashed, then * decide which mapper and reducer to use */ boolean hashKeys = 
conf.getBoolean(ARG_KEY_HASH_KEYS, false); boolean sortKeys = conf.getBoolean(ARG_KEY_SORTKEYS, true); /* get splits file name: side-effect -> this may generate a splits file */ String splitsfile = this.getSplitsFile(job, workdir); if (sortKeys && hashKeys) { /* do a full map reduce job */ job.setMapperClass(RowGeneratorMapper.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(Text.class); job.setPartitionerClass(RangePartitioner.class); if (splitsfile == null) { /* Auto generate the splits file either from: * - the input key ranges * - from the current table splits */ throw new InvalidInputException("No splits specified"); } /* Set splits file */ RangePartitioner.setSplitFile(job, splitsfile); /* Add reducer (based on mapper code) */ job.setReducerClass(RowGeneratorReduce.class); /* the number of reducers is dependent on the number of * partitions */ int numReduce = conf.getInt(ARG_KEY_NUMREDUCE, 1); job.setNumReduceTasks(numReduce); } else { /* perform a map only job */ job.setMapperClass(RowGeneratorMapOnly.class); /* map output key and value types are the same as * for the job */ job.setMapOutputKeyClass(job.getOutputKeyClass()); job.setMapOutputValueClass(job.getOutputValueClass()); job.setNumReduceTasks(0); } job.waitForCompletion(true); // JobClient.runJob(conf); SimpleDateFormat df = new SimpleDateFormat("yyyy.MM.dd HH:mm:ss.SSS z"); SimpleDateFormat ddf = new SimpleDateFormat("HH:mm:ss.SSS"); ddf.setTimeZone(TimeZone.getTimeZone("UTC")); long endTime = System.currentTimeMillis(); System.out.println("Start time (ms): " + df.format(new Date(startTime)) + " -- " + startTime); System.out.println("End time (ms): " + df.format(new Date(endTime)) + " -- " + endTime); System.out .println("Elapsed time (ms): " + ddf.format(endTime - startTime) + " -- " + (endTime - startTime)); return 0; }
From source file:counting.WordCount.java
License:Apache License
public static void main(String[] args) throws Exception { Configuration conf = new Configuration(); String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); final int NUMBER_OF_NODES = 31; final int MAX_NUMBER_OF_TASKS = 1000; final double REDUCER_CONSTANT = 0.95; // or 1.75 if (otherArgs.length < 5) { System.err.println(// ww w . j a v a 2 s .co m "Usage: wordcount <in> [<in>...] <out> <ngram> <combiner:yes/no> <custom partioner:yes/no>"); System.exit(2); } Job job = Job.getInstance(conf, "Word count"); // Setting map and reduce tasks //conf.setNumMapTasks(5); // Not possible with code in line? int NUMBER_OF_REDUCERS = (int) REDUCER_CONSTANT * NUMBER_OF_NODES * MAX_NUMBER_OF_TASKS; //System.out.println("Number of Reducers: " + NUMBER_OF_REDUCERS); job.setNumReduceTasks(12); // Placeholder job.setJarByClass(WordCount.class); job.setMapperClass(nGramMapper.class); nGramMapper.setN(Integer.parseInt(otherArgs[otherArgs.length - 3])); // Set ngram length System.out.println("n = " + nGramMapper.getN()); System.out.println("Combiner = " + otherArgs[otherArgs.length - 2]); System.out.println("Custom Partitioner = " + otherArgs[otherArgs.length - 1]); System.out.println("Number of reducers = " + NUMBER_OF_NODES); if (otherArgs[otherArgs.length - 2].equals("yes")) { job.setCombinerClass(IntSumReducer.class); } if (otherArgs[otherArgs.length - 1].equals("yes")) { job.setPartitionerClass(CustomPartitioner.class); //CustomPartitioner.setNumberOfReducers(NUMBER_OF_REDUCERS); } job.setReducerClass(IntSumReducer.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); // Input paths for (int i = 0; i < otherArgs.length - 4; ++i) { FileInputFormat.addInputPath(job, new Path(otherArgs[i])); } // Output paths FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 4])); System.exit(job.waitForCompletion(true) ? 0 : 1); }
From source file:crunch.MaxTemperature.java
License:Apache License
/**
 * Builds a job from the command-line input/output arguments, sets every job
 * property explicitly, and runs it.
 *
 * @param args command-line arguments handed to JobBuilder.parseInputAndOutput
 * @return -1 if no job could be built from the arguments, 0 if the job succeeds, 1 if it fails
 * @throws Exception if job setup or execution fails
 */
@Override
public int run(String[] args) throws Exception {
  final Job identityJob = JobBuilder.parseInputAndOutput(this, getConf(), args);
  if (identityJob == null) {
    return -1;
  }

  // Input side
  identityJob.setInputFormatClass(TextInputFormat.class);

  // Map side
  identityJob.setMapperClass(Mapper.class);
  identityJob.setMapOutputKeyClass(LongWritable.class);
  identityJob.setMapOutputValueClass(Text.class);

  // Shuffle
  identityJob.setPartitionerClass(HashPartitioner.class);

  // Reduce side
  identityJob.setNumReduceTasks(1);
  identityJob.setReducerClass(Reducer.class);
  identityJob.setOutputKeyClass(LongWritable.class);
  identityJob.setOutputValueClass(Text.class);

  // Output side
  identityJob.setOutputFormatClass(TextOutputFormat.class);

  final boolean completed = identityJob.waitForCompletion(true);
  if (completed) {
    return 0;
  }
  return 1;
}
From source file:crunch.MaxTemperature.java
License:Apache License
/**
 * Runs the job that joins weather records with station names.
 *
 * @param args exactly three arguments: NCDC input path, station input path, output path
 * @return -1 on bad usage, 0 if the job succeeds, 1 if it fails
 * @throws Exception if job setup or execution fails
 */
@Override
public int run(String[] args) throws Exception {
  if (args.length != 3) {
    JobBuilder.printUsage(this, "<ncdc input> <station input> <output>");
    return -1;
  }

  final Job joinJob = new Job(getConf(), "Join weather records with station names");
  joinJob.setJarByClass(getClass());

  final Path weatherRecords = new Path(args[0]);
  final Path stationMetadata = new Path(args[1]);
  final Path joinedOutput = new Path(args[2]);

  // Each input gets its own mapper.
  MultipleInputs.addInputPath(joinJob, weatherRecords, TextInputFormat.class,
      JoinRecordMapper.class);
  MultipleInputs.addInputPath(joinJob, stationMetadata, TextInputFormat.class,
      JoinStationMapper.class);
  FileOutputFormat.setOutputPath(joinJob, joinedOutput);

  // Partitioner and grouping comparator for the TextPair map output key.
  joinJob.setPartitionerClass(KeyPartitioner.class);
  joinJob.setGroupingComparatorClass(TextPair.FirstComparator.class);

  joinJob.setMapOutputKeyClass(TextPair.class);
  joinJob.setReducerClass(JoinReducer.class);
  joinJob.setOutputKeyClass(Text.class);

  final boolean completed = joinJob.waitForCompletion(true);
  if (completed) {
    return 0;
  }
  return 1;
}
From source file:crunch.MaxTemperature.java
License:Apache License
/**
 * Builds a job from the command-line input/output arguments, installs a custom
 * partitioner, sort comparator and grouping comparator, and runs it.
 *
 * @param args command-line arguments handed to JobBuilder.parseInputAndOutput
 * @return -1 if no job could be built from the arguments, 0 if the job succeeds, 1 if it fails
 * @throws Exception if job setup or execution fails
 */
@Override
public int run(String[] args) throws Exception {
  final Job sortJob = JobBuilder.parseInputAndOutput(this, getConf(), args);
  if (sortJob == null) {
    return -1;
  }

  sortJob.setMapperClass(MaxTemperatureMapper.class);

  // Custom partitioning, sorting and grouping of the map output.
  sortJob.setPartitionerClass(FirstPartitioner.class);
  sortJob.setSortComparatorClass(KeyComparator.class);
  sortJob.setGroupingComparatorClass(GroupComparator.class);

  sortJob.setReducerClass(MaxTemperatureReducer.class);
  sortJob.setOutputKeyClass(IntPair.class);
  sortJob.setOutputValueClass(NullWritable.class);

  final boolean completed = sortJob.waitForCompletion(true);
  if (completed) {
    return 0;
  }
  return 1;
}