List of usage examples for org.apache.hadoop.mapreduce.Job.setNumReduceTasks
public void setNumReduceTasks(int tasks) throws IllegalStateException
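For orientation before the examples, a minimal hedged sketch of the call itself (the class and job names here are hypothetical, not taken from any example below). The reducer count must be set before the job is submitted; calling the setter on a job that is already running is what raises the declared IllegalStateException, and passing 0 turns the job into a map-only job, a pattern several of the examples below rely on.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class ReduceCountSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "reduce-count-sketch"); // hypothetical job name
        job.setJarByClass(ReduceCountSketch.class);

        // Fixed reducer count: the framework runs exactly 8 reduce tasks,
        // producing at most 8 output partitions.
        job.setNumReduceTasks(8);

        // Alternatively, 0 reducers yields a map-only job: no shuffle/sort
        // phase, and each mapper writes directly through the output format.
        // job.setNumReduceTasks(0);
    }
}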
From source file:com.moz.fiji.mapreduce.output.FileMapReduceJobOutput.java
License:Apache License
/** {@inheritDoc} */
@Override
public void configure(Job job) throws IOException {
    super.configure(job);
    FileOutputFormat.setOutputPath(job, mFilePath);
    job.setNumReduceTasks(mNumSplits);
}
From source file:com.moz.fiji.mapreduce.output.framework.HFileReducerMapReduceJobOutput.java
License:Apache License
/** {@inheritDoc} */
@Override
public void configure(Job job) throws IOException {
    super.configure(job);

    // Sets the Hadoop output format.
    final Configuration conf = job.getConfiguration();
    conf.set(FijiConfKeys.FIJI_OUTPUT_TABLE_URI, mJobOutput.getOutputTableURI().toString());

    // Fiji table context:
    conf.setClass(FijiConfKeys.FIJI_TABLE_CONTEXT_CLASS, HFileWriterContext.class, FijiTableContext.class);

    // Set the output path.
    FileOutputFormat.setOutputPath(job, mJobOutput.getPath());

    job.setNumReduceTasks(mJobOutput.getNumReduceTasks());
}
From source file:com.moz.fiji.mapreduce.output.HFileMapReduceJobOutput.java
License:Apache License
/**
 * Configures the partitioner for generating HFiles.
 *
 * <p>Each generated HFile should fit within a region of the target table.
 * Additionally, it's optimal to have only one HFile to load into each region, since a
 * read from that region will require reading from each HFile under management (until
 * compaction happens and merges them all back into one HFile).</p>
 *
 * <p>To achieve this, we configure a TotalOrderPartitioner that will partition the
 * records output from the Mapper based on their rank in a total ordering of the
 * keys. The <code>startKeys</code> argument should contain a list of the first key in
 * each of those partitions.</p>
 *
 * @param job The job to configure.
 * @param startKeys A list of keys that will mark the boundaries between the partitions
 *     for the sorted map output records.
 * @throws IOException If there is an error.
 */
public static void configurePartitioner(Job job, List<HFileKeyValue> startKeys) throws IOException {
    FijiMRPlatformBridge.get().setTotalOrderPartitionerClass(job);

    LOG.info("Configuring " + startKeys.size() + " reduce partitions.");
    job.setNumReduceTasks(startKeys.size());

    // Write the file that the TotalOrderPartitioner reads to determine where to partition records.
    Path partitionFilePath = new Path(job.getWorkingDirectory(), "partitions_" + System.currentTimeMillis());
    LOG.info("Writing partition information to " + partitionFilePath);
    final FileSystem fs = partitionFilePath.getFileSystem(job.getConfiguration());
    partitionFilePath = partitionFilePath.makeQualified(fs);
    writePartitionFile(job.getConfiguration(), partitionFilePath, startKeys);

    // Add it to the distributed cache.
    try {
        final URI cacheUri = new URI(partitionFilePath.toString() + "#" + TotalOrderPartitioner.DEFAULT_PATH);
        DistributedCache.addCacheFile(cacheUri, job.getConfiguration());
    } catch (URISyntaxException e) {
        throw new IOException(e);
    }
    DistributedCache.createSymlink(job.getConfiguration());
}
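The example above routes through a Fiji platform bridge and the older DistributedCache API. As a point of comparison, here is a hedged sketch of the same idea, the reducer count matching the partition count, using only stock Hadoop classes; it assumes the partition file has already been written with one fewer split key than partitions, which is the TotalOrderPartitioner contract as I understand it.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;

public class TotalOrderSketch {
    /**
     * Wires a pre-written partition file into a job. The file should hold
     * (numPartitions - 1) split keys in sorted order, and the job then needs
     * exactly numPartitions reducers so each reducer covers one key range.
     */
    static void configureTotalOrder(Job job, Path partitionFile, int numPartitions) {
        job.setPartitionerClass(TotalOrderPartitioner.class);
        TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), partitionFile);
        job.setNumReduceTasks(numPartitions);
    }
}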
From source file:com.mozilla.hadoop.Backup.java
License:Apache License
/**
 * @param args
 * @return
 * @throws IOException
 * @throws ParseException
 */
public Job initJob(String[] args) throws IOException, ParseException {
    Path inputPath = null;
    Path loadPath = null;
    String outputPath = null;
    boolean useSpecifiedPaths = false;
    for (int idx = 0; idx < args.length; idx++) {
        if ("-f".equals(args[idx])) {
            useSpecifiedPaths = true;
            loadPath = new Path(args[++idx]);
        } else if (idx == args.length - 1) {
            outputPath = args[idx];
        } else {
            inputPath = new Path(args[idx]);
        }
    }

    Path mrOutputPath = new Path(NAME + "-results");

    conf.setBoolean("mapred.map.tasks.speculative.execution", false);
    conf.set("backup.input.path", inputPath.toString());
    conf.set("backup.output.path", outputPath);

    FileSystem inputFs = null;
    FileSystem outputFs = null;
    Path[] inputSources = null;
    try {
        inputFs = FileSystem.get(inputPath.toUri(), new Configuration());
        outputFs = FileSystem.get(getConf());
        if (useSpecifiedPaths) {
            inputSources = createInputSources(loadPaths(outputFs, loadPath), outputFs);
        } else {
            inputSources = createInputSources(getPaths(inputFs, inputPath, 0, 2), outputFs);
        }
    } finally {
        checkAndClose(inputFs);
        checkAndClose(outputFs);
    }

    Job job = new Job(getConf());
    job.setJobName(NAME);
    job.setJarByClass(Backup.class);
    job.setMapperClass(BackupMapper.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    // Map-only job: mapper output goes straight to the output format.
    job.setNumReduceTasks(0);
    job.setInputFormatClass(TextInputFormat.class);

    for (Path source : inputSources) {
        System.out.println("Adding input path: " + source.toString());
        FileInputFormat.addInputPath(job, source);
    }

    FileOutputFormat.setOutputPath(job, mrOutputPath);

    return job;
}
From source file:com.mozilla.main.ReadHBaseWriteHdfs.java
License:LGPL
@Override
public int run(String[] args) throws Exception {
    Configuration conf = new Configuration();
    conf.set("mapred.job.queue.name", "prod");
    Job job = new Job(conf, "ReadHBaseWriteHDFS");
    job.setJarByClass(ReadHBaseWriteHdfs.class);

    Scan scan = new Scan();
    scan.addFamily("data".getBytes());
    TableMapReduceUtil.initTableMapperJob(TABLE_NAME, scan, ReadHBaseWriteHdfsMapper.class, Text.class,
            Text.class, job);

    job.setReducerClass(ReadHBaseWriteHdfsReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setNumReduceTasks(1000);

    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
    SequenceFileOutputFormat.setOutputPath(job, new Path(args[0]));

    job.waitForCompletion(true);
    if (job.isSuccessful()) {
        System.out.println("DONE");
    }
    return 0;
}
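Worth noting alongside this example: the reducer count does not have to be hard-coded. A hedged sketch of the configuration-based equivalent follows (property names as commonly documented for Hadoop 2.x; verify against your version).

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class ReducerCountFromConf {
    public static void main(String[] args) throws Exception {
        // setNumReduceTasks(n) is backed by the "mapreduce.job.reduces"
        // property ("mapred.reduce.tasks" on Hadoop 1.x), so the same count
        // can be supplied externally, e.g. -Dmapreduce.job.reduces=1000.
        Configuration conf = new Configuration();
        conf.setInt("mapreduce.job.reduces", 1000);
        Job job = Job.getInstance(conf, "ReadHBaseWriteHDFS");
        System.out.println(job.getNumReduceTasks()); // 1000 unless reset later
    }
}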
From source file:com.mozilla.socorro.hadoop.CrashReportJob.java
License:LGPL
/**
 * @return
 * @throws IOException
 * @throws ParseException
 */
public static Job initJob(String jobName, Configuration conf, Class<?> mainClass,
        Class<? extends TableMapper> mapperClass, Class<? extends Reducer> combinerClass,
        Class<? extends Reducer> reducerClass, Map<byte[], byte[]> columns,
        Class<? extends WritableComparable> keyOut, Class<? extends Writable> valueOut,
        Path outputPath) throws IOException, ParseException {
    // Set both start/end time and start/stop row
    Calendar startCal = Calendar.getInstance();
    Calendar endCal = Calendar.getInstance();

    SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMdd");

    String startDateStr = conf.get(START_DATE);
    String endDateStr = conf.get(END_DATE);
    if (!StringUtils.isBlank(startDateStr)) {
        startCal.setTime(sdf.parse(startDateStr));
    }
    if (!StringUtils.isBlank(endDateStr)) {
        endCal.setTime(sdf.parse(endDateStr));
    }

    conf.setLong(START_TIME, startCal.getTimeInMillis());
    conf.setLong(END_TIME, DateUtil.getEndTimeAtResolution(endCal.getTimeInMillis(), Calendar.DATE));

    Job job = new Job(conf);
    job.setJobName(jobName);
    job.setJarByClass(mainClass);

    // input table configuration
    Scan[] scans = MultiScanTableMapReduceUtil.generateScans(startCal, endCal, columns, 100, false);
    MultiScanTableMapReduceUtil.initMultiScanTableMapperJob(TABLE_NAME_CRASH_REPORTS, scans, mapperClass,
            keyOut, valueOut, job);

    if (combinerClass != null) {
        job.setCombinerClass(combinerClass);
    }

    if (reducerClass != null) {
        job.setReducerClass(reducerClass);
    } else {
        // No reducer supplied: run as a map-only job.
        job.setNumReduceTasks(0);
    }

    FileOutputFormat.setOutputPath(job, outputPath);

    return job;
}
From source file:com.mozilla.socorro.hadoop.HardwareAccel.java
License:LGPL
/**
 * @param args
 * @return
 * @throws IOException
 * @throws ParseException
 */
public Job initJob(String[] args) throws IOException, ParseException {
    Map<byte[], byte[]> columns = new HashMap<byte[], byte[]>();
    columns.put(PROCESSED_DATA_BYTES, JSON_BYTES);
    Job job = CrashReportJob.initJob(NAME, getConf(), HardwareAccel.class, HardwareAccelMapper.class,
            IntSumReducer.class, IntSumReducer.class, columns, Text.class, IntWritable.class,
            new Path(args[0]));
    job.setNumReduceTasks(1);
    return job;
}
From source file:com.msd.gin.halyard.tools.HalyardParallelExport.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(newOption("h", null, "Prints this help"));
    options.addOption(newOption("v", null, "Prints version"));
    options.addOption(newOption("s", "source_htable", "Source HBase table with Halyard RDF store"));
    options.addOption(newOption("q", "sparql_query",
            "SPARQL tuple or graph query with use of '" + PARALLEL_SPLIT_FUNCTION_URI + "' function"));
    options.addOption(newOption("t", "target_url",
            "file://<path>/<file_name>{0}.<ext> or hdfs://<path>/<file_name>{0}.<ext> or jdbc:<jdbc_connection>/<table_name>"));
    options.addOption(newOption("p", "property=value", "JDBC connection properties"));
    options.addOption(newOption("l", "driver_classpath", "JDBC driver classpath delimited by ':'"));
    options.addOption(newOption("c", "driver_class", "JDBC driver class name"));
    try {
        CommandLine cmd = new PosixParser().parse(options, args);
        if (args.length == 0 || cmd.hasOption('h')) {
            printHelp(options);
            return -1;
        }
        if (cmd.hasOption('v')) {
            Properties p = new Properties();
            try (InputStream in = HalyardExport.class
                    .getResourceAsStream("/META-INF/maven/com.msd.gin.halyard/hbasesail/pom.properties")) {
                if (in != null) p.load(in);
            }
            System.out.println("Halyard Parallel Export version " + p.getProperty("version", "unknown"));
            return 0;
        }
        if (!cmd.getArgList().isEmpty())
            throw new ExportException("Unknown arguments: " + cmd.getArgList().toString());
        for (char c : "sqt".toCharArray()) {
            if (!cmd.hasOption(c))
                throw new ExportException("Missing mandatory option: " + c);
        }
        for (char c : "sqtlc".toCharArray()) {
            String s[] = cmd.getOptionValues(c);
            if (s != null && s.length > 1)
                throw new ExportException("Multiple values for option: " + c);
        }
        String source = cmd.getOptionValue('s');
        String query = cmd.getOptionValue('q');
        if (!query.contains(PARALLEL_SPLIT_FUNCTION_NAME)) {
            throw new ExportException("Parallel export SPARQL query must contain '"
                    + PARALLEL_SPLIT_FUNCTION_URI + "' function.");
        }
        String target = cmd.getOptionValue('t');
        if ((target.startsWith("file:") || target.startsWith("hdfs:")) && !target.contains("{0}")) {
            throw new ExportException(
                    "Parallel export file target must contain '{0}' counter in the file path or name.");
        }
        getConf().set(SOURCE, source);
        getConf().set(QUERY, query);
        getConf().set(TARGET, target);
        String driver = cmd.getOptionValue('c');
        if (driver != null) {
            getConf().set(JDBC_DRIVER, driver);
        }
        String props[] = cmd.getOptionValues('p');
        if (props != null) {
            for (int i = 0; i < props.length; i++) {
                props[i] = Base64.encodeBase64String(props[i].getBytes(UTF8));
            }
            getConf().setStrings(JDBC_PROPERTIES, props);
        }
        TableMapReduceUtil.addDependencyJars(getConf(), HalyardExport.class, NTriplesUtil.class, Rio.class,
                AbstractRDFHandler.class, RDFFormat.class, RDFParser.class, HTable.class,
                HBaseConfiguration.class, AuthenticationProtos.class, Trace.class);
        HBaseConfiguration.addHbaseResources(getConf());
        Job job = Job.getInstance(getConf(), "HalyardParallelExport " + source + " -> " + target);
        String cp = cmd.getOptionValue('l');
        if (cp != null) {
            String jars[] = cp.split(":");
            for (int i = 0; i < jars.length; i++) {
                File f = new File(jars[i]);
                if (!f.isFile())
                    throw new ExportException("Invalid JDBC driver classpath element: " + jars[i]);
                job.addFileToClassPath(new Path(f.toURI()));
                jars[i] = f.getName();
            }
            job.getConfiguration().setStrings(JDBC_CLASSPATH, jars);
        }
        job.setJarByClass(HalyardParallelExport.class);
        job.setMaxMapAttempts(1);
        job.setMapperClass(ParallelExportMapper.class);
        job.setMapOutputKeyClass(NullWritable.class);
        job.setMapOutputValueClass(Void.class);
        job.setNumReduceTasks(0);
        job.setInputFormatClass(IndexedInputFormat.class);
        job.setOutputFormatClass(NullOutputFormat.class);
        TableMapReduceUtil.initCredentials(job);
        if (job.waitForCompletion(true)) {
            LOG.info("Parallel Export Completed..");
            return 0;
        }
        return -1;
    } catch (RuntimeException exp) {
        System.out.println(exp.getMessage());
        printHelp(options);
        throw exp;
    }
}
From source file:com.mycompany.hadooptrain.WordCount.java
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
    Path inputPath = new Path(args[0]);
    Path outputDir = new Path(args[1]);

    // Create configuration
    Configuration conf = new Configuration(true);

    // Create job
    Job job = new Job(conf, "WordCount");
    job.setJarByClass(WordCountMapper.class);

    // Setup MapReduce
    job.setMapperClass(WordCountMapper.class);
    job.setReducerClass(WordCountReducer.class);
    job.setNumReduceTasks(1);

    // Specify key / value
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    // Input
    FileInputFormat.addInputPath(job, inputPath);
    job.setInputFormatClass(TextInputFormat.class);

    // Output
    FileOutputFormat.setOutputPath(job, outputDir);
    job.setOutputFormatClass(TextOutputFormat.class);

    // Delete output if exists
    FileSystem hdfs = FileSystem.get(conf);
    if (hdfs.exists(outputDir))
        hdfs.delete(outputDir, true);

    // Execute job
    int code = job.waitForCompletion(true) ? 0 : 1;
    System.exit(code);
}
From source file:com.nearinfinity.blur.mapreduce.BlurTask.java
License:Apache License
public Job configureJob(Configuration configuration) throws IOException {
    if (getIndexingType() == INDEXING_TYPE.UPDATE) {
        checkTable();
    }
    ByteArrayOutputStream os = new ByteArrayOutputStream();
    DataOutputStream output = new DataOutputStream(os);
    write(output);
    output.close();
    String blurTask = new String(Base64.encodeBase64(os.toByteArray()));
    configuration.set(BLUR_BLURTASK, blurTask);

    Job job = new Job(configuration, "Blur Indexer");
    job.setReducerClass(BlurReducer.class);
    job.setOutputKeyClass(BytesWritable.class);
    job.setOutputValueClass(BlurMutate.class);
    job.setNumReduceTasks(getNumReducers(configuration));
    return job;
}