List of usage examples for org.apache.hadoop.mapreduce.Job#setPartitionerClass
public void setPartitionerClass(Class<? extends Partitioner> cls) throws IllegalStateException
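setPartitionerClass registers the Partitioner that decides which reduce task receives each map output record; it throws IllegalStateException if the job has already been submitted. Before the project-specific examples below, a minimal sketch of the common pattern (MyPartitioner and the Text/IntWritable key and value types are illustrative choices, not part of the Hadoop API):

// A minimal sketch, not taken from any project below: hash-partition Text keys
// across the configured number of reducers.
public static class MyPartitioner extends org.apache.hadoop.mapreduce.Partitioner<Text, IntWritable> {
    @Override
    public int getPartition(Text key, IntWritable value, int numPartitions) {
        // Mask the sign bit so the partition index is always non-negative.
        return (key.hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
}

// Wiring it into a job; the partitioner only takes effect with more than one reducer.
job.setPartitionerClass(MyPartitioner.class);
job.setNumReduceTasks(4);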
From source file:io.druid.indexer.IndexGeneratorJob.java
License:Apache License
public boolean run() {
    try {
        Job job = Job.getInstance(new Configuration(), String.format("%s-index-generator-%s",
                config.getDataSource(), config.getIntervals()));
        job.getConfiguration().set("io.sort.record.percent", "0.23");

        JobHelper.injectSystemProperties(job);
        config.addJobProperties(job);

        job.setMapperClass(IndexGeneratorMapper.class);
        job.setMapOutputValueClass(BytesWritable.class);

        SortableBytes.useSortableBytesAsMapOutputKey(job);

        int numReducers = Iterables.size(config.getAllBuckets().get());
        if (numReducers == 0) {
            throw new RuntimeException("No buckets?? seems there is no data to index.");
        }

        if (config.getSchema().getTuningConfig().getUseCombiner()) {
            job.setCombinerClass(IndexGeneratorCombiner.class);
            job.setCombinerKeyGroupingComparatorClass(BytesWritable.Comparator.class);
        }

        job.setNumReduceTasks(numReducers);
        job.setPartitionerClass(IndexGeneratorPartitioner.class);

        setReducerClass(job);
        job.setOutputKeyClass(BytesWritable.class);
        job.setOutputValueClass(Text.class);
        job.setOutputFormatClass(IndexGeneratorOutputFormat.class);
        FileOutputFormat.setOutputPath(job, config.makeIntermediatePath());

        config.addInputPaths(job);

        // Hack to get the druid.processing.bitmap property passed down to the Hadoop job.
        // Once IndexIO doesn't rely on globally injected properties, this can move into
        // the HadoopTuningConfig.
        final String bitmapProperty = "druid.processing.bitmap.type";
        final String bitmapType = HadoopDruidIndexerConfig.properties.getProperty(bitmapProperty);
        if (bitmapType != null) {
            for (String property : new String[] { "mapreduce.reduce.java.opts", "mapreduce.map.java.opts" }) {
                // Prepend the property so it can still be overridden via hadoop.xxx properties
                // by JobHelper.injectSystemProperties above.
                String value = Strings.nullToEmpty(job.getConfiguration().get(property));
                job.getConfiguration().set(property,
                        String.format("-D%s=%s %s", bitmapProperty, bitmapType, value));
            }
        }

        config.intoConfiguration(job);
        JobHelper.setupClasspath(JobHelper.distributedClassPath(config.getWorkingPath()),
                JobHelper.distributedClassPath(config.makeIntermediatePath()), job);

        job.submit();
        log.info("Job %s submitted, status available at %s", job.getJobName(), job.getTrackingURL());

        boolean success = job.waitForCompletion(true);

        Counter invalidRowCount = job.getCounters()
                .findCounter(HadoopDruidIndexerConfig.IndexJobCounters.INVALID_ROW_COUNTER);
        jobStats.setInvalidRowCount(invalidRowCount.getValue());

        return success;
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}
From source file:io.fluo.stress.trie.Init.java
License:Apache License
private int buildTree(int nodeSize, FluoConfiguration props, Path tmp, int stopLevel) throws Exception {
    Job job = Job.getInstance(getConf());

    job.setJarByClass(Init.class);
    job.setJobName(Init.class.getName() + "_load");

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(LongWritable.class);

    job.getConfiguration().setInt(TRIE_NODE_SIZE_PROP, nodeSize);
    job.getConfiguration().setInt(TRIE_STOP_LEVEL_PROP, stopLevel);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    SequenceFileInputFormat.addInputPath(job, new Path(tmp, "nums"));

    job.setMapperClass(InitMapper.class);
    job.setCombinerClass(InitCombiner.class);
    job.setReducerClass(InitReducer.class);

    job.setOutputFormatClass(AccumuloFileOutputFormat.class);
    job.setPartitionerClass(RangePartitioner.class);

    FileSystem fs = FileSystem.get(job.getConfiguration());
    Connector conn = AccumuloUtil.getConnector(props);

    Path splitsPath = new Path(tmp, "splits.txt");
    Collection<Text> splits1 = writeSplits(props, fs, conn, splitsPath);
    RangePartitioner.setSplitFile(job, splitsPath.toString());
    job.setNumReduceTasks(splits1.size() + 1);

    Path outPath = new Path(tmp, "out");
    AccumuloFileOutputFormat.setOutputPath(job, outPath);

    boolean success = job.waitForCompletion(true);

    if (success) {
        Path failPath = new Path(tmp, "failures");
        fs.mkdirs(failPath);
        conn.tableOperations().importDirectory(props.getAccumuloTable(), outPath.toString(),
                failPath.toString(), false);
    }
    return success ? 0 : 1;
}
From source file:ipldataanalysis3.IPLDataAnalysis3.java
@Override
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        System.out.printf("Two parameters are required for Data Analysis for IPL - <input dir> <output dir>\n");
        return -1;
    }

    Job job = new Job(getConf(), "Job1");
    job.setJarByClass(IPLDataAnalysis3.class);

    FileInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(DataAnalysisMapper.class);
    job.setNumReduceTasks(13);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setPartitionerClass(DataAnalysisPartitioner.class);
    job.setReducerClass(DataAnalysisReducer.class);

    boolean success = job.waitForCompletion(true);
    return success ? 0 : 1;
}
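The DataAnalysisPartitioner class itself is not shown in this example. A plausible sketch of a partitioner matched to the job's 13 reducers (the class name, the comma-split field, and the bucketing rule are assumptions for illustration, not the project's actual logic):

// Hypothetical sketch only: route each record to one of the reducers based on
// the first comma-separated field of the key.
public static class DataAnalysisPartitionerSketch extends Partitioner<Text, IntWritable> {
    @Override
    public int getPartition(Text key, IntWritable value, int numPartitions) {
        String firstField = key.toString().split(",")[0];
        return (firstField.hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
}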
From source file:it.crs4.pydoop.mapreduce.pipes.CommandLineParser.java
License:Apache License
private static void setupPipesJob(Job job) throws IOException, ClassNotFoundException {
    Configuration conf = job.getConfiguration();
    // default map output types to Text
    if (!getIsJavaMapper(conf)) {
        job.setMapperClass(PipesMapper.class);
        // Save the user's partitioner and hook in ours.
        setJavaPartitioner(conf, job.getPartitionerClass());
        job.setPartitionerClass(PipesPartitioner.class);
    }
    if (!getIsJavaReducer(conf)) {
        job.setReducerClass(PipesReducer.class);
        if (!getIsJavaRecordWriter(conf)) {
            job.setOutputFormatClass(NullOutputFormat.class);
        }
    }
    String textClassname = Text.class.getName();
    setIfUnset(conf, MRJobConfig.MAP_OUTPUT_KEY_CLASS, textClassname);
    setIfUnset(conf, MRJobConfig.MAP_OUTPUT_VALUE_CLASS, textClassname);
    setIfUnset(conf, MRJobConfig.OUTPUT_KEY_CLASS, textClassname);
    setIfUnset(conf, MRJobConfig.OUTPUT_VALUE_CLASS, textClassname);

    // Use PipesNonJavaInputFormat if necessary to handle progress reporting
    // from C++ RecordReaders ...
    if (!getIsJavaRecordReader(conf) && !getIsJavaMapper(conf)) {
        conf.setClass(Submitter.INPUT_FORMAT, job.getInputFormatClass(), InputFormat.class);
        job.setInputFormatClass(PipesNonJavaInputFormat.class);
    }

    if (avroInput != null) {
        if (explicitInputFormat) {
            conf.setClass(Submitter.INPUT_FORMAT, job.getInputFormatClass(), InputFormat.class);
        } // else let the bridge fall back to the appropriate Avro IF
        switch (avroInput) {
        case K:
            job.setInputFormatClass(PydoopAvroInputKeyBridge.class);
            break;
        case V:
            job.setInputFormatClass(PydoopAvroInputValueBridge.class);
            break;
        case KV:
            job.setInputFormatClass(PydoopAvroInputKeyValueBridge.class);
            break;
        default:
            throw new IllegalArgumentException("Bad Avro input type");
        }
    }

    if (avroOutput != null) {
        if (explicitOutputFormat) {
            conf.setClass(Submitter.OUTPUT_FORMAT, job.getOutputFormatClass(), OutputFormat.class);
        } // else let the bridge fall back to the appropriate Avro OF
        conf.set(props.getProperty("AVRO_OUTPUT"), avroOutput.name());
        switch (avroOutput) {
        case K:
            job.setOutputFormatClass(PydoopAvroOutputKeyBridge.class);
            break;
        case V:
            job.setOutputFormatClass(PydoopAvroOutputValueBridge.class);
            break;
        case KV:
            job.setOutputFormatClass(PydoopAvroOutputKeyValueBridge.class);
            break;
        default:
            throw new IllegalArgumentException("Bad Avro output type");
        }
    }

    String exec = getExecutable(conf);
    if (exec == null) {
        String msg = "No application program defined.";
        throw new IllegalArgumentException(msg);
    }
    // Add the default debug script only when the executable is expressed as
    // <path>#<executable>.
    // FIXME: this is kind of useless if the pipes program is not in C++.
    if (exec.contains("#")) {
        // set default gdb commands for map and reduce tasks
        String defScript = "$HADOOP_PREFIX/src/c++/pipes/debug/pipes-default-script";
        setIfUnset(conf, MRJobConfig.MAP_DEBUG_SCRIPT, defScript);
        setIfUnset(conf, MRJobConfig.REDUCE_DEBUG_SCRIPT, defScript);
    }
    URI[] fileCache = DistributedCache.getCacheFiles(conf);
    if (fileCache == null) {
        fileCache = new URI[1];
    } else {
        URI[] tmp = new URI[fileCache.length + 1];
        System.arraycopy(fileCache, 0, tmp, 1, fileCache.length);
        fileCache = tmp;
    }
    try {
        fileCache[0] = new URI(exec);
    } catch (URISyntaxException e) {
        String msg = "Problem parsing executable URI " + exec;
        IOException ie = new IOException(msg);
        ie.initCause(e);
        throw ie;
    }
    DistributedCache.setCacheFiles(fileCache, conf);
}
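The save-and-hook step above (setJavaPartitioner(conf, job.getPartitionerClass()) followed by job.setPartitionerClass(PipesPartitioner.class)) is a delegation pattern: the installed partitioner looks up the saved user class at runtime and forwards to it. A minimal sketch of the idea with hypothetical names; the real PipesPartitioner additionally handles partition numbers pushed from the C++ side:

// Hypothetical delegating partitioner: reads the user's saved class from the
// Configuration and forwards getPartition() calls to an instance of it.
public static class DelegatingPartitioner extends Partitioner<Text, Text> implements Configurable {
    private Configuration conf;
    private Partitioner<Text, Text> delegate;

    @Override
    @SuppressWarnings("unchecked")
    public void setConf(Configuration conf) {
        this.conf = conf;
        // "example.saved.partitioner" is an assumed config key, not the one Pipes uses.
        Class<? extends Partitioner> cls = conf.getClass("example.saved.partitioner",
                HashPartitioner.class, Partitioner.class);
        delegate = (Partitioner<Text, Text>) ReflectionUtils.newInstance(cls, conf);
    }

    @Override
    public Configuration getConf() {
        return conf;
    }

    @Override
    public int getPartition(Text key, Text value, int numPartitions) {
        return delegate.getPartition(key, value, numPartitions);
    }
}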
From source file:it.crs4.pydoop.mapreduce.pipes.CommandLineParser.java
License:Apache License
public int run(String[] args) throws Exception {
    CommandLineParser cli = new CommandLineParser();
    if (args.length == 0) {
        cli.printUsage();
        return 1;
    }
    try {
        Job job = new Job(new Configuration());
        job.setJobName(getClass().getName());
        Configuration conf = job.getConfiguration();
        CommandLine results = cli.parse(conf, args);

        if (results.hasOption("input")) {
            Path path = new Path(results.getOptionValue("input"));
            FileInputFormat.setInputPaths(job, path);
        }
        if (results.hasOption("output")) {
            Path path = new Path(results.getOptionValue("output"));
            FileOutputFormat.setOutputPath(job, path);
        }
        if (results.hasOption("jar")) {
            job.setJar(results.getOptionValue("jar"));
        }
        if (results.hasOption("inputformat")) {
            explicitInputFormat = true;
            setIsJavaRecordReader(conf, true);
            job.setInputFormatClass(getClass(results, "inputformat", conf, InputFormat.class));
        }
        if (results.hasOption("javareader")) {
            setIsJavaRecordReader(conf, true);
        }
        if (results.hasOption("map")) {
            setIsJavaMapper(conf, true);
            job.setMapperClass(getClass(results, "map", conf, Mapper.class));
        }
        if (results.hasOption("partitioner")) {
            job.setPartitionerClass(getClass(results, "partitioner", conf, Partitioner.class));
        }
        if (results.hasOption("reduce")) {
            setIsJavaReducer(conf, true);
            job.setReducerClass(getClass(results, "reduce", conf, Reducer.class));
        }
        if (results.hasOption("reduces")) {
            job.setNumReduceTasks(Integer.parseInt(results.getOptionValue("reduces")));
        }
        if (results.hasOption("writer")) {
            explicitOutputFormat = true;
            setIsJavaRecordWriter(conf, true);
            job.setOutputFormatClass(getClass(results, "writer", conf, OutputFormat.class));
        }
        if (results.hasOption("lazyOutput")) {
            if (Boolean.parseBoolean(results.getOptionValue("lazyOutput"))) {
                LazyOutputFormat.setOutputFormatClass(job, job.getOutputFormatClass());
            }
        }
        if (results.hasOption("avroInput")) {
            avroInput = AvroIO.valueOf(results.getOptionValue("avroInput").toUpperCase());
        }
        if (results.hasOption("avroOutput")) {
            avroOutput = AvroIO.valueOf(results.getOptionValue("avroOutput").toUpperCase());
        }
        if (results.hasOption("program")) {
            setExecutable(conf, results.getOptionValue("program"));
        }
        // If they gave us a jar file, include it in the class path.
        String jarFile = job.getJar();
        if (jarFile != null) {
            final URL[] urls = new URL[] { FileSystem.getLocal(conf).pathToFile(new Path(jarFile)).toURL() };
            // FindBugs complains that creating a URLClassLoader should be
            // in a doPrivileged() block.
            ClassLoader loader = AccessController.doPrivileged(new PrivilegedAction<ClassLoader>() {
                public ClassLoader run() {
                    return new URLClassLoader(urls);
                }
            });
            conf.setClassLoader(loader);
        }
        setupPipesJob(job);
        return job.waitForCompletion(true) ? 0 : 1;
    } catch (ParseException pe) {
        LOG.info("Error : " + pe);
        cli.printUsage();
        return 1;
    }
}
From source file:it.crs4.seal.demux.Demux.java
License:Open Source License
@Override
public int run(String[] args) throws Exception {
    LOG.info("starting");

    Configuration conf = getConf();
    DemuxOptionParser parser = new DemuxOptionParser();
    parser.parse(conf, args);

    conf.setBoolean(CONF_NO_INDEX_READS, parser.getNoIndexReads());
    conf.setBoolean(CONF_SEPARATE_READS, parser.getSeparateReads());

    LOG.info("Using " + parser.getNReduceTasks() + " reduce tasks");
    if (parser.getNoIndexReads())
        LOG.info("Not expecting to find any index reads. Will demultiplex based only on lane.");

    // Load the sample sheet early so we fail fast in case of problems.
    DemuxUtils.loadSampleSheet(parser.getSampleSheetPath(), conf);

    // Must be called before creating the job, since the job *copies* the Configuration.
    distributeSampleSheet(parser.getSampleSheetPath());

    // Create a Job using the processed conf.
    Job job = new Job(getConf(), makeJobName(parser.getInputPaths().get(0)));
    job.setJarByClass(Demux.class);

    // input paths
    for (Path p : parser.getInputPaths())
        FileInputFormat.addInputPath(job, p);

    job.setInputFormatClass(FormatNameMap.getInputFormat(parser.getInputFormatName("qseq")));

    job.setMapperClass(Map.class);
    job.setMapOutputKeyClass(SequenceId.class);
    job.setMapOutputValueClass(SequencedFragment.class);

    // Partitioner, grouping comparator, and sort comparator together implement a
    // secondary sort: records are routed and grouped by location while being
    // sorted by the full key.
    job.setPartitionerClass(SequenceIdLocationPartitioner.class);
    job.setGroupingComparatorClass(GroupByLocationComparator.class);
    job.setSortComparatorClass(TwoOneThreeSortComparator.class);

    job.setReducerClass(Red.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(SequencedFragment.class);

    // output
    job.setOutputFormatClass(DemuxOutputFormat.class);
    FileOutputFormat.setOutputPath(job, parser.getOutputPath());

    // Submit the job, then poll for progress until the job is complete.
    boolean result = job.waitForCompletion(true);
    if (result) {
        LOG.info("done");
        if (parser.getCreateLaneContent())
            createLaneContentFiles(parser.getOutputPath(), parser.getSampleSheetPath());
        return 0;
    } else {
        LOG.fatal(this.getClass().getName() + " failed!");
        return 1;
    }
}
From source file:it.crs4.seal.prq.PairReadsQSeq.java
License:Open Source License
@Override
public int run(String[] args) throws Exception {
    Configuration conf = getConf();

    // defaults
    conf.set(PrqOptionParser.INPUT_FORMAT_CONF, PrqOptionParser.InputFormatDefault);

    // parse command line
    PrqOptionParser parser = new PrqOptionParser();
    parser.parse(conf, args);

    Job job = new Job(conf, "PairReadsQSeq " + parser.getInputPaths().get(0));
    job.setJarByClass(PairReadsQSeq.class);

    job.setInputFormatClass(FormatNameMap.getInputFormat(parser.getInputFormatName()));
    job.setOutputFormatClass(FormatNameMap.getOutputFormat(parser.getOutputFormatName("prq")));

    job.setMapperClass(PrqMapper.class);
    job.setMapOutputKeyClass(SequenceId.class);
    job.setMapOutputValueClass(Text.class);

    job.setPartitionerClass(FirstPartitioner.class);
    job.setGroupingComparatorClass(GroupByLocationComparator.class);

    job.setReducerClass(PrqReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(ReadPair.class);

    for (Path p : parser.getInputPaths())
        FileInputFormat.addInputPath(job, p);

    FileOutputFormat.setOutputPath(job, parser.getOutputPath());

    return (job.waitForCompletion(true) ? 0 : 1);
}
From source file:it.crs4.seal.read_sort.ReadSort.java
License:Open Source License
public int run(String[] args) throws Exception {
    LOG.info("starting");

    Configuration conf = getConf();

    ReadSortOptionParser parser = new ReadSortOptionParser();
    parser.parse(conf, args);
    LOG.info("Using " + parser.getNReduceTasks() + " reduce tasks");

    // Create a Job using the processed conf.
    Job job = new Job(conf, makeJobName(parser.getInputPaths().get(0)));
    job.setJarByClass(ReadSort.class);

    // input paths
    for (Path p : parser.getInputPaths())
        FileInputFormat.addInputPath(job, p);

    job.setMapperClass(ReadSortSamMapper.class);
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(Text.class);

    job.setPartitionerClass(WholeReferencePartitioner.class);

    job.setReducerClass(ReadSortSamReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    // output path
    FileOutputFormat.setOutputPath(job, parser.getOutputPath());

    // Submit the job, then poll for progress until the job is complete.
    boolean result = job.waitForCompletion(true);
    if (result) {
        LOG.info("done");
        return 0;
    } else {
        LOG.fatal("ReadSort failed!");
        return 1;
    }
}
From source file:it.crs4.seal.tsv_sort.TsvSort.java
License:Apache License
public int run(String[] args) throws Exception {
    LOG.info("starting");

    TsvSortOptionParser parser = new TsvSortOptionParser();
    parser.parse(getConf(), args);
    LOG.info("Using " + parser.getNReduceTasks() + " reduce tasks");

    Job job = new Job(getConf());
    job.setJobName("TsvSort " + parser.getInputPaths().get(0));
    job.setJarByClass(TsvSort.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setInputFormatClass(TsvInputFormat.class);
    job.setOutputFormatClass(TextValueOutputFormat.class);
    job.setPartitionerClass(TotalOrderPartitioner.class);

    // output path
    FileOutputFormat.setOutputPath(job, parser.getOutputPath());

    FileSystem fs = parser.getOutputPath().getFileSystem(job.getConfiguration());

    /*
     * Pick a random name for the partition file in the same directory as the
     * output path. So, "TsvSort /user/me/input /user/me/output" results in the
     * partition file being placed at /user/me/_partition.lst.12340921387402174.
     *
     * Why not place it directly in the input path?
     *
     * We wouldn't be able to run two sorts on the same data at the same time.
     * We've received complaints about this in the past, so it has been a
     * limit in practice.
     *
     * Why not place it directly in the output path?
     *
     * We'd have to create the output path before the output format did.
     * For this to work we'd have to disable the FileOutputFormat's default check
     * that verifies that the output directory doesn't exist. This means that we'd
     * need some other way to ensure that we're not writing to the same path where
     * some other job wrote.
     */
    Path partitionFile;
    Random rnd = new Random();
    do {
        partitionFile = new Path(parser.getOutputPath().getParent(),
                String.format("_partition.lst.%012d", Math.abs(rnd.nextLong())));
    } while (fs.exists(partitionFile)); // still subject to a race with another instance of this program
    partitionFile = partitionFile.makeQualified(fs);
    LOG.info("partition file path: " + partitionFile);

    URI partitionUri = new URI(partitionFile.toString() + "#" + PARTITION_SYMLINK);
    LOG.debug("partitionUri for distributed cache: " + partitionUri);

    // input paths
    for (Path p : parser.getInputPaths())
        TsvInputFormat.addInputPath(job, p);

    LOG.info("sampling input");
    TextSampler.writePartitionFile(new TsvInputFormat(), job, partitionFile);
    LOG.info("created partitions");
    try {
        DistributedCache.addCacheFile(partitionUri, job.getConfiguration());
        DistributedCache.createSymlink(job.getConfiguration());

        int retcode = job.waitForCompletion(true) ? 0 : 1;
        LOG.info("done");
        return retcode;
    } finally {
        LOG.debug("deleting partition file " + partitionFile);
        fs.delete(partitionFile, false);
    }
}
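A usage note: the DistributedCache/symlink setup above predates the current TotalOrderPartitioner API. On recent Hadoop releases the partition file can be registered directly, which skips the symlink entirely; a sketch reusing the partitionFile variable from the example, not tested against Seal itself:

// Point TotalOrderPartitioner at the sampled partition list; no cache symlink needed.
org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner
        .setPartitionFile(job.getConfiguration(), partitionFile);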
From source file:it.crs4.seal.usort.USort.java
License:Open Source License
@Override
public int run(String[] args) throws Exception {
    Configuration conf = getConf();

    // defaults
    conf.set(SealToolParser.INPUT_FORMAT_CONF, USortOptionParser.InputFormatDefault);
    conf.set(SealToolParser.OUTPUT_FORMAT_CONF, USortOptionParser.OutputFormatDefault);

    // parse command line
    USortOptionParser parser = new USortOptionParser();
    parser.parse(conf, args);

    Job job = new Job(conf, "USort " + parser.getInputPaths().get(0));
    job.setJarByClass(USort.class);

    job.setInputFormatClass(FormatNameMap.getInputFormat(parser.getInputFormatName()));
    job.setOutputFormatClass(FormatNameMap.getOutputFormat(parser.getOutputFormatName()));

    job.setMapperClass(Demux.Map.class);
    job.setMapOutputKeyClass(SequenceId.class);
    job.setMapOutputValueClass(SequencedFragment.class);

    job.setPartitionerClass(USortPartitioner.class);

    job.setReducerClass(Reduce.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(SequencedFragment.class);

    for (Path p : parser.getInputPaths())
        FileInputFormat.addInputPath(job, p);
    FileOutputFormat.setOutputPath(job, parser.getOutputPath());

    boolean result = job.waitForCompletion(true);
    if (!result) {
        LOG.fatal(this.getClass().getName() + " failed!");
        return 1;
    } else {
        return 0;
    }
}