Usage examples for org.apache.hadoop.mapreduce.Job.setPartitionerClass
public void setPartitionerClass(Class<? extends Partitioner> cls) throws IllegalStateException
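Before the project-specific examples below, here is a minimal, self-contained sketch of the call. The class and field names (FirstCharPartitionerExample, FirstCharPartitioner) are illustrative only and do not come from any of the sources listed on this page; the point is simply that the partitioner is registered on the Job before submission, since setPartitionerClass throws IllegalStateException once the job has been submitted.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class FirstCharPartitionerExample {

    // Illustrative partitioner: routes each record by the first character of its Text key.
    public static class FirstCharPartitioner extends Partitioner<Text, IntWritable> {
        @Override
        public int getPartition(Text key, IntWritable value, int numPartitions) {
            if (key.getLength() == 0) {
                return 0;
            }
            // Mask the sign bit so the partition index is never negative.
            return (key.charAt(0) & Integer.MAX_VALUE) % numPartitions;
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "first-char partition example");
        job.setJarByClass(FirstCharPartitionerExample.class);

        // Mapper/Reducer classes omitted; the partitioner only needs the map output key type.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // Must be called before the job is submitted, otherwise IllegalStateException is thrown.
        job.setPartitionerClass(FirstCharPartitioner.class);
        job.setNumReduceTasks(4);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}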
From source file:org.mrgeo.data.accumulo.output.image.AccumuloMrsImagePyramidOutputFormatProvider.java
License:Apache License
@Override
public void setupJob(final Job job) throws DataProviderException {
    try {
        // TODO: there is an assumption here that the output is going to accumulo directly - not bulk
        super.setupJob(job);

        job.getConfiguration().addResource(AccumuloConnector.getAccumuloPropertiesLocation());

        // zoom level - output zoom level
        zoomLevel = context.getZoomlevel();
        // zoomLevel = job.getConfiguration().getInt("zoomlevel", 0);
        if (zoomLevel != 0) {
            job.getConfiguration().set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_ZOOMLEVEL,
                    Integer.toString(zoomLevel));
        }
        //job.getConfiguration().set("zoomLevel", Integer.toString(zoomLevel));

        if (doBulk) {
            job.getConfiguration().set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_JOBTYPE,
                    MrGeoAccumuloConstants.MRGEO_ACC_VALUE_JOB_BULK);
            job.getConfiguration().set(
                    MrGeoAccumuloConstants.MRGEO_ACC_KEY_PREFIX + Integer.toString(zoomLevel),
                    MrGeoAccumuloConstants.MRGEO_ACC_VALUE_JOB_BULK);
        } else {
            job.getConfiguration().set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_JOBTYPE,
                    MrGeoAccumuloConstants.MRGEO_ACC_VALUE_JOB_DIRECT);
            job.getConfiguration().set(
                    MrGeoAccumuloConstants.MRGEO_ACC_KEY_PREFIX + Integer.toString(zoomLevel),
                    MrGeoAccumuloConstants.MRGEO_ACC_VALUE_JOB_DIRECT);
        }

        Properties props = AccumuloConnector.getAccumuloProperties();
        if (props != null) {

            // this used to be the variable "name" in TiledOutputFormatContext, but was always "".
            String enc = AccumuloConnector.encodeAccumuloProperties("");
            job.getConfiguration().set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_RESOURCE, enc);

            job.getConfiguration().set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_INSTANCE,
                    props.getProperty(MrGeoAccumuloConstants.MRGEO_ACC_KEY_INSTANCE));
            job.getConfiguration().set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_ZOOKEEPERS,
                    props.getProperty(MrGeoAccumuloConstants.MRGEO_ACC_KEY_ZOOKEEPERS));

            if (props.getProperty(MrGeoAccumuloConstants.MRGEO_ACC_KEY_OUTPUT_TABLE) == null) {
                job.getConfiguration().set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_OUTPUT_TABLE, this.table);
            } else {
                job.getConfiguration().set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_OUTPUT_TABLE,
                        props.getProperty(MrGeoAccumuloConstants.MRGEO_ACC_KEY_OUTPUT_TABLE));
            }

            // username and password
            job.getConfiguration().set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_USER,
                    props.getProperty(MrGeoAccumuloConstants.MRGEO_ACC_KEY_USER));

            // make sure the password is set with Base64Encoding
            String pw = props.getProperty(MrGeoAccumuloConstants.MRGEO_ACC_KEY_PASSWORD);
            String isEnc = props.getProperty(MrGeoAccumuloConstants.MRGEO_ACC_KEY_PWENCODED64, "false");

            if (isEnc.equalsIgnoreCase("true")) {
                job.getConfiguration().set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_PASSWORD,
                        props.getProperty(MrGeoAccumuloConstants.MRGEO_ACC_KEY_PASSWORD));
            } else {
                byte[] p = Base64.encodeBase64(
                        props.getProperty(MrGeoAccumuloConstants.MRGEO_ACC_KEY_PASSWORD).getBytes());
                job.getConfiguration().set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_PASSWORD, new String(p));
                job.getConfiguration().set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_PWENCODED64, new String("true"));
            }

            if (job.getConfiguration().get(MrGeoConstants.MRGEO_PROTECTION_LEVEL) != null) {
                cv = new ColumnVisibility(job.getConfiguration().get(MrGeoConstants.MRGEO_PROTECTION_LEVEL));
            }
            if (cv == null) {
                if (props.containsKey(MrGeoAccumuloConstants.MRGEO_ACC_KEY_VIZ)) {
                    job.getConfiguration().set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_VIZ,
                            props.getProperty(MrGeoAccumuloConstants.MRGEO_ACC_KEY_VIZ));
                    cv = new ColumnVisibility(props.getProperty(MrGeoAccumuloConstants.MRGEO_ACC_KEY_VIZ));
                }
            } else {
                job.getConfiguration().set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_VIZ,
                        new String(cv.getExpression()));
            }
        }

        if (doBulk) {
            LongRectangle outTileBounds = tileBounds.toLongRectangle();

            // setup the output for the job
            if (props.containsKey(MrGeoAccumuloConstants.MRGEO_ACC_KEY_WORKDIR)) {
                workDir = props.getProperty(MrGeoAccumuloConstants.MRGEO_ACC_KEY_WORKDIR);
                if (workDir != null) {
                    workDir += File.separator;
                }
            } else {
                workDir = "";
            }
            workDir += AccumuloMrsImagePyramidFileOutputFormat.class.getSimpleName() + File.separator
                    + this.table + File.separator; // + System.currentTimeMillis() + File.separator;

            // delete the work dir if possible
            // Path wd = new Path(workDir);
            // FileSystem fs = HadoopFileUtils.getFileSystem(wd);
            // if (fs.exists(wd))
            // {
            //     fs.delete(wd, false);
            // }

            job.getConfiguration().set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_WORKDIR, workDir);

            // determine the starting points for the splits
            ArrayList<Pair<Long, Long>> splitPoints = new ArrayList<Pair<Long, Long>>();

            // think about the multiple levels and creating other splits!!!
            long step = bulkThreshold / outTileBounds.getWidth();
            long rem = bulkThreshold % outTileBounds.getWidth();
            if (rem > 0) {
                step++;
            }
            for (long y = outTileBounds.getMinY(); y <= outTileBounds.getMaxY(); y += step) {
                Pair<Long, Long> cur = new Pair<Long, Long>(outTileBounds.getMinX(), y);
                splitPoints.add(cur);
            }

            // we now have our list of split points
            // now build the splits file!!!
            FileSystem fs = null; // FileSystem.get(job.getConfiguration());
            PrintStream out = null;
            try {
                Path wd = new Path(workDir);
                fs = FileSystem.get(job.getConfiguration());
                if (fs.exists(wd)) {
                    fs.delete(wd, true);
                }

                out = new PrintStream(new BufferedOutputStream(fs.create(new Path(workDir + "splits.txt"))));

                for (Pair<Long, Long> p : splitPoints) {
                    long split = TMSUtils.tileid(p.getFirst(), p.getSecond(), zoomLevel);
                    //TileIdWritable t = new TileIdWritable(split);
                    Text t = new Text(longToBytes(split));
                    out.println(new String(Base64.encodeBase64(TextUtil.getBytes(t))));
                    log.debug("Point: " + p.getFirst() + "\t" + p.getSecond() + "\t" + split + "\t"
                            + t.getLength());
                }

                job.setNumReduceTasks(splitPoints.size() + 1);
                out.close();

                job.setPartitionerClass(AccumuloMrGeoRangePartitioner.class);
                AccumuloMrGeoRangePartitioner.setSplitFile(job, workDir + "splits.txt");

            } catch (IOException ioe) {
                ioe.printStackTrace();
                throw new DataProviderException(
                        "Problem creating output splits.txt for bulk ingest directory.");
            }

            job.setOutputFormatClass(AccumuloMrsImagePyramidFileOutputFormat.class);
            AccumuloMrsImagePyramidFileOutputFormat.setOutputPath(job, new Path(workDir + "files"));
            //AccumuloMrsImagePyramidFileOutputFormat.setZoomLevel(zoomLevel);
        } else {
            log.info("Setting the output format of: "
                    + AccumuloMrsImagePyramidOutputFormat.class.getCanonicalName());
            job.setOutputFormatClass(AccumuloMrsImagePyramidOutputFormat.class);
            AccumuloMrsImagePyramidOutputFormat.setJob(job);

            log.info("Setting zoom level to " + zoomLevel);
            log.info("Visibility is " + cv.toString());
            log.info("Setting the number of reducers to " + MrGeoAccumuloConstants.MRGEO_DEFAULT_NUM_REDUCERS);
            job.setNumReduceTasks(MrGeoAccumuloConstants.MRGEO_DEFAULT_NUM_REDUCERS);
        }

        job.setOutputKeyClass(TileIdWritable.class);
        job.setOutputValueClass(RasterWritable.class);

    } catch (IOException ioe) {
        throw new DataProviderException("Error running job setup", ioe);
    }
}
From source file:org.mrgeo.data.accumulo.output.image.AccumuloMrsPyramidOutputFormatProvider.java
License:Apache License
@SuppressWarnings("squid:S2095") // hadoop FileSystem cannot be closed, or else subsequent uses will fail
private void setupConfig(final Configuration conf, final Job job) throws DataProviderException {
    try {
        // zoom level - output zoom level
        zoomLevel = context.getZoomLevel();
        // zoomLevel = conf.getInt("zoomlevel", 0);
        if (zoomLevel != 0) {
            conf.set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_ZOOMLEVEL, Integer.toString(zoomLevel));
        }
        //conf.set("zoomLevel", Integer.toString(zoomLevel));

        if (doBulk || forceBulk) {
            conf.set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_JOBTYPE,
                    MrGeoAccumuloConstants.MRGEO_ACC_VALUE_JOB_BULK);
            conf.set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_PREFIX + Integer.toString(zoomLevel),
                    MrGeoAccumuloConstants.MRGEO_ACC_VALUE_JOB_BULK);
        } else {
            conf.set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_JOBTYPE,
                    MrGeoAccumuloConstants.MRGEO_ACC_VALUE_JOB_DIRECT);
            conf.set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_PREFIX + Integer.toString(zoomLevel),
                    MrGeoAccumuloConstants.MRGEO_ACC_VALUE_JOB_DIRECT);
        }

        Properties props = AccumuloConnector.getAccumuloProperties();

        // this used to be the variable "name" in ImageOutputFormatContext, but was always "".
        String enc = AccumuloConnector.encodeAccumuloProperties("");
        conf.set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_RESOURCE, enc);

        // conf.set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_INSTANCE,
        //     props.getProperty(MrGeoAccumuloConstants.MRGEO_ACC_KEY_INSTANCE));
        // conf.set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_ZOOKEEPERS,
        //     props.getProperty(MrGeoAccumuloConstants.MRGEO_ACC_KEY_ZOOKEEPERS));

        if (props.getProperty(MrGeoAccumuloConstants.MRGEO_ACC_KEY_OUTPUT_TABLE) == null) {
            conf.set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_OUTPUT_TABLE, this.table);
        } else {
            conf.set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_OUTPUT_TABLE,
                    props.getProperty(MrGeoAccumuloConstants.MRGEO_ACC_KEY_OUTPUT_TABLE));
        }

        // // username and password
        // conf.set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_USER,
        //     props.getProperty(MrGeoAccumuloConstants.MRGEO_ACC_KEY_USER));
        //
        // // make sure the password is set with Base64Encoding
        // String pw = props.getProperty(MrGeoAccumuloConstants.MRGEO_ACC_KEY_PASSWORD);
        // String isEnc = props.getProperty(MrGeoAccumuloConstants.MRGEO_ACC_KEY_PWENCODED64, "false");
        //
        // if(isEnc.equalsIgnoreCase("true")){
        //     conf.set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_PASSWORD,
        //         props.getProperty(MrGeoAccumuloConstants.MRGEO_ACC_KEY_PASSWORD));
        // } else {
        //     byte[] p = Base64.encodeBase64(props.getProperty(MrGeoAccumuloConstants.MRGEO_ACC_KEY_PASSWORD).getBytes());
        //     conf.set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_PASSWORD, new String(p));
        //     conf.set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_PWENCODED64, new String("true"));
        // }

        if (conf.get(MrGeoConstants.MRGEO_PROTECTION_LEVEL) != null) {
            cv = new ColumnVisibility(conf.get(MrGeoConstants.MRGEO_PROTECTION_LEVEL));
        }
        if (cv == null) {
            if (props.containsKey(MrGeoAccumuloConstants.MRGEO_ACC_KEY_VIZ)) {
                conf.set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_VIZ,
                        props.getProperty(MrGeoAccumuloConstants.MRGEO_ACC_KEY_VIZ));
                cv = new ColumnVisibility(props.getProperty(MrGeoAccumuloConstants.MRGEO_ACC_KEY_VIZ));
            }
        } else {
            conf.set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_VIZ, new String(cv.getExpression()));
        }

        if (doBulk || forceBulk) {
            LongRectangle outTileBounds = tileBounds.toLongRectangle();

            // setup the output for the job
            if (props.containsKey(MrGeoAccumuloConstants.MRGEO_ACC_KEY_WORKDIR)) {
                workDir = props.getProperty(MrGeoAccumuloConstants.MRGEO_ACC_KEY_WORKDIR);
                if (workDir != null) {
                    workDir += File.separator;
                }
            } else {
                workDir = "";
            }
            workDir += AccumuloMrsPyramidFileOutputFormat.class.getSimpleName() + File.separator
                    + this.table + File.separator; // + System.currentTimeMillis() + File.separator;

            // delete the work dir if possible
            Path wd = new Path(workDir);
            FileSystem fs = FileSystem.get(conf);
            if (fs.exists(wd)) {
                fs.delete(wd, true);
            }

            conf.set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_WORKDIR, workDir);

            if (job != null) {
                // determine the starting points for the splits
                ArrayList<Pair<Long, Long>> splitPoints = new ArrayList<Pair<Long, Long>>();

                // think about the multiple levels and creating other splits!!!
                long step = bulkThreshold / outTileBounds.getWidth();
                long rem = bulkThreshold % outTileBounds.getWidth();
                if (rem > 0) {
                    step++;
                }
                for (long y = outTileBounds.getMinY(); y <= outTileBounds.getMaxY(); y += step) {
                    Pair<Long, Long> cur = new Pair<Long, Long>(outTileBounds.getMinX(), y);
                    splitPoints.add(cur);
                }

                // we now have our list of split points
                // now build the splits file!!!
                try (BufferedOutputStream bos = new BufferedOutputStream(
                        fs.create(new Path(workDir + "splits.txt")))) {
                    try (PrintStream out = new PrintStream(bos)) {
                        for (Pair<Long, Long> p : splitPoints) {
                            long split = TMSUtils.tileid(p.getFirst(), p.getSecond(), zoomLevel);
                            //TileIdWritable t = new TileIdWritable(split);
                            Text t = new Text(longToBytes(split));
                            out.println(Base64Utils.encodeObject(t.toString()));
                            log.debug("Point: " + p.getFirst() + "\t" + p.getSecond() + "\t" + split + "\t"
                                    + t.getLength());
                        }
                        job.setNumReduceTasks(splitPoints.size() + 1);
                        out.close();

                        job.setPartitionerClass(AccumuloMrGeoRangePartitioner.class);
                        AccumuloMrGeoRangePartitioner.setSplitFile(job, workDir + "splits.txt");
                    }
                } catch (IOException ioe) {
                    throw new DataProviderException(
                            "Problem creating output splits.txt for bulk ingest directory.", ioe);
                }

                job.setOutputFormatClass(AccumuloMrsPyramidFileOutputFormat.class);
            }

            Path workFilesPath = new Path(workDir + "files");
            if (job != null) {
                AccumuloMrsPyramidFileOutputFormat.setOutputPath(job, workFilesPath);
                //AccumuloMrsPyramidFileOutputFormat.setZoomLevel(zoomLevel);
            } else {
                Path outputDir = workFilesPath.getFileSystem(conf).makeQualified(workFilesPath);
                // conf.set(AccumuloMrsPyramidFileOutputFormat.OUTDIR, outputDir.toString());
                conf.set("mapred.output.dir", outputDir.toString());
                conf.set("mapreduce.output.fileoutputformat.outputdir", outputDir.toString());
            }
        } else {
            if (job != null) {
                log.info("Setting the output format of: "
                        + AccumuloMrsPyramidOutputFormat.class.getCanonicalName());
                job.setOutputFormatClass(AccumuloMrsPyramidOutputFormat.class);
                AccumuloMrsPyramidOutputFormat.setJob(job);

                log.info("Setting zoom level to " + zoomLevel);
                log.info("Visibility is " + cv.toString());
                log.info("Setting the number of reducers to "
                        + MrGeoAccumuloConstants.MRGEO_DEFAULT_NUM_REDUCERS);
                job.setNumReduceTasks(MrGeoAccumuloConstants.MRGEO_DEFAULT_NUM_REDUCERS);
            }
        }

        if (job != null) {
            job.setOutputKeyClass(TileIdWritable.class);
            job.setOutputValueClass(RasterWritable.class);
        }
    } catch (IOException ioe) {
        throw new DataProviderException("Error running job setup", ioe);
    }
}
From source file:org.mrgeo.hdfs.partitioners.TileIdPartitioner.java
License:Apache License
public static Path setup(final Job job, final SplitGenerator splitGenerator) throws IOException {
    // don't set up a partitioner in local mode
    if (HadoopUtils.isLocal(job.getConfiguration())) {
        // make sure we have at least 1 reducer...
        if (job.getNumReduceTasks() < 1) {
            job.setNumReduceTasks(1);
        }
        return null;
    }

    PartitionerSplit splits = new PartitionerSplit();

    splits.generateSplits(splitGenerator);

    // create a split file in the hadoop tmp directory
    // this is copied into the job's output directory upon job completion
    final int uniquePrefixLen = 5;
    Path splitFile = new Path(HadoopFileUtils.getTempDir(job.getConfiguration()),
            HadoopUtils.createRandomString(uniquePrefixLen) + "_" + PartitionerSplit.SPLIT_FILE);

    splits.writeSplits(splitFile);

    job.setNumReduceTasks(splits.length());
    job.setPartitionerClass(TileIdPartitioner.class);

    setSplitFile(splitFile.toString(), job);

    return splitFile;
}
From source file:org.myorg.KSorter.java
License:Open Source License
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = new Job(conf, "parallelsort");

    job.setInputFormatClass(KeyValueTextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setOutputKeyClass(FloatWritable.class);
    job.setPartitionerClass(ScorePartitioner.class);

    job.setJarByClass(KSorter.class);
    job.setMapperClass(SortMapper.class);
    job.setReducerClass(SortReducer.class);

    FileInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.waitForCompletion(true);
}
From source file:org.rdfhdt.mrbuilder.HDTBuilderDriver.java
License:Open Source License
protected boolean runDictionaryJob()
        throws ClassNotFoundException, IOException, InterruptedException, URISyntaxException {
    boolean jobOK;
    Job job = null;
    BufferedWriter bufferedWriter;

    // if output path exists...
    if (this.dictionaryFS.exists(this.conf.getDictionaryOutputPath())) {
        if (this.conf.getDeleteDictionaryOutputPath()) {
            // ... and option provided, delete recursively
            this.dictionaryFS.delete(this.conf.getDictionaryOutputPath(), true);
        } else {
            // ... and option not provided, fail
            System.out.println("Dictionary output path does exist: " + this.conf.getDictionaryOutputPath());
            System.out.println("Select other path or use option -dd to overwrite");
            System.exit(-1);
        }
    }

    // Sample the SequenceInputFormat to do TotalSort and create final output
    job = new Job(this.conf.getConfigurationObject(), this.conf.getDictionaryJobName() + " phase 2");

    job.setJarByClass(HDTBuilderDriver.class);

    System.out.println("samples = " + this.conf.getDictionarySamplesPath());
    System.out.println("output = " + this.conf.getDictionaryOutputPath());

    FileInputFormat.addInputPath(job, this.conf.getDictionarySamplesPath());
    FileOutputFormat.setOutputPath(job, this.conf.getDictionaryOutputPath());

    job.setInputFormatClass(SequenceFileInputFormat.class);
    LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);

    // Identity Mapper
    // job.setMapperClass(Mapper.class);
    job.setCombinerClass(DictionaryCombiner.class);
    job.setPartitionerClass(TotalOrderPartitioner.class);
    job.setReducerClass(DictionaryReducer.class);

    job.setNumReduceTasks(this.conf.getDictionaryReducers());

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NullWritable.class);

    System.out.println("Sampling started");
    InputSampler.writePartitionFile(job,
            new InputSampler.IntervalSampler<Text, Text>(this.conf.getSampleProbability()));
    String partitionFile = TotalOrderPartitioner.getPartitionFile(job.getConfiguration());
    URI partitionUri = new URI(partitionFile + "#" + TotalOrderPartitioner.DEFAULT_PATH);
    DistributedCache.addCacheFile(partitionUri, job.getConfiguration());
    DistributedCache.createSymlink(job.getConfiguration());
    System.out.println("Sampling finished");

    MultipleOutputs.addNamedOutput(job, HDTBuilderConfiguration.SHARED, SequenceFileOutputFormat.class,
            Text.class, NullWritable.class);
    MultipleOutputs.addNamedOutput(job, HDTBuilderConfiguration.SUBJECTS, SequenceFileOutputFormat.class,
            Text.class, NullWritable.class);
    MultipleOutputs.addNamedOutput(job, HDTBuilderConfiguration.PREDICATES, SequenceFileOutputFormat.class,
            Text.class, NullWritable.class);
    MultipleOutputs.addNamedOutput(job, HDTBuilderConfiguration.OBJECTS, SequenceFileOutputFormat.class,
            Text.class, NullWritable.class);

    SequenceFileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressorClass(job, com.hadoop.compression.lzo.LzoCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

    jobOK = job.waitForCompletion(true);

    this.numShared = job.getCounters().findCounter(Counters.Shared).getValue();
    this.numSubjects = job.getCounters().findCounter(Counters.Subjects).getValue();
    this.numPredicates = job.getCounters().findCounter(Counters.Predicates).getValue();
    this.numObjects = job.getCounters().findCounter(Counters.Objects).getValue();

    bufferedWriter = new BufferedWriter(
            new OutputStreamWriter(this.dictionaryFS.create(this.conf.getDictionaryCountersFile())));

    bufferedWriter.write(HDTBuilderConfiguration.SHARED + "=" + this.numShared + "\n");
    bufferedWriter.write(HDTBuilderConfiguration.SUBJECTS + "=" + this.numSubjects + "\n");
    bufferedWriter.write(HDTBuilderConfiguration.PREDICATES + "=" + this.numPredicates + "\n");
    bufferedWriter.write(HDTBuilderConfiguration.OBJECTS + "=" + this.numObjects + "\n");

    bufferedWriter.close();

    return jobOK;
}
From source file:org.rdfhdt.mrbuilder.HDTBuilderDriver.java
License:Open Source License
protected boolean runTriplesJob()
        throws IOException, ClassNotFoundException, InterruptedException, URISyntaxException {
    Job job = null;
    boolean jobOK;

    // if triples output path exists...
    if (this.triplesFS.exists(this.conf.getTriplesOutputPath())) {
        if (this.conf.getDeleteTriplesOutputPath()) {
            // ... and option provided, delete recursively
            this.triplesFS.delete(this.conf.getTriplesOutputPath(), true);
        } else {
            // ... and option not provided, fail
            System.out.println("Triples output path does exist: " + this.conf.getTriplesOutputPath());
            System.out.println("Select other path or use option -dt to overwrite");
            System.exit(-1);
        }
    }

    job = new Job(this.conf.getConfigurationObject(), this.conf.getTriplesJobName() + " phase 2");

    job.setJarByClass(HDTBuilderDriver.class);

    FileInputFormat.addInputPath(job, this.conf.getTriplesSamplesPath());
    FileOutputFormat.setOutputPath(job, this.conf.getTriplesOutputPath());

    job.setInputFormatClass(SequenceFileInputFormat.class);
    LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);

    job.setSortComparatorClass(TripleSPOComparator.class);
    job.setGroupingComparatorClass(TripleSPOComparator.class);

    job.setPartitionerClass(TotalOrderPartitioner.class);

    job.setOutputKeyClass(TripleSPOWritable.class);
    job.setOutputValueClass(NullWritable.class);

    job.setNumReduceTasks(this.conf.getTriplesReducers());

    System.out.println("Sampling started");
    InputSampler.writePartitionFile(job,
            new InputSampler.IntervalSampler<Text, Text>(this.conf.getSampleProbability()));
    String partitionFile = TotalOrderPartitioner.getPartitionFile(job.getConfiguration());
    URI partitionUri = new URI(partitionFile + "#" + TotalOrderPartitioner.DEFAULT_PATH);
    DistributedCache.addCacheFile(partitionUri, job.getConfiguration());
    DistributedCache.createSymlink(job.getConfiguration());
    System.out.println("Sampling finished");

    SequenceFileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressorClass(job, com.hadoop.compression.lzo.LzoCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

    jobOK = job.waitForCompletion(true);

    return jobOK;
}
From source file:org.seqdoop.hadoop_bam.cli.plugins.FixMate.java
License:Open Source License
@Override
protected int run(CmdLineParser parser) {
    final List<String> args = parser.getRemainingArgs();
    if (args.isEmpty()) {
        System.err.println("fixmate :: WORKDIR not given.");
        return 3;
    }
    if (args.size() == 1) {
        System.err.println("fixmate :: INPATH not given.");
        return 3;
    }
    if (!cacheAndSetProperties(parser))
        return 3;

    final ValidationStringency stringency = Utils.toStringency(
            parser.getOptionValue(stringencyOpt, ValidationStringency.DEFAULT_STRINGENCY.toString()),
            "fixmate");
    if (stringency == null)
        return 3;

    Path wrkDir = new Path(args.get(0));

    final List<String> strInputs = args.subList(1, args.size());
    final List<Path> inputs = new ArrayList<Path>(strInputs.size());
    for (final String in : strInputs)
        inputs.add(new Path(in));

    final Configuration conf = getConf();

    // Used by Utils.getMergeableWorkFile() to name the output files.
    final String intermediateOutName = (outPath == null ? inputs.get(0) : outPath).getName();
    conf.set(Utils.WORK_FILENAME_PROPERTY, intermediateOutName);

    if (stringency != null)
        conf.set(SAMHeaderReader.VALIDATION_STRINGENCY_PROPERTY, stringency.toString());

    final boolean globalSort = parser.getBoolean(sortOpt);
    if (globalSort)
        Utils.setHeaderMergerSortOrder(conf, SAMFileHeader.SortOrder.queryname);

    conf.setStrings(Utils.HEADERMERGER_INPUTS_PROPERTY, strInputs.toArray(new String[0]));

    final Timer t = new Timer();
    try {
        // Required for path ".", for example.
        wrkDir = wrkDir.getFileSystem(conf).makeQualified(wrkDir);

        if (globalSort)
            Utils.configureSampling(wrkDir, intermediateOutName, conf);

        final Job job = new Job(conf);

        job.setJarByClass(FixMate.class);
        job.setMapperClass(FixMateMapper.class);
        job.setReducerClass(FixMateReducer.class);

        if (!parser.getBoolean(noCombinerOpt))
            job.setCombinerClass(FixMateReducer.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(SAMRecordWritable.class);

        job.setInputFormatClass(AnySAMInputFormat.class);
        job.setOutputFormatClass(CLIMergingAnySAMOutputFormat.class);

        for (final Path in : inputs)
            FileInputFormat.addInputPath(job, in);

        FileOutputFormat.setOutputPath(job, wrkDir);

        if (globalSort) {
            job.setPartitionerClass(TotalOrderPartitioner.class);

            System.out.println("fixmate :: Sampling...");
            t.start();

            InputSampler.<LongWritable, SAMRecordWritable>writePartitionFile(job,
                    new InputSampler.RandomSampler<LongWritable, SAMRecordWritable>(0.01, 10000,
                            Math.max(100, reduceTasks)));

            System.out.printf("fixmate :: Sampling complete in %d.%03d s.\n", t.stopS(), t.fms());
        }

        job.submit();

        System.out.println("fixmate :: Waiting for job completion...");
        t.start();

        if (!job.waitForCompletion(verbose)) {
            System.err.println("fixmate :: Job failed.");
            return 4;
        }

        System.out.printf("fixmate :: Job complete in %d.%03d s.\n", t.stopS(), t.fms());

    } catch (IOException e) {
        System.err.printf("fixmate :: Hadoop error: %s\n", e);
        return 4;
    } catch (ClassNotFoundException e) {
        throw new RuntimeException(e);
    } catch (InterruptedException e) {
        throw new RuntimeException(e);
    }

    if (outPath != null)
        try {
            Utils.mergeSAMInto(outPath, wrkDir, "", "", samFormat, conf, "fixmate");
        } catch (IOException e) {
            System.err.printf("fixmate :: Output merging failed: %s\n", e);
            return 5;
        }

    return 0;
}
From source file:org.seqdoop.hadoop_bam.cli.plugins.Sort.java
License:Open Source License
@Override
protected int run(CmdLineParser parser) {
    final List<String> args = parser.getRemainingArgs();
    if (args.isEmpty()) {
        System.err.println("sort :: WORKDIR not given.");
        return 3;
    }
    if (args.size() == 1) {
        System.err.println("sort :: INPATH not given.");
        return 3;
    }
    if (!cacheAndSetProperties(parser))
        return 3;

    final ValidationStringency stringency = Utils.toStringency(
            parser.getOptionValue(stringencyOpt, ValidationStringency.DEFAULT_STRINGENCY.toString()), "sort");
    if (stringency == null)
        return 3;

    Path wrkDir = new Path(args.get(0));

    final List<String> strInputs = args.subList(1, args.size());
    final List<Path> inputs = new ArrayList<Path>(strInputs.size());
    for (final String in : strInputs)
        inputs.add(new Path(in));

    final Configuration conf = getConf();

    Utils.setHeaderMergerSortOrder(conf, SortOrder.coordinate);
    conf.setStrings(Utils.HEADERMERGER_INPUTS_PROPERTY, strInputs.toArray(new String[0]));

    if (stringency != null)
        conf.set(SAMHeaderReader.VALIDATION_STRINGENCY_PROPERTY, stringency.toString());

    // Used by Utils.getMergeableWorkFile() to name the output files.
    final String intermediateOutName = (outPath == null ? inputs.get(0) : outPath).getName();
    conf.set(Utils.WORK_FILENAME_PROPERTY, intermediateOutName);

    final Timer t = new Timer();
    try {
        // Required for path ".", for example.
        wrkDir = wrkDir.getFileSystem(conf).makeQualified(wrkDir);

        Utils.configureSampling(wrkDir, intermediateOutName, conf);

        final Job job = new Job(conf);

        job.setJarByClass(Sort.class);
        job.setMapperClass(Mapper.class);
        job.setReducerClass(SortReducer.class);

        job.setMapOutputKeyClass(LongWritable.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(SAMRecordWritable.class);

        job.setInputFormatClass(SortInputFormat.class);
        job.setOutputFormatClass(CLIMergingAnySAMOutputFormat.class);

        for (final Path in : inputs)
            FileInputFormat.addInputPath(job, in);

        FileOutputFormat.setOutputPath(job, wrkDir);

        job.setPartitionerClass(TotalOrderPartitioner.class);

        System.out.println("sort :: Sampling...");
        t.start();

        InputSampler.<LongWritable, SAMRecordWritable>writePartitionFile(job,
                new InputSampler.RandomSampler<LongWritable, SAMRecordWritable>(0.01, 10000,
                        Math.max(100, reduceTasks)));

        System.out.printf("sort :: Sampling complete in %d.%03d s.\n", t.stopS(), t.fms());

        job.submit();

        System.out.println("sort :: Waiting for job completion...");
        t.start();

        if (!job.waitForCompletion(verbose)) {
            System.err.println("sort :: Job failed.");
            return 4;
        }

        System.out.printf("sort :: Job complete in %d.%03d s.\n", t.stopS(), t.fms());

    } catch (IOException e) {
        System.err.printf("sort :: Hadoop error: %s\n", e);
        return 4;
    } catch (ClassNotFoundException e) {
        throw new RuntimeException(e);
    } catch (InterruptedException e) {
        throw new RuntimeException(e);
    }

    if (outPath != null)
        try {
            Utils.mergeSAMInto(outPath, wrkDir, "", "", samFormat, conf, "sort");
        } catch (IOException e) {
            System.err.printf("sort :: Output merging failed: %s\n", e);
            return 5;
        }

    return 0;
}
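The HDTBuilderDriver, FixMate, and Sort examples above all pair setPartitionerClass with Hadoop's TotalOrderPartitioner: sample the input, write a partition file of key cut points, then ship that file to every task so reducer i only receives keys below those of reducer i+1. Below is a minimal sketch of that pattern in isolation, assuming SequenceFile input whose keys and values are Text; the class name, paths, reducer count, and sampling parameters are illustrative only, and it uses Job.addCacheFile (the newer replacement for the DistributedCache calls shown above).

import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.InputSampler;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;

public class TotalOrderSortSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "total order sort sketch");
        job.setJarByClass(TotalOrderSortSketch.class);

        // Identity mapper and reducer: the shuffle does the sorting once keys are range-partitioned.
        job.setInputFormatClass(SequenceFileInputFormat.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        job.setNumReduceTasks(4); // must be set before sampling, so the right number of cut points is written

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Range-partition map output keys across the reducers.
        job.setPartitionerClass(TotalOrderPartitioner.class);

        // Sample the input and write the cut points that TotalOrderPartitioner will read.
        Path partitionFile = new Path(args[1] + "_partitions");
        TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), partitionFile);
        InputSampler.writePartitionFile(job, new InputSampler.RandomSampler<Text, Text>(0.01, 10000));

        // Ship the partition file to every task via the distributed cache.
        job.addCacheFile(new URI(partitionFile.toString() + "#" + TotalOrderPartitioner.DEFAULT_PATH));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}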
From source file:org.sifarish.common.AttributeBasedDiversifier.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    String jobName = "Attribute based diversifer for ranked and recommended items MR";
    job.setJobName(jobName);

    job.setJarByClass(AttributeBasedDiversifier.class);

    FileInputFormat.addInputPaths(job, args[0]);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(AttributeBasedDiversifier.AttributeDiversifierMapper.class);
    job.setReducerClass(AttributeBasedDiversifier.AttributeDiversifierReducer.class);

    job.setMapOutputKeyClass(Tuple.class);
    job.setMapOutputValueClass(Tuple.class);

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    job.setGroupingComparatorClass(SecondarySort.TuplePairGroupComprator.class);
    job.setPartitionerClass(SecondarySort.TuplePairPartitioner.class);

    Utility.setConfiguration(job.getConfiguration());

    int numReducer = job.getConfiguration().getInt("abd.num.reducer", -1);
    numReducer = -1 == numReducer ? job.getConfiguration().getInt("num.reducer", 1) : numReducer;
    job.setNumReduceTasks(numReducer);

    int status = job.waitForCompletion(true) ? 0 : 1;
    return status;
}
From source file:org.sifarish.common.BusinessGoalInjector.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    String jobName = "Business goal injector MR";
    job.setJobName(jobName);

    job.setJarByClass(BusinessGoalInjector.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(BusinessGoalInjector.BusinessGoalMapper.class);
    job.setReducerClass(BusinessGoalInjector.BusinessGoalReducer.class);

    job.setMapOutputKeyClass(Tuple.class);
    job.setMapOutputValueClass(Tuple.class);

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    job.setGroupingComparatorClass(SecondarySort.TuplePairGroupComprator.class);
    job.setPartitionerClass(SecondarySort.TuplePairPartitioner.class);

    Utility.setConfiguration(job.getConfiguration());

    int numReducer = job.getConfiguration().getInt("bgi.num.reducer", -1);
    numReducer = -1 == numReducer ? job.getConfiguration().getInt("num.reducer", 1) : numReducer;
    job.setNumReduceTasks(numReducer);

    int status = job.waitForCompletion(true) ? 0 : 1;
    return status;
}