List of usage examples for org.apache.hadoop.mapreduce.lib.output FileOutputFormat setOutputPath
public static void setOutputPath(Job job, Path outputDir)
From source file:com.daleway.training.hadoop.pagerank.PageRankSecondarySort.java
License:Apache License
public static Job createJob(Configuration conf, String inputPath, String outputPath) throws IOException { Job job = new Job(conf, "pair wise count"); job.setJarByClass(PageRankSecondarySort.class); job.setMapperClass(TokenizerMapper.class); job.setReducerClass(IntSumReducer.class); job.setMapOutputKeyClass(LongWritable.class); job.setMapOutputValueClass(Text.class); job.setSortComparatorClass(LongWritable.DecreasingComparator.class); job.setMaxReduceAttempts(1);/*from w w w.ja v a 2s . c om*/ job.setNumReduceTasks(1); FileInputFormat.addInputPath(job, new Path(inputPath)); FileOutputFormat.setOutputPath(job, new Path(outputPath)); return job; }
From source file:com.daleway.training.hadoop.pagerank.PageRankSimple.java
License:Apache License
public static Job createJob(Configuration conf, String inputPath, String outputPath) throws IOException { Job job = new Job(conf, "pair wise count"); job.setJarByClass(PageRankSimple.class); job.setMapperClass(PageRankMapper.class); //job.setCombinerClass(IntSumReducer.class); job.setReducerClass(PageRankReducer.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(Text.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); FileInputFormat.addInputPath(job, new Path(inputPath)); FileOutputFormat.setOutputPath(job, new Path(outputPath)); return job;//from w w w. j av a 2 s . c om }
From source file:com.datasalt.pangool.benchmark.secondarysort.HadoopSecondarySort.java
License:Apache License
public static void main(String[] args) throws Exception { Configuration conf = new Configuration(); String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); if (otherArgs.length != 2) { System.err.println("Usage: secondarysrot <in> <out>"); System.exit(2);/* ww w. ja v a 2 s . c o m*/ } Job job = new Job(conf, "Hadoop Secondary Sort"); FileSystem fS = FileSystem.get(conf); fS.delete(new Path(otherArgs[1]), true); job.setJarByClass(HadoopSecondarySort.class); job.setMapperClass(MapClass.class); job.setReducerClass(Reduce.class); job.setPartitionerClass(KeyPartitioner.class); job.setGroupingComparatorClass(GroupingComparator.class); job.setMapOutputKeyClass(ComplexType.class); job.setMapOutputValueClass(DoubleWritable.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(NullWritable.class); FileInputFormat.addInputPath(job, new Path(otherArgs[0])); FileOutputFormat.setOutputPath(job, new Path(otherArgs[1])); job.waitForCompletion(true); }
From source file:com.datasalt.pangool.benchmark.wordcount.HadoopWordCount.java
License:Apache License
public static void main(String[] args) throws Exception { Configuration conf = new Configuration(); String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); if (otherArgs.length != 2) { System.err.println("Usage: wordcount <in> <out>"); System.exit(2);/* w w w . ja v a 2s .c o m*/ } //conf.setBoolean("hadoop.security.authorization", false); //conf.set("hadoop.security.authentication","simple"); Job job = new Job(conf, "word count"); job.setJarByClass(HadoopWordCount.class); job.setMapperClass(TokenizerMapper.class); job.setCombinerClass(IntSumReducer.class); job.setReducerClass(IntSumReducer.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); FileInputFormat.addInputPath(job, new Path(otherArgs[0])); HadoopUtils.deleteIfExists(FileSystem.get(conf), new Path(otherArgs[1])); FileOutputFormat.setOutputPath(job, new Path(otherArgs[1])); job.waitForCompletion(true); }
From source file:com.datasalt.pangool.tuplemr.MapOnlyJobBuilder.java
License:Apache License
public Job createJob() throws IOException, TupleMRException, URISyntaxException { // perform a deep copy of the configuration this.conf = new Configuration(this.conf); String uniqueName = UUID.randomUUID().toString() + '.' + "out-format.dat"; try {//w w w.j a v a 2 s. co m InstancesDistributor.distribute(outputFormat, uniqueName, conf); instanceFilesCreated.add(uniqueName); } catch (URISyntaxException e1) { throw new TupleMRException(e1); } Job job; if (jobName == null) { job = new Job(conf); } else { job = new Job(conf, jobName); } job.setNumReduceTasks(0); job.getConfiguration().set(ProxyOutputFormat.PROXIED_OUTPUT_FORMAT_CONF, uniqueName); job.setOutputFormatClass(ProxyOutputFormat.class); if (outputKeyClass == null) { throw new TupleMRException("Output spec must be defined, use setOutput()"); } job.setOutputKeyClass(outputKeyClass); job.setOutputValueClass(outputValueClass); FileOutputFormat.setOutputPath(job, outputPath); Input lastInput = null; for (Input input : multipleInputs.getMultiInputs()) { if (input.inputProcessor == null) { input.inputProcessor = mapOnlyMapper; if (input.inputProcessor == null) { throw new TupleMRException("Either mapOnlyMapper property or full Input spec must be set."); } } lastInput = input; } if (lastInput == null) { throw new TupleMRException("At least one input must be specified"); } job.setJarByClass((jarByClass != null) ? jarByClass : lastInput.inputProcessor.getClass()); instanceFilesCreated.addAll(multipleInputs.configureJob(job)); instanceFilesCreated.addAll(namedOutputs.configureJob(job)); return job; }
From source file:com.datasalt.pangool.tuplemr.TupleMRBuilder.java
License:Apache License
public Job createJob() throws IOException, TupleMRException { failIfNull(tupleReducer, "Need to set a group handler"); failIfEmpty(multipleInputs.getMultiInputs(), "Need to add at least one input"); failIfNull(outputFormat, "Need to set output format"); failIfNull(outputKeyClass, "Need to set outputKeyClass"); failIfNull(outputValueClass, "Need to set outputValueClass"); failIfNull(outputPath, "Need to set outputPath"); // perform a deep copy of the Configuration this.conf = new Configuration(this.conf); TupleMRConfig tupleMRConf = buildConf(); // Serialize PangoolConf in Hadoop Configuration instanceFilesCreated.addAll(TupleMRConfig.set(tupleMRConf, conf)); Job job = (jobName == null) ? new Job(conf) : new Job(conf, jobName); if (tupleMRConf.getRollupFrom() != null) { job.setReducerClass(RollupReducer.class); } else {/*from www. jav a 2 s. c om*/ job.setReducerClass(SimpleReducer.class); } if (tupleCombiner != null) { job.setCombinerClass(SimpleCombiner.class); // not rollup by now // Set Combiner Handler String uniqueName = UUID.randomUUID().toString() + '.' + "combiner-handler.dat"; try { InstancesDistributor.distribute(tupleCombiner, uniqueName, job.getConfiguration()); instanceFilesCreated.add(uniqueName); job.getConfiguration().set(SimpleCombiner.CONF_COMBINER_HANDLER, uniqueName); } catch (URISyntaxException e1) { throw new TupleMRException(e1); } } // Set Tuple Reducer try { String uniqueName = UUID.randomUUID().toString() + '.' + "group-handler.dat"; InstancesDistributor.distribute(tupleReducer, uniqueName, job.getConfiguration()); instanceFilesCreated.add(uniqueName); job.getConfiguration().set(SimpleReducer.CONF_REDUCER_HANDLER, uniqueName); } catch (URISyntaxException e1) { throw new TupleMRException(e1); } // Enabling serialization TupleSerialization.enableSerialization(job.getConfiguration()); job.setJarByClass((jarByClass != null) ? jarByClass : tupleReducer.getClass()); job.setMapOutputKeyClass(DatumWrapper.class); job.setMapOutputValueClass(NullWritable.class); job.setPartitionerClass(TupleHashPartitioner.class); job.setGroupingComparatorClass(GroupComparator.class); job.setSortComparatorClass(SortComparator.class); job.setOutputKeyClass(outputKeyClass); job.setOutputValueClass(outputValueClass); FileOutputFormat.setOutputPath(job, outputPath); instanceFilesCreated.addAll(multipleInputs.configureJob(job)); instanceFilesCreated.addAll(namedOutputs.configureJob(job)); // Configure a {@link ProxyOutputFormat} for Pangool's Multiple Outputs to // work: {@link PangoolMultipleOutput} String uniqueName = UUID.randomUUID().toString() + '.' + "out-format.dat"; try { InstancesDistributor.distribute(outputFormat, uniqueName, conf); instanceFilesCreated.add(uniqueName); } catch (URISyntaxException e1) { throw new TupleMRException(e1); } job.getConfiguration().set(ProxyOutputFormat.PROXIED_OUTPUT_FORMAT_CONF, uniqueName); job.setOutputFormatClass(ProxyOutputFormat.class); return job; }
From source file:com.datasalt.utils.mapred.counter.MapRedCounter.java
License:Apache License
protected static Job buildMapRedCounterJobWithoutCombiner(String name, @SuppressWarnings("rawtypes") Class<? extends OutputFormat> outputFormat, String outPath, Configuration conf) throws IOException { Job job = new Job(conf, name); Path output = new Path(outPath); HadoopUtils.deleteIfExists(FileSystem.get(conf), output); job.setJarByClass(MapRedCounter.class); job.setReducerClass(MapRedCountReducer.class); job.setMapOutputKeyClass(CounterKey.class); job.setMapOutputValueClass(CounterValue.class); job.setOutputFormatClass(outputFormat); job.setOutputKeyClass(NullWritable.class); job.setOutputValueClass(NullWritable.class); // Secondary sorting configuration. job.setGroupingComparatorClass(CounterKey.IdGroupComparator.class); job.setPartitionerClass(CounterKey.IdGroupPartitioner.class); FileOutputFormat.setOutputPath(job, output); String uniqueName = UUID.randomUUID().toString() + '.' + "out-format.dat"; try {//from w w w.j a v a2 s .co m DCUtils.serializeToDC(new HadoopOutputFormat(SequenceFileOutputFormat.class), uniqueName, conf); job.getConfiguration().set(ProxyOutputFormat.PROXIED_OUTPUT_FORMAT_CONF, uniqueName); job.setOutputFormatClass(ProxyOutputFormat.class); // Multioutput configuration PangoolMultipleOutputs.addNamedOutput(job, Outputs.COUNTFILE.toString(), new HadoopOutputFormat(SequenceFileOutputFormat.class), CounterKey.class, LongWritable.class); PangoolMultipleOutputs.addNamedOutput(job, Outputs.COUNTDISTINCTFILE.toString(), new HadoopOutputFormat(SequenceFileOutputFormat.class), CounterDistinctKey.class, LongPairWritable.class); } catch (URISyntaxException e) { e.printStackTrace(); throw new IOException(e); } return job; }
From source file:com.datasalt.utils.mapred.joiner.MultiJoiner.java
License:Apache License
public Job getJob() throws IOException { if (job == null) { job = new Job(conf, name); HadoopUtils.deleteIfExists(FileSystem.get(conf), outputPath); job.setJarByClass((jarByClass != null) ? jarByClass : reducer); job.setReducerClass(reducer);/* w w w .j ava 2 s .c o m*/ job.setMapOutputValueClass(MultiJoinDatum.class); job.setMapOutputKeyClass(MultiJoinPair.class); job.setOutputFormatClass(outputFormat); job.setOutputKeyClass(outputKeyClass); job.setOutputValueClass(outputValueClass); job.setGroupingComparatorClass(MultiJoinPair.GroupComparator.class); job.setPartitionerClass(MultiJoinPair.GroupPartitioner.class); FileOutputFormat.setOutputPath(job, outputPath); setMultiJoinPairClass(MultiJoinPair.class); } return job; }
From source file:com.declum.squzer.example.hbase.table2file.Export.java
License:Apache License
/** * Sets up the actual job./*from w ww . java 2 s . com*/ * * @param conf * The current configuration. * @param args * The command line parameters. * @return The newly created job. * @throws IOException * When setting up the job fails. */ public static Job createSubmittableJob(Configuration conf, String[] args) throws IOException { String tableName = args[0]; Path outputDir = new Path(args[1]); Job job = Job.getInstance(conf); job.setJobName(tableName); job.setJobName(NAME + "_" + tableName); job.setJarByClass(Exporter.class); // TODO: Allow passing filter and subset of rows/columns. Scan s = new Scan(); // Optional arguments. int versions = args.length > 2 ? Integer.parseInt(args[2]) : 1; s.setMaxVersions(versions); long startTime = args.length > 3 ? Long.parseLong(args[3]) : 0L; long endTime = args.length > 4 ? Long.parseLong(args[4]) : Long.MAX_VALUE; s.setTimeRange(startTime, endTime); s.setCacheBlocks(false); if (conf.get(TableInputFormat.SCAN_COLUMN_FAMILY) != null) { s.addFamily(Bytes.toBytes(conf.get(TableInputFormat.SCAN_COLUMN_FAMILY))); } LOG.info("verisons=" + versions + ", starttime=" + startTime + ", endtime=" + endTime); TableMapReduceUtil.initTableMapperJob(tableName, s, Exporter.class, null, null, job); // No reducers. Just write straight to output files. job.setNumReduceTasks(0); job.setOutputFormatClass(SequenceFileOutputFormat.class); job.setOutputKeyClass(ImmutableBytesWritable.class); job.setOutputValueClass(Result.class); FileOutputFormat.setOutputPath(job, outputDir); return job; }
From source file:com.digitalpebble.behemoth.mahout.BehemothDocumentProcessor.java
License:Apache License
/** * Convert the input documents into token array using the * {@link StringTuple} The input documents has to be in the * {@link org.apache.hadoop.io.SequenceFile} format * /* w w w . j av a 2 s . c o m*/ * @param input * input directory of the documents in * {@link org.apache.hadoop.io.SequenceFile} format * @param output * output directory were the {@link StringTuple} token array of * each document has to be created * @param type * The annotation type representing the tokens * @param feature * The name of the features holding the token value * @throws IOException * @throws ClassNotFoundException * @throws InterruptedException */ public static void tokenizeDocuments(Path input, String type, String feature, Path output) throws IOException, InterruptedException, ClassNotFoundException { Configuration conf = new Configuration(); // this conf parameter needs to be set enable serialisation of conf // values conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization," + "org.apache.hadoop.io.serializer.WritableSerialization"); conf.set(TOKEN_TYPE, type); conf.set(FEATURE_NAME, feature); Job job = new Job(conf); job.setJobName("DocumentProcessor::BehemothTokenizer: input-folder: " + input); job.setJarByClass(BehemothDocumentProcessor.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(StringTuple.class); FileInputFormat.setInputPaths(job, input); FileOutputFormat.setOutputPath(job, output); job.setMapperClass(BehemothTokenizerMapper.class); job.setInputFormatClass(SequenceFileInputFormat.class); job.setNumReduceTasks(0); job.setOutputFormatClass(SequenceFileOutputFormat.class); HadoopUtil.delete(conf, output); boolean succeeded = job.waitForCompletion(true); if (!succeeded) throw new IllegalStateException("Job failed!"); }