List of usage examples for org.apache.hadoop.mapreduce.Job#setOutputValueClass
public void setOutputValueClass(Class<?> theClass) throws IllegalStateException
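Before the per-project examples, here is a minimal word-count-style sketch of the typical call site. The ExampleDriver, ExampleMapper, and ExampleReducer names are hypothetical placeholders, not taken from any of the projects listed below; the point is that setOutputValueClass declares the value type the job's final output records carry (it must match what the reducer writes), and the call fails with IllegalStateException once the job has been submitted.

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

// Hypothetical minimal driver, for illustration only.
public class ExampleDriver {

    public static class ExampleMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        private static final IntWritable ONE = new IntWritable(1);
        private final Text word = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            StringTokenizer itr = new StringTokenizer(value.toString());
            while (itr.hasMoreTokens()) {
                word.set(itr.nextToken());
                context.write(word, ONE);
            }
        }
    }

    public static class ExampleReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable v : values) {
                sum += v.get();
            }
            context.write(key, new IntWritable(sum));
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "setOutputValueClass example");
        job.setJarByClass(ExampleDriver.class);
        job.setMapperClass(ExampleMapper.class);
        job.setReducerClass(ExampleReducer.class);
        job.setOutputKeyClass(Text.class);
        // Declares the value class of the job's final output; must match what the
        // reducer emits and must be set before submission, otherwise the call
        // throws IllegalStateException.
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}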
From source file:com.daleway.training.hadoop.pagerank.PageRankSimple.java
License:Apache License
public static Job createJob(Configuration conf, String inputPath, String outputPath) throws IOException {
    Job job = new Job(conf, "pair wise count");
    job.setJarByClass(PageRankSimple.class);
    job.setMapperClass(PageRankMapper.class);
    //job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(PageRankReducer.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    return job;
}
From source file:com.datasalt.pangool.benchmark.secondarysort.HadoopSecondarySort.java
License:Apache License
public static void main(String[] args) throws Exception { Configuration conf = new Configuration(); String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); if (otherArgs.length != 2) { System.err.println("Usage: secondarysrot <in> <out>"); System.exit(2);//from ww w . j av a 2s. c o m } Job job = new Job(conf, "Hadoop Secondary Sort"); FileSystem fS = FileSystem.get(conf); fS.delete(new Path(otherArgs[1]), true); job.setJarByClass(HadoopSecondarySort.class); job.setMapperClass(MapClass.class); job.setReducerClass(Reduce.class); job.setPartitionerClass(KeyPartitioner.class); job.setGroupingComparatorClass(GroupingComparator.class); job.setMapOutputKeyClass(ComplexType.class); job.setMapOutputValueClass(DoubleWritable.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(NullWritable.class); FileInputFormat.addInputPath(job, new Path(otherArgs[0])); FileOutputFormat.setOutputPath(job, new Path(otherArgs[1])); job.waitForCompletion(true); }
From source file:com.datasalt.pangool.benchmark.wordcount.HadoopWordCount.java
License:Apache License
public static void main(String[] args) throws Exception { Configuration conf = new Configuration(); String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); if (otherArgs.length != 2) { System.err.println("Usage: wordcount <in> <out>"); System.exit(2);/*from w w w.j av a 2 s.c om*/ } //conf.setBoolean("hadoop.security.authorization", false); //conf.set("hadoop.security.authentication","simple"); Job job = new Job(conf, "word count"); job.setJarByClass(HadoopWordCount.class); job.setMapperClass(TokenizerMapper.class); job.setCombinerClass(IntSumReducer.class); job.setReducerClass(IntSumReducer.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); FileInputFormat.addInputPath(job, new Path(otherArgs[0])); HadoopUtils.deleteIfExists(FileSystem.get(conf), new Path(otherArgs[1])); FileOutputFormat.setOutputPath(job, new Path(otherArgs[1])); job.waitForCompletion(true); }
From source file:com.datasalt.pangool.tuplemr.MapOnlyJobBuilder.java
License:Apache License
public Job createJob() throws IOException, TupleMRException, URISyntaxException {
    // perform a deep copy of the configuration
    this.conf = new Configuration(this.conf);
    String uniqueName = UUID.randomUUID().toString() + '.' + "out-format.dat";
    try {
        InstancesDistributor.distribute(outputFormat, uniqueName, conf);
        instanceFilesCreated.add(uniqueName);
    } catch (URISyntaxException e1) {
        throw new TupleMRException(e1);
    }
    Job job;
    if (jobName == null) {
        job = new Job(conf);
    } else {
        job = new Job(conf, jobName);
    }
    job.setNumReduceTasks(0);
    job.getConfiguration().set(ProxyOutputFormat.PROXIED_OUTPUT_FORMAT_CONF, uniqueName);
    job.setOutputFormatClass(ProxyOutputFormat.class);
    if (outputKeyClass == null) {
        throw new TupleMRException("Output spec must be defined, use setOutput()");
    }
    job.setOutputKeyClass(outputKeyClass);
    job.setOutputValueClass(outputValueClass);
    FileOutputFormat.setOutputPath(job, outputPath);
    Input lastInput = null;
    for (Input input : multipleInputs.getMultiInputs()) {
        if (input.inputProcessor == null) {
            input.inputProcessor = mapOnlyMapper;
            if (input.inputProcessor == null) {
                throw new TupleMRException("Either mapOnlyMapper property or full Input spec must be set.");
            }
        }
        lastInput = input;
    }
    if (lastInput == null) {
        throw new TupleMRException("At least one input must be specified");
    }
    job.setJarByClass((jarByClass != null) ? jarByClass : lastInput.inputProcessor.getClass());
    instanceFilesCreated.addAll(multipleInputs.configureJob(job));
    instanceFilesCreated.addAll(namedOutputs.configureJob(job));
    return job;
}
From source file:com.datasalt.pangool.tuplemr.mapred.lib.output.PangoolMultipleOutputs.java
License:Apache License
public synchronized RecordWriter getRecordWriter(String baseFileName) throws IOException, InterruptedException {
    // Look for record-writer in the cache
    OutputContext context = outputContexts.get(baseFileName);
    // If not in cache, create a new one
    if (context == null) {
        context = new OutputContext();
        OutputFormat mainOutputFormat;
        try {
            mainOutputFormat = ((OutputFormat) ReflectionUtils.newInstance(this.context.getOutputFormatClass(),
                    this.context.getConfiguration()));
        } catch (ClassNotFoundException e1) {
            throw new RuntimeException(e1);
        }
        ProxyOutputCommitter baseOutputCommitter = ((ProxyOutputCommitter) mainOutputFormat
                .getOutputCommitter(this.context));
        // The trick is to create a new Job for each output
        Job job = new Job(this.context.getConfiguration());
        job.setOutputKeyClass(getNamedOutputKeyClass(this.context, baseFileName));
        job.setOutputValueClass(getNamedOutputValueClass(this.context, baseFileName));
        // Check possible specific context for the output
        setSpecificNamedOutputContext(this.context.getConfiguration(), job, baseFileName);
        TaskAttemptContext taskContext;
        try {
            taskContext = TaskAttemptContextFactory.get(job.getConfiguration(), this.context.getTaskAttemptID());
        } catch (Exception e) {
            throw new IOException(e);
        }
        // First we change the output dir for the new OutputFormat that we will create
        // We put it inside the main output work path -> in case the Job fails,
        // everything will be discarded
        taskContext.getConfiguration().set("mapred.output.dir",
                baseOutputCommitter.getBaseDir() + "/" + baseFileName);
        // This is for Hadoop 2.0 :
        taskContext.getConfiguration().set("mapreduce.output.fileoutputformat.outputdir",
                baseOutputCommitter.getBaseDir() + "/" + baseFileName);
        context.taskAttemptContext = taskContext;
        // Load the OutputFormat instance
        OutputFormat outputFormat = InstancesDistributor.loadInstance(
                context.taskAttemptContext.getConfiguration(), OutputFormat.class,
                getNamedOutputFormatInstanceFile(this.context, baseFileName), true);
        // We have to create a JobContext for meeting the contract of the OutputFormat
        JobContext jobContext;
        try {
            jobContext = JobContextFactory.get(taskContext.getConfiguration(), taskContext.getJobID());
        } catch (Exception e) {
            throw new IOException(e);
        }
        context.jobContext = jobContext;
        // The contract of the OutputFormat is to check the output specs
        outputFormat.checkOutputSpecs(jobContext);
        // We get the output committer so we can call it later
        context.outputCommitter = outputFormat.getOutputCommitter(taskContext);
        // Save the RecordWriter to cache it
        context.recordWriter = outputFormat.getRecordWriter(taskContext);
        // if counters are enabled, wrap the writer with context to increment counters
        if (countersEnabled) {
            context.recordWriter = new RecordWriterWithCounter(context.recordWriter, baseFileName, this.context);
        }
        outputContexts.put(baseFileName, context);
    }
    return context.recordWriter;
}
From source file:com.datasalt.pangool.tuplemr.TupleMRBuilder.java
License:Apache License
public Job createJob() throws IOException, TupleMRException {
    failIfNull(tupleReducer, "Need to set a group handler");
    failIfEmpty(multipleInputs.getMultiInputs(), "Need to add at least one input");
    failIfNull(outputFormat, "Need to set output format");
    failIfNull(outputKeyClass, "Need to set outputKeyClass");
    failIfNull(outputValueClass, "Need to set outputValueClass");
    failIfNull(outputPath, "Need to set outputPath");
    // perform a deep copy of the Configuration
    this.conf = new Configuration(this.conf);
    TupleMRConfig tupleMRConf = buildConf();
    // Serialize PangoolConf in Hadoop Configuration
    instanceFilesCreated.addAll(TupleMRConfig.set(tupleMRConf, conf));
    Job job = (jobName == null) ? new Job(conf) : new Job(conf, jobName);
    if (tupleMRConf.getRollupFrom() != null) {
        job.setReducerClass(RollupReducer.class);
    } else {
        job.setReducerClass(SimpleReducer.class);
    }
    if (tupleCombiner != null) {
        job.setCombinerClass(SimpleCombiner.class); // not rollup by now
        // Set Combiner Handler
        String uniqueName = UUID.randomUUID().toString() + '.' + "combiner-handler.dat";
        try {
            InstancesDistributor.distribute(tupleCombiner, uniqueName, job.getConfiguration());
            instanceFilesCreated.add(uniqueName);
            job.getConfiguration().set(SimpleCombiner.CONF_COMBINER_HANDLER, uniqueName);
        } catch (URISyntaxException e1) {
            throw new TupleMRException(e1);
        }
    }
    // Set Tuple Reducer
    try {
        String uniqueName = UUID.randomUUID().toString() + '.' + "group-handler.dat";
        InstancesDistributor.distribute(tupleReducer, uniqueName, job.getConfiguration());
        instanceFilesCreated.add(uniqueName);
        job.getConfiguration().set(SimpleReducer.CONF_REDUCER_HANDLER, uniqueName);
    } catch (URISyntaxException e1) {
        throw new TupleMRException(e1);
    }
    // Enabling serialization
    TupleSerialization.enableSerialization(job.getConfiguration());
    job.setJarByClass((jarByClass != null) ? jarByClass : tupleReducer.getClass());
    job.setMapOutputKeyClass(DatumWrapper.class);
    job.setMapOutputValueClass(NullWritable.class);
    job.setPartitionerClass(TupleHashPartitioner.class);
    job.setGroupingComparatorClass(GroupComparator.class);
    job.setSortComparatorClass(SortComparator.class);
    job.setOutputKeyClass(outputKeyClass);
    job.setOutputValueClass(outputValueClass);
    FileOutputFormat.setOutputPath(job, outputPath);
    instanceFilesCreated.addAll(multipleInputs.configureJob(job));
    instanceFilesCreated.addAll(namedOutputs.configureJob(job));
    // Configure a {@link ProxyOutputFormat} for Pangool's Multiple Outputs to
    // work: {@link PangoolMultipleOutput}
    String uniqueName = UUID.randomUUID().toString() + '.' + "out-format.dat";
    try {
        InstancesDistributor.distribute(outputFormat, uniqueName, conf);
        instanceFilesCreated.add(uniqueName);
    } catch (URISyntaxException e1) {
        throw new TupleMRException(e1);
    }
    job.getConfiguration().set(ProxyOutputFormat.PROXIED_OUTPUT_FORMAT_CONF, uniqueName);
    job.setOutputFormatClass(ProxyOutputFormat.class);
    return job;
}
From source file:com.datasalt.utils.mapred.counter.MapRedCounter.java
License:Apache License
protected static Job buildMapRedCounterJobWithoutCombiner(String name,
        @SuppressWarnings("rawtypes") Class<? extends OutputFormat> outputFormat, String outPath,
        Configuration conf) throws IOException {
    Job job = new Job(conf, name);
    Path output = new Path(outPath);
    HadoopUtils.deleteIfExists(FileSystem.get(conf), output);
    job.setJarByClass(MapRedCounter.class);
    job.setReducerClass(MapRedCountReducer.class);
    job.setMapOutputKeyClass(CounterKey.class);
    job.setMapOutputValueClass(CounterValue.class);
    job.setOutputFormatClass(outputFormat);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(NullWritable.class);
    // Secondary sorting configuration.
    job.setGroupingComparatorClass(CounterKey.IdGroupComparator.class);
    job.setPartitionerClass(CounterKey.IdGroupPartitioner.class);
    FileOutputFormat.setOutputPath(job, output);
    String uniqueName = UUID.randomUUID().toString() + '.' + "out-format.dat";
    try {
        DCUtils.serializeToDC(new HadoopOutputFormat(SequenceFileOutputFormat.class), uniqueName, conf);
        job.getConfiguration().set(ProxyOutputFormat.PROXIED_OUTPUT_FORMAT_CONF, uniqueName);
        job.setOutputFormatClass(ProxyOutputFormat.class);
        // Multioutput configuration
        PangoolMultipleOutputs.addNamedOutput(job, Outputs.COUNTFILE.toString(),
                new HadoopOutputFormat(SequenceFileOutputFormat.class), CounterKey.class, LongWritable.class);
        PangoolMultipleOutputs.addNamedOutput(job, Outputs.COUNTDISTINCTFILE.toString(),
                new HadoopOutputFormat(SequenceFileOutputFormat.class), CounterDistinctKey.class,
                LongPairWritable.class);
    } catch (URISyntaxException e) {
        e.printStackTrace();
        throw new IOException(e);
    }
    return job;
}
From source file:com.declum.squzer.example.hbase.table2file.Export.java
License:Apache License
/**
 * Sets up the actual job.
 *
 * @param conf
 *            The current configuration.
 * @param args
 *            The command line parameters.
 * @return The newly created job.
 * @throws IOException
 *             When setting up the job fails.
 */
public static Job createSubmittableJob(Configuration conf, String[] args) throws IOException {
    String tableName = args[0];
    Path outputDir = new Path(args[1]);
    Job job = Job.getInstance(conf);
    job.setJobName(tableName);
    job.setJobName(NAME + "_" + tableName);
    job.setJarByClass(Exporter.class);
    // TODO: Allow passing filter and subset of rows/columns.
    Scan s = new Scan();
    // Optional arguments.
    int versions = args.length > 2 ? Integer.parseInt(args[2]) : 1;
    s.setMaxVersions(versions);
    long startTime = args.length > 3 ? Long.parseLong(args[3]) : 0L;
    long endTime = args.length > 4 ? Long.parseLong(args[4]) : Long.MAX_VALUE;
    s.setTimeRange(startTime, endTime);
    s.setCacheBlocks(false);
    if (conf.get(TableInputFormat.SCAN_COLUMN_FAMILY) != null) {
        s.addFamily(Bytes.toBytes(conf.get(TableInputFormat.SCAN_COLUMN_FAMILY)));
    }
    LOG.info("versions=" + versions + ", starttime=" + startTime + ", endtime=" + endTime);
    TableMapReduceUtil.initTableMapperJob(tableName, s, Exporter.class, null, null, job);
    // No reducers. Just write straight to output files.
    job.setNumReduceTasks(0);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(ImmutableBytesWritable.class);
    job.setOutputValueClass(Result.class);
    FileOutputFormat.setOutputPath(job, outputDir);
    return job;
}
From source file:com.digitalpebble.behemoth.mahout.BehemothDocumentProcessor.java
License:Apache License
/**
 * Converts the input documents into a token array using {@link StringTuple}.
 * The input documents have to be in {@link org.apache.hadoop.io.SequenceFile}
 * format.
 *
 * @param input
 *            input directory of the documents in
 *            {@link org.apache.hadoop.io.SequenceFile} format
 * @param output
 *            output directory where the {@link StringTuple} token array of
 *            each document has to be created
 * @param type
 *            The annotation type representing the tokens
 * @param feature
 *            The name of the feature holding the token value
 * @throws IOException
 * @throws ClassNotFoundException
 * @throws InterruptedException
 */
public static void tokenizeDocuments(Path input, String type, String feature, Path output)
        throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration();
    // this conf parameter needs to be set to enable serialisation of conf values
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
    conf.set(TOKEN_TYPE, type);
    conf.set(FEATURE_NAME, feature);
    Job job = new Job(conf);
    job.setJobName("DocumentProcessor::BehemothTokenizer: input-folder: " + input);
    job.setJarByClass(BehemothDocumentProcessor.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(StringTuple.class);
    FileInputFormat.setInputPaths(job, input);
    FileOutputFormat.setOutputPath(job, output);
    job.setMapperClass(BehemothTokenizerMapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setNumReduceTasks(0);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    HadoopUtil.delete(conf, output);
    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded)
        throw new IllegalStateException("Job failed!");
}
From source file:com.digitalpebble.behemoth.mahout.BehemothDocumentProcessor.java
License:Apache License
/**
 * Converts the input documents into a token array using {@link StringTuple}.
 * The input documents have to be in {@link org.apache.hadoop.io.SequenceFile}
 * format.
 *
 * @param input
 *            input directory of the documents in
 *            {@link org.apache.hadoop.io.SequenceFile} format
 * @param output
 *            output directory where the {@link StringTuple} token array of
 *            each document has to be created
 * @param analyzerClass
 *            The Lucene {@link Analyzer} for tokenizing the UTF-8 text
 */
public static void tokenizeDocuments(Path input, Class<? extends Analyzer> analyzerClass, Path output,
        Configuration baseConf) throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration(baseConf);
    // this conf parameter needs to be set to enable serialisation of conf values
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
    conf.set(ANALYZER_CLASS, analyzerClass.getName());
    Job job = new Job(conf);
    job.setJobName("DocumentProcessor::LuceneTokenizer: input-folder: " + input);
    job.setJarByClass(BehemothDocumentProcessor.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(StringTuple.class);
    FileInputFormat.setInputPaths(job, input);
    FileOutputFormat.setOutputPath(job, output);
    job.setMapperClass(LuceneTokenizerMapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setNumReduceTasks(0);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    HadoopUtil.delete(conf, output);
    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded)
        throw new IllegalStateException("Job failed!");
}