List of usage examples for org.apache.hadoop.mapred.JobConf setCombinerClass
public void setCombinerClass(Class<? extends Reducer> theClass)
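Before the project-specific examples below, here is a minimal, hypothetical sketch of where setCombinerClass typically sits in an old-API (org.apache.hadoop.mapred) job setup. The class names WordCount, TokenizeMapper, and SumReducer are placeholders and not taken from any example on this page; reusing the reducer as the combiner, as sketched here, is only valid when the reduce function is associative and commutative (such as summing counts).

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;

public class WordCount {
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(WordCount.class);
        conf.setJobName("wordcount");

        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(IntWritable.class);

        // TokenizeMapper and SumReducer are placeholder Mapper/Reducer implementations.
        conf.setMapperClass(TokenizeMapper.class);
        // The combiner runs map-side on each mapper's output before the shuffle;
        // the reducer class doubles as the combiner because summing is associative.
        conf.setCombinerClass(SumReducer.class);
        conf.setReducerClass(SumReducer.class);

        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        JobClient.runJob(conf);
    }
}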
From source file: net.peacesoft.nutch.crawl.ReLinkDb.java
License: Apache License

private static JobConf createJob(Configuration config, Path linkDb, boolean normalize, boolean filter) {
    Path newLinkDb = new Path("linkdb-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    JobConf job = new NutchJob(config);
    job.setJobName("linkdb " + linkDb);

    job.setInputFormat(SequenceFileInputFormat.class);

    job.setMapperClass(ReLinkDb.class);
    job.setCombinerClass(LinkDbMerger.class);
    // if we don't run the mergeJob, perform normalization/filtering now
    if (normalize || filter) {
        try {
            FileSystem fs = FileSystem.get(config);
            if (!fs.exists(linkDb)) {
                job.setBoolean(LinkDbFilter.URL_FILTERING, filter);
                job.setBoolean(LinkDbFilter.URL_NORMALIZING, normalize);
            }
        } catch (Exception e) {
            LOG.warn("ReLinkDb createJob: " + e);
        }
    }
    job.setReducerClass(LinkDbMerger.class);

    FileOutputFormat.setOutputPath(job, newLinkDb);
    job.setOutputFormat(MapFileOutputFormat.class);
    job.setBoolean("mapred.output.compress", true);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Inlinks.class);

    return job;
}
From source file: net.team1.dev.HousingAnalysis.java
License: Apache License

/**
 * The main entry point for the map/reduce runner.
 *
 * @param args 2 args: <input dir> <output dir>
 * @throws Exception Throws IOException
 */
public static void main(String[] args) throws Exception {
    Path inputDir = new Path(args[0]);
    Path outputDir = new Path(args[1]);
    FileSystem fs = FileSystem.get(new Configuration());

    if (!fs.exists(inputDir))
        throw new IOException("The input path does not exist.");
    if (fs.isFile(inputDir))
        throw new IOException("The input path is a file.");
    if (fs.exists(outputDir))
        fs.delete(outputDir, true);

    // set job configuration
    JobConf conf = new JobConf(HousingAnalysis.class);
    conf.setJobName("housinganalysis");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setOutputFormat(TextOutputFormat.class);
    conf.setCombinerClass(HousingReducer.class);
    conf.setReducerClass(HousingReducer.class);

    // set multiple input files
    HashMap<Path, Class<? extends Mapper>> inputMappers = getInputFilePaths(inputDir, fs);
    for (Path p : inputMappers.keySet()) {
        MultipleInputs.addInputPath(conf, p, TextInputFormat.class, inputMappers.get(p));
        LOG.info(p.getName() + ": " + inputMappers.get(p).getName());
    }

    // set output
    FileOutputFormat.setOutputPath(conf, outputDir);

    // start the job
    JobClient.runJob(conf);
}
From source file: nl.tudelft.graphalytics.mapreducev2.MapReduceJob.java
License: Apache License

@Override
public int run(String[] args) throws Exception {
    FileSystem dfs = FileSystem.get(getConf());
    String inPath = inputPath;
    while (!isFinished()) {
        iteration++;

        // Prepare job configuration
        JobConf jobConfiguration = new JobConf(this.getConf());
        jobConfiguration.setJarByClass(this.getClass());

        jobConfiguration.setMapOutputKeyClass(getMapOutputKeyClass());
        jobConfiguration.setMapOutputValueClass(getMapOutputValueClass());

        jobConfiguration.setMapperClass(getMapperClass());
        if (getCombinerClass() != null)
            jobConfiguration.setCombinerClass(getCombinerClass());
        jobConfiguration.setReducerClass(getReducerClass());

        jobConfiguration.setOutputKeyClass(getOutputKeyClass());
        jobConfiguration.setOutputValueClass(getOutputValueClass());

        jobConfiguration.setInputFormat(getInputFormatClass());
        jobConfiguration.setOutputFormat(getOutputFormatClass());

        if (getNumMappers() != -1)
            jobConfiguration.setNumMapTasks(getNumMappers());
        if (getNumReducers() != -1)
            jobConfiguration.setNumReduceTasks(getNumReducers());

        setConfigurationParameters(jobConfiguration);

        // Set the input and output paths
        String outPath = intermediatePath + "/iteration-" + iteration;
        FileInputFormat.addInputPath(jobConfiguration, new Path(inPath));
        FileOutputFormat.setOutputPath(jobConfiguration, new Path(outPath));

        // Execute the current iteration
        RunningJob jobExecution = JobClient.runJob(jobConfiguration);
        jobExecution.waitForCompletion();

        // Remove the output of the previous job (unless it is the input graph)
        if (iteration != 1) {
            dfs.delete(new Path(inPath), true);
        }
        inPath = outPath;

        processJobOutput(jobExecution);
    }

    // Rename the last job output to the specified output path
    try {
        dfs.mkdirs(new Path(outputPath).getParent());
        dfs.rename(new Path(inPath), new Path(outputPath));
    } catch (Exception e) {
        LOG.warn("Failed to rename MapReduce job output.", e);
    }

    return 0;
}
From source file: nthu.scopelab.tsqr.ssvd.BtJob.java
License: Apache License

public static void run(Configuration conf, Path[] inputPath, Path btPath, String qrfPath, int k, int p,
        int outerBlockHeight, int reduceTasks, boolean outputBBtProducts, String reduceSchedule, int mis)
        throws Exception {
    boolean outputQ = true;

    String stages[] = reduceSchedule.split(",");

    JobConf job = new JobConf(conf, BtJob.class);
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setInt(SCHEDULE_NUM, stages.length);
    job.setInt(PROP_OUTER_PROD_BLOCK_HEIGHT, outerBlockHeight);
    job.setInt(QJob.PROP_K, k);
    job.setInt(QJob.PROP_P, p);
    job.setBoolean(QmultiplyJob.OUTPUT_Q, outputQ);
    job.setBoolean(PROP_OUPTUT_BBT_PRODUCTS, outputBBtProducts);
    job.set(QmultiplyJob.QRF_DIR, qrfPath);
    FileSystem.get(job).delete(btPath, true);

    FileOutputFormat.setOutputPath(job, btPath);
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

    job.setJobName("BtJob");

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(SparseRowBlockWritable.class);

    job.setOutputKeyClass(IntWritable.class);
    //job.setOutputValueClass(SparseRowBlockWritable.class);
    job.setOutputValueClass(VectorWritable.class);

    job.setMapperClass(BtMapper.class);
    job.setCombinerClass(OuterProductCombiner.class);
    job.setReducerClass(OuterProductReducer.class);

    fileGather fgather = new fileGather(inputPath, "", FileSystem.get(job));
    mis = Checker.checkMis(mis, fgather.getInputSize(), FileSystem.get(job));
    job.setNumMapTasks(fgather.recNumMapTasks(mis));

    //job.setNumReduceTasks(0);
    job.setNumReduceTasks(reduceTasks);

    FileInputFormat.setInputPaths(job, inputPath);

    if (outputQ) {
        MultipleOutputs.addNamedOutput(job, QmultiplyJob.Q_MAT, SequenceFileOutputFormat.class,
                IntWritable.class, LMatrixWritable.class);
    }
    if (outputBBtProducts) {
        MultipleOutputs.addNamedOutput(job, OUTPUT_BBT, SequenceFileOutputFormat.class,
                IntWritable.class, VectorWritable.class);
    }

    RunningJob rj = JobClient.runJob(job);
    System.out.println("Btjob Job ID: " + rj.getJobID().toString());
}
From source file: nthu.scopelab.tsqr.ssvd.itBtJob.java
License: Apache License

public static void run(Configuration conf, Path[] inputPath, Path btPath, String qrfPath, int k, int p,
        int outerBlockHeight, int reduceTasks, boolean outputBBtProducts, String reduceSchedule, int mis)
        throws Exception {
    boolean outputQ = true;

    String stages[] = reduceSchedule.split(",");

    JobConf job = new JobConf(conf, itBtJob.class);
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setInt(SCHEDULE_NUM, stages.length);
    job.setInt(PROP_OUTER_PROD_BLOCK_HEIGHT, outerBlockHeight);
    job.setInt(QJob.PROP_K, k);
    job.setInt(QJob.PROP_P, p);
    job.setBoolean(QmultiplyJob.OUTPUT_Q, outputQ);
    job.setBoolean(PROP_OUPTUT_BBT_PRODUCTS, outputBBtProducts);
    job.set(QmultiplyJob.QRF_DIR, qrfPath);
    FileSystem.get(job).delete(btPath, true);

    FileOutputFormat.setOutputPath(job, btPath);
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

    job.setJobName("itBtJob");

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(SparseRowBlockWritable.class);

    job.setOutputKeyClass(IntWritable.class);
    //job.setOutputValueClass(SparseRowBlockWritable.class);
    job.setOutputValueClass(VectorWritable.class);

    job.setMapperClass(BtMapper.class);
    job.setCombinerClass(OuterProductCombiner.class);
    job.setReducerClass(OuterProductReducer.class);

    fileGather fgather = new fileGather(inputPath, "", FileSystem.get(job));
    mis = Checker.checkMis(mis, fgather.getInputSize(), FileSystem.get(job));
    job.setNumMapTasks(fgather.recNumMapTasks(mis));

    //job.setNumReduceTasks(0);
    job.setNumReduceTasks(reduceTasks);

    FileInputFormat.setInputPaths(job, inputPath);

    if (outputQ) {
        MultipleOutputs.addNamedOutput(job, QmultiplyJob.Q_MAT, SequenceFileOutputFormat.class,
                IntWritable.class, LMatrixWritable.class);
    }
    if (outputBBtProducts) {
        MultipleOutputs.addNamedOutput(job, OUTPUT_BBT, SequenceFileOutputFormat.class,
                IntWritable.class, VectorWritable.class);
    }

    RunningJob rj = JobClient.runJob(job);
    System.out.println("itBtJob Job ID: " + rj.getJobID().toString());
}
From source file: org.acacia.csr.java.LineCount.java
License: Apache License

public static void main(String[] args) throws Exception {
    /*
    String dir1 = "/user/miyuru/wcout";
    String dir2 = "/user/miyuru/lcout";
    //We first delete the temporary directories if they exist on the HDFS
    FileSystem fs1 = FileSystem.get(new JobConf());

    if(fs1.exists(new Path(dir2))){
        fs1.delete(new Path(dir2), true);
    }

    JobConf conf = new JobConf(LineCount.class);
    conf.setJobName("LineCount");
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(IntWritable.class);
    conf.setMapperClass(Map.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);
    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);
    FileInputFormat.setInputPaths(conf, new Path(dir1));
    FileOutputFormat.setOutputPath(conf, new Path(dir2));

    Job job = new Job(conf, "line count");
    job.waitForCompletion(true);
    org.apache.hadoop.mapreduce.Counters cntr = job.getCounters();
    System.out.println("Number of lines in the file"
            + cntr.findCounter("org.apache.hadoop.mapred.Task$Counter", "MAP_INPUT_RECORDS").getValue());
    */
    long edgeCount = 0;
    //String dir3 = "/user/miyuru/wcout";
    String dir4 = "/user/miyuru/lcout";
    String dir5 = "/user/miyuru/input";

    //We first delete the temporary directories if they exist on the HDFS
    FileSystem fs2 = FileSystem.get(new JobConf());

    if (fs2.exists(new Path(dir4))) {
        fs2.delete(new Path(dir4), true);
    }

    JobConf conf1 = new JobConf(LineCount.class);
    conf1.setJobName("LineCount");
    conf1.setOutputKeyClass(Text.class);
    conf1.setOutputValueClass(IntWritable.class);
    conf1.setMapperClass(Map.class);
    conf1.setCombinerClass(Reduce.class);
    conf1.setReducerClass(Reduce.class);
    conf1.setInputFormat(TextInputFormat.class);
    conf1.setOutputFormat(TextOutputFormat.class);
    FileInputFormat.setInputPaths(conf1, new Path(dir5));
    FileOutputFormat.setOutputPath(conf1, new Path(dir4));

    Job job1 = new Job(conf1, "line count");
    job1.setNumReduceTasks(0);
    job1.waitForCompletion(true);
    org.apache.hadoop.mapreduce.Counters cntr = job1.getCounters();
    edgeCount = cntr.findCounter("org.apache.hadoop.mapred.Task$Counter", "MAP_INPUT_RECORDS").getValue();

    File efile = new File("/tmp/efile");
    if (efile.exists()) {
        efile.delete();
    }

    PrintWriter writer = new PrintWriter("/tmp/efile", "UTF-8");
    writer.println(edgeCount);
    writer.flush();
    writer.close();

    //edgeCount = edgeCount - 1; //This is to remove the line number additionally added to each edgelist file by HDFS. This is strange, but it happens.
    System.out.println("======>Edge count is : " + edgeCount);
    System.out.println("------Done Line Count---------------");
}
From source file: org.acacia.csr.java.NotInFinder.java
License: Apache License

public static void main(String[] args) throws Exception {
    String dir1 = "/user/miyuru/wcout";
    String dir2 = "/user/miyuru/notinverts";

    //We first delete the temporary directories if they exist on the HDFS
    FileSystem fs1 = FileSystem.get(new JobConf());

    if (fs1.exists(new Path(dir2))) {
        fs1.delete(new Path(dir2), true);
    }

    JobConf conf = new JobConf();
    conf.setNumMapTasks(96);
    conf.setOutputKeyClass(LongWritable.class);
    conf.setOutputValueClass(LongWritable.class);
    conf.setMapperClass(TokenizerMapper.class);
    conf.setReducerClass(IntSumReducer.class);
    conf.setCombinerClass(IntSumReducer.class);
    conf.setInputFormat(NLinesInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);
    FileInputFormat.setInputPaths(conf, new Path(dir1));
    FileOutputFormat.setOutputPath(conf, new Path(dir2));

    Job job = new Job(conf, "NotInFinder");
    job.setJarByClass(WordCount.class);
    // job.setMapperClass(TokenizerMapper.class);
    // job.setCombinerClass(IntSumReducer.class);
    // job.setReducerClass(IntSumReducer.class);
    // job.setOutputKeyClass(LongWritable.class);
    // job.setOutputValueClass(LongWritable.class);
    job.setSortComparatorClass(SortComparator.class);

    job.waitForCompletion(true);
}
From source file: org.acacia.partitioner.java.NoptSplitter.java
License: Apache License

/**
 * @param args
 */
public static void main(String[] args) {
    if (!validArgs(args)) {
        printUsage();
        return;
    }

    //These are the temp paths that are created on HDFS
    String dir1 = "/user/miyuru/edgedistributed-out/nopt";
    String dir2 = "/user/miyuru/nopt-distributed";

    //We first delete the temporary directories if they exist on the HDFS
    FileSystem fs1;
    try {
        fs1 = FileSystem.get(new JobConf());

        System.out.println("Deleting the dir : " + dir2);

        if (fs1.exists(new Path(dir2))) {
            fs1.delete(new Path(dir2), true);
        }

        // Path notinPath = new Path(dir2);
        //
        // if(!fs1.exists(notinPath)){
        //     fs1.create(notinPath);
        // }

        JobConf conf = new JobConf(NoptSplitter.class);
        // conf.setOutputKeyClass(Text.class);
        // conf.setOutputValueClass(Text.class);
        conf.setMapperClass(Map.class);
        conf.setCombinerClass(Reduce.class);
        conf.setReducerClass(Reduce.class);
        // conf.setInputFormat(TextInputFormat.class);
        // conf.setOutputFormat(TextOutputFormat.class);
        FileInputFormat.setInputPaths(conf, new Path(dir1));
        FileOutputFormat.setOutputPath(conf, new Path(dir2));

        Job job1 = new Job(conf, "nopt_splitter");
        job1.setNumReduceTasks(Integer.parseInt(args[0])); //The most important point in this job
        job1.waitForCompletion(true);
    } catch (IOException e) {
        e.printStackTrace();
    } catch (InterruptedException e) {
        e.printStackTrace();
    } catch (ClassNotFoundException e) {
        e.printStackTrace();
    }
}
From source file: org.apache.avro.mapred.AvroJob.java
License: Apache License

/** Configure a job's combiner implementation. */
public static void setCombinerClass(JobConf job, Class<? extends AvroReducer> c) {
    job.set(COMBINER, c.getName());
    job.setCombinerClass(HadoopCombiner.class);
}
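A hedged usage sketch for the Avro helper above, assuming a caller with its own AvroReducer subclass; MyAvroReducer is a placeholder name, not part of the Avro library. The helper records the caller's class name in the job configuration and installs Avro's HadoopCombiner adapter as the actual Hadoop combiner, so JobConf.setCombinerClass is never called with the user class directly.

// Hypothetical caller code; MyAvroReducer extends org.apache.avro.mapred.AvroReducer.
JobConf job = new JobConf();
AvroJob.setCombinerClass(job, MyAvroReducer.class);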
From source file: org.apache.avro.mapred.TestWordCountGeneric.java
License: Apache License

@Test
@SuppressWarnings("deprecation")
public void testJob() throws Exception {
    String dir = System.getProperty("test.dir", ".") + "/mapred";
    Path outputPath = new Path(dir + "/out");
    JobConf job = new JobConf();
    try {
        WordCountUtil.writeLinesFile();

        job.setJobName("wordcount");

        AvroJob.setInputGeneric(job, Schema.create(Schema.Type.STRING));
        AvroJob.setOutputGeneric(job, WordCount.SCHEMA$);

        job.setMapperClass(MapImpl.class);
        job.setCombinerClass(ReduceImpl.class);
        job.setReducerClass(ReduceImpl.class);

        FileInputFormat.setInputPaths(job, new Path(dir + "/in"));
        FileOutputFormat.setOutputPath(job, outputPath);
        FileOutputFormat.setCompressOutput(job, true);

        JobClient.runJob(job);

        WordCountUtil.validateCountsFile();
    } finally {
        outputPath.getFileSystem(job).delete(outputPath);
    }
}