Example usage for org.apache.hadoop.mapred JobConf setCombinerClass

List of usage examples for org.apache.hadoop.mapred JobConf setCombinerClass

Introduction

On this page you can find example usage for org.apache.hadoop.mapred JobConf setCombinerClass.

Prototype

public void setCombinerClass(Class<? extends Reducer> theClass) 

Document

Set the user-defined combiner class used to combine map-outputs before being sent to the reducers.
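Before the project-specific examples below, here is a minimal, self-contained sketch of the most common pattern: a word-count job that reuses its reducer as the combiner. The class names (WordCountCombinerExample, WordCountMapper, WordCountReducer) and the use of command-line arguments for the input and output paths are illustrative assumptions, not code from any of the projects listed under Usage.

import java.io.IOException;
import java.util.Iterator;
import java.util.StringTokenizer;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.*;

public class WordCountCombinerExample {

    // Emits (word, 1) for every token in the input line.
    public static class WordCountMapper extends MapReduceBase
            implements Mapper<LongWritable, Text, Text, IntWritable> {
        private static final IntWritable ONE = new IntWritable(1);
        private final Text word = new Text();

        public void map(LongWritable key, Text value,
                OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
            StringTokenizer tokenizer = new StringTokenizer(value.toString());
            while (tokenizer.hasMoreTokens()) {
                word.set(tokenizer.nextToken());
                output.collect(word, ONE);
            }
        }
    }

    // Sums the counts for a word; because summation is associative and commutative,
    // the same class can serve as both combiner and reducer.
    public static class WordCountReducer extends MapReduceBase
            implements Reducer<Text, IntWritable, Text, IntWritable> {
        public void reduce(Text key, Iterator<IntWritable> values,
                OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
            int sum = 0;
            while (values.hasNext()) {
                sum += values.next().get();
            }
            output.collect(key, new IntWritable(sum));
        }
    }

    public static void main(String[] args) throws IOException {
        JobConf conf = new JobConf(WordCountCombinerExample.class);
        conf.setJobName("wordcount-with-combiner");

        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(IntWritable.class);

        conf.setMapperClass(WordCountMapper.class);
        // The combiner aggregates map output locally, reducing the data shuffled to reducers.
        conf.setCombinerClass(WordCountReducer.class);
        conf.setReducerClass(WordCountReducer.class);

        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);

        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        JobClient.runJob(conf);
    }
}

Note that Hadoop may run the combiner zero, one, or several times per map output, so the combine function must be associative and commutative and must accept and emit the map output key/value types. Reusing the reducer, as most of the examples below do, satisfies this for simple aggregations such as summing.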

Usage

From source file:net.peacesoft.nutch.crawl.ReLinkDb.java

License:Apache License

private static JobConf createJob(Configuration config, Path linkDb, boolean normalize, boolean filter) {
    Path newLinkDb = new Path("linkdb-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    JobConf job = new NutchJob(config);
    job.setJobName("linkdb " + linkDb);

    job.setInputFormat(SequenceFileInputFormat.class);

    job.setMapperClass(ReLinkDb.class);
    job.setCombinerClass(LinkDbMerger.class);
    // if we don't run the mergeJob, perform normalization/filtering now
    if (normalize || filter) {
        try {
            FileSystem fs = FileSystem.get(config);
            if (!fs.exists(linkDb)) {
                job.setBoolean(LinkDbFilter.URL_FILTERING, filter);
                job.setBoolean(LinkDbFilter.URL_NORMALIZING, normalize);
            }
        } catch (Exception e) {
            LOG.warn("ReLinkDb createJob: " + e);
        }
    }
    job.setReducerClass(LinkDbMerger.class);

    FileOutputFormat.setOutputPath(job, newLinkDb);
    job.setOutputFormat(MapFileOutputFormat.class);
    job.setBoolean("mapred.output.compress", true);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Inlinks.class);

    return job;
}

From source file:net.team1.dev.HousingAnalysis.java

License:Apache License

/**
 * The main entry point for the map/reduce runner.
 *
 * @param args 2 args: <input dir> <output dir>
 * @throws Exception Throws IOException
 */
public static void main(String[] args) throws Exception {
    Path inputDir = new Path(args[0]);
    Path outputDir = new Path(args[1]);
    FileSystem fs = FileSystem.get(new Configuration());

    if (!fs.exists(inputDir))
        throw new IOException("The input path does not exist.");
    if (fs.isFile(inputDir))
        throw new IOException("The input path is a file.");
    if (fs.exists(outputDir))
        fs.delete(outputDir, true);

    // set job configuration
    JobConf conf = new JobConf(HousingAnalysis.class);
    conf.setJobName("housinganalysis");
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setOutputFormat(TextOutputFormat.class);
    conf.setCombinerClass(HousingReducer.class);
    conf.setReducerClass(HousingReducer.class);

    // set multiple input files
    HashMap<Path, Class<? extends Mapper>> inputMappers = getInputFilePaths(inputDir, fs);
    for (Path p : inputMappers.keySet()) {
        MultipleInputs.addInputPath(conf, p, TextInputFormat.class, inputMappers.get(p));
        LOG.info(p.getName() + ": " + inputMappers.get(p).getName());
    }

    // set output
    FileOutputFormat.setOutputPath(conf, outputDir);

    // start the job
    JobClient.runJob(conf);
}

From source file:nl.tudelft.graphalytics.mapreducev2.MapReduceJob.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    FileSystem dfs = FileSystem.get(getConf());
    String inPath = inputPath;

    while (!isFinished()) {
        iteration++;

        // Prepare job configuration
        JobConf jobConfiguration = new JobConf(this.getConf());
        jobConfiguration.setJarByClass(this.getClass());

        jobConfiguration.setMapOutputKeyClass(getMapOutputKeyClass());
        jobConfiguration.setMapOutputValueClass(getMapOutputValueClass());

        jobConfiguration.setMapperClass(getMapperClass());
        if (getCombinerClass() != null)
            jobConfiguration.setCombinerClass(getCombinerClass());
        jobConfiguration.setReducerClass(getReducerClass());

        jobConfiguration.setOutputKeyClass(getOutputKeyClass());
        jobConfiguration.setOutputValueClass(getOutputValueClass());

        jobConfiguration.setInputFormat(getInputFormatClass());
        jobConfiguration.setOutputFormat(getOutputFormatClass());

        if (getNumMappers() != -1)
            jobConfiguration.setNumMapTasks(getNumMappers());
        if (getNumReducers() != -1)
            jobConfiguration.setNumReduceTasks(getNumReducers());

        setConfigurationParameters(jobConfiguration);

        // Set the input and output paths
        String outPath = intermediatePath + "/iteration-" + iteration;
        FileInputFormat.addInputPath(jobConfiguration, new Path(inPath));
        FileOutputFormat.setOutputPath(jobConfiguration, new Path(outPath));

        // Execute the current iteration
        RunningJob jobExecution = JobClient.runJob(jobConfiguration);
        jobExecution.waitForCompletion();

        // Remove the output of the previous job (unless it is the input graph)
        if (iteration != 1) {
            dfs.delete(new Path(inPath), true);
        }
        inPath = outPath;

        processJobOutput(jobExecution);
    }

    // Rename the last job output to the specified output path
    try {
        dfs.mkdirs(new Path(outputPath).getParent());
        dfs.rename(new Path(inPath), new Path(outputPath));
    } catch (Exception e) {
        LOG.warn("Failed to rename MapReduce job output.", e);
    }

    return 0;
}

From source file:nthu.scopelab.tsqr.ssvd.BtJob.java

License:Apache License

public static void run(Configuration conf, Path[] inputPath, Path btPath, String qrfPath, int k, int p,
        int outerBlockHeight, int reduceTasks, boolean outputBBtProducts, String reduceSchedule, int mis)
        throws Exception {
    boolean outputQ = true;

    String stages[] = reduceSchedule.split(",");

    JobConf job = new JobConf(conf, BtJob.class);
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setInt(SCHEDULE_NUM, stages.length);
    job.setInt(PROP_OUTER_PROD_BLOCK_HEIGHT, outerBlockHeight);
    job.setInt(QJob.PROP_K, k);
    job.setInt(QJob.PROP_P, p);
    job.setBoolean(QmultiplyJob.OUTPUT_Q, outputQ);
    job.setBoolean(PROP_OUPTUT_BBT_PRODUCTS, outputBBtProducts);
    job.set(QmultiplyJob.QRF_DIR, qrfPath);
    FileSystem.get(job).delete(btPath, true);

    FileOutputFormat.setOutputPath(job, btPath);

    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

    job.setJobName("BtJob");

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(SparseRowBlockWritable.class);
    job.setOutputKeyClass(IntWritable.class);
    //job.setOutputValueClass(SparseRowBlockWritable.class);
    job.setOutputValueClass(VectorWritable.class);

    job.setMapperClass(BtMapper.class);
    job.setCombinerClass(OuterProductCombiner.class);
    job.setReducerClass(OuterProductReducer.class);

    fileGather fgather = new fileGather(inputPath, "", FileSystem.get(job));
    mis = Checker.checkMis(mis, fgather.getInputSize(), FileSystem.get(job));
    job.setNumMapTasks(fgather.recNumMapTasks(mis));

    //job.setNumReduceTasks(0);
    job.setNumReduceTasks(reduceTasks);

    FileInputFormat.setInputPaths(job, inputPath);

    if (outputQ) {
        MultipleOutputs.addNamedOutput(job, QmultiplyJob.Q_MAT, SequenceFileOutputFormat.class,
                IntWritable.class, LMatrixWritable.class);
    }
    if (outputBBtProducts) {
        MultipleOutputs.addNamedOutput(job, OUTPUT_BBT, SequenceFileOutputFormat.class, IntWritable.class,
                VectorWritable.class);
    }
    RunningJob rj = JobClient.runJob(job);
    System.out.println("Btjob Job ID: " + rj.getJobID().toString());
}

From source file:nthu.scopelab.tsqr.ssvd.itBtJob.java

License:Apache License

public static void run(Configuration conf, Path[] inputPath, Path btPath, String qrfPath, int k, int p,
        int outerBlockHeight, int reduceTasks, boolean outputBBtProducts, String reduceSchedule, int mis)
        throws Exception {
    boolean outputQ = true;

    String stages[] = reduceSchedule.split(",");

    JobConf job = new JobConf(conf, itBtJob.class);
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setInt(SCHEDULE_NUM, stages.length);
    job.setInt(PROP_OUTER_PROD_BLOCK_HEIGHT, outerBlockHeight);
    job.setInt(QJob.PROP_K, k);
    job.setInt(QJob.PROP_P, p);
    job.setBoolean(QmultiplyJob.OUTPUT_Q, outputQ);
    job.setBoolean(PROP_OUPTUT_BBT_PRODUCTS, outputBBtProducts);
    job.set(QmultiplyJob.QRF_DIR, qrfPath);
    FileSystem.get(job).delete(btPath, true);

    FileOutputFormat.setOutputPath(job, btPath);

    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

    job.setJobName("itBtJob");

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(SparseRowBlockWritable.class);
    job.setOutputKeyClass(IntWritable.class);
    //job.setOutputValueClass(SparseRowBlockWritable.class);
    job.setOutputValueClass(VectorWritable.class);

    job.setMapperClass(BtMapper.class);
    job.setCombinerClass(OuterProductCombiner.class);
    job.setReducerClass(OuterProductReducer.class);

    fileGather fgather = new fileGather(inputPath, "", FileSystem.get(job));
    mis = Checker.checkMis(mis, fgather.getInputSize(), FileSystem.get(job));
    job.setNumMapTasks(fgather.recNumMapTasks(mis));

    //job.setNumReduceTasks(0);
    job.setNumReduceTasks(reduceTasks);

    FileInputFormat.setInputPaths(job, inputPath);

    if (outputQ) {
        MultipleOutputs.addNamedOutput(job, QmultiplyJob.Q_MAT, SequenceFileOutputFormat.class,
                IntWritable.class, LMatrixWritable.class);
    }
    if (outputBBtProducts) {
        MultipleOutputs.addNamedOutput(job, OUTPUT_BBT, SequenceFileOutputFormat.class, IntWritable.class,
                VectorWritable.class);
    }
    RunningJob rj = JobClient.runJob(job);
    System.out.println("itBtJob Job ID: " + rj.getJobID().toString());
}

From source file:org.acacia.csr.java.LineCount.java

License:Apache License

public static void main(String[] args) throws Exception {
    /*
      String dir1 = "/user/miyuru/wcout";
      String dir2 = "/user/miyuru/lcout";
       //We first delete the temporary directories if they exist on the HDFS
        FileSystem fs1 = FileSystem.get(new JobConf());
                
       if(fs1.exists(new Path(dir2))){
          fs1.delete(new Path(dir2), true);
       }
            
        JobConf conf = new JobConf(LineCount.class);
        conf.setJobName("LineCount");
               
        conf.setOutputKeyClass(IntWritable.class);
        conf.setOutputValueClass(IntWritable.class);
               
        conf.setMapperClass(Map.class);
        conf.setCombinerClass(Reduce.class);
        conf.setReducerClass(Reduce.class);
               
        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);
               
        FileInputFormat.setInputPaths(conf, new Path(dir1));
        FileOutputFormat.setOutputPath(conf, new Path(dir2));
               
        Job job = new Job(conf, "line count");
        job.waitForCompletion(true); 
        org.apache.hadoop.mapreduce.Counters cntr = job.getCounters();
        System .out.println("Number of lines in the file" + cntr.findCounter("org.apache.hadoop.mapred.Task$Counter", "MAP_INPUT_RECORDS").getValue());
        */

    long edgeCount = 0;
    //String dir3 = "/user/miyuru/wcout";
    String dir4 = "/user/miyuru/lcout";
    String dir5 = "/user/miyuru/input";
    //We first delete the temporary directories if they exist on the HDFS
    FileSystem fs2 = FileSystem.get(new JobConf());

    if (fs2.exists(new Path(dir4))) {
        fs2.delete(new Path(dir4), true);
    }

    JobConf conf1 = new JobConf(LineCount.class);
    conf1.setJobName("LineCount");

    conf1.setOutputKeyClass(Text.class);
    conf1.setOutputValueClass(IntWritable.class);

    conf1.setMapperClass(Map.class);
    conf1.setCombinerClass(Reduce.class);
    conf1.setReducerClass(Reduce.class);

    conf1.setInputFormat(TextInputFormat.class);
    conf1.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf1, new Path(dir5));
    FileOutputFormat.setOutputPath(conf1, new Path(dir4));

    Job job1 = new Job(conf1, "line count");
    job1.setNumReduceTasks(0);
    job1.waitForCompletion(true);
    org.apache.hadoop.mapreduce.Counters cntr = job1.getCounters();
    edgeCount = cntr.findCounter("org.apache.hadoop.mapred.Task$Counter", "MAP_INPUT_RECORDS").getValue();

    File efile = new File("/tmp/efile");

    if (efile.exists()) {
        efile.delete();
    }

    PrintWriter writer = new PrintWriter("/tmp/efile", "UTF-8");
    writer.println(edgeCount);
    writer.flush();
    writer.close();

    //edgeCount = edgeCount - 1; // This is to remove the line number additionally added to each edgelist file by HDFS. This is strange, but it happens.
    System.out.println("======>Edge count is : " + edgeCount);
    System.out.println("------Done Line Count---------------");
}

From source file:org.acacia.csr.java.NotInFinder.java

License:Apache License

public static void main(String[] args) throws Exception {
    String dir1 = "/user/miyuru/wcout";
    String dir2 = "/user/miyuru/notinverts";
    //We first delete the temporary directories if they exist on the HDFS
    FileSystem fs1 = FileSystem.get(new JobConf());

    if (fs1.exists(new Path(dir2))) {
        fs1.delete(new Path(dir2), true);
    }

    JobConf conf = new JobConf();
    conf.setNumMapTasks(96);
    conf.setOutputKeyClass(LongWritable.class);
    conf.setOutputValueClass(LongWritable.class);
    conf.setMapperClass(TokenizerMapper.class);
    conf.setReducerClass(IntSumReducer.class);
    conf.setCombinerClass(IntSumReducer.class);
    conf.setInputFormat(NLinesInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);
    FileInputFormat.setInputPaths(conf, new Path(dir1));
    FileOutputFormat.setOutputPath(conf, new Path(dir2));
    Job job = new Job(conf, "NotInFinder");
    job.setJarByClass(WordCount.class);
    //   job.setMapperClass(TokenizerMapper.class);
    //   job.setCombinerClass(IntSumReducer.class);
    //   job.setReducerClass(IntSumReducer.class);
    //   job.setOutputKeyClass(LongWritable.class);
    //   job.setOutputValueClass(LongWritable.class);

    job.setSortComparatorClass(SortComparator.class);
    job.waitForCompletion(true);

}

From source file:org.acacia.partitioner.java.NoptSplitter.java

License:Apache License

/**
 * @param args
 */
public static void main(String[] args) {
    if (!validArgs(args)) {
        printUsage();
        return;
    }
    //These are the temp paths that are created on HDFS
    String dir1 = "/user/miyuru/edgedistributed-out/nopt";
    String dir2 = "/user/miyuru/nopt-distributed";

    //We first delete the temporary directories if they exist on the HDFS
    FileSystem fs1;
    try {
        fs1 = FileSystem.get(new JobConf());

        System.out.println("Deleting the dir : " + dir2);

        if (fs1.exists(new Path(dir2))) {
            fs1.delete(new Path(dir2), true);
        }

        //         Path notinPath = new Path(dir2);
        //         
        //         if(!fs1.exists(notinPath)){
        //            fs1.create(notinPath);
        //         }

        JobConf conf = new JobConf(NoptSplitter.class);
        //          conf.setOutputKeyClass(Text.class);
        //          conf.setOutputValueClass(Text.class);
        conf.setMapperClass(Map.class);
        conf.setCombinerClass(Reduce.class);
        conf.setReducerClass(Reduce.class);

        //         conf.setInputFormat(TextInputFormat.class);
        //         conf.setOutputFormat(TextOutputFormat.class);

        FileInputFormat.setInputPaths(conf, new Path(dir1));
        FileOutputFormat.setOutputPath(conf, new Path(dir2));

        Job job1 = new Job(conf, "nopt_splitter");
        job1.setNumReduceTasks(Integer.parseInt(args[0])); // The most important point in this job
        job1.waitForCompletion(true);

    } catch (IOException e) {
        e.printStackTrace();
    } catch (InterruptedException e) {
        e.printStackTrace();
    } catch (ClassNotFoundException e) {
        e.printStackTrace();
    }
}

From source file:org.apache.avro.mapred.AvroJob.java

License:Apache License

/** Configure a job's combiner implementation. */
public static void setCombinerClass(JobConf job, Class<? extends AvroReducer> c) {
    job.set(COMBINER, c.getName());
    job.setCombinerClass(HadoopCombiner.class);
}
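As a usage note, the following is a hedged sketch of how a caller might invoke this helper. The SumReducer class is an illustrative assumption modeled on Avro's word-count reducers, not code taken from the projects on this page.

import java.io.IOException;

import org.apache.avro.mapred.AvroCollector;
import org.apache.avro.mapred.AvroJob;
import org.apache.avro.mapred.AvroReducer;
import org.apache.avro.mapred.Pair;
import org.apache.avro.util.Utf8;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Reporter;

public class AvroCombinerSetup {

    // Sums the Long counts for each word; usable as a combiner because its
    // input and output pair types match the map output.
    public static class SumReducer extends AvroReducer<Utf8, Long, Pair<Utf8, Long>> {
        @Override
        public void reduce(Utf8 word, Iterable<Long> counts,
                AvroCollector<Pair<Utf8, Long>> collector, Reporter reporter) throws IOException {
            long sum = 0;
            for (long count : counts)
                sum += count;
            collector.collect(new Pair<Utf8, Long>(word, sum));
        }
    }

    public static void configure(JobConf job) {
        // Records SumReducer under AvroJob's combiner key and installs
        // HadoopCombiner as the JobConf-level combiner, as in the wrapper above.
        AvroJob.setCombinerClass(job, SumReducer.class);
    }
}

The design point is that the caller supplies an AvroReducer subclass rather than a Hadoop Reducer; the JobConf-level combiner is always HadoopCombiner, which bridges Avro records to Hadoop's combine phase and delegates to the configured class.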

From source file:org.apache.avro.mapred.TestWordCountGeneric.java

License:Apache License

@Test
@SuppressWarnings("deprecation")
public void testJob() throws Exception {
    String dir = System.getProperty("test.dir", ".") + "/mapred";
    Path outputPath = new Path(dir + "/out");
    JobConf job = new JobConf();
    try {
        WordCountUtil.writeLinesFile();

        job.setJobName("wordcount");

        AvroJob.setInputGeneric(job, Schema.create(Schema.Type.STRING));
        AvroJob.setOutputGeneric(job, WordCount.SCHEMA$);

        job.setMapperClass(MapImpl.class);
        job.setCombinerClass(ReduceImpl.class);
        job.setReducerClass(ReduceImpl.class);

        FileInputFormat.setInputPaths(job, new Path(dir + "/in"));
        FileOutputFormat.setOutputPath(job, outputPath);
        FileOutputFormat.setCompressOutput(job, true);

        JobClient.runJob(job);

        WordCountUtil.validateCountsFile();
    } finally {
        outputPath.getFileSystem(job).delete(outputPath);
    }
}