Example usage for org.apache.hadoop.mapreduce Job setCombinerClass

Introduction

On this page you can find example usages of org.apache.hadoop.mapreduce.Job.setCombinerClass().

Prototype

public void setCombinerClass(Class<? extends Reducer> cls) throws IllegalStateException 

Document

Set the combiner class for the job.
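
Before the per-project examples below, here is a minimal sketch of the call in context, using the TokenCounterMapper and IntSumReducer classes that ship with Hadoop's mapreduce lib packages. Reusing IntSumReducer as the combiner is safe here only because its sum is associative and commutative and its input and output types both match the map output types (Text, IntWritable); the framework may apply the combiner zero, one, or many times.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.map.TokenCounterMapper;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.reduce.IntSumReducer;

public class CombinerUsageSketch {
    public static void main(String[] args) throws Exception {
        Job job = new Job(new Configuration(), "token count with combiner");
        job.setJarByClass(CombinerUsageSketch.class);

        job.setMapperClass(TokenCounterMapper.class);
        // Run the reducer as a combiner on each map task's output before the shuffle.
        job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

Note that setCombinerClass throws IllegalStateException if the job has already been submitted, which is why all of the examples below call it during job setup, before waitForCompletion.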

Usage

From source file:org.apache.mahout.classifier.svm.mapreduce.MapReduceUtil.java

License:Apache License

/**
 * for hadoop .1.9.
 * 
 * Gets a job driver.
 * 
 * @param jobClass
 * @param jobName
 * @param numReducers
 * @param inputPath
 * @param outputPath
 * @param inputFormat
 * @param outputFormat
 * @param mapper
 * @param combiner
 * @param reducer
 * @param outputkey
 * @param outputValue
 * @return the configured {@link Job}
 * @throws java.io.IOException
 * @deprecated to be removed soon
 */
public static Job getJobDriver(Class<?> jobClass, String jobName, int numReducers, String inputPath,
        String outputPath, Class<? extends InputFormat> inputFormat, Class<? extends OutputFormat> outputFormat,
        Class<? extends Mapper> mapper, Class<? extends Reducer> combiner, Class<? extends Reducer> reducer,
        Class<? extends WritableComparable> outputkey, Class<? extends Writable> outputValue)
        throws IOException {

    Job job = new Job(new Configuration());

    // job
    job.setJarByClass(jobClass);
    job.setJobName(jobName);

    // number reducers
    if (numReducers > -1) {
        job.setNumReduceTasks(numReducers);
    }

    // file path
    if (null != inputPath) {
        FileInputFormat.addInputPaths(job, inputPath);
    }
    if (null != outputPath) {
        FileOutputFormat.setOutputPath(job, new Path(outputPath));
    }

    // file format
    if (null != inputFormat) {
        job.setInputFormatClass(inputFormat);
    }
    if (null != outputFormat) {
        job.setOutputFormatClass(outputFormat);
    }

    // mapper, combiner, reducer, partitioner
    if (null != mapper) {
        job.setMapperClass(mapper);
    }
    if (null != combiner) {
        job.setCombinerClass(combiner);
    }
    if (null != reducer) {
        job.setReducerClass(reducer);
    }

    // map, output key and value class
    if (null != outputkey) {
        job.setMapOutputKeyClass(outputkey);
    }
    if (null != outputValue) {
        job.setMapOutputValueClass(outputValue);
    }

    // output key and value class
    return job;
}

From source file:org.apache.mahout.classifier.svm.mapreduce.MapReduceUtil.java

License:Apache License

/**
 * Sets the static parameters (input, output, map, combine, reduce) related to
 * a job.
 * 
 * @param job
 * @param jobClass
 * @param jobName
 * @param inputFormat
 * @param outputFormat
 * @param mapper
 * @param combiner
 * @param reducer
 * @param mapOutputKey
 * @param mapOutputValue
 * @throws java.io.IOException
 */
public static void setJobStaticParameters(Job job, Class<?> jobClass, String jobName,
        Class<? extends InputFormat> inputFormat, Class<? extends OutputFormat> outputFormat,
        Class<? extends Mapper> mapper, Class<? extends Reducer> combiner, Class<? extends Reducer> reducer,
        Class<? extends WritableComparable> mapOutputKey, Class<? extends Writable> mapOutputValue)
        throws IOException {

    // job (name and class)
    job.setJobName(jobName);
    job.setJarByClass(jobClass);

    // format (input and output)
    if (null != inputFormat) {
        job.setInputFormatClass(inputFormat);
    }
    if (null != outputFormat) {
        job.setOutputFormatClass(outputFormat);
    }

    // mapper
    if (null != mapper) {
        job.setMapperClass(mapper);
    }
    if (null != mapOutputKey) {
        job.setMapOutputKeyClass(mapOutputKey);
    }
    if (null != mapOutputValue) {
        job.setMapOutputValueClass(mapOutputValue);
    }

    // combiner
    if (null != combiner) {
        job.setCombinerClass(combiner);
    }

    // reducer
    if (null != reducer) {
        job.setReducerClass(reducer);
    }
    // job.setOutputKeyClass(jobClass);
    // partitioner
}

From source file:org.apache.mahout.clustering.lda.cvb.CVB0Driver.java

License:Apache License

private double calculatePerplexity(Configuration conf, Path corpusPath, Path modelPath, int iteration)
        throws IOException, ClassNotFoundException, InterruptedException {
    String jobName = "Calculating perplexity for " + modelPath;
    log.info("About to run: {}", jobName);

    Path outputPath = perplexityPath(modelPath.getParent(), iteration);
    Job job = prepareJob(corpusPath, outputPath, CachingCVB0PerplexityMapper.class, DoubleWritable.class,
            DoubleWritable.class, DualDoubleSumReducer.class, DoubleWritable.class, DoubleWritable.class);

    job.setJobName(jobName);
    job.setCombinerClass(DualDoubleSumReducer.class);
    job.setNumReduceTasks(1);
    setModelPaths(job, modelPath);
    HadoopUtil.delete(conf, outputPath);
    if (!job.waitForCompletion(true)) {
        throw new InterruptedException("Failed to calculate perplexity for: " + modelPath);
    }
    return readPerplexity(conf, modelPath.getParent(), iteration);
}

From source file:org.apache.mahout.clustering.lda.cvb.CVB0Driver.java

License:Apache License

public void runIteration(Configuration conf, Path corpusInput, Path modelInput, Path modelOutput,
        int iterationNumber, int maxIterations, int numReduceTasks)
        throws IOException, ClassNotFoundException, InterruptedException {
    String jobName = String.format("Iteration %d of %d, input path: %s", iterationNumber, maxIterations,
            modelInput);
    log.info("About to run: {}", jobName);
    Job job = prepareJob(corpusInput, modelOutput, CachingCVB0Mapper.class, IntWritable.class,
            VectorWritable.class, VectorSumReducer.class, IntWritable.class, VectorWritable.class);
    job.setCombinerClass(VectorSumReducer.class);
    job.setNumReduceTasks(numReduceTasks);
    job.setJobName(jobName);
    setModelPaths(job, modelInput);
    HadoopUtil.delete(conf, modelOutput);
    if (!job.waitForCompletion(true)) {
        throw new InterruptedException(
                String.format("Failed to complete iteration %d stage 1", iterationNumber));
    }
}

From source file:org.apache.mahout.clustering.lda.LDADriver.java

License:Apache License

/**
 * Run the job using the supplied arguments.
 * 
 * @param input
 *          the directory pathname for input points
 * @param stateIn
 *          the directory pathname for input state
 * @param stateOut
 *          the directory pathname for output state
 * @param numTopics
 *          the number of clusters
 * @param numReducers
 *          the number of Reducers desired
 */
private double runIteration(Path input, Path stateIn, Path stateOut, int numTopics, int numWords,
        double topicSmoothing, int numReducers)
        throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration();
    conf.set(STATE_IN_KEY, stateIn.toString());
    conf.set(NUM_TOPICS_KEY, Integer.toString(numTopics));
    conf.set(NUM_WORDS_KEY, Integer.toString(numWords));
    conf.set(TOPIC_SMOOTHING_KEY, Double.toString(topicSmoothing));

    Job job = new Job(conf);

    job.setOutputKeyClass(IntPairWritable.class);
    job.setOutputValueClass(DoubleWritable.class);
    FileInputFormat.addInputPaths(job, input.toString());
    FileOutputFormat.setOutputPath(job, stateOut);

    job.setMapperClass(LDAMapper.class);
    job.setReducerClass(LDAReducer.class);
    job.setCombinerClass(LDAReducer.class);
    job.setNumReduceTasks(numReducers);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setJarByClass(LDADriver.class);

    job.waitForCompletion(true);
    return findLL(stateOut, conf);
}

From source file:org.apache.mahout.clustering.spectral.eigencuts.EigencutsAffinityCutsJob.java

License:Apache License

/**
 * Runs a single iteration of defining cluster boundaries, based on
 * previous calculations and the formation of the "cut matrix".
 * 
 * @param currentAffinity Path to the current affinity matrix.
 * @param cutMatrix Path to the sensitivity matrix.
 * @param nextAffinity Output path for the new affinity matrix.
 */
public static long runjob(Path currentAffinity, Path cutMatrix, Path nextAffinity, Configuration conf)
        throws IOException, ClassNotFoundException, InterruptedException {

    // these options allow us to differentiate between the two vectors
    // in the mapper and reducer - we'll know from the working path
    // which SequenceFile we're accessing
    conf.set(EigencutsKeys.AFFINITY_PATH, currentAffinity.getName());
    conf.set(EigencutsKeys.CUTMATRIX_PATH, cutMatrix.getName());

    Job job = new Job(conf, "EigencutsAffinityCutsJob");
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(VertexWritable.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);
    job.setMapperClass(EigencutsAffinityCutsMapper.class);
    job.setCombinerClass(EigencutsAffinityCutsCombiner.class);
    job.setReducerClass(EigencutsAffinityCutsReducer.class);

    //FileInputFormat.addInputPath(job, currentAffinity);
    FileInputFormat.addInputPath(job, cutMatrix);
    FileOutputFormat.setOutputPath(job, nextAffinity);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }

    return job.getCounters().findCounter(CUTSCOUNTER.NUM_CUTS).getValue();
}

From source file:org.apache.mahout.feature.mrmr.MRMRDriver.java

License:Apache License

@Override
public int run(String[] args) throws Exception {

    addInputOption();
    addOutputOption();
    addOption(DefaultOptionCreator.targetColumnOption().create());
    addOption(DefaultOptionCreator.rowNumberOption().create());
    addOption(DefaultOptionCreator.columnNumberOption().create());
    addOption(DefaultOptionCreator.featureNumberOption().create());

    Map<String, List<String>> parsedArgs = parseArguments(args);

    Path input = getInputPath();
    Path output = getOutputPath();
    Path temp = getTempPath();

    int targetColumn = Integer.parseInt(getOption(DefaultOptionCreator.TARGET_COLUMN));
    int rowNumber = Integer.parseInt(getOption(DefaultOptionCreator.ROW_NUMBER));
    int columnNumber = Integer.parseInt(getOption(DefaultOptionCreator.COLUMN_NUMBER));
    int featureNumber = Integer.parseInt(getOption(DefaultOptionCreator.FEATURE_NUMBER));

    log.info("Feature selection algorithm: MRMR");

    Path tempMax = null;
    for (int i = 0; i < featureNumber; i++) {

        log.info("Generating candidates at stage " + i + "th");

        Path tempFeature = new Path(featureJobTempUri(temp, i));

        Configuration confFeature = getConf();
        confFeature.set(DefaultOptionCreator.TARGET_INDEX, "" + (targetColumn - 1));
        confFeature.set(DefaultOptionCreator.COLUMN_NUMBER, "" + columnNumber);
        confFeature.set(DefaultOptionCreator.ROW_NUMBER, "" + rowNumber);

        Job jobFeature = HadoopUtil.prepareJob(input, tempFeature, TextInputFormat.class, MRMRMapper.class,
                IntWritable.class, Text.class, MRMRReducer.class, LongWritable.class, Text.class,
                SequenceFileOutputFormat.class, confFeature);
        jobFeature.setJobName("Feature Candidate stage " + i + "th");

        if (i > 0) {
            DistributedCache.addCacheFile(new Path(cacheFileUri(temp, i)).toUri(),
                    jobFeature.getConfiguration());
        }

        boolean succeededFeature = jobFeature.waitForCompletion(true);
        if (!succeededFeature)
            return -1;

        // Selecting best candidate job
        log.info("Selecting the best candidate at stage " + i + "th");

        Configuration confMax = getConf();
        confFeature.set(DefaultOptionCreator.COLUMN_NUMBER, "" + columnNumber);

        tempMax = new Path(maxJobTempUri(temp, i));
        if (i == featureNumber - 1)
            tempMax = outputPath;

        Job jobMax = HadoopUtil.prepareJob(tempFeature, tempMax, SequenceFileInputFormat.class, Mapper.class,
                LongWritable.class, Text.class, MaxReducer.class, LongWritable.class, Text.class,
                TextOutputFormat.class, confMax);
        jobMax.setJobName("Best candidate at stage " + i + "th");
        jobMax.setCombinerClass(MaxCombiner.class);

        if (i > 0) {
            DistributedCache.addCacheFile(new Path(cacheFileUri(temp, i)).toUri(), jobMax.getConfiguration());
        }

        boolean succeededMax = jobMax.waitForCompletion(true);
        if (!succeededMax)
            return -1;

        try {
            FileSystem hdfs = FileSystem.get(confMax);
            hdfs.delete(tempFeature, true);
        } catch (IOException e) {
            // best-effort cleanup of the temporary path; ignore deletion failures
        }

    }
    return 0;
}

From source file:org.apache.mahout.fpm.pfpgrowth.dataset.KeyBasedStringTupleGrouper.java

License:Apache License

public static void startJob(Parameters params)
        throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration();

    conf.set("job.parameters", params.toString());
    conf.set("mapred.compress.map.output", "true");
    conf.set("mapred.output.compression.type", "BLOCK");
    conf.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");

    String input = params.get("input");
    Job job = new Job(conf, "Generating dataset based from input" + input);
    job.setJarByClass(KeyBasedStringTupleGrouper.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(StringTuple.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    FileInputFormat.addInputPath(job, new Path(input));
    Path outPath = new Path(params.get("output"));
    FileOutputFormat.setOutputPath(job, outPath);

    HadoopUtil.delete(conf, outPath);

    job.setInputFormatClass(TextInputFormat.class);
    job.setMapperClass(KeyBasedStringTupleMapper.class);
    job.setCombinerClass(KeyBasedStringTupleCombiner.class);
    job.setReducerClass(KeyBasedStringTupleReducer.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }
}

From source file:org.apache.mahout.fpm.pfpgrowth.PFPGrowth.java

License:Apache License

/**
 * Run the aggregation job to aggregate the different top-K patterns, group each pattern by the
 * features it contains, and thus compute the final top-K frequent patterns for each feature.
 * 
 * @param params
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public static void startAggregating(Parameters params)
        throws IOException, InterruptedException, ClassNotFoundException {

    Configuration conf = new Configuration();
    params.set("fList", "");
    params.set("gList", "");
    conf.set("pfp.parameters", params.toString());
    conf.set("mapred.compress.map.output", "true");
    conf.set("mapred.output.compression.type", "BLOCK");

    String input = params.get("output") + "/fpgrowth";
    Job job = new Job(conf, "PFP Aggregator Driver running over input: " + input);
    job.setJarByClass(PFPGrowth.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(TopKStringPatterns.class);

    FileInputFormat.addInputPath(job, new Path(input));
    Path outPath = new Path(params.get("output"), "frequentPatterns");
    FileOutputFormat.setOutputPath(job, outPath);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapperClass(AggregatorMapper.class);
    job.setCombinerClass(AggregatorReducer.class);
    job.setReducerClass(AggregatorReducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    HadoopUtil.overwriteOutput(outPath);
    job.waitForCompletion(true);
}

From source file:org.apache.mahout.fpm.pfpgrowth.PFPGrowth.java

License:Apache License

/**
 * Count the frequencies of various features in parallel using Map/Reduce
 * 
 * @param params
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public static void startParallelCounting(Parameters params)
        throws IOException, InterruptedException, ClassNotFoundException {

    Configuration conf = new Configuration();
    conf.set("pfp.parameters", params.toString());

    conf.set("mapred.compress.map.output", "true");
    conf.set("mapred.output.compression.type", "BLOCK");

    String input = params.get("input");
    Job job = new Job(conf, "Parallel Counting Driver running over input: " + input);
    job.setJarByClass(PFPGrowth.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    FileInputFormat.addInputPath(job, new Path(input));
    Path outPath = new Path(params.get("output"), "parallelcounting");
    FileOutputFormat.setOutputPath(job, outPath);

    HadoopUtil.overwriteOutput(outPath);

    job.setInputFormatClass(TextInputFormat.class);
    job.setMapperClass(ParallelCountingMapper.class);
    job.setCombinerClass(ParallelCountingReducer.class);
    job.setReducerClass(ParallelCountingReducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.waitForCompletion(true);

}