Example usage for org.apache.hadoop.mapreduce Job setCombinerClass

Introduction

On this page you can find example usages of org.apache.hadoop.mapreduce.Job.setCombinerClass().

Prototype

public void setCombinerClass(Class<? extends Reducer> cls) throws IllegalStateException 

Document

Set the combiner class for the job.
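
Before the per-project examples below, here is a minimal sketch of the call in context, using the TokenCounterMapper and IntSumReducer classes that ship with Hadoop's mapreduce lib packages. Reusing IntSumReducer as the combiner is safe here only because its sum is associative and commutative and its input and output types both match the map output types (Text, IntWritable); the framework may apply the combiner zero, one, or many times.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.map.TokenCounterMapper;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.reduce.IntSumReducer;

public class CombinerUsageSketch {
    public static void main(String[] args) throws Exception {
        Job job = new Job(new Configuration(), "token count with combiner");
        job.setJarByClass(CombinerUsageSketch.class);

        job.setMapperClass(TokenCounterMapper.class);
        // Run the reducer as a combiner on each map task's output before the shuffle.
        job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

Note that setCombinerClass throws IllegalStateException if the job has already been submitted, which is why all of the examples below call it during job setup, before waitForCompletion.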

Usage

From source file:org.apache.mahout.classifier.svm.mapreduce.MapReduceUtil.java

License:Apache License

/**
 * for hadoop .1.9.
 * 
 * Gets a job driver.
 * 
 * @param jobClass
 * @param jobName
 * @param numReducers
 * @param inputPath
 * @param outputPath
 * @param inputFormat
 * @param outputFormat
 * @param mapper
 * @param combiner
 * @param reducer
 * @param outputkey
 * @param outputValue
 * @return the configured {@link Job}
 * @throws java.io.IOException
 * @deprecated to be removed soon
 */
public static Job getJobDriver(Class<?> jobClass, String jobName, int numReducers, String inputPath,
        String outputPath, Class<? extends InputFormat> inputFormat, Class<? extends OutputFormat> outputFormat,
        Class<? extends Mapper> mapper, Class<? extends Reducer> combiner, Class<? extends Reducer> reducer,
        Class<? extends WritableComparable> outputkey, Class<? extends Writable> outputValue)
        throws IOException {

    Job job = new Job(new Configuration());

    // job
    job.setJarByClass(jobClass);
    job.setJobName(jobName);

    // number reducers
    if (numReducers > -1) {
        job.setNumReduceTasks(numReducers);
    }

    // file path
    if (null != inputPath) {
        FileInputFormat.addInputPaths(job, inputPath);
    }
    if (null != outputPath) {
        FileOutputFormat.setOutputPath(job, new Path(outputPath));
    }

    // file format
    if (null != inputFormat) {
        job.setInputFormatClass(inputFormat);
    }
    if (null != outputFormat) {
        job.setOutputFormatClass(outputFormat);
    }

    // mapper, combiner, reducer, partitioner
    if (null != mapper) {
        job.setMapperClass(mapper);
    }
    if (null != combiner) {
        job.setCombinerClass(combiner);
    }
    if (null != reducer) {
        job.setReducerClass(reducer);
    }

    // map, output key and value class
    if (null != outputkey) {
        job.setMapOutputKeyClass(outputkey);
    }
    if (null != outputValue) {
        job.setMapOutputValueClass(outputValue);
    }

    // output key and value class
    return job;
}

From source file:org.apache.mahout.classifier.svm.mapreduce.MapReduceUtil.java

License:Apache License

/**
 * Sets the static parameters (input, output, map, combine, reduce) related to
 * a job.
 * 
 * @param job
 * @param jobClass
 * @param jobName
 * @param inputFormat
 * @param outputFormat
 * @param mapper
 * @param combiner
 * @param reducer
 * @param mapOutputKey
 * @param mapOutputValue
 * @throws java.io.IOException
 */
public static void setJobStaticParameters(Job job, Class<?> jobClass, String jobName,
        Class<? extends InputFormat> inputFormat, Class<? extends OutputFormat> outputFormat,
        Class<? extends Mapper> mapper, Class<? extends Reducer> combiner, Class<? extends Reducer> reducer,
        Class<? extends WritableComparable> mapOutputKey, Class<? extends Writable> mapOutputValue)
        throws IOException {

    // job (name and class)
    job.setJobName(jobName);
    job.setJarByClass(jobClass);

    // format (input and output)
    if (null != inputFormat) {
        job.setInputFormatClass(inputFormat);
    }
    if (null != outputFormat) {
        job.setOutputFormatClass(outputFormat);
    }

    // mapper
    if (null != mapper) {
        job.setMapperClass(mapper);
    }
    if (null != mapOutputKey) {
        job.setMapOutputKeyClass(mapOutputKey);
    }
    if (null != mapOutputValue) {
        job.setMapOutputValueClass(mapOutputValue);
    }

    // combiner
    if (null != combiner) {
        job.setCombinerClass(combiner);
    }

    // reducer
    if (null != reducer) {
        job.setReducerClass(reducer);
    }
    // job.setOutputKeyClass(jobClass);
    // partitioner
}

From source file:org.apache.mahout.clustering.lda.cvb.CVB0Driver.java

License:Apache License

private double calculatePerplexity(Configuration conf, Path corpusPath, Path modelPath, int iteration)
        throws IOException, ClassNotFoundException, InterruptedException {
    String jobName = "Calculating perplexity for " + modelPath;
    log.info("About to run: {}", jobName);

    Path outputPath = perplexityPath(modelPath.getParent(), iteration);
    Job job = prepareJob(corpusPath, outputPath, CachingCVB0PerplexityMapper.class, DoubleWritable.class,
            DoubleWritable.class, DualDoubleSumReducer.class, DoubleWritable.class, DoubleWritable.class);

    job.setJobName(jobName);
    job.setCombinerClass(DualDoubleSumReducer.class);
    job.setNumReduceTasks(1);
    setModelPaths(job, modelPath);
    HadoopUtil.delete(conf, outputPath);
    if (!job.waitForCompletion(true)) {
        throw new InterruptedException("Failed to calculate perplexity for: " + modelPath);
    }
    return readPerplexity(conf, modelPath.getParent(), iteration);
}

From source file:org.apache.mahout.clustering.lda.cvb.CVB0Driver.java

License:Apache License

public void runIteration(Configuration conf, Path corpusInput, Path modelInput, Path modelOutput,
        int iterationNumber, int maxIterations, int numReduceTasks)
        throws IOException, ClassNotFoundException, InterruptedException {
    String jobName = String.format("Iteration %d of %d, input path: %s", iterationNumber, maxIterations,
            modelInput);
    log.info("About to run: {}", jobName);
    Job job = prepareJob(corpusInput, modelOutput, CachingCVB0Mapper.class, IntWritable.class,
            VectorWritable.class, VectorSumReducer.class, IntWritable.class, VectorWritable.class);
    job.setCombinerClass(VectorSumReducer.class);
    job.setNumReduceTasks(numReduceTasks);
    job.setJobName(jobName);
    setModelPaths(job, modelInput);
    HadoopUtil.delete(conf, modelOutput);
    if (!job.waitForCompletion(true)) {
        throw new InterruptedException(
                String.format("Failed to complete iteration %d stage 1", iterationNumber));
    }
}

From source file:org.apache.mahout.clustering.lda.LDADriver.java

License:Apache License

/**
 * Run the job using the supplied arguments.
 * 
 * @param input
 *          the directory pathname for input points
 * @param stateIn
 *          the directory pathname for input state
 * @param stateOut
 *          the directory pathname for output state
 * @param numTopics
 *          the number of clusters
 * @param numReducers
 *          the number of Reducers desired
 */
private double runIteration(Path input, Path stateIn, Path stateOut, int numTopics, int numWords,
        double topicSmoothing, int numReducers)
        throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration();
    conf.set(STATE_IN_KEY, stateIn.toString());
    conf.set(NUM_TOPICS_KEY, Integer.toString(numTopics));
    conf.set(NUM_WORDS_KEY, Integer.toString(numWords));
    conf.set(TOPIC_SMOOTHING_KEY, Double.toString(topicSmoothing));

    Job job = new Job(conf);

    job.setOutputKeyClass(IntPairWritable.class);
    job.setOutputValueClass(DoubleWritable.class);
    FileInputFormat.addInputPaths(job, input.toString());
    FileOutputFormat.setOutputPath(job, stateOut);

    job.setMapperClass(LDAMapper.class);
    job.setReducerClass(LDAReducer.class);
    job.setCombinerClass(LDAReducer.class);
    job.setNumReduceTasks(numReducers);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setJarByClass(LDADriver.class);

    job.waitForCompletion(true);
    return findLL(stateOut, conf);
}

From source file:org.apache.mahout.clustering.spectral.eigencuts.EigencutsAffinityCutsJob.java

License:Apache License

/**
 * Runs a single iteration of defining cluster boundaries, based on
 * previous calculations and the formation of the "cut matrix".
 * 
 * @param currentAffinity Path to the current affinity matrix.
 * @param cutMatrix Path to the sensitivity matrix.
 * @param nextAffinity Output path for the new affinity matrix.
 */
public static long runjob(Path currentAffinity, Path cutMatrix, Path nextAffinity, Configuration conf)
        throws IOException, ClassNotFoundException, InterruptedException {

    // these options allow us to differentiate between the two vectors
    // in the mapper and reducer - we'll know from the working path
    // which SequenceFile we're accessing
    conf.set(EigencutsKeys.AFFINITY_PATH, currentAffinity.getName());
    conf.set(EigencutsKeys.CUTMATRIX_PATH, cutMatrix.getName());

    Job job = new Job(conf, "EigencutsAffinityCutsJob");
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(VertexWritable.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);
    job.setMapperClass(EigencutsAffinityCutsMapper.class);
    job.setCombinerClass(EigencutsAffinityCutsCombiner.class);
    job.setReducerClass(EigencutsAffinityCutsReducer.class);

    //FileInputFormat.addInputPath(job, currentAffinity);
    FileInputFormat.addInputPath(job, cutMatrix);
    FileOutputFormat.setOutputPath(job, nextAffinity);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }

    return job.getCounters().findCounter(CUTSCOUNTER.NUM_CUTS).getValue();
}

From source file:org.apache.mahout.feature.mrmr.MRMRDriver.java

License:Apache License

@Override
public int run(String[] args) throws Exception {

    addInputOption();
    addOutputOption();
    addOption(DefaultOptionCreator.targetColumnOption().create());
    addOption(DefaultOptionCreator.rowNumberOption().create());
    addOption(DefaultOptionCreator.columnNumberOption().create());
    addOption(DefaultOptionCreator.featureNumberOption().create());

    Map<String, List<String>> parsedArgs = parseArguments(args);

    Path input = getInputPath();
    Path output = getOutputPath();
    Path temp = getTempPath();

    int targetColumn = Integer.parseInt(getOption(DefaultOptionCreator.TARGET_COLUMN));
    int rowNumber = Integer.parseInt(getOption(DefaultOptionCreator.ROW_NUMBER));
    int columnNumber = Integer.parseInt(getOption(DefaultOptionCreator.COLUMN_NUMBER));
    int featureNumber = Integer.parseInt(getOption(DefaultOptionCreator.FEATURE_NUMBER));

    log.info("Feature selection algorithm: MRMR");

    Path tempMax = null;
    for (int i = 0; i < featureNumber; i++) {

        log.info("Generating candidates at stage " + i + "th");

        Path tempFeature = new Path(featureJobTempUri(temp, i));

        Configuration confFeature = getConf();
        confFeature.set(DefaultOptionCreator.TARGET_INDEX, "" + (targetColumn - 1));
        confFeature.set(DefaultOptionCreator.COLUMN_NUMBER, "" + columnNumber);
        confFeature.set(DefaultOptionCreator.ROW_NUMBER, "" + rowNumber);

        Job jobFeature = HadoopUtil.prepareJob(input, tempFeature, TextInputFormat.class, MRMRMapper.class,
                IntWritable.class, Text.class, MRMRReducer.class, LongWritable.class, Text.class,
                SequenceFileOutputFormat.class, confFeature);
        jobFeature.setJobName("Feature Candidate stage " + i + "th");

        if (i > 0) {
            DistributedCache.addCacheFile(new Path(cacheFileUri(temp, i)).toUri(),
                    jobFeature.getConfiguration());
        }

        boolean succeededFeature = jobFeature.waitForCompletion(true);
        if (!succeededFeature)
            return -1;

        // Selecting best candidate job
        log.info("Selecting the best candidate at stage " + i + "th");

        Configuration confMax = getConf();
        confFeature.set(DefaultOptionCreator.COLUMN_NUMBER, "" + columnNumber);

        tempMax = new Path(maxJobTempUri(temp, i));
        if (i == featureNumber - 1)
            tempMax = outputPath;

        Job jobMax = HadoopUtil.prepareJob(tempFeature, tempMax, SequenceFileInputFormat.class, Mapper.class,
                LongWritable.class, Text.class, MaxReducer.class, LongWritable.class, Text.class,
                TextOutputFormat.class, confMax);
        jobMax.setJobName("Best candidate at stage " + i + "th");
        jobMax.setCombinerClass(MaxCombiner.class);

        if (i > 0) {
            DistributedCache.addCacheFile(new Path(cacheFileUri(temp, i)).toUri(), jobMax.getConfiguration());
        }

        boolean succeededMax = jobMax.waitForCompletion(true);
        if (!succeededMax)
            return -1;

        try {
            FileSystem hdfs = FileSystem.get(confMax);
            hdfs.delete(tempFeature, true);
        } catch (IOException e) {
            // best-effort cleanup of the temporary path; ignore deletion failures
        }

    }
    return 0;
}

From source file:org.apache.mahout.fpm.pfpgrowth.dataset.KeyBasedStringTupleGrouper.java

License:Apache License

public static void startJob(Parameters params)
        throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration();

    conf.set("job.parameters", params.toString());
    conf.set("mapred.compress.map.output", "true");
    conf.set("mapred.output.compression.type", "BLOCK");
    conf.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");

    String input = params.get("input");
    Job job = new Job(conf, "Generating dataset based from input" + input);
    job.setJarByClass(KeyBasedStringTupleGrouper.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(StringTuple.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    FileInputFormat.addInputPath(job, new Path(input));
    Path outPath = new Path(params.get("output"));
    FileOutputFormat.setOutputPath(job, outPath);

    HadoopUtil.delete(conf, outPath);

    job.setInputFormatClass(TextInputFormat.class);
    job.setMapperClass(KeyBasedStringTupleMapper.class);
    job.setCombinerClass(KeyBasedStringTupleCombiner.class);
    job.setReducerClass(KeyBasedStringTupleReducer.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }
}

From source file:org.apache.mahout.fpm.pfpgrowth.PFPGrowth.java

License:Apache License

/**
 * Run the aggregation job to aggregate the different top-K patterns, group each pattern by the
 * features it contains, and thus compute the final top-K frequent patterns for each feature.
 * 
 * @param params
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public static void startAggregating(Parameters params)
        throws IOException, InterruptedException, ClassNotFoundException {

    Configuration conf = new Configuration();
    params.set("fList", "");
    params.set("gList", "");
    conf.set("pfp.parameters", params.toString());
    conf.set("mapred.compress.map.output", "true");
    conf.set("mapred.output.compression.type", "BLOCK");

    String input = params.get("output") + "/fpgrowth";
    Job job = new Job(conf, "PFP Aggregator Driver running over input: " + input);
    job.setJarByClass(PFPGrowth.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(TopKStringPatterns.class);

    FileInputFormat.addInputPath(job, new Path(input));
    Path outPath = new Path(params.get("output"), "frequentPatterns");
    FileOutputFormat.setOutputPath(job, outPath);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapperClass(AggregatorMapper.class);
    job.setCombinerClass(AggregatorReducer.class);
    job.setReducerClass(AggregatorReducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    HadoopUtil.overwriteOutput(outPath);
    job.waitForCompletion(true);
}

From source file:org.apache.mahout.fpm.pfpgrowth.PFPGrowth.java

License:Apache License

/**
 * Count the frequencies of various features in parallel using Map/Reduce
 * 
 * @param params
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public static void startParallelCounting(Parameters params)
        throws IOException, InterruptedException, ClassNotFoundException {

    Configuration conf = new Configuration();
    conf.set("pfp.parameters", params.toString());

    conf.set("mapred.compress.map.output", "true");
    conf.set("mapred.output.compression.type", "BLOCK");

    String input = params.get("input");
    Job job = new Job(conf, "Parallel Counting Driver running over input: " + input);
    job.setJarByClass(PFPGrowth.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    FileInputFormat.addInputPath(job, new Path(input));
    Path outPath = new Path(params.get("output"), "parallelcounting");
    FileOutputFormat.setOutputPath(job, outPath);

    HadoopUtil.overwriteOutput(outPath);

    job.setInputFormatClass(TextInputFormat.class);
    job.setMapperClass(ParallelCountingMapper.class);
    job.setCombinerClass(ParallelCountingReducer.class);
    job.setReducerClass(ParallelCountingReducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.waitForCompletion(true);

}