Example usage for org.apache.hadoop.mapreduce Job setCombinerClass

Introduction

This page collects usage examples for org.apache.hadoop.mapreduce.Job.setCombinerClass.

Prototype

public void setCombinerClass(Class<? extends Reducer> cls) throws IllegalStateException 

Document

Set the combiner class for the job.
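
Before the per-project examples, here is a minimal, self-contained sketch of the typical call site, modeled on the stock Hadoop WordCount driver. TokenizerMapper and IntSumReducer are the names used in that example and stand in for your own classes:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCountDriver {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "word count");
        job.setJarByClass(WordCountDriver.class);
        job.setMapperClass(TokenizerMapper.class);
        // A combiner may run zero, one, or many times on map output, so it
        // must be commutative and associative, and its input and output
        // key/value types must both equal the map output types.
        job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

Per the prototype above, the call throws IllegalStateException if the job has already been submitted.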

Usage

From source file: com.phantom.hadoop.examples.MultiFileWordCount.java

License: Apache License

public int run(String[] args) throws Exception {

    if (args.length < 2) {
        printUsage();
        return 2;
    }

    Job job = new Job(getConf());
    job.setJobName("MultiFileWordCount");
    job.setJarByClass(MultiFileWordCount.class);

    // set the InputFormat of the job to our InputFormat
    job.setInputFormatClass(MyInputFormat.class);

    // the keys are words (strings)
    job.setOutputKeyClass(Text.class);
    // the values are counts (ints)
    job.setOutputValueClass(IntWritable.class);

    // use the defined mapper
    job.setMapperClass(MapClass.class);
    // use IntSumReducer as both the combiner and the reducer
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);

    FileInputFormat.addInputPaths(job, args[0]);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    return job.waitForCompletion(true) ? 0 : 1;
}
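
This driver reuses IntSumReducer as both the combiner and the reducer. That is safe because summation is commutative and associative, and the class consumes and produces the same (Text, IntWritable) pairs. For reference, the stock Hadoop IntSumReducer looks roughly like this:

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

// Approximately the IntSumReducer shipped with the Hadoop WordCount example;
// its input and output types coincide, which is what makes it reusable as a combiner.
public class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    private final IntWritable result = new IntWritable();

    @Override
    public void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable val : values) {
            sum += val.get();
        }
        result.set(sum);
        context.write(key, result);
    }
}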

From source file: com.phantom.hadoop.examples.WordMean.java

License: Apache License

@Override
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.println("Usage: wordmean <in> <out>");
        return 0;
    }

    Configuration conf = getConf();

    @SuppressWarnings("deprecation")
    Job job = new Job(conf, "word mean");
    job.setJarByClass(WordMean.class);
    job.setMapperClass(WordMeanMapper.class);
    job.setCombinerClass(WordMeanReducer.class);
    job.setReducerClass(WordMeanReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    Path outputpath = new Path(args[1]);
    FileOutputFormat.setOutputPath(job, outputpath);
    boolean result = job.waitForCompletion(true);
    mean = readAndCalcMean(outputpath, conf);

    return (result ? 0 : 1);
}

From source file: com.phantom.hadoop.examples.WordMedian.java

License: Apache License

@Override
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.println("Usage: wordmedian <in> <out>");
        return 0;
    }

    setConf(new Configuration());
    Configuration conf = getConf();

    @SuppressWarnings("deprecation")
    Job job = new Job(conf, "word median");
    job.setJarByClass(WordMedian.class);
    job.setMapperClass(WordMedianMapper.class);
    job.setCombinerClass(WordMedianReducer.class);
    job.setReducerClass(WordMedianReducer.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    boolean result = job.waitForCompletion(true);

    // Job 1 has completed -- find the middle position(s) needed for the median

    long totalWords = job.getCounters().getGroup(TaskCounter.class.getCanonicalName())
            .findCounter("MAP_OUTPUT_RECORDS", "Map output records").getValue();
    int medianIndex1 = (int) Math.ceil((totalWords / 2.0));
    int medianIndex2 = (int) Math.floor((totalWords / 2.0));

    median = readAndFindMedian(args[1], medianIndex1, medianIndex2, conf);

    return (result ? 0 : 1);
}
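
Two details here are easy to miss. First, MAP_OUTPUT_RECORDS counts records as the mapper emits them, before the combiner runs, so setting a combiner does not distort the word total. Second, the ceil/floor pair picks out the middle position(s) of the sorted values; a quick worked example with made-up totals:

public class MedianIndexDemo {
    public static void main(String[] args) {
        // Odd count: ceil gives the middle position, floor the one before it.
        long totalWords = 7;
        System.out.println((int) Math.ceil(totalWords / 2.0));  // 4
        System.out.println((int) Math.floor(totalWords / 2.0)); // 3
        // Even count: both indices land on position n/2.
        totalWords = 8;
        System.out.println((int) Math.ceil(totalWords / 2.0));  // 4
        System.out.println((int) Math.floor(totalWords / 2.0)); // 4
    }
}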

From source file: com.phantom.hadoop.examples.WordStandardDeviation.java

License: Apache License

@Override
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.println("Usage: wordstddev <in> <out>");
        return 0;
    }

    Configuration conf = getConf();

    @SuppressWarnings("deprecation")
    Job job = new Job(conf, "word stddev");
    job.setJarByClass(WordStandardDeviation.class);
    job.setMapperClass(WordStandardDeviationMapper.class);
    job.setCombinerClass(WordStandardDeviationReducer.class);
    job.setReducerClass(WordStandardDeviationReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    Path outputpath = new Path(args[1]);
    FileOutputFormat.setOutputPath(job, outputpath);
    boolean result = job.waitForCompletion(true);

    // read output and calculate standard deviation
    stddev = readAndCalcStdDev(outputpath, conf);

    return (result ? 0 : 1);
}

From source file: com.philiphubbard.digraph.MRBuildVertices.java

License: Open Source License

public static void setupJob(Job job, Path inputPath, Path outputPath) throws IOException {
    job.setJarByClass(MRBuildVertices.class);
    job.setMapperClass(MRBuildVertices.Mapper.class);
    job.setCombinerClass(MRBuildVertices.Reducer.class);
    job.setReducerClass(MRBuildVertices.Reducer.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(BytesWritable.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(BytesWritable.class);

    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    FileInputFormat.addInputPath(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);
}

From source file: com.philiphubbard.digraph.MRCompressChains.java

License: Open Source License

public static void setupIterationJob(Job job, Path inputPathOrig, Path outputPathOrig) throws IOException {
    job.setJarByClass(MRCompressChains.class);
    job.setMapperClass(MRCompressChains.Mapper.class);
    job.setCombinerClass(MRCompressChains.Reducer.class);
    job.setReducerClass(MRCompressChains.Reducer.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(BytesWritable.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(BytesWritable.class);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    Path inputPath;
    if (iter == 0)
        inputPath = inputPathOrig;
    else
        inputPath = new Path(outputPathOrig.toString() + (iter - 1));
    Path outputPath = new Path(outputPathOrig.toString() + iter);

    FileInputFormat.setInputPaths(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);
}
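
setupIterationJob wires up one pass of an iterative computation: it reads a static iter counter and chains pass i's input to pass i-1's output by suffixing the base output path. A hypothetical driver loop might look like the sketch below; the loop bound, path arguments, and counter handling are stand-ins, since the project's real convergence logic is not shown in this excerpt:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;

public class CompressChainsDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path input = new Path(args[0]);      // pass 0 reads this directly
        Path outputBase = new Path(args[1]); // pass i writes outputBase + i
        int maxPasses = 10; // stand-in bound; a real driver would test convergence
        for (int i = 0; i < maxPasses; i++) {
            Job job = Job.getInstance(conf, "compress chains " + i);
            // Assumes MRCompressChains advances its static iter counter once
            // per pass; how it does so is not shown in the excerpt above.
            MRCompressChains.setupIterationJob(job, input, outputBase);
            if (!job.waitForCompletion(true)) {
                throw new IOException("compress chains pass " + i + " failed");
            }
        }
    }
}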

From source file: com.philiphubbard.sabe.MRCompressMerChains.java

License: Open Source License

public static void setupIterationJob(Job job, Path inputPathOrig, Path outputPathOrig) throws IOException {
    MRCompressChains.setupIterationJob(job, inputPathOrig, outputPathOrig);

    job.setMapperClass(MRCompressMerChains.Mapper.class);
    job.setCombinerClass(MRCompressMerChains.Reducer.class);
    job.setReducerClass(MRCompressMerChains.Reducer.class);
}

From source file: com.pocketx.gravity.recommender.cf.similarity.job.PreparePreferenceMatrixJob.java

License: Apache License

@Override
public int run(String[] args) throws Exception {

    addInputOption();
    addOutputOption();
    addOption("maxPrefsPerUser", "mppu", "max number of preferences to consider per user, "
            + "users with more preferences will be sampled down");
    addOption("minPrefsPerUser", "mp",
            "ignore users with less preferences than this " + "(default: " + DEFAULT_MIN_PREFS_PER_USER + ')',
            String.valueOf(DEFAULT_MIN_PREFS_PER_USER));
    addOption("booleanData", "b", "Treat input as without pref values", Boolean.FALSE.toString());
    addOption("ratingShift", "rs", "shift ratings by this value", "0.0");

    Map<String, List<String>> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    int minPrefsPerUser = Integer.parseInt(getOption("minPrefsPerUser"));
    boolean booleanData = Boolean.valueOf(getOption("booleanData"));
    float ratingShift = Float.parseFloat(getOption("ratingShift"));
    //convert items to an internal index
    Job itemIDIndex = prepareJob(getInputPath(), getOutputPath(ITEMID_INDEX), TextInputFormat.class,
            ItemIDIndexMapper.class, VarIntWritable.class, VarLongWritable.class, ItemIDIndexReducer.class,
            VarIntWritable.class, VarLongWritable.class, SequenceFileOutputFormat.class);
    itemIDIndex.setCombinerClass(ItemIDIndexReducer.class);
    boolean succeeded = itemIDIndex.waitForCompletion(true);
    if (!succeeded) {
        return -1;
    }
    //convert user preferences into a vector per user
    Job toUserVectors = prepareJob(getInputPath(), getOutputPath(USER_VECTORS), TextInputFormat.class,
            ToItemPrefsMapper.class, VarLongWritable.class,
            booleanData ? VarLongWritable.class : EntityPrefWritable.class, ToUserVectorsReducer.class,
            VarLongWritable.class, VectorWritable.class, SequenceFileOutputFormat.class);
    toUserVectors.getConfiguration().setBoolean(RecommenderJob.BOOLEAN_DATA, booleanData);
    toUserVectors.getConfiguration().setInt(ToUserVectorsReducer.MIN_PREFERENCES_PER_USER, minPrefsPerUser);
    toUserVectors.getConfiguration().set(ToEntityPrefsMapper.RATING_SHIFT, String.valueOf(ratingShift));
    succeeded = toUserVectors.waitForCompletion(true);
    if (!succeeded) {
        return -1;
    }
    //we need the number of users later
    int numberOfUsers = (int) toUserVectors.getCounters().findCounter(ToUserVectorsReducer.Counters.USERS)
            .getValue();
    HadoopUtil.writeInt(numberOfUsers, getOutputPath(NUM_USERS), getConf());
    //build the rating matrix
    Job toItemVectors = prepareJob(getOutputPath(USER_VECTORS), getOutputPath(RATING_MATRIX),
            ToItemVectorsMapper.class, IntWritable.class, VectorWritable.class, ToItemVectorsReducer.class,
            IntWritable.class, VectorWritable.class);
    toItemVectors.setCombinerClass(ToItemVectorsReducer.class);

    /* configure sampling regarding the uservectors */
    if (hasOption("maxPrefsPerUser")) {
        int samplingSize = Integer.parseInt(getOption("maxPrefsPerUser"));
        toItemVectors.getConfiguration().setInt(ToItemVectorsMapper.SAMPLE_SIZE, samplingSize);
    }

    succeeded = toItemVectors.waitForCompletion(true);
    if (!succeeded) {
        return -1;
    }

    return 0;
}

From source file: com.pocketx.gravity.recommender.cf.similarity.job.RowSimilarityJob.java

License: Apache License

@Override
public int run(String[] args) throws Exception {

    addInputOption();
    addOutputOption();
    addOption("numberOfColumns", "r", "Number of columns in the input matrix", false);
    addOption("similarityClassname", "s",
            "Name of distributed similarity class to instantiate, alternatively use "
                    + "one of the predefined similarities (" + VectorSimilarityMeasures.list() + ')');
    addOption("maxSimilaritiesPerRow", "m",
            "Number of maximum similarities per row (default: " + DEFAULT_MAX_SIMILARITIES_PER_ROW + ')',
            String.valueOf(DEFAULT_MAX_SIMILARITIES_PER_ROW));
    addOption("excludeSelfSimilarity", "ess", "compute similarity of rows to themselves?",
            String.valueOf(false));
    addOption("threshold", "tr", "discard row pairs with a similarity value below this", false);
    addOption(DefaultOptionCreator.overwriteOption().create());

    Map<String, List<String>> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    int numberOfColumns;

    if (hasOption("numberOfColumns")) {
        // Number of columns explicitly specified via CLI
        numberOfColumns = Integer.parseInt(getOption("numberOfColumns"));
    } else {
        // else get the number of columns by determining the cardinality of a vector in the input matrix
        numberOfColumns = getDimensions(getInputPath());
    }

    String similarityClassnameArg = getOption("similarityClassname");
    String similarityClassname;
    try {
        similarityClassname = VectorSimilarityMeasures.valueOf(similarityClassnameArg).getClassname();
    } catch (IllegalArgumentException iae) {
        similarityClassname = similarityClassnameArg;
    }

    // Clear the output and temp paths if the overwrite option has been set
    if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
        // Clear the temp path
        HadoopUtil.delete(getConf(), getTempPath());
        // Clear the output path
        HadoopUtil.delete(getConf(), getOutputPath());
    }

    int maxSimilaritiesPerRow = Integer.parseInt(getOption("maxSimilaritiesPerRow"));
    boolean excludeSelfSimilarity = Boolean.parseBoolean(getOption("excludeSelfSimilarity"));
    double threshold = hasOption("threshold") ? Double.parseDouble(getOption("threshold")) : NO_THRESHOLD;

    Path weightsPath = getTempPath("weights");
    Path normsPath = getTempPath("norms.bin");
    Path numNonZeroEntriesPath = getTempPath("numNonZeroEntries.bin");
    Path maxValuesPath = getTempPath("maxValues.bin");
    Path pairwiseSimilarityPath = getTempPath("pairwiseSimilarity");

    AtomicInteger currentPhase = new AtomicInteger();

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job normsAndTranspose = prepareJob(getInputPath(), weightsPath, VectorNormMapper.class,
                IntWritable.class, VectorWritable.class, MergeVectorsReducer.class, IntWritable.class,
                VectorWritable.class);
        normsAndTranspose.setCombinerClass(MergeVectorsCombiner.class);
        Configuration normsAndTransposeConf = normsAndTranspose.getConfiguration();
        normsAndTransposeConf.set(THRESHOLD, String.valueOf(threshold));
        normsAndTransposeConf.set(NORMS_PATH, normsPath.toString());
        normsAndTransposeConf.set(NUM_NON_ZERO_ENTRIES_PATH, numNonZeroEntriesPath.toString());
        normsAndTransposeConf.set(MAXVALUES_PATH, maxValuesPath.toString());
        normsAndTransposeConf.set(SIMILARITY_CLASSNAME, similarityClassname);
        boolean succeeded = normsAndTranspose.waitForCompletion(true);
        if (!succeeded) {
            return -1;
        }
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job pairwiseSimilarity = prepareJob(weightsPath, pairwiseSimilarityPath, CooccurrencesMapper.class,
                IntWritable.class, VectorWritable.class, SimilarityReducer.class, IntWritable.class,
                VectorWritable.class);
        pairwiseSimilarity.setCombinerClass(VectorSumReducer.class);
        Configuration pairwiseConf = pairwiseSimilarity.getConfiguration();
        pairwiseConf.set(THRESHOLD, String.valueOf(threshold));
        pairwiseConf.set(NORMS_PATH, normsPath.toString());
        pairwiseConf.set(NUM_NON_ZERO_ENTRIES_PATH, numNonZeroEntriesPath.toString());
        pairwiseConf.set(MAXVALUES_PATH, maxValuesPath.toString());
        pairwiseConf.set(SIMILARITY_CLASSNAME, similarityClassname);
        pairwiseConf.setInt(NUMBER_OF_COLUMNS, numberOfColumns);
        pairwiseConf.setBoolean(EXCLUDE_SELF_SIMILARITY, excludeSelfSimilarity);
        boolean succeeded = pairwiseSimilarity.waitForCompletion(true);
        if (!succeeded) {
            return -1;
        }
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job asMatrix = prepareJob(pairwiseSimilarityPath, getOutputPath(), UnsymmetrifyMapper.class,
                IntWritable.class, VectorWritable.class, MergeToTopKSimilaritiesReducer.class,
                IntWritable.class, VectorWritable.class);
        asMatrix.setCombinerClass(MergeToTopKSimilaritiesReducer.class);
        asMatrix.getConfiguration().setInt(MAX_SIMILARITIES_PER_ROW, maxSimilaritiesPerRow);
        boolean succeeded = asMatrix.waitForCompletion(true);
        if (!succeeded) {
            return -1;
        }
    }

    return 0;
}

From source file: com.rockstor.compact.GenGarbageIndexTool.java

License: Apache License

private Job createSubmittableJob(Configuration conf) throws IOException {
    Job job = new Job(conf, NAME);

    job.setJarByClass(GenGarbageIndexTool.class);
    Scan scan = new Scan();
    TableMapReduceUtil.initTableMapperJob(GarbageChunkDB.TAB_NAME, scan, GarbageChunkMapper.class,
            ImmutableBytesWritable.class, ImmutableBytesWritable.class, job);

    TableMapReduceUtil.setScannerCaching(job, batchSize);
    job.setReducerClass(GarbageChunkReduce.class);
    job.setPartitionerClass(GarbageChunkPartition.class);
    job.setCombinerClass(GarbageChunkCombine.class);

    job.setNumReduceTasks(Compactor.getInstance().getReduceNum());
    job.setOutputFormatClass(NullOutputFormat.class);

    LOG.info("init job " + NAME + " finished!");
    return job;
}
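
Unlike most of the examples above, this job hands setCombinerClass a dedicated class (GarbageChunkCombine) rather than reusing its reducer. A standalone combiner is still just a Reducer subclass whose input and output pairs both use the map output types, so it can be applied any number of times between the map and reduce phases. A minimal skeleton, with hypothetical names:

import java.io.IOException;

import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.mapreduce.Reducer;

// Hypothetical standalone combiner: both its input and output pairs use the
// map output types (here ImmutableBytesWritable for key and value, matching
// the job above).
public class ExampleCombiner extends Reducer<ImmutableBytesWritable, ImmutableBytesWritable,
        ImmutableBytesWritable, ImmutableBytesWritable> {
    @Override
    protected void reduce(ImmutableBytesWritable key, Iterable<ImmutableBytesWritable> values,
            Context context) throws IOException, InterruptedException {
        // Pre-aggregate the values for this key here; a pass-through
        // combiner would simply re-emit every value unchanged.
        for (ImmutableBytesWritable value : values) {
            context.write(key, value);
        }
    }
}

It would be registered with job.setCombinerClass(ExampleCombiner.class), exactly as in the listing above.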