List of usage examples for org.apache.hadoop.mapred.JobConf#setReducerClass
public void setReducerClass(Class<? extends Reducer> theClass)
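Before the project examples below, here is a minimal, self-contained sketch of the typical way setReducerClass is wired into an old-API (org.apache.hadoop.mapred) job. The WordCountJob driver and its WordCountMap/WordCountReduce inner classes are hypothetical placeholders written for illustration; they are not taken from any of the projects listed on this page.

import java.io.IOException;
import java.util.Iterator;
import java.util.StringTokenizer;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;

public class WordCountJob {

    // Hypothetical mapper: emits (word, 1) for every token in a line.
    public static class WordCountMap extends MapReduceBase
            implements Mapper<LongWritable, Text, Text, IntWritable> {
        private static final IntWritable ONE = new IntWritable(1);
        private final Text word = new Text();

        @Override
        public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output,
                Reporter reporter) throws IOException {
            StringTokenizer tokenizer = new StringTokenizer(value.toString());
            while (tokenizer.hasMoreTokens()) {
                word.set(tokenizer.nextToken());
                output.collect(word, ONE);
            }
        }
    }

    // Hypothetical reducer: sums the counts emitted for each word.
    public static class WordCountReduce extends MapReduceBase
            implements Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        public void reduce(Text key, Iterator<IntWritable> values,
                OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
            int sum = 0;
            while (values.hasNext()) {
                sum += values.next().get();
            }
            output.collect(key, new IntWritable(sum));
        }
    }

    public static void main(String[] args) throws IOException {
        JobConf conf = new JobConf(WordCountJob.class);
        conf.setJobName("wordcount");

        conf.setMapperClass(WordCountMap.class);
        conf.setCombinerClass(WordCountReduce.class);
        // The class passed here must implement org.apache.hadoop.mapred.Reducer,
        // matching the Class<? extends Reducer> bound in the signature above.
        conf.setReducerClass(WordCountReduce.class);

        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(IntWritable.class);
        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);

        // args[0] and args[1] are assumed to be the input and output paths.
        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        JobClient.runJob(conf);
    }
}

Jobs that need no reduce phase typically call conf.setNumReduceTasks(0) instead of setting a reducer class; several of the examples below also pass IdentityReducer when the reduce step only forwards its input.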
From source file:org.apache.cassandra.bulkloader.CassandraBulkLoader.java
License:Apache License
public static void runJob(String[] args) {
    JobConf conf = new JobConf(CassandraBulkLoader.class);

    if (args.length >= 4) {
        conf.setNumReduceTasks(new Integer(args[3]));
    }

    try {
        // We store the cassandra storage-conf.xml on the HDFS cluster
        DistributedCache.addCacheFile(new URI("/cassandra/storage-conf.xml#storage-conf.xml"), conf);
    } catch (URISyntaxException e) {
        throw new RuntimeException(e);
    }

    conf.setInputFormat(KeyValueTextInputFormat.class);
    conf.setJobName("CassandraBulkLoader_v2");
    conf.setMapperClass(Map.class);
    conf.setReducerClass(Reduce.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    FileInputFormat.setInputPaths(conf, new Path(args[1]));
    FileOutputFormat.setOutputPath(conf, new Path(args[2]));

    try {
        JobClient.runJob(conf);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
From source file:org.apache.hcatalog.hcatmix.load.HadoopLoadGenerator.java
License:Apache License
/**
 * Prepare input directory/jobConf and launch the hadoop job, for load testing
 *
 * @param confFileName The properties file for the task, should be available in the classpath
 * @param conf the base Hadoop configuration to build the JobConf from (may be null)
 * @return the reduce results read back from the job's output directory
 * @throws IOException
 * @throws MetaException
 * @throws TException
 */
public SortedMap<Long, ReduceResult> runLoadTest(String confFileName, Configuration conf)
        throws Exception, MetaException, TException {
    JobConf jobConf;
    if (conf != null) {
        jobConf = new JobConf(conf);
    } else {
        jobConf = new JobConf(new Configuration());
    }

    InputStream confFileIS;
    try {
        confFileIS = HCatMixUtils.getInputStream(confFileName);
    } catch (Exception e) {
        LOG.error("Couldn't load configuration file " + confFileName);
        throw e;
    }

    Properties props = new Properties();
    try {
        props.load(confFileIS);
    } catch (IOException e) {
        LOG.error("Couldn't load properties file: " + confFileName, e);
        throw e;
    }

    LOG.info("Loading configuration file: " + confFileName);
    addToJobConf(jobConf, props, Conf.MAP_RUN_TIME_MINUTES);
    addToJobConf(jobConf, props, Conf.STAT_COLLECTION_INTERVAL_MINUTE);
    addToJobConf(jobConf, props, Conf.THREAD_INCREMENT_COUNT);
    addToJobConf(jobConf, props, Conf.THREAD_INCREMENT_INTERVAL_MINUTES);
    addToJobConf(jobConf, props, Conf.THREAD_COMPLETION_BUFFER_MINUTES);

    int numMappers = Integer
            .parseInt(props.getProperty(Conf.NUM_MAPPERS.propName, "" + Conf.NUM_MAPPERS.defaultValue));
    Path inputDir = new Path(props.getProperty(Conf.INPUT_DIR.propName, Conf.INPUT_DIR.defaultValueStr));
    Path outputDir = new Path(props.getProperty(Conf.OUTPUT_DIR.propName, Conf.OUTPUT_DIR.defaultValueStr));

    jobConf.setJobName(JOB_NAME);
    jobConf.setNumMapTasks(numMappers);
    jobConf.setMapperClass(HCatMapper.class);
    jobConf.setJarByClass(HCatMapper.class);
    jobConf.setReducerClass(HCatReducer.class);
    jobConf.setMapOutputKeyClass(LongWritable.class);
    jobConf.setMapOutputValueClass(IntervalResult.class);
    jobConf.setOutputKeyClass(LongWritable.class);
    jobConf.setOutputValueClass(ReduceResult.class);
    jobConf.setOutputFormat(SequenceFileOutputFormat.class);
    jobConf.set(Conf.TASK_CLASS_NAMES.getJobConfKey(),
            props.getProperty(Conf.TASK_CLASS_NAMES.propName, Conf.TASK_CLASS_NAMES.defaultValueStr));

    fs = FileSystem.get(jobConf);
    Path jarRoot = new Path("/tmp/hcatmix_jar_" + new Random().nextInt());
    HadoopUtils.uploadClasspathAndAddToJobConf(jobConf, jarRoot);
    fs.deleteOnExit(jarRoot);

    FileInputFormat.setInputPaths(jobConf, createInputFiles(inputDir, numMappers));
    if (fs.exists(outputDir)) {
        fs.delete(outputDir, true);
    }
    FileOutputFormat.setOutputPath(jobConf, outputDir);

    // Set up delegation token required for hiveMetaStoreClient in map task
    HiveConf hiveConf = new HiveConf(HadoopLoadGenerator.class);
    HiveMetaStoreClient hiveClient = new HiveMetaStoreClient(hiveConf);
    String tokenStr = hiveClient.getDelegationToken(UserGroupInformation.getCurrentUser().getUserName(),
            "mapred");
    Token<? extends AbstractDelegationTokenIdentifier> token = new Token<DelegationTokenIdentifier>();
    token.decodeFromUrlString(tokenStr);
    token.setService(new Text(METASTORE_TOKEN_SIGNATURE));
    jobConf.getCredentials().addToken(new Text(METASTORE_TOKEN_KEY), token);

    // Submit the job, once the job is complete see output
    LOG.info("Submitted hadoop job");
    RunningJob j = JobClient.runJob(jobConf);
    LOG.info("JobID is: " + j.getJobName());

    if (!j.isSuccessful()) {
        throw new IOException("Job failed");
    }
    return readResult(outputDir, jobConf);
}
From source file:org.apache.ignite.internal.processors.hadoop.examples.GridHadoopWordCount1.java
License:Apache License
/**
 * Sets task classes with related info if needed into configuration object.
 *
 * @param jobConf Configuration to change.
 * @param setMapper Option to set mapper and input format classes.
 * @param setCombiner Option to set combiner class.
 * @param setReducer Option to set reducer and output format classes.
 */
public static void setTasksClasses(JobConf jobConf, boolean setMapper, boolean setCombiner,
        boolean setReducer) {
    if (setMapper) {
        jobConf.setMapperClass(GridHadoopWordCount1Map.class);
        jobConf.setInputFormat(TextInputFormat.class);
    }

    if (setCombiner)
        jobConf.setCombinerClass(GridHadoopWordCount1Reduce.class);

    if (setReducer) {
        jobConf.setReducerClass(GridHadoopWordCount1Reduce.class);
        jobConf.setOutputFormat(TextOutputFormat.class);
    }
}
From source file:org.apache.ignite.internal.processors.hadoop.examples.HadoopWordCount1.java
License:Apache License
/**
 * Sets task classes with related info if needed into configuration object.
 *
 * @param jobConf Configuration to change.
 * @param setMapper Option to set mapper and input format classes.
 * @param setCombiner Option to set combiner class.
 * @param setReducer Option to set reducer and output format classes.
 */
public static void setTasksClasses(JobConf jobConf, boolean setMapper, boolean setCombiner,
        boolean setReducer) {
    if (setMapper) {
        jobConf.setMapperClass(HadoopWordCount1Map.class);
        jobConf.setInputFormat(TextInputFormat.class);
    }

    if (setCombiner)
        jobConf.setCombinerClass(HadoopWordCount1Reduce.class);

    if (setReducer) {
        jobConf.setReducerClass(HadoopWordCount1Reduce.class);
        jobConf.setOutputFormat(TextOutputFormat.class);
    }
}
From source file:org.apache.mahout.avro.text.mapred.AvroDocumentProcessor.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    JobConf conf = new JobConf();

    if (args.length != 2) {
        System.err.println("Usage: wordcount <in> <out>");
        return 0;
    }

    conf.setStrings("io.serializations",
            new String[] { WritableSerialization.class.getName(), AvroSpecificSerialization.class.getName(),
                    AvroReflectSerialization.class.getName(), AvroGenericSerialization.class.getName() });

    AvroComparator.setSchema(AvroDocument._SCHEMA); //TODO: must be done in mapper, reducer configure method.
    conf.setClass("mapred.output.key.comparator.class", AvroComparator.class, RawComparator.class);

    conf.setJarByClass(AvroDocumentProcessor.class);
    conf.setMapperClass(ProcessorMapper.class);
    conf.setReducerClass(IdentityReducer.class);
    conf.setOutputKeyClass(AvroDocument.class);
    conf.setOutputValueClass(NullWritable.class);

    conf.setInputFormat(AvroInputFormat.class);
    conf.setOutputFormat(AvroOutputFormat.class);

    AvroInputFormat.setAvroInputClass(conf, AvroDocument.class);
    AvroOutputFormat.setAvroOutputClass(conf, AvroDocument.class);

    Path input = new Path(args[0]);
    Path output = new Path(args[1]);

    FileSystem fs = FileSystem.get(conf);
    fs.delete(output, true);

    FileInputFormat.addInputPath(conf, input);
    FileOutputFormat.setOutputPath(conf, output);

    RunningJob job = JobClient.runJob(conf);
    job.waitForCompletion();

    return job.isComplete() ? 0 : 1;
}
From source file:org.apache.mahout.avro.text.mapred.AvroDocumentsWordCount.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    JobConf conf = new JobConf();

    if (args.length != 2) {
        System.err.println("Usage: wordcount <in> <out>");
        return 0;
    }

    conf.setStrings("io.serializations",
            new String[] { WritableSerialization.class.getName(), AvroSpecificSerialization.class.getName(),
                    AvroReflectSerialization.class.getName(), AvroGenericSerialization.class.getName() });

    conf.setJarByClass(AvroDocumentsWordCount.class);
    conf.setMapperClass(TokenizerMapper.class);
    conf.setCombinerClass(IntSumReducer.class);
    conf.setReducerClass(IntSumReducer.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setInputFormat(AvroInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    Path input = new Path(args[0]);
    Path output = new Path(args[1]);

    FileSystem fs = FileSystem.get(conf);
    fs.delete(output, true);

    AvroInputFormat.setAvroInputClass(conf, AvroDocument.class);
    FileInputFormat.addInputPath(conf, input);
    FileOutputFormat.setOutputPath(conf, output);

    RunningJob job = JobClient.runJob(conf);
    job.waitForCompletion();

    return job.isSuccessful() ? 1 : 0;
}
From source file:org.apache.mahout.avro.text.mapred.WikipediaToAvroDocuments.java
License:Apache License
/**
 * Run the job
 *
 * @param input
 *          the input pathname String
 * @param output
 *          the output pathname String
 * @param catFile
 *          the file containing the Wikipedia categories
 * @param exactMatchOnly
 *          if true, then the Wikipedia category must match exactly instead of
 *          simply containing the category string
 * @param all
 *          if true select all categories
 */
public static int runJob(String input, String output, String catFile, boolean exactMatchOnly, boolean all)
        throws IOException {
    JobClient client = new JobClient();
    JobConf conf = new JobConf(WikipediaToAvroDocuments.class);

    if (log.isInfoEnabled()) {
        log.info("Input: " + input + " Out: " + output + " Categories: " + catFile + " All Files: " + all);
    }

    Path inPath = new Path(input);
    Path outPath = new Path(output);
    FileInputFormat.setInputPaths(conf, inPath);
    FileOutputFormat.setOutputPath(conf, outPath);
    //AvroOutputFormat.setClass(conf, AvroDocument.class);
    //AvroOutputFormat.setSchema(conf, AvroDocument._SCHEMA);
    conf.set("xmlinput.start", "<page>");
    conf.set("xmlinput.end", "</page>");
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(AvroDocument.class);
    conf.setBoolean("exact.match.only", exactMatchOnly);
    conf.setBoolean("all.files", all);
    conf.setMapperClass(WikipediaAvroDocumentMapper.class);
    conf.setInputFormat(XmlInputFormat.class);
    conf.setReducerClass(IdentityReducer.class);
    conf.setOutputFormat(AvroOutputFormat.class);

    AvroOutputFormat.setAvroOutputClass(conf, AvroDocument.class);

    FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
    if (dfs.exists(outPath)) {
        dfs.delete(outPath, true);
    }

    Set<String> categories = new HashSet<String>();
    if (catFile.equals("") == false) {
        for (String line : new FileLineIterable(new File(catFile))) {
            categories.add(line.trim().toLowerCase());
        }
    }

    DefaultStringifier<Set<String>> setStringifier = new DefaultStringifier<Set<String>>(conf,
            GenericsUtil.getClass(categories));
    String categoriesStr = setStringifier.toString(categories);
    conf.set("wikipedia.categories", categoriesStr);

    client.setConf(conf);
    RunningJob job = JobClient.runJob(conf);
    job.waitForCompletion();

    return job.isSuccessful() ? 1 : 0;
}
From source file:org.apache.mahout.classifier.bayes.BayesThetaNormalizerDriver.java
License:Apache License
/**
 * Run the job
 *
 * @param input the input pathname String
 * @param output the output pathname String
 */
public static void runJob(String input, String output) throws IOException {
    JobClient client = new JobClient();
    JobConf conf = new JobConf(BayesThetaNormalizerDriver.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(DoubleWritable.class);
    FileInputFormat.addInputPath(conf, new Path(output + "/trainer-tfIdf/trainer-tfIdf"));
    Path outPath = new Path(output + "/trainer-thetaNormalizer");
    FileOutputFormat.setOutputPath(conf, outPath);
    conf.setNumMapTasks(100);
    //conf.setNumReduceTasks(1);
    conf.setMapperClass(BayesThetaNormalizerMapper.class);
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setCombinerClass(BayesThetaNormalizerReducer.class);
    conf.setReducerClass(BayesThetaNormalizerReducer.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.set("io.serializations",
            "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization");
    // Don't ever forget this. People should keep track of how hadoop conf parameters can make or break a piece of code

    FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
    if (dfs.exists(outPath)) {
        dfs.delete(outPath, true);
    }

    Path Sigma_kFiles = new Path(output + "/trainer-weights/Sigma_k/*");
    Map<String, Double> labelWeightSum = SequenceFileModelReader.readLabelSums(dfs, Sigma_kFiles, conf);
    DefaultStringifier<Map<String, Double>> mapStringifier = new DefaultStringifier<Map<String, Double>>(conf,
            GenericsUtil.getClass(labelWeightSum));
    String labelWeightSumString = mapStringifier.toString(labelWeightSum);

    log.info("Sigma_k for Each Label");
    Map<String, Double> c = mapStringifier.fromString(labelWeightSumString);
    log.info("{}", c);
    conf.set("cnaivebayes.sigma_k", labelWeightSumString);

    Path sigma_kSigma_jFile = new Path(output + "/trainer-weights/Sigma_kSigma_j/*");
    double sigma_jSigma_k = SequenceFileModelReader.readSigma_jSigma_k(dfs, sigma_kSigma_jFile, conf);
    DefaultStringifier<Double> stringifier = new DefaultStringifier<Double>(conf, Double.class);
    String sigma_jSigma_kString = stringifier.toString(sigma_jSigma_k);

    log.info("Sigma_kSigma_j for each Label and for each Features");
    double retSigma_jSigma_k = stringifier.fromString(sigma_jSigma_kString);
    log.info("{}", retSigma_jSigma_k);
    conf.set("cnaivebayes.sigma_jSigma_k", sigma_jSigma_kString);

    Path vocabCountFile = new Path(output + "/trainer-tfIdf/trainer-vocabCount/*");
    double vocabCount = SequenceFileModelReader.readVocabCount(dfs, vocabCountFile, conf);
    String vocabCountString = stringifier.toString(vocabCount);

    log.info("Vocabulary Count");
    conf.set("cnaivebayes.vocabCount", vocabCountString);
    double retvocabCount = stringifier.fromString(vocabCountString);
    log.info("{}", retvocabCount);

    client.setConf(conf);
    JobClient.runJob(conf);
}
From source file:org.apache.mahout.classifier.bayes.common.BayesFeatureDriver.java
License:Apache License
/**
 * Run the job
 *
 * @param input the input pathname String
 * @param output the output pathname String
 * @param gramSize the gram size to store in the job configuration (bayes.gramSize)
 */
public static void runJob(String input, String output, int gramSize) throws IOException {
    JobClient client = new JobClient();
    JobConf conf = new JobConf(BayesFeatureDriver.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(DoubleWritable.class);
    FileInputFormat.setInputPaths(conf, new Path(input));
    Path outPath = new Path(output);
    FileOutputFormat.setOutputPath(conf, outPath);
    conf.setNumMapTasks(100);
    //conf.setNumReduceTasks(1);
    conf.setMapperClass(BayesFeatureMapper.class);
    conf.setInputFormat(KeyValueTextInputFormat.class);
    conf.setCombinerClass(BayesFeatureReducer.class);
    conf.setReducerClass(BayesFeatureReducer.class);
    conf.setOutputFormat(BayesFeatureOutputFormat.class);
    conf.set("io.serializations",
            "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization");
    // Don't ever forget this. People should keep track of how hadoop conf parameters can make or break a piece of code

    FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
    if (dfs.exists(outPath)) {
        dfs.delete(outPath, true);
    }

    DefaultStringifier<Integer> intStringifier = new DefaultStringifier<Integer>(conf, Integer.class);
    String gramSizeString = intStringifier.toString(gramSize);

    log.info("{}", intStringifier.fromString(gramSizeString));
    conf.set("bayes.gramSize", gramSizeString);

    client.setConf(conf);
    JobClient.runJob(conf);
}
From source file:org.apache.mahout.classifier.bayes.common.BayesTfIdfDriver.java
License:Apache License
/**
 * Run the job
 *
 * @param input the input pathname String
 * @param output the output pathname String
 */
public static void runJob(String input, String output) throws IOException {
    JobClient client = new JobClient();
    JobConf conf = new JobConf(BayesTfIdfDriver.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(DoubleWritable.class);
    FileInputFormat.addInputPath(conf, new Path(output + "/trainer-termDocCount"));
    FileInputFormat.addInputPath(conf, new Path(output + "/trainer-wordFreq"));
    FileInputFormat.addInputPath(conf, new Path(output + "/trainer-featureCount"));
    Path outPath = new Path(output + "/trainer-tfIdf");
    FileOutputFormat.setOutputPath(conf, outPath);
    conf.setNumMapTasks(100);
    conf.setMapperClass(BayesTfIdfMapper.class);
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setCombinerClass(BayesTfIdfReducer.class);
    conf.setReducerClass(BayesTfIdfReducer.class);
    conf.setOutputFormat(BayesTfIdfOutputFormat.class);
    conf.set("io.serializations",
            "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization");
    // Don't ever forget this. People should keep track of how hadoop conf parameters can make or break a piece of code

    FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
    if (dfs.exists(outPath)) {
        dfs.delete(outPath, true);
    }

    Path interimFile = new Path(output + "/trainer-docCount/part-*");
    Map<String, Double> labelDocumentCounts = SequenceFileModelReader.readLabelDocumentCounts(dfs, interimFile,
            conf);
    DefaultStringifier<Map<String, Double>> mapStringifier = new DefaultStringifier<Map<String, Double>>(conf,
            GenericsUtil.getClass(labelDocumentCounts));
    String labelDocumentCountString = mapStringifier.toString(labelDocumentCounts);

    log.info("Counts of documents in Each Label");
    Map<String, Double> c = mapStringifier.fromString(labelDocumentCountString);
    log.info("{}", c);
    conf.set("cnaivebayes.labelDocumentCounts", labelDocumentCountString);

    client.setConf(conf);
    JobClient.runJob(conf);
}