Example usage for org.apache.hadoop.mapreduce Job setMapperClass

List of usage examples for org.apache.hadoop.mapreduce Job setMapperClass

Introduction

On this page you can find example usages of org.apache.hadoop.mapreduce Job setMapperClass.

Prototype

public void setMapperClass(Class<? extends Mapper> cls) throws IllegalStateException 

Document

Set the Mapper for the job.
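
The snippet below is a minimal, self-contained sketch of the typical pattern: define a Mapper subclass and register it on the Job before submission. The class and key/value choices (SetMapperExample, LineLengthMapper) are illustrative names, not taken from the examples that follow.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class SetMapperExample {

    // Illustrative mapper: emits each input line together with its length in bytes.
    public static class LineLengthMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            context.write(value, new IntWritable(value.getLength()));
        }
    }

    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "set-mapper example");
        job.setJarByClass(SetMapperExample.class);
        // Register the Mapper implementation; calling this after the job has been
        // submitted raises the IllegalStateException declared in the prototype.
        job.setMapperClass(LineLengthMapper.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}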

Usage

From source file:com.datasalt.pangool.benchmark.wordcount.HadoopWordCount.java

License:Apache License

public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();

    if (otherArgs.length != 2) {
        System.err.println("Usage: wordcount <in> <out>");
        System.exit(2);
    }
    //conf.setBoolean("hadoop.security.authorization", false);
    //conf.set("hadoop.security.authentication","simple");
    Job job = new Job(conf, "word count");
    job.setJarByClass(HadoopWordCount.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    HadoopUtils.deleteIfExists(FileSystem.get(conf), new Path(otherArgs[1]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
    job.waitForCompletion(true);
}
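
The TokenizerMapper and IntSumReducer classes registered above are not shown in this file. They typically follow the standard Hadoop WordCount pattern; the sketch below is an assumption about their shape, not the actual Pangool benchmark code (imports of java.util.StringTokenizer and the org.apache.hadoop.io / mapreduce classes are omitted, as in the surrounding snippets).

public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> {
    private final static IntWritable one = new IntWritable(1);
    private Text word = new Text();

    @Override
    public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
        StringTokenizer itr = new StringTokenizer(value.toString());
        while (itr.hasMoreTokens()) {
            word.set(itr.nextToken());
            context.write(word, one); // emit (token, 1) for every token in the line
        }
    }
}

public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    private IntWritable result = new IntWritable();

    @Override
    public void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable val : values) {
            sum += val.get(); // add up the counts emitted by the mapper/combiner
        }
        result.set(sum);
        context.write(key, result);
    }
}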

From source file:com.datasalt.pangool.tuplemr.mapred.lib.input.PangoolMultipleInputs.java

License:Apache License

/**
 * Add a {@link Path} with a custom {@link InputFormat} and {@link Mapper} to the list of inputs for the map-reduce
 * job. Returns the instance files created.
 *
 * @param job
 *          The {@link Job}
 * @param path
 *          {@link Path} to be added to the list of inputs for the job
 * @param inputFormat
 *          {@link InputFormat} class to use for this path
 * @param mapperInstance
 *          {@link Mapper} instance to use
 * @throws IOException
 * @throws FileNotFoundException
 */
public static Set<String> addInputPath(Job job, Path path, InputFormat inputFormat, Mapper mapperInstance,
        Map<String, String> specificContext) throws FileNotFoundException, IOException {

    Set<String> instanceFiles = new HashSet<String>();
    // Serialize the Mapper instance
    String uniqueNameMapper = UUID.randomUUID().toString() + '.' + "mapper.dat";
    try {
        InstancesDistributor.distribute(mapperInstance, uniqueNameMapper, job.getConfiguration());
        instanceFiles.add(uniqueNameMapper);
    } catch (URISyntaxException e) {
        throw new IOException(e);
    }
    // Serialize the Input Format
    String uniqueNameInputFormat = UUID.randomUUID().toString() + '.' + "inputFormat.dat";
    try {
        InstancesDistributor.distribute(inputFormat, uniqueNameInputFormat, job.getConfiguration());
        instanceFiles.add(uniqueNameInputFormat);
    } catch (URISyntaxException e) {
        throw new IOException(e);
    }
    for (Map.Entry<String, String> contextKeyValue : specificContext.entrySet()) {
        PangoolMultipleInputs.addInputContext(job, uniqueNameInputFormat, contextKeyValue.getKey(),
                contextKeyValue.getValue());
    }
    addInputPath(job, path, uniqueNameInputFormat);
    Configuration conf = job.getConfiguration();
    String mapperMapping = path.toString() + ";" + uniqueNameMapper;
    String mappers = conf.get(PANGOOL_INPUT_DIR_MAPPERS_CONF);
    conf.set(PANGOOL_INPUT_DIR_MAPPERS_CONF, mappers == null ? mapperMapping : mappers + "," + mapperMapping);
    job.setMapperClass(DelegatingMapper.class);
    return instanceFiles;
}
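
As a usage sketch (MyJsonInputFormat, MyJsonMapper and the context key below are hypothetical, serializable implementations invented for illustration), the method can be called once per input path; it registers DelegatingMapper via setMapperClass internally:

Job job = new Job(conf, "multiple-inputs example");
Map<String, String> inputContext = new HashMap<String, String>();
inputContext.put("my.input.property", "value"); // per-path configuration entries

// Registers /data/json with its own InputFormat and Mapper instance and
// returns the names of the instance files distributed to the cluster.
Set<String> instanceFiles = PangoolMultipleInputs.addInputPath(job, new Path("/data/json"),
        new MyJsonInputFormat(), new MyJsonMapper(), inputContext);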

From source file:com.digitalpebble.behemoth.mahout.BehemothDocumentProcessor.java

License:Apache License

/**
 * Converts the input documents into a token array using {@link StringTuple}.
 * The input documents have to be in {@link org.apache.hadoop.io.SequenceFile}
 * format.
 *
 * @param input
 *            input directory of the documents in
 *            {@link org.apache.hadoop.io.SequenceFile} format
 * @param output
 *            output directory where the {@link StringTuple} token array of
 *            each document will be created
 * @param type
 *            The annotation type representing the tokens
 * @param feature
 *            The name of the feature holding the token value
 * @throws IOException
 * @throws ClassNotFoundException
 * @throws InterruptedException
 */
public static void tokenizeDocuments(Path input, String type, String feature, Path output)
        throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration();
    // this conf parameter needs to be set to enable serialisation of conf
    // values
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
    conf.set(TOKEN_TYPE, type);
    conf.set(FEATURE_NAME, feature);

    Job job = new Job(conf);
    job.setJobName("DocumentProcessor::BehemothTokenizer: input-folder: " + input);
    job.setJarByClass(BehemothDocumentProcessor.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(StringTuple.class);
    FileInputFormat.setInputPaths(job, input);
    FileOutputFormat.setOutputPath(job, output);

    job.setMapperClass(BehemothTokenizerMapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setNumReduceTasks(0);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    HadoopUtil.delete(conf, output);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded)
        throw new IllegalStateException("Job failed!");
}
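
A possible invocation of this method (the annotation type "Token" and feature name "string" are placeholders, not values prescribed by Behemoth):

Path input = new Path("/behemoth/corpus");
Path output = new Path("/behemoth/tokens");
// Tokens are read from annotations of the given type; the feature holds the token text.
BehemothDocumentProcessor.tokenizeDocuments(input, "Token", "string", output);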

From source file:com.digitalpebble.behemoth.mahout.BehemothDocumentProcessor.java

License:Apache License

/**
 * Converts the input documents into a token array using {@link StringTuple}.
 * The input documents have to be in {@link org.apache.hadoop.io.SequenceFile}
 * format.
 *
 * @param input
 *            input directory of the documents in
 *            {@link org.apache.hadoop.io.SequenceFile} format
 * @param output
 *            output directory where the {@link StringTuple} token array of
 *            each document will be created
 * @param analyzerClass
 *            The Lucene {@link Analyzer} for tokenizing the UTF-8 text
 */
public static void tokenizeDocuments(Path input, Class<? extends Analyzer> analyzerClass, Path output,
        Configuration baseConf) throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration(baseConf);
    // this conf parameter needs to be set to enable serialisation of conf
    // values
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
    conf.set(ANALYZER_CLASS, analyzerClass.getName());

    Job job = new Job(conf);
    job.setJobName("DocumentProcessor::LuceneTokenizer: input-folder: " + input);
    job.setJarByClass(BehemothDocumentProcessor.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(StringTuple.class);
    FileInputFormat.setInputPaths(job, input);
    FileOutputFormat.setOutputPath(job, output);

    job.setMapperClass(LuceneTokenizerMapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setNumReduceTasks(0);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    HadoopUtil.delete(conf, output);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded)
        throw new IllegalStateException("Job failed!");

}
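
For this analyzer-based variant, an invocation might pass any Lucene Analyzer class, for example org.apache.lucene.analysis.standard.StandardAnalyzer (the paths below are placeholders):

Configuration conf = new Configuration();
BehemothDocumentProcessor.tokenizeDocuments(new Path("/behemoth/corpus"),
        StandardAnalyzer.class, new Path("/behemoth/tokens-lucene"), conf);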

From source file:com.digitalpebble.behemoth.mahout.BehemothDocumentProcessor.java

License:Apache License

public static void dumpLabels(Path input, Path output, Configuration baseConf)
        throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration(baseConf);
    // this conf parameter needs to be set to enable serialisation of conf
    // values
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");

    Job job = new Job(conf);
    job.setJobName("DocumentProcessor::LabelDumper: input-folder: " + input);
    job.setJarByClass(BehemothDocumentProcessor.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    FileInputFormat.setInputPaths(job, input);
    FileOutputFormat.setOutputPath(job, output);

    job.setMapperClass(BehemothLabelMapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setNumReduceTasks(0);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    HadoopUtil.delete(conf, output);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded)
        throw new IllegalStateException("Job failed!");

}

From source file:com.digitalpebble.behemoth.mahout.DocumentProcessor.java

License:Apache License

/**
 * Converts the input documents into a token array using {@link StringTuple}.
 * The input documents have to be in {@link org.apache.hadoop.io.SequenceFile}
 * format.
 *
 * @param input
 *            input directory of the documents in
 *            {@link org.apache.hadoop.io.SequenceFile} format
 * @param output
 *            output directory where the {@link StringTuple} token array of
 *            each document will be created
 * @param type
 *            The annotation type representing the tokens
 * @param feature
 *            The name of the feature holding the token value
 * @throws IOException
 * @throws ClassNotFoundException
 * @throws InterruptedException
 */
public static void tokenizeDocuments(Path input, String type, String feature, Path output)
        throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration();
    // this conf parameter needs to be set to enable serialisation of conf
    // values
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
    conf.set(TOKEN_TYPE, type);
    conf.set(FEATURE_NAME, feature);

    Job job = new Job(conf);
    job.setJobName("DocumentProcessor::DocumentTokenizer: input-folder: " + input);
    job.setJarByClass(DocumentProcessor.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(StringTuple.class);
    FileInputFormat.setInputPaths(job, input);
    FileOutputFormat.setOutputPath(job, output);

    job.setMapperClass(SequenceFileTokenizerMapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setNumReduceTasks(0);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    HadoopUtil.delete(conf, output);

    job.waitForCompletion(true);
}

From source file:com.dipwater.accountAnalyze.WordCount.java

License:Apache License

public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    conf.set("mapred.job.tracker", "192.168.1.51:9001");
    conf.set("fs.default.name", "hdfs://192.168.1.51:9000");

    String[] ars = new String[] { "input", "newout" };
    String[] otherArgs = new GenericOptionsParser(conf, ars).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: wordcount <in> <out>");
        System.exit(2);
    }
    Job job = new Job(conf, "word count");

    File jarFile = EJob.createTempJar("bin");
    EJob.addClasspath("/home/hadoop/hadoop-1.2.1/conf");
    ClassLoader classLoader = EJob.getClassLoader();
    Thread.currentThread().setContextClassLoader(classLoader);
    ((JobConf) job.getConfiguration()).setJar(jarFile.toString());

    //job.setJarByClass(WordCount.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));

    System.exit(job.waitForCompletion(true) ? 0 : 1);
}

From source file:com.dlyle.testh20.H2OTest.java

@Override
public int run(String[] args) throws Exception {
    Configuration conf = this.getConf();
    Job job = Job.getInstance(conf, "H2O Test");
    job.setJarByClass(H2OTest.class);
    job.setMapperClass(H2OMapper.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    return job.waitForCompletion(true) ? 0 : 1;
}
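
H2OTest implements run(String[]) in the Hadoop Tool style; a driver main would typically hand it to ToolRunner (a sketch, assuming the class extends Configured and implements Tool):

public static void main(String[] args) throws Exception {
    // ToolRunner parses the generic options (-D, -files, ...) before calling run()
    int exitCode = ToolRunner.run(new Configuration(), new H2OTest(), args);
    System.exit(exitCode);
}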

From source file:com.elephantscale.hbase.book.chapter1.SimpleMR.java

License:Apache License

public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: SimpleMR <in> <out>");
        return;
    }
    Job job = new Job(conf, "SimpleMR");
    job.setJarByClass(SimpleMR.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
    job.waitForCompletion(true);
}

From source file:com.elex.dmp.lda.CVB0Driver.java

License:Apache License

private static double calculatePerplexity(Configuration conf, Path corpusPath, Path modelPath, int iteration)
        throws IOException, ClassNotFoundException, InterruptedException {
    String jobName = "Calculating perplexity for " + modelPath;
    log.info("About to run: " + jobName);
    Job job = new Job(conf, jobName);
    job.setJarByClass(CachingCVB0PerplexityMapper.class);
    job.setMapperClass(CachingCVB0PerplexityMapper.class);
    job.setCombinerClass(DualDoubleSumReducer.class);
    job.setReducerClass(DualDoubleSumReducer.class);
    job.setNumReduceTasks(1);
    job.setOutputKeyClass(DoubleWritable.class);
    job.setOutputValueClass(DoubleWritable.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileInputFormat.addInputPath(job, corpusPath);
    Path outputPath = perplexityPath(modelPath.getParent(), iteration);
    FileOutputFormat.setOutputPath(job, outputPath);
    setModelPaths(job, modelPath);
    HadoopUtil.delete(conf, outputPath);
    if (!job.waitForCompletion(true)) {
        throw new InterruptedException("Failed to calculate perplexity for: " + modelPath);
    }
    return readPerplexity(conf, modelPath.getParent(), iteration);
}