Example usage for org.apache.hadoop.mapred JobConf setMapperClass

List of usage examples for org.apache.hadoop.mapred JobConf setMapperClass

Introduction

In this page you can find the example usage for org.apache.hadoop.mapred JobConf setMapperClass.

Prototype

public void setMapperClass(Class<? extends Mapper> theClass) 

Source Link

Document

Set the Mapper class for the job.

Usage

From source file:com.digitalpebble.behemoth.solr.LucidWorksIndexerJob.java

License:Apache License

public int run(String[] args) throws Exception {

    final FileSystem fs = FileSystem.get(getConf());

    if (args.length != 2) {
        String syntax = "com.digitalpebble.solr.LucidWorksIndexerJob in solrURL";
        System.err.println(syntax);
        return -1;
    }//from   w  ww  .j a v  a 2 s .c  o m

    Path inputPath = new Path(args[0]);
    String solrURL = args[1];

    JobConf job = new JobConf(getConf());

    job.setJarByClass(this.getClass());

    job.setJobName("Indexing " + inputPath + " into LucidWorks");

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setOutputFormat(LucidWorksOutputFormat.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(BehemothDocument.class);

    job.setMapperClass(IdentityMapper.class);
    // no reducer : send straight to SOLR at end of mapping
    job.setNumReduceTasks(0);

    FileInputFormat.addInputPath(job, inputPath);
    final Path tmp = new Path("tmp_" + System.currentTimeMillis() + "-" + new Random().nextInt());
    FileOutputFormat.setOutputPath(job, tmp);

    job.set("solr.server.url", solrURL);

    try {
        long start = System.currentTimeMillis();
        JobClient.runJob(job);
        long finish = System.currentTimeMillis();
        if (LOG.isInfoEnabled()) {
            LOG.info("LucidWorksIndexerJob completed. Time " + (finish - start) + " ms");
        }
    } catch (Exception e) {
        LOG.error(e);
    } finally {
        fs.delete(tmp, true);
    }

    return 0;
}

From source file:com.digitalpebble.behemoth.solr.SOLRIndexerJob.java

License:Apache License

public int run(String[] args) throws Exception {

    final FileSystem fs = FileSystem.get(getConf());

    if (args.length != 2) {
        String syntax = "com.digitalpebble.solr.SOLRIndexerJob in solrURL";
        System.err.println(syntax);
        return -1;
    }//from w w w . j a va2s. co m

    Path inputPath = new Path(args[0]);
    String solrURL = args[1];

    JobConf job = new JobConf(getConf());

    job.setJarByClass(this.getClass());

    job.setJobName("Indexing " + inputPath + " into SOLR");

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setOutputFormat(SOLROutputFormat.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(BehemothDocument.class);

    job.setMapperClass(IdentityMapper.class);
    // no reducer : send straight to SOLR at end of mapping
    job.setNumReduceTasks(0);

    FileInputFormat.addInputPath(job, inputPath);
    final Path tmp = new Path("tmp_" + System.currentTimeMillis() + "-" + new Random().nextInt());
    FileOutputFormat.setOutputPath(job, tmp);

    job.set("solr.server.url", solrURL);

    try {
        long start = System.currentTimeMillis();
        JobClient.runJob(job);
        long finish = System.currentTimeMillis();
        if (LOG.isInfoEnabled()) {
            LOG.info("SOLRIndexerJob completed. Timing: " + (finish - start) + " ms");
        }
    } catch (Exception e) {
        LOG.error(e);
    } finally {
        fs.delete(tmp, true);
    }

    return 0;
}

From source file:com.digitalpebble.behemoth.tika.TikaDriver.java

License:Apache License

public int run(String[] args) throws Exception {

    final FileSystem fs = FileSystem.get(getConf());
    GroupBuilder gBuilder = new GroupBuilder().withName("Options:");
    List<Option> options = new ArrayList<Option>();
    Option inputOpt = buildOption("input", "i", "The input path", true, true, null);
    options.add(inputOpt);/* www. j ava 2s.co  m*/
    Option outOpt = buildOption("output", "o", "The output path", true, true, null);
    options.add(outOpt);
    Option tikaOpt = buildOption("tikaProcessor", "t",
            "The fully qualified name of a TikaProcessor class that handles the extraction (optional)", true,
            false, null);
    options.add(tikaOpt);
    Option mimeTypeOpt = buildOption("mimeType", "m", "The mime type to use (optional)", true, false, "");
    options.add(mimeTypeOpt);
    for (Option opt : options) {
        gBuilder = gBuilder.withOption(opt);
    }

    Group group = gBuilder.create();

    try {
        Parser parser = new Parser();
        parser.setGroup(group);
        // TODO catch exceptions with parsing of opts
        CommandLine cmdLine = parser.parse(args);
        Path inputPath = new Path(cmdLine.getValue(inputOpt).toString());
        Path outputPath = new Path(cmdLine.getValue(outOpt).toString());
        String handlerName = null;
        if (cmdLine.hasOption(tikaOpt)) {
            handlerName = cmdLine.getValue(tikaOpt).toString();
        }

        JobConf job = new JobConf(getConf());
        job.setJarByClass(this.getClass());

        if (cmdLine.hasOption(mimeTypeOpt)) {
            String mimeType = cmdLine.getValue(mimeTypeOpt).toString();
            job.set(TikaConstants.TIKA_MIME_TYPE_KEY, mimeType);
        }

        if (handlerName != null && handlerName.equals("") == false) {
            job.set(TIKA_PROCESSOR_KEY, handlerName);
        }

        job.setJobName("Tika : " + inputPath.toString());

        job.setInputFormat(SequenceFileInputFormat.class);
        job.setOutputFormat(SequenceFileOutputFormat.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(BehemothDocument.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(BehemothDocument.class);

        job.setMapperClass(TikaMapper.class);

        boolean isFilterRequired = BehemothReducer.isRequired(job);
        if (isFilterRequired)
            job.setReducerClass(BehemothReducer.class);
        else {
            job.setNumReduceTasks(0);
        }

        FileInputFormat.addInputPath(job, inputPath);
        FileOutputFormat.setOutputPath(job, outputPath);

        try {
            long start = System.currentTimeMillis();
            JobClient.runJob(job);
            long finish = System.currentTimeMillis();
            if (log.isInfoEnabled()) {
                log.info("TikaDriver completed. Timing: " + (finish - start) + " ms");
            }
        } catch (Exception e) {
            log.error("Exception", e);
            return -1;
            // don't delete the output as some of it could be used
            // fs.delete(outputPath, true);
        } finally {
        }

    } catch (OptionException e) {
        log.error("OptionException", e.getMessage());
        HelpFormatter formatter = new HelpFormatter();
        formatter.setGroup(group);
        formatter.print();
        return -1;
    }

    return 0;
}

From source file:com.digitalpebble.behemoth.uima.UIMADriver.java

License:Apache License

public int run(String[] args) throws Exception {

    final FileSystem fs = FileSystem.get(getConf());

    if (args.length != 3) {
        String syntax = "com.digitalpebble.behemoth.uima.UIMADriver in out path_pear_file";
        System.err.println(syntax);
        return -1;
    }/*from  ww  w . j ava 2  s . com*/

    Path inputPath = new Path(args[0]);
    Path outputPath = new Path(args[1]);
    String pearPath = args[2];

    // check that the GATE application has been stored on HDFS
    Path zap = new Path(pearPath);
    if (fs.exists(zap) == false) {
        System.err.println("The UIMA application " + pearPath + "can't be found on HDFS - aborting");
        return -1;
    }

    JobConf job = new JobConf(getConf());
    job.setJarByClass(this.getClass());
    job.setJobName("Processing with UIMA application : " + pearPath);

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(BehemothDocument.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(BehemothDocument.class);

    job.setMapperClass(UIMAMapper.class);

    job.setNumReduceTasks(0);

    FileInputFormat.addInputPath(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    // push the UIMA pear onto the DistributedCache
    DistributedCache.addCacheFile(new URI(pearPath), job);

    job.set("uima.pear.path", pearPath);

    try {
        long start = System.currentTimeMillis();
        JobClient.runJob(job);
        long finish = System.currentTimeMillis();
        if (LOG.isInfoEnabled()) {
            LOG.info("UIMADriver completed. Timing: " + (finish - start) + " ms");
        }
    } catch (Exception e) {
        LOG.error("Exception", e);
        fs.delete(outputPath, true);
    } finally {
    }

    return 0;
}

From source file:com.digitalpebble.behemoth.util.CorpusFilter.java

License:Apache License

public int run(String[] args) throws Exception {

    Options options = new Options();
    // automatically generate the help statement
    HelpFormatter formatter = new HelpFormatter();
    // create the parser
    CommandLineParser parser = new GnuParser();

    options.addOption("h", "help", false, "print this message");
    options.addOption("i", "input", true, "input Behemoth corpus");
    options.addOption("o", "output", true, "output Behemoth corpus");

    // parse the command line arguments
    CommandLine line = null;/*  ww  w. j  a  v  a2s .c  o m*/
    try {
        line = parser.parse(options, args);
        String input = line.getOptionValue("i");
        String output = line.getOptionValue("o");
        if (line.hasOption("help")) {
            formatter.printHelp("CorpusFilter", options);
            return 0;
        }
        if (input == null | output == null) {
            formatter.printHelp("CorpusFilter", options);
            return -1;
        }
    } catch (ParseException e) {
        formatter.printHelp("CorpusFilter", options);
    }

    final FileSystem fs = FileSystem.get(getConf());

    Path inputPath = new Path(line.getOptionValue("i"));
    Path outputPath = new Path(line.getOptionValue("o"));

    JobConf job = new JobConf(getConf());
    job.setJarByClass(this.getClass());

    job.setJobName("CorpusFilter : " + inputPath.toString());

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(BehemothDocument.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(BehemothDocument.class);

    boolean isFilterRequired = BehemothMapper.isRequired(job);
    // should be the case here
    if (!isFilterRequired) {
        System.err.println("No filters configured. Check your behemoth-site.xml");
        return -1;
    }
    job.setMapperClass(BehemothMapper.class);
    job.setNumReduceTasks(0);

    FileInputFormat.addInputPath(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    try {
        JobClient.runJob(job);
    } catch (Exception e) {
        e.printStackTrace();
        fs.delete(outputPath, true);
    } finally {
    }

    return 0;
}

From source file:com.dynamicalsoftware.feed.mapreduce.AggregatePerformanceData.java

License:Apache License

public static void main(String[] args) throws Exception {
    if (args.length >= 2) {
        JobConf conf = new JobConf(AggregatePerformanceData.class);
        conf.setJobName("aggregate news feed performance data");
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(Text.class);
        conf.setMapperClass(AggregatePerformanceData.Map.class);
        conf.setReducerClass(AggregatePerformanceData.Reduce.class);
        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);
        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));
        JobClient.runJob(conf);/*from   w  w w.jav a  2  s. co m*/
    } else {
        System.err.println("\nusage: AggregatePerformanceData input_directory output_directory\n");
    }
}

From source file:com.dynamicalsoftware.hadoop.mapreduce.SanFranciscoCrime.java

License:Apache License

private static void generate(String name, Class mapper, String input, String output) throws IOException {
    JobConf conf = new JobConf(SanFranciscoCrime.class);
    conf.setJobName(name);/*from  w  w  w  .j a  v a  2s.c  om*/
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapperClass(mapper);
    conf.setReducerClass(ReduceByWeek.class);
    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);
    FileInputFormat.setInputPaths(conf, new Path(input));
    FileOutputFormat.setOutputPath(conf, new Path(output));
    JobClient.runJob(conf);
}

From source file:com.dynamicalsoftware.hadoop.mapreduce.SanFranciscoCrimePrepOlap.java

License:Apache License

/**
 * sets up and runs the hadoop map/reduce job itself
 * @param name contains the name of the job itself
 * @param mapper identified which mapper class to use
 * @param input is the fully qualified path to the raw crime data
 * @param output is the fully qualified path to where the generated data should reside
 * @throws IOException/*  w ww. j  a v a 2s . c om*/
 */
private static void generate(String name, Class mapper, String input, String output) throws IOException {
    JobConf conf = new JobConf(SanFranciscoCrimePrepOlap.class);
    conf.setJobName(name);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapperClass(mapper);
    conf.setReducerClass(Reduce.class);
    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);
    FileInputFormat.setInputPaths(conf, new Path(input));
    FileOutputFormat.setOutputPath(conf, new Path(output));
    JobClient.runJob(conf);
}

From source file:com.example.hadoop.mapreduce.test.MapReduceTest.java

License:Open Source License

public static void main(String[] args) throws IOException {
    String input = HDFS_PATH + "/input/README.txt";
    String input2 = HDFS_PATH + "/input/README2.txt";
    String output = HDFS_PATH + "/test/output";

    // ?mapreduce???
    if (HdfsClient.exists(output)) {
        HdfsClient.rm(output);/*from  ww w .jav a  2 s  . co  m*/
    }

    JobConf conf = new JobConf(MapReduceTest.class);
    conf.setJobName("MapReduceTest");
    conf.addResource("classpath:/hadoop/core-site.xml");
    conf.addResource("classpath:/hadoop/hdfs-site.xml");
    conf.addResource("classpath:/hadoop/mapred-site.xml");

    // mapper
    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(IntWritable.class);

    // reducer
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    // mapper
    conf.setMapperClass(MapperTest.class);
    // combiner?????mapper??reducer?
    conf.setCombinerClass(ReducerTest.class);
    // reducer
    conf.setReducerClass(ReducerTest.class);

    // MapReduce?
    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    // MapReduce?
    FileInputFormat.setInputPaths(conf, new Path[] { new Path(input), new Path(input2) });
    // MapReduce?
    FileOutputFormat.setOutputPath(conf, new Path(output));

    try {
        JobClient.runJob(conf);
    } catch (IOException e) {
        e.printStackTrace();
    }
}

From source file:com.facebook.LinkBench.LinkBenchDriverMR.java

License:Apache License

/**
 * create JobConf for map reduce job//from  w  w  w  .ja va  2  s . c  o m
 * @param currentphase LOAD or REQUEST
 * @param nmappers number of mappers (loader or requester)
 */
private JobConf createJobConf(int currentphase, int nmappers) {
    final JobConf jobconf = new JobConf(getConf(), getClass());
    jobconf.setJobName("LinkBench MapReduce Driver");

    if (USE_INPUT_FILES) {
        jobconf.setInputFormat(SequenceFileInputFormat.class);
    } else {
        jobconf.setInputFormat(LinkBenchInputFormat.class);
    }
    jobconf.setOutputKeyClass(IntWritable.class);
    jobconf.setOutputValueClass(LongWritable.class);
    jobconf.setOutputFormat(SequenceFileOutputFormat.class);
    if (currentphase == LOAD) {
        jobconf.setMapperClass(LoadMapper.class);
    } else { //REQUEST
        jobconf.setMapperClass(RequestMapper.class);
    }
    jobconf.setNumMapTasks(nmappers);
    jobconf.setReducerClass(LoadRequestReducer.class);
    jobconf.setNumReduceTasks(1);

    // turn off speculative execution, because DFS doesn't handle
    // multiple writers to the same file.
    jobconf.setSpeculativeExecution(false);

    return jobconf;
}