List of usage examples for org.apache.hadoop.mapred JobConf setMapperClass
public void setMapperClass(Class<? extends Mapper> theClass)
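The examples below all follow the same pattern: build a JobConf, register the input/output formats and key/value classes, and pass the mapper implementation to setMapperClass before submitting the job with JobClient.runJob. As a minimal, self-contained sketch of that pattern (not taken from any of the projects listed here; the WordCountMapper class and the argument paths are assumptions for illustration only), a map-only job can be wired up like this:

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.*;

public class WordCountJob {

    // Hypothetical mapper, used only to illustrate setMapperClass;
    // not taken from any of the examples below.
    public static class WordCountMapper extends MapReduceBase
            implements Mapper<LongWritable, Text, Text, IntWritable> {
        private static final IntWritable ONE = new IntWritable(1);
        private final Text word = new Text();

        public void map(LongWritable key, Text value,
                OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
            StringTokenizer tok = new StringTokenizer(value.toString());
            while (tok.hasMoreTokens()) {
                word.set(tok.nextToken());
                output.collect(word, ONE);
            }
        }
    }

    public static void main(String[] args) throws IOException {
        JobConf conf = new JobConf(WordCountJob.class);
        conf.setJobName("wordcount");

        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(IntWritable.class);

        conf.setMapperClass(WordCountMapper.class); // the method documented on this page
        conf.setNumReduceTasks(0);                  // map-only, for brevity

        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        JobClient.runJob(conf);
    }
}

Run with an input directory and a non-existent output directory; since setNumReduceTasks(0) makes the job map-only, the mapper output is written straight to the output path.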
From source file:com.digitalpebble.behemoth.solr.LucidWorksIndexerJob.java
License:Apache License
public int run(String[] args) throws Exception {
    final FileSystem fs = FileSystem.get(getConf());

    if (args.length != 2) {
        String syntax = "com.digitalpebble.solr.LucidWorksIndexerJob in solrURL";
        System.err.println(syntax);
        return -1;
    }

    Path inputPath = new Path(args[0]);
    String solrURL = args[1];

    JobConf job = new JobConf(getConf());
    job.setJarByClass(this.getClass());
    job.setJobName("Indexing " + inputPath + " into LucidWorks");

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setOutputFormat(LucidWorksOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(BehemothDocument.class);

    job.setMapperClass(IdentityMapper.class);
    // no reducer : send straight to SOLR at end of mapping
    job.setNumReduceTasks(0);

    FileInputFormat.addInputPath(job, inputPath);
    final Path tmp = new Path("tmp_" + System.currentTimeMillis() + "-" + new Random().nextInt());
    FileOutputFormat.setOutputPath(job, tmp);

    job.set("solr.server.url", solrURL);

    try {
        long start = System.currentTimeMillis();
        JobClient.runJob(job);
        long finish = System.currentTimeMillis();
        if (LOG.isInfoEnabled()) {
            LOG.info("LucidWorksIndexerJob completed. Time " + (finish - start) + " ms");
        }
    } catch (Exception e) {
        LOG.error(e);
    } finally {
        fs.delete(tmp, true);
    }

    return 0;
}
From source file:com.digitalpebble.behemoth.solr.SOLRIndexerJob.java
License:Apache License
public int run(String[] args) throws Exception {
    final FileSystem fs = FileSystem.get(getConf());

    if (args.length != 2) {
        String syntax = "com.digitalpebble.solr.SOLRIndexerJob in solrURL";
        System.err.println(syntax);
        return -1;
    }

    Path inputPath = new Path(args[0]);
    String solrURL = args[1];

    JobConf job = new JobConf(getConf());
    job.setJarByClass(this.getClass());
    job.setJobName("Indexing " + inputPath + " into SOLR");

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setOutputFormat(SOLROutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(BehemothDocument.class);

    job.setMapperClass(IdentityMapper.class);
    // no reducer : send straight to SOLR at end of mapping
    job.setNumReduceTasks(0);

    FileInputFormat.addInputPath(job, inputPath);
    final Path tmp = new Path("tmp_" + System.currentTimeMillis() + "-" + new Random().nextInt());
    FileOutputFormat.setOutputPath(job, tmp);

    job.set("solr.server.url", solrURL);

    try {
        long start = System.currentTimeMillis();
        JobClient.runJob(job);
        long finish = System.currentTimeMillis();
        if (LOG.isInfoEnabled()) {
            LOG.info("SOLRIndexerJob completed. Timing: " + (finish - start) + " ms");
        }
    } catch (Exception e) {
        LOG.error(e);
    } finally {
        fs.delete(tmp, true);
    }

    return 0;
}
From source file:com.digitalpebble.behemoth.tika.TikaDriver.java
License:Apache License
public int run(String[] args) throws Exception {
    final FileSystem fs = FileSystem.get(getConf());

    GroupBuilder gBuilder = new GroupBuilder().withName("Options:");
    List<Option> options = new ArrayList<Option>();
    Option inputOpt = buildOption("input", "i", "The input path", true, true, null);
    options.add(inputOpt);
    Option outOpt = buildOption("output", "o", "The output path", true, true, null);
    options.add(outOpt);
    Option tikaOpt = buildOption("tikaProcessor", "t",
            "The fully qualified name of a TikaProcessor class that handles the extraction (optional)",
            true, false, null);
    options.add(tikaOpt);
    Option mimeTypeOpt = buildOption("mimeType", "m", "The mime type to use (optional)", true, false, "");
    options.add(mimeTypeOpt);

    for (Option opt : options) {
        gBuilder = gBuilder.withOption(opt);
    }

    Group group = gBuilder.create();

    try {
        Parser parser = new Parser();
        parser.setGroup(group);
        // TODO catch exceptions with parsing of opts
        CommandLine cmdLine = parser.parse(args);
        Path inputPath = new Path(cmdLine.getValue(inputOpt).toString());
        Path outputPath = new Path(cmdLine.getValue(outOpt).toString());

        String handlerName = null;
        if (cmdLine.hasOption(tikaOpt)) {
            handlerName = cmdLine.getValue(tikaOpt).toString();
        }

        JobConf job = new JobConf(getConf());
        job.setJarByClass(this.getClass());

        if (cmdLine.hasOption(mimeTypeOpt)) {
            String mimeType = cmdLine.getValue(mimeTypeOpt).toString();
            job.set(TikaConstants.TIKA_MIME_TYPE_KEY, mimeType);
        }

        if (handlerName != null && handlerName.equals("") == false) {
            job.set(TIKA_PROCESSOR_KEY, handlerName);
        }

        job.setJobName("Tika : " + inputPath.toString());

        job.setInputFormat(SequenceFileInputFormat.class);
        job.setOutputFormat(SequenceFileOutputFormat.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(BehemothDocument.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(BehemothDocument.class);

        job.setMapperClass(TikaMapper.class);

        boolean isFilterRequired = BehemothReducer.isRequired(job);
        if (isFilterRequired)
            job.setReducerClass(BehemothReducer.class);
        else {
            job.setNumReduceTasks(0);
        }

        FileInputFormat.addInputPath(job, inputPath);
        FileOutputFormat.setOutputPath(job, outputPath);

        try {
            long start = System.currentTimeMillis();
            JobClient.runJob(job);
            long finish = System.currentTimeMillis();
            if (log.isInfoEnabled()) {
                log.info("TikaDriver completed. Timing: " + (finish - start) + " ms");
            }
        } catch (Exception e) {
            log.error("Exception", e);
            return -1;
            // don't delete the output as some of it could be used
            // fs.delete(outputPath, true);
        } finally {
        }
    } catch (OptionException e) {
        log.error("OptionException", e.getMessage());
        HelpFormatter formatter = new HelpFormatter();
        formatter.setGroup(group);
        formatter.print();
        return -1;
    }

    return 0;
}
From source file:com.digitalpebble.behemoth.uima.UIMADriver.java
License:Apache License
public int run(String[] args) throws Exception {
    final FileSystem fs = FileSystem.get(getConf());

    if (args.length != 3) {
        String syntax = "com.digitalpebble.behemoth.uima.UIMADriver in out path_pear_file";
        System.err.println(syntax);
        return -1;
    }

    Path inputPath = new Path(args[0]);
    Path outputPath = new Path(args[1]);
    String pearPath = args[2];

    // check that the UIMA pear file has been stored on HDFS
    Path zap = new Path(pearPath);
    if (fs.exists(zap) == false) {
        System.err.println("The UIMA application " + pearPath + " can't be found on HDFS - aborting");
        return -1;
    }

    JobConf job = new JobConf(getConf());
    job.setJarByClass(this.getClass());
    job.setJobName("Processing with UIMA application : " + pearPath);

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(BehemothDocument.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(BehemothDocument.class);

    job.setMapperClass(UIMAMapper.class);
    job.setNumReduceTasks(0);

    FileInputFormat.addInputPath(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    // push the UIMA pear onto the DistributedCache
    DistributedCache.addCacheFile(new URI(pearPath), job);
    job.set("uima.pear.path", pearPath);

    try {
        long start = System.currentTimeMillis();
        JobClient.runJob(job);
        long finish = System.currentTimeMillis();
        if (LOG.isInfoEnabled()) {
            LOG.info("UIMADriver completed. Timing: " + (finish - start) + " ms");
        }
    } catch (Exception e) {
        LOG.error("Exception", e);
        fs.delete(outputPath, true);
    } finally {
    }

    return 0;
}
From source file:com.digitalpebble.behemoth.util.CorpusFilter.java
License:Apache License
public int run(String[] args) throws Exception {
    Options options = new Options();
    // automatically generate the help statement
    HelpFormatter formatter = new HelpFormatter();
    // create the parser
    CommandLineParser parser = new GnuParser();

    options.addOption("h", "help", false, "print this message");
    options.addOption("i", "input", true, "input Behemoth corpus");
    options.addOption("o", "output", true, "output Behemoth corpus");

    // parse the command line arguments
    CommandLine line = null;
    try {
        line = parser.parse(options, args);
        String input = line.getOptionValue("i");
        String output = line.getOptionValue("o");
        if (line.hasOption("help")) {
            formatter.printHelp("CorpusFilter", options);
            return 0;
        }
        if (input == null || output == null) {
            formatter.printHelp("CorpusFilter", options);
            return -1;
        }
    } catch (ParseException e) {
        formatter.printHelp("CorpusFilter", options);
        // bail out here, otherwise 'line' is still null when it is dereferenced below
        return -1;
    }

    final FileSystem fs = FileSystem.get(getConf());

    Path inputPath = new Path(line.getOptionValue("i"));
    Path outputPath = new Path(line.getOptionValue("o"));

    JobConf job = new JobConf(getConf());
    job.setJarByClass(this.getClass());
    job.setJobName("CorpusFilter : " + inputPath.toString());

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(BehemothDocument.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(BehemothDocument.class);

    boolean isFilterRequired = BehemothMapper.isRequired(job);
    // should be the case here
    if (!isFilterRequired) {
        System.err.println("No filters configured. Check your behemoth-site.xml");
        return -1;
    }
    job.setMapperClass(BehemothMapper.class);
    job.setNumReduceTasks(0);

    FileInputFormat.addInputPath(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    try {
        JobClient.runJob(job);
    } catch (Exception e) {
        e.printStackTrace();
        fs.delete(outputPath, true);
    } finally {
    }

    return 0;
}
From source file:com.dynamicalsoftware.feed.mapreduce.AggregatePerformanceData.java
License:Apache License
public static void main(String[] args) throws Exception {
    if (args.length >= 2) {
        JobConf conf = new JobConf(AggregatePerformanceData.class);
        conf.setJobName("aggregate news feed performance data");

        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(Text.class);

        conf.setMapperClass(AggregatePerformanceData.Map.class);
        conf.setReducerClass(AggregatePerformanceData.Reduce.class);

        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);

        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        JobClient.runJob(conf);
    } else {
        System.err.println("\nusage: AggregatePerformanceData input_directory output_directory\n");
    }
}
From source file:com.dynamicalsoftware.hadoop.mapreduce.SanFranciscoCrime.java
License:Apache License
private static void generate(String name, Class mapper, String input, String output) throws IOException {
    JobConf conf = new JobConf(SanFranciscoCrime.class);
    conf.setJobName(name);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(mapper);
    conf.setReducerClass(ReduceByWeek.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(input));
    FileOutputFormat.setOutputPath(conf, new Path(output));

    JobClient.runJob(conf);
}
From source file:com.dynamicalsoftware.hadoop.mapreduce.SanFranciscoCrimePrepOlap.java
License:Apache License
/**
 * Sets up and runs the hadoop map/reduce job itself.
 *
 * @param name contains the name of the job itself
 * @param mapper identifies which mapper class to use
 * @param input is the fully qualified path to the raw crime data
 * @param output is the fully qualified path to where the generated data should reside
 * @throws IOException
 */
private static void generate(String name, Class mapper, String input, String output) throws IOException {
    JobConf conf = new JobConf(SanFranciscoCrimePrepOlap.class);
    conf.setJobName(name);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(mapper);
    conf.setReducerClass(Reduce.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(input));
    FileOutputFormat.setOutputPath(conf, new Path(output));

    JobClient.runJob(conf);
}
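Both San Francisco crime helpers above pass the mapper through a raw Class parameter, relying on setMapperClass accepting it unchecked. A possible variant (a sketch of my own, not code from those projects; the class name CrimeJobHelperSketch and the extra reducer parameter are assumptions) bounds the parameters so the compiler rejects anything that is not a Mapper or Reducer implementation:

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.*;

public class CrimeJobHelperSketch {

    // Same configure-and-run pattern as generate(...) above, but with the mapper
    // and reducer parameters bounded, so a non-Mapper / non-Reducer class is
    // rejected at compile time instead of being passed through a raw Class.
    static void generate(String name, Class<? extends Mapper> mapper,
            Class<? extends Reducer> reducer, String input, String output) throws IOException {
        JobConf conf = new JobConf(CrimeJobHelperSketch.class);
        conf.setJobName(name);

        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(Text.class);

        conf.setMapperClass(mapper);
        conf.setReducerClass(reducer);

        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);

        FileInputFormat.setInputPaths(conf, new Path(input));
        FileOutputFormat.setOutputPath(conf, new Path(output));

        JobClient.runJob(conf);
    }
}

Call sites stay the same apart from passing the reducer class explicitly instead of hard-coding it in the helper.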
From source file:com.example.hadoop.mapreduce.test.MapReduceTest.java
License:Open Source License
public static void main(String[] args) throws IOException {
    String input = HDFS_PATH + "/input/README.txt";
    String input2 = HDFS_PATH + "/input/README2.txt";
    String output = HDFS_PATH + "/test/output";

    // remove any existing output directory before running the MapReduce job
    if (HdfsClient.exists(output)) {
        HdfsClient.rm(output);
    }

    JobConf conf = new JobConf(MapReduceTest.class);
    conf.setJobName("MapReduceTest");
    conf.addResource("classpath:/hadoop/core-site.xml");
    conf.addResource("classpath:/hadoop/hdfs-site.xml");
    conf.addResource("classpath:/hadoop/mapred-site.xml");

    // mapper output key/value types
    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(IntWritable.class);

    // job (reducer) output key/value types
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    // mapper class
    conf.setMapperClass(MapperTest.class);
    // combiner: runs locally on mapper output before the reducer; here it reuses the reducer
    conf.setCombinerClass(ReducerTest.class);
    // reducer class
    conf.setReducerClass(ReducerTest.class);

    // MapReduce input/output formats
    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    // MapReduce input paths
    FileInputFormat.setInputPaths(conf, new Path[] { new Path(input), new Path(input2) });
    // MapReduce output path
    FileOutputFormat.setOutputPath(conf, new Path(output));

    try {
        JobClient.runJob(conf);
    } catch (IOException e) {
        e.printStackTrace();
    }
}
From source file:com.facebook.LinkBench.LinkBenchDriverMR.java
License:Apache License
/**
 * create JobConf for map reduce job
 *
 * @param currentphase LOAD or REQUEST
 * @param nmappers number of mappers (loader or requester)
 */
private JobConf createJobConf(int currentphase, int nmappers) {
    final JobConf jobconf = new JobConf(getConf(), getClass());
    jobconf.setJobName("LinkBench MapReduce Driver");

    if (USE_INPUT_FILES) {
        jobconf.setInputFormat(SequenceFileInputFormat.class);
    } else {
        jobconf.setInputFormat(LinkBenchInputFormat.class);
    }
    jobconf.setOutputKeyClass(IntWritable.class);
    jobconf.setOutputValueClass(LongWritable.class);
    jobconf.setOutputFormat(SequenceFileOutputFormat.class);

    if (currentphase == LOAD) {
        jobconf.setMapperClass(LoadMapper.class);
    } else { // REQUEST
        jobconf.setMapperClass(RequestMapper.class);
    }
    jobconf.setNumMapTasks(nmappers);
    jobconf.setReducerClass(LoadRequestReducer.class);
    jobconf.setNumReduceTasks(1);

    // turn off speculative execution, because DFS doesn't handle
    // multiple writers to the same file.
    jobconf.setSpeculativeExecution(false);

    return jobconf;
}