Example usage for org.apache.hadoop.mapred JobConf setJarByClass

Introduction

This page collects example usages of the setJarByClass method of org.apache.hadoop.mapred.JobConf.

Prototype

public void setJarByClass(Class cls)

Source Link

Document

Set the job's jar file by finding an example class location.

Usage

From source file:com.digitalpebble.behemoth.solr.LucidWorksIndexerJob.java

License:Apache License

/**
 * Configures and runs a map-only job that reads its input with
 * SequenceFileInputFormat and writes through LucidWorksOutputFormat.
 *
 * @param args exactly two values: the input path and the SOLR URL
 * @return 0 when the job ran to completion, -1 on a usage error or when the
 *         job failed
 * @throws Exception if the file system cannot be obtained or the temporary
 *                   output directory cannot be deleted
 */
public int run(String[] args) throws Exception {

    final FileSystem fs = FileSystem.get(getConf());

    if (args.length != 2) {
        String syntax = "com.digitalpebble.behemoth.solr.LucidWorksIndexerJob in solrURL";
        System.err.println(syntax);
        return -1;
    }

    Path inputPath = new Path(args[0]);
    String solrURL = args[1];

    JobConf job = new JobConf(getConf());

    job.setJarByClass(this.getClass());

    job.setJobName("Indexing " + inputPath + " into LucidWorks");

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setOutputFormat(LucidWorksOutputFormat.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(BehemothDocument.class);

    job.setMapperClass(IdentityMapper.class);
    // no reducer : send straight to SOLR at end of mapping
    job.setNumReduceTasks(0);

    FileInputFormat.addInputPath(job, inputPath);
    // throw-away output directory, removed again in the finally block below
    final Path tmp = new Path("tmp_" + System.currentTimeMillis() + "-" + new Random().nextInt());
    FileOutputFormat.setOutputPath(job, tmp);

    job.set("solr.server.url", solrURL);

    int exitCode = 0;
    try {
        long start = System.currentTimeMillis();
        JobClient.runJob(job);
        long finish = System.currentTimeMillis();
        if (LOG.isInfoEnabled()) {
            LOG.info("LucidWorksIndexerJob completed. Time " + (finish - start) + " ms");
        }
    } catch (Exception e) {
        // keep the stack trace and report the failure to the caller instead
        // of pretending the job succeeded
        LOG.error("LucidWorksIndexerJob failed", e);
        exitCode = -1;
    } finally {
        fs.delete(tmp, true);
    }

    return exitCode;
}

From source file:com.digitalpebble.behemoth.solr.SOLRIndexerJob.java

License:Apache License

/**
 * Configures and runs a map-only job that reads its input with
 * SequenceFileInputFormat and writes through SOLROutputFormat.
 *
 * @param args exactly two values: the input path and the SOLR URL
 * @return 0 when the job ran to completion, -1 on a usage error or when the
 *         job failed
 * @throws Exception if the file system cannot be obtained or the temporary
 *                   output directory cannot be deleted
 */
public int run(String[] args) throws Exception {

    final FileSystem fs = FileSystem.get(getConf());

    if (args.length != 2) {
        String syntax = "com.digitalpebble.behemoth.solr.SOLRIndexerJob in solrURL";
        System.err.println(syntax);
        return -1;
    }

    Path inputPath = new Path(args[0]);
    String solrURL = args[1];

    JobConf job = new JobConf(getConf());

    job.setJarByClass(this.getClass());

    job.setJobName("Indexing " + inputPath + " into SOLR");

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setOutputFormat(SOLROutputFormat.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(BehemothDocument.class);

    job.setMapperClass(IdentityMapper.class);
    // no reducer : send straight to SOLR at end of mapping
    job.setNumReduceTasks(0);

    FileInputFormat.addInputPath(job, inputPath);
    // throw-away output directory, removed again in the finally block below
    final Path tmp = new Path("tmp_" + System.currentTimeMillis() + "-" + new Random().nextInt());
    FileOutputFormat.setOutputPath(job, tmp);

    job.set("solr.server.url", solrURL);

    int exitCode = 0;
    try {
        long start = System.currentTimeMillis();
        JobClient.runJob(job);
        long finish = System.currentTimeMillis();
        if (LOG.isInfoEnabled()) {
            LOG.info("SOLRIndexerJob completed. Timing: " + (finish - start) + " ms");
        }
    } catch (Exception e) {
        // keep the stack trace and report the failure to the caller instead
        // of pretending the job succeeded
        LOG.error("SOLRIndexerJob failed", e);
        exitCode = -1;
    } finally {
        fs.delete(tmp, true);
    }

    return exitCode;
}

From source file:com.digitalpebble.behemoth.tika.TikaDriver.java

License:Apache License

/**
 * Parses the command-line options, then configures and runs the Tika job
 * over a sequence file of Behemoth documents.
 * <p>
 * Recognised options: {@code input}/{@code i}, {@code output}/{@code o},
 * {@code tikaProcessor}/{@code t} and {@code mimeType}/{@code m}.
 *
 * @param args the raw command-line arguments
 * @return 0 when the job ran to completion, -1 when the options could not
 *         be parsed or the job threw an exception
 * @throws Exception if the file system cannot be obtained, or for any
 *                   failure during job setup other than an OptionException
 */
public int run(String[] args) throws Exception {

    // NOTE(review): fs is not used below; kept so that a broken file system
    // configuration still fails early, as before - confirm whether needed
    final FileSystem fs = FileSystem.get(getConf());
    GroupBuilder gBuilder = new GroupBuilder().withName("Options:");
    List<Option> options = new ArrayList<Option>();
    Option inputOpt = buildOption("input", "i", "The input path", true, true, null);
    options.add(inputOpt);
    Option outOpt = buildOption("output", "o", "The output path", true, true, null);
    options.add(outOpt);
    Option tikaOpt = buildOption("tikaProcessor", "t",
            "The fully qualified name of a TikaProcessor class that handles the extraction (optional)", true,
            false, null);
    options.add(tikaOpt);
    Option mimeTypeOpt = buildOption("mimeType", "m", "The mime type to use (optional)", true, false, "");
    options.add(mimeTypeOpt);
    for (Option opt : options) {
        gBuilder = gBuilder.withOption(opt);
    }

    Group group = gBuilder.create();

    try {
        Parser parser = new Parser();
        parser.setGroup(group);
        // TODO catch exceptions with parsing of opts
        CommandLine cmdLine = parser.parse(args);
        Path inputPath = new Path(cmdLine.getValue(inputOpt).toString());
        Path outputPath = new Path(cmdLine.getValue(outOpt).toString());
        String handlerName = null;
        if (cmdLine.hasOption(tikaOpt)) {
            handlerName = cmdLine.getValue(tikaOpt).toString();
        }

        JobConf job = new JobConf(getConf());
        job.setJarByClass(this.getClass());

        if (cmdLine.hasOption(mimeTypeOpt)) {
            String mimeType = cmdLine.getValue(mimeTypeOpt).toString();
            job.set(TikaConstants.TIKA_MIME_TYPE_KEY, mimeType);
        }

        if (handlerName != null && !handlerName.isEmpty()) {
            job.set(TIKA_PROCESSOR_KEY, handlerName);
        }

        job.setJobName("Tika : " + inputPath.toString());

        job.setInputFormat(SequenceFileInputFormat.class);
        job.setOutputFormat(SequenceFileOutputFormat.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(BehemothDocument.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(BehemothDocument.class);

        job.setMapperClass(TikaMapper.class);

        // a reducer is only plugged in when BehemothReducer reports that it
        // is required for this configuration; otherwise the job is map-only
        boolean isFilterRequired = BehemothReducer.isRequired(job);
        if (isFilterRequired) {
            job.setReducerClass(BehemothReducer.class);
        } else {
            job.setNumReduceTasks(0);
        }

        FileInputFormat.addInputPath(job, inputPath);
        FileOutputFormat.setOutputPath(job, outputPath);

        try {
            long start = System.currentTimeMillis();
            JobClient.runJob(job);
            long finish = System.currentTimeMillis();
            if (log.isInfoEnabled()) {
                log.info("TikaDriver completed. Timing: " + (finish - start) + " ms");
            }
        } catch (Exception e) {
            log.error("Exception", e);
            // don't delete the output as some of it could be used
            return -1;
        }

    } catch (OptionException e) {
        // pass the exception itself: a message passed as a second argument
        // to a format string without placeholders would be dropped
        log.error("OptionException", e);
        HelpFormatter formatter = new HelpFormatter();
        formatter.setGroup(group);
        formatter.print();
        return -1;
    }

    return 0;
}

From source file:com.digitalpebble.behemoth.uima.UIMADriver.java

License:Apache License

/**
 * Configures and runs a map-only job that processes a sequence file of
 * Behemoth documents with UIMAMapper, using the pear file given on the
 * command line.
 *
 * @param args exactly three values: the input path, the output path and the
 *             path of the UIMA pear file
 * @return 0 when the job ran to completion, -1 on a usage error, when the
 *         pear file does not exist on the file system, or when the job failed
 * @throws Exception if the file system cannot be accessed or the pear path
 *                   is not a valid URI
 */
public int run(String[] args) throws Exception {

    final FileSystem fs = FileSystem.get(getConf());

    if (args.length != 3) {
        String syntax = "com.digitalpebble.behemoth.uima.UIMADriver in out path_pear_file";
        System.err.println(syntax);
        return -1;
    }

    Path inputPath = new Path(args[0]);
    Path outputPath = new Path(args[1]);
    String pearPath = args[2];

    // check that the UIMA application has been stored on HDFS
    Path zap = new Path(pearPath);
    if (!fs.exists(zap)) {
        System.err.println("The UIMA application " + pearPath + " can't be found on HDFS - aborting");
        return -1;
    }

    JobConf job = new JobConf(getConf());
    job.setJarByClass(this.getClass());
    job.setJobName("Processing with UIMA application : " + pearPath);

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(BehemothDocument.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(BehemothDocument.class);

    job.setMapperClass(UIMAMapper.class);

    job.setNumReduceTasks(0);

    FileInputFormat.addInputPath(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    // push the UIMA pear onto the DistributedCache
    DistributedCache.addCacheFile(new URI(pearPath), job);

    job.set("uima.pear.path", pearPath);

    try {
        long start = System.currentTimeMillis();
        JobClient.runJob(job);
        long finish = System.currentTimeMillis();
        if (LOG.isInfoEnabled()) {
            LOG.info("UIMADriver completed. Timing: " + (finish - start) + " ms");
        }
    } catch (Exception e) {
        LOG.error("Exception", e);
        // discard whatever the failed job wrote, then report the failure
        fs.delete(outputPath, true);
        return -1;
    }

    return 0;
}

From source file:com.digitalpebble.behemoth.util.CorpusFilter.java

License:Apache License

/**
 * Parses the command line, then configures and runs a map-only job that
 * passes a Behemoth corpus through BehemothMapper.
 * <p>
 * Options: {@code -h}/{@code --help}, {@code -i}/{@code --input} and
 * {@code -o}/{@code --output}.
 *
 * @param args the raw command-line arguments
 * @return 0 when help was requested or the job ran to completion; -1 when
 *         the arguments are invalid, no filter is configured, or the job
 *         failed
 * @throws Exception if the file system cannot be accessed
 */
public int run(String[] args) throws Exception {

    Options options = new Options();
    // automatically generate the help statement
    HelpFormatter formatter = new HelpFormatter();
    // create the parser
    CommandLineParser parser = new GnuParser();

    options.addOption("h", "help", false, "print this message");
    options.addOption("i", "input", true, "input Behemoth corpus");
    options.addOption("o", "output", true, "output Behemoth corpus");

    // parse the command line arguments
    String input;
    String output;
    try {
        CommandLine line = parser.parse(options, args);
        if (line.hasOption("help")) {
            formatter.printHelp("CorpusFilter", options);
            return 0;
        }
        input = line.getOptionValue("i");
        output = line.getOptionValue("o");
    } catch (ParseException e) {
        // nothing usable was parsed: show the help and stop here rather
        // than dereferencing a null command line further down
        formatter.printHelp("CorpusFilter", options);
        return -1;
    }
    if (input == null || output == null) {
        formatter.printHelp("CorpusFilter", options);
        return -1;
    }

    final FileSystem fs = FileSystem.get(getConf());

    Path inputPath = new Path(input);
    Path outputPath = new Path(output);

    JobConf job = new JobConf(getConf());
    job.setJarByClass(this.getClass());

    job.setJobName("CorpusFilter : " + inputPath.toString());

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(BehemothDocument.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(BehemothDocument.class);

    boolean isFilterRequired = BehemothMapper.isRequired(job);
    // should be the case here
    if (!isFilterRequired) {
        System.err.println("No filters configured. Check your behemoth-site.xml");
        return -1;
    }
    job.setMapperClass(BehemothMapper.class);
    job.setNumReduceTasks(0);

    FileInputFormat.addInputPath(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    try {
        JobClient.runJob(job);
    } catch (Exception e) {
        e.printStackTrace();
        // discard whatever the failed job wrote, then report the failure
        fs.delete(outputPath, true);
        return -1;
    }

    return 0;
}

From source file:com.ebay.erl.mobius.core.MobiusJob.java

License:Apache License

/**
 * Select the <code>columns</code> from the <code>dataset</code>, store
 * it into <code>outputFolder</code> with the given <code>outputFormat</code>
 * <p>/*from   w w  w. j a v a2s .  c  om*/
 * 
 * Here is an example:
 * <pre>
 * <code>
 * public MyJob extends MobiusJob
 * {
 *    public void run(String[] args)
 *    {
 *       Dataset students = ...;
 *       
 *       // save the result to $OUTPUT in SequenceFileOutputFormat,
 *       // the key will be NullWritable, and the value is a Tuple 
 *       // which contains 3 columns, id, f_name and l_name.
 *       this.list(students,
 *          new Path("$OUTPUT"),
 *          SequenceFileOutputFormat.class,
 *          new Column(students, "id"),
 *          new Column(students, "f_name"),
 *          new Column(students, "l_name")
 *       ); 
 *    }
 *    
 *    public static void main(String[] args) throw Exception
 *    {
 *       System.exit(MobiusJobRunner.run(new MyJob(), args));
 *    }
 * }
 * </code>
 * </pre>
 */
public Dataset list(Dataset dataset, Path outputFolder, Class<? extends FileOutputFormat> outputFormat,
        Column... columns) throws IOException {
    byte datasetID = 0;// set to 0 as there is only one dataset to be operated.

    JobConf job = dataset.createJobConf(datasetID);

    job.set("mapred.job.name", "Listing " + dataset.getName());
    job.setJarByClass(this.getClass());
    job.setNumReduceTasks(0); // list is map only job
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Tuple.class);
    job.setJobName("List " + dataset.getName());

    JobSetup.validateColumns(dataset, columns);
    JobSetup.setupInputs(job, dataset, datasetID);
    JobSetup.setupProjections(job, dataset, datasetID, columns);
    JobSetup.setupOutputs(job, outputFolder, outputFormat);

    this.addToExecQueue(job);

    AbstractDatasetBuilder builder = DatasetBuildersFactory.getInstance(this).getBuilder(outputFormat,
            "Dataset_" + outputFolder.getName());
    return builder.buildFromPreviousJob(job, outputFormat, Column.toSchemaArray(columns));
}

From source file:com.github.gaoyangthu.demo.mapred.dancing.DistributedPentomino.java

License:Apache License

/**
 * Sets up and runs the "dancingElephant" pentomino job.
 * <p>
 * The board width, height and search depth come from the configuration
 * keys {@code pent.width}, {@code pent.height} and {@code pent.depth}
 * (defaults 9, 10 and 5) and can be overridden with the {@code -width},
 * {@code -height} and {@code -depth} command-line flags.
 *
 * @param args the output path followed by the optional flags
 * @return -1 when no argument was given, 0 once the job has been run
 * @throws Exception propagated from job setup or execution
 */
public int run(String[] args) throws Exception {
    if (args.length == 0) {
        System.out.println("Usage: pentomino <output> [-depth #] [-height #] [-width #]");
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    final JobConf job = new JobConf(getConf());

    // Start from the configured values, falling back to the built-in defaults
    int boardWidth = job.getInt("pent.width", 9);
    int boardHeight = job.getInt("pent.height", 10);
    int searchDepth = job.getInt("pent.depth", 5);
    final Class<? extends Pentomino> puzzleClass = job.getClass("pent.class", OneSidedPentomino.class,
            Pentomino.class);

    // Command-line flags win over the configuration; each flag consumes the
    // argument that follows it
    int pos = 0;
    while (pos < args.length) {
        final String flag = args[pos];
        if (flag.equalsIgnoreCase("-depth")) {
            searchDepth = Integer.parseInt(args[++pos].trim());
        } else if (flag.equalsIgnoreCase("-height")) {
            boardHeight = Integer.parseInt(args[++pos].trim());
        } else if (flag.equalsIgnoreCase("-width")) {
            boardWidth = Integer.parseInt(args[++pos].trim());
        }
        pos++;
    }

    // Publish the effective values so the MR tasks see them whichever way
    // the user supplied them
    job.setInt("pent.width", boardWidth);
    job.setInt("pent.height", boardHeight);
    job.setInt("pent.depth", searchDepth);

    final Path outDir = new Path(args[0]);
    final Path inDir = new Path(outDir + "_input");
    final FileSystem fs = FileSystem.get(job);
    try {
        FileInputFormat.setInputPaths(job, inDir);
        FileOutputFormat.setOutputPath(job, outDir);
        job.setJarByClass(PentMap.class);
        job.setJobName("dancingElephant");

        final Pentomino puzzle = ReflectionUtils.newInstance(puzzleClass, job);
        puzzle.initialize(boardWidth, boardHeight);
        createInputDirectory(fs, inDir, puzzle, searchDepth);

        job.setMapperClass(PentMap.class);
        job.setReducerClass(IdentityReducer.class);
        job.setNumMapTasks(2000);
        job.setNumReduceTasks(1);

        // keys are the prefix strings, values are the puzzle solutions
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        JobClient.runJob(job);
    } finally {
        // the generated input directory is removed whether or not the job succeeded
        fs.delete(inDir, true);
    }
    return 0;
}

From source file:com.github.gaoyangthu.demo.mapred.terasort.TeraGen.java

License:Apache License

/**
 * Configures and runs the map-only "TeraGen" job.
 *
 * @param args the cli arguments: {@code args[0]} is parsed as the number of
 *             rows and {@code args[1]} is used as the output path
 * @return 0 once {@code JobClient.runJob} has returned
 * @throws IOException propagated from running the job
 */
public int run(String[] args) throws IOException {
    final JobConf conf = (JobConf) getConf();

    final long rowCount = Long.parseLong(args[0]);
    setNumberOfRows(conf, rowCount);
    final Path outputDir = new Path(args[1]);
    FileOutputFormat.setOutputPath(conf, outputDir);

    conf.setJobName("TeraGen");
    conf.setJarByClass(TeraGen.class);

    // input/output formats and the key/value types written by the job
    conf.setInputFormat(RangeInputFormat.class);
    conf.setOutputFormat(TeraOutputFormat.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    // mapper only, no reduce phase
    conf.setMapperClass(SortGenMapper.class);
    conf.setNumReduceTasks(0);

    JobClient.runJob(conf);
    return 0;
}

From source file:com.github.gaoyangthu.demo.mapred.terasort.TeraSort.java

License:Apache License

/**
 * Configures and runs the "TeraSort" job: writes the partition file next to
 * the input, registers it in the DistributedCache and sorts with
 * TotalOrderPartitioner.
 *
 * @param args {@code args[0]} is the input directory, {@code args[1]} the
 *             output directory
 * @return 0 once the job has been run
 * @throws Exception propagated from job setup or execution
 */
public int run(String[] args) throws Exception {
    LOG.info("starting");
    final JobConf conf = (JobConf) getConf();

    // the partition file lives inside the fully qualified input directory
    final Path rawInput = new Path(args[0]);
    final Path qualifiedInput = rawInput.makeQualified(rawInput.getFileSystem(conf));
    final Path partitionPath = new Path(qualifiedInput, TeraInputFormat.PARTITION_FILENAME);
    final String cacheLink = partitionPath.toString() + "#" + TeraInputFormat.PARTITION_FILENAME;
    final URI partitionLocation = new URI(cacheLink);

    TeraInputFormat.setInputPaths(conf, new Path(args[0]));
    final Path outputDir = new Path(args[1]);
    FileOutputFormat.setOutputPath(conf, outputDir);

    conf.setJobName("TeraSort");
    conf.setJarByClass(TeraSort.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setInputFormat(TeraInputFormat.class);
    conf.setOutputFormat(TeraOutputFormat.class);
    conf.setPartitionerClass(TotalOrderPartitioner.class);

    // write the partition file, then register it in the cache with a symlink
    TeraInputFormat.writePartitionFile(conf, partitionPath);
    DistributedCache.addCacheFile(partitionLocation, conf);
    DistributedCache.createSymlink(conf);

    conf.setInt("dfs.replication", 1);
    TeraOutputFormat.setFinalSync(conf, true);

    JobClient.runJob(conf);
    LOG.info("done");
    return 0;
}

From source file:com.google.mr4c.hadoop.mrv1.MRv1TestBinding.java

License:Open Source License

/**
 * Creates a JobConf from the test MR cluster, starting the cluster first if
 * it has not been created yet, and sets the job jar from AlgoRunner's class.
 *
 * @return the job configuration created by the cluster
 * @throws IOException propagated from starting the cluster or creating the
 *                     configuration
 */
public synchronized JobConf createTestMRJobConf() throws IOException {
    final boolean clusterMissing = (m_mrCluster == null);
    if (clusterMissing) {
        startMRCluster();
    }
    final JobConf testConf = m_mrCluster.createJobConf();
    testConf.setJarByClass(AlgoRunner.class);
    return testConf;
}