Example usage for org.apache.hadoop.mapred JobConf setJarByClass

Introduction

On this page you can find example usage of org.apache.hadoop.mapred.JobConf#setJarByClass.

Prototype

public void setJarByClass(Class cls) 

Document

Set the job's jar file by finding an example class location.
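
Before the examples, here is a minimal, self-contained sketch of the call; the class name JarByClassExample is a placeholder, not taken from the examples below. setJarByClass asks Hadoop to find the JAR from which the given class was loaded and record it as the job's jar; getJar() then exposes that path, which is also how several of the examples below locate their own containing JAR.

import org.apache.hadoop.mapred.JobConf;

public class JarByClassExample {

    public static void main(String[] args) {
        JobConf conf = new JobConf();

        // Record the JAR that contains JarByClassExample as the job's jar.
        conf.setJarByClass(JarByClassExample.class);

        // getJar() reports the path of that JAR, or null when the class
        // was loaded from a directory rather than a JAR file.
        System.out.println("Job jar: " + conf.getJar());
    }
}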

Usage

From source file:org.hxx.hadoop.GeneratorRedHbase.java

License:Apache License

private RunningJob generateJob(String table, Path segment, int numLists, long topN, long curTime,
        boolean filter, boolean norm, boolean force) throws IOException {
    LOG.info("Generator: segment=" + segment);

    JobConf job = new NutchJob(getConf());
    job.setJarByClass(GeneratorRedHbase.class);
    job.setJobName("generate: from " + table + " "
            + (new SimpleDateFormat("MMdd HH:mm:ss")).format(System.currentTimeMillis()));
    // job.setLong(HConstants.HBASE_CLIENT_SCANNER_TIMEOUT_PERIOD, 300000);

    if (numLists == -1) {
        numLists = job.getNumMapTasks(); // a partition per fetch task
    }
    if ("local".equals(job.get("mapred.job.tracker")) && numLists != 1) {
        // override
        LOG.info("Generator: jobtracker is 'local', generating exactly one partition.");
        numLists = 1;
    }
    // job.setLong(GENERATOR_CUR_TIME, curTime);
    // record real generation time
    long generateTime = System.currentTimeMillis();
    job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
    job.setLong(GENERATOR_TOP_N, topN);
    job.setBoolean(GENERATOR_FILTER, filter);
    job.setBoolean(GENERATOR_NORMALISE, norm);
    job.set(GENERATL_TABLE, table);
    job.setInt(GENERATL_REDUCENUM, numLists);
    job.setInt("partition.url.seed", new Random().nextInt());

    job.setInputFormat(CodeInputFormat.class);
    job.setNumMapTasks(1);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.setReducerClass(GenerateMark.class);
    job.setNumReduceTasks(numLists);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);
    job.setOutputKeyComparatorClass(HashComparator.class);
    Path output = new Path(segment, CrawlDatum.GENERATE_DIR_NAME);
    FileOutputFormat.setOutputPath(job, output);

    RunningJob r = null;
    try {
        r = JobClient.runJob(job);
    } catch (IOException e) {
        throw e;
    }
    return r;
}

From source file:org.kitesdk.apps.cli.commands.InstallCommand.java

License:Apache License

private static final List<File> getLibraryJars() {

    // Current implementation assumes that library files
    // are in the same directory, so locate it and
    // include it in the project library.

    // This is ugly, using the jobConf logic to identify the containing
    // JAR. There should be a better way to do this.
    JobConf jobConf = new JobConf();
    jobConf.setJarByClass(InstallCommand.class);
    String containingJar = jobConf.getJar();

    File file = new File(containingJar).getParentFile();

    File[] jarFiles = file.listFiles();

    return Arrays.asList(jarFiles);
}
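
The example above (and the Kite examples that follow) uses setJarByClass purely as a way to discover the containing JAR: after the call, JobConf.getJar() returns the path of the JAR from which the class was loaded, or null when the class came from a directory on the classpath (for example, during local development). That is why the later variants guard against a null result before listing the sibling JAR files.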

From source file:org.kitesdk.apps.spark.spi.scheduled.SparkJobManager.java

License:Apache License

@Override
public void writeOozieActionBlock(XMLWriter writer, Schedule schedule) {

    writer.startElement("spark");
    writer.addAttribute("xmlns", "uri:oozie:spark-action:0.1");
    element(writer, "job-tracker", "${jobTracker}");
    element(writer, "name-node", "${nameNode}");

    // TODO: the job-xml should probably be job-specific configuration.
    // element(writer, "job-xml", "${appConfigPath}");

    // Make the nominal time visible to the workflow action.
    writer.startElement("configuration");

    // Use the spark and hive sharelibs since many actions use both.
    property(writer, "oozie.action.sharelib.for.spark", "spark,hive2");
    property(writer, "kiteAppRoot", "${kiteAppRoot}");

    OozieScheduling.writeJobConfiguration(writer, schedule, context.getHadoopConf());

    writer.endElement(); // configuration

    element(writer, "master", "yarn-cluster");
    element(writer, "name", schedule.getName());
    element(writer, "class", SparkScheduledJobMain.class.getCanonicalName());

    JobConf jobConf = new JobConf();
    jobConf.setJarByClass(schedule.getJobClass());
    String containingJar = jobConf.getJar();

    String jarName = containingJar != null ? "${kiteAppRoot}/lib/" + new File(containingJar).getName() : "";

    element(writer, "jar", jarName);
    element(writer, "spark-opts", getSparkConfString(schedule));
    element(writer, "arg", schedule.getJobClass().getName());

    writer.endElement(); // spark
}

From source file:org.kitesdk.apps.spark.spi.scheduled.SparkJobManager.java

License:Apache License

private static final List<File> getLibraryJars() {

    // Current implementation assumes that library files
    // are in the same directory, so locate it and
    // include it in the project library.

    // This is ugly, using the jobConf logic to identify the containing
    // JAR. There should be a better way to do this.
    JobConf jobConf = new JobConf();
    jobConf.setJarByClass(SchedulableJob.class);
    String containingJar = jobConf.getJar();

    if (containingJar == null)
        return Collections.emptyList();

    File file = new File(containingJar).getParentFile();

    File[] jarFiles = file.listFiles();

    return Arrays.asList(jarFiles);
}

From source file:org.kitesdk.apps.spark.spi.streaming.SparkStreamingJobManager.java

License:Apache License

private static final List<File> getLibraryJars() {

    // Current implementation assumes that library files
    // are in the same directory, so locate it and
    // include it in the project library.

    // This is ugly, using the jobConf logic to identify the containing
    // JAR. There should be a better way to do this.
    JobConf jobConf = new JobConf();
    jobConf.setJarByClass(StreamingJob.class);
    String containingJar = jobConf.getJar();

    if (containingJar == null)
        return Collections.emptyList();

    File file = new File(containingJar).getParentFile();

    File[] jarFiles = file.listFiles();

    return Arrays.asList(jarFiles);
}

From source file:org.kitesdk.apps.spark.spi.streaming.SparkStreamingJobManager.java

License:Apache License

@Override
public void start(FileSystem fs, Path appRoot) {
    JobConf jobConf = new JobConf();
    jobConf.setJarByClass(SparkStreamingJobMain.class);
    String containingJar = jobConf.getJar();

    Path libPath = new Path(appRoot, "lib");

    Path jarPath = new Path(libPath, new File(containingJar).getName());
    jarPath = fs.makeQualified(jarPath);

    SparkLauncher launcher = new SparkLauncher();

    launcher.setMainClass(SparkStreamingJobMain.class.getName());

    launcher.setAppResource(jarPath.toString());

    launcher.setMaster("yarn-cluster");

    try {
        // Add the library JARs from HDFS so we don't need to reload
        // them separately into Spark.
        FileStatus[] libJars = fs.listStatus(libPath);

        for (FileStatus jar : libJars) {

            launcher.addJar(jar.getPath().toString());
        }

        // Add the sharelib JARs, since they are not visible to Spark otherwise.
        List<Path> shareLibJars = ShareLibs.jars(sparkJobContext.getHadoopConf(), "hive2");

        for (Path sharelibJar : shareLibJars) {

            launcher.addJar(fs.makeQualified(sharelibJar).toString());
        }

    } catch (IOException e) {
        throw new AppException(e);
    }

    launcher.addAppArgs(appRoot.toString(), description.getJobName());

    // Explicitly set the metastore URI to be usable in the job.
    launcher.setConf("spark.hadoop.hive.metastore.uris",
            sparkJobContext.getHadoopConf().get("hive.metastore.uris"));

    // Add the Avro classes.
    List<Schema> schemas = JobReflection.getSchemas(job);
    StringBuilder avroClassesArg = new StringBuilder();

    avroClassesArg.append("-D").append(KryoAvroRegistrator.KITE_AVRO_CLASSES).append("=");

    boolean first = true;

    for (Schema schema : schemas) {

        if (!first) {
            avroClassesArg.append(",");
        }

        avroClassesArg.append(SpecificData.get().getClass(schema).getName());

        first = false;
    }

    launcher.setConf("spark.driver.extraJavaOptions", avroClassesArg.toString());
    launcher.setConf("spark.executor.extraJavaOptions", avroClassesArg.toString());

    try {

        Process process = launcher.launch();

        // Redirect the spark-submit output to be visible to the reader.
        Thread stdoutThread = writeOutput(process.getInputStream(), System.out);
        Thread stderrThread = writeOutput(process.getErrorStream(), System.err);

        int result = process.waitFor();

        stdoutThread.join();
        stderrThread.join();

        if (result != 0) {
            throw new AppException("spark-submit returned error status: " + result);
        }

    } catch (IOException e) {
        throw new AppException(e);
    } catch (InterruptedException e) {
        throw new AppException(e);
    }
}

From source file:org.macau.util.FuzzyJoinDriver.java

License:Apache License

/**
 * Runs the given job and prints its basic information: the start time,
 * the finish time, and the running time (finish time minus start time).
 *
 * @param job the configured job to run
 * @throws IOException if the job fails
 */
public static void run(JobConf job) throws IOException {
    job.setJarByClass(FuzzyJoinDriver.class);
    //
    // print info
    //
    String ret = "FuzzyJoinDriver(" + job.getJobName() + ")\n" + "  Input Path:  {";
    Path inputs[] = FileInputFormat.getInputPaths(job);
    for (int ctr = 0; ctr < inputs.length; ctr++) {
        if (ctr > 0) {
            ret += "\n                ";
        }
        ret += inputs[ctr].toString();
    }
    ret += "}\n";
    ret += "  Output Path: " + FileOutputFormat.getOutputPath(job) + "\n" + "  Map Jobs:    "
            + job.getNumMapTasks() + "\n" + "  Reduce Jobs: " + job.getNumReduceTasks() + "\n"
            + "  Properties:  {";
    String[][] properties = new String[][] {
            new String[] { FuzzyJoinConfig.SIMILARITY_NAME_PROPERTY, FuzzyJoinConfig.SIMILARITY_NAME_VALUE },
            new String[] { FuzzyJoinConfig.SIMILARITY_THRESHOLD_PROPERTY,
                    "" + FuzzyJoinConfig.SIMILARITY_THRESHOLD_VALUE },
            new String[] { FuzzyJoinConfig.TOKENIZER_PROPERTY, FuzzyJoinConfig.TOKENIZER_VALUE },
            new String[] { TOKENS_PACKAGE_PROPERTY, TOKENS_PACKAGE_VALUE },
            new String[] { TOKENS_LENGTHSTATS_PROPERTY, "" + TOKENS_LENGTHSTATS_VALUE },
            new String[] { RIDPAIRS_GROUP_CLASS_PROPERTY, RIDPAIRS_GROUP_CLASS_VALUE },
            new String[] { RIDPAIRS_GROUP_FACTOR_PROPERTY, "" + RIDPAIRS_GROUP_FACTOR_VALUE },
            new String[] { FuzzyJoinConfig.DATA_TOKENS_PROPERTY, "" },
            new String[] { DATA_JOININDEX_PROPERTY, "" }, };
    for (int crt = 0; crt < properties.length; crt++) {
        if (crt > 0) {
            ret += "\n                ";
        }
        ret += properties[crt][0] + "=" + job.get(properties[crt][0], properties[crt][1]);
    }
    ret += "}";
    System.out.println(ret);
    //
    // run job
    //
    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    JobClient.runJob(job);
    Date end_time = new Date();
    System.out.println("Job ended: " + end_time);
    System.out.println(
            "The job took " + (end_time.getTime() - startTime.getTime()) / (float) 1000.0 + " seconds.");
}

From source file:org.pentaho.hadoop.mapreduce.test.TestSubmitMapReduceJob.java

License:Open Source License

@Test
public void submitJob() throws Exception {

    String[] args = { "hdfs://" + hostname + ":" + hdfsPort + "/junit/wordcount/input",
            "hdfs://" + hostname + ":" + hdfsPort + "/junit/wordcount/output" };

    JobConf conf = new JobConf();
    conf.setJobName("wordcount");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    File jar = new File("./test-res/pentaho-mapreduce-sample.jar");

    URLClassLoader loader = new URLClassLoader(new URL[] { jar.toURI().toURL() });

    conf.setMapperClass((Class<? extends Mapper>) loader
            .loadClass("org.pentaho.hadoop.mapreduce.sample.MRWordCount$Map"));
    conf.setCombinerClass((Class<? extends Reducer>) loader
            .loadClass("org.pentaho.hadoop.mapreduce.sample.MRWordCount$Reduce"));
    conf.setReducerClass((Class<? extends Reducer>) loader
            .loadClass("org.pentaho.hadoop.mapreduce.sample.MRWordCount$Reduce"));

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    conf.set("fs.default.name", "hdfs://" + hostname + ":" + hdfsPort);
    conf.set("mapred.job.tracker", hostname + ":" + trackerPort);

    conf.setJarByClass(loader.loadClass("org.pentaho.hadoop.mapreduce.sample.MRWordCount"));
    conf.setWorkingDirectory(new Path("/tmp/wordcount"));

    JobClient jobClient = new JobClient(conf);
    ClusterStatus status = jobClient.getClusterStatus();
    assertEquals(State.RUNNING, status.getJobTrackerState());

    RunningJob runningJob = jobClient.submitJob(conf);
    System.out.print("Running " + runningJob.getJobName() + "");
    while (!runningJob.isComplete()) {
        System.out.print(".");
        Thread.sleep(500);
    }
    System.out.println();
    System.out.println("Finished " + runningJob.getJobName() + ".");

    FileObject file = fsManager.resolveFile(buildHDFSURL("/junit/wordcount/output/part-00000"));
    String output = IOUtils.toString(file.getContent().getInputStream());
    assertEquals("Bye\t1\nGoodbye\t1\nHadoop\t2\nHello\t2\nWorld\t2\n", output);
}

From source file:org.pooledtimeseries.healthcheck.CheckCartesianProductSeqFile.java

License:Apache License

public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {

    long start = System.currentTimeMillis();
    JobConf conf = new JobConf("Cartesian Product");
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: CheckCartesianProductSeqFile <input sequence file> <out>");
        System.exit(1);
    }

    // Configure the join type
    conf.setJarByClass(CheckCartesianProductSeqFile.class);

    conf.setMapperClass(CartesianMapper.class);
    conf.setReducerClass(CartesianReducer.class);

    conf.setInputFormat(CartesianInputFormat.class);
    CartesianInputFormat.setLeftInputInfo(conf, SequenceFileInputFormat.class, otherArgs[0]);
    CartesianInputFormat.setRightInputInfo(conf, SequenceFileInputFormat.class, otherArgs[0]);

    TextOutputFormat.setOutputPath(conf, new Path(otherArgs[1]));

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    RunningJob job = JobClient.runJob(conf);
    while (!job.isComplete()) {
        Thread.sleep(1000);
    }

    long finish = System.currentTimeMillis();

    System.out.println("Time in ms: " + (finish - start));

    System.exit(job.isSuccessful() ? 0 : 2);
}

From source file:org.pooledtimeseries.SimilarityCalculation.java

License:Apache License

public static void main(String[] args) throws Exception {

    JobConf conf = new JobConf();
    System.out.println("Before Map:" + conf.getNumMapTasks());
    conf.setNumMapTasks(196);
    System.out.println("After Map:" + conf.getNumMapTasks());
    conf.setJobName("similarity_calc");

    conf.set("meanDistsFilePath", args[2]);

    System.out.println("Job Name: " + conf.getJobName());
    conf.setJarByClass(SimilarityCalculation.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    conf.setInputFormat(CartesianInputFormat.class);
    CartesianInputFormat.setLeftInputInfo(conf, SequenceFileInputFormat.class, args[0]);
    CartesianInputFormat.setRightInputInfo(conf, SequenceFileInputFormat.class, args[0]);

    conf.setOutputFormat(TextOutputFormat.class);

    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    conf.setMapperClass(Map.class);

    JobClient.runJob(conf);
}