List of usage examples for org.apache.hadoop.mapred JobConf setJarByClass
public void setJarByClass(Class cls)
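setJarByClass tells the old (org.apache.hadoop.mapred) API which user JAR to ship with the job: Hadoop locates the JAR on the classpath that contains the given class and submits it to the cluster. Before the collected examples, here is a minimal driver sketch; the class name WordCountDriver and the argument paths are placeholders for illustration, not taken from the sources below.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;

public class WordCountDriver {
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf();
        conf.setJobName("wordcount");

        // Ship the JAR that contains this driver class to the cluster.
        conf.setJarByClass(WordCountDriver.class);

        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(IntWritable.class);
        // Mapper, combiner and reducer classes would be configured here.

        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        JobClient.runJob(conf);
    }
}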
From source file:org.hxx.hadoop.GeneratorRedHbase.java
License:Apache License
private RunningJob generateJob(String table, Path segment, int numLists, long topN, long curTime,
        boolean filter, boolean norm, boolean force) throws IOException {
    LOG.info("Generator: segment=" + segment);

    JobConf job = new NutchJob(getConf());
    job.setJarByClass(GeneratorRedHbase.class);
    job.setJobName("generate: from " + table + " "
            + (new SimpleDateFormat("MMdd HH:mm:ss")).format(System.currentTimeMillis()));
    // job.setLong(HConstants.HBASE_CLIENT_SCANNER_TIMEOUT_PERIOD, 300000);

    if (numLists == -1) {
        numLists = job.getNumMapTasks(); // a partition per fetch task
    }
    if ("local".equals(job.get("mapred.job.tracker")) && numLists != 1) {
        // override
        LOG.info("Generator: jobtracker is 'local', generating exactly one partition.");
        numLists = 1;
    }
    // job.setLong(GENERATOR_CUR_TIME, curTime);
    // record real generation time
    long generateTime = System.currentTimeMillis();
    job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
    job.setLong(GENERATOR_TOP_N, topN);
    job.setBoolean(GENERATOR_FILTER, filter);
    job.setBoolean(GENERATOR_NORMALISE, norm);
    job.set(GENERATL_TABLE, table);
    job.setInt(GENERATL_REDUCENUM, numLists);
    job.setInt("partition.url.seed", new Random().nextInt());

    job.setInputFormat(CodeInputFormat.class);
    job.setNumMapTasks(1);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.setReducerClass(GenerateMark.class);
    job.setNumReduceTasks(numLists);

    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);
    job.setOutputKeyComparatorClass(HashComparator.class);

    Path output = new Path(segment, CrawlDatum.GENERATE_DIR_NAME);
    FileOutputFormat.setOutputPath(job, output);

    RunningJob r = null;
    try {
        r = JobClient.runJob(job);
    } catch (IOException e) {
        throw e;
    }
    return r;
}
From source file:org.kitesdk.apps.cli.commands.InstallCommand.java
License:Apache License
private static final List<File> getLibraryJars() {
    // Current implementation assumes that library files
    // are in the same directory, so locate it and
    // include it in the project library.

    // This is ugly, using the jobConf logic to identify the containing
    // JAR. There should be a better way to do this.
    JobConf jobConf = new JobConf();
    jobConf.setJarByClass(InstallCommand.class);

    String containingJar = jobConf.getJar();
    File file = new File(containingJar).getParentFile();
    File[] jarFiles = file.listFiles();

    return Arrays.asList(jarFiles);
}
From source file:org.kitesdk.apps.spark.spi.scheduled.SparkJobManager.java
License:Apache License
@Override
public void writeOozieActionBlock(XMLWriter writer, Schedule schedule) {
    writer.startElement("spark");
    writer.addAttribute("xmlns", "uri:oozie:spark-action:0.1");
    element(writer, "job-tracker", "${jobTracker}");
    element(writer, "name-node", "${nameNode}");

    // TODO: the job-xml should probably be job-specific configuration.
    // element(writer, "job-xml", "${appConfigPath}");

    // Make the nominal time visible to the workflow action.
    writer.startElement("configuration");

    // Use the spark and hive sharelibs since many actions use both.
    property(writer, "oozie.action.sharelib.for.spark", "spark,hive2");
    property(writer, "kiteAppRoot", "${kiteAppRoot}");

    OozieScheduling.writeJobConfiguration(writer, schedule, context.getHadoopConf());

    writer.endElement(); // configuration

    element(writer, "master", "yarn-cluster");
    element(writer, "name", schedule.getName());
    element(writer, "class", SparkScheduledJobMain.class.getCanonicalName());

    JobConf jobConf = new JobConf();
    jobConf.setJarByClass(schedule.getJobClass());
    String containingJar = jobConf.getJar();

    String jarName = containingJar != null
            ? "${kiteAppRoot}/lib/" + new File(containingJar).getName()
            : "";

    element(writer, "jar", jarName);
    element(writer, "spark-opts", getSparkConfString(schedule));
    element(writer, "arg", schedule.getJobClass().getName());

    writer.endElement(); // spark
}
From source file:org.kitesdk.apps.spark.spi.scheduled.SparkJobManager.java
License:Apache License
private static final List<File> getLibraryJars() {
    // Current implementation assumes that library files
    // are in the same directory, so locate it and
    // include it in the project library.

    // This is ugly, using the jobConf logic to identify the containing
    // JAR. There should be a better way to do this.
    JobConf jobConf = new JobConf();
    jobConf.setJarByClass(SchedulableJob.class);

    String containingJar = jobConf.getJar();
    if (containingJar == null)
        return Collections.emptyList();

    File file = new File(containingJar).getParentFile();
    File[] jarFiles = file.listFiles();

    return Arrays.asList(jarFiles);
}
From source file:org.kitesdk.apps.spark.spi.streaming.SparkStreamingJobManager.java
License:Apache License
private static final List<File> getLibraryJars() {
    // Current implementation assumes that library files
    // are in the same directory, so locate it and
    // include it in the project library.

    // This is ugly, using the jobConf logic to identify the containing
    // JAR. There should be a better way to do this.
    JobConf jobConf = new JobConf();
    jobConf.setJarByClass(StreamingJob.class);

    String containingJar = jobConf.getJar();
    if (containingJar == null)
        return Collections.emptyList();

    File file = new File(containingJar).getParentFile();
    File[] jarFiles = file.listFiles();

    return Arrays.asList(jarFiles);
}
From source file:org.kitesdk.apps.spark.spi.streaming.SparkStreamingJobManager.java
License:Apache License
@Override
public void start(FileSystem fs, Path appRoot) {
    JobConf jobConf = new JobConf();
    jobConf.setJarByClass(SparkStreamingJobMain.class);
    String containingJar = jobConf.getJar();

    Path libPath = new Path(appRoot, "lib");
    Path jarPath = new Path(libPath, new File(containingJar).getName());
    jarPath = fs.makeQualified(jarPath);

    SparkLauncher launcher = new SparkLauncher();
    launcher.setMainClass(SparkStreamingJobMain.class.getName());
    launcher.setAppResource(jarPath.toString());
    launcher.setMaster("yarn-cluster");

    try {
        // Add the library JARs from HDFS so we don't need to reload
        // them separately into Spark.
        FileStatus[] libJars = fs.listStatus(libPath);
        for (FileStatus jar : libJars) {
            launcher.addJar(jar.getPath().toString());
        }

        // Add the sharelib JARs, since they are not visible to Spark otherwise.
        List<Path> shareLibJars = ShareLibs.jars(sparkJobContext.getHadoopConf(), "hive2");
        for (Path sharelibJar : shareLibJars) {
            launcher.addJar(fs.makeQualified(sharelibJar).toString());
        }
    } catch (IOException e) {
        throw new AppException(e);
    }

    launcher.addAppArgs(appRoot.toString(), description.getJobName());

    // Explicitly set the metastore URI to be usable in the job.
    launcher.setConf("spark.hadoop.hive.metastore.uris",
            sparkJobContext.getHadoopConf().get("hive.metastore.uris"));

    // Add the Avro classes.
    List<Schema> schemas = JobReflection.getSchemas(job);
    StringBuilder avroClassesArg = new StringBuilder();
    avroClassesArg.append("-D").append(KryoAvroRegistrator.KITE_AVRO_CLASSES).append("=");

    boolean first = true;
    for (Schema schema : schemas) {
        if (!first) {
            avroClassesArg.append(",");
        }
        avroClassesArg.append(SpecificData.get().getClass(schema).getName());
        first = false;
    }

    launcher.setConf("spark.driver.extraJavaOptions", avroClassesArg.toString());
    launcher.setConf("spark.executor.extraJavaOptions", avroClassesArg.toString());

    try {
        Process process = launcher.launch();

        // Redirect the spark-submit output to be visible to the reader.
        Thread stdoutThread = writeOutput(process.getInputStream(), System.out);
        Thread stderrThread = writeOutput(process.getErrorStream(), System.err);

        int result = process.waitFor();

        stdoutThread.join();
        stderrThread.join();

        if (result != 0) {
            throw new AppException("spark-submit returned error status: " + result);
        }
    } catch (IOException e) {
        throw new AppException(e);
    } catch (InterruptedException e) {
        throw new AppException(e);
    }
}
From source file:org.macau.util.FuzzyJoinDriver.java
License:Apache License
/**
 * Run the job and print its basic information: the start time,
 * the finish time, and the running time (finish_Time - start_Time).
 *
 * @param job
 * @throws IOException
 */
public static void run(JobConf job) throws IOException {
    job.setJarByClass(FuzzyJoinDriver.class);
    //
    // print info
    //
    String ret = "FuzzyJoinDriver(" + job.getJobName() + ")\n" + " Input Path: {";
    Path inputs[] = FileInputFormat.getInputPaths(job);
    for (int ctr = 0; ctr < inputs.length; ctr++) {
        if (ctr > 0) {
            ret += "\n ";
        }
        ret += inputs[ctr].toString();
    }
    ret += "}\n";
    ret += " Output Path: " + FileOutputFormat.getOutputPath(job) + "\n"
            + " Map Jobs: " + job.getNumMapTasks() + "\n"
            + " Reduce Jobs: " + job.getNumReduceTasks() + "\n"
            + " Properties: {";
    String[][] properties = new String[][] {
            new String[] { FuzzyJoinConfig.SIMILARITY_NAME_PROPERTY, FuzzyJoinConfig.SIMILARITY_NAME_VALUE },
            new String[] { FuzzyJoinConfig.SIMILARITY_THRESHOLD_PROPERTY,
                    "" + FuzzyJoinConfig.SIMILARITY_THRESHOLD_VALUE },
            new String[] { FuzzyJoinConfig.TOKENIZER_PROPERTY, FuzzyJoinConfig.TOKENIZER_VALUE },
            new String[] { TOKENS_PACKAGE_PROPERTY, TOKENS_PACKAGE_VALUE },
            new String[] { TOKENS_LENGTHSTATS_PROPERTY, "" + TOKENS_LENGTHSTATS_VALUE },
            new String[] { RIDPAIRS_GROUP_CLASS_PROPERTY, RIDPAIRS_GROUP_CLASS_VALUE },
            new String[] { RIDPAIRS_GROUP_FACTOR_PROPERTY, "" + RIDPAIRS_GROUP_FACTOR_VALUE },
            new String[] { FuzzyJoinConfig.DATA_TOKENS_PROPERTY, "" },
            new String[] { DATA_JOININDEX_PROPERTY, "" }, };
    for (int crt = 0; crt < properties.length; crt++) {
        if (crt > 0) {
            ret += "\n ";
        }
        ret += properties[crt][0] + "=" + job.get(properties[crt][0], properties[crt][1]);
    }
    ret += "}";
    System.out.println(ret);
    //
    // run job
    //
    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    JobClient.runJob(job);
    Date end_time = new Date();
    System.out.println("Job ended: " + end_time);
    System.out.println("The job took " + (end_time.getTime() - startTime.getTime()) / (float) 1000.0
            + " seconds.");
}
From source file:org.pentaho.hadoop.mapreduce.test.TestSubmitMapReduceJob.java
License:Open Source License
@Test
public void submitJob() throws Exception {
    String[] args = { "hdfs://" + hostname + ":" + hdfsPort + "/junit/wordcount/input",
            "hdfs://" + hostname + ":" + hdfsPort + "/junit/wordcount/output" };

    JobConf conf = new JobConf();
    conf.setJobName("wordcount");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    File jar = new File("./test-res/pentaho-mapreduce-sample.jar");
    URLClassLoader loader = new URLClassLoader(new URL[] { jar.toURI().toURL() });

    conf.setMapperClass(
            (Class<? extends Mapper>) loader.loadClass("org.pentaho.hadoop.mapreduce.sample.MRWordCount$Map"));
    conf.setCombinerClass((Class<? extends Reducer>) loader
            .loadClass("org.pentaho.hadoop.mapreduce.sample.MRWordCount$Reduce"));
    conf.setReducerClass((Class<? extends Reducer>) loader
            .loadClass("org.pentaho.hadoop.mapreduce.sample.MRWordCount$Reduce"));

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    conf.set("fs.default.name", "hdfs://" + hostname + ":" + hdfsPort);
    conf.set("mapred.job.tracker", hostname + ":" + trackerPort);

    conf.setJarByClass(loader.loadClass("org.pentaho.hadoop.mapreduce.sample.MRWordCount"));
    conf.setWorkingDirectory(new Path("/tmp/wordcount"));

    JobClient jobClient = new JobClient(conf);
    ClusterStatus status = jobClient.getClusterStatus();
    assertEquals(State.RUNNING, status.getJobTrackerState());

    RunningJob runningJob = jobClient.submitJob(conf);
    System.out.print("Running " + runningJob.getJobName() + "");
    while (!runningJob.isComplete()) {
        System.out.print(".");
        Thread.sleep(500);
    }
    System.out.println();
    System.out.println("Finished " + runningJob.getJobName() + ".");

    FileObject file = fsManager.resolveFile(buildHDFSURL("/junit/wordcount/output/part-00000"));
    String output = IOUtils.toString(file.getContent().getInputStream());
    assertEquals("Bye\t1\nGoodbye\t1\nHadoop\t2\nHello\t2\nWorld\t2\n", output);
}
From source file:org.pooledtimeseries.healthcheck.CheckCartesianProductSeqFile.java
License:Apache License
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
    long start = System.currentTimeMillis();

    JobConf conf = new JobConf("Cartesian Product");

    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: CheckCartesianProductSeqFile <input sequence file> <out>");
        System.exit(1);
    }

    // Configure the join type
    conf.setJarByClass(CheckCartesianProductSeqFile.class);

    conf.setMapperClass(CartesianMapper.class);
    conf.setReducerClass(CartesianReducer.class);

    conf.setInputFormat(CartesianInputFormat.class);
    CartesianInputFormat.setLeftInputInfo(conf, SequenceFileInputFormat.class, otherArgs[0]);
    CartesianInputFormat.setRightInputInfo(conf, SequenceFileInputFormat.class, otherArgs[0]);

    TextOutputFormat.setOutputPath(conf, new Path(otherArgs[1]));

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    RunningJob job = JobClient.runJob(conf);
    while (!job.isComplete()) {
        Thread.sleep(1000);
    }

    long finish = System.currentTimeMillis();
    System.out.println("Time in ms: " + (finish - start));

    System.exit(job.isSuccessful() ? 0 : 2);
}
From source file:org.pooledtimeseries.SimilarityCalculation.java
License:Apache License
public static void main(String[] args) throws Exception {
    JobConf conf = new JobConf();

    System.out.println("Before Map:" + conf.getNumMapTasks());
    conf.setNumMapTasks(196);
    System.out.println("After Map:" + conf.getNumMapTasks());

    conf.setJobName("similarity_calc");

    conf.set("meanDistsFilePath", args[2]);

    System.out.println("Job Name: " + conf.getJobName());
    conf.setJarByClass(SimilarityCalculation.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    conf.setInputFormat(CartesianInputFormat.class);
    CartesianInputFormat.setLeftInputInfo(conf, SequenceFileInputFormat.class, args[0]);
    CartesianInputFormat.setRightInputInfo(conf, SequenceFileInputFormat.class, args[0]);

    conf.setOutputFormat(TextOutputFormat.class);
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    conf.setMapperClass(Map.class);

    JobClient.runJob(conf);
}