List of usage examples for org.apache.hadoop.mapred JobConf getJobName
public String getJobName()
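Before the full examples below, here is a minimal, self-contained sketch (not taken from any of the listed source files) showing the basic setJobName()/getJobName() round trip on a JobConf; the class name JobNameExample is just a placeholder.

import org.apache.hadoop.mapred.JobConf;

public class JobNameExample {
    public static void main(String[] args) {
        // create a job configuration; JobNameExample is a placeholder driver class
        JobConf job = new JobConf(JobNameExample.class);

        // set a name for the job, then read it back with getJobName();
        // if no name has been set, getJobName() typically returns an empty string
        job.setJobName("example-job");
        System.out.println("Configured job name: " + job.getJobName());
    }
}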
From source file:org.commoncrawl.hadoop.io.S3GetMetdataJob.java
License:Open Source License
public static void main(String[] args) {
    String accessKey = args[0];
    String secretKey = args[1];

    String paths[] = {
            // "2008/06",
            // "2008/07",
            // "2008/08",
            // "2008/09",
            // "2008/10",
            // "2008/11",
            "2009" };

    for (int pathIndex = 0; pathIndex < paths.length; ++pathIndex) {

        LOG.info("Processing Path:" + paths[pathIndex]);

        JobConf job = new JobConf(S3GetMetdataJob.class);

        Path tempDir = new Path(
                job.get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());

        LOG.info("Output for Path:" + paths[pathIndex] + " is:" + tempDir);
        System.out.println("Output Path is:" + tempDir);

        job.setJobName("S3 To CrawlURLMetadata Job for Path:" + paths[pathIndex]);

        // setup s3 properties
        JetS3tARCSource.setMaxRetries(job, 1);
        // set up S3 credentials ...
        JetS3tARCSource.setAWSAccessKeyID(job, accessKey);
        JetS3tARCSource.setAWSSecretAccessKey(job, secretKey);
        ARCSplitCalculator.setFilesPerSplit(job, 25);
        // set up arc reader properties
        ArcFileReader.setIOTimeoutValue(30000);
        // set input prefixes ...
        JetS3tARCSource.setInputPrefixes(job, paths[pathIndex]);
        // and S3 bucket name ...
        JetS3tARCSource.setBucketName(job, "commoncrawl");
        // and setup arc source for ArcInputFormat
        ARCInputFormat.setARCSourceClass(job, JetS3tARCSource.class);
        // and set up input format ...
        job.setInputFormat(ARCInputFormat.class);
        // set mapper ...
        job.setMapRunnerClass(S3GetMetdataJob.class);
        // setup reducer (identity in this case ... )
        job.setReducerClass(IdentityReducer.class);
        // standard output format ...
        job.setOutputFormat(SequenceFileOutputFormat.class);
        // set output path
        job.setOutputPath(tempDir);
        // map output types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(CrawlURLMetadata.class);
        // reduce output types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(CrawlURLMetadata.class);
        // double the number of reducers ...
        // job.setNumReduceTasks(job.getNumReduceTasks() * 2);

        // run the job ...
        try {
            LOG.info("Starting Job:" + job.getJobName());
            JobClient.runJob(job);
            LOG.info("Finished Job:" + job.getJobName());

            Path finalPath = new Path("jobout/" + paths[pathIndex] + "/result");
            LOG.info("Copying Job Output to:" + finalPath);
            FileSystem fs = FileSystem.get(job);
            try {
                fs.mkdirs(finalPath.getParent());
                fs.rename(tempDir, finalPath);
                LOG.info("Copied Job Output to:" + finalPath);
            } finally {
                // fs.close();
            }
        } catch (IOException e) {
            LOG.error(StringUtils.stringifyException(e));
            e.printStackTrace();
        }
    }
}
From source file:org.commoncrawl.hadoop.template.SampleHadoopJob.java
License:Open Source License
/**
 * main routine
 *
 * @param args
 */
public static void main(String[] args) {

    // amazon access key - passed on command line
    String accessKey = args[0];
    // amazon secret key - passed on command line
    String secretKey = args[1];
    // regular expression to match against - passed on command line
    String regEx = args[2];
    // group number to extract
    int groupNumber = Integer.parseInt(args[3]);

    /** arc file names start with year then month **/
    // we want to process all files uploaded in 2009,
    // so we will use the prefix string "2009",
    // but you could, for example, pass in a more restrictive
    // pattern such as "2008/06".
    String inputPrefix = "2009";

    LOG.info("Processing Path:" + inputPrefix);

    // allocate job config
    JobConf job = new JobConf(SampleHadoopJob.class);
    // set job name
    job.setJobName("Sample RegEx Job against path:" + inputPrefix);
    // set regular expression attributes
    job.set("mapred.mapper.regex", regEx);
    job.setInt("mapred.mapper.regex.group", groupNumber);

    // create temp file path
    Path tempDir = new Path(job.get("mapred.temp.dir", ".") + "/temp-" + System.currentTimeMillis());
    LOG.info("Output for job " + job.getJobName() + " is:" + tempDir);

    // we are going to be using the JetS3tARCSource as an input source to
    // the ArcInputFormat. This input source uses the multi-threaded JetS3t
    // library to request data from S3.

    /** setup s3 properties **/

    // set the number of retries per ARC file.
    // we are setting this number to one, so if an IOException
    // occurs when processing an ARC file, we are going to silently skip it
    // and continue processing the next ARC file. You should set this to be
    // a number LESS than mapred.max.tracker.failures (as defined in your
    // job config or hadoop-site.xml). Otherwise, your entire job could
    // fail if it encounters a bad ARC file in the bucket, or if the S3 service
    // exhibits a failure condition specific to a single key or set of keys.
    JetS3tARCSource.setMaxRetries(job, 1);

    // set up S3 credentials ...
    JetS3tARCSource.setAWSAccessKeyID(job, accessKey);
    JetS3tARCSource.setAWSSecretAccessKey(job, secretKey);

    // set the number of files per split
    // set this number higher if the bucket contains lots of files, to reduce
    // the burden on the map-reduce system from tracking too many file splits.
    ARCSplitCalculator.setFilesPerSplit(job, 25);

    /** set up arc reader properties **/

    // again, set the timeout to something reasonable, so that your entire job
    // will not hang if a single GET request fails to complete in a reasonable
    // amount of time
    ArcFileReader.setIOTimeoutValue(30000);
    // set input prefixes ...
    JetS3tARCSource.setInputPrefixes(job, inputPrefix);
    // and S3 bucket name ...
    JetS3tARCSource.setBucketName(job, "commoncrawl");
    // and setup arc source for ArcInputFormat
    ARCInputFormat.setARCSourceClass(job, JetS3tARCSource.class);
    // now inform the job that it needs to use the ARCInputFormat
    job.setInputFormat(ARCInputFormat.class);

    // set up our map runner class
    // we use a map runner instead of a mapper here to give us an extra level of
    // control over how we handle errors. When running a large job against
    // the crawl corpus, which may contain hundreds of thousands of ARC files, it
    // is extremely important to reduce the risks of abnormal job termination.
    job.setMapRunnerClass(SampleHadoopJob.class);

    // setup reducer (identity in this case ... )
    job.setReducerClass(IdentityReducer.class);
    // standard output format ...
    job.setOutputFormat(SequenceFileOutputFormat.class);
    // set output path
    job.setOutputPath(tempDir);
    // map output types
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(Text.class);

    // run the job ...
    try {
        LOG.info("Starting Job:" + job.getJobName());
        JobClient.runJob(job);
        LOG.info("Finished Job:" + job.getJobName());
    } catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
        e.printStackTrace();
    }
}
From source file:org.gridgain.grid.kernal.processors.hadoop.GridHadoopUtils.java
License:Open Source License
/**
 * Creates JobInfo from hadoop configuration.
 *
 * @param cfg Hadoop configuration.
 * @return Job info.
 * @throws GridException If failed.
 */
public static GridHadoopDefaultJobInfo createJobInfo(Configuration cfg) throws GridException {
    JobConf jobConf = new JobConf(cfg);

    boolean hasCombiner = jobConf.get("mapred.combiner.class") != null
            || jobConf.get(MRJobConfig.COMBINE_CLASS_ATTR) != null;

    int numReduces = jobConf.getNumReduceTasks();

    jobConf.setBooleanIfUnset("mapred.mapper.new-api", jobConf.get(OLD_MAP_CLASS_ATTR) == null);

    if (jobConf.getUseNewMapper()) {
        String mode = "new map API";

        ensureNotSet(jobConf, "mapred.input.format.class", mode);
        ensureNotSet(jobConf, OLD_MAP_CLASS_ATTR, mode);

        if (numReduces != 0)
            ensureNotSet(jobConf, "mapred.partitioner.class", mode);
        else
            ensureNotSet(jobConf, "mapred.output.format.class", mode);
    } else {
        String mode = "map compatibility";

        ensureNotSet(jobConf, MRJobConfig.INPUT_FORMAT_CLASS_ATTR, mode);
        ensureNotSet(jobConf, MRJobConfig.MAP_CLASS_ATTR, mode);

        if (numReduces != 0)
            ensureNotSet(jobConf, MRJobConfig.PARTITIONER_CLASS_ATTR, mode);
        else
            ensureNotSet(jobConf, MRJobConfig.OUTPUT_FORMAT_CLASS_ATTR, mode);
    }

    if (numReduces != 0) {
        jobConf.setBooleanIfUnset("mapred.reducer.new-api", jobConf.get(OLD_REDUCE_CLASS_ATTR) == null);

        if (jobConf.getUseNewReducer()) {
            String mode = "new reduce API";

            ensureNotSet(jobConf, "mapred.output.format.class", mode);
            ensureNotSet(jobConf, OLD_REDUCE_CLASS_ATTR, mode);
        } else {
            String mode = "reduce compatibility";

            ensureNotSet(jobConf, MRJobConfig.OUTPUT_FORMAT_CLASS_ATTR, mode);
            ensureNotSet(jobConf, MRJobConfig.REDUCE_CLASS_ATTR, mode);
        }
    }

    Map<String, String> props = new HashMap<>();

    for (Map.Entry<String, String> entry : jobConf)
        props.put(entry.getKey(), entry.getValue());

    return new GridHadoopDefaultJobInfo(jobConf.getJobName(), jobConf.getUser(), hasCombiner, numReduces,
            props);
}
From source file:org.macau.util.FuzzyJoinDriver.java
License:Apache License
/**
 * Runs the job and prints its basic information: the start time,
 * the finish time, and the running time (finish time - start time).
 *
 * @param job
 * @throws IOException
 */
public static void run(JobConf job) throws IOException {
    job.setJarByClass(FuzzyJoinDriver.class);
    //
    // print info
    //
    String ret = "FuzzyJoinDriver(" + job.getJobName() + ")\n" + " Input Path: {";
    Path inputs[] = FileInputFormat.getInputPaths(job);
    for (int ctr = 0; ctr < inputs.length; ctr++) {
        if (ctr > 0) {
            ret += "\n ";
        }
        ret += inputs[ctr].toString();
    }
    ret += "}\n";
    ret += " Output Path: " + FileOutputFormat.getOutputPath(job) + "\n" + " Map Jobs: " + job.getNumMapTasks()
            + "\n" + " Reduce Jobs: " + job.getNumReduceTasks() + "\n" + " Properties: {";
    String[][] properties = new String[][] {
            new String[] { FuzzyJoinConfig.SIMILARITY_NAME_PROPERTY, FuzzyJoinConfig.SIMILARITY_NAME_VALUE },
            new String[] { FuzzyJoinConfig.SIMILARITY_THRESHOLD_PROPERTY,
                    "" + FuzzyJoinConfig.SIMILARITY_THRESHOLD_VALUE },
            new String[] { FuzzyJoinConfig.TOKENIZER_PROPERTY, FuzzyJoinConfig.TOKENIZER_VALUE },
            new String[] { TOKENS_PACKAGE_PROPERTY, TOKENS_PACKAGE_VALUE },
            new String[] { TOKENS_LENGTHSTATS_PROPERTY, "" + TOKENS_LENGTHSTATS_VALUE },
            new String[] { RIDPAIRS_GROUP_CLASS_PROPERTY, RIDPAIRS_GROUP_CLASS_VALUE },
            new String[] { RIDPAIRS_GROUP_FACTOR_PROPERTY, "" + RIDPAIRS_GROUP_FACTOR_VALUE },
            new String[] { FuzzyJoinConfig.DATA_TOKENS_PROPERTY, "" },
            new String[] { DATA_JOININDEX_PROPERTY, "" }, };
    for (int crt = 0; crt < properties.length; crt++) {
        if (crt > 0) {
            ret += "\n ";
        }
        ret += properties[crt][0] + "=" + job.get(properties[crt][0], properties[crt][1]);
    }
    ret += "}";
    System.out.println(ret);
    //
    // run job
    //
    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    JobClient.runJob(job);
    Date end_time = new Date();
    System.out.println("Job ended: " + end_time);
    System.out.println(
            "The job took " + (end_time.getTime() - startTime.getTime()) / (float) 1000.0 + " seconds.");
}
From source file:org.pooledtimeseries.MeanChiSquareDistanceCalculation.java
License:Apache License
public static void main(String[] args) throws Exception {
    Configuration baseConf = new Configuration();
    baseConf.set("mapreduce.job.maps", "96");
    baseConf.set("mapred.tasktracker.map.tasks.maximum", "96");

    JobConf conf = new JobConf(baseConf, MeanChiSquareDistanceCalculation.class);
    System.out.println("Before Map:" + conf.getNumMapTasks());
    conf.setNumMapTasks(96);
    System.out.println("After Map:" + conf.getNumMapTasks());

    conf.setJobName("mean_chi_square_calculation");
    System.out.println("Track:" + baseConf.get("mapred.job.tracker"));
    System.out.println("Job Name- " + conf.getJobName());
    System.out.println(baseConf.get("mapreduce.job.maps"));

    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(DoubleWritable.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(DoubleWritable.class);

    conf.setOutputFormat(TextOutputFormat.class);
    conf.setInputFormat(CartesianInputFormat.class);
    CartesianInputFormat.setLeftInputInfo(conf, SequenceFileInputFormat.class, args[0]);
    CartesianInputFormat.setRightInputInfo(conf, SequenceFileInputFormat.class, args[0]);
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    conf.setMapperClass(Map.class);
    conf.setReducerClass(Reduce.class);

    JobClient.runJob(conf);
}
From source file:org.pooledtimeseries.SimilarityCalculation.java
License:Apache License
public static void main(String[] args) throws Exception {
    JobConf conf = new JobConf();

    System.out.println("Before Map:" + conf.getNumMapTasks());
    conf.setNumMapTasks(196);
    System.out.println("After Map:" + conf.getNumMapTasks());

    conf.setJobName("similarity_calc");
    conf.set("meanDistsFilePath", args[2]);
    System.out.println("Job Name: " + conf.getJobName());

    conf.setJarByClass(SimilarityCalculation.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    conf.setInputFormat(CartesianInputFormat.class);
    CartesianInputFormat.setLeftInputInfo(conf, SequenceFileInputFormat.class, args[0]);
    CartesianInputFormat.setRightInputInfo(conf, SequenceFileInputFormat.class, args[0]);

    conf.setOutputFormat(TextOutputFormat.class);
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    conf.setMapperClass(Map.class);

    JobClient.runJob(conf);
}
From source file:voldemort.store.readonly.mr.utils.HadoopUtils.java
License:Apache License
/**
 * Add the given object to the distributed cache for this job
 *
 * @param serializable A Serializable object to add to the JobConf
 * @param job The JobConf
 */
public static <T extends Serializable> void setSerializableInCache(JobConf job, T serializable) {
    try {
        // TODO: MED /tmp should be changed by conf.getTempDir() or something
        Path workDir = new Path(
                String.format("/tmp/%s/%s/_join.temporary", job.getJobName(), System.currentTimeMillis()));
        Path tempPath = new Path(workDir, "serializable.dat");
        tempPath.getFileSystem(job).deleteOnExit(tempPath);
        job.set("serializables.file", tempPath.toUri().getPath());

        ObjectOutputStream objectStream = new ObjectOutputStream(tempPath.getFileSystem(job).create(tempPath));
        objectStream.writeObject(serializable);
        objectStream.close();

        DistributedCache.addCacheFile(new URI(tempPath.toUri().getPath() + "#" + tempPath.getName()), job);
    } catch (URISyntaxException e) {
        throw new RuntimeException(e);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}