List of usage examples for org.apache.hadoop.mapreduce Job setJobName
public void setJobName(String name) throws IllegalStateException
From source file:com.github.sandgorgon.parmr.Main.java
License:Open Source License
@Override public int run(String[] args) throws Exception { if (args.length < 2) { System.err.println("Usage: parmr <input file> <output path>"); return -1; }/*from w w w . j a v a2s. c om*/ Configuration conf = super.getConf(); conf.set("mapreduce.job.queuename", "prod"); Job job = Job.getInstance(conf); job.setJobName(jobName); job.setJarByClass(Main.class); // Parquet Schema // Read from the input file itself the schema that we will be assuming Path infile = new Path(args[0]); List<Footer> footers = ParquetFileReader.readFooters(conf, infile.getFileSystem(conf).getFileStatus(infile), true); MessageType schema = footers.get(0).getParquetMetadata().getFileMetaData().getSchema(); // Avro Schema // Convert the Parquet schema to an Avro schema AvroSchemaConverter avroSchemaConverter = new AvroSchemaConverter(); Schema avroSchema = avroSchemaConverter.convert(schema); // Set the Mapper job.setMapperClass(UserMapper.class); // This works for predicate pushdown on record assembly read. AvroParquetInputFormat.setUnboundRecordFilter(job, UserRecordFilter.class); AvroParquetInputFormat.addInputPath(job, new Path(args[0])); AvroParquetInputFormat.setAvroReadSchema(job, avroSchema); job.setInputFormatClass(AvroParquetInputFormat.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(Text.class); // If you needed to return an avro object from the mapper, refer to this... //job.setMapOutputValueClass(AvroValue.class); //AvroJob.setMapOutputValueSchema(job, avroSchema); // Reducer job.setReducerClass(UserReducer.class); // Output job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); FileOutputFormat.setOutputPath(job, new Path(args[1])); // If we need to return an avro class again, refer to this... //job.setOutputFormatClass(AvroParquetOutputFormat.class); //AvroParquetOutputFormat.setOutputPath(job, new Path(args[1])); //AvroParquetOutputFormat.setSchema(job, avroSchema); //job.setOutputKeyClass(Void.class); //job.setOutputValueClass(GenericRecord.class); // Rough way of testing the projection side of things. AvroParquetInputFormat.setRequestedProjection(job, Schema.parse("{\"namespace\": \"com.github.sandgorgon.parmr.avro\",\n" + " \"type\": \"record\",\n" + " \"name\": \"User\",\n" + " \"fields\": [\n" + " {\"name\": \"name\", \"type\": \"string\"},\n" + " {\"name\": \"favorite_number\", \"type\": [\"int\", \"null\"]}\n" + // " {\"name\": \"favorite_color\", \"type\": [\"string\", \"null\"]}\n" + " ]\n" + "}\n" + "")); // Do the deed! int completion = job.waitForCompletion(true) ? 0 : 1; return completion; }
From source file:com.google.cloud.bigtable.mapreduce.Export.java
License:Apache License
/** * Sets up the actual job./*from w ww . j a va 2s. c om*/ * * @param conf The current configuration. * @param args The command line parameters. * @return The newly created job. * @throws java.io.IOException When setting up the job fails. */ public static Job createSubmittableJob(Configuration conf, String[] args) throws IOException { conf.setIfUnset("hbase.client.connection.impl", BigtableConfiguration.getConnectionClass().getName()); conf.setIfUnset(BigtableOptionsFactory.BIGTABLE_RPC_TIMEOUT_MS_KEY, "60000"); conf.setBoolean(TableInputFormat.SHUFFLE_MAPS, true); String tableName = args[0]; Path outputDir = new Path(args[1]); Job job = Job.getInstance(conf, NAME + "_" + tableName); job.setJobName(NAME + "_" + tableName); job.setJarByClass(Export.class); // Set optional scan parameters Scan s = getConfiguredScanForJob(conf, args); TableMapReduceUtil.initTableMapperJob(tableName, s, IdentityTableMapper.class, ImmutableBytesWritable.class, Result.class, job, false); // No reducers. Just write straight to output files. job.setNumReduceTasks(0); job.setOutputFormatClass(SequenceFileOutputFormat.class); job.setOutputKeyClass(ImmutableBytesWritable.class); job.setOutputValueClass(Result.class); FileOutputFormat.setOutputPath(job, outputDir); // job conf doesn't contain the conf so doesn't have a default fs. return job; }
From source file:com.gsinnovations.howdah.AbstractJob.java
License:Apache License
protected Job prepareJob(Path inputPath, Path outputPath, Class<? extends InputFormat> inputFormat, Class<? extends Mapper> mapper, Class<? extends Writable> mapperKey, Class<? extends Writable> mapperValue, Class<? extends Reducer> reducer, Class<? extends Writable> reducerKey, Class<? extends Writable> reducerValue, Class<? extends OutputFormat> outputFormat) throws IOException { Job job = new Job(new Configuration(getConf())); Configuration jobConf = job.getConfiguration(); if (reducer.equals(Reducer.class)) { if (mapper.equals(Mapper.class)) { throw new IllegalStateException("Can't figure out the user class jar file from mapper/reducer"); }//from w w w .jav a 2 s .co m job.setJarByClass(mapper); } else { job.setJarByClass(reducer); } job.setInputFormatClass(inputFormat); jobConf.set("mapred.input.dir", inputPath.toString()); job.setMapperClass(mapper); job.setMapOutputKeyClass(mapperKey); job.setMapOutputValueClass(mapperValue); jobConf.setBoolean("mapred.compress.map.output", true); job.setReducerClass(reducer); job.setOutputKeyClass(reducerKey); job.setOutputValueClass(reducerValue); job.setJobName(getCustomJobName(job, mapper, reducer)); job.setOutputFormatClass(outputFormat); jobConf.set("mapred.output.dir", outputPath.toString()); return job; }
From source file:com.gsvic.csmr.CSMRBase.java
License:Apache License
public static void generatePairs(String in, String out) throws IOException, InterruptedException, ClassNotFoundException { Configuration conf = new Configuration(); path = out;// w ww .j a va 2 s. c o m Job job; Path input, output; input = new Path(in); output = new Path(path + "/CSMRPairs"); job = new Job(conf); job.setJobName("CSMR Pairs Job"); job.setJarByClass(CSMRBase.class); FileInputFormat.addInputPath(job, input); FileOutputFormat.setOutputPath(job, output); job.setMapperClass(CSMRMapper.class); job.setReducerClass(CSMRReducer.class); job.setInputFormatClass(SequenceFileInputFormat.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); job.setMapOutputKeyClass(IntWritable.class); job.setMapOutputValueClass(DocumentWritable.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(VectorArrayWritable.class); job.waitForCompletion(true); }
From source file:com.gsvic.csmr.CSMRBase.java
License:Apache License
public static void StartCSMR() throws IOException, InterruptedException, ClassNotFoundException { Configuration conf = new Configuration(); Job job; job = new Job(conf); job.setJobName("CSMR Cosine Similarity Job"); job.setJarByClass(CSMRBase.class); FileInputFormat.addInputPath(job, new Path(path + "/CSMRPairs/part-r-00000")); FileOutputFormat.setOutputPath(job, new Path(path + "/Results")); job.setMapperClass(Mapper.class); job.setReducerClass(CosineSimilarityReducer.class); job.setInputFormatClass(SequenceFileInputFormat.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(VectorArrayWritable.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(DoubleWritable.class); System.exit(job.waitForCompletion(true) ? 1 : 0); }
From source file:com.iflytek.spider.crawl.CrawlDb.java
License:Apache License
public static Job createJob(Configuration config, Path crawlDb) throws IOException { Path newCrawlDb = new Path(crawlDb, Integer.toString(new Random().nextInt(Integer.MAX_VALUE))); Job job = AvroJob.getAvroJob(config); job.setJobName("crawldb " + crawlDb); Path current = new Path(crawlDb, CURRENT_NAME); if (FileSystem.get(config).exists(current)) { FileInputFormat.addInputPath(job, current); }//from w w w .j a v a 2s. c o m job.setInputFormatClass(AvroPairInputFormat.class); job.setMapperClass(CrawlDbFilter.class); job.setReducerClass(CrawlDbReducer.class); FileOutputFormat.setOutputPath(job, newCrawlDb); job.setOutputFormatClass(AvroMapOutputFormat.class); job.setOutputKeyClass(String.class); job.setOutputValueClass(CrawlDatum.class); return job; }
From source file:com.iflytek.spider.crawl.GeneratorSmart.java
License:Apache License
/** * Generate fetchlists in one or more segments. Whether to filter URLs or not * is read from the crawl.generate.filter property in the configuration files. * If the property is not found, the URLs are filtered. Same for the * normalisation./*from ww w .ja v a 2 s . c o m*/ * * @param dbDir * Crawl database directory * @param segments * Segments directory * @param numLists * Number of reduce tasks * @param curTime * Current time in milliseconds * * @return Path to generated segment or null if no entries were selected * * @throws IOException * When an I/O error occurs * @throws ClassNotFoundException * @throws InterruptedException */ public Path[] generate(Path dbDir, Path segments, int numLists, long curTime, boolean force) throws IOException, InterruptedException, ClassNotFoundException { //getConf().set("mapred.temp.dir", "d:/tmp"); Path tempDir = new Path( getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis()); Path lock = new Path(dbDir, CrawlDb.LOCK_NAME); FileSystem fs = FileSystem.get(getConf()); LockUtil.createLockFile(fs, lock, force); LOG.info("Generator: Selecting best-scoring urls due for fetch."); LOG.info("Generator: starting"); Job job = AvroJob.getAvroJob(getConf()); if (numLists == -1) { // for politeness make numLists = job.getNumReduceTasks(); // a partition per fetch task } if ("local".equals(job.getConfiguration().get("mapred.job.tracker")) && numLists != 1) { // override LOG.info("Generator: jobtracker is 'local', generating exactly one partition."); numLists = 1; } LOG.info("Generator: with " + numLists + " partition."); job.getConfiguration().setLong(GENERATOR_CUR_TIME, curTime); // record real generation time long generateTime = System.currentTimeMillis(); job.getConfiguration().setLong(Spider.GENERATE_TIME_KEY, generateTime); FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME)); job.setInputFormatClass(AvroPairInputFormat.class); job.setMapperClass(SelectorMapper.class); job.setReducerClass(SelectorReducer.class); FileOutputFormat.setOutputPath(job, tempDir); //job.setOutputFormatClass(AvroPairOutputFormat.class); job.setOutputFormatClass(GeneratorOutputFormat.class); job.setOutputKeyClass(Float.class); job.setOutputValueClass(SelectorEntry.class); // AvroMultipleOutputs.addNamedOutput(job, "seq", // AvroPairOutputFormat.class, Float.class, SelectorEntry.class); try { job.waitForCompletion(true); } catch (IOException e) { e.printStackTrace(); return null; } // read the subdirectories generated in the temp // output and turn them into segments List<Path> generatedSegments = new ArrayList<Path>(); FileStatus[] status = fs.listStatus(tempDir); try { for (FileStatus stat : status) { Path subfetchlist = stat.getPath(); if (!subfetchlist.getName().startsWith("fetchlist-")) continue; // start a new partition job for this segment Path newSeg = partitionSegment(fs, segments, subfetchlist, numLists); fs.createNewFile(new Path(newSeg, "generatored")); generatedSegments.add(newSeg); } } catch (Exception e) { LOG.warn("Generator: exception while partitioning segments, exiting ..."); fs.delete(tempDir, true); return null; } if (generatedSegments.size() == 0) { LOG.warn("Generator: 0 records selected for fetching, exiting ..."); LockUtil.removeLockFile(fs, lock); fs.delete(tempDir, true); return null; } if (getConf().getBoolean(GENERATE_UPDATE_CRAWLDB, false)) { // update the db from tempDir Path tempDir2 = new Path( getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis()); job = AvroJob.getAvroJob(getConf()); job.setJobName("generate: updatedb " + dbDir); job.getConfiguration().setLong(Spider.GENERATE_TIME_KEY, generateTime); for (Path segmpaths : generatedSegments) { Path subGenDir = new Path(segmpaths, CrawlDatum.GENERATE_DIR_NAME); FileInputFormat.addInputPath(job, subGenDir); } FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME)); job.setInputFormatClass(AvroPairInputFormat.class); job.setMapperClass(CrawlDbUpdateMapper.class); // job.setReducerClass(CrawlDbUpdater.class); job.setOutputFormatClass(AvroMapOutputFormat.class); job.setOutputKeyClass(String.class); job.setOutputValueClass(CrawlDatum.class); FileOutputFormat.setOutputPath(job, tempDir2); try { job.waitForCompletion(true); CrawlDb.install(job, dbDir); } catch (IOException e) { LockUtil.removeLockFile(fs, lock); fs.delete(tempDir, true); fs.delete(tempDir2, true); throw e; } fs.delete(tempDir2, true); } LockUtil.removeLockFile(fs, lock); fs.delete(tempDir, true); if (LOG.isInfoEnabled()) { LOG.info("Generator: done."); } Path[] patharray = new Path[generatedSegments.size()]; return generatedSegments.toArray(patharray); }
From source file:com.iflytek.spider.crawl.GeneratorSmart.java
License:Apache License
private Path partitionSegment(FileSystem fs, Path segmentsDir, Path inputDir, int numLists) throws IOException, InterruptedException, ClassNotFoundException { // invert again, partition by host/domain/IP, sort by url hash if (LOG.isInfoEnabled()) { LOG.info("Generator: Partitioning selected urls for politeness:" + inputDir); }//from w w w . j a v a 2 s. c om Path segment = new Path(segmentsDir, generateSegmentName()); Path output = new Path(segment, CrawlDatum.GENERATE_DIR_NAME); LOG.info("Generator: segment: " + segment + " with " + numLists + " Fetchers"); Job job = AvroJob.getAvroJob(getConf()); job.setJobName("generate: partition " + segment); job.getConfiguration().setInt("partition.url.seed", new Random().nextInt()); FileInputFormat.addInputPath(job, inputDir); job.setInputFormatClass(AvroPairInputFormat.class); job.setMapperClass(SelectorInverseMapper.class); job.setPartitionerClass(AveragePartition.class); job.setMapOutputKeyClass(String.class); job.setMapOutputValueClass(SelectorEntry.class); job.setReducerClass(PartitionReducer.class); job.setNumReduceTasks(numLists); FileOutputFormat.setOutputPath(job, output); job.setOutputFormatClass(AvroPairOutputFormat.class); job.setOutputKeyClass(String.class); job.setOutputValueClass(CrawlDatum.class); job.waitForCompletion(true); return segment; }
From source file:com.iflytek.spider.parse.ParseSegment.java
License:Apache License
public void parse(Path segment) throws IOException, InterruptedException, ClassNotFoundException { if (LOG.isInfoEnabled()) { LOG.info("Parse: starting"); LOG.info("Parse: segment: " + segment); }//from ww w . jav a2 s . co m Job job = AvroJob.getAvroJob(getConf()); job.setJobName("parse " + segment); FileInputFormat.addInputPath(job, new Path(segment, Content.DIR_NAME)); job.getConfiguration().set(Spider.SEGMENT_NAME_KEY, segment.getName()); job.setInputFormatClass(AvroPairInputFormat.class); job.setMapperClass(ParseMapper.class); FileOutputFormat.setOutputPath(job, segment); job.setOutputFormatClass(ParseOutputFormat.class); job.setOutputKeyClass(String.class); job.setOutputValueClass(UnionData.class); job.waitForCompletion(true); if (LOG.isInfoEnabled()) { LOG.info("Parse: done"); } }
From source file:com.igalia.wordcount.WordCount.java
License:Open Source License
public int run(String[] arg0) throws Exception { Job job = new Job(getConf()); job.setJarByClass(WordCount.class); job.setJobName("wordcount"); job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); job.setMapperClass(MapClass.class); job.setReducerClass(Reduce.class); FileInputFormat.setInputPaths(job, new Path("/tmp/wordcount/in")); FileOutputFormat.setOutputPath(job, new Path("/tmp/wordcount/out")); boolean success = job.waitForCompletion(true); return success ? 0 : 1; }