List of usage examples for org.apache.hadoop.mapreduce Job getConfiguration
public Configuration getConfiguration()
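All of the examples below share one pattern: call Job#getConfiguration() to obtain the live, job-scoped Configuration and read or set properties on it before submission. A minimal sketch (the property name my.custom.key is purely illustrative):

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.mapreduce.Job;

    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "getConfiguration-example");
    // getConfiguration() returns the Configuration backing this job;
    // properties set here are serialized out to the job's tasks.
    job.getConfiguration().set("my.custom.key", "some-value"); // illustrative key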
From source file:com.hadoop.mapreduce.TestLzoTextInputFormat.java
License:Open Source License
/**
 * Generate random data, compress it, index and md5 hash the data.
 * Then read it all back and md5 that too, to verify that it all went ok.
 *
 * @param testWithIndex Should we index or not?
 * @param charsToOutput How many characters of random data should we output.
 * @throws IOException
 * @throws NoSuchAlgorithmException
 * @throws InterruptedException
 */
private void runTest(boolean testWithIndex, int charsToOutput)
        throws IOException, NoSuchAlgorithmException, InterruptedException {
    if (!GPLNativeCodeLoader.isNativeCodeLoaded()) {
        LOG.warn("Cannot run this test without the native lzo libraries");
        return;
    }

    Configuration conf = new Configuration();
    conf.setLong("fs.local.block.size", charsToOutput / 2);
    // reducing block size to force a split of the tiny file
    conf.set("io.compression.codecs", LzopCodec.class.getName());

    FileSystem localFs = FileSystem.getLocal(conf);
    localFs.delete(outputDir, true);
    localFs.mkdirs(outputDir);

    Job job = new Job(conf);
    TextOutputFormat.setCompressOutput(job, true);
    TextOutputFormat.setOutputCompressorClass(job, LzopCodec.class);
    TextOutputFormat.setOutputPath(job, outputDir);

    TaskAttemptContext attemptContext = new TaskAttemptContextImpl(job.getConfiguration(),
            new TaskAttemptID("123", 0, TaskType.REDUCE, 1, 2));

    // create some input data
    byte[] expectedMd5 = createTestInput(outputDir, localFs, attemptContext, charsToOutput);

    if (testWithIndex) {
        Path lzoFile = new Path(outputDir, lzoFileName);
        LzoTextInputFormat.createIndex(localFs, lzoFile);
    }

    LzoTextInputFormat inputFormat = new LzoTextInputFormat();
    TextInputFormat.setInputPaths(job, outputDir);

    List<InputSplit> is = inputFormat.getSplits(job);
    // verify we have the right number of lzo chunks
    if (testWithIndex && OUTPUT_BIG == charsToOutput) {
        assertEquals(3, is.size());
    } else {
        assertEquals(1, is.size());
    }

    // let's read it all and calculate the md5 hash
    for (InputSplit inputSplit : is) {
        RecordReader<LongWritable, Text> rr = inputFormat.createRecordReader(inputSplit, attemptContext);
        rr.initialize(inputSplit, attemptContext);
        while (rr.nextKeyValue()) {
            Text value = rr.getCurrentValue();
            md5.update(value.getBytes(), 0, value.getLength());
        }
        rr.close();
    }

    localFs.close();
    assertTrue(Arrays.equals(expectedMd5, md5.digest()));
}
From source file:com.hhscyber.nl.tweets.svm.test.Tester.java
/**
 * @param args the command line arguments
 * @throws java.io.IOException
 */
public static void main(String[] args) throws IOException, Exception {
    Conf conf = new Conf(args, "");
    Job job = new HBJob(conf, "TweetsSVMTester");
    job.setJarByClass(Tester.class);

    Scan scan = new Scan();
    TableMapReduceUtil.initTableMapperJob("hhscyber:tweets_lang", scan, TestMapper.class,
            ImmutableBytesWritable.class, Put.class, job);
    job.setOutputFormatClass(MultiTableOutputFormat.class);

    job.setReducerClass(TestReducer.class);
    job.setNumReduceTasks(2);

    TableMapReduceUtil.addDependencyJars(job);
    TableMapReduceUtil.addDependencyJars(job.getConfiguration());

    job.waitForCompletion(true);
}
From source file:com.hortonworks.pso.data.generator.mapreduce.DataGenTool.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Job job = Job.getInstance(getConf()); // new Job(conf, this.getClass().getCanonicalName());
    // Configuration conf = getConf();

    int mappers = 2;
    String output = null;
    String config = null;
    long count = 100;

    List<String> otherArgs = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-mappers".equals(args[i])) {
                mappers = Integer.parseInt(args[++i]);
                otherArgs.add("-Dmapreduce.job.maps=" + Integer.toString(mappers));
            } else if ("-output".equals(args[i])) {
                output = args[++i];
            } else if ("-json.cfg".equals(args[i])) {
                config = args[++i];
            } else if ("-count".equals(args[i])) {
                count = Long.parseLong(args[++i]);
            } else {
                otherArgs.add(args[i]);
            }
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage(); // exits
        }
    }

    job.getConfiguration().set("json.cfg", config);

    String[] altArgs = new String[otherArgs.size()];
    otherArgs.toArray(altArgs);
    // side effect: applies any remaining generic options (e.g. -D) to the job configuration
    GenericOptionsParser gop = new GenericOptionsParser(job.getConfiguration(), altArgs);

    DataGenInputFormat.setNumberOfRows(job, count);

    job.setJarByClass(DataGenTool.class);

    Path output_path = new Path(output);
    if (output_path.getFileSystem(getConf()).exists(output_path)) {
        throw new IOException("Output directory " + output_path + " already exists.");
    }
    FileOutputFormat.setOutputPath(job, output_path);

    job.setMapperClass(DataGenMapper.class);
    // Map Only Job
    job.setNumReduceTasks(0);
    // job.setReducerClass(RerateReducer.class);

    job.setInputFormatClass(DataGenInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(NullWritable.class);
    job.setMapOutputValueClass(Text.class);
    // job.setOutputKeyClass(Text.class);
    // job.setOutputValueClass(Text.class);

    return job.waitForCompletion(true) ? 0 : 1;
}
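A hypothetical invocation of this tool (the jar name and paths are illustrative, not taken from the project):

    hadoop jar datagen.jar com.hortonworks.pso.data.generator.mapreduce.DataGenTool \
        -mappers 4 -json.cfg /path/to/gen.json -count 1000000 -output /tmp/datagen-out

Note that -output and -json.cfg are effectively required: run() passes both straight into new Path(output) and conf.set("json.cfg", config), so job setup fails if either is omitted.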
From source file:com.iflytek.spider.crawl.CrawlDb.java
License:Apache License
public void update(Path crawlDb, Path[] segments, boolean additionsAllowed, boolean force)
        throws IOException, InterruptedException, ClassNotFoundException {
    FileSystem fs = FileSystem.get(getConf());
    Path lock = new Path(crawlDb, LOCK_NAME);
    LockUtil.createLockFile(fs, lock, force);
    if (LOG.isInfoEnabled()) {
        LOG.info("CrawlDb update: starting");
        LOG.info("CrawlDb update: db: " + crawlDb);
        LOG.info("CrawlDb update: segments: " + Arrays.asList(segments));
        LOG.info("CrawlDb update: additions allowed: " + additionsAllowed);
    }

    Job job = CrawlDb.createJob(getConf(), crawlDb);
    job.getConfiguration().setBoolean(CRAWLDB_ADDITIONS_ALLOWED, additionsAllowed);
    for (int i = 0; i < segments.length; i++) {
        Path fetch = new Path(segments[i], CrawlDatum.FETCH_DIR_NAME);
        Path parse = new Path(segments[i], CrawlDatum.PARSE_DIR_NAME);
        if (fs.exists(fetch)) {
            FileInputFormat.addInputPath(job, fetch);
        }
        if (fs.exists(parse)) {
            FileInputFormat.addInputPath(job, parse);
        } else {
            LOG.info(" - skipping invalid segment " + segments[i]);
        }
    }

    if (LOG.isInfoEnabled()) {
        LOG.info("CrawlDb update: Merging segment data into db.");
    }
    try {
        job.waitForCompletion(true);
    } catch (IOException | InterruptedException | ClassNotFoundException e) {
        // identical cleanup for all three checked exceptions
        LockUtil.removeLockFile(fs, lock);
        Path outPath = FileOutputFormat.getOutputPath(job);
        if (fs.exists(outPath))
            fs.delete(outPath, true);
        throw e;
    }

    CrawlDb.install(job, crawlDb);
    if (LOG.isInfoEnabled()) {
        LOG.info("CrawlDb update: done");
    }
}
From source file:com.iflytek.spider.crawl.CrawlDb.java
License:Apache License
public static void install(Job job, Path crawlDb) throws IOException {
    Path newCrawlDb = FileOutputFormat.getOutputPath(job);
    FileSystem fs = FileSystem.get(job.getConfiguration());
    Path old = new Path(crawlDb, "old");
    Path current = new Path(crawlDb, CURRENT_NAME);
    if (fs.exists(current)) {
        if (fs.exists(old))
            fs.delete(old, true);
        fs.rename(current, old);
    }
    fs.mkdirs(crawlDb);
    fs.rename(newCrawlDb, current);
    if (fs.exists(old))
        fs.delete(old, true);
    Path lock = new Path(crawlDb, LOCK_NAME);
    LockUtil.removeLockFile(fs, lock);
}
From source file:com.iflytek.spider.crawl.GeneratorSmart.java
License:Apache License
/**
 * Generate fetchlists in one or more segments. Whether to filter URLs or not
 * is read from the crawl.generate.filter property in the configuration files.
 * If the property is not found, the URLs are filtered. Same for the
 * normalisation.
 *
 * @param dbDir
 *          Crawl database directory
 * @param segments
 *          Segments directory
 * @param numLists
 *          Number of reduce tasks
 * @param curTime
 *          Current time in milliseconds
 *
 * @return Path to generated segment or null if no entries were selected
 *
 * @throws IOException
 *           When an I/O error occurs
 * @throws ClassNotFoundException
 * @throws InterruptedException
 */
public Path[] generate(Path dbDir, Path segments, int numLists, long curTime, boolean force)
        throws IOException, InterruptedException, ClassNotFoundException {
    //getConf().set("mapred.temp.dir", "d:/tmp");
    Path tempDir = new Path(
            getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());

    Path lock = new Path(dbDir, CrawlDb.LOCK_NAME);
    FileSystem fs = FileSystem.get(getConf());
    LockUtil.createLockFile(fs, lock, force);

    LOG.info("Generator: Selecting best-scoring urls due for fetch.");
    LOG.info("Generator: starting");

    Job job = AvroJob.getAvroJob(getConf());
    if (numLists == -1) { // for politeness make
        numLists = job.getNumReduceTasks(); // a partition per fetch task
    }
    if ("local".equals(job.getConfiguration().get("mapred.job.tracker")) && numLists != 1) {
        // override
        LOG.info("Generator: jobtracker is 'local', generating exactly one partition.");
        numLists = 1;
    }
    LOG.info("Generator: with " + numLists + " partition.");
    job.getConfiguration().setLong(GENERATOR_CUR_TIME, curTime);
    // record real generation time
    long generateTime = System.currentTimeMillis();
    job.getConfiguration().setLong(Spider.GENERATE_TIME_KEY, generateTime);

    FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
    job.setInputFormatClass(AvroPairInputFormat.class);

    job.setMapperClass(SelectorMapper.class);
    job.setReducerClass(SelectorReducer.class);

    FileOutputFormat.setOutputPath(job, tempDir);
    //job.setOutputFormatClass(AvroPairOutputFormat.class);
    job.setOutputFormatClass(GeneratorOutputFormat.class);
    job.setOutputKeyClass(Float.class);
    job.setOutputValueClass(SelectorEntry.class);
    // AvroMultipleOutputs.addNamedOutput(job, "seq",
    //     AvroPairOutputFormat.class, Float.class, SelectorEntry.class);
    try {
        job.waitForCompletion(true);
    } catch (IOException e) {
        e.printStackTrace();
        return null;
    }

    // read the subdirectories generated in the temp
    // output and turn them into segments
    List<Path> generatedSegments = new ArrayList<Path>();

    FileStatus[] status = fs.listStatus(tempDir);
    try {
        for (FileStatus stat : status) {
            Path subfetchlist = stat.getPath();
            if (!subfetchlist.getName().startsWith("fetchlist-"))
                continue;
            // start a new partition job for this segment
            Path newSeg = partitionSegment(fs, segments, subfetchlist, numLists);

            fs.createNewFile(new Path(newSeg, "generatored"));
            generatedSegments.add(newSeg);
        }
    } catch (Exception e) {
        LOG.warn("Generator: exception while partitioning segments, exiting ...");
        fs.delete(tempDir, true);
        return null;
    }

    if (generatedSegments.size() == 0) {
        LOG.warn("Generator: 0 records selected for fetching, exiting ...");
        LockUtil.removeLockFile(fs, lock);
        fs.delete(tempDir, true);
        return null;
    }

    if (getConf().getBoolean(GENERATE_UPDATE_CRAWLDB, false)) {
        // update the db from tempDir
        Path tempDir2 = new Path(
                getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());

        job = AvroJob.getAvroJob(getConf());
        job.setJobName("generate: updatedb " + dbDir);
        job.getConfiguration().setLong(Spider.GENERATE_TIME_KEY, generateTime);
        for (Path segmpaths : generatedSegments) {
            Path subGenDir = new Path(segmpaths, CrawlDatum.GENERATE_DIR_NAME);
            FileInputFormat.addInputPath(job, subGenDir);
        }
        FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
        job.setInputFormatClass(AvroPairInputFormat.class);
        job.setMapperClass(CrawlDbUpdateMapper.class);
        // job.setReducerClass(CrawlDbUpdater.class);
        job.setOutputFormatClass(AvroMapOutputFormat.class);
        job.setOutputKeyClass(String.class);
        job.setOutputValueClass(CrawlDatum.class);
        FileOutputFormat.setOutputPath(job, tempDir2);
        try {
            job.waitForCompletion(true);
            CrawlDb.install(job, dbDir);
        } catch (IOException e) {
            LockUtil.removeLockFile(fs, lock);
            fs.delete(tempDir, true);
            fs.delete(tempDir2, true);
            throw e;
        }
        fs.delete(tempDir2, true);
    }

    LockUtil.removeLockFile(fs, lock);
    fs.delete(tempDir, true);

    if (LOG.isInfoEnabled()) {
        LOG.info("Generator: done.");
    }
    Path[] patharray = new Path[generatedSegments.size()];
    return generatedSegments.toArray(patharray);
}
From source file:com.iflytek.spider.crawl.GeneratorSmart.java
License:Apache License
private Path partitionSegment(FileSystem fs, Path segmentsDir, Path inputDir, int numLists)
        throws IOException, InterruptedException, ClassNotFoundException {
    // invert again, partition by host/domain/IP, sort by url hash
    if (LOG.isInfoEnabled()) {
        LOG.info("Generator: Partitioning selected urls for politeness:" + inputDir);
    }
    Path segment = new Path(segmentsDir, generateSegmentName());
    Path output = new Path(segment, CrawlDatum.GENERATE_DIR_NAME);

    LOG.info("Generator: segment: " + segment + " with " + numLists + " Fetchers");

    Job job = AvroJob.getAvroJob(getConf());
    job.setJobName("generate: partition " + segment);
    job.getConfiguration().setInt("partition.url.seed", new Random().nextInt());

    FileInputFormat.addInputPath(job, inputDir);
    job.setInputFormatClass(AvroPairInputFormat.class);

    job.setMapperClass(SelectorInverseMapper.class);
    job.setPartitionerClass(AveragePartition.class);
    job.setMapOutputKeyClass(String.class);
    job.setMapOutputValueClass(SelectorEntry.class);
    job.setReducerClass(PartitionReducer.class);
    job.setNumReduceTasks(numLists);

    FileOutputFormat.setOutputPath(job, output);
    job.setOutputFormatClass(AvroPairOutputFormat.class);
    job.setOutputKeyClass(String.class);
    job.setOutputValueClass(CrawlDatum.class);

    job.waitForCompletion(true);
    return segment;
}
From source file:com.iflytek.spider.parse.ParseSegment.java
License:Apache License
public void parse(Path segment) throws IOException, InterruptedException, ClassNotFoundException {
    if (LOG.isInfoEnabled()) {
        LOG.info("Parse: starting");
        LOG.info("Parse: segment: " + segment);
    }

    Job job = AvroJob.getAvroJob(getConf());
    job.setJobName("parse " + segment);

    FileInputFormat.addInputPath(job, new Path(segment, Content.DIR_NAME));
    job.getConfiguration().set(Spider.SEGMENT_NAME_KEY, segment.getName());

    job.setInputFormatClass(AvroPairInputFormat.class);
    job.setMapperClass(ParseMapper.class);

    FileOutputFormat.setOutputPath(job, segment);
    job.setOutputFormatClass(ParseOutputFormat.class);
    job.setOutputKeyClass(String.class);
    job.setOutputValueClass(UnionData.class);

    job.waitForCompletion(true);
    if (LOG.isInfoEnabled()) {
        LOG.info("Parse: done");
    }
}
From source file:com.ikanow.aleph2.analytics.hadoop.assets.Aleph2MultiInputFormatBuilder.java
License:Apache License
/** Sets the per-input configurations in the job
 * @param job
 */
public Job build(final Job job) {
    job.getConfiguration().set(ALEPH2_MULTI_INPUT_FORMAT_JOBS,
            _inputs.keySet().stream().collect(Collectors.joining(",")));
    _inputs.entrySet().stream().forEach(Lambdas.wrap_consumer_u(kv -> {
        try (final Stringifier<Configuration> stringifier = new DefaultStringifier<Configuration>(
                job.getConfiguration(), Configuration.class)) {
            final Configuration new_config = new Configuration(kv.getValue().getConfiguration());
            new_config.set(ALEPH2_MULTI_INPUT_FORMAT_CLAZZ, kv.getValue().getInputFormatClass().getName());
            job.getConfiguration().set(ALEPH2_MULTI_INPUT_FORMAT_PREFIX + kv.getKey(),
                    stringifier.toString(new_config));
        }
    }));
    job.setInputFormatClass(Aleph2MultiInputFormat.class);
    return job;
}
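The builder packs each per-input Configuration into a single string property via Hadoop's DefaultStringifier. A minimal sketch of the reverse lookup, as a consuming input format might perform it (the name variable is illustrative; the actual behavior of Aleph2MultiInputFormat is not shown in this listing):

    // Recover one stringified Configuration written by build() above.
    Stringifier<Configuration> stringifier =
            new DefaultStringifier<Configuration>(job.getConfiguration(), Configuration.class);
    String packed = job.getConfiguration().get(ALEPH2_MULTI_INPUT_FORMAT_PREFIX + name); // 'name' is illustrative
    Configuration recovered = stringifier.fromString(packed); // throws IOException on bad input
    String delegateClazz = recovered.get(ALEPH2_MULTI_INPUT_FORMAT_CLAZZ);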
From source file:com.ikanow.aleph2.analytics.hadoop.assets.UpdatedFileInputFormat.java
License:Apache License
/**
 * @param job
 *          the job to modify
 * @param inputDirRecursive
 */
public static void setInputDirRecursive(Job job, boolean inputDirRecursive) {
    job.getConfiguration().setBoolean(INPUT_DIR_RECURSIVE, inputDirRecursive);
}
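The setter just flips a boolean in the job's Configuration; the matching read on the input-format side would look like this sketch (defaulting to false is an assumption that mirrors Hadoop's stock FileInputFormat):

    // Reading the flag back inside the input format (sketch).
    public static boolean getInputDirRecursive(JobContext job) {
        // assumed default: non-recursive listing, as in stock FileInputFormat
        return job.getConfiguration().getBoolean(INPUT_DIR_RECURSIVE, false);
    }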