List of usage examples for org.apache.hadoop.mapred JobConf setLong
public void setLong(String name, long value)
Sets the value of the name property to a long.

Parameters: name is the property name; value is the long value to store.
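Before the per-project examples, here is a minimal standalone sketch of the call; the property key "my.app.threshold" and the values are illustrative, not taken from any source file below. setLong stores the value as a string under the given key, and getLong reads it back, falling back to the supplied default when the key is unset.

import org.apache.hadoop.mapred.JobConf;

public class SetLongSketch {
    public static void main(String[] args) {
        JobConf job = new JobConf();

        // Store a long-valued property; it is kept internally as a string.
        job.setLong("my.app.threshold", 1024L);

        // Read it back; the second argument is the default returned when the key is unset.
        long threshold = job.getLong("my.app.threshold", 0L);
        System.out.println("threshold = " + threshold); // prints: threshold = 1024
    }
}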
From source file: com.rim.logdriver.mapred.avro.AvroBlockInputFormat.java
License: Apache License

@Override
public RecordReader<AvroFileHeader, BytesWritable> getRecordReader(InputSplit split, JobConf job,
        Reporter reporter) throws IOException {
    // Ensure we have sensible defaults for how we build blocks.
    if (job.get("mapreduce.job.max.split.locations") == null) {
        job.setLong("mapreduce.job.max.split.locations", MAX_SPLIT_LOCATIONS);
    }
    if (job.get("mapred.max.split.size") == null) {
        // Try to set the split size to the default block size. In case of
        // failure, we'll use this 128MB default.
        long blockSize = 128 * 1024 * 1024; // 128MB
        try {
            blockSize = FileSystem.get(job).getDefaultBlockSize();
        } catch (IOException e) {
            LOG.error("Error getting filesystem to get default block size (this does not bode well).");
        }
        job.setLong("mapred.max.split.size", blockSize);
    }
    return new AvroBlockRecordReader(split, job);
}
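This example, and the next one, use a check-then-set idiom: a property is written only when the job does not already define it. As an aside, Configuration (which JobConf extends) offers setIfUnset for the same guard; the sketch below is our own condensed illustration, not from the source above. The helper name is hypothetical, and the key and 128MB fallback mirror the example. Note that setIfUnset takes string values, so the long must be converted explicitly.

import org.apache.hadoop.mapred.JobConf;

public class SplitSizeDefaults {
    // Hypothetical helper: apply the same 128MB fallback as the example above,
    // but only when the property is not already set.
    static void applySplitSizeDefault(JobConf job) {
        long fallback = 128L * 1024 * 1024; // 128MB
        // setIfUnset accepts strings only, hence the explicit conversion.
        job.setIfUnset("mapred.max.split.size", Long.toString(fallback));
    }
}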
From source file: com.rim.logdriver.mapred.boom.BoomInputFormat.java
License: Apache License

@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    // Ensure we have sensible defaults for how we build blocks.
    if (job.get("mapreduce.job.max.split.locations") == null) {
        job.setLong("mapreduce.job.max.split.locations", MAX_SPLIT_LOCATIONS);
    }
    if (job.get("mapred.max.split.size") == null) {
        // Try to set the split size to the default block size. In case of
        // failure, we'll use this 128MB default.
        long blockSize = 128 * 1024 * 1024; // 128MB
        try {
            blockSize = FileSystem.get(job).getDefaultBlockSize();
        } catch (IOException e) {
            LOG.error("Error getting filesystem to get default block size (this does not bode well).");
        }
        job.setLong("mapred.max.split.size", blockSize);
    }
    return super.getSplits(job, numSplits);
}
From source file: com.scaleoutsoftware.soss.hserver.hadoop.MapperWrapperMapred.java
License: Apache License

/**
 * Update the job with details about the file split.
 *
 * @param job        the job configuration to update
 * @param inputSplit the file split
 */
private void updateJobWithSplit(final JobConf job, Object inputSplit) {
    if (inputSplit instanceof FileSplit) {
        FileSplit fileSplit = (FileSplit) inputSplit;
        try {
            if (fileSplit.getPath() != null) {
                job.set("mapreduce.map.input.file", fileSplit.getPath().toString());
            }
        } catch (IllegalArgumentException e) {
            // Swallow this, it appears in Hive splits, which do not have the path encoded
            // (storage handler for NamedMap is an example).
        }
        job.setLong("mapreduce.map.input.start", fileSplit.getStart());
        job.setLong("mapreduce.map.input.length", fileSplit.getLength());
    }
    LOG.info("Processing split: " + inputSplit);
}
From source file: com.TCG.Nutch_DNS.Generator.java
License: Apache License

/**
 * Generate fetchlists in one or more segments. Whether to filter URLs or not
 * is read from the crawl.generate.filter property in the configuration files.
 * If the property is not found, the URLs are filtered. Same for the
 * normalisation.
 *
 * @param dbDir
 *          Crawl database directory
 * @param segments
 *          Segments directory
 * @param numLists
 *          Number of reduce tasks
 * @param topN
 *          Number of top URLs to be selected
 * @param curTime
 *          Current time in milliseconds
 *
 * @return Path to generated segment or null if no entries were selected
 *
 * @throws IOException
 *           When an I/O error occurs
 */
public Path[] generate(Path dbDir, Path segments, int numLists, long topN, long curTime, boolean filter,
        boolean norm, boolean force, int maxNumSegments) throws IOException {

    Path tempDir = new Path(
            getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + UUID.randomUUID().toString());

    Path lock = new Path(dbDir, CrawlDb.LOCK_NAME);
    FileSystem fs = FileSystem.get(getConf());
    LockUtil.createLockFile(fs, lock, force);

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("Generator: starting at " + sdf.format(start));
    LOG.info("Generator: Selecting best-scoring urls due for fetch.");
    LOG.info("Generator: filtering: " + filter);
    LOG.info("Generator: normalizing: " + norm);
    if (topN != Long.MAX_VALUE) {
        LOG.info("Generator: topN: " + topN);
    }

    // map to inverted subset due for fetch, sort by score
    JobConf job = new NutchJob(getConf());
    job.setJobName("generate: select from " + dbDir);

    if (numLists == -1) { // for politeness make
        numLists = job.getNumMapTasks(); // a partition per fetch task
    }
    if ("local".equals(job.get("mapred.job.tracker")) && numLists != 1) {
        // override
        LOG.info("Generator: jobtracker is 'local', generating exactly one partition.");
        numLists = 1;
    }
    job.setLong(GENERATOR_CUR_TIME, curTime);
    // record real generation time
    long generateTime = System.currentTimeMillis();
    job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
    job.setLong(GENERATOR_TOP_N, topN);
    job.setBoolean(GENERATOR_FILTER, filter);
    job.setBoolean(GENERATOR_NORMALISE, norm);
    job.setInt(GENERATOR_MAX_NUM_SEGMENTS, maxNumSegments);

    FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
    job.setInputFormat(SequenceFileInputFormat.class);

    job.setMapperClass(Selector.class);
    job.setPartitionerClass(Selector.class);
    job.setReducerClass(Selector.class);

    FileOutputFormat.setOutputPath(job, tempDir);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(FloatWritable.class);
    job.setOutputKeyComparatorClass(DecreasingFloatComparator.class);
    job.setOutputValueClass(SelectorEntry.class);
    job.setOutputFormat(GeneratorOutputFormat.class);

    try {
        JobClient.runJob(job);
    } catch (IOException e) {
        LockUtil.removeLockFile(fs, lock);
        fs.delete(tempDir, true);
        throw e;
    }

    // read the subdirectories generated in the temp
    // output and turn them into segments
    List<Path> generatedSegments = new ArrayList<Path>();

    FileStatus[] status = fs.listStatus(tempDir);
    try {
        for (FileStatus stat : status) {
            Path subfetchlist = stat.getPath();
            if (!subfetchlist.getName().startsWith("fetchlist-"))
                continue;
            // start a new partition job for this segment
            Path newSeg = partitionSegment(fs, segments, subfetchlist, numLists);
            generatedSegments.add(newSeg);
        }
    } catch (Exception e) {
        LOG.warn("Generator: exception while partitioning segments, exiting ...");
        fs.delete(tempDir, true);
        return null;
    }

    if (generatedSegments.size() == 0) {
        LOG.warn("Generator: 0 records selected for fetching, exiting ...");
        LockUtil.removeLockFile(fs, lock);
        fs.delete(tempDir, true);
        return null;
    }

    if (getConf().getBoolean(GENERATE_UPDATE_CRAWLDB, false)) {
        // update the db from tempDir
        Path tempDir2 = new Path(
                getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + UUID.randomUUID().toString());

        job = new NutchJob(getConf());
        job.setJobName("generate: updatedb " + dbDir);
        job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
        for (Path segmpaths : generatedSegments) {
            Path subGenDir = new Path(segmpaths, CrawlDatum.GENERATE_DIR_NAME);
            FileInputFormat.addInputPath(job, subGenDir);
        }
        FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
        job.setInputFormat(SequenceFileInputFormat.class);
        job.setMapperClass(CrawlDbUpdater.class);
        job.setReducerClass(CrawlDbUpdater.class);
        job.setOutputFormat(MapFileOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(CrawlDatum.class);
        FileOutputFormat.setOutputPath(job, tempDir2);
        try {
            JobClient.runJob(job);
            CrawlDb.install(job, dbDir);
        } catch (IOException e) {
            LockUtil.removeLockFile(fs, lock);
            fs.delete(tempDir, true);
            fs.delete(tempDir2, true);
            throw e;
        }
        fs.delete(tempDir2, true);
    }

    LockUtil.removeLockFile(fs, lock);
    fs.delete(tempDir, true);

    long end = System.currentTimeMillis();
    LOG.info("Generator: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));

    Path[] patharray = new Path[generatedSegments.size()];
    return generatedSegments.toArray(patharray);
}
From source file: com.TCG.Nutch_DNS.Injector.java
License: Apache License

public void inject(Path hostDb, Path crawlDb) throws IOException {
    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    if (LOG.isInfoEnabled()) {
        LOG.info("Injector: starting at " + sdf.format(start));
        LOG.info("Injector: hostDb: " + hostDb);
        LOG.info("Injector: crawlDb: " + crawlDb);
    }

    Path tempDir = new Path(getConf().get("mapred.temp.dir", ".") + "/inject-temp-"
            + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    // map text input file to a <url,CrawlDatum> file
    if (LOG.isInfoEnabled()) {
        LOG.info("Injector: Converting injected host to host db entries.");
    }

    FileSystem fs = FileSystem.get(getConf());
    // determine if the host db already exists
    boolean dbExists = fs.exists(hostDb);

    JobConf sortJob = new NutchJob(getConf());
    sortJob.setJobName("inject " + hostDb);
    FileInputFormat.addInputPath(sortJob, crawlDb);
    sortJob.setMapperClass(InjectMapper.class);
    FileOutputFormat.setOutputPath(sortJob, tempDir);
    if (dbExists) {
        sortJob.setOutputFormat(SequenceFileOutputFormat.class);
        // HostReducer, host
        sortJob.setReducerClass(ExitHostReducer.class);
    } else {
        sortJob.setOutputFormat(MapFileOutputFormat.class);
        // HostReducer, host
        sortJob.setReducerClass(NotExitHostReducer.class);
        sortJob.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);
    }
    sortJob.setOutputKeyClass(Text.class);
    sortJob.setOutputValueClass(CrawlDatum.class);
    sortJob.setLong("injector.current.time", System.currentTimeMillis());

    RunningJob mapJob = null;
    try {
        mapJob = JobClient.runJob(sortJob);
    } catch (IOException e) {
        fs.delete(tempDir, true);
        throw e;
    }

    if (dbExists) {
        // merge with existing host db
        if (LOG.isInfoEnabled()) {
            LOG.info("Injector: Merging injected hostDb into old hostDb.");
        }
        JobConf mergeJob = HostDb.createJob(getConf(), hostDb);
        FileInputFormat.addInputPath(mergeJob, tempDir);
        // HostDb.createJob sets Reducer: HostDbReducer
        mergeJob.setReducerClass(InjectReducer.class);
        try {
            RunningJob merge = JobClient.runJob(mergeJob);
        } catch (IOException e) {
            fs.delete(tempDir, true);
            throw e;
        }
        HostDb.install(mergeJob, hostDb);
    } else {
        HostDb.install(sortJob, hostDb);
    }

    // clean up
    fs.delete(tempDir, true);

    long end = System.currentTimeMillis();
    LOG.info("Injector: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}
From source file: Corrector.Config.java
License: Apache License

public static void initializeConfiguration(JobConf conf) {
    validateConfiguration();

    conf.setNumMapTasks(HADOOP_MAPPERS);
    conf.setNumReduceTasks(HADOOP_REDUCERS);
    conf.set("mapred.child.java.opts", HADOOP_JAVAOPTS);
    conf.set("mapred.task.timeout", Long.toString(HADOOP_TIMEOUT));

    conf.setLong("LOCALNODES", HADOOP_LOCALNODES);
    conf.setLong("RANDOM_PASS", RANDOM_PASS);
    conf.setLong("UP_KMER", UP_KMER);
    conf.setLong("LOW_KMER", LOW_KMER);
    conf.setLong("K", K);
    conf.setLong("READLENGTH", READLEN);
}
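Parameters stored with setLong are usually read back inside a task. The following companion sketch is not from the Corrector sources: a hypothetical old-API mapper retrieves the K value in configure() via JobConf.getLong, with -1 as an illustrative sentinel default.

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;

public class KmerMapper extends MapReduceBase implements Mapper<LongWritable, Text, Text, Text> {
    private long k;

    @Override
    public void configure(JobConf job) {
        // Read back the value stored by conf.setLong("K", K); -1 flags a missing setting.
        k = job.getLong("K", -1L);
    }

    @Override
    public void map(LongWritable key, Text value, OutputCollector<Text, Text> output, Reporter reporter)
            throws IOException {
        // ... split the read into k-mers of length k and emit them ...
    }
}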
From source file: Corrector.FindError.java
License: Apache License

public RunningJob run(String inputPath, String outputPath, int idx, String hkmerlist) throws Exception {
    sLogger.info("Tool name: FindError");
    sLogger.info(" - input: " + inputPath);
    sLogger.info(" - output: " + outputPath);

    JobConf conf = new JobConf(FindError.class);
    conf.setJobName("FindError " + inputPath + " " + Config.K);
    conf.setLong("IDX", idx);

    DistributedCache.addCacheFile(new URI(hkmerlist), conf);

    Config.initializeConfiguration(conf);

    FileInputFormat.addInputPath(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(Text.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(FindErrorMapper.class);
    conf.setReducerClass(FindErrorReducer.class);

    // delete the output directory if it exists already
    FileSystem.get(conf).delete(new Path(outputPath), true);

    return JobClient.runJob(conf);
}
From source file: Corrector.IdentifyTrustedReads.java
License: Apache License

public RunningJob run(String inputPath, String outputPath, long kmer_threshold) throws Exception {
    sLogger.info("Tool name: IdentifyTrustedReads");
    sLogger.info(" - input: " + inputPath);
    sLogger.info(" - output: " + outputPath);

    JobConf conf = new JobConf(IdentifyTrustedReads.class);
    conf.setJobName("IdentifyTrustedReads " + inputPath + " " + Config.K);
    conf.setLong("KmerThreshold", kmer_threshold);
    // conf.setLong("AllKmer", allkmer);

    Config.initializeConfiguration(conf);

    FileInputFormat.addInputPath(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(IntWritable.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(IdentifyTrustedReads.IdentifyTrustedReadsMapper.class);
    conf.setReducerClass(IdentifyTrustedReads.IdentifyTrustedReadsReducer.class);

    // delete the output directory if it exists already
    FileSystem.get(conf).delete(new Path(outputPath), true);

    return JobClient.runJob(conf);
}
From source file: Corrector.PreCorrect.java
License: Apache License

public RunningJob run(String inputPath, String outputPath, int idx, String hkmerlist) throws Exception {
    sLogger.info("Tool name: PreCorrect");
    sLogger.info(" - input: " + inputPath);
    sLogger.info(" - output: " + outputPath);

    JobConf conf = new JobConf(PreCorrect.class);
    conf.setJobName("PreCorrect " + inputPath + " " + Config.K);
    conf.setLong("IDX", idx);

    DistributedCache.addCacheFile(new URI(hkmerlist), conf);

    Config.initializeConfiguration(conf);

    FileInputFormat.addInputPath(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(Text.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(PreCorrectMapper.class);
    conf.setReducerClass(PreCorrectReducer.class);

    // delete the output directory if it exists already
    FileSystem.get(conf).delete(new Path(outputPath), true);

    return JobClient.runJob(conf);
}
From source file: edu.uci.ics.hyracks.hadoop.compat.util.HadoopAdapter.java
License: Apache License

private Object[] getInputSplits(JobConf conf, int desiredMaxMappers) throws Exception {
    Object[] splits = getInputSplits(conf);
    if (splits.length > desiredMaxMappers) {
        long totalInputSize = getInputSize(splits, conf);
        long goalSize = (totalInputSize / desiredMaxMappers);
        // Raising the minimum split size shrinks the split count toward the mapper cap.
        conf.setLong("mapred.min.split.size", goalSize);
        conf.setNumMapTasks(desiredMaxMappers);
        splits = getInputSplits(conf);
    }
    return splits;
}