Usage examples for org.apache.hadoop.mapred.JobConf.getNumMapTasks()
public int getNumMapTasks()
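Before the project examples, here is a minimal, self-contained sketch of what getNumMapTasks() reports (the class name NumMapTasksExample is illustrative and not taken from any of the projects below). The method returns the configured value of mapreduce.job.maps (historically mapred.map.tasks); setNumMapTasks() only records a hint, since the actual number of map tasks is determined by the InputFormat's splits at submission time.

import org.apache.hadoop.mapred.JobConf;

public class NumMapTasksExample {
    public static void main(String[] args) {
        JobConf conf = new JobConf();

        // Reports whatever mapreduce.job.maps resolves to in the loaded configuration.
        System.out.println("Configured map tasks: " + conf.getNumMapTasks());

        // setNumMapTasks() is only a hint to the framework; the real map count
        // comes from the input splits computed at job submission.
        conf.setNumMapTasks(8);
        System.out.println("After setNumMapTasks(8): " + conf.getNumMapTasks()); // prints 8
    }
}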
From source file:gov.nasa.jpl.memex.pooledtimeseries.SimilarityCalculation.java
License:Apache License
public static void main(String[] args) throws Exception {
    System.loadLibrary(Core.NATIVE_LIBRARY_NAME);

    Configuration baseConf = new Configuration();
    baseConf.set("mapreduce.job.maps", "96");
    baseConf.set("mapreduce.job.reduces", "0");
    baseConf.set("mapred.tasktracker.map.tasks.maximum", "96");
    baseConf.set("meanDistsFilePath", args[2]);

    JobConf conf = new JobConf();
    System.out.println("Before Map:" + conf.getNumMapTasks());
    conf.setNumMapTasks(196);
    System.out.println("After Map:" + conf.getNumMapTasks());

    Job job = Job.getInstance(baseConf);
    System.out.println("Track: " + baseConf.get("mapred.job.tracker"));
    System.out.println("Job ID" + job.getJobID());
    System.out.println("Job Name" + job.getJobName());
    System.out.println(baseConf.get("mapreduce.job.maps"));

    job.setJarByClass(SimilarityCalculation.class);
    job.setJobName("similarity_calc");

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    FileInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(Map.class);

    job.waitForCompletion(true);
}
From source file:net.peacesoft.nutch.crawl.ReGenerator.java
License:Apache License
/**
 * Generate fetchlists in one or more segments. Whether to filter URLs or
 * not is read from the crawl.generate.filter property in the configuration
 * files. If the property is not found, the URLs are filtered. Same for the
 * normalisation.
 *
 * @param dbDir Crawl database directory
 * @param segments Segments directory
 * @param numLists Number of reduce tasks
 * @param topN Number of top URLs to be selected
 * @param curTime Current time in milliseconds
 *
 * @return Path to generated segment or null if no entries were selected
 *
 * @throws IOException When an I/O error occurs
 */
public Path[] generate(Path dbDir, Path segments, int numLists, long topN, long curTime, boolean filter,
        boolean norm, boolean force, int maxNumSegments) throws IOException {
    try {
        Path tempDir = new Path(
                getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());

        Path lock = new Path(dbDir, CrawlDb.LOCK_NAME);
        FileSystem fs = FileSystem.get(getConf());
        LockUtil.createLockFile(fs, lock, force);

        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        long start = System.currentTimeMillis();
        LOG.info("ReGenerator: starting at " + sdf.format(start));
        LOG.info("ReGenerator: Selecting best-scoring urls due for fetch.");
        LOG.info("ReGenerator: filtering: " + filter);
        LOG.info("ReGenerator: normalizing: " + norm);
        if (topN != Long.MAX_VALUE) {
            LOG.info("ReGenerator: topN: " + topN);
        }

        if ("true".equals(getConf().get(GENERATE_MAX_PER_HOST_BY_IP))) {
            LOG.info("ReGenerator: GENERATE_MAX_PER_HOST_BY_IP will be ignored, use partition.url.mode instead");
        }

        // map to inverted subset due for fetch, sort by score
        JobConf job = new NutchJob(getConf());
        job.setJobName("generate: select from " + dbDir);

        if (numLists == -1) { // for politeness make
            numLists = job.getNumMapTasks(); // a partition per fetch task
        }
        if ("local".equals(job.get("mapred.job.tracker")) && numLists != 1) {
            // override
            LOG.info("ReGenerator: jobtracker is 'local', generating exactly one partition.");
            numLists = 1;
        }
        job.setLong(GENERATOR_CUR_TIME, curTime);
        // record real generation time
        long generateTime = System.currentTimeMillis();
        job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
        job.setLong(GENERATOR_TOP_N, topN);
        job.setBoolean(GENERATOR_FILTER, filter);
        job.setBoolean(GENERATOR_NORMALISE, norm);
        job.setInt(GENERATOR_MAX_NUM_SEGMENTS, maxNumSegments);

        FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
        job.setInputFormat(SequenceFileInputFormat.class);

        job.setMapperClass(Selector.class);
        job.setPartitionerClass(Selector.class);
        job.setReducerClass(Selector.class);

        FileOutputFormat.setOutputPath(job, tempDir);
        job.setOutputFormat(SequenceFileOutputFormat.class);
        job.setOutputKeyClass(FloatWritable.class);
        job.setOutputKeyComparatorClass(DecreasingFloatComparator.class);
        job.setOutputValueClass(SelectorEntry.class);
        job.setOutputFormat(GeneratorOutputFormat.class);

        try {
            JobClient.runJob(job);
        } catch (IOException e) {
            throw e;
        }

        // read the subdirectories generated in the temp
        // output and turn them into segments
        List<Path> generatedSegments = new ArrayList<Path>();

        FileStatus[] status = fs.listStatus(tempDir);
        try {
            for (FileStatus stat : status) {
                Path subfetchlist = stat.getPath();
                if (!subfetchlist.getName().startsWith("fetchlist-")) {
                    continue;
                }
                // start a new partition job for this segment
                Path newSeg = partitionSegment(fs, segments, subfetchlist, numLists);
                generatedSegments.add(newSeg);
            }
        } catch (Exception e) {
            LOG.warn("ReGenerator: exception while partitioning segments, exiting ...");
            fs.delete(tempDir, true);
            return null;
        }

        if (generatedSegments.size() == 0) {
            LOG.warn("ReGenerator: 0 records selected for fetching, exiting ...");
            LockUtil.removeLockFile(fs, lock);
            fs.delete(tempDir, true);
            return null;
        }

        if (getConf().getBoolean(GENERATE_UPDATE_CRAWLDB, false)) {
            // update the db from tempDir
            Path tempDir2 = new Path(
                    getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());

            job = new NutchJob(getConf());
            job.setJobName("generate: updatedb " + dbDir);
            job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
            for (Path segmpaths : generatedSegments) {
                Path subGenDir = new Path(segmpaths, CrawlDatum.GENERATE_DIR_NAME);
                FileInputFormat.addInputPath(job, subGenDir);
            }
            FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
            job.setInputFormat(SequenceFileInputFormat.class);
            job.setMapperClass(CrawlDbUpdater.class);
            job.setReducerClass(CrawlDbUpdater.class);
            job.setOutputFormat(MapFileOutputFormat.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(CrawlDatum.class);
            FileOutputFormat.setOutputPath(job, tempDir2);
            try {
                JobClient.runJob(job);
                CrawlDb.install(job, dbDir);
            } catch (IOException e) {
                LockUtil.removeLockFile(fs, lock);
                fs.delete(tempDir, true);
                fs.delete(tempDir2, true);
                throw e;
            }
            fs.delete(tempDir2, true);
        }

        LockUtil.removeLockFile(fs, lock);
        fs.delete(tempDir, true);

        long end = System.currentTimeMillis();
        LOG.info("ReGenerator: finished at " + sdf.format(end) + ", elapsed: "
                + TimingUtil.elapsedTime(start, end));

        Path[] patharray = new Path[generatedSegments.size()];
        return generatedSegments.toArray(patharray);
    } catch (Exception ex) {
        LOG.error("ReGenerator generate error: " + ex.toString(), ex);
        return null;
    }
}
From source file:org.apache.druid.indexer.hadoop.DatasourceInputFormat.java
License:Apache License
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException {
    JobConf conf = new JobConf(context.getConfiguration());

    List<String> dataSources = getDataSources(conf);
    List<InputSplit> splits = new ArrayList<>();

    for (String dataSource : dataSources) {
        List<WindowedDataSegment> segments = getSegments(conf, dataSource);
        if (segments == null || segments.size() == 0) {
            throw new ISE("No segments found to read for dataSource[%s]", dataSource);
        }

        // Note: Each segment is logged separately to avoid creating a huge String if we are loading lots of segments.
        for (int i = 0; i < segments.size(); i++) {
            final WindowedDataSegment segment = segments.get(i);
            logger.info("Segment %,d/%,d for dataSource[%s] has identifier[%s], interval[%s]", i,
                    segments.size(), dataSource, segment.getSegment().getId(), segment.getInterval());
        }

        long maxSize = getMaxSplitSize(conf, dataSource);
        if (maxSize < 0) {
            long totalSize = 0;
            for (WindowedDataSegment segment : segments) {
                totalSize += segment.getSegment().getSize();
            }
            int mapTask = conf.getNumMapTasks();
            if (mapTask > 0) {
                maxSize = totalSize / mapTask;
            }
        }

        if (maxSize > 0) {
            // combining is to happen, let us sort the segments list by size so that they
            // are combined appropriately
            segments.sort(Comparator.comparingLong(s -> s.getSegment().getSize()));
        }

        List<WindowedDataSegment> list = new ArrayList<>();
        long size = 0;

        org.apache.hadoop.mapred.InputFormat fio = supplier.get();
        for (WindowedDataSegment segment : segments) {
            if (size + segment.getSegment().getSize() > maxSize && size > 0) {
                splits.add(toDataSourceSplit(list, fio, conf));
                list = new ArrayList<>();
                size = 0;
            }

            list.add(segment);
            size += segment.getSegment().getSize();
        }

        if (list.size() > 0) {
            splits.add(toDataSourceSplit(list, fio, conf));
        }
    }

    logger.info("Number of splits [%d]", splits.size());
    return splits;
}
From source file:org.apache.mahout.df.mapred.partial.PartialBuilder.java
License:Apache License
@Override
protected DecisionForest parseOutput(JobConf job, PredictionCallback callback) throws IOException {
    int numMaps = job.getNumMapTasks();
    int numTrees = Builder.getNbTrees(job);

    Path outputPath = getOutputPath(job);

    log.info("Computing partitions' first ids...");
    Step0Job step0 = new Step0Job(getOutputPath(job), getDataPath(), getDatasetPath());
    Step0Output[] partitions = step0.run(getConf());

    log.info("Processing the output...");
    TreeID[] keys = new TreeID[numTrees];
    Node[] trees = new Node[numTrees];
    int[] firstIds = Step0Output.extractFirstIds(partitions);
    processOutput(job, outputPath, firstIds, keys, trees, callback);

    // call the second step in order to complete the oob predictions
    if ((callback != null) && (numMaps > 1) && isStep2(getConf())) {
        log.info("*****************************");
        log.info("Second Step");
        log.info("*****************************");
        Step2Job step2 = new Step2Job(getOutputPath(job), getDataPath(), getDatasetPath(), partitions);

        step2.run(job, keys, trees, callback);
    }

    return new DecisionForest(Arrays.asList(trees));
}
From source file:org.apache.mahout.df.mapred.partial.PartialSequentialBuilder.java
License:Apache License
@Override
protected void configureJob(JobConf job, int nbTrees, boolean oobEstimate) throws IOException {
    int numMaps = job.getNumMapTasks();

    super.configureJob(job, nbTrees, oobEstimate);

    // PartialBuilder sets the number of maps to 1 if we are running in 'local'
    job.setNumMapTasks(numMaps);
}
From source file:org.apache.mahout.df.mapred.partial.PartialSequentialBuilder.java
License:Apache License
@Override
protected void runJob(JobConf job) throws IOException {
    // retrieve the splits
    TextInputFormat input = (TextInputFormat) job.getInputFormat();
    InputSplit[] splits = input.getSplits(job, job.getNumMapTasks());
    log.debug("Nb splits : {}", splits.length);

    InputSplit[] sorted = Arrays.copyOf(splits, splits.length);
    Builder.sortSplits(sorted);

    int numTrees = Builder.getNbTrees(job); // total number of trees

    firstOutput = new PartialOutputCollector(numTrees);
    Reporter reporter = Reporter.NULL;

    firstIds = new int[splits.length];
    sizes = new int[splits.length];

    // to compute firstIds, process the splits in file order
    int firstId = 0;
    long slowest = 0; // duration of slowest map
    for (InputSplit split : splits) {
        int hp = ArrayUtils.indexOf(sorted, split); // hadoop's partition

        RecordReader<LongWritable, Text> reader = input.getRecordReader(split, job, reporter);

        LongWritable key = reader.createKey();
        Text value = reader.createValue();

        Step1Mapper mapper = new MockStep1Mapper(getTreeBuilder(), dataset, getSeed(), hp, splits.length,
                numTrees);

        long time = System.currentTimeMillis();

        firstIds[hp] = firstId;

        while (reader.next(key, value)) {
            mapper.map(key, value, firstOutput, reporter);
            firstId++;
            sizes[hp]++;
        }

        mapper.close();

        time = System.currentTimeMillis() - time;
        log.info("Duration : {}", DFUtils.elapsedTime(time));

        if (time > slowest) {
            slowest = time;
        }
    }

    log.info("Longest duration : {}", DFUtils.elapsedTime(slowest));
}
From source file:org.apache.mahout.df.mapred.partial.PartialSequentialBuilder.java
License:Apache License
/**
 * The second step uses the trees to predict the rest of the instances outside
 * their own partition
 *
 * @throws IOException
 */
void secondStep(JobConf job, Path forestPath, PredictionCallback callback) throws IOException {
    // retrieve the splits
    TextInputFormat input = (TextInputFormat) job.getInputFormat();
    InputSplit[] splits = input.getSplits(job, job.getNumMapTasks());
    log.debug("Nb splits : {}", splits.length);

    Builder.sortSplits(splits);

    int numTrees = Builder.getNbTrees(job); // total number of trees

    // compute the expected number of outputs
    int total = 0;
    for (int p = 0; p < splits.length; p++) {
        total += Step2Mapper.nbConcerned(splits.length, numTrees, p);
    }

    secondOutput = new PartialOutputCollector(total);
    Reporter reporter = Reporter.NULL;
    long slowest = 0; // duration of slowest map

    for (int partition = 0; partition < splits.length; partition++) {
        InputSplit split = splits[partition];
        RecordReader<LongWritable, Text> reader = input.getRecordReader(split, job, reporter);

        LongWritable key = reader.createKey();
        Text value = reader.createValue();

        // load the output of the 1st step
        int nbConcerned = Step2Mapper.nbConcerned(splits.length, numTrees, partition);
        TreeID[] fsKeys = new TreeID[nbConcerned];
        Node[] fsTrees = new Node[nbConcerned];

        FileSystem fs = forestPath.getFileSystem(job);
        int numInstances = InterResults.load(fs, forestPath, splits.length, numTrees, partition, fsKeys,
                fsTrees);

        Step2Mapper mapper = new Step2Mapper();
        mapper.configure(partition, dataset, fsKeys, fsTrees, numInstances);

        long time = System.currentTimeMillis();

        while (reader.next(key, value)) {
            mapper.map(key, value, secondOutput, reporter);
        }

        mapper.close();

        time = System.currentTimeMillis() - time;
        log.info("Duration : {}", DFUtils.elapsedTime(time));

        if (time > slowest) {
            slowest = time;
        }
    }

    log.info("Longest duration : {}", DFUtils.elapsedTime(slowest));
}
From source file:org.apache.mahout.df.mapred.partial.Step0Job.java
License:Apache License
/**
 * Extracts the output and processes it
 *
 * @param job
 *
 * @return firstIds for each partition in Hadoop's order
 * @throws IOException
 */
protected Step0Output[] parseOutput(JobConf job) throws IOException {
    int numMaps = job.getNumMapTasks();
    FileSystem fs = outputPath.getFileSystem(job);

    Path[] outfiles = DFUtils.listOutputFiles(fs, outputPath);

    int[] keys = new int[numMaps];
    Step0Output[] values = new Step0Output[numMaps];

    // read all the outputs
    IntWritable key = new IntWritable();
    Step0Output value = new Step0Output(0L, 0);

    int index = 0;
    for (Path path : outfiles) {
        Reader reader = new Reader(fs, path, job);

        try {
            while (reader.next(key, value)) {
                keys[index] = key.get();
                values[index] = value.clone();

                index++;
            }
        } finally {
            reader.close();
        }
    }

    return processOutput(keys, values);
}
From source file:org.apache.mahout.df.mapred.partial.Step1Mapper.java
License:Apache License
@Override
public void configure(JobConf job) {
    super.configure(job);

    configure(Builder.getRandomSeed(job), job.getInt("mapred.task.partition", -1), job.getNumMapTasks(),
            Builder.getNbTrees(job));
}
From source file:org.apache.mahout.df.mapred.partial.Step2Job.java
License:Apache License
/**
 * Extracts the output and processes it
 *
 * @param job
 * @param callback
 * @throws IOException
 */
protected void parseOutput(JobConf job, PredictionCallback callback) throws IOException {
    int numMaps = job.getNumMapTasks();
    int numTrees = Builder.getNbTrees(job);

    // compute the total number of output values
    //int total = 0;
    for (int partition = 0; partition < numMaps; partition++) {
        //total += Step2Mapper.nbConcerned(numMaps, numTrees, partition);
        Step2Mapper.nbConcerned(numMaps, numTrees, partition);
    }

    int[] firstIds = Step0Output.extractFirstIds(partitions);
    PartialBuilder.processOutput(job, outputPath, firstIds, null, null, callback);
}