List of usage examples for org.apache.hadoop.mapred.JobConf.getNumMapTasks()
public int getNumMapTasks()
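getNumMapTasks() returns the configured number of map tasks for the job (the mapred.map.tasks property, which defaults to 1). Before the full examples, here is a minimal sketch of the recurring pattern they share: deriving other job settings from the map-task count. The class name and the value 8 are illustrative only, not taken from any example below.

import org.apache.hadoop.mapred.JobConf;

// Minimal sketch (assumes a Hadoop classpath): size the reduce phase
// off the configured number of map tasks, as several examples below do.
public class NumMapTasksExample {
  public static void main(String[] args) {
    JobConf job = new JobConf();
    job.setNumMapTasks(8);              // sets mapred.map.tasks = 8 (illustrative value)
    int numMaps = job.getNumMapTasks(); // reads mapred.map.tasks (default 1)
    job.setNumReduceTasks(numMaps);     // e.g. one output partition per map/fetch task
    System.out.println("configured map tasks: " + numMaps);
  }
}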
From source file:org.apache.mahout.df.mapred.partial.Step2Mapper.java
License:Apache License
@Override
public void configure(JobConf job) {
  // get the cached files' paths
  URI[] files;
  try {
    files = DistributedCache.getCacheFiles(job);
  } catch (IOException e) {
    throw new IllegalStateException("Exception while getting the cache files : ", e);
  }

  if ((files == null) || (files.length < 2)) {
    throw new IllegalArgumentException("missing paths from the DistributedCache");
  }

  Dataset dataset;
  try {
    Path datasetPath = new Path(files[0].getPath());
    dataset = Dataset.load(job, datasetPath);
  } catch (IOException e) {
    throw new IllegalStateException("Exception while loading the dataset : ", e);
  }

  int numMaps = job.getNumMapTasks();
  int p = job.getInt("mapred.task.partition", -1);

  // total number of trees in the forest
  int numTrees = Builder.getNbTrees(job);
  if (numTrees == -1) {
    throw new IllegalArgumentException("numTrees not found !");
  }

  int nbConcerned = nbConcerned(numMaps, numTrees, p);
  keys = new TreeID[nbConcerned];
  trees = new Node[nbConcerned];

  int numInstances;
  try {
    Path forestPath = new Path(files[1].getPath());
    FileSystem fs = forestPath.getFileSystem(job);
    numInstances = InterResults.load(fs, forestPath, numMaps, numTrees, p, keys, trees);
    log.debug("partition: {} numInstances: {}", p, numInstances);
  } catch (IOException e) {
    throw new IllegalStateException("Exception while loading the forest : ", e);
  }

  configure(p, dataset, keys, trees, numInstances);
}
From source file:org.apache.nutch.crawl.Generator.java
License:Apache License
/**
 * Generate fetchlists in one or more segments. Whether to filter URLs or not
 * is read from the crawl.generate.filter property in the configuration files.
 * If the property is not found, the URLs are filtered. Same for the
 * normalisation.
 *
 * @param dbDir Crawl database directory
 * @param segments Segments directory
 * @param numLists Number of reduce tasks
 * @param topN Number of top URLs to be selected
 * @param curTime Current time in milliseconds
 * @return Paths to the generated segments, or null if no entries were selected
 * @throws IOException When an I/O error occurs
 */
public Path[] generate(Path dbDir, Path segments, int numLists, long topN, long curTime,
    boolean filter, boolean norm, boolean force, int maxNumSegments) throws IOException {

  Path tempDir = new Path(
      getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());
  Path lock = new Path(dbDir, CrawlDb.LOCK_NAME);
  FileSystem fs = FileSystem.get(getConf());
  LockUtil.createLockFile(fs, lock, force);

  SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  long start = System.currentTimeMillis();
  LOG.info("Generator: starting at " + sdf.format(start));
  LOG.info("Generator: Selecting best-scoring urls due for fetch.");
  LOG.info("Generator: filtering: " + filter);
  LOG.info("Generator: normalizing: " + norm);
  if (topN != Long.MAX_VALUE) {
    LOG.info("Generator: topN: " + topN);
  }
  if ("true".equals(getConf().get(GENERATE_MAX_PER_HOST_BY_IP))) {
    LOG.info("Generator: GENERATE_MAX_PER_HOST_BY_IP will be ignored, use partition.url.mode instead");
  }

  // map to inverted subset due for fetch, sort by score
  JobConf job = new NutchJob(getConf());
  job.setJobName("generate: select from " + dbDir);

  if (numLists == -1) {                // for politeness make
    numLists = job.getNumMapTasks();   // a partition per fetch task
  }
  if ("local".equals(job.get("mapred.job.tracker")) && numLists != 1) {
    // override
    LOG.info("Generator: jobtracker is 'local', generating exactly one partition.");
    numLists = 1;
  }
  job.setLong(GENERATOR_CUR_TIME, curTime);
  // record real generation time
  long generateTime = System.currentTimeMillis();
  job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
  job.setLong(GENERATOR_TOP_N, topN);
  job.setBoolean(GENERATOR_FILTER, filter);
  job.setBoolean(GENERATOR_NORMALISE, norm);
  job.setInt(GENERATOR_MAX_NUM_SEGMENTS, maxNumSegments);

  FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
  job.setInputFormat(SequenceFileInputFormat.class);

  job.setMapperClass(Selector.class);
  job.setPartitionerClass(Selector.class);
  job.setReducerClass(Selector.class);

  FileOutputFormat.setOutputPath(job, tempDir);
  job.setOutputFormat(SequenceFileOutputFormat.class);
  job.setOutputKeyClass(FloatWritable.class);
  job.setOutputKeyComparatorClass(DecreasingFloatComparator.class);
  job.setOutputValueClass(SelectorEntry.class);
  job.setOutputFormat(GeneratorOutputFormat.class); // overrides the SequenceFileOutputFormat set above

  try {
    JobClient.runJob(job);
  } catch (IOException e) {
    throw e;
  }

  // read the subdirectories generated in the temp
  // output and turn them into segments
  List<Path> generatedSegments = new ArrayList<Path>();

  FileStatus[] status = fs.listStatus(tempDir);
  try {
    for (FileStatus stat : status) {
      Path subfetchlist = stat.getPath();
      if (!subfetchlist.getName().startsWith("fetchlist-"))
        continue;
      // start a new partition job for this segment
      Path newSeg = partitionSegment(fs, segments, subfetchlist, numLists);
      generatedSegments.add(newSeg);
    }
  } catch (Exception e) {
    LOG.warn("Generator: exception while partitioning segments, exiting ...");
    fs.delete(tempDir, true);
    return null;
  }

  if (generatedSegments.size() == 0) {
    LOG.warn("Generator: 0 records selected for fetching, exiting ...");
    LockUtil.removeLockFile(fs, lock);
    fs.delete(tempDir, true);
    return null;
  }

  if (getConf().getBoolean(GENERATE_UPDATE_CRAWLDB, false)) {
    // update the db from tempDir
    Path tempDir2 = new Path(
        getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());

    job = new NutchJob(getConf());
    job.setJobName("generate: updatedb " + dbDir);
    job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
    for (Path segmpaths : generatedSegments) {
      Path subGenDir = new Path(segmpaths, CrawlDatum.GENERATE_DIR_NAME);
      FileInputFormat.addInputPath(job, subGenDir);
    }
    FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(CrawlDbUpdater.class);
    job.setReducerClass(CrawlDbUpdater.class);
    job.setOutputFormat(MapFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);
    FileOutputFormat.setOutputPath(job, tempDir2);
    try {
      JobClient.runJob(job);
      CrawlDb.install(job, dbDir);
    } catch (IOException e) {
      LockUtil.removeLockFile(fs, lock);
      fs.delete(tempDir, true);
      fs.delete(tempDir2, true);
      throw e;
    }
    fs.delete(tempDir2, true);
  }

  LockUtil.removeLockFile(fs, lock);
  fs.delete(tempDir, true);

  long end = System.currentTimeMillis();
  LOG.info("Generator: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));

  Path[] patharray = new Path[generatedSegments.size()];
  return generatedSegments.toArray(patharray);
}
From source file:org.apache.nutch.tools.FreeGenerator.java
License:Apache License
public int run(String[] args) throws Exception {
  if (args.length < 2) {
    System.err.println("Usage: FreeGenerator <inputDir> <segmentsDir> [-filter] [-normalize]");
    System.err.println("\tinputDir\tinput directory containing one or more input files.");
    System.err.println("\t\tEach text file contains a list of URLs, one URL per line");
    System.err.println("\tsegmentsDir\toutput directory, where new segment will be created");
    System.err.println("\t-filter\trun current URLFilters on input URLs");
    System.err.println("\t-normalize\trun current URLNormalizers on input URLs");
    return -1;
  }
  boolean filter = false;
  boolean normalize = false;
  if (args.length > 2) {
    for (int i = 2; i < args.length; i++) {
      if (args[i].equals("-filter")) {
        filter = true;
      } else if (args[i].equals("-normalize")) {
        normalize = true;
      } else {
        LOG.error("Unknown argument: " + args[i] + ", exiting ...");
        return -1;
      }
    }
  }

  SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  long start = System.currentTimeMillis();
  LOG.info("FreeGenerator: starting at " + sdf.format(start));

  JobConf job = new NutchJob(getConf());
  job.setBoolean(FILTER_KEY, filter);
  job.setBoolean(NORMALIZE_KEY, normalize);
  FileInputFormat.addInputPath(job, new Path(args[0]));
  job.setInputFormat(TextInputFormat.class);
  job.setMapperClass(FG.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(Generator.SelectorEntry.class);
  job.setPartitionerClass(URLPartitioner.class);
  job.setReducerClass(FG.class);
  String segName = Generator.generateSegmentName();
  job.setNumReduceTasks(job.getNumMapTasks());
  job.setOutputFormat(SequenceFileOutputFormat.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(CrawlDatum.class);
  job.setOutputKeyComparatorClass(Generator.HashComparator.class);
  FileOutputFormat.setOutputPath(job, new Path(args[1], new Path(segName, CrawlDatum.GENERATE_DIR_NAME)));
  try {
    JobClient.runJob(job);
  } catch (Exception e) {
    LOG.error("FAILED: " + StringUtils.stringifyException(e));
    return -1;
  }
  long end = System.currentTimeMillis();
  LOG.info("FreeGenerator: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
  return 0;
}
From source file:org.apache.sysml.runtime.matrix.sort.SamplingSortMRInputFormat.java
License:Apache License
/**
 * Use the input splits to take samples of the input and generate sample
 * keys. By default reads 100,000 keys from 10 locations in the input, sorts
 * them and picks N-1 keys to generate N equally sized partitions.
 *
 * @param conf the job to sample
 * @param partFile where to write the output file to
 * @return index value
 * @throws IOException if something goes wrong
 * @throws InstantiationException if InstantiationException occurs
 * @throws IllegalAccessException if IllegalAccessException occurs
 */
@SuppressWarnings({ "unchecked", "unused", "deprecation" })
public static int writePartitionFile(JobConf conf, Path partFile)
    throws IOException, InstantiationException, IllegalAccessException {
  SamplingSortMRInputFormat inFormat = new SamplingSortMRInputFormat();
  Sampler sampler = new Sampler();

  Class<? extends WritableComparable> targetKeyClass;
  targetKeyClass = (Class<? extends WritableComparable>) conf.getClass(TARGET_KEY_CLASS,
      WritableComparable.class);

  // get input converter information
  int brlen = MRJobConfiguration.getNumRowsPerBlock(conf, (byte) 0);
  int bclen = MRJobConfiguration.getNumColumnsPerBlock(conf, (byte) 0);

  // indicate whether the matrix value in this mapper is a matrix cell or a matrix block
  int partitions = conf.getNumReduceTasks();

  long sampleSize = conf.getLong(SAMPLE_SIZE, 1000);
  InputSplit[] splits = inFormat.getSplits(conf, conf.getNumMapTasks());
  int samples = Math.min(10, splits.length);
  long recordsPerSample = sampleSize / samples;
  int sampleStep = splits.length / samples;

  // take N samples from different parts of the input
  int totalcount = 0;
  for (int i = 0; i < samples; i++) {
    SequenceFileRecordReader reader = (SequenceFileRecordReader) inFormat
        .getRecordReader(splits[sampleStep * i], conf, null);
    int count = 0;
    WritableComparable key = (WritableComparable) reader.createKey();
    Writable value = (Writable) reader.createValue();
    while (reader.next(key, value) && count < recordsPerSample) {
      Converter inputConverter = MRJobConfiguration.getInputConverter(conf, (byte) 0);
      inputConverter.setBlockSize(brlen, bclen);
      inputConverter.convert(key, value);
      while (inputConverter.hasNext()) {
        Pair pair = inputConverter.next();
        if (pair.getKey() instanceof DoubleWritable) {
          sampler.addValue(new DoubleWritable(((DoubleWritable) pair.getKey()).get()));
        } else if (pair.getValue() instanceof MatrixCell) {
          sampler.addValue(new DoubleWritable(((MatrixCell) pair.getValue()).getValue()));
        } else {
          throw new IOException("SamplingSortMRInputFormat unsupported key/value class: "
              + pair.getKey().getClass() + ":" + pair.getValue().getClass());
        }
        count++;
      }
      key = (WritableComparable) reader.createKey();
      value = (Writable) reader.createValue();
    }
    totalcount += count;
  }

  if (totalcount == 0) // empty input files
    sampler.addValue(new DoubleWritable(0));

  FileSystem outFs = partFile.getFileSystem(conf);
  if (outFs.exists(partFile)) {
    outFs.delete(partFile, false);
  }

  // note: key value always double/null as expected by partitioner
  SequenceFile.Writer writer = null;
  int index0 = -1;
  try {
    writer = SequenceFile.createWriter(outFs, conf, partFile, DoubleWritable.class, NullWritable.class);
    NullWritable nullValue = NullWritable.get();
    int i = 0;
    boolean lessthan0 = true;
    for (WritableComparable splitValue : sampler.createPartitions(partitions)) {
      writer.append(splitValue, nullValue);
      if (lessthan0 && ((DoubleWritable) splitValue).get() >= 0) {
        index0 = i;
        lessthan0 = false;
      }
      i++;
    }
    if (lessthan0)
      index0 = partitions - 1;
  } finally {
    IOUtilFunctions.closeSilently(writer);
  }

  return index0;
}
From source file:org.apache.tez.mapreduce.hadoop.MRInputHelpers.java
License:Apache License
@SuppressWarnings({ "rawtypes", "unchecked" }) private static org.apache.hadoop.mapred.InputSplit[] generateOldSplits(JobConf jobConf, boolean groupSplits, int numTasks) throws IOException { // This is the real InputFormat org.apache.hadoop.mapred.InputFormat inputFormat; try {/*w w w . j ava2 s .com*/ inputFormat = jobConf.getInputFormat(); } catch (Exception e) { throw new TezUncheckedException(e); } org.apache.hadoop.mapred.InputFormat finalInputFormat = inputFormat; if (groupSplits) { org.apache.hadoop.mapred.split.TezGroupedSplitsInputFormat groupedFormat = new org.apache.hadoop.mapred.split.TezGroupedSplitsInputFormat(); groupedFormat.setConf(jobConf); groupedFormat.setInputFormat(inputFormat); groupedFormat.setDesiredNumberOfSplits(numTasks); finalInputFormat = groupedFormat; } else { finalInputFormat = inputFormat; } org.apache.hadoop.mapred.InputSplit[] splits = finalInputFormat.getSplits(jobConf, jobConf.getNumMapTasks()); // sort the splits into order based on size, so that the biggest // go first Arrays.sort(splits, new OldInputSplitComparator()); return splits; }
From source file:org.deeplearning4j.iterativereduce.runtime.yarn.appmaster.ApplicationMaster.java
License:Apache License
private Set<ConfigurationTuple> getConfigurationTuples() throws IOException {
  if (confTuples != null)
    return confTuples;
  Path inputPath = new Path(props.getProperty(ConfigFields.APP_INPUT_PATH));
  FileSystem fs = FileSystem.get(conf);
  FileStatus f = fs.getFileStatus(inputPath);
  //BlockLocation[] bl = fs.getFileBlockLocations(p, 0, f.getLen());
  Set<ConfigurationTuple> configTuples = new HashSet<>();
  int workerId = 0;

  JobConf job = new JobConf(new Configuration());
  job.setInputFormat((Class<? extends InputFormat>) this.inputFormatClass); //TextInputFormat.class);
  FileInputFormat.setInputPaths(job, inputPath);
  InputSplit[] splits = job.getInputFormat().getSplits(job, job.getNumMapTasks());

  for (InputSplit split : splits) {
    FileSplit convertedToMetronomeSplit = new FileSplit();
    org.apache.hadoop.mapred.FileSplit hadoopFileSplit = (org.apache.hadoop.mapred.FileSplit) split;

    if (hadoopFileSplit.getLength() - hadoopFileSplit.getStart() > 0) {
      convertedToMetronomeSplit.setLength(hadoopFileSplit.getLength());
      convertedToMetronomeSplit.setOffset(hadoopFileSplit.getStart());
      convertedToMetronomeSplit.setPath(hadoopFileSplit.getPath().toString());

      StartupConfiguration config = StartupConfiguration.newBuilder().setBatchSize(batchSize)
          .setIterations(iterationCount).setOther(appConfig).setSplit(convertedToMetronomeSplit)
          .build();

      String wid = "worker-" + workerId;
      ConfigurationTuple tuple = new ConfigurationTuple(split.getLocations()[0], wid, config);

      configTuples.add(tuple);
      workerId++;
      LOG.info("IR_AM_worker: " + wid + " added split: " + convertedToMetronomeSplit.toString());
    } else {
      LOG.info("IR_AM: Culled out 0 length Split: " + convertedToMetronomeSplit.toString());
    }
  }

  LOG.info("Total Splits/Workers: " + configTuples.size());
  confTuples = configTuples;
  return configTuples;
}
From source file:org.elasticsearch.hadoop.mr.EsInputFormat.java
License:Apache License
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException {
  JobConf conf = HadoopCfgUtils.asJobConf(CompatHandler.jobContext(context).getConfiguration());
  // NOTE: this method expects a ShardInputSplit to be returned (which implements both the old and the new API).
  return Arrays.asList((InputSplit[]) getSplits(conf, conf.getNumMapTasks()));
}
From source file:org.hxx.hadoop.GeneratorHbase.java
License:Apache License
private RunningJob generateJob(String table, Path segment, long topN, int reduceCnt, boolean filter,
    boolean norm, boolean force) throws IOException {
  LOG.info("Generator: from table=" + table + " segment=" + segment);

  JobConf job = new NutchJob(getConf());
  // job.setJarByClass(GeneratorHbase.class);
  job.setJobName("generate:" + table + " "
      + (new SimpleDateFormat("HH:mm:ss")).format(System.currentTimeMillis()) + " path=" + segment);

  if (reduceCnt == -1) {
    reduceCnt = job.getNumMapTasks(); // a partition per fetch task
  }
  if ("local".equals(job.get("mapred.job.tracker")) && reduceCnt != 1) {
    LOG.info("Generator: jobtracker is 'local', generating exactly one partition.");
    reduceCnt = 1;
  }

  // job.setLong(GENERATOR_CUR_TIME, curTime);
  // record real generation time
  long generateTime = System.currentTimeMillis();
  job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
  job.setLong(GENERATOR_TOP_N, topN);
  job.setBoolean(GENERATOR_FILTER, filter);
  job.setBoolean(GENERATOR_NORMALISE, norm);
  job.set(GENERATL_TABLE, table);
  job.setInt(GENERATL_REDUCECNT, reduceCnt);
  job.setInt("partition.url.seed", new Random().nextInt());

  job.setInputFormat(TableTopInputFormat.class);
  job.setMapperClass(GenerateMark.class);

  job.setPartitionerClass(GenerateMark.class);
  job.setNumReduceTasks(reduceCnt);

  job.setOutputFormat(SequenceFileOutputFormat.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(CrawlDatum.class);
  job.setOutputKeyComparatorClass(HashComparator.class);
  Path output = new Path(segment, CrawlDatum.GENERATE_DIR_NAME);
  FileOutputFormat.setOutputPath(job, output);

  RunningJob r = JobClient.runJob(job);
  return r;
}
From source file:org.hxx.hadoop.GeneratorHbase.java
License:Apache License
private RunningJob generateJob(String table, Path segment, int reduceCnt, long topN, boolean filter,
    boolean norm, boolean force) throws IOException {
  LOG.info("Generator: segment=" + segment);

  JobConf job = new NutchJob(getConf());
  // job.setJarByClass(GeneratorHbase.class);
  job.setJobName("generate:" + table + " "
      + (new SimpleDateFormat("HH:mm:ss")).format(System.currentTimeMillis()) + " path=" + segment);
  // job.setLong(HConstants.HBASE_CLIENT_SCANNER_TIMEOUT_PERIOD, 300000);

  if (reduceCnt == -1) {
    reduceCnt = job.getNumMapTasks(); // a partition per fetch task
  }
  if ("local".equals(job.get("mapred.job.tracker")) && reduceCnt != 1) {
    LOG.info("Generator: jobtracker is 'local', generating exactly one partition.");
    reduceCnt = 1;
  }

  // job.setLong(GENERATOR_CUR_TIME, curTime);
  // record real generation time
  long generateTime = System.currentTimeMillis();
  job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
  job.setLong(GENERATOR_TOP_N, topN);
  job.setBoolean(GENERATOR_FILTER, filter);
  job.setBoolean(GENERATOR_NORMALISE, norm);
  job.set(GENERATL_TABLE, table);
  job.setInt(GENERATL_REDUCECNT, reduceCnt);
  job.setInt("partition.url.seed", new Random().nextInt());

  job.setInputFormat(CodeInputFormat.class);
  job.setNumMapTasks(1);
  job.setMapOutputKeyClass(IntWritable.class);
  job.setMapOutputValueClass(IntWritable.class);

  job.setReducerClass(GenerateMark.class);
  job.setNumReduceTasks(reduceCnt);

  job.setOutputFormat(SequenceFileOutputFormat.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(CrawlDatum.class);
  job.setOutputKeyComparatorClass(HashComparator.class);
  Path output = new Path(segment, CrawlDatum.GENERATE_DIR_NAME);
  FileOutputFormat.setOutputPath(job, output);

  RunningJob r = JobClient.runJob(job);
  return r;
}
From source file:org.hxx.hadoop.GeneratorMapHbase.java
License:Apache License
private RunningJob generateJob(String table, Path segment, int numLists, long topN, long curTime,
    boolean filter, boolean norm, boolean force) throws IOException {
  LOG.info("Generator: segment: " + segment);

  JobConf job = new NutchJob(getConf());
  job.setJarByClass(GeneratorMapHbase.class);
  job.setJobName("generate: from " + table + " "
      + (new SimpleDateFormat("yyyyMMdd HH:mm:ss")).format(System.currentTimeMillis()));
  // job.setLong(HConstants.HBASE_CLIENT_SCANNER_TIMEOUT_PERIOD, 300000);

  if (numLists == -1) {
    numLists = job.getNumMapTasks(); // a partition per fetch task
  }
  numLists = 4; // TODO
  if ("local".equals(job.get("mapred.job.tracker")) && numLists != 1) {
    // override
    LOG.info("Generator: jobtracker is 'local', generating exactly one partition.");
    numLists = 1;
  }

  // job.setLong(GENERATOR_CUR_TIME, curTime);
  // record real generation time
  long generateTime = System.currentTimeMillis();
  job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
  job.setLong(GENERATOR_TOP_N, topN);
  job.setBoolean(GENERATOR_FILTER, filter);
  job.setBoolean(GENERATOR_NORMALISE, norm);
  job.set(GENERATL_TABLE, table);
  job.setInt(GENERATL_REDUCENUM, numLists);

  job.setInputFormat(TableTopInputFormat.class);
  job.setMapperClass(GenerateMark.class);

  job.setPartitionerClass(URLCountPartitioner.class);
  job.setNumReduceTasks(numLists);

  job.setOutputFormat(SequenceFileOutputFormat.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(CrawlDatum.class);
  job.setOutputKeyComparatorClass(HashComparator.class);
  Path output = new Path(segment, CrawlDatum.GENERATE_DIR_NAME);
  FileOutputFormat.setOutputPath(job, output);

  RunningJob r = null;
  try {
    r = JobClient.runJob(job);
  } catch (IOException e) {
    throw e;
  }
  return r;
}