List of usage examples for org.apache.hadoop.mapred.JobConf setLong
public void setLong(String name, long value)
Sets the value of the name property to a long.
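For orientation before the project examples below, here is a minimal, self-contained sketch of the call; the property name "my.sample.threshold" and the values are illustrative assumptions, not taken from any of the projects listed here.

import org.apache.hadoop.mapred.JobConf;

public class SetLongExample {
  public static void main(String[] args) {
    JobConf job = new JobConf();

    // Store a long-valued property under an (illustrative) name.
    job.setLong("my.sample.threshold", 128L * 1024 * 1024);

    // Tasks typically read the value back with getLong, supplying a default.
    long threshold = job.getLong("my.sample.threshold", 0L);
    System.out.println("threshold = " + threshold);
  }
}

In the examples that follow, the same pattern is used to push split sizes, block sizes, record counts, and timestamps from the job driver into map and reduce tasks.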
. From source file:edu.umn.cs.spatialHadoop.operations.Within.java
License:Open Source License
public static <S extends Shape> long within(Path[] inFiles, Path userOutputPath, OperationsParams params)
    throws IOException, InterruptedException {
  JobConf job = new JobConf(params, Within.class);

  LOG.info("Within journey starts ....");
  FileSystem inFs = inFiles[0].getFileSystem(job);
  Path outputPath = userOutputPath;
  if (outputPath == null) {
    FileSystem outFs = FileSystem.get(job);
    do {
      outputPath = new Path(inFiles[0].getName() + ".sjmr_" + (int) (Math.random() * 1000000));
    } while (outFs.exists(outputPath));
  }
  FileSystem outFs = outputPath.getFileSystem(job);

  ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
  job.setJobName("Within");
  job.setMapperClass(WithinMap.class);
  job.setMapOutputKeyClass(IntWritable.class);
  job.setMapOutputValueClass(IndexedText.class);
  job.setNumMapTasks(5 * Math.max(1, clusterStatus.getMaxMapTasks()));
  job.setLong("mapred.min.split.size",
      Math.max(inFs.getFileStatus(inFiles[0]).getBlockSize(), inFs.getFileStatus(inFiles[1]).getBlockSize()));

  job.setReducerClass(WithinReduce.class);
  job.setNumReduceTasks(Math.max(1, clusterStatus.getMaxReduceTasks()));

  job.setInputFormat(ShapeLineInputFormat.class);
  if (job.getBoolean("output", true))
    job.setOutputFormat(TextOutputFormat.class);
  else
    job.setOutputFormat(NullOutputFormat.class);

  ShapeLineInputFormat.setInputPaths(job, inFiles);

  // Calculate and set the dimensions of the grid to use in the map phase
  long total_size = 0;
  Rectangle mbr = new Rectangle(Double.MAX_VALUE, Double.MAX_VALUE, -Double.MAX_VALUE, -Double.MAX_VALUE);
  for (Path file : inFiles) {
    FileSystem fs = file.getFileSystem(params);
    Rectangle file_mbr = FileMBR.fileMBR(file, params);
    mbr.expand(file_mbr);
    total_size += FileUtil.getPathSize(fs, file);
  }
  // If the largest file is globally indexed, use its partitions
  total_size += total_size * job.getFloat(SpatialSite.INDEXING_OVERHEAD, 0.2f);
  int sjmrPartitioningGridFactor = params.getInt(PartitioiningFactor, 20);
  int num_cells = (int) Math.max(1,
      total_size * sjmrPartitioningGridFactor / outFs.getDefaultBlockSize(outputPath));
  LOG.info("Number of cells is configured to be " + num_cells);

  OperationsParams.setInactiveModeFlag(job, InactiveMode, isReduceInactive);
  OperationsParams.setJoiningThresholdPerOnce(job, JoiningThresholdPerOnce, joiningThresholdPerOnce);
  OperationsParams.setFilterOnlyModeFlag(job, isFilterOnlyMode, isFilterOnly);

  GridInfo gridInfo = new GridInfo(mbr.x1, mbr.y1, mbr.x2, mbr.y2);
  gridInfo.calculateCellDimensions(num_cells);
  OperationsParams.setShape(job, PartitionGrid, gridInfo);

  TextOutputFormat.setOutputPath(job, outputPath);

  if (OperationsParams.isLocal(job, inFiles)) {
    // Enforce local execution if explicitly set by user or for small files
    job.set("mapred.job.tracker", "local");
  }

  // Start the job
  RunningJob runningJob = JobClient.runJob(job);
  Counters counters = runningJob.getCounters();
  Counter outputRecordCounter = counters.findCounter(Task.Counter.REDUCE_OUTPUT_RECORDS);
  final long resultCount = outputRecordCounter.getValue();

  return resultCount;
}
From source file:edu.umn.cs.spatialHadoop.temporal.RepartitionTemporal.java
License:Apache License
public static void repartitionMapReduce(Path[] inputPaths, Path outputPath, Shape stockShape, long blockSize,
    CellInfo[] cellInfos, String sindex, boolean overwrite) throws IOException {
  JobConf job = new JobConf(Repartition.class);
  job.setJobName("RepartitionTemporal");
  FileSystem outFs = outputPath.getFileSystem(job);

  // Overwrite output file
  if (outFs.exists(outputPath)) {
    if (overwrite)
      outFs.delete(outputPath, true);
    else
      throw new RuntimeException(
          "Output file '" + outputPath + "' already exists and overwrite flag is not set");
  }

  // Decide which map function to use depending on the type of global index
  if (sindex.equals("rtree") || sindex.equals("str")) {
    // Repartition without replication
    job.setMapperClass(RepartitionMapNoReplication.class);
  } else {
    // Repartition with replication (grid and r+tree)
    job.setMapperClass(RepartitionMap.class);
  }
  job.setMapOutputKeyClass(IntWritable.class);
  job.setMapOutputValueClass(stockShape.getClass());

  CombinedSpatialInputFormat.setInputPaths(job, inputPaths);
  job.setInputFormat(CombinedSpatialInputFormat.class);

  ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
  job.setNumMapTasks(10 * Math.max(1, clusterStatus.getMaxMapTasks()));

  FileOutputFormat.setOutputPath(job, outputPath);
  if (sindex.equals("grid") || sindex.equals("str") || sindex.equals("str+")) {
    job.setOutputFormat(GridOutputFormat.class);
  } else if (sindex.equals("rtree") || sindex.equals("r+tree")) {
    // For now, the two types of local index are the same
    job.setOutputFormat(RTreeGridOutputFormat.class);
  } else {
    throw new RuntimeException("Unsupported spatial index: " + sindex);
  }

  SpatialSite.setCells(job, cellInfos);
  job.setBoolean(SpatialSite.OVERWRITE, overwrite);

  // Set reduce function
  job.setReducerClass(RepartitionReduce.class);
  job.setNumReduceTasks(
      Math.max(1, Math.min(cellInfos.length, (clusterStatus.getMaxReduceTasks() * 9 + 5) / 10)));

  // Set output committer that combines output files together
  job.setOutputCommitter(RepartitionOutputCommitter.class);

  if (blockSize != 0) {
    job.setLong("dfs.block.size", blockSize);
    job.setLong("fs.local.block.size", blockSize);
  }

  JobClient.runJob(job);
}
From source file:fr.ens.biologie.genomique.eoulsan.modules.mgmt.hadoop.DistCp.java
License:LGPL
/**
 * Initialize DFSCopyFileMapper specific job-configuration.
 * @param conf : The dfs/mapred configuration.
 * @param jobConf : The handle to the jobConf object to be initialized.
 * @param args Arguments
 */
private static void setup(final Configuration conf, final JobConf jobConf, final Arguments args)
    throws IOException {
  jobConf.set(DST_DIR_LABEL, args.dst.toUri().toString());

  // set boolean values
  final boolean update = args.flags.contains(Options.UPDATE);
  final boolean overwrite = !update && args.flags.contains(Options.OVERWRITE);
  jobConf.setBoolean(Options.UPDATE.propertyname, update);
  jobConf.setBoolean(Options.OVERWRITE.propertyname, overwrite);
  jobConf.setBoolean(Options.IGNORE_READ_FAILURES.propertyname,
      args.flags.contains(Options.IGNORE_READ_FAILURES));
  jobConf.setBoolean(Options.PRESERVE_STATUS.propertyname, args.flags.contains(Options.PRESERVE_STATUS));

  final String randomId = getRandomId();
  JobClient jClient = new JobClient(jobConf);
  Path jobDirectory = new Path(jClient.getSystemDir(), NAME + "_" + randomId);
  jobConf.set(JOB_DIR_LABEL, jobDirectory.toString());
  long maxBytesPerMap = conf.getLong(BYTES_PER_MAP_LABEL, BYTES_PER_MAP);

  FileSystem dstfs = args.dst.getFileSystem(conf);
  boolean dstExists = dstfs.exists(args.dst);
  boolean dstIsDir = false;
  if (dstExists) {
    dstIsDir = dstfs.getFileStatus(args.dst).isDir();
  }

  // default logPath
  Path logPath = args.log;
  if (logPath == null) {
    String filename = "_distcp_logs_" + randomId;
    if (!dstExists || !dstIsDir) {
      Path parent = args.dst.getParent();
      if (null == parent) {
        // If dst is '/' on S3, it might not exist yet, but dst.getParent()
        // will return null. In this case, use '/' as its own parent to prevent
        // NPE errors below.
        parent = args.dst;
      }
      if (!dstfs.exists(parent)) {
        dstfs.mkdirs(parent);
      }
      logPath = new Path(parent, filename);
    } else {
      logPath = new Path(args.dst, filename);
    }
  }
  FileOutputFormat.setOutputPath(jobConf, logPath);

  // create src list, dst list
  FileSystem jobfs = jobDirectory.getFileSystem(jobConf);

  Path srcfilelist = new Path(jobDirectory, "_distcp_src_files");
  jobConf.set(SRC_LIST_LABEL, srcfilelist.toString());
  SequenceFile.Writer src_writer = SequenceFile.createWriter(jobfs, jobConf, srcfilelist, LongWritable.class,
      FilePair.class, SequenceFile.CompressionType.NONE);

  Path dstfilelist = new Path(jobDirectory, "_distcp_dst_files");
  SequenceFile.Writer dst_writer = SequenceFile.createWriter(jobfs, jobConf, dstfilelist, Text.class, Text.class,
      SequenceFile.CompressionType.NONE);

  Path dstdirlist = new Path(jobDirectory, "_distcp_dst_dirs");
  jobConf.set(DST_DIR_LIST_LABEL, dstdirlist.toString());
  SequenceFile.Writer dir_writer = SequenceFile.createWriter(jobfs, jobConf, dstdirlist, Text.class,
      FilePair.class, SequenceFile.CompressionType.NONE);

  // handle the case where the destination directory doesn't exist
  // and we've only a single src directory OR we're updating/overwriting
  // the contents of the destination directory.
  final boolean special = (args.srcs.size() == 1 && !dstExists) || update || overwrite;
  int srcCount = 0, cnsyncf = 0, dirsyn = 0;
  long fileCount = 0L, byteCount = 0L, cbsyncs = 0L;
  try {
    for (Iterator<Path> srcItr = args.srcs.iterator(); srcItr.hasNext();) {
      final Path src = srcItr.next();

      FileSystem srcfs = src.getFileSystem(conf);
      FileStatus srcfilestat = srcfs.getFileStatus(src);
      Path root = special && srcfilestat.isDir() ? src : src.getParent();
      if (srcfilestat.isDir()) {
        ++srcCount;
      }

      Stack<FileStatus> pathstack = new Stack<>();
      for (pathstack.push(srcfilestat); !pathstack.empty();) {
        FileStatus cur = pathstack.pop();
        FileStatus[] children = srcfs.listStatus(cur.getPath());
        for (int i = 0; i < children.length; i++) {
          boolean skipfile = false;
          final FileStatus child = children[i];
          final String dst = makeRelative(root, child.getPath());
          ++srcCount;

          if (child.isDir()) {
            pathstack.push(child);
          } else {
            // skip file if the src and the dst files are the same.
            skipfile = update && sameFile(srcfs, child, dstfs, new Path(args.dst, dst));
            // skip file if it exceed file limit or size limit
            skipfile |= fileCount == args.filelimit || byteCount + child.getLen() > args.sizelimit;

            if (!skipfile) {
              ++fileCount;
              byteCount += child.getLen();

              // if (LOG.isTraceEnabled()) {
              //   LOG.trace("adding file " + child.getPath());
              // }

              ++cnsyncf;
              cbsyncs += child.getLen();
              if (cnsyncf > SYNC_FILE_MAX || cbsyncs > maxBytesPerMap) {
                src_writer.sync();
                dst_writer.sync();
                cnsyncf = 0;
                cbsyncs = 0L;
              }
            }
          }

          if (!skipfile) {
            src_writer.append(new LongWritable(child.isDir() ? 0 : child.getLen()), new FilePair(child, dst));
          }

          dst_writer.append(new Text(dst), new Text(child.getPath().toString()));
        }

        if (cur.isDir()) {
          String dst = makeRelative(root, cur.getPath());
          dir_writer.append(new Text(dst), new FilePair(cur, dst));
          if (++dirsyn > SYNC_FILE_MAX) {
            dirsyn = 0;
            dir_writer.sync();
          }
        }
      }
    }
  } finally {
    checkAndClose(src_writer);
    checkAndClose(dst_writer);
    checkAndClose(dir_writer);
  }

  FileStatus dststatus = null;
  try {
    dststatus = dstfs.getFileStatus(args.dst);
  } catch (FileNotFoundException fnfe) {
    getLogger().info(args.dst + " does not exist.");
  }

  // create dest path dir if copying > 1 file
  if (dststatus == null) {
    if (srcCount > 1 && !dstfs.mkdirs(args.dst)) {
      throw new IOException("Failed to create" + args.dst);
    }
  }

  final Path sorted = new Path(jobDirectory, "_distcp_sorted");
  checkDuplication(jobfs, dstfilelist, sorted, conf);

  if (dststatus != null && args.flags.contains(Options.DELETE)) {
    deleteNonexisting(dstfs, dststatus, sorted, jobfs, jobDirectory, jobConf, conf);
  }

  Path tmpDir = new Path(
      (dstExists && !dstIsDir) || (!dstExists && srcCount == 1) ? args.dst.getParent() : args.dst,
      "_distcp_tmp_" + randomId);
  jobConf.set(TMP_DIR_LABEL, tmpDir.toUri().toString());

  // Explicitly create the tmpDir to ensure that it can be cleaned
  // up by fullyDelete() later.
  tmpDir.getFileSystem(conf).mkdirs(tmpDir);

  getLogger().info("srcCount=" + srcCount);
  jobConf.setInt(SRC_COUNT_LABEL, srcCount);
  jobConf.setLong(TOTAL_SIZE_LABEL, byteCount);
  setMapCount(byteCount, jobConf);
}
From source file:hibench.HtmlConf.java
License:Apache License
public void setJobConf(JobConf job) {
  job.setInt("agents", agents);
  job.setLong("pages", pages);
  job.setLong("agentpages", agentpages);

  long vLinkElems = (null == linkZipf) ? -1 : linkZipf.velems;
  long vWordElems = (null == wordZipf) ? -1 : wordZipf.velems;

  job.setLong("vLinkElems", vLinkElems);
  job.setLong("vWordElems", vWordElems);
}
From source file:hibench.VisitConf.java
License:Apache License
public void setJobConf(JobConf job) {
  job.setInt("agents", agents);
  job.setLong("visits", visits);
  job.setLong("agentvisits", agentvisits);
  job.setLong("pages", pages);
}
From source file:hibench.ZipfRandom.java
License:Apache License
public void setJobConf(JobConf job) {
  job.setInt("agents", agents);
  job.setLong("elems", elems);
  job.setLong("agentelems", agentelems);
  job.setFloat("exponent", (float) exponent);
  job.setFloat("scale", (float) scale);
}
From source file:io.fluo.stress.trie.Generate.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
  if (args.length != 4) {
    log.error("Usage: " + this.getClass().getSimpleName()
        + " <numMappers> <numbersPerMapper> <max> <output dir>");
    System.exit(-1);
  }

  int numMappers = Integer.parseInt(args[0]);
  int numPerMapper = Integer.parseInt(args[1]);
  long max = Long.parseLong(args[2]);
  Path out = new Path(args[3]);

  Preconditions.checkArgument(numMappers > 0, "numMappers <= 0");
  Preconditions.checkArgument(numPerMapper > 0, "numPerMapper <= 0");
  Preconditions.checkArgument(max > 0, "max <= 0");

  JobConf job = new JobConf(getConf());

  job.setJobName(this.getClass().getName());
  job.setJarByClass(Generate.class);

  job.setInt(TRIE_GEN_NUM_PER_MAPPER_PROP, numPerMapper);
  job.setInt(TRIE_GEN_NUM_MAPPERS_PROP, numMappers);
  job.setLong(TRIE_GEN_MAX_PROP, max);

  job.setInputFormat(RandomLongInputFormat.class);

  job.setNumReduceTasks(0);

  job.setOutputKeyClass(LongWritable.class);
  job.setOutputValueClass(NullWritable.class);

  job.setOutputFormat(SequenceFileOutputFormat.class);
  SequenceFileOutputFormat.setOutputPath(job, out);

  RunningJob runningJob = JobClient.runJob(job);
  runningJob.waitForCompletion();

  return runningJob.isSuccessful() ? 0 : -1;
}
From source file:ivory.core.index.MergeGlobalStatsAcrossIndexSegments.java
License:Apache License
public int runTool() throws Exception {
  JobConf conf = new JobConf(getConf(), MergeGlobalStatsAcrossIndexSegments.class);
  FileSystem fs = FileSystem.get(conf);

  String collectionName = conf.get("Ivory.CollectionName");
  String indexPaths = conf.get("Ivory.IndexPaths");
  String dataOutputPath = conf.get("Ivory.DataOutputPath");
  int dfThreshold = conf.getInt("Ivory.DfThreshold", 0);

  // first, compute size of global term space
  Path tmpPaths = new Path("/tmp/index-paths.txt");
  FSDataOutputStream out = fs.create(tmpPaths, true);
  for (String s : indexPaths.split(",")) {
    out.write(new String(s + "\n").getBytes());
  }
  out.close();

  LOG.info("Job: ComputeNumberOfTermsAcrossIndexSegments");
  conf.setJobName("ComputeNumberOfTermsAcrossIndexSegments:" + collectionName);

  FileInputFormat.addInputPath(conf, tmpPaths);

  conf.setNumMapTasks(1);
  conf.setNumReduceTasks(1);

  conf.set("mapred.child.java.opts", "-Xmx2048m");

  conf.setInputFormat(NLineInputFormat.class);
  conf.setOutputKeyClass(Text.class);
  conf.setOutputValueClass(PairOfIntLong.class);
  conf.setOutputFormat(NullOutputFormat.class);
  conf.setMapperClass(MyMapper.class);
  conf.setReducerClass(IdentityReducer.class);

  long startTime = System.currentTimeMillis();
  RunningJob job = JobClient.runJob(conf);
  LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

  Counters counters = job.getCounters();
  long totalNumTerms = counters
      .findCounter("org.apache.hadoop.mapred.Task$Counter", 6, "REDUCE_INPUT_GROUPS").getCounter();

  LOG.info("total number of terms in global dictionary = " + totalNumTerms);

  // now build the dictionary
  fs.delete(new Path(dataOutputPath), true);

  conf = new JobConf(getConf(), MergeGlobalStatsAcrossIndexSegments.class);

  LOG.info("Job: MergeGlobalStatsAcrossIndexSegments");
  conf.setJobName("MergeGlobalStatsAcrossIndexSegments:" + collectionName);

  FileInputFormat.addInputPath(conf, tmpPaths);

  conf.setNumMapTasks(1);
  conf.setNumReduceTasks(1);

  conf.set("mapred.child.java.opts", "-Xmx2048m");

  conf.setInputFormat(NLineInputFormat.class);
  conf.setOutputKeyClass(Text.class);
  conf.setOutputValueClass(PairOfIntLong.class);
  conf.setOutputFormat(NullOutputFormat.class);
  conf.setMapperClass(MyMapper.class);
  conf.setReducerClass(MyReducer.class);

  conf.setLong("Ivory.IndexNumberOfTerms", (int) totalNumTerms);

  startTime = System.currentTimeMillis();
  job = JobClient.runJob(conf);
  LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

  // compute some # docs, collection length, avg doc length
  long collectionLength = 0;
  int docCount = 0;
  for (String index : indexPaths.split(",")) {
    LOG.info("reading stats for " + index);

    RetrievalEnvironment env = new RetrievalEnvironment(index, fs);
    long l = env.readCollectionLength();
    int n = env.readCollectionDocumentCount();

    LOG.info(" - CollectionLength: " + l);
    LOG.info(" - CollectionDocumentCount: " + n);

    collectionLength += l;
    docCount += n;
  }

  float avgdl = (float) collectionLength / docCount;

  LOG.info("all index segments: ");
  LOG.info(" - CollectionLength: " + collectionLength);
  LOG.info(" - CollectionDocumentCount: " + docCount);
  LOG.info(" - AverageDocumentLenght: " + avgdl);

  RetrievalEnvironment env = new RetrievalEnvironment(dataOutputPath, fs);
  env.writeCollectionAverageDocumentLength(avgdl);
  env.writeCollectionLength(collectionLength);
  env.writeCollectionDocumentCount(docCount);

  return 0;
}
From source file:job.uncombine.compressed.BigBuildInvertedIndex.java
License:Apache License
/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
  //long GB = 1024 * 1024 * 1024;
  //long totalDataSize = 1 * GB;

  int reduceNumArray[] = { 9, 18 };
  int splitSizeMBArray[] = { 64, 128, 256 };
  int xmxArray[] = { 1000, 2000, 3000, 4000 };
  int xmsArray[] = { 0, 1 };
  int ismbArray[] = { 200, 400, 600, 800 };

  for (int splitIndex = 0; splitIndex < splitSizeMBArray.length; splitIndex++) {
    for (int reduceNumIndex = 0; reduceNumIndex < reduceNumArray.length; reduceNumIndex++) {
      for (int xmxIndex = 0; xmxIndex < xmxArray.length; xmxIndex++) {
        for (int xmsIndex = 0; xmsIndex < xmsArray.length; xmsIndex++) {
          for (int ismbIndex = 0; ismbIndex < ismbArray.length; ismbIndex++) {

            int reduceNum = reduceNumArray[reduceNumIndex];
            int splitMB = splitSizeMBArray[splitIndex];
            int xmx = xmxArray[xmxIndex];
            int xms = xmsArray[xmsIndex] * xmx;
            int ismb = ismbArray[ismbIndex];

            JobConf conf = new JobConf(getConf(), BigBuildInvertedIndex.class);

            conf.setLong("mapred.min.split.size", SplitTable.getMapred_min_split_size(splitMB));
            conf.setLong("mapred.max.split.size", SplitTable.getMapred_max_split_size(splitMB));
            //conf.setInt("my.sample.split.num", (int) (totalDataSize / (splitMB * 1024 * 1024)));
            conf.setInt("mapred.reduce.tasks", reduceNum);
            conf.setInt("io.sort.mb", ismb);

            if (xms == 0)
              conf.set("mapred.child.java.opts", "-Xmx" + xmx + "m");
            else
              conf.set("mapred.child.java.opts", "-Xmx" + xmx + "m -Xms" + xms + "m");

            conf.setInt("child.monitor.metrics.seconds", 2);
            conf.setInt("child.monitor.jvm.seconds", 2);
            conf.setInt("child.monitor.jstat.seconds", 2);

            conf.setJobName("BigBuildInvertedIndex " + splitMB + "MB " + conf.get("mapred.child.java.opts")
                + " ismb=" + ismb + " RN=" + reduceNum);

            String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
            if (otherArgs.length != 2) {
              System.err.println("Usage: BigBuildInvertedIndex <in> <out>");
              System.exit(2);
            }

            conf.setMapOutputKeyClass(Text.class);
            conf.setMapOutputValueClass(PairOfInts.class);
            conf.setOutputKeyClass(Text.class);
            conf.setOutputValueClass(PairOfWritables.class);
            SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK);
            conf.setOutputFormat(MapFileOutputFormat.class);

            conf.setMapperClass(MyMapper.class);
            // conf.setCombinerClass(IdentityReducer.class);
            conf.setReducerClass(MyReducer.class);

            FileInputFormat.setInputPaths(conf, new Path(otherArgs[0]));
            FileOutputFormat.setOutputPath(conf, new Path(otherArgs[1]));

            FileSystem.get(conf).delete(new Path(otherArgs[1]), true);

            try {
              JobClient.runJob(conf);
            } catch (IOException e) {
              e.printStackTrace();
            }

            Thread.sleep(15000);
          }
        }
      }
    }
  }

  return 0;
}
From source file:net.peacesoft.nutch.crawl.ReGenerator.java
License:Apache License
/**
 * Generate fetchlists in one or more segments. Whether to filter URLs or
 * not is read from the crawl.generate.filter property in the configuration
 * files. If the property is not found, the URLs are filtered. Same for the
 * normalisation.
 *
 * @param dbDir Crawl database directory
 * @param segments Segments directory
 * @param numLists Number of reduce tasks
 * @param topN Number of top URLs to be selected
 * @param curTime Current time in milliseconds
 *
 * @return Path to generated segment or null if no entries were selected
 *
 * @throws IOException When an I/O error occurs
 */
public Path[] generate(Path dbDir, Path segments, int numLists, long topN, long curTime, boolean filter,
    boolean norm, boolean force, int maxNumSegments) throws IOException {
  try {
    Path tempDir = new Path(
        getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());

    Path lock = new Path(dbDir, CrawlDb.LOCK_NAME);
    FileSystem fs = FileSystem.get(getConf());
    LockUtil.createLockFile(fs, lock, force);

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("ReGenerator: starting at " + sdf.format(start));
    LOG.info("ReGenerator: Selecting best-scoring urls due for fetch.");
    LOG.info("ReGenerator: filtering: " + filter);
    LOG.info("ReGenerator: normalizing: " + norm);
    if (topN != Long.MAX_VALUE) {
      LOG.info("ReGenerator: topN: " + topN);
    }

    if ("true".equals(getConf().get(GENERATE_MAX_PER_HOST_BY_IP))) {
      LOG.info("ReGenerator: GENERATE_MAX_PER_HOST_BY_IP will be ignored, use partition.url.mode instead");
    }

    // map to inverted subset due for fetch, sort by score
    JobConf job = new NutchJob(getConf());
    job.setJobName("generate: select from " + dbDir);

    if (numLists == -1) { // for politeness make
      numLists = job.getNumMapTasks(); // a partition per fetch task
    }
    if ("local".equals(job.get("mapred.job.tracker")) && numLists != 1) { // override
      LOG.info("ReGenerator: jobtracker is 'local', generating exactly one partition.");
      numLists = 1;
    }
    job.setLong(GENERATOR_CUR_TIME, curTime);
    // record real generation time
    long generateTime = System.currentTimeMillis();
    job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
    job.setLong(GENERATOR_TOP_N, topN);
    job.setBoolean(GENERATOR_FILTER, filter);
    job.setBoolean(GENERATOR_NORMALISE, norm);
    job.setInt(GENERATOR_MAX_NUM_SEGMENTS, maxNumSegments);

    FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
    job.setInputFormat(SequenceFileInputFormat.class);

    job.setMapperClass(Selector.class);
    job.setPartitionerClass(Selector.class);
    job.setReducerClass(Selector.class);

    FileOutputFormat.setOutputPath(job, tempDir);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(FloatWritable.class);
    job.setOutputKeyComparatorClass(DecreasingFloatComparator.class);
    job.setOutputValueClass(SelectorEntry.class);
    job.setOutputFormat(GeneratorOutputFormat.class);

    try {
      JobClient.runJob(job);
    } catch (IOException e) {
      throw e;
    }

    // read the subdirectories generated in the temp
    // output and turn them into segments
    List<Path> generatedSegments = new ArrayList<Path>();

    FileStatus[] status = fs.listStatus(tempDir);
    try {
      for (FileStatus stat : status) {
        Path subfetchlist = stat.getPath();
        if (!subfetchlist.getName().startsWith("fetchlist-")) {
          continue;
        }
        // start a new partition job for this segment
        Path newSeg = partitionSegment(fs, segments, subfetchlist, numLists);
        generatedSegments.add(newSeg);
      }
    } catch (Exception e) {
      LOG.warn("ReGenerator: exception while partitioning segments, exiting ...");
      fs.delete(tempDir, true);
      return null;
    }

    if (generatedSegments.size() == 0) {
      LOG.warn("ReGenerator: 0 records selected for fetching, exiting ...");
      LockUtil.removeLockFile(fs, lock);
      fs.delete(tempDir, true);
      return null;
    }

    if (getConf().getBoolean(GENERATE_UPDATE_CRAWLDB, false)) {
      // update the db from tempDir
      Path tempDir2 = new Path(
          getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());

      job = new NutchJob(getConf());
      job.setJobName("generate: updatedb " + dbDir);
      job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
      for (Path segmpaths : generatedSegments) {
        Path subGenDir = new Path(segmpaths, CrawlDatum.GENERATE_DIR_NAME);
        FileInputFormat.addInputPath(job, subGenDir);
      }
      FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
      job.setInputFormat(SequenceFileInputFormat.class);
      job.setMapperClass(CrawlDbUpdater.class);
      job.setReducerClass(CrawlDbUpdater.class);
      job.setOutputFormat(MapFileOutputFormat.class);
      job.setOutputKeyClass(Text.class);
      job.setOutputValueClass(CrawlDatum.class);
      FileOutputFormat.setOutputPath(job, tempDir2);
      try {
        JobClient.runJob(job);
        CrawlDb.install(job, dbDir);
      } catch (IOException e) {
        LockUtil.removeLockFile(fs, lock);
        fs.delete(tempDir, true);
        fs.delete(tempDir2, true);
        throw e;
      }
      fs.delete(tempDir2, true);
    }

    LockUtil.removeLockFile(fs, lock);
    fs.delete(tempDir, true);

    long end = System.currentTimeMillis();
    LOG.info("ReGenerator: finished at " + sdf.format(end) + ", elapsed: "
        + TimingUtil.elapsedTime(start, end));

    Path[] patharray = new Path[generatedSegments.size()];
    return generatedSegments.toArray(patharray);
  } catch (Exception ex) {
    LOG.error("ReGenerator generate error: " + ex.toString(), ex);
    return null;
  }
}