List of usage examples for org.apache.hadoop.mapred JobConf setLong
public void setLong(String name, long value)
Sets the value of the name property to a long.
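Before the source-file listings, here is a minimal, self-contained sketch of the setLong/getLong round trip. The property name "my.sample.threshold" and the value are illustrative, not taken from any of the examples below.

import org.apache.hadoop.mapred.JobConf;

public class SetLongExample {
    public static void main(String[] args) {
        JobConf conf = new JobConf();
        // setLong stores the value as a string under the given key
        conf.setLong("my.sample.threshold", 9000000000L);
        // getLong parses it back; the second argument is the default if the key is unset
        long threshold = conf.getLong("my.sample.threshold", -1L);
        System.out.println(threshold); // prints 9000000000
    }
}

Values larger than Integer.MAX_VALUE survive the round trip because the property is parsed back as a long, which is exactly why the jobs below use setLong rather than setInt for timestamps and seeds.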
From source file:net.peacesoft.nutch.crawl.ReInjector.java
License:Apache License
public void inject(Path crawlDb, Path urlDir) {
    try {
        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        long start = System.currentTimeMillis();
        if (LOG.isInfoEnabled()) {
            LOG.info("Injector: starting at " + sdf.format(start));
            LOG.info("Injector: crawlDb: " + crawlDb);
            LOG.info("Injector: urlDir: " + urlDir);
        }
        Path tempDir = new Path(getConf().get("mapred.temp.dir", ".") + "/inject-temp-"
                + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

        // map text input file to a <url,CrawlDatum> file
        if (LOG.isInfoEnabled()) {
            LOG.info("Injector: Converting injected urls to crawl db entries.");
        }
        JobConf sortJob = new NutchJob(getConf());
        sortJob.setJobName("inject " + urlDir);
        FileInputFormat.addInputPath(sortJob, urlDir);
        sortJob.setMapperClass(ReInjector.InjectMapper.class);
        FileOutputFormat.setOutputPath(sortJob, tempDir);
        sortJob.setOutputFormat(SequenceFileOutputFormat.class);
        sortJob.setOutputKeyClass(Text.class);
        sortJob.setOutputValueClass(CrawlDatum.class);
        sortJob.setLong("injector.current.time", System.currentTimeMillis());
        JobClient.runJob(sortJob);

        // merge with existing crawl db
        if (LOG.isInfoEnabled()) {
            LOG.info("Injector: Merging injected urls into crawl db.");
        }
        JobConf mergeJob = CrawlDb.createJob(getConf(), crawlDb);
        FileInputFormat.addInputPath(mergeJob, tempDir);
        mergeJob.setReducerClass(ReInjector.InjectReducer.class);
        JobClient.runJob(mergeJob);
        CrawlDb.install(mergeJob, crawlDb);

        // clean up
        FileSystem fs = FileSystem.get(getConf());
        fs.delete(tempDir, true);
        long end = System.currentTimeMillis();
        LOG.info("Injector: finished at " + sdf.format(end) + ", elapsed: "
                + TimingUtil.elapsedTime(start, end));
    } catch (Exception ex) {
        LOG.error("ReInjector run injector error: " + ex.toString(), ex);
    }
}
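The timestamp written here with setLong("injector.current.time", ...) is meant to be read back on the task side, typically in a mapper's configure() hook via the matching getLong. A minimal sketch of that read side (the class below is illustrative, not Nutch's actual InjectMapper):

import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;

public class TimestampAwareMapper extends MapReduceBase
        implements Mapper<LongWritable, Text, Text, LongWritable> {

    private long curTime;

    @Override
    public void configure(JobConf job) {
        // Read-side counterpart of sortJob.setLong("injector.current.time", ...);
        // falls back to the task's own clock if the driver did not set the key.
        curTime = job.getLong("injector.current.time", System.currentTimeMillis());
    }

    @Override
    public void map(LongWritable key, Text value, OutputCollector<Text, LongWritable> output,
            Reporter reporter) throws IOException {
        // Tag every record with the single job-wide timestamp chosen by the driver.
        output.collect(value, new LongWritable(curTime));
    }
}

Setting the time once in the driver, rather than calling System.currentTimeMillis() inside each task, gives all injected records an identical timestamp.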
From source file:nl.tudelft.graphalytics.mapreducev2.evo.DirectedForestFireModelJob.java
License:Apache License
@Override
protected void setConfigurationParameters(JobConf jobConfiguration) {
    super.setConfigurationParameters(jobConfiguration);
    jobConfiguration.setLong(ForestFireModelUtils.MAX_ID, getParameters().getMaxId() + 1);
    jobConfiguration.setFloat(ForestFireModelUtils.P_RATIO, getParameters().getPRatio());
    jobConfiguration.setFloat(ForestFireModelUtils.R_RATIO, getParameters().getRRatio());
    jobConfiguration.set(ForestFireModelUtils.CURRENT_AMBASSADORS,
            ForestFireModelUtils.verticesIDsMap2String(burnedEdges));
    if (getIteration() == 1) {
        if (getNumMappers() > 0) {
            jobConfiguration.setInt(ForestFireModelUtils.NEW_VERTICES_NR,
                    getParameters().getNumNewVertices() / getNumMappers());
            jobConfiguration.setInt(ForestFireModelUtils.ID_SHIFT, getNumMappers());
        } else {
            jobConfiguration.setInt(ForestFireModelUtils.NEW_VERTICES_NR,
                    getParameters().getNumNewVertices());
            jobConfiguration.setInt(ForestFireModelUtils.ID_SHIFT, 1024 * 1024);
        }
        jobConfiguration.setBoolean(ForestFireModelUtils.IS_INIT, true);
    }
}
From source file:nl.tudelft.graphalytics.mapreducev2.evo.UndirectedForestFireModelJob.java
License:Apache License
@Override
protected void setConfigurationParameters(JobConf jobConfiguration) {
    super.setConfigurationParameters(jobConfiguration);
    jobConfiguration.setLong(ForestFireModelUtils.MAX_ID, getParameters().getMaxId() + 1);
    jobConfiguration.setFloat(ForestFireModelUtils.P_RATIO, getParameters().getPRatio());
    jobConfiguration.setFloat(ForestFireModelUtils.R_RATIO, getParameters().getRRatio());
    jobConfiguration.set(ForestFireModelUtils.CURRENT_AMBASSADORS,
            ForestFireModelUtils.verticesIDsMap2String(burnedEdges));
    if (getIteration() == 1) {
        if (getNumMappers() > 0) {
            jobConfiguration.setInt(ForestFireModelUtils.NEW_VERTICES_NR,
                    getParameters().getNumNewVertices() / getNumMappers());
            jobConfiguration.setInt(ForestFireModelUtils.ID_SHIFT, getNumMappers());
        } else {
            jobConfiguration.setInt(ForestFireModelUtils.NEW_VERTICES_NR,
                    getParameters().getNumNewVertices());
            jobConfiguration.setInt(ForestFireModelUtils.ID_SHIFT, 1024 * 1024);
        }
        jobConfiguration.setBoolean(ForestFireModelUtils.IS_INIT, true);
    } else if (getIteration() == getParameters().getMaxIterations() + 1) {
        jobConfiguration.setBoolean(ForestFireModelUtils.IS_FINAL, true);
    }
}
From source file:nthu.scopelab.tsqr.ssvd.itQJob.java
License:Apache License
public static void run(Configuration conf, Path[] inputPaths, String outputPath, String reduceSchedule,
        int k, int p, long seed, int mis)
        throws ClassNotFoundException, InterruptedException, IOException {
    String stages[] = reduceSchedule.split(",");
    String rinput = "";
    String routput = outputPath + "/iter-r-";

    for (int i = 0; i < stages.length; i++) {
        String thenumber = Integer.toString(i + 1);
        JobConf job = new JobConf(conf, itQJob.class);
        job.setJobName("itQ-job-" + thenumber);
        job.setInputFormat(SequenceFileInputFormat.class);
        job.setOutputFormat(SequenceFileOutputFormat.class);
        if (i == 0)
            job.setMapperClass(QMapper.class);
        else
            job.setMapperClass(IdentityMapper.class);
        job.setReducerClass(QReducer.class);
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(LMatrixWritable.class);

        FileSystem fs = FileSystem.get(job);
        Path Paths[];
        fileGather fgather = null;
        if (i == 0)
            fgather = new fileGather(inputPaths, "part", fs);
        else
            fgather = new fileGather(new Path(rinput), "part", fs);
        Paths = fgather.getPaths();
        mis = Checker.checkMis(mis, fgather.getInputSize(), fs);
        job.setNumMapTasks(fgather.recNumMapTasks(mis));
        job.setNumReduceTasks(Integer.parseInt(stages[i]));

        job.setInt(QRFirstJob.COLUMN_SIZE, k + p);
        job.setLong(PROP_OMEGA_SEED, seed);
        job.setInt(PROP_K, k);
        job.setInt(PROP_P, p);

        fs.delete(new Path(routput + thenumber), true);
        FileInputFormat.setInputPaths(job, Paths);
        FileOutputFormat.setOutputPath(job, new Path(routput + thenumber));
        //FileOutputFormat.setCompressOutput(job, true);
        //FileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);
        //SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

        // output first level Q
        MultipleOutputs.addNamedOutput(job, QF_MAT, SequenceFileOutputFormat.class, IntWritable.class,
                LMatrixWritable.class);

        RunningJob rj = JobClient.runJob(job);
        System.out.println("itQJob Job ID: " + rj.getJobID().toString());
        rinput = routput + thenumber;
    }
}
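Note the pattern behind job.setLong(PROP_OMEGA_SEED, seed): instead of shipping a random projection matrix to every task, only its seed is broadcast through the configuration, and each task regenerates the matrix deterministically. A hedged sketch of the read side, assuming the key name "ssvd.omega.seed" (the actual value of PROP_OMEGA_SEED is not shown in this listing, and the real QMapper may derive its random data differently):

import java.util.Random;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;

public class SeededTaskBase extends MapReduceBase {

    protected Random omegaRnd;

    @Override
    public void configure(JobConf job) {
        // Every task reads the same seed that the driver stored with setLong,
        // so all tasks reproduce an identical pseudo-random omega matrix locally.
        long seed = job.getLong("ssvd.omega.seed", 0L);
        omegaRnd = new Random(seed);
    }
}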
From source file:nthu.scopelab.tsqr.ssvd.QJob.java
License:Apache License
public static void run(Configuration conf, Path[] inputPaths, String outputPath, String reduceSchedule,
        int k, int p, long seed, int mis)
        throws ClassNotFoundException, InterruptedException, IOException {
    String stages[] = reduceSchedule.split(",");
    String rinput = "";
    String routput = outputPath + "/iter-r-";

    for (int i = 0; i < stages.length; i++) {
        String thenumber = Integer.toString(i + 1);
        JobConf job = new JobConf(conf, QJob.class);
        job.setJobName("Q-job-" + thenumber);
        job.setInputFormat(SequenceFileInputFormat.class);
        job.setOutputFormat(SequenceFileOutputFormat.class);
        if (i == 0)
            job.setMapperClass(QMapper.class);
        else
            job.setMapperClass(IdentityMapper.class);
        job.setReducerClass(QReducer.class);
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(LMatrixWritable.class);

        FileSystem fs = FileSystem.get(job);
        Path Paths[];
        fileGather fgather = null;
        if (i == 0)
            fgather = new fileGather(inputPaths, "part", fs);
        else
            fgather = new fileGather(new Path(rinput), "part", fs);
        Paths = fgather.getPaths();
        mis = Checker.checkMis(mis, fgather.getInputSize(), fs);
        job.setNumMapTasks(fgather.recNumMapTasks(mis));
        job.setNumReduceTasks(Integer.parseInt(stages[i]));

        job.setInt(QRFirstJob.COLUMN_SIZE, k + p);
        job.setLong(PROP_OMEGA_SEED, seed);
        job.setInt(PROP_K, k);
        job.setInt(PROP_P, p);

        fs.delete(new Path(routput + thenumber), true);
        FileInputFormat.setInputPaths(job, Paths);
        FileOutputFormat.setOutputPath(job, new Path(routput + thenumber));
        //FileOutputFormat.setCompressOutput(job, true);
        //FileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);
        //SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

        // output first level Q
        MultipleOutputs.addNamedOutput(job, QF_MAT, SequenceFileOutputFormat.class, IntWritable.class,
                LMatrixWritable.class);

        RunningJob rj = JobClient.runJob(job);
        System.out.println("QJob Job ID: " + rj.getJobID().toString());
        rinput = routput + thenumber;
    }
}
From source file:org.apache.nutch.crawl.CrawlDbReader.java
License:Apache License
public void processTopNJob(String crawlDb, long topN, float min, String output, Configuration config)
        throws IOException {

    if (LOG.isInfoEnabled()) {
        LOG.info("CrawlDb topN: starting (topN=" + topN + ", min=" + min + ")");
        LOG.info("CrawlDb db: " + crawlDb);
    }

    Path outFolder = new Path(output);
    Path tempDir = new Path(config.get("mapred.temp.dir", ".") + "/readdb-topN-temp-"
            + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    JobConf job = new NutchJob(config);
    job.setJobName("topN prepare " + crawlDb);
    FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(CrawlDbTopNMapper.class);
    job.setReducerClass(IdentityReducer.class);

    FileOutputFormat.setOutputPath(job, tempDir);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(FloatWritable.class);
    job.setOutputValueClass(Text.class);

    // XXX hmmm, no setFloat() in the API ... :(
    job.setLong("db.reader.topn.min", Math.round(1000000.0 * min));
    JobClient.runJob(job);

    if (LOG.isInfoEnabled()) {
        LOG.info("CrawlDb topN: collecting topN scores.");
    }
    job = new NutchJob(config);
    job.setJobName("topN collect " + crawlDb);
    job.setLong("db.reader.topn", topN);

    FileInputFormat.addInputPath(job, tempDir);
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(IdentityMapper.class);
    job.setReducerClass(CrawlDbTopNReducer.class);

    FileOutputFormat.setOutputPath(job, outFolder);
    job.setOutputFormat(TextOutputFormat.class);
    job.setOutputKeyClass(FloatWritable.class);
    job.setOutputValueClass(Text.class);

    job.setNumReduceTasks(1); // create a single file.

    JobClient.runJob(job);
    FileSystem fs = FileSystem.get(config);
    fs.delete(tempDir, true);
    if (LOG.isInfoEnabled()) {
        LOG.info("CrawlDb topN: done");
    }
}
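The "XXX" comment marks a workaround: lacking a setFloat() (per the comment, in the Hadoop API of the time), the float threshold min is scaled by 10^6, rounded, and stored as a long, so the read side has to undo the scaling. A minimal sketch of that decode, assuming it happens in the mapper's configure() (the class below is illustrative, not Nutch's actual CrawlDbTopNMapper):

import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;

public class TopNThresholdReader extends MapReduceBase {

    private float min;

    @Override
    public void configure(JobConf job) {
        // Undo the driver's Math.round(1000000.0 * min) encoding:
        // dividing the stored long by 1e6 recovers the float threshold.
        min = job.getLong("db.reader.topn.min", 0) / 1000000f;
    }
}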
From source file:org.apache.nutch.crawl.Generator.java
License:Apache License
/**
 * Generate fetchlists in one or more segments. Whether to filter URLs or not
 * is read from the crawl.generate.filter property in the configuration files.
 * If the property is not found, the URLs are filtered. Same for the
 * normalisation.
 *
 * @param dbDir
 *          Crawl database directory
 * @param segments
 *          Segments directory
 * @param numLists
 *          Number of reduce tasks
 * @param topN
 *          Number of top URLs to be selected
 * @param curTime
 *          Current time in milliseconds
 *
 * @return Path to generated segment or null if no entries were selected
 *
 * @throws IOException
 *           When an I/O error occurs
 */
public Path[] generate(Path dbDir, Path segments, int numLists, long topN, long curTime, boolean filter,
        boolean norm, boolean force, int maxNumSegments) throws IOException {

    Path tempDir = new Path(
            getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());

    Path lock = new Path(dbDir, CrawlDb.LOCK_NAME);
    FileSystem fs = FileSystem.get(getConf());
    LockUtil.createLockFile(fs, lock, force);

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("Generator: starting at " + sdf.format(start));
    LOG.info("Generator: Selecting best-scoring urls due for fetch.");
    LOG.info("Generator: filtering: " + filter);
    LOG.info("Generator: normalizing: " + norm);
    if (topN != Long.MAX_VALUE) {
        LOG.info("Generator: topN: " + topN);
    }

    if ("true".equals(getConf().get(GENERATE_MAX_PER_HOST_BY_IP))) {
        LOG.info("Generator: GENERATE_MAX_PER_HOST_BY_IP will be ignored, use partition.url.mode instead");
    }

    // map to inverted subset due for fetch, sort by score
    JobConf job = new NutchJob(getConf());
    job.setJobName("generate: select from " + dbDir);

    if (numLists == -1) { // for politeness make
        numLists = job.getNumMapTasks(); // a partition per fetch task
    }
    if ("local".equals(job.get("mapred.job.tracker")) && numLists != 1) {
        // override
        LOG.info("Generator: jobtracker is 'local', generating exactly one partition.");
        numLists = 1;
    }
    job.setLong(GENERATOR_CUR_TIME, curTime);
    // record real generation time
    long generateTime = System.currentTimeMillis();
    job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
    job.setLong(GENERATOR_TOP_N, topN);
    job.setBoolean(GENERATOR_FILTER, filter);
    job.setBoolean(GENERATOR_NORMALISE, norm);
    job.setInt(GENERATOR_MAX_NUM_SEGMENTS, maxNumSegments);

    FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
    job.setInputFormat(SequenceFileInputFormat.class);

    job.setMapperClass(Selector.class);
    job.setPartitionerClass(Selector.class);
    job.setReducerClass(Selector.class);

    FileOutputFormat.setOutputPath(job, tempDir);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(FloatWritable.class);
    job.setOutputKeyComparatorClass(DecreasingFloatComparator.class);
    job.setOutputValueClass(SelectorEntry.class);
    job.setOutputFormat(GeneratorOutputFormat.class);

    try {
        JobClient.runJob(job);
    } catch (IOException e) {
        throw e;
    }

    // read the subdirectories generated in the temp
    // output and turn them into segments
    List<Path> generatedSegments = new ArrayList<Path>();

    FileStatus[] status = fs.listStatus(tempDir);
    try {
        for (FileStatus stat : status) {
            Path subfetchlist = stat.getPath();
            if (!subfetchlist.getName().startsWith("fetchlist-"))
                continue;
            // start a new partition job for this segment
            Path newSeg = partitionSegment(fs, segments, subfetchlist, numLists);
            generatedSegments.add(newSeg);
        }
    } catch (Exception e) {
        LOG.warn("Generator: exception while partitioning segments, exiting ...");
        fs.delete(tempDir, true);
        return null;
    }

    if (generatedSegments.size() == 0) {
        LOG.warn("Generator: 0 records selected for fetching, exiting ...");
        LockUtil.removeLockFile(fs, lock);
        fs.delete(tempDir, true);
        return null;
    }

    if (getConf().getBoolean(GENERATE_UPDATE_CRAWLDB, false)) {
        // update the db from tempDir
        Path tempDir2 = new Path(
                getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());

        job = new NutchJob(getConf());
        job.setJobName("generate: updatedb " + dbDir);
        job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
        for (Path segmpaths : generatedSegments) {
            Path subGenDir = new Path(segmpaths, CrawlDatum.GENERATE_DIR_NAME);
            FileInputFormat.addInputPath(job, subGenDir);
        }
        FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
        job.setInputFormat(SequenceFileInputFormat.class);
        job.setMapperClass(CrawlDbUpdater.class);
        job.setReducerClass(CrawlDbUpdater.class);
        job.setOutputFormat(MapFileOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(CrawlDatum.class);
        FileOutputFormat.setOutputPath(job, tempDir2);
        try {
            JobClient.runJob(job);
            CrawlDb.install(job, dbDir);
        } catch (IOException e) {
            LockUtil.removeLockFile(fs, lock);
            fs.delete(tempDir, true);
            fs.delete(tempDir2, true);
            throw e;
        }
        fs.delete(tempDir2, true);
    }

    LockUtil.removeLockFile(fs, lock);
    fs.delete(tempDir, true);

    long end = System.currentTimeMillis();
    LOG.info("Generator: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));

    Path[] patharray = new Path[generatedSegments.size()];
    return generatedSegments.toArray(patharray);
}
From source file:org.apache.nutch.crawl.Injector.java
License:Apache License
public void inject(Path crawlDb, Path urlDir) throws IOException {
    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    if (LOG.isInfoEnabled()) {
        LOG.info("Injector: starting at " + sdf.format(start));
        LOG.info("Injector: crawlDb: " + crawlDb);
        LOG.info("Injector: urlDir: " + urlDir);
    }

    Path tempDir = new Path(getConf().get("mapred.temp.dir", ".") + "/inject-temp-"
            + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    // map text input file to a <url,CrawlDatum> file
    if (LOG.isInfoEnabled()) {
        LOG.info("Injector: Converting injected urls to crawl db entries.");
    }
    JobConf sortJob = new NutchJob(getConf());
    sortJob.setJobName("inject " + urlDir);
    FileInputFormat.addInputPath(sortJob, urlDir);
    sortJob.setMapperClass(InjectMapper.class);

    FileOutputFormat.setOutputPath(sortJob, tempDir);
    sortJob.setOutputFormat(SequenceFileOutputFormat.class);
    sortJob.setOutputKeyClass(Text.class);
    sortJob.setOutputValueClass(CrawlDatum.class);
    sortJob.setLong("injector.current.time", System.currentTimeMillis());
    JobClient.runJob(sortJob);

    // merge with existing crawl db
    if (LOG.isInfoEnabled()) {
        LOG.info("Injector: Merging injected urls into crawl db.");
    }
    JobConf mergeJob = CrawlDb.createJob(getConf(), crawlDb);
    FileInputFormat.addInputPath(mergeJob, tempDir);
    mergeJob.setReducerClass(InjectReducer.class);
    JobClient.runJob(mergeJob);
    CrawlDb.install(mergeJob, crawlDb);

    // clean up
    FileSystem fs = FileSystem.get(getConf());
    fs.delete(tempDir, true);
    long end = System.currentTimeMillis();
    LOG.info("Injector: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}
From source file:org.apache.nutch.scoring.webgraph.NodeDumper.java
License:Apache License
/**
 * Runs the process to dump the top urls out to a text file.
 *
 * @param webGraphDb The WebGraph from which to pull values.
 * @param topN
 * @param output
 *
 * @throws IOException If an error occurs while dumping the top values.
 */
public void dumpNodes(Path webGraphDb, DumpType type, long topN, Path output, boolean asEff,
        NameType nameType, AggrType aggrType, boolean asSequenceFile) throws Exception {

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("NodeDumper: starting at " + sdf.format(start));
    Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR);
    Configuration conf = getConf();

    JobConf dumper = new NutchJob(conf);
    dumper.setJobName("NodeDumper: " + webGraphDb);
    FileInputFormat.addInputPath(dumper, nodeDb);
    dumper.setInputFormat(SequenceFileInputFormat.class);

    if (nameType == null) {
        dumper.setMapperClass(Sorter.class);
        dumper.setReducerClass(Sorter.class);
        dumper.setMapOutputKeyClass(FloatWritable.class);
        dumper.setMapOutputValueClass(Text.class);
    } else {
        dumper.setMapperClass(Dumper.class);
        dumper.setReducerClass(Dumper.class);
        dumper.setMapOutputKeyClass(Text.class);
        dumper.setMapOutputValueClass(FloatWritable.class);
    }

    dumper.setOutputKeyClass(Text.class);
    dumper.setOutputValueClass(FloatWritable.class);
    FileOutputFormat.setOutputPath(dumper, output);

    if (asSequenceFile) {
        dumper.setOutputFormat(SequenceFileOutputFormat.class);
    } else {
        dumper.setOutputFormat(TextOutputFormat.class);
    }

    dumper.setNumReduceTasks(1);
    dumper.setBoolean("inlinks", type == DumpType.INLINKS);
    dumper.setBoolean("outlinks", type == DumpType.OUTLINKS);
    dumper.setBoolean("scores", type == DumpType.SCORES);

    dumper.setBoolean("host", nameType == NameType.HOST);
    dumper.setBoolean("domain", nameType == NameType.DOMAIN);
    dumper.setBoolean("sum", aggrType == AggrType.SUM);
    dumper.setBoolean("max", aggrType == AggrType.MAX);

    dumper.setLong("topn", topN);

    // Set equals-sign as separator for Solr's ExternalFileField
    if (asEff) {
        dumper.set("mapred.textoutputformat.separator", "=");
    }

    try {
        LOG.info("NodeDumper: running");
        JobClient.runJob(dumper);
    } catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
        throw e;
    }
    long end = System.currentTimeMillis();
    LOG.info("NodeDumper: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}
From source file:org.apache.nutch.segment.SegmentMerger.java
License:Apache License
public void merge(Path out, Path[] segs, boolean filter, boolean normalize, long slice) throws Exception {
    String segmentName = Generator.generateSegmentName();
    if (LOG.isInfoEnabled()) {
        LOG.info("Merging " + segs.length + " segments to " + out + "/" + segmentName);
    }
    JobConf job = new NutchJob(getConf());
    job.setJobName("mergesegs " + out + "/" + segmentName);
    job.setBoolean("segment.merger.filter", filter);
    job.setBoolean("segment.merger.normalizer", normalize);
    job.setLong("segment.merger.slice", slice);
    job.set("segment.merger.segmentName", segmentName);
    FileSystem fs = FileSystem.get(getConf());
    // prepare the minimal common set of input dirs
    boolean g = true;
    boolean f = true;
    boolean p = true;
    boolean c = true;
    boolean pd = true;
    boolean pt = true;
    for (int i = 0; i < segs.length; i++) {
        if (!fs.exists(segs[i])) {
            if (LOG.isWarnEnabled()) {
                LOG.warn("Input dir " + segs[i] + " doesn't exist, skipping.");
            }
            segs[i] = null;
            continue;
        }
        if (LOG.isInfoEnabled()) {
            LOG.info("SegmentMerger: adding " + segs[i]);
        }
        Path cDir = new Path(segs[i], Content.DIR_NAME);
        Path gDir = new Path(segs[i], CrawlDatum.GENERATE_DIR_NAME);
        Path fDir = new Path(segs[i], CrawlDatum.FETCH_DIR_NAME);
        Path pDir = new Path(segs[i], CrawlDatum.PARSE_DIR_NAME);
        Path pdDir = new Path(segs[i], ParseData.DIR_NAME);
        Path ptDir = new Path(segs[i], ParseText.DIR_NAME);
        c = c && fs.exists(cDir);
        g = g && fs.exists(gDir);
        f = f && fs.exists(fDir);
        p = p && fs.exists(pDir);
        pd = pd && fs.exists(pdDir);
        pt = pt && fs.exists(ptDir);
    }
    StringBuffer sb = new StringBuffer();
    if (c)
        sb.append(" " + Content.DIR_NAME);
    if (g)
        sb.append(" " + CrawlDatum.GENERATE_DIR_NAME);
    if (f)
        sb.append(" " + CrawlDatum.FETCH_DIR_NAME);
    if (p)
        sb.append(" " + CrawlDatum.PARSE_DIR_NAME);
    if (pd)
        sb.append(" " + ParseData.DIR_NAME);
    if (pt)
        sb.append(" " + ParseText.DIR_NAME);
    if (LOG.isInfoEnabled()) {
        LOG.info("SegmentMerger: using segment data from:" + sb.toString());
    }
    for (int i = 0; i < segs.length; i++) {
        if (segs[i] == null)
            continue;
        if (g) {
            Path gDir = new Path(segs[i], CrawlDatum.GENERATE_DIR_NAME);
            FileInputFormat.addInputPath(job, gDir);
        }
        if (c) {
            Path cDir = new Path(segs[i], Content.DIR_NAME);
            FileInputFormat.addInputPath(job, cDir);
        }
        if (f) {
            Path fDir = new Path(segs[i], CrawlDatum.FETCH_DIR_NAME);
            FileInputFormat.addInputPath(job, fDir);
        }
        if (p) {
            Path pDir = new Path(segs[i], CrawlDatum.PARSE_DIR_NAME);
            FileInputFormat.addInputPath(job, pDir);
        }
        if (pd) {
            Path pdDir = new Path(segs[i], ParseData.DIR_NAME);
            FileInputFormat.addInputPath(job, pdDir);
        }
        if (pt) {
            Path ptDir = new Path(segs[i], ParseText.DIR_NAME);
            FileInputFormat.addInputPath(job, ptDir);
        }
    }
    job.setInputFormat(ObjectInputFormat.class);
    job.setMapperClass(SegmentMerger.class);
    job.setReducerClass(SegmentMerger.class);
    FileOutputFormat.setOutputPath(job, out);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(MetaWrapper.class);
    job.setOutputFormat(SegmentOutputFormat.class);
    setConf(job);
    JobClient.runJob(job);
}