List of usage examples for org.apache.hadoop.mapred JobConf setBoolean
public void setBoolean(String name, boolean value)
Sets the value of the name property to a boolean.
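Before the project examples below, a minimal standalone sketch of the call. The property key "my.feature.enabled" and the class name are hypothetical, chosen only for illustration; any string key works, and getBoolean (which takes a default) reads the flag back.

import org.apache.hadoop.mapred.JobConf;

public class SetBooleanSketch {
  public static void main(String[] args) {
    JobConf conf = new JobConf();
    // Store a flag under a hypothetical property name.
    conf.setBoolean("my.feature.enabled", true);
    // Read it back; the second argument is the default returned when the property is unset.
    boolean enabled = conf.getBoolean("my.feature.enabled", false);
    System.out.println("my.feature.enabled = " + enabled);
  }
}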
From source file:org.apache.crunch.io.orc.OrcFileReaderFactory.java
License:Apache License
@Override
public Iterator<T> read(FileSystem fs, final Path path) {
  try {
    if (!fs.isFile(path)) {
      throw new CrunchRuntimeException("Not a file: " + path);
    }
    inputFn.initialize();
    FileStatus status = fs.getFileStatus(path);
    FileSplit split = new FileSplit(path, 0, status.getLen(), new String[0]);
    JobConf conf = new JobConf();
    if (readColumns != null) {
      conf.setBoolean(OrcFileSource.HIVE_READ_ALL_COLUMNS, false);
      conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, OrcFileSource.getColumnIdsStr(readColumns));
    }
    final RecordReader<NullWritable, OrcStruct> reader = inputFormat.getRecordReader(split, conf, Reporter.NULL);
    return new UnmodifiableIterator<T>() {
      private boolean checked = false;
      private boolean hasNext;
      private OrcStruct value;
      private OrcWritable writable = new OrcWritable();

      @Override
      public boolean hasNext() {
        try {
          if (value == null) {
            value = reader.createValue();
          }
          if (!checked) {
            hasNext = reader.next(NullWritable.get(), value);
            checked = true;
          }
          return hasNext;
        } catch (Exception e) {
          throw new CrunchRuntimeException("Error while reading local file: " + path, e);
        }
      }

      @Override
      public T next() {
        try {
          if (value == null) {
            value = reader.createValue();
          }
          if (!checked) {
            reader.next(NullWritable.get(), value);
          }
          checked = false;
          writable.set(value);
          return inputFn.map(writable);
        } catch (Exception e) {
          throw new CrunchRuntimeException("Error while reading local file: " + path, e);
        }
      }
    };
  } catch (Exception e) {
    throw new CrunchRuntimeException("Error while reading local file: " + path, e);
  }
}
From source file:org.apache.hive.streaming.TestStreaming.java
License:Apache License
/**
 * @deprecated use {@link #checkDataWritten2(Path, long, long, int, String, boolean, String...)} -
 * there is little value in using InputFormat directly
 */
@Deprecated
private void checkDataWritten(Path partitionPath, long minTxn, long maxTxn, int buckets, int numExpectedFiles,
    String... records) throws Exception {
  ValidWriteIdList writeIds = getTransactionContext(conf);
  AcidUtils.Directory dir = AcidUtils.getAcidState(partitionPath, conf, writeIds);
  Assert.assertEquals(0, dir.getObsolete().size());
  Assert.assertEquals(0, dir.getOriginalFiles().size());
  List<AcidUtils.ParsedDelta> current = dir.getCurrentDirectories();
  System.out.println("Files found: ");
  for (AcidUtils.ParsedDelta pd : current) {
    System.out.println(pd.getPath().toString());
  }
  Assert.assertEquals(numExpectedFiles, current.size());
  // find the absolute minimum transaction
  long min = Long.MAX_VALUE;
  long max = Long.MIN_VALUE;
  for (AcidUtils.ParsedDelta pd : current) {
    if (pd.getMaxWriteId() > max) {
      max = pd.getMaxWriteId();
    }
    if (pd.getMinWriteId() < min) {
      min = pd.getMinWriteId();
    }
  }
  Assert.assertEquals(minTxn, min);
  Assert.assertEquals(maxTxn, max);
  InputFormat inf = new OrcInputFormat();
  JobConf job = new JobConf();
  job.set("mapred.input.dir", partitionPath.toString());
  job.set(BUCKET_COUNT, Integer.toString(buckets));
  job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, "id,msg");
  job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, "bigint:string");
  AcidUtils.setAcidOperationalProperties(job, true, null);
  job.setBoolean(hive_metastoreConstants.TABLE_IS_TRANSACTIONAL, true);
  job.set(ValidWriteIdList.VALID_WRITEIDS_KEY, writeIds.writeToString());
  job.set(ValidTxnList.VALID_TXNS_KEY, conf.get(ValidTxnList.VALID_TXNS_KEY));
  InputSplit[] splits = inf.getSplits(job, buckets);
  Assert.assertEquals(numExpectedFiles, splits.length);
  org.apache.hadoop.mapred.RecordReader<NullWritable, OrcStruct> rr = inf.getRecordReader(splits[0], job,
      Reporter.NULL);
  NullWritable key = rr.createKey();
  OrcStruct value = rr.createValue();
  for (String record : records) {
    Assert.assertEquals(true, rr.next(key, value));
    Assert.assertEquals(record, value.toString());
  }
  Assert.assertEquals(false, rr.next(key, value));
}
From source file:org.apache.ignite.internal.processors.hadoop.impl.HadoopTeraSortTest.java
License:Apache License
/**
 * Does actual test TeraSort job through Ignite API.
 *
 * @param gzip Whether to use GZIP.
 */
protected final void teraSort(boolean gzip) throws Exception {
  System.out.println("TeraSort ===============================================================");
  getFileSystem().delete(new Path(sortOutDir), true);
  final JobConf jobConf = new JobConf();
  jobConf.setUser(getUser());
  jobConf.set("fs.defaultFS", getFsBase());
  log().info("Desired number of reduces: " + numReduces());
  jobConf.set("mapreduce.job.reduces", String.valueOf(numReduces()));
  log().info("Desired number of maps: " + numMaps());
  final long splitSize = dataSizeBytes() / numMaps();
  log().info("Desired split size: " + splitSize);
  // Force the split to be of the desired size:
  jobConf.set("mapred.min.split.size", String.valueOf(splitSize));
  jobConf.set("mapred.max.split.size", String.valueOf(splitSize));
  jobConf.setBoolean(HadoopJobProperty.SHUFFLE_MAPPER_STRIPED_OUTPUT.propertyName(), true);
  jobConf.setInt(HadoopJobProperty.SHUFFLE_MSG_SIZE.propertyName(), 4096);
  if (gzip)
    jobConf.setBoolean(HadoopJobProperty.SHUFFLE_MSG_GZIP.propertyName(), true);
  jobConf.set(HadoopJobProperty.JOB_PARTIALLY_RAW_COMPARATOR.propertyName(),
      TextPartiallyRawComparator.class.getName());
  Job job = setupConfig(jobConf);
  HadoopJobId jobId = new HadoopJobId(UUID.randomUUID(), 1);
  IgniteInternalFuture<?> fut = grid(0).hadoop().submit(jobId, createJobInfo(job.getConfiguration()));
  fut.get();
}
From source file:org.apache.mahout.avro.text.mapred.WikipediaToAvroDocuments.java
License:Apache License
/**
 * Run the job
 *
 * @param input
 *          the input pathname String
 * @param output
 *          the output pathname String
 * @param catFile
 *          the file containing the Wikipedia categories
 * @param exactMatchOnly
 *          if true, then the Wikipedia category must match exactly instead of
 *          simply containing the category string
 * @param all
 *          if true select all categories
 */
public static int runJob(String input, String output, String catFile, boolean exactMatchOnly, boolean all)
    throws IOException {
  JobClient client = new JobClient();
  JobConf conf = new JobConf(WikipediaToAvroDocuments.class);
  if (log.isInfoEnabled()) {
    log.info("Input: " + input + " Out: " + output + " Categories: " + catFile + " All Files: " + all);
  }
  Path inPath = new Path(input);
  Path outPath = new Path(output);
  FileInputFormat.setInputPaths(conf, inPath);
  FileOutputFormat.setOutputPath(conf, outPath);
  //AvroOutputFormat.setClass(conf, AvroDocument.class);
  //AvroOutputFormat.setSchema(conf, AvroDocument._SCHEMA);
  conf.set("xmlinput.start", "<page>");
  conf.set("xmlinput.end", "</page>");
  conf.setOutputKeyClass(Text.class);
  conf.setOutputValueClass(AvroDocument.class);
  conf.setBoolean("exact.match.only", exactMatchOnly);
  conf.setBoolean("all.files", all);
  conf.setMapperClass(WikipediaAvroDocumentMapper.class);
  conf.setInputFormat(XmlInputFormat.class);
  conf.setReducerClass(IdentityReducer.class);
  conf.setOutputFormat(AvroOutputFormat.class);
  AvroOutputFormat.setAvroOutputClass(conf, AvroDocument.class);
  FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
  if (dfs.exists(outPath)) {
    dfs.delete(outPath, true);
  }
  Set<String> categories = new HashSet<String>();
  if (catFile.equals("") == false) {
    for (String line : new FileLineIterable(new File(catFile))) {
      categories.add(line.trim().toLowerCase());
    }
  }
  DefaultStringifier<Set<String>> setStringifier = new DefaultStringifier<Set<String>>(conf,
      GenericsUtil.getClass(categories));
  String categoriesStr = setStringifier.toString(categories);
  conf.set("wikipedia.categories", categoriesStr);
  client.setConf(conf);
  RunningJob job = JobClient.runJob(conf);
  job.waitForCompletion();
  return job.isSuccessful() ? 1 : 0;
}
From source file:org.apache.mahout.math.hadoop.stochasticsvd.BtJob.java
License:Apache License
public static void run(Configuration conf, Path[] inputPathA, Path inputPathQJob, Path xiPath, Path outputPath,
    int minSplitSize, int k, int p, int btBlockHeight, int numReduceTasks, boolean broadcast,
    Class<? extends Writable> labelClass, boolean outputBBtProducts)
    throws ClassNotFoundException, InterruptedException, IOException {
  JobConf oldApiJob = new JobConf(conf);
  MultipleOutputs.addNamedOutput(oldApiJob, OUTPUT_Q,
      org.apache.hadoop.mapred.SequenceFileOutputFormat.class, labelClass, VectorWritable.class);
  if (outputBBtProducts) {
    MultipleOutputs.addNamedOutput(oldApiJob, OUTPUT_BBT,
        org.apache.hadoop.mapred.SequenceFileOutputFormat.class, IntWritable.class, VectorWritable.class);
    /*
     * MAHOUT-1067: if we are asked to output BBT products then named vector
     * names should be propagated to Q too so that UJob could pick them up
     * from there.
     */
    oldApiJob.setBoolean(PROP_NV, true);
  }
  if (xiPath != null) {
    // compute pca-related stuff as well
    MultipleOutputs.addNamedOutput(oldApiJob, OUTPUT_SQ,
        org.apache.hadoop.mapred.SequenceFileOutputFormat.class, IntWritable.class, VectorWritable.class);
    MultipleOutputs.addNamedOutput(oldApiJob, OUTPUT_SB,
        org.apache.hadoop.mapred.SequenceFileOutputFormat.class, IntWritable.class, VectorWritable.class);
  }
  /*
   * HACK: we use old api multiple outputs since they are not available in the
   * new api of either 0.20.2 or 0.20.203 but wrap it into a new api job so we
   * can use new api interfaces.
   */
  Job job = new Job(oldApiJob);
  job.setJobName("Bt-job");
  job.setJarByClass(BtJob.class);
  job.setInputFormatClass(SequenceFileInputFormat.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  FileInputFormat.setInputPaths(job, inputPathA);
  if (minSplitSize > 0) {
    FileInputFormat.setMinInputSplitSize(job, minSplitSize);
  }
  FileOutputFormat.setOutputPath(job, outputPath);
  // WARN: tight hadoop integration here:
  job.getConfiguration().set("mapreduce.output.basename", OUTPUT_BT);
  FileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);
  SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);
  job.setMapOutputKeyClass(LongWritable.class);
  job.setMapOutputValueClass(SparseRowBlockWritable.class);
  job.setOutputKeyClass(IntWritable.class);
  job.setOutputValueClass(VectorWritable.class);
  job.setMapperClass(BtMapper.class);
  job.setCombinerClass(OuterProductCombiner.class);
  job.setReducerClass(OuterProductReducer.class);
  job.getConfiguration().setInt(QJob.PROP_K, k);
  job.getConfiguration().setInt(QJob.PROP_P, p);
  job.getConfiguration().set(PROP_QJOB_PATH, inputPathQJob.toString());
  job.getConfiguration().setBoolean(PROP_OUPTUT_BBT_PRODUCTS, outputBBtProducts);
  job.getConfiguration().setInt(PROP_OUTER_PROD_BLOCK_HEIGHT, btBlockHeight);
  job.setNumReduceTasks(numReduceTasks);
  /*
   * PCA-related options, MAHOUT-817
   */
  if (xiPath != null) {
    job.getConfiguration().set(PROP_XI_PATH, xiPath.toString());
  }
  /*
   * We can broadcast Rhat files since all of them are required by each job,
   * but not Q files which correspond to splits of A (so each split of A will
   * require only a particular Q file, each time a different one).
   */
  if (broadcast) {
    job.getConfiguration().set(PROP_RHAT_BROADCAST, "y");
    FileSystem fs = FileSystem.get(inputPathQJob.toUri(), conf);
    FileStatus[] fstats = fs.globStatus(new Path(inputPathQJob, QJob.OUTPUT_RHAT + "-*"));
    if (fstats != null) {
      for (FileStatus fstat : fstats) {
        /*
         * new api is not enabled yet in our dependencies at this time, still
         * using deprecated one
         */
        DistributedCache.addCacheFile(fstat.getPath().toUri(), job.getConfiguration());
      }
    }
  }
  job.submit();
  job.waitForCompletion(false);
  if (!job.isSuccessful()) {
    throw new IOException("Bt job unsuccessful.");
  }
}
From source file:org.apache.nutch.crawl.CrawlDb.java
License:Apache License
public void update(Path crawlDb, Path[] segments, boolean normalize, boolean filter, boolean additionsAllowed,
    boolean force) throws IOException {
  FileSystem fs = FileSystem.get(getConf());
  Path lock = new Path(crawlDb, LOCK_NAME);
  LockUtil.createLockFile(fs, lock, force);
  SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  long start = System.currentTimeMillis();
  JobConf job = CrawlDb.createJob(getConf(), crawlDb);
  job.setBoolean(CRAWLDB_ADDITIONS_ALLOWED, additionsAllowed);
  job.setBoolean(CrawlDbFilter.URL_FILTERING, filter);
  job.setBoolean(CrawlDbFilter.URL_NORMALIZING, normalize);
  boolean url404Purging = job.getBoolean(CRAWLDB_PURGE_404, false);
  if (LOG.isInfoEnabled()) {
    LOG.info("CrawlDb update: starting at " + sdf.format(start));
    LOG.info("CrawlDb update: db: " + crawlDb);
    LOG.info("CrawlDb update: segments: " + Arrays.asList(segments));
    LOG.info("CrawlDb update: additions allowed: " + additionsAllowed);
    LOG.info("CrawlDb update: URL normalizing: " + normalize);
    LOG.info("CrawlDb update: URL filtering: " + filter);
    LOG.info("CrawlDb update: 404 purging: " + url404Purging);
  }
  for (int i = 0; i < segments.length; i++) {
    Path fetch = new Path(segments[i], CrawlDatum.FETCH_DIR_NAME);
    Path parse = new Path(segments[i], CrawlDatum.PARSE_DIR_NAME);
    if (fs.exists(fetch) && fs.exists(parse)) {
      FileInputFormat.addInputPath(job, fetch);
      FileInputFormat.addInputPath(job, parse);
    } else {
      LOG.info(" - skipping invalid segment " + segments[i]);
    }
  }
  if (LOG.isInfoEnabled()) {
    LOG.info("CrawlDb update: Merging segment data into db.");
  }
  try {
    JobClient.runJob(job);
  } catch (IOException e) {
    LockUtil.removeLockFile(fs, lock);
    Path outPath = FileOutputFormat.getOutputPath(job);
    if (fs.exists(outPath))
      fs.delete(outPath, true);
    throw e;
  }
  CrawlDb.install(job, crawlDb);
  long end = System.currentTimeMillis();
  LOG.info("CrawlDb update: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}
From source file:org.apache.nutch.crawl.CrawlDbMerger.java
License:Apache License
public static JobConf createMergeJob(Configuration conf, Path output, boolean normalize, boolean filter) {
  Path newCrawlDb = new Path("crawldb-merge-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
  JobConf job = new NutchJob(conf);
  job.setJobName("crawldb merge " + output);
  job.setInputFormat(SequenceFileInputFormat.class);
  job.setMapperClass(CrawlDbFilter.class);
  job.setBoolean(CrawlDbFilter.URL_FILTERING, filter);
  job.setBoolean(CrawlDbFilter.URL_NORMALIZING, normalize);
  job.setReducerClass(Merger.class);
  FileOutputFormat.setOutputPath(job, newCrawlDb);
  job.setOutputFormat(MapFileOutputFormat.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(CrawlDatum.class);
  return job;
}
From source file:org.apache.nutch.crawl.CrawlDbReader.java
License:Apache License
public void processStatJob(String crawlDb, Configuration config, boolean sort) throws IOException {
  if (LOG.isInfoEnabled()) {
    LOG.info("CrawlDb statistics start: " + crawlDb);
  }
  Path tmpFolder = new Path(crawlDb, "stat_tmp" + System.currentTimeMillis());
  JobConf job = new NutchJob(config);
  job.setJobName("stats " + crawlDb);
  job.setBoolean("db.reader.stats.sort", sort);
  FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));
  job.setInputFormat(SequenceFileInputFormat.class);
  job.setMapperClass(CrawlDbStatMapper.class);
  job.setCombinerClass(CrawlDbStatCombiner.class);
  job.setReducerClass(CrawlDbStatReducer.class);
  FileOutputFormat.setOutputPath(job, tmpFolder);
  job.setOutputFormat(SequenceFileOutputFormat.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(LongWritable.class);
  // https://issues.apache.org/jira/browse/NUTCH-1029
  job.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);
  JobClient.runJob(job);
  // reading the result
  FileSystem fileSystem = FileSystem.get(config);
  SequenceFile.Reader[] readers = SequenceFileOutputFormat.getReaders(config, tmpFolder);
  Text key = new Text();
  LongWritable value = new LongWritable();
  TreeMap<String, LongWritable> stats = new TreeMap<String, LongWritable>();
  for (int i = 0; i < readers.length; i++) {
    SequenceFile.Reader reader = readers[i];
    while (reader.next(key, value)) {
      String k = key.toString();
      LongWritable val = stats.get(k);
      if (val == null) {
        val = new LongWritable();
        if (k.equals("scx"))
          val.set(Long.MIN_VALUE);
        if (k.equals("scn"))
          val.set(Long.MAX_VALUE);
        stats.put(k, val);
      }
      if (k.equals("scx")) {
        if (val.get() < value.get())
          val.set(value.get());
      } else if (k.equals("scn")) {
        if (val.get() > value.get())
          val.set(value.get());
      } else {
        val.set(val.get() + value.get());
      }
    }
    reader.close();
  }
  if (LOG.isInfoEnabled()) {
    LOG.info("Statistics for CrawlDb: " + crawlDb);
    LongWritable totalCnt = stats.get("T");
    stats.remove("T");
    LOG.info("TOTAL urls:\t" + totalCnt.get());
    for (Map.Entry<String, LongWritable> entry : stats.entrySet()) {
      String k = entry.getKey();
      LongWritable val = entry.getValue();
      if (k.equals("scn")) {
        LOG.info("min score:\t" + (float) (val.get() / 1000.0f));
      } else if (k.equals("scx")) {
        LOG.info("max score:\t" + (float) (val.get() / 1000.0f));
      } else if (k.equals("sct")) {
        LOG.info("avg score:\t" + (float) ((((double) val.get()) / totalCnt.get()) / 1000.0));
      } else if (k.startsWith("status")) {
        String[] st = k.split(" ");
        int code = Integer.parseInt(st[1]);
        if (st.length > 2)
          LOG.info(" " + st[2] + " :\t" + val);
        else
          LOG.info(st[0] + " " + code + " (" + CrawlDatum.getStatusName((byte) code) + "):\t" + val);
      } else
        LOG.info(k + ":\t" + val);
    }
  }
  // removing the tmp folder
  fileSystem.delete(tmpFolder, true);
  if (LOG.isInfoEnabled()) {
    LOG.info("CrawlDb statistics: done");
  }
}
From source file:org.apache.nutch.crawl.Generator.java
License:Apache License
/**
 * Generate fetchlists in one or more segments. Whether to filter URLs or not
 * is read from the crawl.generate.filter property in the configuration files.
 * If the property is not found, the URLs are filtered. Same for the
 * normalisation.
 *
 * @param dbDir
 *          Crawl database directory
 * @param segments
 *          Segments directory
 * @param numLists
 *          Number of reduce tasks
 * @param topN
 *          Number of top URLs to be selected
 * @param curTime
 *          Current time in milliseconds
 *
 * @return Path to generated segment or null if no entries were selected
 *
 * @throws IOException
 *           When an I/O error occurs
 */
public Path[] generate(Path dbDir, Path segments, int numLists, long topN, long curTime, boolean filter,
    boolean norm, boolean force, int maxNumSegments) throws IOException {
  Path tempDir = new Path(
      getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());
  Path lock = new Path(dbDir, CrawlDb.LOCK_NAME);
  FileSystem fs = FileSystem.get(getConf());
  LockUtil.createLockFile(fs, lock, force);
  SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  long start = System.currentTimeMillis();
  LOG.info("Generator: starting at " + sdf.format(start));
  LOG.info("Generator: Selecting best-scoring urls due for fetch.");
  LOG.info("Generator: filtering: " + filter);
  LOG.info("Generator: normalizing: " + norm);
  if (topN != Long.MAX_VALUE) {
    LOG.info("Generator: topN: " + topN);
  }
  if ("true".equals(getConf().get(GENERATE_MAX_PER_HOST_BY_IP))) {
    LOG.info("Generator: GENERATE_MAX_PER_HOST_BY_IP will be ignored, use partition.url.mode instead");
  }
  // map to inverted subset due for fetch, sort by score
  JobConf job = new NutchJob(getConf());
  job.setJobName("generate: select from " + dbDir);
  if (numLists == -1) { // for politeness make
    numLists = job.getNumMapTasks(); // a partition per fetch task
  }
  if ("local".equals(job.get("mapred.job.tracker")) && numLists != 1) {
    // override
    LOG.info("Generator: jobtracker is 'local', generating exactly one partition.");
    numLists = 1;
  }
  job.setLong(GENERATOR_CUR_TIME, curTime);
  // record real generation time
  long generateTime = System.currentTimeMillis();
  job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
  job.setLong(GENERATOR_TOP_N, topN);
  job.setBoolean(GENERATOR_FILTER, filter);
  job.setBoolean(GENERATOR_NORMALISE, norm);
  job.setInt(GENERATOR_MAX_NUM_SEGMENTS, maxNumSegments);
  FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
  job.setInputFormat(SequenceFileInputFormat.class);
  job.setMapperClass(Selector.class);
  job.setPartitionerClass(Selector.class);
  job.setReducerClass(Selector.class);
  FileOutputFormat.setOutputPath(job, tempDir);
  job.setOutputFormat(SequenceFileOutputFormat.class);
  job.setOutputKeyClass(FloatWritable.class);
  job.setOutputKeyComparatorClass(DecreasingFloatComparator.class);
  job.setOutputValueClass(SelectorEntry.class);
  job.setOutputFormat(GeneratorOutputFormat.class);
  try {
    JobClient.runJob(job);
  } catch (IOException e) {
    throw e;
  }
  // read the subdirectories generated in the temp
  // output and turn them into segments
  List<Path> generatedSegments = new ArrayList<Path>();
  FileStatus[] status = fs.listStatus(tempDir);
  try {
    for (FileStatus stat : status) {
      Path subfetchlist = stat.getPath();
      if (!subfetchlist.getName().startsWith("fetchlist-"))
        continue;
      // start a new partition job for this segment
      Path newSeg = partitionSegment(fs, segments, subfetchlist, numLists);
      generatedSegments.add(newSeg);
    }
  } catch (Exception e) {
    LOG.warn("Generator: exception while partitioning segments, exiting ...");
    fs.delete(tempDir, true);
    return null;
  }
  if (generatedSegments.size() == 0) {
    LOG.warn("Generator: 0 records selected for fetching, exiting ...");
    LockUtil.removeLockFile(fs, lock);
    fs.delete(tempDir, true);
    return null;
  }
  if (getConf().getBoolean(GENERATE_UPDATE_CRAWLDB, false)) {
    // update the db from tempDir
    Path tempDir2 = new Path(
        getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());
    job = new NutchJob(getConf());
    job.setJobName("generate: updatedb " + dbDir);
    job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
    for (Path segmpaths : generatedSegments) {
      Path subGenDir = new Path(segmpaths, CrawlDatum.GENERATE_DIR_NAME);
      FileInputFormat.addInputPath(job, subGenDir);
    }
    FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(CrawlDbUpdater.class);
    job.setReducerClass(CrawlDbUpdater.class);
    job.setOutputFormat(MapFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);
    FileOutputFormat.setOutputPath(job, tempDir2);
    try {
      JobClient.runJob(job);
      CrawlDb.install(job, dbDir);
    } catch (IOException e) {
      LockUtil.removeLockFile(fs, lock);
      fs.delete(tempDir, true);
      fs.delete(tempDir2, true);
      throw e;
    }
    fs.delete(tempDir2, true);
  }
  LockUtil.removeLockFile(fs, lock);
  fs.delete(tempDir, true);
  long end = System.currentTimeMillis();
  LOG.info("Generator: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
  Path[] patharray = new Path[generatedSegments.size()];
  return generatedSegments.toArray(patharray);
}
From source file:org.apache.nutch.crawl.LinkDb.java
License:Apache License
private static JobConf createJob(Configuration config, Path linkDb, boolean normalize, boolean filter) {
  Path newLinkDb = new Path("linkdb-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
  JobConf job = new NutchJob(config);
  job.setJobName("linkdb " + linkDb);
  job.setInputFormat(SequenceFileInputFormat.class);
  job.setMapperClass(LinkDb.class);
  job.setCombinerClass(LinkDbMerger.class);
  // if we don't run the mergeJob, perform normalization/filtering now
  if (normalize || filter) {
    try {
      FileSystem fs = FileSystem.get(config);
      if (!fs.exists(linkDb)) {
        job.setBoolean(LinkDbFilter.URL_FILTERING, filter);
        job.setBoolean(LinkDbFilter.URL_NORMALIZING, normalize);
      }
    } catch (Exception e) {
      LOG.warn("LinkDb createJob: " + e);
    }
  }
  job.setReducerClass(LinkDbMerger.class);
  FileOutputFormat.setOutputPath(job, newLinkDb);
  job.setOutputFormat(MapFileOutputFormat.class);
  job.setBoolean("mapred.output.compress", true);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Inlinks.class);
  return job;
}