List of usage examples for org.apache.hadoop.mapred JobConf setBoolean
public void setBoolean(String name, boolean value)
Set the value of the name property to a boolean.
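Before the project examples below, here is a minimal, self-contained sketch of the call itself: setBoolean simply stores a boolean under a property key in the job's configuration, and the value can be read back with getBoolean. The job name in this sketch is illustrative; the property key is borrowed from the LinkDbMerger example further down.

import org.apache.hadoop.mapred.JobConf;

public class SetBooleanSketch {
  public static void main(String[] args) {
    // Create a plain JobConf; in the Nutch examples below a NutchJob wraps the configuration.
    JobConf job = new JobConf();
    job.setJobName("setBoolean-sketch"); // illustrative job name

    // Store a boolean under a property key (key taken from the LinkDbMerger example below).
    job.setBoolean("mapred.output.compress", true);

    // Read it back; the second argument is the default returned if the key is unset.
    boolean compress = job.getBoolean("mapred.output.compress", false);
    System.out.println("mapred.output.compress = " + compress);
  }
}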
From source file:org.apache.nutch.crawl.LinkDbMerger.java
License:Apache License
public static JobConf createMergeJob(Configuration config, Path linkDb, boolean normalize, boolean filter) {
  Path newLinkDb = new Path("linkdb-merge-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

  JobConf job = new NutchJob(config);
  job.setJobName("linkdb merge " + linkDb);

  job.setInputFormat(SequenceFileInputFormat.class);

  job.setMapperClass(LinkDbFilter.class);
  job.setBoolean(LinkDbFilter.URL_NORMALIZING, normalize);
  job.setBoolean(LinkDbFilter.URL_FILTERING, filter);
  job.setReducerClass(LinkDbMerger.class);

  FileOutputFormat.setOutputPath(job, newLinkDb);
  job.setOutputFormat(MapFileOutputFormat.class);
  job.setBoolean("mapred.output.compress", true);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Inlinks.class);

  // https://issues.apache.org/jira/browse/NUTCH-1069
  job.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

  return job;
}
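Flags stored with setBoolean, like the ones above, are typically read back on the task side in a mapper's or reducer's configure(JobConf) via getBoolean. The following sketch shows that pattern with the old mapred API; the class name and the "my.filtering" key are illustrative and not Nutch's actual LinkDbFilter.

import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;

// Hypothetical mapper reading a flag that the driver stored with setBoolean.
public class FlagAwareMapper extends MapReduceBase implements Mapper<Text, Text, Text, Text> {

  private boolean filtering;

  @Override
  public void configure(JobConf job) {
    // getBoolean returns the stored value, or the default (false) if the key is unset
    filtering = job.getBoolean("my.filtering", false);
  }

  @Override
  public void map(Text key, Text value, OutputCollector<Text, Text> output, Reporter reporter)
      throws IOException {
    if (filtering && value.getLength() == 0) {
      return; // drop empty records when filtering is enabled
    }
    output.collect(key, value);
  }
}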
From source file:org.apache.nutch.fetcher.Fetcher2.java
License:Apache License
public void fetch(Path segment, int threads, boolean parsing) throws IOException {

  if (LOG.isInfoEnabled()) {
    LOG.info("Fetcher: starting");
    LOG.info("Fetcher: segment: " + segment);
  }

  JobConf job = new NutchJob(getConf());
  job.setJobName("fetch " + segment);

  job.setInt("fetcher.threads.fetch", threads);
  job.set(Nutch.SEGMENT_NAME_KEY, segment.getName());
  job.setBoolean("fetcher.parse", parsing);

  // for politeness, don't permit parallel execution of a single task
  job.setSpeculativeExecution(false);

  job.setInputPath(new Path(segment, CrawlDatum.GENERATE_DIR_NAME));
  job.setInputFormat(InputFormat.class);

  job.setMapRunnerClass(Fetcher2.class);

  job.setOutputPath(segment);
  job.setOutputFormat(FetcherOutputFormat.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(FetcherOutput.class);

  JobClient.runJob(job);

  if (LOG.isInfoEnabled()) {
    LOG.info("Fetcher: done");
  }
}
From source file:org.apache.nutch.indexer.solr.SolrClean.java
License:Apache License
public void delete(String crawldb, String solrUrl, boolean noCommit) throws IOException {
  SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  long start = System.currentTimeMillis();
  LOG.info("SolrClean: starting at " + sdf.format(start));

  JobConf job = new NutchJob(getConf());

  FileInputFormat.addInputPath(job, new Path(crawldb, CrawlDb.CURRENT_NAME));
  job.setBoolean("noCommit", noCommit);
  job.set(SolrConstants.SERVER_URL, solrUrl);
  job.setInputFormat(SequenceFileInputFormat.class);
  job.setOutputFormat(NullOutputFormat.class);
  job.setMapOutputKeyClass(ByteWritable.class);
  job.setMapOutputValueClass(Text.class);
  job.setMapperClass(DBFilter.class);
  job.setReducerClass(SolrDeleter.class);

  JobClient.runJob(job);

  long end = System.currentTimeMillis();
  LOG.info("SolrClean: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}
From source file:org.apache.nutch.indexer.solr.SolrIndexer.java
License:Apache License
public void indexSolr(String solrUrl, Path crawlDb, Path linkDb, List<Path> segments, boolean noCommit,
    boolean deleteGone, String solrParams, boolean filter, boolean normalize) throws IOException {
  SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  long start = System.currentTimeMillis();
  LOG.info("SolrIndexer: starting at " + sdf.format(start));

  final JobConf job = new NutchJob(getConf());
  job.setJobName("index-solr " + solrUrl);

  LOG.info("SolrIndexer: deleting gone documents: " + deleteGone);
  LOG.info("SolrIndexer: URL filtering: " + filter);
  LOG.info("SolrIndexer: URL normalizing: " + normalize);

  IndexerMapReduce.initMRJob(crawlDb, linkDb, segments, job);

  job.set(SolrConstants.SERVER_URL, solrUrl);
  job.setBoolean(IndexerMapReduce.INDEXER_DELETE, deleteGone);
  job.setBoolean(IndexerMapReduce.URL_FILTERING, filter);
  job.setBoolean(IndexerMapReduce.URL_NORMALIZING, normalize);

  if (solrParams != null) {
    job.set(SolrConstants.PARAMS, solrParams);
  }

  NutchIndexWriterFactory.addClassToConf(job, SolrWriter.class);

  job.setReduceSpeculativeExecution(false);

  final Path tmp = new Path("tmp_" + System.currentTimeMillis() + "-" + new Random().nextInt());

  FileOutputFormat.setOutputPath(job, tmp);
  try {
    JobClient.runJob(job);

    // do the commits once and for all the reducers in one go
    SolrServer solr = SolrUtils.getCommonsHttpSolrServer(job);

    if (!noCommit) {
      solr.commit();
    }
    long end = System.currentTimeMillis();
    LOG.info("SolrIndexer: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
  } catch (Exception e) {
    LOG.error(e.toString());
  } finally {
    FileSystem.get(job).delete(tmp, true);
  }
}
From source file:org.apache.nutch.scoring.webgraph.LinkRank.java
License:Apache License
/**
 * Runs the counter job. The counter job determines the number of links in the
 * webgraph. This is used during analysis.
 *
 * @param fs The job file system.
 * @param webGraphDb The web graph database to use.
 *
 * @return The number of nodes in the web graph.
 * @throws IOException If an error occurs while running the counter job.
 */
private int runCounter(FileSystem fs, Path webGraphDb) throws IOException {

  // configure the counter job
  Path numLinksPath = new Path(webGraphDb, NUM_NODES);
  Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR);
  JobConf counter = new NutchJob(getConf());
  counter.setJobName("LinkRank Counter");
  FileInputFormat.addInputPath(counter, nodeDb);
  FileOutputFormat.setOutputPath(counter, numLinksPath);
  counter.setInputFormat(SequenceFileInputFormat.class);
  counter.setMapperClass(Counter.class);
  counter.setCombinerClass(Counter.class);
  counter.setReducerClass(Counter.class);
  counter.setMapOutputKeyClass(Text.class);
  counter.setMapOutputValueClass(LongWritable.class);
  counter.setOutputKeyClass(Text.class);
  counter.setOutputValueClass(LongWritable.class);
  counter.setNumReduceTasks(1);
  counter.setOutputFormat(TextOutputFormat.class);
  counter.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

  // run the counter job, outputs to a single reduce task and file
  LOG.info("Starting link counter job");
  try {
    JobClient.runJob(counter);
  } catch (IOException e) {
    LOG.error(StringUtils.stringifyException(e));
    throw e;
  }
  LOG.info("Finished link counter job");

  // read the first (and only) line from the file which should be the
  // number of links in the web graph
  LOG.info("Reading numlinks temp file");
  FSDataInputStream readLinks = fs.open(new Path(numLinksPath, "part-00000"));
  BufferedReader buffer = new BufferedReader(new InputStreamReader(readLinks));
  String numLinksLine = buffer.readLine();
  readLinks.close();

  // check if there are links to process, if none, webgraph might be empty
  if (numLinksLine == null || numLinksLine.length() == 0) {
    fs.delete(numLinksPath, true);
    throw new IOException("No links to process, is the webgraph empty?");
  }

  // delete temp file and convert and return the number of links as an int
  LOG.info("Deleting numlinks temp file");
  fs.delete(numLinksPath, true);
  String numLinks = numLinksLine.split("\\s+")[1];
  return Integer.parseInt(numLinks);
}
From source file:org.apache.nutch.scoring.webgraph.LinkRank.java
License:Apache License
/**
 * Runs the initializer job. The initializer job sets up the nodes with a
 * default starting score for link analysis.
 *
 * @param nodeDb The node database to use.
 * @param output The job output directory.
 *
 * @throws IOException If an error occurs while running the initializer job.
 */
private void runInitializer(Path nodeDb, Path output) throws IOException {

  // configure the initializer
  JobConf initializer = new NutchJob(getConf());
  initializer.setJobName("LinkAnalysis Initializer");
  FileInputFormat.addInputPath(initializer, nodeDb);
  FileOutputFormat.setOutputPath(initializer, output);
  initializer.setInputFormat(SequenceFileInputFormat.class);
  initializer.setMapperClass(Initializer.class);
  initializer.setMapOutputKeyClass(Text.class);
  initializer.setMapOutputValueClass(Node.class);
  initializer.setOutputKeyClass(Text.class);
  initializer.setOutputValueClass(Node.class);
  initializer.setOutputFormat(MapFileOutputFormat.class);
  initializer.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

  // run the initializer
  LOG.info("Starting initialization job");
  try {
    JobClient.runJob(initializer);
  } catch (IOException e) {
    LOG.error(StringUtils.stringifyException(e));
    throw e;
  }
  LOG.info("Finished initialization job.");
}
From source file:org.apache.nutch.scoring.webgraph.LinkRank.java
License:Apache License
/**
 * Runs the inverter job. The inverter job flips outlinks to inlinks to be
 * passed into the analysis job.
 *
 * The inverter job takes a link loops database if it exists. It is an
 * optional component of link analysis due to its extreme computational and
 * space requirements, but it can be very useful in weeding out and eliminating
 * link farms and other spam pages.
 *
 * @param nodeDb The node database to use.
 * @param outlinkDb The outlink database to use.
 * @param loopDb The loop database to use if it exists.
 * @param output The output directory.
 *
 * @throws IOException If an error occurs while running the inverter job.
 */
private void runInverter(Path nodeDb, Path outlinkDb, Path loopDb, Path output) throws IOException {

  // configure the inverter
  JobConf inverter = new NutchJob(getConf());
  inverter.setJobName("LinkAnalysis Inverter");
  FileInputFormat.addInputPath(inverter, nodeDb);
  FileInputFormat.addInputPath(inverter, outlinkDb);

  // add the loop database if it exists, isn't null
  if (loopDb != null) {
    FileInputFormat.addInputPath(inverter, loopDb);
  }
  FileOutputFormat.setOutputPath(inverter, output);
  inverter.setInputFormat(SequenceFileInputFormat.class);
  inverter.setMapperClass(Inverter.class);
  inverter.setReducerClass(Inverter.class);
  inverter.setMapOutputKeyClass(Text.class);
  inverter.setMapOutputValueClass(ObjectWritable.class);
  inverter.setOutputKeyClass(Text.class);
  inverter.setOutputValueClass(LinkDatum.class);
  inverter.setOutputFormat(SequenceFileOutputFormat.class);
  inverter.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

  // run the inverter job
  LOG.info("Starting inverter job");
  try {
    JobClient.runJob(inverter);
  } catch (IOException e) {
    LOG.error(StringUtils.stringifyException(e));
    throw e;
  }
  LOG.info("Finished inverter job.");
}
From source file:org.apache.nutch.scoring.webgraph.LinkRank.java
License:Apache License
/**
 * Runs the link analysis job. The link analysis job applies the link rank
 * formula to create a score per url and stores that score in the NodeDb.
 *
 * Typically the link analysis job is run a number of times to allow the link
 * rank scores to converge.
 *
 * @param nodeDb The node database from which we are getting previous link
 * rank scores.
 * @param inverted The inverted inlinks
 * @param output The link analysis output.
 * @param iteration The current iteration number.
 * @param numIterations The total number of link analysis iterations
 *
 * @throws IOException If an error occurs during link analysis.
 */
private void runAnalysis(Path nodeDb, Path inverted, Path output, int iteration, int numIterations,
    float rankOne) throws IOException {

  JobConf analyzer = new NutchJob(getConf());
  analyzer.set("link.analyze.iteration", String.valueOf(iteration + 1));
  analyzer.setJobName("LinkAnalysis Analyzer, iteration " + (iteration + 1) + " of " + numIterations);
  FileInputFormat.addInputPath(analyzer, nodeDb);
  FileInputFormat.addInputPath(analyzer, inverted);
  FileOutputFormat.setOutputPath(analyzer, output);
  analyzer.set("link.analyze.rank.one", String.valueOf(rankOne));
  analyzer.setMapOutputKeyClass(Text.class);
  analyzer.setMapOutputValueClass(ObjectWritable.class);
  analyzer.setInputFormat(SequenceFileInputFormat.class);
  analyzer.setMapperClass(Analyzer.class);
  analyzer.setReducerClass(Analyzer.class);
  analyzer.setOutputKeyClass(Text.class);
  analyzer.setOutputValueClass(Node.class);
  analyzer.setOutputFormat(MapFileOutputFormat.class);
  analyzer.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

  LOG.info("Starting analysis job");
  try {
    JobClient.runJob(analyzer);
  } catch (IOException e) {
    LOG.error(StringUtils.stringifyException(e));
    throw e;
  }
  LOG.info("Finished analysis job.");
}
From source file:org.apache.nutch.scoring.webgraph.Loops.java
License:Apache License
/**
 * Runs the various loop jobs.
 */
public void findLoops(Path webGraphDb) throws IOException {

  SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  long start = System.currentTimeMillis();
  if (LOG.isInfoEnabled()) {
    LOG.info("Loops: starting at " + sdf.format(start));
    LOG.info("Loops: webgraphdb: " + webGraphDb);
  }

  Configuration conf = getConf();
  FileSystem fs = FileSystem.get(conf);
  Path outlinkDb = new Path(webGraphDb, WebGraph.OUTLINK_DIR);
  Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR);
  Path routes = new Path(webGraphDb, ROUTES_DIR);
  Path tempRoute = new Path(webGraphDb,
      ROUTES_DIR + "-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

  // run the initializer
  JobConf init = new NutchJob(conf);
  init.setJobName("Initializer: " + webGraphDb);
  FileInputFormat.addInputPath(init, outlinkDb);
  FileInputFormat.addInputPath(init, nodeDb);
  init.setInputFormat(SequenceFileInputFormat.class);
  init.setMapperClass(Initializer.class);
  init.setReducerClass(Initializer.class);
  init.setMapOutputKeyClass(Text.class);
  init.setMapOutputValueClass(ObjectWritable.class);
  init.setOutputKeyClass(Text.class);
  init.setOutputValueClass(Route.class);
  FileOutputFormat.setOutputPath(init, tempRoute);
  init.setOutputFormat(SequenceFileOutputFormat.class);

  try {
    LOG.info("Loops: starting initializer");
    JobClient.runJob(init);
    LOG.info("Loops: installing initializer " + routes);
    FSUtils.replace(fs, routes, tempRoute, true);
    LOG.info("Loops: finished initializer");
  } catch (IOException e) {
    LOG.error(StringUtils.stringifyException(e));
    throw e;
  }

  // run the loops job for a maxdepth, default 2, which will find a 3 link
  // loop cycle
  int depth = conf.getInt("link.loops.depth", 2);
  for (int i = 0; i < depth; i++) {

    JobConf looper = new NutchJob(conf);
    looper.setJobName("Looper: " + (i + 1) + " of " + depth);
    FileInputFormat.addInputPath(looper, outlinkDb);
    FileInputFormat.addInputPath(looper, routes);
    looper.setInputFormat(SequenceFileInputFormat.class);
    looper.setMapperClass(Looper.class);
    looper.setReducerClass(Looper.class);
    looper.setMapOutputKeyClass(Text.class);
    looper.setMapOutputValueClass(ObjectWritable.class);
    looper.setOutputKeyClass(Text.class);
    looper.setOutputValueClass(Route.class);
    FileOutputFormat.setOutputPath(looper, tempRoute);
    looper.setOutputFormat(SequenceFileOutputFormat.class);
    looper.setBoolean("last", i == (depth - 1));

    try {
      LOG.info("Loops: starting looper");
      JobClient.runJob(looper);
      LOG.info("Loops: installing looper " + routes);
      FSUtils.replace(fs, routes, tempRoute, true);
      LOG.info("Loops: finished looper");
    } catch (IOException e) {
      LOG.error(StringUtils.stringifyException(e));
      throw e;
    }
  }

  // run the finalizer
  JobConf finalizer = new NutchJob(conf);
  finalizer.setJobName("Finalizer: " + webGraphDb);
  FileInputFormat.addInputPath(finalizer, routes);
  finalizer.setInputFormat(SequenceFileInputFormat.class);
  finalizer.setMapperClass(Finalizer.class);
  finalizer.setReducerClass(Finalizer.class);
  finalizer.setMapOutputKeyClass(Text.class);
  finalizer.setMapOutputValueClass(Route.class);
  finalizer.setOutputKeyClass(Text.class);
  finalizer.setOutputValueClass(LoopSet.class);
  FileOutputFormat.setOutputPath(finalizer, new Path(webGraphDb, LOOPS_DIR));
  finalizer.setOutputFormat(MapFileOutputFormat.class);

  try {
    LOG.info("Loops: starting finalizer");
    JobClient.runJob(finalizer);
    LOG.info("Loops: finished finalizer");
  } catch (IOException e) {
    LOG.error(StringUtils.stringifyException(e));
    throw e;
  }

  long end = System.currentTimeMillis();
  LOG.info("Loops: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}
From source file:org.apache.nutch.scoring.webgraph.NodeDumper.java
License:Apache License
/**
 * Runs the process to dump the top urls out to a text file.
 *
 * @param webGraphDb The WebGraph from which to pull values.
 *
 * @param topN
 * @param output
 *
 * @throws IOException If an error occurs while dumping the top values.
 */
public void dumpNodes(Path webGraphDb, DumpType type, long topN, Path output, boolean asEff,
    NameType nameType, AggrType aggrType, boolean asSequenceFile) throws Exception {

  SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  long start = System.currentTimeMillis();
  LOG.info("NodeDumper: starting at " + sdf.format(start));
  Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR);
  Configuration conf = getConf();

  JobConf dumper = new NutchJob(conf);
  dumper.setJobName("NodeDumper: " + webGraphDb);
  FileInputFormat.addInputPath(dumper, nodeDb);
  dumper.setInputFormat(SequenceFileInputFormat.class);

  if (nameType == null) {
    dumper.setMapperClass(Sorter.class);
    dumper.setReducerClass(Sorter.class);
    dumper.setMapOutputKeyClass(FloatWritable.class);
    dumper.setMapOutputValueClass(Text.class);
  } else {
    dumper.setMapperClass(Dumper.class);
    dumper.setReducerClass(Dumper.class);
    dumper.setMapOutputKeyClass(Text.class);
    dumper.setMapOutputValueClass(FloatWritable.class);
  }

  dumper.setOutputKeyClass(Text.class);
  dumper.setOutputValueClass(FloatWritable.class);
  FileOutputFormat.setOutputPath(dumper, output);

  if (asSequenceFile) {
    dumper.setOutputFormat(SequenceFileOutputFormat.class);
  } else {
    dumper.setOutputFormat(TextOutputFormat.class);
  }

  dumper.setNumReduceTasks(1);
  dumper.setBoolean("inlinks", type == DumpType.INLINKS);
  dumper.setBoolean("outlinks", type == DumpType.OUTLINKS);
  dumper.setBoolean("scores", type == DumpType.SCORES);

  dumper.setBoolean("host", nameType == NameType.HOST);
  dumper.setBoolean("domain", nameType == NameType.DOMAIN);
  dumper.setBoolean("sum", aggrType == AggrType.SUM);
  dumper.setBoolean("max", aggrType == AggrType.MAX);

  dumper.setLong("topn", topN);

  // Set equals-sign as separator for Solr's ExternalFileField
  if (asEff) {
    dumper.set("mapred.textoutputformat.separator", "=");
  }

  try {
    LOG.info("NodeDumper: running");
    JobClient.runJob(dumper);
  } catch (IOException e) {
    LOG.error(StringUtils.stringifyException(e));
    throw e;
  }
  long end = System.currentTimeMillis();
  LOG.info("NodeDumper: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}