List of usage examples for org.apache.hadoop.mapred JobConf setMapOutputValueClass
public void setMapOutputValueClass(Class<?> theClass)
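Before the extracted examples, here is a minimal sketch of where this setter fits in a classic mapred job setup. It is not drawn from any of the sources below; MyMapper and MyReducer are hypothetical placeholder classes. setMapOutputValueClass is needed when the mapper's intermediate value type differs from the job's final output value type; if it is never called, the map output value class defaults to the value class set via setOutputValueClass.

// Minimal sketch (hypothetical MyMapper/MyReducer), assuming the mapper emits
// <Text, IntWritable> while the reducer writes <Text, LongWritable>.
JobConf job = new JobConf(new Configuration());
job.setJobName("map-output-value-class example");

job.setMapperClass(MyMapper.class);    // emits <Text, IntWritable>
job.setReducerClass(MyReducer.class);  // writes <Text, LongWritable>

// Intermediate key/value types produced by the mapper
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);

// Final key/value types written by the reducer
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(LongWritable.class);

JobClient.runJob(job);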
From source file:org.apache.nutch.indexer.field.FieldIndexer.java
License:Apache License
public void index(Path[] fields, Path indexDir) throws IOException {
  SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  long start = System.currentTimeMillis();
  LOG.info("FieldIndexer: starting at " + sdf.format(start));

  JobConf job = new NutchJob(getConf());
  job.setJobName("FieldIndexer: " + indexDir);

  for (int i = 0; i < fields.length; i++) {
    Path fieldsDb = fields[i];
    LOG.info("FieldIndexer: adding fields db: " + fieldsDb);
    FileInputFormat.addInputPath(job, fieldsDb);
  }

  job.setInputFormat(SequenceFileInputFormat.class);
  job.setMapperClass(FieldIndexer.class);
  job.setReducerClass(FieldIndexer.class);
  FileOutputFormat.setOutputPath(job, indexDir);
  job.setOutputFormat(OutputFormat.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(FieldWritable.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(LuceneDocumentWrapper.class);

  JobClient.runJob(job);

  long end = System.currentTimeMillis();
  LOG.info("FieldIndexer: finished at " + sdf.format(end) + ", elapsed: "
      + TimingUtil.elapsedTime(start, end));
}
From source file:org.apache.nutch.indexer.IndexerMapReduce.java
License:Apache License
public static void initMRJob(Path crawlDb, Path linkDb, Collection<Path> segments, JobConf job) {
  LOG.info("IndexerMapReduce: crawldb: " + crawlDb);
  if (linkDb != null)
    LOG.info("IndexerMapReduce: linkdb: " + linkDb);

  for (final Path segment : segments) {
    LOG.info("IndexerMapReduces: adding segment: " + segment);
    FileInputFormat.addInputPath(job, new Path(segment, CrawlDatum.FETCH_DIR_NAME));
    FileInputFormat.addInputPath(job, new Path(segment, CrawlDatum.PARSE_DIR_NAME));
    FileInputFormat.addInputPath(job, new Path(segment, ParseData.DIR_NAME));
    FileInputFormat.addInputPath(job, new Path(segment, ParseText.DIR_NAME));
  }

  FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));
  if (linkDb != null)
    FileInputFormat.addInputPath(job, new Path(linkDb, LinkDb.CURRENT_NAME));

  job.setInputFormat(SequenceFileInputFormat.class);
  job.setMapperClass(IndexerMapReduce.class);
  job.setReducerClass(IndexerMapReduce.class);

  job.setOutputFormat(IndexerOutputFormat.class);
  job.setOutputKeyClass(Text.class);
  job.setMapOutputValueClass(NutchWritable.class);
  job.setOutputValueClass(NutchWritable.class);
}
From source file:org.apache.nutch.indexer.solr.SolrClean.java
License:Apache License
public void delete(String crawldb, String solrUrl, boolean noCommit) throws IOException {
  SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  long start = System.currentTimeMillis();
  LOG.info("SolrClean: starting at " + sdf.format(start));

  JobConf job = new NutchJob(getConf());

  FileInputFormat.addInputPath(job, new Path(crawldb, CrawlDb.CURRENT_NAME));
  job.setBoolean("noCommit", noCommit);
  job.set(SolrConstants.SERVER_URL, solrUrl);
  job.setInputFormat(SequenceFileInputFormat.class);
  job.setOutputFormat(NullOutputFormat.class);
  job.setMapOutputKeyClass(ByteWritable.class);
  job.setMapOutputValueClass(Text.class);
  job.setMapperClass(DBFilter.class);
  job.setReducerClass(SolrDeleter.class);

  JobClient.runJob(job);

  long end = System.currentTimeMillis();
  LOG.info("SolrClean: finished at " + sdf.format(end) + ", elapsed: "
      + TimingUtil.elapsedTime(start, end));
}
From source file:org.apache.nutch.scoring.webgraph.LinkDumper.java
License:Apache License
/**
 * Runs the inverter and merger jobs of the LinkDumper tool to create the
 * url to inlink node database.
 */
public void dumpLinks(Path webGraphDb) throws IOException {
  SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  long start = System.currentTimeMillis();
  LOG.info("NodeDumper: starting at " + sdf.format(start));

  Configuration conf = getConf();
  FileSystem fs = FileSystem.get(conf);
  Path linkdump = new Path(webGraphDb, DUMP_DIR);
  Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR);
  Path loopSetDb = new Path(webGraphDb, Loops.LOOPS_DIR);
  boolean loopsExists = fs.exists(loopSetDb);
  Path outlinkDb = new Path(webGraphDb, WebGraph.OUTLINK_DIR);

  // run the inverter job
  Path tempInverted = new Path(webGraphDb,
      "inverted-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
  JobConf inverter = new NutchJob(conf);
  inverter.setJobName("LinkDumper: inverter");
  FileInputFormat.addInputPath(inverter, nodeDb);
  if (loopsExists) {
    FileInputFormat.addInputPath(inverter, loopSetDb);
  }
  FileInputFormat.addInputPath(inverter, outlinkDb);
  inverter.setInputFormat(SequenceFileInputFormat.class);
  inverter.setMapperClass(Inverter.class);
  inverter.setReducerClass(Inverter.class);
  inverter.setMapOutputKeyClass(Text.class);
  inverter.setMapOutputValueClass(ObjectWritable.class);
  inverter.setOutputKeyClass(Text.class);
  inverter.setOutputValueClass(LinkNode.class);
  FileOutputFormat.setOutputPath(inverter, tempInverted);
  inverter.setOutputFormat(SequenceFileOutputFormat.class);

  try {
    LOG.info("LinkDumper: running inverter");
    JobClient.runJob(inverter);
    LOG.info("LinkDumper: finished inverter");
  } catch (IOException e) {
    LOG.error(StringUtils.stringifyException(e));
    throw e;
  }

  // run the merger job
  JobConf merger = new NutchJob(conf);
  merger.setJobName("LinkDumper: merger");
  FileInputFormat.addInputPath(merger, tempInverted);
  merger.setInputFormat(SequenceFileInputFormat.class);
  merger.setReducerClass(Merger.class);
  merger.setMapOutputKeyClass(Text.class);
  merger.setMapOutputValueClass(LinkNode.class);
  merger.setOutputKeyClass(Text.class);
  merger.setOutputValueClass(LinkNodes.class);
  FileOutputFormat.setOutputPath(merger, linkdump);
  merger.setOutputFormat(MapFileOutputFormat.class);

  try {
    LOG.info("LinkDumper: running merger");
    JobClient.runJob(merger);
    LOG.info("LinkDumper: finished merger");
  } catch (IOException e) {
    LOG.error(StringUtils.stringifyException(e));
    throw e;
  }

  fs.delete(tempInverted, true);

  long end = System.currentTimeMillis();
  LOG.info("LinkDumper: finished at " + sdf.format(end) + ", elapsed: "
      + TimingUtil.elapsedTime(start, end));
}
From source file:org.apache.nutch.scoring.webgraph.LinkRank.java
License:Apache License
/**
 * Runs the counter job. The counter job determines the number of links in the
 * webgraph. This is used during analysis.
 *
 * @param fs The job file system.
 * @param webGraphDb The web graph database to use.
 *
 * @return The number of nodes in the web graph.
 * @throws IOException If an error occurs while running the counter job.
 */
private int runCounter(FileSystem fs, Path webGraphDb) throws IOException {
  // configure the counter job
  Path numLinksPath = new Path(webGraphDb, NUM_NODES);
  Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR);
  JobConf counter = new NutchJob(getConf());
  counter.setJobName("LinkRank Counter");
  FileInputFormat.addInputPath(counter, nodeDb);
  FileOutputFormat.setOutputPath(counter, numLinksPath);
  counter.setInputFormat(SequenceFileInputFormat.class);
  counter.setMapperClass(Counter.class);
  counter.setCombinerClass(Counter.class);
  counter.setReducerClass(Counter.class);
  counter.setMapOutputKeyClass(Text.class);
  counter.setMapOutputValueClass(LongWritable.class);
  counter.setOutputKeyClass(Text.class);
  counter.setOutputValueClass(LongWritable.class);
  counter.setNumReduceTasks(1);
  counter.setOutputFormat(TextOutputFormat.class);
  counter.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

  // run the counter job, outputs to a single reduce task and file
  LOG.info("Starting link counter job");
  try {
    JobClient.runJob(counter);
  } catch (IOException e) {
    LOG.error(StringUtils.stringifyException(e));
    throw e;
  }
  LOG.info("Finished link counter job");

  // read the first (and only) line from the file which should be the
  // number of links in the web graph
  LOG.info("Reading numlinks temp file");
  FSDataInputStream readLinks = fs.open(new Path(numLinksPath, "part-00000"));
  BufferedReader buffer = new BufferedReader(new InputStreamReader(readLinks));
  String numLinksLine = buffer.readLine();
  readLinks.close();

  // check if there are links to process, if none, webgraph might be empty
  if (numLinksLine == null || numLinksLine.length() == 0) {
    fs.delete(numLinksPath, true);
    throw new IOException("No links to process, is the webgraph empty?");
  }

  // delete temp file and convert and return the number of links as an int
  LOG.info("Deleting numlinks temp file");
  fs.delete(numLinksPath, true);
  String numLinks = numLinksLine.split("\\s+")[1];
  return Integer.parseInt(numLinks);
}
From source file:org.apache.nutch.scoring.webgraph.LinkRank.java
License:Apache License
/**
 * Runs the initializer job. The initializer job sets up the nodes with a
 * default starting score for link analysis.
 *
 * @param nodeDb The node database to use.
 * @param output The job output directory.
 *
 * @throws IOException If an error occurs while running the initializer job.
 */
private void runInitializer(Path nodeDb, Path output) throws IOException {
  // configure the initializer
  JobConf initializer = new NutchJob(getConf());
  initializer.setJobName("LinkAnalysis Initializer");
  FileInputFormat.addInputPath(initializer, nodeDb);
  FileOutputFormat.setOutputPath(initializer, output);
  initializer.setInputFormat(SequenceFileInputFormat.class);
  initializer.setMapperClass(Initializer.class);
  initializer.setMapOutputKeyClass(Text.class);
  initializer.setMapOutputValueClass(Node.class);
  initializer.setOutputKeyClass(Text.class);
  initializer.setOutputValueClass(Node.class);
  initializer.setOutputFormat(MapFileOutputFormat.class);
  initializer.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

  // run the initializer
  LOG.info("Starting initialization job");
  try {
    JobClient.runJob(initializer);
  } catch (IOException e) {
    LOG.error(StringUtils.stringifyException(e));
    throw e;
  }
  LOG.info("Finished initialization job.");
}
From source file:org.apache.nutch.scoring.webgraph.LinkRank.java
License:Apache License
/**
 * Runs the inverter job. The inverter job flips outlinks to inlinks to be
 * passed into the analysis job.
 *
 * The inverter job takes a link loops database if it exists. It is an
 * optional component of link analysis due to its extreme computational and
 * space requirements, but it can be very useful in weeding out and eliminating
 * link farms and other spam pages.
 *
 * @param nodeDb The node database to use.
 * @param outlinkDb The outlink database to use.
 * @param loopDb The loop database to use if it exists.
 * @param output The output directory.
 *
 * @throws IOException If an error occurs while running the inverter job.
 */
private void runInverter(Path nodeDb, Path outlinkDb, Path loopDb, Path output) throws IOException {
  // configure the inverter
  JobConf inverter = new NutchJob(getConf());
  inverter.setJobName("LinkAnalysis Inverter");
  FileInputFormat.addInputPath(inverter, nodeDb);
  FileInputFormat.addInputPath(inverter, outlinkDb);

  // add the loop database if it exists, isn't null
  if (loopDb != null) {
    FileInputFormat.addInputPath(inverter, loopDb);
  }
  FileOutputFormat.setOutputPath(inverter, output);
  inverter.setInputFormat(SequenceFileInputFormat.class);
  inverter.setMapperClass(Inverter.class);
  inverter.setReducerClass(Inverter.class);
  inverter.setMapOutputKeyClass(Text.class);
  inverter.setMapOutputValueClass(ObjectWritable.class);
  inverter.setOutputKeyClass(Text.class);
  inverter.setOutputValueClass(LinkDatum.class);
  inverter.setOutputFormat(SequenceFileOutputFormat.class);
  inverter.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

  // run the inverter job
  LOG.info("Starting inverter job");
  try {
    JobClient.runJob(inverter);
  } catch (IOException e) {
    LOG.error(StringUtils.stringifyException(e));
    throw e;
  }
  LOG.info("Finished inverter job.");
}
From source file:org.apache.nutch.scoring.webgraph.LinkRank.java
License:Apache License
/**
 * Runs the link analysis job. The link analysis job applies the link rank
 * formula to create a score per url and stores that score in the NodeDb.
 *
 * Typically the link analysis job is run a number of times to allow the link
 * rank scores to converge.
 *
 * @param nodeDb The node database from which we are getting previous link
 *          rank scores.
 * @param inverted The inverted inlinks
 * @param output The link analysis output.
 * @param iteration The current iteration number.
 * @param numIterations The total number of link analysis iterations
 *
 * @throws IOException If an error occurs during link analysis.
 */
private void runAnalysis(Path nodeDb, Path inverted, Path output, int iteration,
    int numIterations, float rankOne) throws IOException {
  JobConf analyzer = new NutchJob(getConf());
  analyzer.set("link.analyze.iteration", String.valueOf(iteration + 1));
  analyzer.setJobName("LinkAnalysis Analyzer, iteration " + (iteration + 1) + " of " + numIterations);
  FileInputFormat.addInputPath(analyzer, nodeDb);
  FileInputFormat.addInputPath(analyzer, inverted);
  FileOutputFormat.setOutputPath(analyzer, output);
  analyzer.set("link.analyze.rank.one", String.valueOf(rankOne));
  analyzer.setMapOutputKeyClass(Text.class);
  analyzer.setMapOutputValueClass(ObjectWritable.class);
  analyzer.setInputFormat(SequenceFileInputFormat.class);
  analyzer.setMapperClass(Analyzer.class);
  analyzer.setReducerClass(Analyzer.class);
  analyzer.setOutputKeyClass(Text.class);
  analyzer.setOutputValueClass(Node.class);
  analyzer.setOutputFormat(MapFileOutputFormat.class);
  analyzer.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

  LOG.info("Starting analysis job");
  try {
    JobClient.runJob(analyzer);
  } catch (IOException e) {
    LOG.error(StringUtils.stringifyException(e));
    throw e;
  }
  LOG.info("Finished analysis job.");
}
From source file:org.apache.nutch.scoring.webgraph.Loops.java
License:Apache License
/**
 * Runs the various loop jobs.
 */
public void findLoops(Path webGraphDb) throws IOException {
  SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  long start = System.currentTimeMillis();
  if (LOG.isInfoEnabled()) {
    LOG.info("Loops: starting at " + sdf.format(start));
    LOG.info("Loops: webgraphdb: " + webGraphDb);
  }

  Configuration conf = getConf();
  FileSystem fs = FileSystem.get(conf);
  Path outlinkDb = new Path(webGraphDb, WebGraph.OUTLINK_DIR);
  Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR);
  Path routes = new Path(webGraphDb, ROUTES_DIR);
  Path tempRoute = new Path(webGraphDb,
      ROUTES_DIR + "-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

  // run the initializer
  JobConf init = new NutchJob(conf);
  init.setJobName("Initializer: " + webGraphDb);
  FileInputFormat.addInputPath(init, outlinkDb);
  FileInputFormat.addInputPath(init, nodeDb);
  init.setInputFormat(SequenceFileInputFormat.class);
  init.setMapperClass(Initializer.class);
  init.setReducerClass(Initializer.class);
  init.setMapOutputKeyClass(Text.class);
  init.setMapOutputValueClass(ObjectWritable.class);
  init.setOutputKeyClass(Text.class);
  init.setOutputValueClass(Route.class);
  FileOutputFormat.setOutputPath(init, tempRoute);
  init.setOutputFormat(SequenceFileOutputFormat.class);

  try {
    LOG.info("Loops: starting initializer");
    JobClient.runJob(init);
    LOG.info("Loops: installing initializer " + routes);
    FSUtils.replace(fs, routes, tempRoute, true);
    LOG.info("Loops: finished initializer");
  } catch (IOException e) {
    LOG.error(StringUtils.stringifyException(e));
    throw e;
  }

  // run the loops job for a maxdepth, default 2, which will find a 3 link
  // loop cycle
  int depth = conf.getInt("link.loops.depth", 2);
  for (int i = 0; i < depth; i++) {
    JobConf looper = new NutchJob(conf);
    looper.setJobName("Looper: " + (i + 1) + " of " + depth);
    FileInputFormat.addInputPath(looper, outlinkDb);
    FileInputFormat.addInputPath(looper, routes);
    looper.setInputFormat(SequenceFileInputFormat.class);
    looper.setMapperClass(Looper.class);
    looper.setReducerClass(Looper.class);
    looper.setMapOutputKeyClass(Text.class);
    looper.setMapOutputValueClass(ObjectWritable.class);
    looper.setOutputKeyClass(Text.class);
    looper.setOutputValueClass(Route.class);
    FileOutputFormat.setOutputPath(looper, tempRoute);
    looper.setOutputFormat(SequenceFileOutputFormat.class);
    looper.setBoolean("last", i == (depth - 1));

    try {
      LOG.info("Loops: starting looper");
      JobClient.runJob(looper);
      LOG.info("Loops: installing looper " + routes);
      FSUtils.replace(fs, routes, tempRoute, true);
      LOG.info("Loops: finished looper");
    } catch (IOException e) {
      LOG.error(StringUtils.stringifyException(e));
      throw e;
    }
  }

  // run the finalizer
  JobConf finalizer = new NutchJob(conf);
  finalizer.setJobName("Finalizer: " + webGraphDb);
  FileInputFormat.addInputPath(finalizer, routes);
  finalizer.setInputFormat(SequenceFileInputFormat.class);
  finalizer.setMapperClass(Finalizer.class);
  finalizer.setReducerClass(Finalizer.class);
  finalizer.setMapOutputKeyClass(Text.class);
  finalizer.setMapOutputValueClass(Route.class);
  finalizer.setOutputKeyClass(Text.class);
  finalizer.setOutputValueClass(LoopSet.class);
  FileOutputFormat.setOutputPath(finalizer, new Path(webGraphDb, LOOPS_DIR));
  finalizer.setOutputFormat(MapFileOutputFormat.class);

  try {
    LOG.info("Loops: starting finalizer");
    JobClient.runJob(finalizer);
    LOG.info("Loops: finished finalizer");
  } catch (IOException e) {
    LOG.error(StringUtils.stringifyException(e));
    throw e;
  }

  long end = System.currentTimeMillis();
  LOG.info("Loops: finished at " + sdf.format(end) + ", elapsed: "
      + TimingUtil.elapsedTime(start, end));
}
From source file:org.apache.nutch.scoring.webgraph.NodeDumper.java
License:Apache License
/**
 * Runs the process to dump the top urls out to a text file.
 *
 * @param webGraphDb The WebGraph from which to pull values.
 * @param topN
 * @param output
 *
 * @throws IOException If an error occurs while dumping the top values.
 */
public void dumpNodes(Path webGraphDb, DumpType type, long topN, Path output, boolean asEff,
    NameType nameType, AggrType aggrType, boolean asSequenceFile) throws Exception {
  SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  long start = System.currentTimeMillis();
  LOG.info("NodeDumper: starting at " + sdf.format(start));
  Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR);
  Configuration conf = getConf();

  JobConf dumper = new NutchJob(conf);
  dumper.setJobName("NodeDumper: " + webGraphDb);
  FileInputFormat.addInputPath(dumper, nodeDb);
  dumper.setInputFormat(SequenceFileInputFormat.class);

  if (nameType == null) {
    dumper.setMapperClass(Sorter.class);
    dumper.setReducerClass(Sorter.class);
    dumper.setMapOutputKeyClass(FloatWritable.class);
    dumper.setMapOutputValueClass(Text.class);
  } else {
    dumper.setMapperClass(Dumper.class);
    dumper.setReducerClass(Dumper.class);
    dumper.setMapOutputKeyClass(Text.class);
    dumper.setMapOutputValueClass(FloatWritable.class);
  }

  dumper.setOutputKeyClass(Text.class);
  dumper.setOutputValueClass(FloatWritable.class);
  FileOutputFormat.setOutputPath(dumper, output);

  if (asSequenceFile) {
    dumper.setOutputFormat(SequenceFileOutputFormat.class);
  } else {
    dumper.setOutputFormat(TextOutputFormat.class);
  }

  dumper.setNumReduceTasks(1);
  dumper.setBoolean("inlinks", type == DumpType.INLINKS);
  dumper.setBoolean("outlinks", type == DumpType.OUTLINKS);
  dumper.setBoolean("scores", type == DumpType.SCORES);

  dumper.setBoolean("host", nameType == NameType.HOST);
  dumper.setBoolean("domain", nameType == NameType.DOMAIN);
  dumper.setBoolean("sum", aggrType == AggrType.SUM);
  dumper.setBoolean("max", aggrType == AggrType.MAX);

  dumper.setLong("topn", topN);

  // Set equals-sign as separator for Solr's ExternalFileField
  if (asEff) {
    dumper.set("mapred.textoutputformat.separator", "=");
  }

  try {
    LOG.info("NodeDumper: running");
    JobClient.runJob(dumper);
  } catch (IOException e) {
    LOG.error(StringUtils.stringifyException(e));
    throw e;
  }

  long end = System.currentTimeMillis();
  LOG.info("NodeDumper: finished at " + sdf.format(end) + ", elapsed: "
      + TimingUtil.elapsedTime(start, end));
}