List of usage examples for org.apache.hadoop.mapred JobConf setMapOutputValueClass
public void setMapOutputValueClass(Class<?> theClass)
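Before the extracted examples, here is a minimal sketch of where this setter fits in a classic mapred job setup. It is not drawn from any of the sources below; MyMapper and MyReducer are hypothetical placeholder classes. setMapOutputValueClass is needed when the mapper's intermediate value type differs from the job's final output value type; if it is never called, the map output value class defaults to the value class set via setOutputValueClass.

// Minimal sketch (hypothetical MyMapper/MyReducer), assuming the mapper emits
// <Text, IntWritable> while the reducer writes <Text, LongWritable>.
JobConf job = new JobConf(new Configuration());
job.setJobName("map-output-value-class example");

job.setMapperClass(MyMapper.class);    // emits <Text, IntWritable>
job.setReducerClass(MyReducer.class);  // writes <Text, LongWritable>

// Intermediate key/value types produced by the mapper
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);

// Final key/value types written by the reducer
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(LongWritable.class);

JobClient.runJob(job);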
From source file:org.apache.nutch.indexer.field.FieldIndexer.java
License:Apache License
public void index(Path[] fields, Path indexDir) throws IOException {
  SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  long start = System.currentTimeMillis();
  LOG.info("FieldIndexer: starting at " + sdf.format(start));

  JobConf job = new NutchJob(getConf());
  job.setJobName("FieldIndexer: " + indexDir);

  for (int i = 0; i < fields.length; i++) {
    Path fieldsDb = fields[i];
    LOG.info("FieldIndexer: adding fields db: " + fieldsDb);
    FileInputFormat.addInputPath(job, fieldsDb);
  }

  job.setInputFormat(SequenceFileInputFormat.class);
  job.setMapperClass(FieldIndexer.class);
  job.setReducerClass(FieldIndexer.class);
  FileOutputFormat.setOutputPath(job, indexDir);
  job.setOutputFormat(OutputFormat.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(FieldWritable.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(LuceneDocumentWrapper.class);

  JobClient.runJob(job);

  long end = System.currentTimeMillis();
  LOG.info("FieldIndexer: finished at " + sdf.format(end) + ", elapsed: "
      + TimingUtil.elapsedTime(start, end));
}
From source file:org.apache.nutch.indexer.IndexerMapReduce.java
License:Apache License
public static void initMRJob(Path crawlDb, Path linkDb, Collection<Path> segments, JobConf job) {
  LOG.info("IndexerMapReduce: crawldb: " + crawlDb);
  if (linkDb != null)
    LOG.info("IndexerMapReduce: linkdb: " + linkDb);

  for (final Path segment : segments) {
    LOG.info("IndexerMapReduces: adding segment: " + segment);
    FileInputFormat.addInputPath(job, new Path(segment, CrawlDatum.FETCH_DIR_NAME));
    FileInputFormat.addInputPath(job, new Path(segment, CrawlDatum.PARSE_DIR_NAME));
    FileInputFormat.addInputPath(job, new Path(segment, ParseData.DIR_NAME));
    FileInputFormat.addInputPath(job, new Path(segment, ParseText.DIR_NAME));
  }

  FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));
  if (linkDb != null)
    FileInputFormat.addInputPath(job, new Path(linkDb, LinkDb.CURRENT_NAME));

  job.setInputFormat(SequenceFileInputFormat.class);
  job.setMapperClass(IndexerMapReduce.class);
  job.setReducerClass(IndexerMapReduce.class);

  job.setOutputFormat(IndexerOutputFormat.class);
  job.setOutputKeyClass(Text.class);
  job.setMapOutputValueClass(NutchWritable.class);
  job.setOutputValueClass(NutchWritable.class);
}
From source file:org.apache.nutch.indexer.solr.SolrClean.java
License:Apache License
public void delete(String crawldb, String solrUrl, boolean noCommit) throws IOException {
  SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  long start = System.currentTimeMillis();
  LOG.info("SolrClean: starting at " + sdf.format(start));

  JobConf job = new NutchJob(getConf());

  FileInputFormat.addInputPath(job, new Path(crawldb, CrawlDb.CURRENT_NAME));
  job.setBoolean("noCommit", noCommit);
  job.set(SolrConstants.SERVER_URL, solrUrl);
  job.setInputFormat(SequenceFileInputFormat.class);
  job.setOutputFormat(NullOutputFormat.class);
  job.setMapOutputKeyClass(ByteWritable.class);
  job.setMapOutputValueClass(Text.class);
  job.setMapperClass(DBFilter.class);
  job.setReducerClass(SolrDeleter.class);

  JobClient.runJob(job);

  long end = System.currentTimeMillis();
  LOG.info("SolrClean: finished at " + sdf.format(end) + ", elapsed: "
      + TimingUtil.elapsedTime(start, end));
}
From source file:org.apache.nutch.scoring.webgraph.LinkDumper.java
License:Apache License
/**
 * Runs the inverter and merger jobs of the LinkDumper tool to create the
 * url to inlink node database.
 */
public void dumpLinks(Path webGraphDb) throws IOException {
  SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  long start = System.currentTimeMillis();
  LOG.info("NodeDumper: starting at " + sdf.format(start));

  Configuration conf = getConf();
  FileSystem fs = FileSystem.get(conf);
  Path linkdump = new Path(webGraphDb, DUMP_DIR);
  Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR);
  Path loopSetDb = new Path(webGraphDb, Loops.LOOPS_DIR);
  boolean loopsExists = fs.exists(loopSetDb);
  Path outlinkDb = new Path(webGraphDb, WebGraph.OUTLINK_DIR);

  // run the inverter job
  Path tempInverted = new Path(webGraphDb,
      "inverted-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
  JobConf inverter = new NutchJob(conf);
  inverter.setJobName("LinkDumper: inverter");
  FileInputFormat.addInputPath(inverter, nodeDb);
  if (loopsExists) {
    FileInputFormat.addInputPath(inverter, loopSetDb);
  }
  FileInputFormat.addInputPath(inverter, outlinkDb);
  inverter.setInputFormat(SequenceFileInputFormat.class);
  inverter.setMapperClass(Inverter.class);
  inverter.setReducerClass(Inverter.class);
  inverter.setMapOutputKeyClass(Text.class);
  inverter.setMapOutputValueClass(ObjectWritable.class);
  inverter.setOutputKeyClass(Text.class);
  inverter.setOutputValueClass(LinkNode.class);
  FileOutputFormat.setOutputPath(inverter, tempInverted);
  inverter.setOutputFormat(SequenceFileOutputFormat.class);

  try {
    LOG.info("LinkDumper: running inverter");
    JobClient.runJob(inverter);
    LOG.info("LinkDumper: finished inverter");
  } catch (IOException e) {
    LOG.error(StringUtils.stringifyException(e));
    throw e;
  }

  // run the merger job
  JobConf merger = new NutchJob(conf);
  merger.setJobName("LinkDumper: merger");
  FileInputFormat.addInputPath(merger, tempInverted);
  merger.setInputFormat(SequenceFileInputFormat.class);
  merger.setReducerClass(Merger.class);
  merger.setMapOutputKeyClass(Text.class);
  merger.setMapOutputValueClass(LinkNode.class);
  merger.setOutputKeyClass(Text.class);
  merger.setOutputValueClass(LinkNodes.class);
  FileOutputFormat.setOutputPath(merger, linkdump);
  merger.setOutputFormat(MapFileOutputFormat.class);

  try {
    LOG.info("LinkDumper: running merger");
    JobClient.runJob(merger);
    LOG.info("LinkDumper: finished merger");
  } catch (IOException e) {
    LOG.error(StringUtils.stringifyException(e));
    throw e;
  }

  fs.delete(tempInverted, true);

  long end = System.currentTimeMillis();
  LOG.info("LinkDumper: finished at " + sdf.format(end) + ", elapsed: "
      + TimingUtil.elapsedTime(start, end));
}
From source file:org.apache.nutch.scoring.webgraph.LinkRank.java
License:Apache License
/**
 * Runs the counter job. The counter job determines the number of links in the
 * webgraph. This is used during analysis.
 *
 * @param fs The job file system.
 * @param webGraphDb The web graph database to use.
 *
 * @return The number of nodes in the web graph.
 * @throws IOException If an error occurs while running the counter job.
 */
private int runCounter(FileSystem fs, Path webGraphDb) throws IOException {
  // configure the counter job
  Path numLinksPath = new Path(webGraphDb, NUM_NODES);
  Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR);
  JobConf counter = new NutchJob(getConf());
  counter.setJobName("LinkRank Counter");
  FileInputFormat.addInputPath(counter, nodeDb);
  FileOutputFormat.setOutputPath(counter, numLinksPath);
  counter.setInputFormat(SequenceFileInputFormat.class);
  counter.setMapperClass(Counter.class);
  counter.setCombinerClass(Counter.class);
  counter.setReducerClass(Counter.class);
  counter.setMapOutputKeyClass(Text.class);
  counter.setMapOutputValueClass(LongWritable.class);
  counter.setOutputKeyClass(Text.class);
  counter.setOutputValueClass(LongWritable.class);
  counter.setNumReduceTasks(1);
  counter.setOutputFormat(TextOutputFormat.class);
  counter.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

  // run the counter job, outputs to a single reduce task and file
  LOG.info("Starting link counter job");
  try {
    JobClient.runJob(counter);
  } catch (IOException e) {
    LOG.error(StringUtils.stringifyException(e));
    throw e;
  }
  LOG.info("Finished link counter job");

  // read the first (and only) line from the file which should be the
  // number of links in the web graph
  LOG.info("Reading numlinks temp file");
  FSDataInputStream readLinks = fs.open(new Path(numLinksPath, "part-00000"));
  BufferedReader buffer = new BufferedReader(new InputStreamReader(readLinks));
  String numLinksLine = buffer.readLine();
  readLinks.close();

  // check if there are links to process, if none, webgraph might be empty
  if (numLinksLine == null || numLinksLine.length() == 0) {
    fs.delete(numLinksPath, true);
    throw new IOException("No links to process, is the webgraph empty?");
  }

  // delete temp file and convert and return the number of links as an int
  LOG.info("Deleting numlinks temp file");
  fs.delete(numLinksPath, true);
  String numLinks = numLinksLine.split("\\s+")[1];
  return Integer.parseInt(numLinks);
}
From source file:org.apache.nutch.scoring.webgraph.LinkRank.java
License:Apache License
/**
 * Runs the initializer job. The initializer job sets up the nodes with a
 * default starting score for link analysis.
 *
 * @param nodeDb The node database to use.
 * @param output The job output directory.
 *
 * @throws IOException If an error occurs while running the initializer job.
 */
private void runInitializer(Path nodeDb, Path output) throws IOException {
  // configure the initializer
  JobConf initializer = new NutchJob(getConf());
  initializer.setJobName("LinkAnalysis Initializer");
  FileInputFormat.addInputPath(initializer, nodeDb);
  FileOutputFormat.setOutputPath(initializer, output);
  initializer.setInputFormat(SequenceFileInputFormat.class);
  initializer.setMapperClass(Initializer.class);
  initializer.setMapOutputKeyClass(Text.class);
  initializer.setMapOutputValueClass(Node.class);
  initializer.setOutputKeyClass(Text.class);
  initializer.setOutputValueClass(Node.class);
  initializer.setOutputFormat(MapFileOutputFormat.class);
  initializer.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

  // run the initializer
  LOG.info("Starting initialization job");
  try {
    JobClient.runJob(initializer);
  } catch (IOException e) {
    LOG.error(StringUtils.stringifyException(e));
    throw e;
  }
  LOG.info("Finished initialization job.");
}
From source file:org.apache.nutch.scoring.webgraph.LinkRank.java
License:Apache License
/**
 * Runs the inverter job. The inverter job flips outlinks to inlinks to be
 * passed into the analysis job.
 *
 * The inverter job takes a link loops database if it exists. It is an
 * optional component of link analysis due to its extreme computational and
 * space requirements, but it can be very useful in weeding out and eliminating
 * link farms and other spam pages.
 *
 * @param nodeDb The node database to use.
 * @param outlinkDb The outlink database to use.
 * @param loopDb The loop database to use if it exists.
 * @param output The output directory.
 *
 * @throws IOException If an error occurs while running the inverter job.
 */
private void runInverter(Path nodeDb, Path outlinkDb, Path loopDb, Path output) throws IOException {
  // configure the inverter
  JobConf inverter = new NutchJob(getConf());
  inverter.setJobName("LinkAnalysis Inverter");
  FileInputFormat.addInputPath(inverter, nodeDb);
  FileInputFormat.addInputPath(inverter, outlinkDb);

  // add the loop database if it exists, isn't null
  if (loopDb != null) {
    FileInputFormat.addInputPath(inverter, loopDb);
  }
  FileOutputFormat.setOutputPath(inverter, output);
  inverter.setInputFormat(SequenceFileInputFormat.class);
  inverter.setMapperClass(Inverter.class);
  inverter.setReducerClass(Inverter.class);
  inverter.setMapOutputKeyClass(Text.class);
  inverter.setMapOutputValueClass(ObjectWritable.class);
  inverter.setOutputKeyClass(Text.class);
  inverter.setOutputValueClass(LinkDatum.class);
  inverter.setOutputFormat(SequenceFileOutputFormat.class);
  inverter.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

  // run the inverter job
  LOG.info("Starting inverter job");
  try {
    JobClient.runJob(inverter);
  } catch (IOException e) {
    LOG.error(StringUtils.stringifyException(e));
    throw e;
  }
  LOG.info("Finished inverter job.");
}
From source file:org.apache.nutch.scoring.webgraph.LinkRank.java
License:Apache License
/**
 * Runs the link analysis job. The link analysis job applies the link rank
 * formula to create a score per url and stores that score in the NodeDb.
 *
 * Typically the link analysis job is run a number of times to allow the link
 * rank scores to converge.
 *
 * @param nodeDb The node database from which we are getting previous link
 *          rank scores.
 * @param inverted The inverted inlinks
 * @param output The link analysis output.
 * @param iteration The current iteration number.
 * @param numIterations The total number of link analysis iterations
 *
 * @throws IOException If an error occurs during link analysis.
 */
private void runAnalysis(Path nodeDb, Path inverted, Path output, int iteration,
    int numIterations, float rankOne) throws IOException {
  JobConf analyzer = new NutchJob(getConf());
  analyzer.set("link.analyze.iteration", String.valueOf(iteration + 1));
  analyzer.setJobName("LinkAnalysis Analyzer, iteration " + (iteration + 1) + " of " + numIterations);
  FileInputFormat.addInputPath(analyzer, nodeDb);
  FileInputFormat.addInputPath(analyzer, inverted);
  FileOutputFormat.setOutputPath(analyzer, output);
  analyzer.set("link.analyze.rank.one", String.valueOf(rankOne));
  analyzer.setMapOutputKeyClass(Text.class);
  analyzer.setMapOutputValueClass(ObjectWritable.class);
  analyzer.setInputFormat(SequenceFileInputFormat.class);
  analyzer.setMapperClass(Analyzer.class);
  analyzer.setReducerClass(Analyzer.class);
  analyzer.setOutputKeyClass(Text.class);
  analyzer.setOutputValueClass(Node.class);
  analyzer.setOutputFormat(MapFileOutputFormat.class);
  analyzer.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

  LOG.info("Starting analysis job");
  try {
    JobClient.runJob(analyzer);
  } catch (IOException e) {
    LOG.error(StringUtils.stringifyException(e));
    throw e;
  }
  LOG.info("Finished analysis job.");
}
From source file:org.apache.nutch.scoring.webgraph.Loops.java
License:Apache License
/**
 * Runs the various loop jobs.
 */
public void findLoops(Path webGraphDb) throws IOException {
  SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  long start = System.currentTimeMillis();
  if (LOG.isInfoEnabled()) {
    LOG.info("Loops: starting at " + sdf.format(start));
    LOG.info("Loops: webgraphdb: " + webGraphDb);
  }

  Configuration conf = getConf();
  FileSystem fs = FileSystem.get(conf);
  Path outlinkDb = new Path(webGraphDb, WebGraph.OUTLINK_DIR);
  Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR);
  Path routes = new Path(webGraphDb, ROUTES_DIR);
  Path tempRoute = new Path(webGraphDb,
      ROUTES_DIR + "-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

  // run the initializer
  JobConf init = new NutchJob(conf);
  init.setJobName("Initializer: " + webGraphDb);
  FileInputFormat.addInputPath(init, outlinkDb);
  FileInputFormat.addInputPath(init, nodeDb);
  init.setInputFormat(SequenceFileInputFormat.class);
  init.setMapperClass(Initializer.class);
  init.setReducerClass(Initializer.class);
  init.setMapOutputKeyClass(Text.class);
  init.setMapOutputValueClass(ObjectWritable.class);
  init.setOutputKeyClass(Text.class);
  init.setOutputValueClass(Route.class);
  FileOutputFormat.setOutputPath(init, tempRoute);
  init.setOutputFormat(SequenceFileOutputFormat.class);

  try {
    LOG.info("Loops: starting initializer");
    JobClient.runJob(init);
    LOG.info("Loops: installing initializer " + routes);
    FSUtils.replace(fs, routes, tempRoute, true);
    LOG.info("Loops: finished initializer");
  } catch (IOException e) {
    LOG.error(StringUtils.stringifyException(e));
    throw e;
  }

  // run the loops job for a maxdepth, default 2, which will find a 3 link
  // loop cycle
  int depth = conf.getInt("link.loops.depth", 2);
  for (int i = 0; i < depth; i++) {
    JobConf looper = new NutchJob(conf);
    looper.setJobName("Looper: " + (i + 1) + " of " + depth);
    FileInputFormat.addInputPath(looper, outlinkDb);
    FileInputFormat.addInputPath(looper, routes);
    looper.setInputFormat(SequenceFileInputFormat.class);
    looper.setMapperClass(Looper.class);
    looper.setReducerClass(Looper.class);
    looper.setMapOutputKeyClass(Text.class);
    looper.setMapOutputValueClass(ObjectWritable.class);
    looper.setOutputKeyClass(Text.class);
    looper.setOutputValueClass(Route.class);
    FileOutputFormat.setOutputPath(looper, tempRoute);
    looper.setOutputFormat(SequenceFileOutputFormat.class);
    looper.setBoolean("last", i == (depth - 1));

    try {
      LOG.info("Loops: starting looper");
      JobClient.runJob(looper);
      LOG.info("Loops: installing looper " + routes);
      FSUtils.replace(fs, routes, tempRoute, true);
      LOG.info("Loops: finished looper");
    } catch (IOException e) {
      LOG.error(StringUtils.stringifyException(e));
      throw e;
    }
  }

  // run the finalizer
  JobConf finalizer = new NutchJob(conf);
  finalizer.setJobName("Finalizer: " + webGraphDb);
  FileInputFormat.addInputPath(finalizer, routes);
  finalizer.setInputFormat(SequenceFileInputFormat.class);
  finalizer.setMapperClass(Finalizer.class);
  finalizer.setReducerClass(Finalizer.class);
  finalizer.setMapOutputKeyClass(Text.class);
  finalizer.setMapOutputValueClass(Route.class);
  finalizer.setOutputKeyClass(Text.class);
  finalizer.setOutputValueClass(LoopSet.class);
  FileOutputFormat.setOutputPath(finalizer, new Path(webGraphDb, LOOPS_DIR));
  finalizer.setOutputFormat(MapFileOutputFormat.class);

  try {
    LOG.info("Loops: starting finalizer");
    JobClient.runJob(finalizer);
    LOG.info("Loops: finished finalizer");
  } catch (IOException e) {
    LOG.error(StringUtils.stringifyException(e));
    throw e;
  }

  long end = System.currentTimeMillis();
  LOG.info("Loops: finished at " + sdf.format(end) + ", elapsed: "
      + TimingUtil.elapsedTime(start, end));
}
From source file:org.apache.nutch.scoring.webgraph.NodeDumper.java
License:Apache License
/**
 * Runs the process to dump the top urls out to a text file.
 *
 * @param webGraphDb The WebGraph from which to pull values.
 * @param topN
 * @param output
 *
 * @throws IOException If an error occurs while dumping the top values.
 */
public void dumpNodes(Path webGraphDb, DumpType type, long topN, Path output, boolean asEff,
    NameType nameType, AggrType aggrType, boolean asSequenceFile) throws Exception {
  SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  long start = System.currentTimeMillis();
  LOG.info("NodeDumper: starting at " + sdf.format(start));
  Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR);
  Configuration conf = getConf();

  JobConf dumper = new NutchJob(conf);
  dumper.setJobName("NodeDumper: " + webGraphDb);
  FileInputFormat.addInputPath(dumper, nodeDb);
  dumper.setInputFormat(SequenceFileInputFormat.class);

  if (nameType == null) {
    dumper.setMapperClass(Sorter.class);
    dumper.setReducerClass(Sorter.class);
    dumper.setMapOutputKeyClass(FloatWritable.class);
    dumper.setMapOutputValueClass(Text.class);
  } else {
    dumper.setMapperClass(Dumper.class);
    dumper.setReducerClass(Dumper.class);
    dumper.setMapOutputKeyClass(Text.class);
    dumper.setMapOutputValueClass(FloatWritable.class);
  }

  dumper.setOutputKeyClass(Text.class);
  dumper.setOutputValueClass(FloatWritable.class);
  FileOutputFormat.setOutputPath(dumper, output);

  if (asSequenceFile) {
    dumper.setOutputFormat(SequenceFileOutputFormat.class);
  } else {
    dumper.setOutputFormat(TextOutputFormat.class);
  }

  dumper.setNumReduceTasks(1);
  dumper.setBoolean("inlinks", type == DumpType.INLINKS);
  dumper.setBoolean("outlinks", type == DumpType.OUTLINKS);
  dumper.setBoolean("scores", type == DumpType.SCORES);

  dumper.setBoolean("host", nameType == NameType.HOST);
  dumper.setBoolean("domain", nameType == NameType.DOMAIN);
  dumper.setBoolean("sum", aggrType == AggrType.SUM);
  dumper.setBoolean("max", aggrType == AggrType.MAX);

  dumper.setLong("topn", topN);

  // Set equals-sign as separator for Solr's ExternalFileField
  if (asEff) {
    dumper.set("mapred.textoutputformat.separator", "=");
  }

  try {
    LOG.info("NodeDumper: running");
    JobClient.runJob(dumper);
  } catch (IOException e) {
    LOG.error(StringUtils.stringifyException(e));
    throw e;
  }

  long end = System.currentTimeMillis();
  LOG.info("NodeDumper: finished at " + sdf.format(end) + ", elapsed: "
      + TimingUtil.elapsedTime(start, end));
}