Example usage for org.apache.hadoop.mapred JobConf setMapOutputKeyClass

List of usage examples for org.apache.hadoop.mapred JobConf setMapOutputKeyClass

Introduction

On this page you can find example usage of org.apache.hadoop.mapred.JobConf.setMapOutputKeyClass.

Prototype

public void setMapOutputKeyClass(Class<?> theClass) 

Document

Set the key class for the map output data.
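
In the old org.apache.hadoop.mapred API the map output key class defaults to the job's output key class, so setMapOutputKeyClass only needs to be called when the key or value types emitted by the mapper differ from the final reduce output types. Below is a minimal, self-contained sketch of that case; WordLengthJob and its mapper and reducer are illustrative stand-ins, not code taken from the examples on this page.

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;

public class WordLengthJob {

    /** Emits each word with its length as <Text, IntWritable>. */
    public static class LengthMapper extends MapReduceBase
            implements Mapper<LongWritable, Text, Text, IntWritable> {
        public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output,
                Reporter reporter) throws IOException {
            for (String word : value.toString().split("\\s+")) {
                if (word.length() > 0) {
                    output.collect(new Text(word), new IntWritable(word.length()));
                }
            }
        }
    }

    /** Sums the lengths per word and emits <Text, LongWritable>. */
    public static class SumReducer extends MapReduceBase
            implements Reducer<Text, IntWritable, Text, LongWritable> {
        public void reduce(Text key, Iterator<IntWritable> values,
                OutputCollector<Text, LongWritable> output, Reporter reporter) throws IOException {
            long sum = 0;
            while (values.hasNext()) {
                sum += values.next().get();
            }
            output.collect(key, new LongWritable(sum));
        }
    }

    public static void main(String[] args) throws IOException {
        JobConf job = new JobConf(WordLengthJob.class);
        job.setJobName("word-length");

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.setInputFormat(TextInputFormat.class);
        job.setOutputFormat(TextOutputFormat.class);
        job.setMapperClass(LengthMapper.class);
        job.setReducerClass(SumReducer.class);

        // The map output value type (IntWritable) differs from the final output
        // value type (LongWritable), so the intermediate types are declared here.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        JobClient.runJob(job);
    }
}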

Usage

From source file:org.apache.nutch.indexer.field.CustomFields.java

License:Apache License

/**
 * MapReduce job that converts text values into FieldWritable objects.
 *
 * @param inputs The directories with text files to convert.
 * @param output The converter output directory.
 * 
 * @throws IOException If an error occurs while converting.
 */
private void runConverter(Path[] inputs, Path output) throws IOException {

    JobConf converter = new NutchJob(getConf());
    converter.setJobName("CustomFields Converter");
    for (int i = 0; i < inputs.length; i++) {
        FileInputFormat.addInputPath(converter, inputs[i]);
    }
    FileOutputFormat.setOutputPath(converter, output);
    converter.setInputFormat(TextInputFormat.class);
    converter.setMapperClass(Converter.class);
    converter.setReducerClass(Converter.class);
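    // here the map output types match the final output types, so the next two
    // calls are not strictly required, but they make the mapper's contract explicit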
    converter.setMapOutputKeyClass(Text.class);
    converter.setMapOutputValueClass(FieldWritable.class);
    converter.setOutputKeyClass(Text.class);
    converter.setOutputValueClass(FieldWritable.class);
    converter.setOutputFormat(SequenceFileOutputFormat.class);

    LOG.info("Starting converter job");
    try {
        JobClient.runJob(converter);
    } catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
        throw e;
    }
    LOG.info("Finished converter job.");
}

From source file:org.apache.nutch.indexer.field.CustomFields.java

License:Apache License

/**
 * Aggregates multiple FieldWritable objects with the same name. Depending on
 * settings in the custom-fields.xml file, a field may map to one or more
 * fields. This job aggregates fields and then collects based on the
 * configuration settings.
 * 
 * @param basicFields The basicfields FieldWritable objects.
 * @param converted The converted custom field objects.
 * @param output The final output directory for custom field objects.
 * 
 * @throws IOException If an error occurs while converting.
 */
private void runCollector(Path basicFields, Path converted, Path output) throws IOException {

    JobConf collector = new NutchJob(getConf());
    collector.setJobName("CustomFields Collector");
    FileInputFormat.addInputPath(collector, converted);
    FileInputFormat.addInputPath(collector, basicFields);
    FileOutputFormat.setOutputPath(collector, output);
    collector.setInputFormat(SequenceFileInputFormat.class);
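    // the mapper emits ObjectWritable values while the reducer writes FieldWritable,
    // so the intermediate key/value classes are declared explicitly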
    collector.setMapOutputKeyClass(Text.class);
    collector.setMapOutputValueClass(ObjectWritable.class);
    collector.setMapperClass(Collector.class);
    collector.setReducerClass(Collector.class);
    collector.setOutputKeyClass(Text.class);
    collector.setOutputValueClass(FieldWritable.class);
    collector.setOutputFormat(SequenceFileOutputFormat.class);

    LOG.info("Starting collector job");
    try {
        JobClient.runJob(collector);
    } catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
        throw e;
    }
    LOG.info("Finished collector job.");
}

From source file:org.apache.nutch.indexer.field.FieldIndexer.java

License:Apache License

public void index(Path[] fields, Path indexDir) throws IOException {

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("FieldIndexer: starting at " + sdf.format(start));

    JobConf job = new NutchJob(getConf());
    job.setJobName("FieldIndexer: " + indexDir);

    for (int i = 0; i < fields.length; i++) {
        Path fieldsDb = fields[i];
        LOG.info("FieldIndexer: adding fields db: " + fieldsDb);
        FileInputFormat.addInputPath(job, fieldsDb);
    }

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(FieldIndexer.class);
    job.setReducerClass(FieldIndexer.class);
    FileOutputFormat.setOutputPath(job, indexDir);
    job.setOutputFormat(OutputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(FieldWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LuceneDocumentWrapper.class);

    JobClient.runJob(job);
    long end = System.currentTimeMillis();
    LOG.info("FieldIndexer: finished at " + sdf.format(end) + ", elapsed: "
            + TimingUtil.elapsedTime(start, end));
}

From source file:org.apache.nutch.indexer.solr.SolrClean.java

License:Apache License

public void delete(String crawldb, String solrUrl, boolean noCommit) throws IOException {
    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("SolrClean: starting at " + sdf.format(start));

    JobConf job = new NutchJob(getConf());

    FileInputFormat.addInputPath(job, new Path(crawldb, CrawlDb.CURRENT_NAME));
    job.setBoolean("noCommit", noCommit);
    job.set(SolrConstants.SERVER_URL, solrUrl);
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setOutputFormat(NullOutputFormat.class);
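    // NullOutputFormat discards the reduce output, so only the intermediate
    // <ByteWritable, Text> map output types need to be declared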
    job.setMapOutputKeyClass(ByteWritable.class);
    job.setMapOutputValueClass(Text.class);
    job.setMapperClass(DBFilter.class);
    job.setReducerClass(SolrDeleter.class);

    JobClient.runJob(job);

    long end = System.currentTimeMillis();
    LOG.info("SolrClean: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}

From source file:org.apache.nutch.scoring.webgraph.LinkDumper.java

License:Apache License

/**
 * Runs the inverter and merger jobs of the LinkDumper tool to create the 
 * url to inlink node database.
 */
public void dumpLinks(Path webGraphDb) throws IOException {

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("NodeDumper: starting at " + sdf.format(start));
    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    Path linkdump = new Path(webGraphDb, DUMP_DIR);
    Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR);
    Path loopSetDb = new Path(webGraphDb, Loops.LOOPS_DIR);
    boolean loopsExists = fs.exists(loopSetDb);
    Path outlinkDb = new Path(webGraphDb, WebGraph.OUTLINK_DIR);

    // run the inverter job
    Path tempInverted = new Path(webGraphDb,
            "inverted-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
    JobConf inverter = new NutchJob(conf);
    inverter.setJobName("LinkDumper: inverter");
    FileInputFormat.addInputPath(inverter, nodeDb);
    if (loopsExists) {
        FileInputFormat.addInputPath(inverter, loopSetDb);
    }
    FileInputFormat.addInputPath(inverter, outlinkDb);
    inverter.setInputFormat(SequenceFileInputFormat.class);
    inverter.setMapperClass(Inverter.class);
    inverter.setReducerClass(Inverter.class);
    inverter.setMapOutputKeyClass(Text.class);
    inverter.setMapOutputValueClass(ObjectWritable.class);
    inverter.setOutputKeyClass(Text.class);
    inverter.setOutputValueClass(LinkNode.class);
    FileOutputFormat.setOutputPath(inverter, tempInverted);
    inverter.setOutputFormat(SequenceFileOutputFormat.class);

    try {
        LOG.info("LinkDumper: running inverter");
        JobClient.runJob(inverter);
        LOG.info("LinkDumper: finished inverter");
    } catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
        throw e;
    }

    // run the merger job
    JobConf merger = new NutchJob(conf);
    merger.setJobName("LinkDumper: merger");
    FileInputFormat.addInputPath(merger, tempInverted);
    merger.setInputFormat(SequenceFileInputFormat.class);
    merger.setReducerClass(Merger.class);
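    // no mapper class is set, so the default identity mapper is used; its output
    // types (Text, LinkNode) still differ from the reduce output (Text, LinkNodes)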
    merger.setMapOutputKeyClass(Text.class);
    merger.setMapOutputValueClass(LinkNode.class);
    merger.setOutputKeyClass(Text.class);
    merger.setOutputValueClass(LinkNodes.class);
    FileOutputFormat.setOutputPath(merger, linkdump);
    merger.setOutputFormat(MapFileOutputFormat.class);

    try {
        LOG.info("LinkDumper: running merger");
        JobClient.runJob(merger);
        LOG.info("LinkDumper: finished merger");
    } catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
        throw e;
    }

    fs.delete(tempInverted, true);
    long end = System.currentTimeMillis();
    LOG.info("LinkDumper: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}

From source file:org.apache.nutch.scoring.webgraph.LinkRank.java

License:Apache License

/**
 * Runs the counter job. The counter job determines the number of links in the
 * webgraph. This is used during analysis.
 *
 * @param fs The job file system.
 * @param webGraphDb The web graph database to use.
 * 
 * @return The number of nodes in the web graph.
 * @throws IOException If an error occurs while running the counter job.
 */
private int runCounter(FileSystem fs, Path webGraphDb) throws IOException {

    // configure the counter job
    Path numLinksPath = new Path(webGraphDb, NUM_NODES);
    Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR);
    JobConf counter = new NutchJob(getConf());
    counter.setJobName("LinkRank Counter");
    FileInputFormat.addInputPath(counter, nodeDb);
    FileOutputFormat.setOutputPath(counter, numLinksPath);
    counter.setInputFormat(SequenceFileInputFormat.class);
    counter.setMapperClass(Counter.class);
    counter.setCombinerClass(Counter.class);
    counter.setReducerClass(Counter.class);
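    // the combiner runs on the map output, so its input and output must both be
    // <Text, LongWritable>, matching the map output types declared below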
    counter.setMapOutputKeyClass(Text.class);
    counter.setMapOutputValueClass(LongWritable.class);
    counter.setOutputKeyClass(Text.class);
    counter.setOutputValueClass(LongWritable.class);
    counter.setNumReduceTasks(1);
    counter.setOutputFormat(TextOutputFormat.class);
    counter.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

    // run the counter job, outputs to a single reduce task and file
    LOG.info("Starting link counter job");
    try {
        JobClient.runJob(counter);
    } catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
        throw e;
    }
    LOG.info("Finished link counter job");

    // read the first (and only) line from the file which should be the
    // number of links in the web graph
    LOG.info("Reading numlinks temp file");
    FSDataInputStream readLinks = fs.open(new Path(numLinksPath, "part-00000"));
    BufferedReader buffer = new BufferedReader(new InputStreamReader(readLinks));
    String numLinksLine = buffer.readLine();
    readLinks.close();

    // check if there are links to process, if none, webgraph might be empty
    if (numLinksLine == null || numLinksLine.length() == 0) {
        fs.delete(numLinksPath, true);
        throw new IOException("No links to process, is the webgraph empty?");
    }

    // delete temp file and convert and return the number of links as an int
    LOG.info("Deleting numlinks temp file");
    fs.delete(numLinksPath, true);
    String numLinks = numLinksLine.split("\\s+")[1];
    return Integer.parseInt(numLinks);
}

From source file:org.apache.nutch.scoring.webgraph.LinkRank.java

License:Apache License

/**
 * Runs the initializer job. The initializer job sets up the nodes with a
 * default starting score for link analysis.
 *
 * @param nodeDb The node database to use.
 * @param output The job output directory.
 * 
 * @throws IOException If an error occurs while running the initializer job.
 */
private void runInitializer(Path nodeDb, Path output) throws IOException {

    // configure the initializer
    JobConf initializer = new NutchJob(getConf());
    initializer.setJobName("LinkAnalysis Initializer");
    FileInputFormat.addInputPath(initializer, nodeDb);
    FileOutputFormat.setOutputPath(initializer, output);
    initializer.setInputFormat(SequenceFileInputFormat.class);
    initializer.setMapperClass(Initializer.class);
    initializer.setMapOutputKeyClass(Text.class);
    initializer.setMapOutputValueClass(Node.class);
    initializer.setOutputKeyClass(Text.class);
    initializer.setOutputValueClass(Node.class);
    initializer.setOutputFormat(MapFileOutputFormat.class);
    initializer.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

    // run the initializer
    LOG.info("Starting initialization job");
    try {
        JobClient.runJob(initializer);
    } catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
        throw e;
    }
    LOG.info("Finished initialization job.");
}

From source file:org.apache.nutch.scoring.webgraph.LinkRank.java

License:Apache License

/**
 * Runs the inverter job. The inverter job flips outlinks to inlinks to be
 * passed into the analysis job.
 * 
 * The inverter job takes a link loops database if it exists. It is an
 * optional component of link analysis due to its extreme computational and
 * space requirements, but it can be very useful in weeding out and eliminating
 * link farms and other spam pages.
 * 
 * @param nodeDb The node database to use.
 * @param outlinkDb The outlink database to use.
 * @param loopDb The loop database to use if it exists.
 * @param output The output directory.
 * 
 * @throws IOException If an error occurs while running the inverter job.
 */
private void runInverter(Path nodeDb, Path outlinkDb, Path loopDb, Path output) throws IOException {

    // configure the inverter
    JobConf inverter = new NutchJob(getConf());
    inverter.setJobName("LinkAnalysis Inverter");
    FileInputFormat.addInputPath(inverter, nodeDb);
    FileInputFormat.addInputPath(inverter, outlinkDb);

    // add the loop database if it exists (i.e., is not null)
    if (loopDb != null) {
        FileInputFormat.addInputPath(inverter, loopDb);
    }
    FileOutputFormat.setOutputPath(inverter, output);
    inverter.setInputFormat(SequenceFileInputFormat.class);
    inverter.setMapperClass(Inverter.class);
    inverter.setReducerClass(Inverter.class);
    inverter.setMapOutputKeyClass(Text.class);
    inverter.setMapOutputValueClass(ObjectWritable.class);
    inverter.setOutputKeyClass(Text.class);
    inverter.setOutputValueClass(LinkDatum.class);
    inverter.setOutputFormat(SequenceFileOutputFormat.class);
    inverter.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

    // run the inverter job
    LOG.info("Starting inverter job");
    try {
        JobClient.runJob(inverter);
    } catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
        throw e;
    }
    LOG.info("Finished inverter job.");
}

From source file:org.apache.nutch.scoring.webgraph.LinkRank.java

License:Apache License

/**
 * Runs the link analysis job. The link analysis job applies the link rank
 * formula to create a score per url and stores that score in the NodeDb.
 *
 * Typically the link analysis job is run a number of times to allow the link
 * rank scores to converge.
 * 
 * @param nodeDb The node database from which we are getting previous link
 * rank scores.
 * @param inverted The inverted inlinks
 * @param output The link analysis output.
 * @param iteration The current iteration number.
 * @param numIterations The total number of link analysis iterations
 * 
 * @throws IOException If an error occurs during link analysis.
 */
private void runAnalysis(Path nodeDb, Path inverted, Path output, int iteration, int numIterations,
        float rankOne) throws IOException {

    JobConf analyzer = new NutchJob(getConf());
    analyzer.set("link.analyze.iteration", String.valueOf(iteration + 1));
    analyzer.setJobName("LinkAnalysis Analyzer, iteration " + (iteration + 1) + " of " + numIterations);
    FileInputFormat.addInputPath(analyzer, nodeDb);
    FileInputFormat.addInputPath(analyzer, inverted);
    FileOutputFormat.setOutputPath(analyzer, output);
    analyzer.set("link.analyze.rank.one", String.valueOf(rankOne));
    analyzer.setMapOutputKeyClass(Text.class);
    analyzer.setMapOutputValueClass(ObjectWritable.class);
    analyzer.setInputFormat(SequenceFileInputFormat.class);
    analyzer.setMapperClass(Analyzer.class);
    analyzer.setReducerClass(Analyzer.class);
    analyzer.setOutputKeyClass(Text.class);
    analyzer.setOutputValueClass(Node.class);
    analyzer.setOutputFormat(MapFileOutputFormat.class);
    analyzer.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

    LOG.info("Starting analysis job");
    try {
        JobClient.runJob(analyzer);
    } catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
        throw e;
    }
    LOG.info("Finished analysis job.");
}

From source file:org.apache.nutch.scoring.webgraph.Loops.java

License:Apache License

/**
 * Runs the various loop jobs.
 */
public void findLoops(Path webGraphDb) throws IOException {

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    if (LOG.isInfoEnabled()) {
        LOG.info("Loops: starting at " + sdf.format(start));
        LOG.info("Loops: webgraphdb: " + webGraphDb);
    }

    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);
    Path outlinkDb = new Path(webGraphDb, WebGraph.OUTLINK_DIR);
    Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR);
    Path routes = new Path(webGraphDb, ROUTES_DIR);
    Path tempRoute = new Path(webGraphDb,
            ROUTES_DIR + "-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    // run the initializer
    JobConf init = new NutchJob(conf);
    init.setJobName("Initializer: " + webGraphDb);
    FileInputFormat.addInputPath(init, outlinkDb);
    FileInputFormat.addInputPath(init, nodeDb);
    init.setInputFormat(SequenceFileInputFormat.class);
    init.setMapperClass(Initializer.class);
    init.setReducerClass(Initializer.class);
    init.setMapOutputKeyClass(Text.class);
    init.setMapOutputValueClass(ObjectWritable.class);
    init.setOutputKeyClass(Text.class);
    init.setOutputValueClass(Route.class);
    FileOutputFormat.setOutputPath(init, tempRoute);
    init.setOutputFormat(SequenceFileOutputFormat.class);

    try {
        LOG.info("Loops: starting initializer");
        JobClient.runJob(init);
        LOG.info("Loops: installing initializer " + routes);
        FSUtils.replace(fs, routes, tempRoute, true);
        LOG.info("Loops: finished initializer");
    } catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
        throw e;
    }

    // run the loops job for a max depth (default 2), which will find a 3-link
    // loop cycle
    int depth = conf.getInt("link.loops.depth", 2);
    for (int i = 0; i < depth; i++) {

        JobConf looper = new NutchJob(conf);
        looper.setJobName("Looper: " + (i + 1) + " of " + depth);
        FileInputFormat.addInputPath(looper, outlinkDb);
        FileInputFormat.addInputPath(looper, routes);
        looper.setInputFormat(SequenceFileInputFormat.class);
        looper.setMapperClass(Looper.class);
        looper.setReducerClass(Looper.class);
        looper.setMapOutputKeyClass(Text.class);
        looper.setMapOutputValueClass(ObjectWritable.class);
        looper.setOutputKeyClass(Text.class);
        looper.setOutputValueClass(Route.class);
        FileOutputFormat.setOutputPath(looper, tempRoute);
        looper.setOutputFormat(SequenceFileOutputFormat.class);
        looper.setBoolean("last", i == (depth - 1));

        try {
            LOG.info("Loops: starting looper");
            JobClient.runJob(looper);
            LOG.info("Loops: installing looper " + routes);
            FSUtils.replace(fs, routes, tempRoute, true);
            LOG.info("Loops: finished looper");
        } catch (IOException e) {
            LOG.error(StringUtils.stringifyException(e));
            throw e;
        }
    }

    // run the finalizer
    JobConf finalizer = new NutchJob(conf);
    finalizer.setJobName("Finalizer: " + webGraphDb);
    FileInputFormat.addInputPath(finalizer, routes);
    finalizer.setInputFormat(SequenceFileInputFormat.class);
    finalizer.setMapperClass(Finalizer.class);
    finalizer.setReducerClass(Finalizer.class);
    finalizer.setMapOutputKeyClass(Text.class);
    finalizer.setMapOutputValueClass(Route.class);
    finalizer.setOutputKeyClass(Text.class);
    finalizer.setOutputValueClass(LoopSet.class);
    FileOutputFormat.setOutputPath(finalizer, new Path(webGraphDb, LOOPS_DIR));
    finalizer.setOutputFormat(MapFileOutputFormat.class);

    try {
        LOG.info("Loops: starting finalizer");
        JobClient.runJob(finalizer);
        LOG.info("Loops: finished finalizer");
    } catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
        throw e;
    }
    long end = System.currentTimeMillis();
    LOG.info("Loops: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}