Example usage for org.apache.hadoop.mapred JobConf setMapOutputKeyClass

List of usage examples for org.apache.hadoop.mapred JobConf setMapOutputKeyClass

Introduction

On this page you can find example usage of org.apache.hadoop.mapred.JobConf.setMapOutputKeyClass.

Prototype

public void setMapOutputKeyClass(Class<?> theClass) 

Document

Set the key class for the map output data.
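
In the old org.apache.hadoop.mapred API the map output key class defaults to the job's output key class, so setMapOutputKeyClass only needs to be called when the key or value types emitted by the mapper differ from the final reduce output types. Below is a minimal, self-contained sketch of that case; WordLengthJob and its mapper and reducer are illustrative stand-ins, not code taken from the examples on this page.

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;

public class WordLengthJob {

    /** Emits each word with its length as <Text, IntWritable>. */
    public static class LengthMapper extends MapReduceBase
            implements Mapper<LongWritable, Text, Text, IntWritable> {
        public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output,
                Reporter reporter) throws IOException {
            for (String word : value.toString().split("\\s+")) {
                if (word.length() > 0) {
                    output.collect(new Text(word), new IntWritable(word.length()));
                }
            }
        }
    }

    /** Sums the lengths per word and emits <Text, LongWritable>. */
    public static class SumReducer extends MapReduceBase
            implements Reducer<Text, IntWritable, Text, LongWritable> {
        public void reduce(Text key, Iterator<IntWritable> values,
                OutputCollector<Text, LongWritable> output, Reporter reporter) throws IOException {
            long sum = 0;
            while (values.hasNext()) {
                sum += values.next().get();
            }
            output.collect(key, new LongWritable(sum));
        }
    }

    public static void main(String[] args) throws IOException {
        JobConf job = new JobConf(WordLengthJob.class);
        job.setJobName("word-length");

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.setInputFormat(TextInputFormat.class);
        job.setOutputFormat(TextOutputFormat.class);
        job.setMapperClass(LengthMapper.class);
        job.setReducerClass(SumReducer.class);

        // The map output value type (IntWritable) differs from the final output
        // value type (LongWritable), so the intermediate types are declared here.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        JobClient.runJob(job);
    }
}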

Usage

From source file:org.apache.nutch.indexer.field.CustomFields.java

License:Apache License

/**
 * MapReduce job that converts text values into FieldWritable objects.
 *
 * @param inputs The directories with text files to convert.
 * @param output The converter output directory.
 * 
 * @throws IOException If an error occurs while converting.
 */
private void runConverter(Path[] inputs, Path output) throws IOException {

    JobConf converter = new NutchJob(getConf());
    converter.setJobName("CustomFields Converter");
    for (int i = 0; i < inputs.length; i++) {
        FileInputFormat.addInputPath(converter, inputs[i]);
    }
    FileOutputFormat.setOutputPath(converter, output);
    converter.setInputFormat(TextInputFormat.class);
    converter.setMapperClass(Converter.class);
    converter.setReducerClass(Converter.class);
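    // here the map output types match the final output types, so the next two
    // calls are not strictly required, but they make the mapper's contract explicit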
    converter.setMapOutputKeyClass(Text.class);
    converter.setMapOutputValueClass(FieldWritable.class);
    converter.setOutputKeyClass(Text.class);
    converter.setOutputValueClass(FieldWritable.class);
    converter.setOutputFormat(SequenceFileOutputFormat.class);

    LOG.info("Starting converter job");
    try {
        JobClient.runJob(converter);
    } catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
        throw e;
    }
    LOG.info("Finished converter job.");
}

From source file:org.apache.nutch.indexer.field.CustomFields.java

License:Apache License

/**
 * Aggregates multiple FieldWritable objects with the same name. Depending on
 * settings in the custom-fields.xml file, a field may map to one or more
 * fields. This job aggregates fields and then collects based on the
 * configuration settings.
 * 
 * @param basicFields The basicfields FieldWritable objects.
 * @param converted The converted custom field objects.
 * @param output The final output directory for custom field objects.
 * 
 * @throws IOException If an error occurs while converting.
 */
private void runCollector(Path basicFields, Path converted, Path output) throws IOException {

    JobConf collector = new NutchJob(getConf());
    collector.setJobName("CustomFields Collector");
    FileInputFormat.addInputPath(collector, converted);
    FileInputFormat.addInputPath(collector, basicFields);
    FileOutputFormat.setOutputPath(collector, output);
    collector.setInputFormat(SequenceFileInputFormat.class);
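    // the mapper emits ObjectWritable values while the reducer writes FieldWritable,
    // so the intermediate key/value classes are declared explicitly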
    collector.setMapOutputKeyClass(Text.class);
    collector.setMapOutputValueClass(ObjectWritable.class);
    collector.setMapperClass(Collector.class);
    collector.setReducerClass(Collector.class);
    collector.setOutputKeyClass(Text.class);
    collector.setOutputValueClass(FieldWritable.class);
    collector.setOutputFormat(SequenceFileOutputFormat.class);

    LOG.info("Starting collector job");
    try {
        JobClient.runJob(collector);
    } catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
        throw e;
    }
    LOG.info("Finished collector job.");
}

From source file:org.apache.nutch.indexer.field.FieldIndexer.java

License:Apache License

public void index(Path[] fields, Path indexDir) throws IOException {

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("FieldIndexer: starting at " + sdf.format(start));

    JobConf job = new NutchJob(getConf());
    job.setJobName("FieldIndexer: " + indexDir);

    for (int i = 0; i < fields.length; i++) {
        Path fieldsDb = fields[i];
        LOG.info("FieldIndexer: adding fields db: " + fieldsDb);
        FileInputFormat.addInputPath(job, fieldsDb);
    }

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(FieldIndexer.class);
    job.setReducerClass(FieldIndexer.class);
    FileOutputFormat.setOutputPath(job, indexDir);
    job.setOutputFormat(OutputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(FieldWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LuceneDocumentWrapper.class);

    JobClient.runJob(job);
    long end = System.currentTimeMillis();
    LOG.info("FieldIndexer: finished at " + sdf.format(end) + ", elapsed: "
            + TimingUtil.elapsedTime(start, end));
}

From source file:org.apache.nutch.indexer.solr.SolrClean.java

License:Apache License

public void delete(String crawldb, String solrUrl, boolean noCommit) throws IOException {
    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("SolrClean: starting at " + sdf.format(start));

    JobConf job = new NutchJob(getConf());

    FileInputFormat.addInputPath(job, new Path(crawldb, CrawlDb.CURRENT_NAME));
    job.setBoolean("noCommit", noCommit);
    job.set(SolrConstants.SERVER_URL, solrUrl);
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setOutputFormat(NullOutputFormat.class);
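    // NullOutputFormat discards the reduce output, so only the intermediate
    // <ByteWritable, Text> map output types need to be declared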
    job.setMapOutputKeyClass(ByteWritable.class);
    job.setMapOutputValueClass(Text.class);
    job.setMapperClass(DBFilter.class);
    job.setReducerClass(SolrDeleter.class);

    JobClient.runJob(job);

    long end = System.currentTimeMillis();
    LOG.info("SolrClean: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}

From source file:org.apache.nutch.scoring.webgraph.LinkDumper.java

License:Apache License

/**
 * Runs the inverter and merger jobs of the LinkDumper tool to create the 
 * url to inlink node database.
 */
public void dumpLinks(Path webGraphDb) throws IOException {

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("NodeDumper: starting at " + sdf.format(start));
    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    Path linkdump = new Path(webGraphDb, DUMP_DIR);
    Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR);
    Path loopSetDb = new Path(webGraphDb, Loops.LOOPS_DIR);
    boolean loopsExists = fs.exists(loopSetDb);
    Path outlinkDb = new Path(webGraphDb, WebGraph.OUTLINK_DIR);

    // run the inverter job
    Path tempInverted = new Path(webGraphDb,
            "inverted-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
    JobConf inverter = new NutchJob(conf);
    inverter.setJobName("LinkDumper: inverter");
    FileInputFormat.addInputPath(inverter, nodeDb);
    if (loopsExists) {
        FileInputFormat.addInputPath(inverter, loopSetDb);
    }
    FileInputFormat.addInputPath(inverter, outlinkDb);
    inverter.setInputFormat(SequenceFileInputFormat.class);
    inverter.setMapperClass(Inverter.class);
    inverter.setReducerClass(Inverter.class);
    inverter.setMapOutputKeyClass(Text.class);
    inverter.setMapOutputValueClass(ObjectWritable.class);
    inverter.setOutputKeyClass(Text.class);
    inverter.setOutputValueClass(LinkNode.class);
    FileOutputFormat.setOutputPath(inverter, tempInverted);
    inverter.setOutputFormat(SequenceFileOutputFormat.class);

    try {
        LOG.info("LinkDumper: running inverter");
        JobClient.runJob(inverter);
        LOG.info("LinkDumper: finished inverter");
    } catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
        throw e;
    }

    // run the merger job
    JobConf merger = new NutchJob(conf);
    merger.setJobName("LinkDumper: merger");
    FileInputFormat.addInputPath(merger, tempInverted);
    merger.setInputFormat(SequenceFileInputFormat.class);
    merger.setReducerClass(Merger.class);
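    // no mapper class is set, so the default identity mapper is used; its output
    // types (Text, LinkNode) still differ from the reduce output (Text, LinkNodes)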
    merger.setMapOutputKeyClass(Text.class);
    merger.setMapOutputValueClass(LinkNode.class);
    merger.setOutputKeyClass(Text.class);
    merger.setOutputValueClass(LinkNodes.class);
    FileOutputFormat.setOutputPath(merger, linkdump);
    merger.setOutputFormat(MapFileOutputFormat.class);

    try {
        LOG.info("LinkDumper: running merger");
        JobClient.runJob(merger);
        LOG.info("LinkDumper: finished merger");
    } catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
        throw e;
    }

    fs.delete(tempInverted, true);
    long end = System.currentTimeMillis();
    LOG.info("LinkDumper: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}

From source file:org.apache.nutch.scoring.webgraph.LinkRank.java

License:Apache License

/**
 * Runs the counter job. The counter job determines the number of links in the
 * webgraph. This is used during analysis.
 *
 * @param fs The job file system.
 * @param webGraphDb The web graph database to use.
 * 
 * @return The number of nodes in the web graph.
 * @throws IOException If an error occurs while running the counter job.
 */
private int runCounter(FileSystem fs, Path webGraphDb) throws IOException {

    // configure the counter job
    Path numLinksPath = new Path(webGraphDb, NUM_NODES);
    Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR);
    JobConf counter = new NutchJob(getConf());
    counter.setJobName("LinkRank Counter");
    FileInputFormat.addInputPath(counter, nodeDb);
    FileOutputFormat.setOutputPath(counter, numLinksPath);
    counter.setInputFormat(SequenceFileInputFormat.class);
    counter.setMapperClass(Counter.class);
    counter.setCombinerClass(Counter.class);
    counter.setReducerClass(Counter.class);
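    // the combiner runs on the map output, so its input and output must both be
    // <Text, LongWritable>, matching the map output types declared below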
    counter.setMapOutputKeyClass(Text.class);
    counter.setMapOutputValueClass(LongWritable.class);
    counter.setOutputKeyClass(Text.class);
    counter.setOutputValueClass(LongWritable.class);
    counter.setNumReduceTasks(1);
    counter.setOutputFormat(TextOutputFormat.class);
    counter.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

    // run the counter job, outputs to a single reduce task and file
    LOG.info("Starting link counter job");
    try {
        JobClient.runJob(counter);
    } catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
        throw e;
    }
    LOG.info("Finished link counter job");

    // read the first (and only) line from the file which should be the
    // number of links in the web graph
    LOG.info("Reading numlinks temp file");
    FSDataInputStream readLinks = fs.open(new Path(numLinksPath, "part-00000"));
    BufferedReader buffer = new BufferedReader(new InputStreamReader(readLinks));
    String numLinksLine = buffer.readLine();
    readLinks.close();

    // check if there are links to process, if none, webgraph might be empty
    if (numLinksLine == null || numLinksLine.length() == 0) {
        fs.delete(numLinksPath, true);
        throw new IOException("No links to process, is the webgraph empty?");
    }

    // delete temp file and convert and return the number of links as an int
    LOG.info("Deleting numlinks temp file");
    fs.delete(numLinksPath, true);
    String numLinks = numLinksLine.split("\\s+")[1];
    return Integer.parseInt(numLinks);
}

From source file:org.apache.nutch.scoring.webgraph.LinkRank.java

License:Apache License

/**
 * Runs the initializer job. The initializer job sets up the nodes with a
 * default starting score for link analysis.
 *
 * @param nodeDb The node database to use.
 * @param output The job output directory.
 * 
 * @throws IOException If an error occurs while running the initializer job.
 */
private void runInitializer(Path nodeDb, Path output) throws IOException {

    // configure the initializer
    JobConf initializer = new NutchJob(getConf());
    initializer.setJobName("LinkAnalysis Initializer");
    FileInputFormat.addInputPath(initializer, nodeDb);
    FileOutputFormat.setOutputPath(initializer, output);
    initializer.setInputFormat(SequenceFileInputFormat.class);
    initializer.setMapperClass(Initializer.class);
    initializer.setMapOutputKeyClass(Text.class);
    initializer.setMapOutputValueClass(Node.class);
    initializer.setOutputKeyClass(Text.class);
    initializer.setOutputValueClass(Node.class);
    initializer.setOutputFormat(MapFileOutputFormat.class);
    initializer.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

    // run the initializer
    LOG.info("Starting initialization job");
    try {
        JobClient.runJob(initializer);
    } catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
        throw e;
    }
    LOG.info("Finished initialization job.");
}

From source file:org.apache.nutch.scoring.webgraph.LinkRank.java

License:Apache License

/**
 * Runs the inverter job. The inverter job flips outlinks to inlinks to be
 * passed into the analysis job.
 * 
 * The inverter job takes a link loops database if it exists. It is an
 * optional component of link analysis due to its extreme computational and
 * space requirements, but it can be very useful in weeding out and eliminating
 * link farms and other spam pages.
 * 
 * @param nodeDb The node database to use.
 * @param outlinkDb The outlink database to use.
 * @param loopDb The loop database to use if it exists.
 * @param output The output directory.
 * 
 * @throws IOException If an error occurs while running the inverter job.
 */
private void runInverter(Path nodeDb, Path outlinkDb, Path loopDb, Path output) throws IOException {

    // configure the inverter
    JobConf inverter = new NutchJob(getConf());
    inverter.setJobName("LinkAnalysis Inverter");
    FileInputFormat.addInputPath(inverter, nodeDb);
    FileInputFormat.addInputPath(inverter, outlinkDb);

    // add the loop database if it exists (i.e., is not null)
    if (loopDb != null) {
        FileInputFormat.addInputPath(inverter, loopDb);
    }
    FileOutputFormat.setOutputPath(inverter, output);
    inverter.setInputFormat(SequenceFileInputFormat.class);
    inverter.setMapperClass(Inverter.class);
    inverter.setReducerClass(Inverter.class);
    inverter.setMapOutputKeyClass(Text.class);
    inverter.setMapOutputValueClass(ObjectWritable.class);
    inverter.setOutputKeyClass(Text.class);
    inverter.setOutputValueClass(LinkDatum.class);
    inverter.setOutputFormat(SequenceFileOutputFormat.class);
    inverter.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

    // run the inverter job
    LOG.info("Starting inverter job");
    try {
        JobClient.runJob(inverter);
    } catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
        throw e;
    }
    LOG.info("Finished inverter job.");
}

From source file:org.apache.nutch.scoring.webgraph.LinkRank.java

License:Apache License

/**
 * Runs the link analysis job. The link analysis job applies the link rank
 * formula to create a score per url and stores that score in the NodeDb.
 *
 * Typically the link analysis job is run a number of times to allow the link
 * rank scores to converge.
 * 
 * @param nodeDb The node database from which we are getting previous link
 * rank scores.
 * @param inverted The inverted inlinks
 * @param output The link analysis output.
 * @param iteration The current iteration number.
 * @param numIterations The total number of link analysis iterations
 * 
 * @throws IOException If an error occurs during link analysis.
 */
private void runAnalysis(Path nodeDb, Path inverted, Path output, int iteration, int numIterations,
        float rankOne) throws IOException {

    JobConf analyzer = new NutchJob(getConf());
    analyzer.set("link.analyze.iteration", String.valueOf(iteration + 1));
    analyzer.setJobName("LinkAnalysis Analyzer, iteration " + (iteration + 1) + " of " + numIterations);
    FileInputFormat.addInputPath(analyzer, nodeDb);
    FileInputFormat.addInputPath(analyzer, inverted);
    FileOutputFormat.setOutputPath(analyzer, output);
    analyzer.set("link.analyze.rank.one", String.valueOf(rankOne));
    analyzer.setMapOutputKeyClass(Text.class);
    analyzer.setMapOutputValueClass(ObjectWritable.class);
    analyzer.setInputFormat(SequenceFileInputFormat.class);
    analyzer.setMapperClass(Analyzer.class);
    analyzer.setReducerClass(Analyzer.class);
    analyzer.setOutputKeyClass(Text.class);
    analyzer.setOutputValueClass(Node.class);
    analyzer.setOutputFormat(MapFileOutputFormat.class);
    analyzer.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

    LOG.info("Starting analysis job");
    try {
        JobClient.runJob(analyzer);
    } catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
        throw e;
    }
    LOG.info("Finished analysis job.");
}

From source file:org.apache.nutch.scoring.webgraph.Loops.java

License:Apache License

/**
 * Runs the various loop jobs.
 */
public void findLoops(Path webGraphDb) throws IOException {

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    if (LOG.isInfoEnabled()) {
        LOG.info("Loops: starting at " + sdf.format(start));
        LOG.info("Loops: webgraphdb: " + webGraphDb);
    }

    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);
    Path outlinkDb = new Path(webGraphDb, WebGraph.OUTLINK_DIR);
    Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR);
    Path routes = new Path(webGraphDb, ROUTES_DIR);
    Path tempRoute = new Path(webGraphDb,
            ROUTES_DIR + "-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    // run the initializer
    JobConf init = new NutchJob(conf);
    init.setJobName("Initializer: " + webGraphDb);
    FileInputFormat.addInputPath(init, outlinkDb);
    FileInputFormat.addInputPath(init, nodeDb);
    init.setInputFormat(SequenceFileInputFormat.class);
    init.setMapperClass(Initializer.class);
    init.setReducerClass(Initializer.class);
    init.setMapOutputKeyClass(Text.class);
    init.setMapOutputValueClass(ObjectWritable.class);
    init.setOutputKeyClass(Text.class);
    init.setOutputValueClass(Route.class);
    FileOutputFormat.setOutputPath(init, tempRoute);
    init.setOutputFormat(SequenceFileOutputFormat.class);

    try {
        LOG.info("Loops: starting initializer");
        JobClient.runJob(init);
        LOG.info("Loops: installing initializer " + routes);
        FSUtils.replace(fs, routes, tempRoute, true);
        LOG.info("Loops: finished initializer");
    } catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
        throw e;
    }

    // run the loops job for a max depth (default 2), which will find a 3-link
    // loop cycle
    int depth = conf.getInt("link.loops.depth", 2);
    for (int i = 0; i < depth; i++) {

        JobConf looper = new NutchJob(conf);
        looper.setJobName("Looper: " + (i + 1) + " of " + depth);
        FileInputFormat.addInputPath(looper, outlinkDb);
        FileInputFormat.addInputPath(looper, routes);
        looper.setInputFormat(SequenceFileInputFormat.class);
        looper.setMapperClass(Looper.class);
        looper.setReducerClass(Looper.class);
        looper.setMapOutputKeyClass(Text.class);
        looper.setMapOutputValueClass(ObjectWritable.class);
        looper.setOutputKeyClass(Text.class);
        looper.setOutputValueClass(Route.class);
        FileOutputFormat.setOutputPath(looper, tempRoute);
        looper.setOutputFormat(SequenceFileOutputFormat.class);
        looper.setBoolean("last", i == (depth - 1));

        try {
            LOG.info("Loops: starting looper");
            JobClient.runJob(looper);
            LOG.info("Loops: installing looper " + routes);
            FSUtils.replace(fs, routes, tempRoute, true);
            LOG.info("Loops: finished looper");
        } catch (IOException e) {
            LOG.error(StringUtils.stringifyException(e));
            throw e;
        }
    }

    // run the finalizer
    JobConf finalizer = new NutchJob(conf);
    finalizer.setJobName("Finalizer: " + webGraphDb);
    FileInputFormat.addInputPath(finalizer, routes);
    finalizer.setInputFormat(SequenceFileInputFormat.class);
    finalizer.setMapperClass(Finalizer.class);
    finalizer.setReducerClass(Finalizer.class);
    finalizer.setMapOutputKeyClass(Text.class);
    finalizer.setMapOutputValueClass(Route.class);
    finalizer.setOutputKeyClass(Text.class);
    finalizer.setOutputValueClass(LoopSet.class);
    FileOutputFormat.setOutputPath(finalizer, new Path(webGraphDb, LOOPS_DIR));
    finalizer.setOutputFormat(MapFileOutputFormat.class);

    try {
        LOG.info("Loops: starting finalizer");
        JobClient.runJob(finalizer);
        LOG.info("Loops: finished finalizer");
    } catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
        throw e;
    }
    long end = System.currentTimeMillis();
    LOG.info("Loops: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}