Example usage for org.apache.hadoop.mapred.JobConf#setCombinerClass

List of usage examples for org.apache.hadoop.mapred.JobConf#setCombinerClass

Introduction

On this page you can find example usages of the setCombinerClass method of org.apache.hadoop.mapred.JobConf.

Prototype

public void setCombinerClass(Class<? extends Reducer> theClass) 

Source Link

Document

Set the user-defined combiner class used to combine map-outputs before being sent to the reducers.

Usage

From source file:org.apache.mahout.classifier.bayes.mapreduce.cbayes.CBayesThetaNormalizerDriver.java

License:Apache License

/**
 * Configures and launches the complementary Bayes theta-normalizer job.
 *
 * <p>The job reads {@code trainer-weights/Sigma_j} and {@code trainer-tfIdf/trainer-tfIdf}
 * below {@code output} and writes to {@code trainer-thetaNormalizer}. Before submission the
 * per-label sums, the Sigma_kSigma_j value and the vocabulary count are read from earlier
 * trainer outputs, stringified, and stored in the job configuration.
 *
 * @param input  only used as part of the job name
 * @param output base directory that holds the trainer outputs
 * @param params stored in the configuration under {@code bayes.parameters}
 * @throws IOException propagated from the file-system reads and from the job run
 */
@Override
public void runJob(Path input, Path output, BayesParameters params) throws IOException {
    Configurable jobClient = new JobClient();
    JobConf job = new JobConf(CBayesThetaNormalizerDriver.class);
    job.setJobName("Complementary Bayes Theta Normalizer Driver running over input: " + input);

    // Types of the records the job emits.
    job.setOutputKeyClass(StringTuple.class);
    job.setOutputValueClass(DoubleWritable.class);

    // Two input directories and one output directory, all relative to the trainer base.
    FileInputFormat.addInputPath(job, new Path(output, "trainer-weights/Sigma_j"));
    FileInputFormat.addInputPath(job, new Path(output, "trainer-tfIdf/trainer-tfIdf"));
    Path normalizerDir = new Path(output, "trainer-thetaNormalizer");
    FileOutputFormat.setOutputPath(job, normalizerDir);

    // The reducer class is also registered as the combiner.
    job.setMapperClass(CBayesThetaNormalizerMapper.class);
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setCombinerClass(CBayesThetaNormalizerReducer.class);
    job.setReducerClass(CBayesThetaNormalizerReducer.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    // Set before the stringifiers below are created; presumably they depend on
    // JavaSerialization for the Map/Double values -- confirm before reordering.
    job.set("io.serializations",
            "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization");

    FileSystem fs = FileSystem.get(normalizerDir.toUri(), job);
    HadoopUtil.overwriteOutput(normalizerDir);

    // Per-label sums (Sigma_k): read, stringify, log the round-tripped value, publish.
    Path sigmaKGlob = new Path(output, "trainer-weights/Sigma_k/*");
    Map<String, Double> sigmaKByLabel = SequenceFileModelReader.readLabelSums(fs, sigmaKGlob, job);
    DefaultStringifier<Map<String, Double>> mapCodec = new DefaultStringifier<Map<String, Double>>(job,
            GenericsUtil.getClass(sigmaKByLabel));
    String sigmaKEncoded = mapCodec.toString(sigmaKByLabel);

    log.info("Sigma_k for Each Label");
    Map<String, Double> sigmaKDecoded = mapCodec.fromString(sigmaKEncoded);
    log.info("{}", sigmaKDecoded);
    job.set("cnaivebayes.sigma_k", sigmaKEncoded);

    // Sigma_kSigma_j value: same read / stringify / log / publish sequence.
    Path sigmaTotalGlob = new Path(output, "trainer-weights/Sigma_kSigma_j/*");
    double sigmaTotal = SequenceFileModelReader.readSigmaJSigmaK(fs, sigmaTotalGlob, job);
    DefaultStringifier<Double> doubleCodec = new DefaultStringifier<Double>(job, Double.class);
    String sigmaTotalEncoded = doubleCodec.toString(sigmaTotal);

    log.info("Sigma_kSigma_j for each Label and for each Features");
    double sigmaTotalDecoded = doubleCodec.fromString(sigmaTotalEncoded);
    log.info("{}", sigmaTotalDecoded);
    job.set("cnaivebayes.sigma_jSigma_k", sigmaTotalEncoded);

    // Vocabulary count: published first, then decoded again for logging.
    Path vocabGlob = new Path(output, "trainer-tfIdf/trainer-vocabCount/*");
    double vocabSize = SequenceFileModelReader.readVocabCount(fs, vocabGlob, job);
    String vocabSizeEncoded = doubleCodec.toString(vocabSize);

    log.info("Vocabulary Count");
    job.set("cnaivebayes.vocabCount", vocabSizeEncoded);
    double vocabSizeDecoded = doubleCodec.fromString(vocabSizeEncoded);
    log.info("{}", vocabSizeDecoded);

    job.set("bayes.parameters", params.toString());
    job.set("output.table", output.toString());
    jobClient.setConf(job);

    JobClient.runJob(job);
}

From source file:org.apache.mahout.classifier.bayes.mapreduce.common.BayesFeatureDriver.java

License:Apache License

/**
 * Configures and launches the Bayes feature-extraction job.
 *
 * <p>Reads {@code input} with {@code KeyValueTextInputFormat}, writes to {@code output}
 * (overwritten via {@code HadoopUtil.overwriteOutput}) and uses a dedicated combiner,
 * partitioner and key comparator.
 *
 * @param input  job input path, also included in the job name
 * @param output job output path
 * @param params stored in the configuration under {@code bayes.parameters}
 * @throws IOException propagated from output preparation and from the job run
 */
@Override
public void runJob(Path input, Path output, BayesParameters params) throws IOException {
    Configurable jobClient = new JobClient();
    JobConf job = new JobConf(BayesFeatureDriver.class);
    job.setJobName("Bayes Feature Driver running over input: " + input);

    // Output record types plus custom partitioning and key ordering.
    job.setOutputKeyClass(StringTuple.class);
    job.setOutputValueClass(DoubleWritable.class);
    job.setPartitionerClass(FeaturePartitioner.class);
    job.setOutputKeyComparatorClass(FeatureLabelComparator.class);

    FileInputFormat.setInputPaths(job, input);
    FileOutputFormat.setOutputPath(job, output);

    // Processing pipeline: text input -> mapper -> combiner -> reducer -> feature output.
    job.setInputFormat(KeyValueTextInputFormat.class);
    job.setMapperClass(BayesFeatureMapper.class);
    job.setCombinerClass(BayesFeatureCombiner.class);
    job.setReducerClass(BayesFeatureReducer.class);
    job.setOutputFormat(BayesFeatureOutputFormat.class);

    // This parameter needs to be set to enable serialisation of conf values.
    job.set("io.serializations",
            "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization");

    HadoopUtil.overwriteOutput(output);
    job.set("bayes.parameters", params.toString());

    jobClient.setConf(job);
    JobClient.runJob(job);
}

From source file:org.apache.mahout.classifier.bayes.mapreduce.common.BayesTfIdfDriver.java

License:Apache License

/**
 * Configures and launches the TF-IDF job of the Bayes trainer.
 *
 * <p>The job reads {@code trainer-termDocCount}, {@code trainer-wordFreq} and
 * {@code trainer-featureCount} below {@code output} and writes to {@code trainer-tfIdf}.
 * The per-label document counts are read from {@code trainer-docCount/part-*}, stringified
 * and stored in the configuration. When the {@code dataSource} parameter equals
 * {@code "hbase"}, the HBase table named after {@code output} is (re)created first.
 *
 * @param input  only used as part of the job name
 * @param output base directory of the trainer outputs; also the HBase table name
 * @param params trainer parameters, stored in the configuration as {@code bayes.parameters}
 * @throws IOException propagated from the file-system, HBase and job-submission calls
 */
@Override
public void runJob(Path input, Path output, BayesParameters params) throws IOException {

    Configurable client = new JobClient();
    // Use this driver's own class (the original passed BayesWeightSummerDriver, a copy-paste
    // slip that the setJarByClass call below already overrode).
    JobConf conf = new JobConf(BayesTfIdfDriver.class);
    conf.setJobName("TfIdf Driver running over input: " + input);

    conf.setOutputKeyClass(StringTuple.class);
    conf.setOutputValueClass(DoubleWritable.class);

    FileInputFormat.addInputPath(conf, new Path(output, "trainer-termDocCount"));
    FileInputFormat.addInputPath(conf, new Path(output, "trainer-wordFreq"));
    FileInputFormat.addInputPath(conf, new Path(output, "trainer-featureCount"));
    Path outPath = new Path(output, "trainer-tfIdf");
    FileOutputFormat.setOutputPath(conf, outPath);

    conf.setJarByClass(BayesTfIdfDriver.class);

    conf.setMapperClass(BayesTfIdfMapper.class);
    conf.setInputFormat(SequenceFileInputFormat.class);
    // The reducer class is also registered as the combiner.
    conf.setCombinerClass(BayesTfIdfReducer.class);
    conf.setReducerClass(BayesTfIdfReducer.class);
    conf.setOutputFormat(BayesTfIdfOutputFormat.class);

    // Set before the stringifier below is created; presumably it depends on
    // JavaSerialization for the Map value -- confirm before reordering.
    conf.set("io.serializations",
            "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization");

    FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
    HadoopUtil.overwriteOutput(outPath);

    Path interimFile = new Path(output, "trainer-docCount/part-*");

    Map<String, Double> labelDocumentCounts = SequenceFileModelReader.readLabelDocumentCounts(dfs, interimFile,
            conf);

    DefaultStringifier<Map<String, Double>> mapStringifier = new DefaultStringifier<Map<String, Double>>(conf,
            GenericsUtil.getClass(labelDocumentCounts));

    String labelDocumentCountString = mapStringifier.toString(labelDocumentCounts);
    log.info("Counts of documents in Each Label");
    // Decode again purely so the logged value is what a task would see.
    Map<String, Double> c = mapStringifier.fromString(labelDocumentCountString);
    log.info("{}", c);

    conf.set("cnaivebayes.labelDocumentCounts", labelDocumentCountString);
    log.info(params.print());

    // Constant-first comparison: a missing "dataSource" parameter means "not hbase"
    // instead of a NullPointerException.
    if ("hbase".equals(params.get("dataSource"))) {
        String tableName = output.toString();
        HBaseConfiguration hc = new HBaseConfiguration(new Configuration());
        HTableDescriptor ht = new HTableDescriptor(tableName);
        HColumnDescriptor hcd = new HColumnDescriptor(BayesConstants.HBASE_COLUMN_FAMILY + ':');
        hcd.setBloomfilter(true);
        hcd.setInMemory(true);
        hcd.setMaxVersions(1);
        hcd.setBlockCacheEnabled(true);
        ht.addFamily(hcd);

        log.info("Connecting to hbase...");
        HBaseAdmin hba = new HBaseAdmin(hc);
        log.info("Creating Table {}", output);

        // Drop any previous table of the same name so the job starts from an empty table.
        if (hba.tableExists(tableName)) {
            hba.disableTable(tableName);
            hba.deleteTable(tableName);
            hba.majorCompact(".META.");
        }
        hba.createTable(ht);
        conf.set("output.table", tableName);
    }
    conf.set("bayes.parameters", params.toString());

    client.setConf(conf);

    JobClient.runJob(conf);
}

From source file:org.apache.mahout.classifier.bayes.mapreduce.common.BayesWeightSummerDriver.java

License:Apache License

/**
 * Configures and launches the Bayes weight-summer job.
 *
 * <p>Reads {@code trainer-tfIdf/trainer-tfIdf} below {@code output} and writes to
 * {@code trainer-weights}, which is overwritten before the job starts.
 *
 * @param input  only used as part of the job name
 * @param output base directory of the trainer outputs; also stored as {@code output.table}
 * @param params stored in the configuration under {@code bayes.parameters}
 * @throws IOException propagated from output preparation and from the job run
 */
@Override
public void runJob(Path input, Path output, BayesParameters params) throws IOException {
    Configurable jobClient = new JobClient();
    JobConf job = new JobConf(BayesWeightSummerDriver.class);
    job.setJobName("Bayes Weight Summer Driver running over input: " + input);

    job.setOutputKeyClass(StringTuple.class);
    job.setOutputValueClass(DoubleWritable.class);

    // Input is the TF-IDF output; results go to a fresh trainer-weights directory.
    Path weightsDir = new Path(output, "trainer-weights");
    FileInputFormat.addInputPath(job, new Path(output, "trainer-tfIdf/trainer-tfIdf"));
    FileOutputFormat.setOutputPath(job, weightsDir);
    HadoopUtil.overwriteOutput(weightsDir);

    // Sequence-file input; the reducer class is also registered as the combiner.
    job.setMapperClass(BayesWeightSummerMapper.class);
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setCombinerClass(BayesWeightSummerReducer.class);
    job.setReducerClass(BayesWeightSummerReducer.class);
    job.setOutputFormat(BayesWeightSummerOutputFormat.class);

    job.set("bayes.parameters", params.toString());
    job.set("output.table", output.toString());

    jobClient.setConf(job);

    JobClient.runJob(job);
}

From source file:org.apache.mahout.classifier.cbayes.CBayesNormalizedWeightDriver.java

License:Apache License

/**
 * Run the complementary Bayes normalized-weight job.
 *
 * <p>The job reads {@code trainer-theta} below {@code output} and writes to
 * {@code trainer-weight}. The per-label theta normalizers are read from
 * {@code trainer-thetaNormalizer/part*}, each divided by the smallest absolute value among
 * them, stringified, and stored in the configuration as
 * {@code cnaivebayes.thetaNormalizations}.
 *
 * @param input  the input pathname String (not referenced by this method)
 * @param output the output pathname String; all job paths are resolved below it
 * @throws IOException propagated from the file-system calls and from the job run
 */
public static void runJob(String input, String output) throws IOException {
    JobClient client = new JobClient();
    JobConf conf = new JobConf(CBayesNormalizedWeightDriver.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(DoubleWritable.class);
    FileInputFormat.addInputPath(conf, new Path(output + "/trainer-theta"));
    Path outPath = new Path(output + "/trainer-weight");
    FileOutputFormat.setOutputPath(conf, outPath);
    conf.setNumMapTasks(100);
    conf.setMapperClass(CBayesNormalizedWeightMapper.class);
    conf.setInputFormat(SequenceFileInputFormat.class);
    // The reducer class is also registered as the combiner.
    conf.setCombinerClass(CBayesNormalizedWeightReducer.class);
    conf.setReducerClass(CBayesNormalizedWeightReducer.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

    // Set before the stringifier below is created; presumably it depends on
    // JavaSerialization for the Map value -- confirm before reordering.
    conf.set("io.serializations",
            "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization");

    // Remove any previous output so the job can write to outPath.
    FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
    if (dfs.exists(outPath)) {
        dfs.delete(outPath, true);
    }

    Path thetaNormalizationsFiles = new Path(output + "/trainer-thetaNormalizer/part*");
    Map<String, Double> thetaNormalizer = SequenceFileModelReader.readLabelSums(dfs, thetaNormalizationsFiles,
            conf);

    // Smallest absolute per-label sum; stays at Double.MAX_VALUE for an empty map.
    double perLabelWeightSumNormalisationFactor = Double.MAX_VALUE;
    for (Double labelSum : thetaNormalizer.values()) {
        double magnitude = Math.abs(labelSum);
        if (perLabelWeightSumNormalisationFactor > magnitude) {
            perLabelWeightSumNormalisationFactor = magnitude;
        }
    }

    // Rescale in place through the entry itself rather than calling put() on the map
    // that is being iterated.
    // NOTE(review): a label sum of exactly 0 makes the divisor 0 and yields
    // Infinity/NaN values here -- confirm whether that can occur upstream.
    for (Map.Entry<String, Double> entry : thetaNormalizer.entrySet()) {
        entry.setValue(entry.getValue() / perLabelWeightSumNormalisationFactor);
    }

    DefaultStringifier<Map<String, Double>> mapStringifier = new DefaultStringifier<Map<String, Double>>(conf,
            GenericsUtil.getClass(thetaNormalizer));
    String thetaNormalizationsString = mapStringifier.toString(thetaNormalizer);

    // Decode again purely so the logged value is what a task would see.
    Map<String, Double> c = mapStringifier.fromString(thetaNormalizationsString);
    log.info("{}", c);
    conf.set("cnaivebayes.thetaNormalizations", thetaNormalizationsString);

    client.setConf(conf);

    JobClient.runJob(conf);
}

From source file:org.apache.mahout.classifier.cbayes.CBayesThetaNormalizerDriver.java

License:Apache License

/**
 * Run the complementary Bayes theta-normalizer job.
 *
 * <p>Reads {@code trainer-weights/Sigma_j} and {@code trainer-tfIdf/trainer-tfIdf} below
 * {@code output} and writes to {@code trainer-thetaNormalizer}. The per-label sums, the
 * Sigma_kSigma_j value and the vocabulary count are read from earlier outputs, stringified
 * and stored in the job configuration before the job is submitted.
 *
 * @param input  the input pathname String (not referenced by this method)
 * @param output the output pathname String; all job paths are resolved below it
 * @throws IOException propagated from the file-system calls and from the job run
 */
public static void runJob(String input, String output) throws IOException {
    JobClient jobClient = new JobClient();
    JobConf job = new JobConf(CBayesThetaNormalizerDriver.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(DoubleWritable.class);

    FileInputFormat.addInputPath(job, new Path(output + "/trainer-weights/Sigma_j"));
    FileInputFormat.addInputPath(job, new Path(output + "/trainer-tfIdf/trainer-tfIdf"));
    Path normalizerDir = new Path(output + "/trainer-thetaNormalizer");
    FileOutputFormat.setOutputPath(job, normalizerDir);

    job.setNumMapTasks(100);
    job.setMapperClass(CBayesThetaNormalizerMapper.class);
    job.setInputFormat(SequenceFileInputFormat.class);
    // The reducer class is also registered as the combiner.
    job.setCombinerClass(CBayesThetaNormalizerReducer.class);
    job.setReducerClass(CBayesThetaNormalizerReducer.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    // Set before the stringifiers below are created; presumably they depend on
    // JavaSerialization for the Map/Double values -- confirm before reordering.
    job.set("io.serializations",
            "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization");

    // Remove any previous output so the job can write to normalizerDir.
    FileSystem fs = FileSystem.get(normalizerDir.toUri(), job);
    if (fs.exists(normalizerDir)) {
        fs.delete(normalizerDir, true);
    }

    // Per-label sums (Sigma_k): read, stringify, log the round-tripped value, publish.
    Path sigmaKGlob = new Path(output + "/trainer-weights/Sigma_k/*");
    Map<String, Double> sigmaKByLabel = SequenceFileModelReader.readLabelSums(fs, sigmaKGlob, job);
    DefaultStringifier<Map<String, Double>> mapCodec = new DefaultStringifier<Map<String, Double>>(job,
            GenericsUtil.getClass(sigmaKByLabel));
    String sigmaKEncoded = mapCodec.toString(sigmaKByLabel);

    log.info("Sigma_k for Each Label");
    Map<String, Double> sigmaKDecoded = mapCodec.fromString(sigmaKEncoded);
    log.info("{}", sigmaKDecoded);
    job.set("cnaivebayes.sigma_k", sigmaKEncoded);

    // Sigma_kSigma_j value: same read / stringify / log / publish sequence.
    Path sigmaTotalGlob = new Path(output + "/trainer-weights/Sigma_kSigma_j/*");
    double sigmaTotal = SequenceFileModelReader.readSigma_jSigma_k(fs, sigmaTotalGlob, job);
    DefaultStringifier<Double> doubleCodec = new DefaultStringifier<Double>(job, Double.class);
    String sigmaTotalEncoded = doubleCodec.toString(sigmaTotal);

    log.info("Sigma_kSigma_j for each Label and for each Features");
    double sigmaTotalDecoded = doubleCodec.fromString(sigmaTotalEncoded);
    log.info("{}", sigmaTotalDecoded);
    job.set("cnaivebayes.sigma_jSigma_k", sigmaTotalEncoded);

    // Vocabulary count: published first, then decoded again for logging.
    Path vocabGlob = new Path(output + "/trainer-tfIdf/trainer-vocabCount/*");
    double vocabSize = SequenceFileModelReader.readVocabCount(fs, vocabGlob, job);
    String vocabSizeEncoded = doubleCodec.toString(vocabSize);

    log.info("Vocabulary Count");
    job.set("cnaivebayes.vocabCount", vocabSizeEncoded);
    double vocabSizeDecoded = doubleCodec.fromString(vocabSizeEncoded);
    log.info("{}", vocabSizeDecoded);

    jobClient.setConf(job);

    JobClient.runJob(job);
}

From source file:org.apache.mahout.math.hadoop.MatrixMultiplicationJob.java

License:Apache License

/**
 * Builds the job configuration for a matrix-multiplication job.
 *
 * <p>The input is an inner join (via {@code CompositeInputFormat}) of the sequence files at
 * {@code aPath} and {@code bPath}; the output is written as sequence files to {@code outPath}.
 *
 * @param initialConf    configuration the new {@code JobConf} is derived from
 * @param aPath          first input of the join
 * @param bPath          second input of the join
 * @param outPath        job output path
 * @param outCardinality value stored under the {@code OUT_CARD} key
 * @return the populated job configuration
 */
public static Configuration createMatrixMultiplyJobConf(Configuration initialConf, Path aPath, Path bPath,
        Path outPath, int outCardinality) {
    JobConf job = new JobConf(initialConf, MatrixMultiplicationJob.class);

    // Input side: inner join of the two sequence-file inputs.
    String joinExpression = CompositeInputFormat.compose("inner", SequenceFileInputFormat.class, aPath, bPath);
    job.setInputFormat(CompositeInputFormat.class);
    job.set("mapred.join.expr", joinExpression);
    job.setInt(OUT_CARD, outCardinality);

    // Output side.
    job.setOutputFormat(SequenceFileOutputFormat.class);
    FileOutputFormat.setOutputPath(job, outPath);

    // The reducer class is also registered as the combiner.
    job.setMapperClass(MatrixMultiplyMapper.class);
    job.setCombinerClass(MatrixMultiplicationReducer.class);
    job.setReducerClass(MatrixMultiplicationReducer.class);

    // Map output and final output share the same key/value types.
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(VectorWritable.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);

    return job;
}

From source file:org.apache.nutch.crawl.CrawlDbReader.java

License:Apache License

/**
 * Runs the CrawlDb statistics job and logs the aggregated result.
 *
 * <p>The job writes its output to a temporary {@code stat_tmp<millis>} folder below
 * {@code crawlDb}. The partial results of all output files are then merged: the
 * {@code "scx"} key keeps the maximum, {@code "scn"} keeps the minimum, and every other key
 * is summed. The temporary folder is deleted afterwards.
 *
 * @param crawlDb path of the crawl database
 * @param config  configuration used for the job and for file-system access
 * @param sort    stored in the job configuration as {@code db.reader.stats.sort}
 * @throws IOException propagated from the job run and from the file-system calls
 */
public void processStatJob(String crawlDb, Configuration config, boolean sort) throws IOException {

    if (LOG.isInfoEnabled()) {
        LOG.info("CrawlDb statistics start: " + crawlDb);
    }

    Path tmpFolder = new Path(crawlDb, "stat_tmp" + System.currentTimeMillis());

    JobConf job = new NutchJob(config);
    job.setJobName("stats " + crawlDb);
    job.setBoolean("db.reader.stats.sort", sort);

    FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));
    job.setInputFormat(SequenceFileInputFormat.class);

    job.setMapperClass(CrawlDbStatMapper.class);
    job.setCombinerClass(CrawlDbStatCombiner.class);
    job.setReducerClass(CrawlDbStatReducer.class);

    FileOutputFormat.setOutputPath(job, tmpFolder);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    // https://issues.apache.org/jira/browse/NUTCH-1029
    job.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

    JobClient.runJob(job);

    // reading the result
    FileSystem fileSystem = FileSystem.get(config);
    SequenceFile.Reader[] readers = SequenceFileOutputFormat.getReaders(config, tmpFolder);

    Text key = new Text();
    LongWritable value = new LongWritable();

    TreeMap<String, LongWritable> stats = new TreeMap<String, LongWritable>();
    for (int i = 0; i < readers.length; i++) {
        SequenceFile.Reader reader = readers[i];
        try {
            while (reader.next(key, value)) {
                String k = key.toString();
                LongWritable val = stats.get(k);
                if (val == null) {
                    // First sighting of this key: seed max/min trackers with the
                    // opposite extreme so the first comparison below always wins.
                    val = new LongWritable();
                    if (k.equals("scx")) {
                        val.set(Long.MIN_VALUE);
                    }
                    if (k.equals("scn")) {
                        val.set(Long.MAX_VALUE);
                    }
                    stats.put(k, val);
                }
                if (k.equals("scx")) {
                    if (val.get() < value.get()) {
                        val.set(value.get());
                    }
                } else if (k.equals("scn")) {
                    if (val.get() > value.get()) {
                        val.set(value.get());
                    }
                } else {
                    val.set(val.get() + value.get());
                }
            }
        } finally {
            // Close this reader even when next() fails part-way through.
            reader.close();
        }
    }

    if (LOG.isInfoEnabled()) {
        LOG.info("Statistics for CrawlDb: " + crawlDb);
        // The "T" entry may be absent from the job output; treat that as a total of 0
        // instead of failing with a NullPointerException.
        LongWritable totalCnt = stats.remove("T");
        long total = (totalCnt == null) ? 0L : totalCnt.get();
        LOG.info("TOTAL urls:\t" + total);
        for (Map.Entry<String, LongWritable> entry : stats.entrySet()) {
            String k = entry.getKey();
            LongWritable val = entry.getValue();
            if (k.equals("scn")) {
                LOG.info("min score:\t" + (float) (val.get() / 1000.0f));
            } else if (k.equals("scx")) {
                LOG.info("max score:\t" + (float) (val.get() / 1000.0f));
            } else if (k.equals("sct")) {
                LOG.info("avg score:\t" + (float) ((((double) val.get()) / total) / 1000.0));
            } else if (k.startsWith("status")) {
                // Key layout is "status <code> [<name>]"; the name, when present, is
                // logged instead of the code.
                String[] st = k.split(" ");
                int code = Integer.parseInt(st[1]);
                if (st.length > 2) {
                    LOG.info("   " + st[2] + " :\t" + val);
                } else {
                    LOG.info(st[0] + " " + code + " (" + CrawlDatum.getStatusName((byte) code) + "):\t" + val);
                }
            } else {
                LOG.info(k + ":\t" + val);
            }
        }
    }
    // removing the tmp folder
    fileSystem.delete(tmpFolder, true);
    if (LOG.isInfoEnabled()) {
        LOG.info("CrawlDb statistics: done");
    }

}

From source file:org.apache.nutch.crawl.LinkDb.java

License:Apache License

/**
 * Builds the job configuration for a LinkDb job.
 *
 * <p>The output goes to a randomly named {@code linkdb-<n>} path as a compressed map file.
 * The URL filtering/normalizing flags are only written to the configuration when at least
 * one of them is requested and {@code linkDb} does not exist on the file system.
 *
 * @param config    base configuration
 * @param linkDb    path of the link database; used in the job name and the existence check
 * @param normalize whether URL normalizing is requested
 * @param filter    whether URL filtering is requested
 * @return the configured job
 */
private static JobConf createJob(Configuration config, Path linkDb, boolean normalize, boolean filter) {
    int randomSuffix = new Random().nextInt(Integer.MAX_VALUE);
    Path outputDir = new Path("linkdb-" + Integer.toString(randomSuffix));

    JobConf job = new NutchJob(config);
    job.setJobName("linkdb " + linkDb);

    job.setInputFormat(SequenceFileInputFormat.class);

    // LinkDbMerger serves as both combiner and reducer.
    job.setMapperClass(LinkDb.class);
    job.setCombinerClass(LinkDbMerger.class);
    job.setReducerClass(LinkDbMerger.class);

    // if we don't run the mergeJob, perform normalization/filtering now
    boolean anyUrlProcessing = normalize || filter;
    if (anyUrlProcessing) {
        try {
            FileSystem fileSystem = FileSystem.get(config);
            boolean linkDbMissing = !fileSystem.exists(linkDb);
            if (linkDbMissing) {
                job.setBoolean(LinkDbFilter.URL_FILTERING, filter);
                job.setBoolean(LinkDbFilter.URL_NORMALIZING, normalize);
            }
        } catch (Exception e) {
            // Best effort: a failed existence check only produces a warning.
            LOG.warn("LinkDb createJob: " + e);
        }
    }

    FileOutputFormat.setOutputPath(job, outputDir);
    job.setOutputFormat(MapFileOutputFormat.class);
    job.setBoolean("mapred.output.compress", true);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Inlinks.class);

    return job;
}

From source file:org.apache.nutch.scoring.webgraph.LinkRank.java

License:Apache License

/**
 * Runs the counter job. The counter job determines the number of links in the
 * webgraph. This is used during analysis.
 * /*from w  w  w .j  ava  2s  .com*/
 * @param fs The job file system.
 * @param webGraphDb The web graph database to use.
 * 
 * @return The number of nodes in the web graph.
 * @throws IOException If an error occurs while running the counter job.
 */
private int runCounter(FileSystem fs, Path webGraphDb) throws IOException {

    // configure the counter job
    Path numLinksPath = new Path(webGraphDb, NUM_NODES);
    Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR);
    JobConf counter = new NutchJob(getConf());
    counter.setJobName("LinkRank Counter");
    FileInputFormat.addInputPath(counter, nodeDb);
    FileOutputFormat.setOutputPath(counter, numLinksPath);
    counter.setInputFormat(SequenceFileInputFormat.class);
    counter.setMapperClass(Counter.class);
    counter.setCombinerClass(Counter.class);
    counter.setReducerClass(Counter.class);
    counter.setMapOutputKeyClass(Text.class);
    counter.setMapOutputValueClass(LongWritable.class);
    counter.setOutputKeyClass(Text.class);
    counter.setOutputValueClass(LongWritable.class);
    counter.setNumReduceTasks(1);
    counter.setOutputFormat(TextOutputFormat.class);
    counter.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

    // run the counter job, outputs to a single reduce task and file
    LOG.info("Starting link counter job");
    try {
        JobClient.runJob(counter);
    } catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
        throw e;
    }
    LOG.info("Finished link counter job");

    // read the first (and only) line from the file which should be the
    // number of links in the web graph
    LOG.info("Reading numlinks temp file");
    FSDataInputStream readLinks = fs.open(new Path(numLinksPath, "part-00000"));
    BufferedReader buffer = new BufferedReader(new InputStreamReader(readLinks));
    String numLinksLine = buffer.readLine();
    readLinks.close();

    // check if there are links to process, if none, webgraph might be empty
    if (numLinksLine == null || numLinksLine.length() == 0) {
        fs.delete(numLinksPath, true);
        throw new IOException("No links to process, is the webgraph empty?");
    }

    // delete temp file and convert and return the number of links as an int
    LOG.info("Deleting numlinks temp file");
    fs.delete(numLinksPath, true);
    String numLinks = numLinksLine.split("\\s+")[1];
    return Integer.parseInt(numLinks);
}