Example usage for org.apache.hadoop.mapred JobConf setReducerClass

Introduction

This page collects example usages of org.apache.hadoop.mapred.JobConf.setReducerClass, drawn from open-source projects.

Prototype

public void setReducerClass(Class<? extends Reducer> theClass) 

Document

Set the Reducer class for the job.
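
Before diving into the project examples below, here is a minimal, self-contained sketch of where setReducerClass fits in a classic org.apache.hadoop.mapred job. Everything here (WordCountJob, TokenMapper, SumReducer, the argument paths) is hypothetical and exists only to illustrate the call, not to reproduce any project's code:

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;

public class WordCountJob {

    // Mapper: emits (word, 1) for every whitespace-separated token.
    public static class TokenMapper extends MapReduceBase
            implements Mapper<LongWritable, Text, Text, IntWritable> {
        private static final IntWritable ONE = new IntWritable(1);
        private final Text word = new Text();

        public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output,
                Reporter reporter) throws IOException {
            for (String token : value.toString().split("\\s+")) {
                if (!token.isEmpty()) {
                    word.set(token);
                    output.collect(word, ONE);
                }
            }
        }
    }

    // Reducer: sums the counts collected for each word.
    public static class SumReducer extends MapReduceBase
            implements Reducer<Text, IntWritable, Text, IntWritable> {
        public void reduce(Text key, Iterator<IntWritable> values,
                OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
            int sum = 0;
            while (values.hasNext()) {
                sum += values.next().get();
            }
            output.collect(key, new IntWritable(sum));
        }
    }

    public static void main(String[] args) throws IOException {
        JobConf conf = new JobConf(WordCountJob.class);
        conf.setJobName("wordcount");

        conf.setMapperClass(TokenMapper.class);
        conf.setReducerClass(SumReducer.class); // the method documented on this page

        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(IntWritable.class);

        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        JobClient.runJob(conf);
    }
}

Since no combiner is set and the map and reduce output types match, setOutputKeyClass/setOutputValueClass covers both phases here; jobs whose intermediate types differ also need setMapOutputKeyClass/setMapOutputValueClass, as the Mahout example below shows.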

Usage

From source file:org.apache.mahout.clustering.canopy.ClusterDriver.java

License:Apache License

/**
 * Run the job
 *
 * @param points           the input points directory pathname String
 * @param canopies         the input canopies directory pathname String
 * @param output           the output directory pathname String
 * @param measureClassName the DistanceMeasure class name
 * @param t1               the T1 distance threshold
 * @param t2               the T2 distance threshold
 * @param vectorClass      The {@link Class} of Vector to use for the Output Value Class.  Must be concrete.
 */
public static void runJob(String points, String canopies, String output, String measureClassName, double t1,
        double t2, Class<? extends Vector> vectorClass) throws IOException {
    JobClient client = new JobClient();
    JobConf conf = new JobConf(org.apache.mahout.clustering.canopy.ClusterDriver.class);

    conf.set(Canopy.DISTANCE_MEASURE_KEY, measureClassName);
    conf.set(Canopy.T1_KEY, String.valueOf(t1));
    conf.set(Canopy.T2_KEY, String.valueOf(t2));
    conf.set(Canopy.CANOPY_PATH_KEY, canopies);

    conf.setInputFormat(SequenceFileInputFormat.class);

    /*conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(SparseVector.class);*/
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(vectorClass);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(points));
    Path outPath = new Path(output + DEFAULT_CLUSTER_OUTPUT_DIRECTORY);
    FileOutputFormat.setOutputPath(conf, outPath);

    conf.setMapperClass(ClusterMapper.class);
    conf.setReducerClass(IdentityReducer.class);

    client.setConf(conf);
    FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
    if (dfs.exists(outPath)) {
        dfs.delete(outPath, true);
    }
    JobClient.runJob(conf);
}
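
Note that ClusterDriver wires in org.apache.hadoop.mapred.lib.IdentityReducer, which writes every (key, value) pair through unchanged; the reduce phase here serves only to group and sort the mapper output by key before it is written out.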

From source file:org.apache.mahout.clustering.syntheticcontrol.meanshift.OutputDriver.java

License:Apache License

public static void runJob(String input, String output) throws IOException {
    JobClient client = new JobClient();
    JobConf conf = new JobConf(org.apache.mahout.clustering.syntheticcontrol.meanshift.OutputDriver.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setInputFormat(SequenceFileInputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(input));
    FileOutputFormat.setOutputPath(conf, new Path(output));

    conf.setMapperClass(OutputMapper.class);

    conf.setReducerClass(Reducer.class);
    conf.setNumReduceTasks(0);

    client.setConf(conf);
    JobClient.runJob(conf);
}
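
This example is slightly misleading for our purposes: because setNumReduceTasks(0) turns it into a map-only job, the setReducerClass(Reducer.class) call is effectively a no-op. With zero reduce tasks, the mappers write directly to the output format and no reducer is ever instantiated.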

From source file:org.apache.mahout.math.hadoop.MatrixMultiplicationJob.java

License:Apache License

public static Configuration createMatrixMultiplyJobConf(Configuration initialConf, Path aPath, Path bPath,
        Path outPath, int outCardinality) {
    JobConf conf = new JobConf(initialConf, MatrixMultiplicationJob.class);
    conf.setInputFormat(CompositeInputFormat.class);
    conf.set("mapred.join.expr",
            CompositeInputFormat.compose("inner", SequenceFileInputFormat.class, aPath, bPath));
    conf.setInt(OUT_CARD, outCardinality);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    FileOutputFormat.setOutputPath(conf, outPath);
    conf.setMapperClass(MatrixMultiplyMapper.class);
    conf.setCombinerClass(MatrixMultiplicationReducer.class);
    conf.setReducerClass(MatrixMultiplicationReducer.class);
    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(VectorWritable.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(VectorWritable.class);
    return conf;
}
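
Here the same MatrixMultiplicationReducer is registered as both combiner and reducer. Reusing a reducer as the combiner is only safe when the reduce operation is associative and commutative and its input and output types match, as is the case when partial vector products are summed.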

From source file:org.apache.nutch.crawl.CrawlDbMerger.java

License:Apache License

public static JobConf createMergeJob(Configuration conf, Path output, boolean normalize, boolean filter) {
    Path newCrawlDb = new Path("crawldb-merge-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    JobConf job = new NutchJob(conf);
    job.setJobName("crawldb merge " + output);

    job.setInputFormat(SequenceFileInputFormat.class);

    job.setMapperClass(CrawlDbFilter.class);
    job.setBoolean(CrawlDbFilter.URL_FILTERING, filter);
    job.setBoolean(CrawlDbFilter.URL_NORMALIZING, normalize);
    job.setReducerClass(Merger.class);

    FileOutputFormat.setOutputPath(job, newCrawlDb);
    job.setOutputFormat(MapFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);

    return job;
}

From source file:org.apache.nutch.crawl.CrawlDbReader.java

License:Apache License

public void processStatJob(String crawlDb, Configuration config, boolean sort) throws IOException {

    if (LOG.isInfoEnabled()) {
        LOG.info("CrawlDb statistics start: " + crawlDb);
    }

    Path tmpFolder = new Path(crawlDb, "stat_tmp" + System.currentTimeMillis());

    JobConf job = new NutchJob(config);
    job.setJobName("stats " + crawlDb);
    job.setBoolean("db.reader.stats.sort", sort);

    FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));
    job.setInputFormat(SequenceFileInputFormat.class);

    job.setMapperClass(CrawlDbStatMapper.class);
    job.setCombinerClass(CrawlDbStatCombiner.class);
    job.setReducerClass(CrawlDbStatReducer.class);

    FileOutputFormat.setOutputPath(job, tmpFolder);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    // https://issues.apache.org/jira/browse/NUTCH-1029
    job.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

    JobClient.runJob(job);

    // reading the result
    FileSystem fileSystem = FileSystem.get(config);
    SequenceFile.Reader[] readers = SequenceFileOutputFormat.getReaders(config, tmpFolder);

    Text key = new Text();
    LongWritable value = new LongWritable();

    TreeMap<String, LongWritable> stats = new TreeMap<String, LongWritable>();
    for (int i = 0; i < readers.length; i++) {
        SequenceFile.Reader reader = readers[i];
        while (reader.next(key, value)) {
            String k = key.toString();
            LongWritable val = stats.get(k);
            if (val == null) {
                val = new LongWritable();
                if (k.equals("scx"))
                    val.set(Long.MIN_VALUE);
                if (k.equals("scn"))
                    val.set(Long.MAX_VALUE);
                stats.put(k, val);
            }
            if (k.equals("scx")) {
                if (val.get() < value.get())
                    val.set(value.get());
            } else if (k.equals("scn")) {
                if (val.get() > value.get())
                    val.set(value.get());
            } else {
                val.set(val.get() + value.get());
            }
        }
        reader.close();
    }

    if (LOG.isInfoEnabled()) {
        LOG.info("Statistics for CrawlDb: " + crawlDb);
        LongWritable totalCnt = stats.get("T");
        stats.remove("T");
        LOG.info("TOTAL urls:\t" + totalCnt.get());
        for (Map.Entry<String, LongWritable> entry : stats.entrySet()) {
            String k = entry.getKey();
            LongWritable val = entry.getValue();
            if (k.equals("scn")) {
                LOG.info("min score:\t" + (float) (val.get() / 1000.0f));
            } else if (k.equals("scx")) {
                LOG.info("max score:\t" + (float) (val.get() / 1000.0f));
            } else if (k.equals("sct")) {
                LOG.info("avg score:\t" + (float) ((((double) val.get()) / totalCnt.get()) / 1000.0));
            } else if (k.startsWith("status")) {
                String[] st = k.split(" ");
                int code = Integer.parseInt(st[1]);
                if (st.length > 2)
                    LOG.info("   " + st[2] + " :\t" + val);
                else
                    LOG.info(st[0] + " " + code + " (" + CrawlDatum.getStatusName((byte) code) + "):\t" + val);
            } else
                LOG.info(k + ":\t" + val);
        }
    }
    // removing the tmp folder
    fileSystem.delete(tmpFolder, true);
    if (LOG.isInfoEnabled()) {
        LOG.info("CrawlDb statistics: done");
    }

}

From source file:org.apache.nutch.crawl.CrawlDbReader.java

License:Apache License

public void processTopNJob(String crawlDb, long topN, float min, String output, Configuration config)
        throws IOException {

    if (LOG.isInfoEnabled()) {
        LOG.info("CrawlDb topN: starting (topN=" + topN + ", min=" + min + ")");
        LOG.info("CrawlDb db: " + crawlDb);
    }

    Path outFolder = new Path(output);
    Path tempDir = new Path(config.get("mapred.temp.dir", ".") + "/readdb-topN-temp-"
            + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    JobConf job = new NutchJob(config);
    job.setJobName("topN prepare " + crawlDb);
    FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(CrawlDbTopNMapper.class);
    job.setReducerClass(IdentityReducer.class);

    FileOutputFormat.setOutputPath(job, tempDir);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(FloatWritable.class);
    job.setOutputValueClass(Text.class);

    // XXX hmmm, no setFloat() in the API ... :(
    job.setLong("db.reader.topn.min", Math.round(1000000.0 * min));
    JobClient.runJob(job);

    if (LOG.isInfoEnabled()) {
        LOG.info("CrawlDb topN: collecting topN scores.");
    }
    job = new NutchJob(config);
    job.setJobName("topN collect " + crawlDb);
    job.setLong("db.reader.topn", topN);

    FileInputFormat.addInputPath(job, tempDir);
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(IdentityMapper.class);
    job.setReducerClass(CrawlDbTopNReducer.class);

    FileOutputFormat.setOutputPath(job, outFolder);
    job.setOutputFormat(TextOutputFormat.class);
    job.setOutputKeyClass(FloatWritable.class);
    job.setOutputValueClass(Text.class);

    job.setNumReduceTasks(1); // create a single file.

    JobClient.runJob(job);
    FileSystem fs = FileSystem.get(config);
    fs.delete(tempDir, true);
    if (LOG.isInfoEnabled()) {
        LOG.info("CrawlDb topN: done");
    }

}
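
The topN computation is split into two chained jobs: the first scores every URL (CrawlDbTopNMapper) and passes the pairs through IdentityReducer into a temporary directory, while the second runs with setNumReduceTasks(1) so that a single CrawlDbTopNReducer sees all candidates and can emit a global top-N into one output file.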

From source file:org.apache.nutch.crawl.Generator.java

License:Apache License

/**
 * Generate fetchlists in one or more segments. Whether to filter URLs or not
 * is read from the crawl.generate.filter property in the configuration files.
 * If the property is not found, the URLs are filtered. Same for the
 * normalisation.
 * 
 * @param dbDir
 *          Crawl database directory
 * @param segments
 *          Segments directory
 * @param numLists
 *          Number of reduce tasks
 * @param topN
 *          Number of top URLs to be selected
 * @param curTime
 *          Current time in milliseconds
 * 
 * @return Path to generated segment or null if no entries were selected
 * 
 * @throws IOException
 *           When an I/O error occurs
 */
public Path[] generate(Path dbDir, Path segments, int numLists, long topN, long curTime, boolean filter,
        boolean norm, boolean force, int maxNumSegments) throws IOException {

    Path tempDir = new Path(
            getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());

    Path lock = new Path(dbDir, CrawlDb.LOCK_NAME);
    FileSystem fs = FileSystem.get(getConf());
    LockUtil.createLockFile(fs, lock, force);

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("Generator: starting at " + sdf.format(start));
    LOG.info("Generator: Selecting best-scoring urls due for fetch.");
    LOG.info("Generator: filtering: " + filter);
    LOG.info("Generator: normalizing: " + norm);
    if (topN != Long.MAX_VALUE) {
        LOG.info("Generator: topN: " + topN);
    }

    if ("true".equals(getConf().get(GENERATE_MAX_PER_HOST_BY_IP))) {
        LOG.info("Generator: GENERATE_MAX_PER_HOST_BY_IP will be ignored, use partition.url.mode instead");
    }

    // map to inverted subset due for fetch, sort by score
    JobConf job = new NutchJob(getConf());
    job.setJobName("generate: select from " + dbDir);

    if (numLists == -1) { // for politeness make
        numLists = job.getNumMapTasks(); // a partition per fetch task
    }
    if ("local".equals(job.get("mapred.job.tracker")) && numLists != 1) {
        // override
        LOG.info("Generator: jobtracker is 'local', generating exactly one partition.");
        numLists = 1;
    }
    job.setLong(GENERATOR_CUR_TIME, curTime);
    // record real generation time
    long generateTime = System.currentTimeMillis();
    job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
    job.setLong(GENERATOR_TOP_N, topN);
    job.setBoolean(GENERATOR_FILTER, filter);
    job.setBoolean(GENERATOR_NORMALISE, norm);
    job.setInt(GENERATOR_MAX_NUM_SEGMENTS, maxNumSegments);

    FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
    job.setInputFormat(SequenceFileInputFormat.class);

    job.setMapperClass(Selector.class);
    job.setPartitionerClass(Selector.class);
    job.setReducerClass(Selector.class);

    FileOutputFormat.setOutputPath(job, tempDir);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(FloatWritable.class);
    job.setOutputKeyComparatorClass(DecreasingFloatComparator.class);
    job.setOutputValueClass(SelectorEntry.class);
    job.setOutputFormat(GeneratorOutputFormat.class);

    try {
        JobClient.runJob(job);
    } catch (IOException e) {
        throw e;
    }

    // read the subdirectories generated in the temp
    // output and turn them into segments
    List<Path> generatedSegments = new ArrayList<Path>();

    FileStatus[] status = fs.listStatus(tempDir);
    try {
        for (FileStatus stat : status) {
            Path subfetchlist = stat.getPath();
            if (!subfetchlist.getName().startsWith("fetchlist-"))
                continue;
            // start a new partition job for this segment
            Path newSeg = partitionSegment(fs, segments, subfetchlist, numLists);
            generatedSegments.add(newSeg);
        }
    } catch (Exception e) {
        LOG.warn("Generator: exception while partitioning segments, exiting ...");
        fs.delete(tempDir, true);
        return null;
    }

    if (generatedSegments.size() == 0) {
        LOG.warn("Generator: 0 records selected for fetching, exiting ...");
        LockUtil.removeLockFile(fs, lock);
        fs.delete(tempDir, true);
        return null;
    }

    if (getConf().getBoolean(GENERATE_UPDATE_CRAWLDB, false)) {
        // update the db from tempDir
        Path tempDir2 = new Path(
                getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());

        job = new NutchJob(getConf());
        job.setJobName("generate: updatedb " + dbDir);
        job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
        for (Path segmpaths : generatedSegments) {
            Path subGenDir = new Path(segmpaths, CrawlDatum.GENERATE_DIR_NAME);
            FileInputFormat.addInputPath(job, subGenDir);
        }
        FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
        job.setInputFormat(SequenceFileInputFormat.class);
        job.setMapperClass(CrawlDbUpdater.class);
        job.setReducerClass(CrawlDbUpdater.class);
        job.setOutputFormat(MapFileOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(CrawlDatum.class);
        FileOutputFormat.setOutputPath(job, tempDir2);
        try {
            JobClient.runJob(job);
            CrawlDb.install(job, dbDir);
        } catch (IOException e) {
            LockUtil.removeLockFile(fs, lock);
            fs.delete(tempDir, true);
            fs.delete(tempDir2, true);
            throw e;
        }
        fs.delete(tempDir2, true);
    }

    LockUtil.removeLockFile(fs, lock);
    fs.delete(tempDir, true);

    long end = System.currentTimeMillis();
    LOG.info("Generator: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));

    Path[] patharray = new Path[generatedSegments.size()];
    return generatedSegments.toArray(patharray);
}
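
A detail worth noting above: the same Selector class is registered as mapper, partitioner, and reducer of the select job. The old mapred API makes this possible because Mapper, Partitioner, and Reducer are plain interfaces, so one class can implement all three and keep the selection logic in one place.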

From source file:org.apache.nutch.crawl.Injector.java

License:Apache License

public void inject(Path crawlDb, Path urlDir) throws IOException {
    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    if (LOG.isInfoEnabled()) {
        LOG.info("Injector: starting at " + sdf.format(start));
        LOG.info("Injector: crawlDb: " + crawlDb);
        LOG.info("Injector: urlDir: " + urlDir);
    }

    Path tempDir = new Path(getConf().get("mapred.temp.dir", ".") + "/inject-temp-"
            + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    // map text input file to a <url,CrawlDatum> file
    if (LOG.isInfoEnabled()) {
        LOG.info("Injector: Converting injected urls to crawl db entries.");
    }
    JobConf sortJob = new NutchJob(getConf());
    sortJob.setJobName("inject " + urlDir);
    FileInputFormat.addInputPath(sortJob, urlDir);
    sortJob.setMapperClass(InjectMapper.class);

    FileOutputFormat.setOutputPath(sortJob, tempDir);
    sortJob.setOutputFormat(SequenceFileOutputFormat.class);
    sortJob.setOutputKeyClass(Text.class);
    sortJob.setOutputValueClass(CrawlDatum.class);
    sortJob.setLong("injector.current.time", System.currentTimeMillis());
    JobClient.runJob(sortJob);

    // merge with existing crawl db
    if (LOG.isInfoEnabled()) {
        LOG.info("Injector: Merging injected urls into crawl db.");
    }
    JobConf mergeJob = CrawlDb.createJob(getConf(), crawlDb);
    FileInputFormat.addInputPath(mergeJob, tempDir);
    mergeJob.setReducerClass(InjectReducer.class);
    JobClient.runJob(mergeJob);
    CrawlDb.install(mergeJob, crawlDb);

    // clean up
    FileSystem fs = FileSystem.get(getConf());
    fs.delete(tempDir, true);

    long end = System.currentTimeMillis();
    LOG.info("Injector: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}
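
Injection is likewise two chained jobs: a map-only job first converts the plain-text URL lists into <url, CrawlDatum> pairs under tempDir, then CrawlDb.createJob builds the standard merge job and setReducerClass swaps in InjectReducer so the injected entries are folded into the existing crawl db.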

From source file:org.apache.nutch.crawl.LinkDb.java

License:Apache License

private static JobConf createJob(Configuration config, Path linkDb, boolean normalize, boolean filter) {
    Path newLinkDb = new Path("linkdb-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    JobConf job = new NutchJob(config);
    job.setJobName("linkdb " + linkDb);

    job.setInputFormat(SequenceFileInputFormat.class);

    job.setMapperClass(LinkDb.class);
    job.setCombinerClass(LinkDbMerger.class);
    // if we don't run the mergeJob, perform normalization/filtering now
    if (normalize || filter) {
        try {
            FileSystem fs = FileSystem.get(config);
            if (!fs.exists(linkDb)) {
                job.setBoolean(LinkDbFilter.URL_FILTERING, filter);
                job.setBoolean(LinkDbFilter.URL_NORMALIZING, normalize);
            }
        } catch (Exception e) {
            LOG.warn("LinkDb createJob: " + e);
        }
    }
    job.setReducerClass(LinkDbMerger.class);

    FileOutputFormat.setOutputPath(job, newLinkDb);
    job.setOutputFormat(MapFileOutputFormat.class);
    job.setBoolean("mapred.output.compress", true);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Inlinks.class);

    return job;
}