List of usage examples for org.apache.hadoop.mapred JobConf setReducerClass
public void setReducerClass(Class<? extends Reducer> theClass)
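Before the project examples below, a minimal self-contained sketch of the typical call pattern: setReducerClass registers the Reducer implementation on a JobConf before the job is submitted. The driver class name IdentityPassThrough is a placeholder invented for this sketch; the mapper and reducer are Hadoop's built-in IdentityMapper and IdentityReducer, so the job simply passes its text input through a reduce phase.
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.IdentityMapper;
import org.apache.hadoop.mapred.lib.IdentityReducer;

public class IdentityPassThrough { // placeholder driver class name for this sketch
    public static void main(String[] args) throws Exception {
        // Job configuration for the classic (org.apache.hadoop.mapred) API.
        JobConf conf = new JobConf(IdentityPassThrough.class);
        conf.setJobName("identity-pass-through");

        // Register map and reduce implementations; setReducerClass is the call
        // documented on this page.
        conf.setMapperClass(IdentityMapper.class);
        conf.setReducerClass(IdentityReducer.class);

        // Key/value types emitted by the reducer: TextInputFormat produces
        // <LongWritable, Text> pairs, which the identity map/reduce pass through.
        conf.setOutputKeyClass(LongWritable.class);
        conf.setOutputValueClass(Text.class);

        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);

        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        JobClient.runJob(conf);
    }
}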
From source file:me.tingri.graphs.cc.ConnectedComponents.java
License:Apache License
protected RunningJob merge(FileSystem fs, Path tempVectorPath, Path nextVectorPath, int numOfReducers)
        throws Exception {
    Utility.deleteIfExists(fs, nextVectorPath);

    JobConf conf = new JobConf(getConf(), ConnectedComponents.class);
    conf.set(VECTOR_INDICATOR, DEFAULT_VECTOR_INDICATOR);
    conf.setJobName("ConnectedComponents_Merge");

    conf.setMapperClass(MergeMapper.class);
    conf.setReducerClass(MergeReducer.class);

    FileInputFormat.setInputPaths(conf, tempVectorPath);
    FileOutputFormat.setOutputPath(conf, nextVectorPath);

    conf.setNumReduceTasks(numOfReducers);

    conf.setOutputKeyClass(LongWritable.class);
    conf.setOutputValueClass(Text.class);

    return JobClient.runJob(conf);
}
From source file:mr.WordCount.java
License:Open Source License
public static void main(String[] args) throws Exception {
    Properties properties = new Properties();
    AppProps.addApplicationTag(properties, "tutorials");
    AppProps.addApplicationTag(properties, "cluster:development");
    AppProps.setApplicationName(properties, "cascading-mapreduce-flow");

    JobConf conf = new JobConf(WordCount.class);
    conf.setJobName("casading-mapreduce-flow");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(Map.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    MapReduceFlow flow = new MapReduceFlow("wordcount", conf, true);

    // JobClient.runJob(conf);
    flow.complete();
}
From source file:name.abhijitsarkar.hadoop.citation.CitationCombiner.java
License:Open Source License
@Override
public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf(), getClass());
    conf.setJobName("citation-combiner");
    /* This is to set the separator byte for KeyValueTextInputFormat */
    conf.set("key.value.separator.in.input.line", ",");

    conf.setMapperClass(CitationMapper.class);
    conf.setReducerClass(CitationReducer.class);

    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(Text.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    conf.setInputFormat(KeyValueTextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    JobClient.runJob(conf);

    return 0;
}
From source file:NCDSearch.DistributedNCDSearch.java
public int run(String args[]) throws Exception {
    String inputpath = args[1];
    String outputpath = args[2];

    JobConf conf = new JobConf(getConf(), ChunkyFileInputFormat.class);

    // Add the target file to a cache so all nodes can have a copy.
    DistributedCache.addCacheFile(new URI(args[0]), conf);

    FileOutputFormat.setOutputPath(conf, new Path(outputpath));
    FileInputFormat.setInputPaths(conf, new Path(inputpath));

    conf.setJobName("NCDSearch");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(FloatWritable.class);

    conf.setOutputFormat(TextOutputFormat.class);
    conf.setInputFormat(ChunkyFileInputFormat.class);

    conf.setMapperClass(Map.class);
    conf.setReducerClass(Reduce.class);

    JobClient.runJob(conf);

    return 0;
}
From source file:net.peacesoft.nutch.crawl.RaovatPostDeleteDuplicates.java
License:Apache License
public void dedup(String solrUrl, boolean noCommit) throws IOException {
    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("RaovatPostDeleteDuplicates: starting at " + sdf.format(start));
    LOG.info("RaovatPostDeleteDuplicates: Solr url: " + solrUrl);

    JobConf job = new NutchJob(getConf());

    job.set(ReSolrConstants.SERVER_URL, solrUrl);
    job.setBoolean("noCommit", noCommit);
    job.setInputFormat(RaovatPostDeleteDuplicates.SolrInputFormat.class);
    job.setOutputFormat(NullOutputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(RaovatPostDeleteDuplicates.SolrRecord.class);
    job.setMapperClass(IdentityMapper.class);
    job.setReducerClass(RaovatPostDeleteDuplicates.class);

    JobClient.runJob(job);

    long end = System.currentTimeMillis();
    LOG.info("RaovatPostDeleteDuplicates: finished at " + sdf.format(end) + ", elapsed: "
            + TimingUtil.elapsedTime(start, end));
}
From source file:net.peacesoft.nutch.crawl.ReCrawlDb.java
License:Apache License
public static JobConf createJob(Configuration config, Path crawlDb) throws IOException {
    Path newCrawlDb = new Path(crawlDb, Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    JobConf job = new NutchJob(config);
    job.setJobName("crawldb " + crawlDb);

    Path current = new Path(crawlDb, CURRENT_NAME);
    if (FileSystem.get(job).exists(current)) {
        FileInputFormat.addInputPath(job, current);
    }
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(CrawlDbFilter.class);
    job.setReducerClass(CrawlDbReducer.class);

    FileOutputFormat.setOutputPath(job, newCrawlDb);
    job.setOutputFormat(MapFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);

    // https://issues.apache.org/jira/browse/NUTCH-1110
    job.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

    return job;
}
From source file:net.peacesoft.nutch.crawl.ReGenerator.java
License:Apache License
/**
 * Generate fetchlists in one or more segments. Whether to filter URLs or
 * not is read from the crawl.generate.filter property in the configuration
 * files. If the property is not found, the URLs are filtered. Same for the
 * normalisation.
 *
 * @param dbDir Crawl database directory
 * @param segments Segments directory
 * @param numLists Number of reduce tasks
 * @param topN Number of top URLs to be selected
 * @param curTime Current time in milliseconds
 *
 * @return Path to generated segment or null if no entries were selected
 *
 * @throws IOException When an I/O error occurs
 */
public Path[] generate(Path dbDir, Path segments, int numLists, long topN, long curTime, boolean filter,
        boolean norm, boolean force, int maxNumSegments) throws IOException {
    try {
        Path tempDir = new Path(
                getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());

        Path lock = new Path(dbDir, CrawlDb.LOCK_NAME);
        FileSystem fs = FileSystem.get(getConf());
        LockUtil.createLockFile(fs, lock, force);

        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        long start = System.currentTimeMillis();
        LOG.info("ReGenerator: starting at " + sdf.format(start));
        LOG.info("ReGenerator: Selecting best-scoring urls due for fetch.");
        LOG.info("ReGenerator: filtering: " + filter);
        LOG.info("ReGenerator: normalizing: " + norm);
        if (topN != Long.MAX_VALUE) {
            LOG.info("ReGenerator: topN: " + topN);
        }

        if ("true".equals(getConf().get(GENERATE_MAX_PER_HOST_BY_IP))) {
            LOG.info("ReGenerator: GENERATE_MAX_PER_HOST_BY_IP will be ignored, use partition.url.mode instead");
        }

        // map to inverted subset due for fetch, sort by score
        JobConf job = new NutchJob(getConf());
        job.setJobName("generate: select from " + dbDir);

        if (numLists == -1) { // for politeness make
            numLists = job.getNumMapTasks(); // a partition per fetch task
        }
        if ("local".equals(job.get("mapred.job.tracker")) && numLists != 1) {
            // override
            LOG.info("ReGenerator: jobtracker is 'local', generating exactly one partition.");
            numLists = 1;
        }
        job.setLong(GENERATOR_CUR_TIME, curTime);
        // record real generation time
        long generateTime = System.currentTimeMillis();
        job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
        job.setLong(GENERATOR_TOP_N, topN);
        job.setBoolean(GENERATOR_FILTER, filter);
        job.setBoolean(GENERATOR_NORMALISE, norm);
        job.setInt(GENERATOR_MAX_NUM_SEGMENTS, maxNumSegments);

        FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
        job.setInputFormat(SequenceFileInputFormat.class);

        job.setMapperClass(Selector.class);
        job.setPartitionerClass(Selector.class);
        job.setReducerClass(Selector.class);

        FileOutputFormat.setOutputPath(job, tempDir);
        job.setOutputFormat(SequenceFileOutputFormat.class);
        job.setOutputKeyClass(FloatWritable.class);
        job.setOutputKeyComparatorClass(DecreasingFloatComparator.class);
        job.setOutputValueClass(SelectorEntry.class);
        job.setOutputFormat(GeneratorOutputFormat.class);

        try {
            JobClient.runJob(job);
        } catch (IOException e) {
            throw e;
        }

        // read the subdirectories generated in the temp
        // output and turn them into segments
        List<Path> generatedSegments = new ArrayList<Path>();

        FileStatus[] status = fs.listStatus(tempDir);
        try {
            for (FileStatus stat : status) {
                Path subfetchlist = stat.getPath();
                if (!subfetchlist.getName().startsWith("fetchlist-")) {
                    continue;
                }
                // start a new partition job for this segment
                Path newSeg = partitionSegment(fs, segments, subfetchlist, numLists);
                generatedSegments.add(newSeg);
            }
        } catch (Exception e) {
            LOG.warn("ReGenerator: exception while partitioning segments, exiting ...");
            fs.delete(tempDir, true);
            return null;
        }

        if (generatedSegments.size() == 0) {
            LOG.warn("ReGenerator: 0 records selected for fetching, exiting ...");
            LockUtil.removeLockFile(fs, lock);
            fs.delete(tempDir, true);
            return null;
        }

        if (getConf().getBoolean(GENERATE_UPDATE_CRAWLDB, false)) {
            // update the db from tempDir
            Path tempDir2 = new Path(
                    getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());

            job = new NutchJob(getConf());
            job.setJobName("generate: updatedb " + dbDir);
            job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
            for (Path segmpaths : generatedSegments) {
                Path subGenDir = new Path(segmpaths, CrawlDatum.GENERATE_DIR_NAME);
                FileInputFormat.addInputPath(job, subGenDir);
            }
            FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
            job.setInputFormat(SequenceFileInputFormat.class);
            job.setMapperClass(CrawlDbUpdater.class);
            job.setReducerClass(CrawlDbUpdater.class);
            job.setOutputFormat(MapFileOutputFormat.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(CrawlDatum.class);
            FileOutputFormat.setOutputPath(job, tempDir2);
            try {
                JobClient.runJob(job);
                CrawlDb.install(job, dbDir);
            } catch (IOException e) {
                LockUtil.removeLockFile(fs, lock);
                fs.delete(tempDir, true);
                fs.delete(tempDir2, true);
                throw e;
            }
            fs.delete(tempDir2, true);
        }

        LockUtil.removeLockFile(fs, lock);
        fs.delete(tempDir, true);

        long end = System.currentTimeMillis();
        LOG.info("ReGenerator: finished at " + sdf.format(end) + ", elapsed: "
                + TimingUtil.elapsedTime(start, end));

        Path[] patharray = new Path[generatedSegments.size()];
        return generatedSegments.toArray(patharray);
    } catch (Exception ex) {
        LOG.error("ReGenerator generate error: " + ex.toString(), ex);
        return null;
    }
}
From source file:net.peacesoft.nutch.crawl.ReIndexerMapReduce.java
License:Apache License
public static void initMRJob(Path inputDb, Path crawlDb, Path linkDb, Collection<Path> segments, JobConf job) {

    LOG.info("IndexerMapReduce: crawldb: " + crawlDb);
    if (linkDb != null) {
        LOG.info("IndexerMapReduce: linkdb: " + linkDb);
    }

    for (final Path segment : segments) {
        LOG.info("IndexerMapReduces: adding segment: " + segment);
        FileInputFormat.addInputPath(job, new Path(segment, CrawlDatum.FETCH_DIR_NAME));
        FileInputFormat.addInputPath(job, new Path(segment, CrawlDatum.PARSE_DIR_NAME));
        FileInputFormat.addInputPath(job, new Path(segment, ParseData.DIR_NAME));
        FileInputFormat.addInputPath(job, new Path(segment, ParseText.DIR_NAME));
    }

    FileInputFormat.addInputPath(job, new Path(inputDb, CrawlDb.CURRENT_NAME));
    FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));
    if (linkDb != null) {
        FileInputFormat.addInputPath(job, new Path(linkDb, LinkDb.CURRENT_NAME));
    }
    job.setInputFormat(SequenceFileInputFormat.class);

    job.setMapperClass(IndexerMapReduce.class);
    job.setReducerClass(IndexerMapReduce.class);

    job.setOutputFormat(IndexerOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setMapOutputValueClass(NutchWritable.class);
    job.setOutputValueClass(NutchWritable.class);
}
From source file:net.peacesoft.nutch.crawl.ReInjector.java
License:Apache License
public void inject(Path crawlDb, Path urlDir) {
    try {
        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        long start = System.currentTimeMillis();
        if (LOG.isInfoEnabled()) {
            LOG.info("Injector: starting at " + sdf.format(start));
            LOG.info("Injector: crawlDb: " + crawlDb);
            LOG.info("Injector: urlDir: " + urlDir);
        }

        Path tempDir = new Path(getConf().get("mapred.temp.dir", ".") + "/inject-temp-"
                + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

        // map text input file to a <url,CrawlDatum> file
        if (LOG.isInfoEnabled()) {
            LOG.info("Injector: Converting injected urls to crawl db entries.");
        }
        JobConf sortJob = new NutchJob(getConf());
        sortJob.setJobName("inject " + urlDir);
        FileInputFormat.addInputPath(sortJob, urlDir);
        sortJob.setMapperClass(ReInjector.InjectMapper.class);

        FileOutputFormat.setOutputPath(sortJob, tempDir);
        sortJob.setOutputFormat(SequenceFileOutputFormat.class);
        sortJob.setOutputKeyClass(Text.class);
        sortJob.setOutputValueClass(CrawlDatum.class);
        sortJob.setLong("injector.current.time", System.currentTimeMillis());
        JobClient.runJob(sortJob);

        // merge with existing crawl db
        if (LOG.isInfoEnabled()) {
            LOG.info("Injector: Merging injected urls into crawl db.");
        }
        JobConf mergeJob = CrawlDb.createJob(getConf(), crawlDb);
        FileInputFormat.addInputPath(mergeJob, tempDir);
        mergeJob.setReducerClass(ReInjector.InjectReducer.class);
        JobClient.runJob(mergeJob);
        CrawlDb.install(mergeJob, crawlDb);

        // clean up
        FileSystem fs = FileSystem.get(getConf());
        fs.delete(tempDir, true);

        long end = System.currentTimeMillis();
        LOG.info("Injector: finished at " + sdf.format(end) + ", elapsed: "
                + TimingUtil.elapsedTime(start, end));
    } catch (Exception ex) {
        LOG.error("ReInjector run injector error: " + ex.toString(), ex);
    }
}
From source file:net.peacesoft.nutch.crawl.ReLinkDb.java
License:Apache License
private static JobConf createJob(Configuration config, Path linkDb, boolean normalize, boolean filter) {
    Path newLinkDb = new Path("linkdb-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    JobConf job = new NutchJob(config);
    job.setJobName("linkdb " + linkDb);

    job.setInputFormat(SequenceFileInputFormat.class);

    job.setMapperClass(ReLinkDb.class);
    job.setCombinerClass(LinkDbMerger.class);
    // if we don't run the mergeJob, perform normalization/filtering now
    if (normalize || filter) {
        try {
            FileSystem fs = FileSystem.get(config);
            if (!fs.exists(linkDb)) {
                job.setBoolean(LinkDbFilter.URL_FILTERING, filter);
                job.setBoolean(LinkDbFilter.URL_NORMALIZING, normalize);
            }
        } catch (Exception e) {
            LOG.warn("ReLinkDb createJob: " + e);
        }
    }
    job.setReducerClass(LinkDbMerger.class);

    FileOutputFormat.setOutputPath(job, newLinkDb);
    job.setOutputFormat(MapFileOutputFormat.class);
    job.setBoolean("mapred.output.compress", true);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Inlinks.class);

    return job;
}