Example usage for org.apache.hadoop.mapred JobConf setOutputValueClass

List of usage examples for org.apache.hadoop.mapred JobConf setOutputValueClass

Introduction

On this page you can find example usage for org.apache.hadoop.mapred JobConf setOutputValueClass.

Prototype

public void setOutputValueClass(Class<?> theClass) 

Source Link

Document

Set the value class for job outputs.
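
Before the examples from real projects below, here is a minimal, self-contained sketch of a classic org.apache.hadoop.mapred word-count job that uses setOutputValueClass. The class name OutputValueClassExample, the mapper/reducer inner classes, and the /tmp/wordcount/* paths are illustrative assumptions, not taken from any of the source files listed on this page.

import java.io.IOException;
import java.util.Iterator;
import java.util.StringTokenizer;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;

public class OutputValueClassExample {

    // Emits <word, 1> for every token in an input line.
    public static class TokenMapper extends MapReduceBase
            implements Mapper<LongWritable, Text, Text, IntWritable> {
        private final static IntWritable ONE = new IntWritable(1);
        private final Text word = new Text();

        public void map(LongWritable key, Text value,
                OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
            StringTokenizer itr = new StringTokenizer(value.toString());
            while (itr.hasMoreTokens()) {
                word.set(itr.nextToken());
                output.collect(word, ONE);
            }
        }
    }

    // Sums the counts for each word.
    public static class SumReducer extends MapReduceBase
            implements Reducer<Text, IntWritable, Text, IntWritable> {
        public void reduce(Text key, Iterator<IntWritable> values,
                OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
            int sum = 0;
            while (values.hasNext()) {
                sum += values.next().get();
            }
            output.collect(key, new IntWritable(sum));
        }
    }

    public static void main(String[] args) throws IOException {
        JobConf conf = new JobConf(OutputValueClassExample.class);
        conf.setJobName("setOutputValueClass-example");

        // Key/value types written by the reducer; TextOutputFormat below
        // serializes them as tab-separated lines.
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(IntWritable.class);

        conf.setMapperClass(TokenMapper.class);
        conf.setReducerClass(SumReducer.class);

        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);

        // Assumed example paths; replace with real HDFS locations.
        FileInputFormat.setInputPaths(conf, new Path("/tmp/wordcount/input"));
        FileOutputFormat.setOutputPath(conf, new Path("/tmp/wordcount/output"));

        JobClient.runJob(conf);
    }
}

With the old mapred API, the map output key/value classes default to the job output classes set here, so no separate call is needed when the types match. When the map output types differ from the final output types, pair this call with setMapOutputKeyClass/setMapOutputValueClass, as the ReIndexerMapReduce example below does for its value class.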

Usage

From source file:NCDSearch.DistributedNCDSearch.java

public int run(String args[]) throws Exception {

    String inputpath = args[1];
    String outputpath = args[2];

    JobConf conf = new JobConf(getConf(), ChunkyFileInputFormat.class);

    //Add the target file to a cache so all nodes can have a copy.
    DistributedCache.addCacheFile(new URI(args[0]), conf);

    FileOutputFormat.setOutputPath(conf, new Path(outputpath));
    FileInputFormat.setInputPaths(conf, new Path(inputpath));

    conf.setJobName("NCDSearch");
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(FloatWritable.class);
    conf.setOutputFormat(TextOutputFormat.class);

    conf.setInputFormat(ChunkyFileInputFormat.class);
    conf.setMapperClass(Map.class);
    conf.setReducerClass(Reduce.class);

    JobClient.runJob(conf);

    return 0;
}

From source file:net.peacesoft.nutch.crawl.ReCrawlDb.java

License:Apache License

public static JobConf createJob(Configuration config, Path crawlDb) throws IOException {
    Path newCrawlDb = new Path(crawlDb, Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    JobConf job = new NutchJob(config);
    job.setJobName("crawldb " + crawlDb);

    Path current = new Path(crawlDb, CURRENT_NAME);
    if (FileSystem.get(job).exists(current)) {
        FileInputFormat.addInputPath(job, current);
    }
    job.setInputFormat(SequenceFileInputFormat.class);

    job.setMapperClass(CrawlDbFilter.class);
    job.setReducerClass(CrawlDbReducer.class);

    FileOutputFormat.setOutputPath(job, newCrawlDb);
    job.setOutputFormat(MapFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);

    // https://issues.apache.org/jira/browse/NUTCH-1110
    job.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

    return job;
}

From source file:net.peacesoft.nutch.crawl.ReFetcher.java

License:Apache License

public void fetch(Path segment, int threads) throws IOException {

    checkConfiguration();

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    if (LOG.isInfoEnabled()) {
        LOG.info("Fetcher: starting at " + sdf.format(start));
        LOG.info("Fetcher: segment: " + segment);
    }

    // set the actual time for the timelimit relative
    // to the beginning of the whole job and not of a specific task
    // otherwise it keeps trying again if a task fails
    long timelimit = getConf().getLong("ReFetcher.timelimit.mins", -1);
    if (timelimit != -1) {
        timelimit = System.currentTimeMillis() + (timelimit * 60 * 1000);
        LOG.info("Fetcher Timelimit set for : " + timelimit);
        getConf().setLong("fetcher.timelimit", timelimit);
    }

    // Set the time limit after which the throughput threshold feature is enabled
    timelimit = getConf().getLong("fetcher.throughput.threshold.check.after", 10);
    timelimit = System.currentTimeMillis() + (timelimit * 60 * 1000);
    getConf().setLong("fetcher.throughput.threshold.check.after", timelimit);

    int maxOutlinkDepth = getConf().getInt("fetcher.follow.outlinks.depth", -1);
    if (maxOutlinkDepth > 0) {
        LOG.info("Fetcher: following outlinks up to depth: " + Integer.toString(maxOutlinkDepth));

        int maxOutlinkDepthNumLinks = getConf().getInt("fetcher.follow.outlinks.num.links", 4);
        int outlinksDepthDivisor = getConf().getInt("fetcher.follow.outlinks.depth.divisor", 2);

        int totalOutlinksToFollow = 0;
        for (int i = 0; i < maxOutlinkDepth; i++) {
            totalOutlinksToFollow += (int) Math.floor(outlinksDepthDivisor / (i + 1) * maxOutlinkDepthNumLinks);
        }

        LOG.info("Fetcher: maximum outlinks to follow: " + Integer.toString(totalOutlinksToFollow));
    }

    JobConf job = new NutchJob(getConf());
    job.setJobName("fetch " + segment);

    job.setInt("fetcher.threads.fetch", threads);
    job.set(Nutch.SEGMENT_NAME_KEY, segment.getName());

    // for politeness, don't permit parallel execution of a single task
    job.setSpeculativeExecution(false);

    FileInputFormat.addInputPath(job, new Path(segment, CrawlDatum.GENERATE_DIR_NAME));
    job.setInputFormat(ReFetcher.InputFormat.class);

    job.setMapRunnerClass(ReFetcher.class);

    FileOutputFormat.setOutputPath(job, segment);
    job.setOutputFormat(FetcherOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NutchWritable.class);

    JobClient.runJob(job);

    long end = System.currentTimeMillis();
    LOG.info("Fetcher: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}

From source file:net.peacesoft.nutch.crawl.ReGenerator.java

License:Apache License

/**
 * Generate fetchlists in one or more segments. Whether to filter URLs or
 * not is read from the crawl.generate.filter property in the configuration
 * files. If the property is not found, the URLs are filtered. Same for the
 * normalisation.
 *
 * @param dbDir Crawl database directory
 * @param segments Segments directory
 * @param numLists Number of reduce tasks
 * @param topN Number of top URLs to be selected
 * @param curTime Current time in milliseconds
 *
 * @return Path to generated segment or null if no entries were selected
 *
 * @throws IOException When an I/O error occurs
 */
public Path[] generate(Path dbDir, Path segments, int numLists, long topN, long curTime, boolean filter,
        boolean norm, boolean force, int maxNumSegments) throws IOException {
    try {
        Path tempDir = new Path(
                getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());

        Path lock = new Path(dbDir, CrawlDb.LOCK_NAME);
        FileSystem fs = FileSystem.get(getConf());
        LockUtil.createLockFile(fs, lock, force);

        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        long start = System.currentTimeMillis();
        LOG.info("ReGenerator: starting at " + sdf.format(start));
        LOG.info("ReGenerator: Selecting best-scoring urls due for fetch.");
        LOG.info("ReGenerator: filtering: " + filter);
        LOG.info("ReGenerator: normalizing: " + norm);
        if (topN != Long.MAX_VALUE) {
            LOG.info("ReGenerator: topN: " + topN);
        }

        if ("true".equals(getConf().get(GENERATE_MAX_PER_HOST_BY_IP))) {
            LOG.info(
                    "ReGenerator: GENERATE_MAX_PER_HOST_BY_IP will be ignored, use partition.url.mode instead");
        }

        // map to inverted subset due for fetch, sort by score
        JobConf job = new NutchJob(getConf());
        job.setJobName("generate: select from " + dbDir);

        if (numLists == -1) { // for politeness make
            numLists = job.getNumMapTasks(); // a partition per fetch task
        }
        if ("local".equals(job.get("mapred.job.tracker")) && numLists != 1) {
            // override
            LOG.info("ReGenerator: jobtracker is 'local', generating exactly one partition.");
            numLists = 1;
        }
        job.setLong(GENERATOR_CUR_TIME, curTime);
        // record real generation time
        long generateTime = System.currentTimeMillis();
        job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
        job.setLong(GENERATOR_TOP_N, topN);
        job.setBoolean(GENERATOR_FILTER, filter);
        job.setBoolean(GENERATOR_NORMALISE, norm);
        job.setInt(GENERATOR_MAX_NUM_SEGMENTS, maxNumSegments);

        FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
        job.setInputFormat(SequenceFileInputFormat.class);

        job.setMapperClass(Selector.class);
        job.setPartitionerClass(Selector.class);
        job.setReducerClass(Selector.class);

        FileOutputFormat.setOutputPath(job, tempDir);
        job.setOutputFormat(SequenceFileOutputFormat.class);
        job.setOutputKeyClass(FloatWritable.class);
        job.setOutputKeyComparatorClass(DecreasingFloatComparator.class);
        job.setOutputValueClass(SelectorEntry.class);
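        // The later setOutputFormat call wins: GeneratorOutputFormat, not
        // SequenceFileOutputFormat, is the effective output format.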
        job.setOutputFormat(GeneratorOutputFormat.class);

        try {
            JobClient.runJob(job);
        } catch (IOException e) {
            throw e;
        }

        // read the subdirectories generated in the temp
        // output and turn them into segments
        List<Path> generatedSegments = new ArrayList<Path>();

        FileStatus[] status = fs.listStatus(tempDir);
        try {
            for (FileStatus stat : status) {
                Path subfetchlist = stat.getPath();
                if (!subfetchlist.getName().startsWith("fetchlist-")) {
                    continue;
                }
                // start a new partition job for this segment
                Path newSeg = partitionSegment(fs, segments, subfetchlist, numLists);
                generatedSegments.add(newSeg);
            }
        } catch (Exception e) {
            LOG.warn("ReGenerator: exception while partitioning segments, exiting ...");
            fs.delete(tempDir, true);
            return null;
        }

        if (generatedSegments.size() == 0) {
            LOG.warn("ReGenerator: 0 records selected for fetching, exiting ...");
            LockUtil.removeLockFile(fs, lock);
            fs.delete(tempDir, true);
            return null;
        }

        if (getConf().getBoolean(GENERATE_UPDATE_CRAWLDB, false)) {
            // update the db from tempDir
            Path tempDir2 = new Path(
                    getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());

            job = new NutchJob(getConf());
            job.setJobName("generate: updatedb " + dbDir);
            job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
            for (Path segmpaths : generatedSegments) {
                Path subGenDir = new Path(segmpaths, CrawlDatum.GENERATE_DIR_NAME);
                FileInputFormat.addInputPath(job, subGenDir);
            }
            FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
            job.setInputFormat(SequenceFileInputFormat.class);
            job.setMapperClass(CrawlDbUpdater.class);
            job.setReducerClass(CrawlDbUpdater.class);
            job.setOutputFormat(MapFileOutputFormat.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(CrawlDatum.class);
            FileOutputFormat.setOutputPath(job, tempDir2);
            try {
                JobClient.runJob(job);
                CrawlDb.install(job, dbDir);
            } catch (IOException e) {
                LockUtil.removeLockFile(fs, lock);
                fs.delete(tempDir, true);
                fs.delete(tempDir2, true);
                throw e;
            }
            fs.delete(tempDir2, true);
        }

        LockUtil.removeLockFile(fs, lock);
        fs.delete(tempDir, true);

        long end = System.currentTimeMillis();
        LOG.info("ReGenerator: finished at " + sdf.format(end) + ", elapsed: "
                + TimingUtil.elapsedTime(start, end));

        Path[] patharray = new Path[generatedSegments.size()];
        return generatedSegments.toArray(patharray);
    } catch (Exception ex) {
        LOG.error("ReGenerator generate error: " + ex.toString(), ex);
        return null;
    }
}

From source file:net.peacesoft.nutch.crawl.ReIndexerMapReduce.java

License:Apache License

public static void initMRJob(Path inputDb, Path crawlDb, Path linkDb, Collection<Path> segments, JobConf job) {

    LOG.info("IndexerMapReduce: crawldb: " + crawlDb);

    if (linkDb != null) {
        LOG.info("IndexerMapReduce: linkdb: " + linkDb);
    }

    for (final Path segment : segments) {
        LOG.info("IndexerMapReduces: adding segment: " + segment);
        FileInputFormat.addInputPath(job, new Path(segment, CrawlDatum.FETCH_DIR_NAME));
        FileInputFormat.addInputPath(job, new Path(segment, CrawlDatum.PARSE_DIR_NAME));
        FileInputFormat.addInputPath(job, new Path(segment, ParseData.DIR_NAME));
        FileInputFormat.addInputPath(job, new Path(segment, ParseText.DIR_NAME));
    }

    FileInputFormat.addInputPath(job, new Path(inputDb, CrawlDb.CURRENT_NAME));
    FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));

    if (linkDb != null) {
        FileInputFormat.addInputPath(job, new Path(linkDb, LinkDb.CURRENT_NAME));
    }

    job.setInputFormat(SequenceFileInputFormat.class);

    job.setMapperClass(IndexerMapReduce.class);
    job.setReducerClass(IndexerMapReduce.class);

    job.setOutputFormat(IndexerOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setMapOutputValueClass(NutchWritable.class);
    job.setOutputValueClass(NutchWritable.class);
}

From source file:net.peacesoft.nutch.crawl.ReInjector.java

License:Apache License

public void inject(Path crawlDb, Path urlDir) {
    try {
        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        long start = System.currentTimeMillis();
        if (LOG.isInfoEnabled()) {
            LOG.info("Injector: starting at " + sdf.format(start));
            LOG.info("Injector: crawlDb: " + crawlDb);
            LOG.info("Injector: urlDir: " + urlDir);
        }

        Path tempDir = new Path(getConf().get("mapred.temp.dir", ".") + "/inject-temp-"
                + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

        // map text input file to a <url,CrawlDatum> file
        if (LOG.isInfoEnabled()) {
            LOG.info("Injector: Converting injected urls to crawl db entries.");
        }
        JobConf sortJob = new NutchJob(getConf());
        sortJob.setJobName("inject " + urlDir);
        FileInputFormat.addInputPath(sortJob, urlDir);
        sortJob.setMapperClass(ReInjector.InjectMapper.class);

        FileOutputFormat.setOutputPath(sortJob, tempDir);
        sortJob.setOutputFormat(SequenceFileOutputFormat.class);
        sortJob.setOutputKeyClass(Text.class);
        sortJob.setOutputValueClass(CrawlDatum.class);
        sortJob.setLong("injector.current.time", System.currentTimeMillis());
        JobClient.runJob(sortJob);

        // merge with existing crawl db
        if (LOG.isInfoEnabled()) {
            LOG.info("Injector: Merging injected urls into crawl db.");
        }
        JobConf mergeJob = CrawlDb.createJob(getConf(), crawlDb);
        FileInputFormat.addInputPath(mergeJob, tempDir);
        mergeJob.setReducerClass(ReInjector.InjectReducer.class);
        JobClient.runJob(mergeJob);
        CrawlDb.install(mergeJob, crawlDb);

        // clean up
        FileSystem fs = FileSystem.get(getConf());
        fs.delete(tempDir, true);

        long end = System.currentTimeMillis();
        LOG.info("Injector: finished at " + sdf.format(end) + ", elapsed: "
                + TimingUtil.elapsedTime(start, end));
    } catch (Exception ex) {
        LOG.error("ReInjector run injector error: " + ex.toString(), ex);
    }
}

From source file:net.peacesoft.nutch.crawl.ReLinkDb.java

License:Apache License

private static JobConf createJob(Configuration config, Path linkDb, boolean normalize, boolean filter) {
    Path newLinkDb = new Path("linkdb-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    JobConf job = new NutchJob(config);
    job.setJobName("linkdb " + linkDb);

    job.setInputFormat(SequenceFileInputFormat.class);

    job.setMapperClass(ReLinkDb.class);
    job.setCombinerClass(LinkDbMerger.class);
    // if we don't run the mergeJob, perform normalization/filtering now
    if (normalize || filter) {
        try {
            FileSystem fs = FileSystem.get(config);
            if (!fs.exists(linkDb)) {
                job.setBoolean(LinkDbFilter.URL_FILTERING, filter);
                job.setBoolean(LinkDbFilter.URL_NORMALIZING, normalize);
            }
        } catch (Exception e) {
            LOG.warn("ReLinkDb createJob: " + e);
        }
    }
    job.setReducerClass(LinkDbMerger.class);

    FileOutputFormat.setOutputPath(job, newLinkDb);
    job.setOutputFormat(MapFileOutputFormat.class);
    job.setBoolean("mapred.output.compress", true);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Inlinks.class);

    return job;
}

From source file:net.team1.dev.HousingAnalysis.java

License:Apache License

/**
 * The main entry point for the map/reduce runner.
 *
 * @param args 2 args: <input dir> <output dir>
 * @throws Exception Throws IOException
 */
public static void main(String[] args) throws Exception {
    Path inputDir = new Path(args[0]);
    Path outputDir = new Path(args[1]);
    FileSystem fs = FileSystem.get(new Configuration());

    if (!fs.exists(inputDir))
        throw new IOException("The input path does not exist.");
    if (fs.isFile(inputDir))
        throw new IOException("The input path is a file.");
    if (fs.exists(outputDir))
        fs.delete(outputDir, true);

    // set job configuration
    JobConf conf = new JobConf(HousingAnalysis.class);
    conf.setJobName("housinganalysis");
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setOutputFormat(TextOutputFormat.class);
    conf.setCombinerClass(HousingReducer.class);
    conf.setReducerClass(HousingReducer.class);

    // set multiple input files
    HashMap<Path, Class<? extends Mapper>> inputMappers = getInputFilePaths(inputDir, fs);
    for (Path p : inputMappers.keySet()) {
        MultipleInputs.addInputPath(conf, p, TextInputFormat.class, inputMappers.get(p));
        LOG.info(p.getName() + ": " + inputMappers.get(p).getName());
    }

    // set output
    FileOutputFormat.setOutputPath(conf, outputDir);

    // start the job
    JobClient.runJob(conf);
}

From source file:nl.tudelft.graphalytics.mapreducev2.MapReduceJob.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    FileSystem dfs = FileSystem.get(getConf());
    String inPath = inputPath;

    while (!isFinished()) {
        iteration++;

        // Prepare job configuration
        JobConf jobConfiguration = new JobConf(this.getConf());
        jobConfiguration.setJarByClass(this.getClass());

        jobConfiguration.setMapOutputKeyClass(getMapOutputKeyClass());
        jobConfiguration.setMapOutputValueClass(getMapOutputValueClass());

        jobConfiguration.setMapperClass(getMapperClass());
        if (getCombinerClass() != null)
            jobConfiguration.setCombinerClass(getCombinerClass());
        jobConfiguration.setReducerClass(getReducerClass());

        jobConfiguration.setOutputKeyClass(getOutputKeyClass());
        jobConfiguration.setOutputValueClass(getOutputValueClass());

        jobConfiguration.setInputFormat(getInputFormatClass());
        jobConfiguration.setOutputFormat(getOutputFormatClass());

        if (getNumMappers() != -1)
            jobConfiguration.setNumMapTasks(getNumMappers());
        if (getNumReducers() != -1)
            jobConfiguration.setNumReduceTasks(getNumReducers());

        setConfigurationParameters(jobConfiguration);

        // Set the input and output paths
        String outPath = intermediatePath + "/iteration-" + iteration;
        FileInputFormat.addInputPath(jobConfiguration, new Path(inPath));
        FileOutputFormat.setOutputPath(jobConfiguration, new Path(outPath));

        // Execute the current iteration
        RunningJob jobExecution = JobClient.runJob(jobConfiguration);
        jobExecution.waitForCompletion();

        // Remove the output of the previous job (unless it is the input graph)
        if (iteration != 1) {
            dfs.delete(new Path(inPath), true);
        }
        inPath = outPath;

        processJobOutput(jobExecution);
    }

    // Rename the last job output to the specified output path
    try {
        dfs.mkdirs(new Path(outputPath).getParent());
        dfs.rename(new Path(inPath), new Path(outputPath));
    } catch (Exception e) {
        LOG.warn("Failed to rename MapReduce job output.", e);
    }

    return 0;
}