Example usage for org.apache.hadoop.mapred JobConf setLong

Introduction

This page collects example usages of org.apache.hadoop.mapred.JobConf#setLong from open-source projects.

Prototype

public void setLong(String name, long value) 

Document

Set the value of the name property to a long.
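
Before the project examples, here is a minimal, self-contained sketch of the set/get round trip. The class name SetLongExample and the property key "my.app.max.records" are invented for illustration; the key is not a standard Hadoop property.

import org.apache.hadoop.mapred.JobConf;

public class SetLongExample {
    public static void main(String[] args) {
        JobConf conf = new JobConf();

        // Store a long under an application-defined property name.
        conf.setLong("my.app.max.records", 1000000L);

        // Read it back with the getLong counterpart; the second argument
        // is the default returned when the key is unset.
        long maxRecords = conf.getLong("my.app.max.records", -1L);
        System.out.println("my.app.max.records = " + maxRecords);
    }
}

Internally, setLong stores Long.toString(value), so the same key can also be read back as a string with get(String).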

Usage

From source file:com.rim.logdriver.mapred.avro.AvroBlockInputFormat.java

License:Apache License

@Override
public RecordReader<AvroFileHeader, BytesWritable> getRecordReader(InputSplit split, JobConf job,
        Reporter reporter) throws IOException {

    // Ensure we have sensible defaults for how we build blocks.
    if (job.get("mapreduce.job.max.split.locations") == null) {
        job.setLong("mapreduce.job.max.split.locations", MAX_SPLIT_LOCATIONS);
    }
    if (job.get("mapred.max.split.size") == null) {
        // Try to set the split size to the default block size. In case of
        // failure, we'll use this 128MB default.
        long blockSize = 128 * 1024 * 1024; // 128MB
        try {
            blockSize = FileSystem.get(job).getDefaultBlockSize();
        } catch (IOException e) {
            LOG.error("Error getting filesystem to get get default block size (this does not bode well).");
        }
        job.setLong("mapred.max.split.size", blockSize);
    }

    return new AvroBlockRecordReader(split, job);
}
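
Note the guard pattern above (repeated in the next example): setLong unconditionally overwrites the property, so each key is first checked with get(...) == null to avoid clobbering a value the user has already supplied.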

From source file:com.rim.logdriver.mapred.boom.BoomInputFormat.java

License:Apache License

@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    // Ensure we have sensible defaults for how we build blocks.
    if (job.get("mapreduce.job.max.split.locations") == null) {
        job.setLong("mapreduce.job.max.split.locations", MAX_SPLIT_LOCATIONS);
    }
    if (job.get("mapred.max.split.size") == null) {
        // Try to set the split size to the default block size. In case of
        // failure, we'll use this 128MB default.
        long blockSize = 128 * 1024 * 1024; // 128MB
        try {
            blockSize = FileSystem.get(job).getDefaultBlockSize();
        } catch (IOException e) {
            LOG.error("Error getting filesystem to get get default block size (this does not bode well).");
        }
        job.setLong("mapred.max.split.size", blockSize);
    }

    return super.getSplits(job, numSplits);
}

From source file:com.scaleoutsoftware.soss.hserver.hadoop.MapperWrapperMapred.java

License:Apache License

/**
 * Update the job with details about the file split
 * @param job the job configuration to update
 * @param inputSplit the file split
 */
private void updateJobWithSplit(final JobConf job, Object inputSplit) {
    if (inputSplit instanceof FileSplit) {
        FileSplit fileSplit = (FileSplit) inputSplit;
        try {
            if (fileSplit.getPath() != null) {
                job.set("mapreduce.map.input.file", fileSplit.getPath().toString());
            }
        } catch (IllegalArgumentException e) {
            //Swallow this, it appears in Hive splits, which do not have the path encoded
            //(storage handler for NamedMap is an example).
        }
        job.setLong("mapreduce.map.input.start", fileSplit.getStart());
        job.setLong("mapreduce.map.input.length", fileSplit.getLength());
    }
    LOG.info("Processing split: " + inputSplit);
}

From source file:com.TCG.Nutch_DNS.Generator.java

License:Apache License

/**
 * Generate fetchlists in one or more segments. Whether to filter URLs or not
 * is read from the crawl.generate.filter property in the configuration files.
 * If the property is not found, the URLs are filtered. Same for the
 * normalisation.
 * 
 * @param dbDir
 *          Crawl database directory
 * @param segments
 *          Segments directory
 * @param numLists
 *          Number of reduce tasks
 * @param topN
 *          Number of top URLs to be selected
 * @param curTime
 *          Current time in milliseconds
 * 
 * @return Path to generated segment or null if no entries were selected
 * 
 * @throws IOException
 *           When an I/O error occurs
 */
public Path[] generate(Path dbDir, Path segments, int numLists, long topN, long curTime, boolean filter,
        boolean norm, boolean force, int maxNumSegments) throws IOException {

    Path tempDir = new Path(
            getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + UUID.randomUUID().toString());

    Path lock = new Path(dbDir, CrawlDb.LOCK_NAME);
    FileSystem fs = FileSystem.get(getConf());
    LockUtil.createLockFile(fs, lock, force);

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("Generator: starting at " + sdf.format(start));
    LOG.info("Generator: Selecting best-scoring urls due for fetch.");
    LOG.info("Generator: filtering: " + filter);
    LOG.info("Generator: normalizing: " + norm);
    if (topN != Long.MAX_VALUE) {
        LOG.info("Generator: topN: " + topN);
    }

    // map to inverted subset due for fetch, sort by score
    JobConf job = new NutchJob(getConf());
    job.setJobName("generate: select from " + dbDir);

    if (numLists == -1) { // for politeness make
        numLists = job.getNumMapTasks(); // a partition per fetch task
    }
    if ("local".equals(job.get("mapred.job.tracker")) && numLists != 1) {
        // override
        LOG.info("Generator: jobtracker is 'local', generating exactly one partition.");
        numLists = 1;
    }
    job.setLong(GENERATOR_CUR_TIME, curTime);
    // record real generation time
    long generateTime = System.currentTimeMillis();
    job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
    job.setLong(GENERATOR_TOP_N, topN);
    job.setBoolean(GENERATOR_FILTER, filter);
    job.setBoolean(GENERATOR_NORMALISE, norm);
    job.setInt(GENERATOR_MAX_NUM_SEGMENTS, maxNumSegments);

    FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
    job.setInputFormat(SequenceFileInputFormat.class);

    job.setMapperClass(Selector.class);
    job.setPartitionerClass(Selector.class);
    job.setReducerClass(Selector.class);

    FileOutputFormat.setOutputPath(job, tempDir);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(FloatWritable.class);
    job.setOutputKeyComparatorClass(DecreasingFloatComparator.class);
    job.setOutputValueClass(SelectorEntry.class);
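    // Note: this second setOutputFormat call takes effect, replacing the
    // SequenceFileOutputFormat configured a few lines above.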
    job.setOutputFormat(GeneratorOutputFormat.class);

    try {
        JobClient.runJob(job);
    } catch (IOException e) {
        LockUtil.removeLockFile(fs, lock);
        fs.delete(tempDir, true);
        throw e;
    }

    // read the subdirectories generated in the temp
    // output and turn them into segments
    List<Path> generatedSegments = new ArrayList<Path>();

    FileStatus[] status = fs.listStatus(tempDir);
    try {
        for (FileStatus stat : status) {
            Path subfetchlist = stat.getPath();
            if (!subfetchlist.getName().startsWith("fetchlist-"))
                continue;
            // start a new partition job for this segment
            Path newSeg = partitionSegment(fs, segments, subfetchlist, numLists);
            generatedSegments.add(newSeg);
        }
    } catch (Exception e) {
        LOG.warn("Generator: exception while partitioning segments, exiting ...");
        fs.delete(tempDir, true);
        return null;
    }

    if (generatedSegments.size() == 0) {
        LOG.warn("Generator: 0 records selected for fetching, exiting ...");
        LockUtil.removeLockFile(fs, lock);
        fs.delete(tempDir, true);
        return null;
    }

    if (getConf().getBoolean(GENERATE_UPDATE_CRAWLDB, false)) {
        // update the db from tempDir
        Path tempDir2 = new Path(
                getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + UUID.randomUUID().toString());

        job = new NutchJob(getConf());
        job.setJobName("generate: updatedb " + dbDir);
        job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
        for (Path segmpaths : generatedSegments) {
            Path subGenDir = new Path(segmpaths, CrawlDatum.GENERATE_DIR_NAME);
            FileInputFormat.addInputPath(job, subGenDir);
        }
        FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
        job.setInputFormat(SequenceFileInputFormat.class);
        job.setMapperClass(CrawlDbUpdater.class);
        job.setReducerClass(CrawlDbUpdater.class);
        job.setOutputFormat(MapFileOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(CrawlDatum.class);
        FileOutputFormat.setOutputPath(job, tempDir2);
        try {
            JobClient.runJob(job);
            CrawlDb.install(job, dbDir);
        } catch (IOException e) {
            LockUtil.removeLockFile(fs, lock);
            fs.delete(tempDir, true);
            fs.delete(tempDir2, true);
            throw e;
        }
        fs.delete(tempDir2, true);
    }

    LockUtil.removeLockFile(fs, lock);
    fs.delete(tempDir, true);

    long end = System.currentTimeMillis();
    LOG.info("Generator: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));

    Path[] patharray = new Path[generatedSegments.size()];
    return generatedSegments.toArray(patharray);
}

From source file:com.TCG.Nutch_DNS.Injector.java

License:Apache License

public void inject(Path hostDb, Path crawlDb) throws IOException {
    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    if (LOG.isInfoEnabled()) {
        LOG.info("Injector: starting at " + sdf.format(start));
        LOG.info("Injector: hostDb: " + hostDb);
        LOG.info("Injector: carwlDb: " + crawlDb);
    }

    Path tempDir = new Path(getConf().get("mapred.temp.dir", ".") + "/inject-temp-"
            + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    // map text input file to a <url,CrawlDatum> file
    if (LOG.isInfoEnabled()) {
        LOG.info("Injector: Converting injected host to host db entries.");
    }

    FileSystem fs = FileSystem.get(getConf());
    // determine if the crawldb already exists
    boolean dbExists = fs.exists(hostDb);

    JobConf sortJob = new NutchJob(getConf());
    sortJob.setJobName("inject " + hostDb);
    FileInputFormat.addInputPath(sortJob, crawlDb);
    sortJob.setMapperClass(InjectMapper.class);

    FileOutputFormat.setOutputPath(sortJob, tempDir);
    if (dbExists) {
        sortJob.setOutputFormat(SequenceFileOutputFormat.class);
        // host db already exists: reduce hosts with ExitHostReducer
        sortJob.setReducerClass(ExitHostReducer.class);
    } else {
        sortJob.setOutputFormat(MapFileOutputFormat.class);

        // no host db yet: reduce hosts with NotExitHostReducer
        sortJob.setReducerClass(NotExitHostReducer.class);

        sortJob.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);
    }
    sortJob.setOutputKeyClass(Text.class);
    sortJob.setOutputValueClass(CrawlDatum.class);
    sortJob.setLong("injector.current.time", System.currentTimeMillis());

    RunningJob mapJob = null;
    try {
        mapJob = JobClient.runJob(sortJob);
    } catch (IOException e) {
        fs.delete(tempDir, true);
        throw e;
    }

    if (dbExists) {

        // merge with existing host db
        if (LOG.isInfoEnabled()) {
            LOG.info("Injector: Merging injected hostDb into old hostDb.");
        }
        JobConf mergeJob = HostDb.createJob(getConf(), hostDb);
        FileInputFormat.addInputPath(mergeJob, tempDir);
        // HostDb.createJob sets HostDbReducer by default; override it with InjectReducer
        mergeJob.setReducerClass(InjectReducer.class);
        try {
            RunningJob merge = JobClient.runJob(mergeJob);
        } catch (IOException e) {
            fs.delete(tempDir, true);
            throw e;
        }
        HostDb.install(mergeJob, hostDb);
    } else {
        HostDb.install(sortJob, hostDb);
    }

    // clean up
    fs.delete(tempDir, true);

    long end = System.currentTimeMillis();
    LOG.info("Injector: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}

From source file:Corrector.Config.java

License:Apache License

public static void initializeConfiguration(JobConf conf) {
    validateConfiguration();

    conf.setNumMapTasks(HADOOP_MAPPERS);
    conf.setNumReduceTasks(HADOOP_REDUCERS);
    conf.set("mapred.child.java.opts", HADOOP_JAVAOPTS);
    conf.set("mapred.task.timeout", Long.toString(HADOOP_TIMEOUT));
    conf.setLong("LOCALNODES", HADOOP_LOCALNODES);

    conf.setLong("RANDOM_PASS", RANDOM_PASS);

    conf.setLong("UP_KMER", UP_KMER);
    conf.setLong("LOW_KMER", LOW_KMER);
    conf.setLong("K", K);
    conf.setLong("READLENGTH", READLEN);

}
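
Values stored with setLong are read back inside tasks via the getLong counterpart. A minimal sketch of the consumer side, using a hypothetical mapper base class that is not part of the Corrector project:

import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;

// Hypothetical fragment: recovers the K value that
// initializeConfiguration() stored with conf.setLong("K", K).
public abstract class KmerMapperBase extends MapReduceBase {
    protected long k;

    @Override
    public void configure(JobConf job) {
        // getLong parses the stored string back into a long and
        // returns the default (-1 here) when the key is unset.
        k = job.getLong("K", -1L);
    }
}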

From source file:Corrector.FindError.java

License:Apache License

public RunningJob run(String inputPath, String outputPath, int idx, String hkmerlist) throws Exception {
    sLogger.info("Tool name: FindError");
    sLogger.info(" - input: " + inputPath);
    sLogger.info(" - output: " + outputPath);

    JobConf conf = new JobConf(FindError.class);
    conf.setJobName("FindError " + inputPath + " " + Config.K);
    conf.setLong("IDX", idx);
    //\\
    DistributedCache.addCacheFile(new URI(hkmerlist), conf);
    //\\

    Config.initializeConfiguration(conf);

    FileInputFormat.addInputPath(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(Text.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(FindErrorMapper.class);
    conf.setReducerClass(FindErrorReducer.class);

    //delete the output directory if it exists already
    FileSystem.get(conf).delete(new Path(outputPath), true);

    return JobClient.runJob(conf);
}

From source file:Corrector.IdentifyTrustedReads.java

License:Apache License

public RunningJob run(String inputPath, String outputPath, long kmer_threshold) throws Exception {
    sLogger.info("Tool name: IdentifyTrustedReads");
    sLogger.info(" - input: " + inputPath);
    sLogger.info(" - output: " + outputPath);

    JobConf conf = new JobConf(IdentifyTrustedReads.class);
    conf.setJobName("IdentifyTrustedReads " + inputPath + " " + Config.K);
    conf.setLong("KmerThreshold", kmer_threshold);
    // conf.setLong("AllKmer", allkmer);

    Config.initializeConfiguration(conf);

    FileInputFormat.addInputPath(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(IntWritable.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(IdentifyTrustedReads.IdentifyTrustedReadsMapper.class);
    conf.setReducerClass(IdentifyTrustedReads.IdentifyTrustedReadsReducer.class);

    //delete the output directory if it exists already
    FileSystem.get(conf).delete(new Path(outputPath), true);

    return JobClient.runJob(conf);
}

From source file:Corrector.PreCorrect.java

License:Apache License

public RunningJob run(String inputPath, String outputPath, int idx, String hkmerlist) throws Exception {
    sLogger.info("Tool name: PreCorrect");
    sLogger.info(" - input: " + inputPath);
    sLogger.info(" - output: " + outputPath);

    JobConf conf = new JobConf(PreCorrect.class);
    conf.setJobName("PreCorrect " + inputPath + " " + Config.K);
    conf.setLong("IDX", idx);
    //\\
    DistributedCache.addCacheFile(new URI(hkmerlist), conf);
    //\\

    Config.initializeConfiguration(conf);

    FileInputFormat.addInputPath(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(Text.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(PreCorrectMapper.class);
    conf.setReducerClass(PreCorrectReducer.class);

    //delete the output directory if it exists already
    FileSystem.get(conf).delete(new Path(outputPath), true);

    return JobClient.runJob(conf);
}

From source file:edu.uci.ics.hyracks.hadoop.compat.util.HadoopAdapter.java

License:Apache License

private Object[] getInputSplits(JobConf conf, int desiredMaxMappers) throws Exception {
    Object[] splits = getInputSplits(conf);
    if (splits.length > desiredMaxMappers) {
        long totalInputSize = getInputSize(splits, conf);
        long goalSize = (totalInputSize / desiredMaxMappers);
        conf.setLong("mapred.min.split.size", goalSize);
        conf.setNumMapTasks(desiredMaxMappers);
        splits = getInputSplits(conf);
    }
    return splits;
}
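
Because the old-API FileInputFormat computes each split size as max(minSize, min(goalSize, blockSize)), raising mapred.min.split.size to totalInputSize / desiredMaxMappers coarsens the splits, so the recomputed array contains at most roughly desiredMaxMappers entries.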