Usage examples for org.apache.hadoop.mapred.JobConf.getNumMapTasks()
public int getNumMapTasks()
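Before the project examples, here is a minimal, self-contained sketch of what getNumMapTasks() reports (the class name NumMapTasksExample is illustrative and not taken from any of the projects below). The method returns the configured value of mapreduce.job.maps (historically mapred.map.tasks); setNumMapTasks() only records a hint, since the actual number of map tasks is determined by the InputFormat's splits at submission time.

import org.apache.hadoop.mapred.JobConf;

public class NumMapTasksExample {
    public static void main(String[] args) {
        JobConf conf = new JobConf();

        // Reports whatever mapreduce.job.maps resolves to in the loaded configuration.
        System.out.println("Configured map tasks: " + conf.getNumMapTasks());

        // setNumMapTasks() is only a hint to the framework; the real map count
        // comes from the input splits computed at job submission.
        conf.setNumMapTasks(8);
        System.out.println("After setNumMapTasks(8): " + conf.getNumMapTasks()); // prints 8
    }
}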
From source file:gov.nasa.jpl.memex.pooledtimeseries.SimilarityCalculation.java
License:Apache License
public static void main(String[] args) throws Exception {
    System.loadLibrary(Core.NATIVE_LIBRARY_NAME);

    Configuration baseConf = new Configuration();
    baseConf.set("mapreduce.job.maps", "96");
    baseConf.set("mapreduce.job.reduces", "0");
    baseConf.set("mapred.tasktracker.map.tasks.maximum", "96");
    baseConf.set("meanDistsFilePath", args[2]);

    JobConf conf = new JobConf();
    System.out.println("Before Map:" + conf.getNumMapTasks());
    conf.setNumMapTasks(196);
    System.out.println("After Map:" + conf.getNumMapTasks());

    Job job = Job.getInstance(baseConf);
    System.out.println("Track: " + baseConf.get("mapred.job.tracker"));
    System.out.println("Job ID" + job.getJobID());
    System.out.println("Job Name" + job.getJobName());
    System.out.println(baseConf.get("mapreduce.job.maps"));

    job.setJarByClass(SimilarityCalculation.class);
    job.setJobName("similarity_calc");

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    FileInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(Map.class);

    job.waitForCompletion(true);
}
From source file:net.peacesoft.nutch.crawl.ReGenerator.java
License:Apache License
/**
 * Generate fetchlists in one or more segments. Whether to filter URLs or
 * not is read from the crawl.generate.filter property in the configuration
 * files. If the property is not found, the URLs are filtered. Same for the
 * normalisation.
 *
 * @param dbDir Crawl database directory
 * @param segments Segments directory
 * @param numLists Number of reduce tasks
 * @param topN Number of top URLs to be selected
 * @param curTime Current time in milliseconds
 *
 * @return Path to generated segment or null if no entries were selected
 *
 * @throws IOException When an I/O error occurs
 */
public Path[] generate(Path dbDir, Path segments, int numLists, long topN, long curTime, boolean filter,
        boolean norm, boolean force, int maxNumSegments) throws IOException {
    try {
        Path tempDir = new Path(
                getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());

        Path lock = new Path(dbDir, CrawlDb.LOCK_NAME);
        FileSystem fs = FileSystem.get(getConf());
        LockUtil.createLockFile(fs, lock, force);

        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        long start = System.currentTimeMillis();
        LOG.info("ReGenerator: starting at " + sdf.format(start));
        LOG.info("ReGenerator: Selecting best-scoring urls due for fetch.");
        LOG.info("ReGenerator: filtering: " + filter);
        LOG.info("ReGenerator: normalizing: " + norm);
        if (topN != Long.MAX_VALUE) {
            LOG.info("ReGenerator: topN: " + topN);
        }

        if ("true".equals(getConf().get(GENERATE_MAX_PER_HOST_BY_IP))) {
            LOG.info("ReGenerator: GENERATE_MAX_PER_HOST_BY_IP will be ignored, use partition.url.mode instead");
        }

        // map to inverted subset due for fetch, sort by score
        JobConf job = new NutchJob(getConf());
        job.setJobName("generate: select from " + dbDir);

        if (numLists == -1) { // for politeness make
            numLists = job.getNumMapTasks(); // a partition per fetch task
        }
        if ("local".equals(job.get("mapred.job.tracker")) && numLists != 1) {
            // override
            LOG.info("ReGenerator: jobtracker is 'local', generating exactly one partition.");
            numLists = 1;
        }
        job.setLong(GENERATOR_CUR_TIME, curTime);
        // record real generation time
        long generateTime = System.currentTimeMillis();
        job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
        job.setLong(GENERATOR_TOP_N, topN);
        job.setBoolean(GENERATOR_FILTER, filter);
        job.setBoolean(GENERATOR_NORMALISE, norm);
        job.setInt(GENERATOR_MAX_NUM_SEGMENTS, maxNumSegments);

        FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
        job.setInputFormat(SequenceFileInputFormat.class);

        job.setMapperClass(Selector.class);
        job.setPartitionerClass(Selector.class);
        job.setReducerClass(Selector.class);

        FileOutputFormat.setOutputPath(job, tempDir);
        job.setOutputFormat(SequenceFileOutputFormat.class);
        job.setOutputKeyClass(FloatWritable.class);
        job.setOutputKeyComparatorClass(DecreasingFloatComparator.class);
        job.setOutputValueClass(SelectorEntry.class);
        job.setOutputFormat(GeneratorOutputFormat.class);

        try {
            JobClient.runJob(job);
        } catch (IOException e) {
            throw e;
        }

        // read the subdirectories generated in the temp
        // output and turn them into segments
        List<Path> generatedSegments = new ArrayList<Path>();

        FileStatus[] status = fs.listStatus(tempDir);
        try {
            for (FileStatus stat : status) {
                Path subfetchlist = stat.getPath();
                if (!subfetchlist.getName().startsWith("fetchlist-")) {
                    continue;
                }
                // start a new partition job for this segment
                Path newSeg = partitionSegment(fs, segments, subfetchlist, numLists);
                generatedSegments.add(newSeg);
            }
        } catch (Exception e) {
            LOG.warn("ReGenerator: exception while partitioning segments, exiting ...");
            fs.delete(tempDir, true);
            return null;
        }

        if (generatedSegments.size() == 0) {
            LOG.warn("ReGenerator: 0 records selected for fetching, exiting ...");
            LockUtil.removeLockFile(fs, lock);
            fs.delete(tempDir, true);
            return null;
        }

        if (getConf().getBoolean(GENERATE_UPDATE_CRAWLDB, false)) {
            // update the db from tempDir
            Path tempDir2 = new Path(
                    getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());

            job = new NutchJob(getConf());
            job.setJobName("generate: updatedb " + dbDir);
            job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
            for (Path segmpaths : generatedSegments) {
                Path subGenDir = new Path(segmpaths, CrawlDatum.GENERATE_DIR_NAME);
                FileInputFormat.addInputPath(job, subGenDir);
            }
            FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
            job.setInputFormat(SequenceFileInputFormat.class);
            job.setMapperClass(CrawlDbUpdater.class);
            job.setReducerClass(CrawlDbUpdater.class);
            job.setOutputFormat(MapFileOutputFormat.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(CrawlDatum.class);
            FileOutputFormat.setOutputPath(job, tempDir2);
            try {
                JobClient.runJob(job);
                CrawlDb.install(job, dbDir);
            } catch (IOException e) {
                LockUtil.removeLockFile(fs, lock);
                fs.delete(tempDir, true);
                fs.delete(tempDir2, true);
                throw e;
            }
            fs.delete(tempDir2, true);
        }

        LockUtil.removeLockFile(fs, lock);
        fs.delete(tempDir, true);

        long end = System.currentTimeMillis();
        LOG.info("ReGenerator: finished at " + sdf.format(end) + ", elapsed: "
                + TimingUtil.elapsedTime(start, end));

        Path[] patharray = new Path[generatedSegments.size()];
        return generatedSegments.toArray(patharray);
    } catch (Exception ex) {
        LOG.error("ReGenerator generate error: " + ex.toString(), ex);
        return null;
    }
}
From source file:org.apache.druid.indexer.hadoop.DatasourceInputFormat.java
License:Apache License
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException {
    JobConf conf = new JobConf(context.getConfiguration());

    List<String> dataSources = getDataSources(conf);
    List<InputSplit> splits = new ArrayList<>();

    for (String dataSource : dataSources) {
        List<WindowedDataSegment> segments = getSegments(conf, dataSource);
        if (segments == null || segments.size() == 0) {
            throw new ISE("No segments found to read for dataSource[%s]", dataSource);
        }

        // Note: Each segment is logged separately to avoid creating a huge String if we are loading lots of segments.
        for (int i = 0; i < segments.size(); i++) {
            final WindowedDataSegment segment = segments.get(i);
            logger.info("Segment %,d/%,d for dataSource[%s] has identifier[%s], interval[%s]", i,
                    segments.size(), dataSource, segment.getSegment().getId(), segment.getInterval());
        }

        long maxSize = getMaxSplitSize(conf, dataSource);
        if (maxSize < 0) {
            long totalSize = 0;
            for (WindowedDataSegment segment : segments) {
                totalSize += segment.getSegment().getSize();
            }
            int mapTask = conf.getNumMapTasks();
            if (mapTask > 0) {
                maxSize = totalSize / mapTask;
            }
        }

        if (maxSize > 0) {
            // combining is to happen, let us sort the segments list by size so that they
            // are combined appropriately
            segments.sort(Comparator.comparingLong(s -> s.getSegment().getSize()));
        }

        List<WindowedDataSegment> list = new ArrayList<>();
        long size = 0;

        org.apache.hadoop.mapred.InputFormat fio = supplier.get();
        for (WindowedDataSegment segment : segments) {
            if (size + segment.getSegment().getSize() > maxSize && size > 0) {
                splits.add(toDataSourceSplit(list, fio, conf));
                list = new ArrayList<>();
                size = 0;
            }

            list.add(segment);
            size += segment.getSegment().getSize();
        }

        if (list.size() > 0) {
            splits.add(toDataSourceSplit(list, fio, conf));
        }
    }

    logger.info("Number of splits [%d]", splits.size());
    return splits;
}
From source file:org.apache.mahout.df.mapred.partial.PartialBuilder.java
License:Apache License
@Override
protected DecisionForest parseOutput(JobConf job, PredictionCallback callback) throws IOException {
    int numMaps = job.getNumMapTasks();
    int numTrees = Builder.getNbTrees(job);

    Path outputPath = getOutputPath(job);

    log.info("Computing partitions' first ids...");
    Step0Job step0 = new Step0Job(getOutputPath(job), getDataPath(), getDatasetPath());
    Step0Output[] partitions = step0.run(getConf());

    log.info("Processing the output...");
    TreeID[] keys = new TreeID[numTrees];
    Node[] trees = new Node[numTrees];
    int[] firstIds = Step0Output.extractFirstIds(partitions);
    processOutput(job, outputPath, firstIds, keys, trees, callback);

    // call the second step in order to complete the oob predictions
    if ((callback != null) && (numMaps > 1) && isStep2(getConf())) {
        log.info("*****************************");
        log.info("Second Step");
        log.info("*****************************");
        Step2Job step2 = new Step2Job(getOutputPath(job), getDataPath(), getDatasetPath(), partitions);

        step2.run(job, keys, trees, callback);
    }

    return new DecisionForest(Arrays.asList(trees));
}
From source file:org.apache.mahout.df.mapred.partial.PartialSequentialBuilder.java
License:Apache License
@Override
protected void configureJob(JobConf job, int nbTrees, boolean oobEstimate) throws IOException {
    int numMaps = job.getNumMapTasks();

    super.configureJob(job, nbTrees, oobEstimate);

    // PartialBuilder sets the number of maps to 1 if we are running in 'local'
    job.setNumMapTasks(numMaps);
}
From source file:org.apache.mahout.df.mapred.partial.PartialSequentialBuilder.java
License:Apache License
@Override
protected void runJob(JobConf job) throws IOException {
    // retrieve the splits
    TextInputFormat input = (TextInputFormat) job.getInputFormat();
    InputSplit[] splits = input.getSplits(job, job.getNumMapTasks());
    log.debug("Nb splits : {}", splits.length);

    InputSplit[] sorted = Arrays.copyOf(splits, splits.length);
    Builder.sortSplits(sorted);

    int numTrees = Builder.getNbTrees(job); // total number of trees

    firstOutput = new PartialOutputCollector(numTrees);
    Reporter reporter = Reporter.NULL;

    firstIds = new int[splits.length];
    sizes = new int[splits.length];

    // to compute firstIds, process the splits in file order
    int firstId = 0;
    long slowest = 0; // duration of slowest map
    for (InputSplit split : splits) {
        int hp = ArrayUtils.indexOf(sorted, split); // hadoop's partition

        RecordReader<LongWritable, Text> reader = input.getRecordReader(split, job, reporter);

        LongWritable key = reader.createKey();
        Text value = reader.createValue();

        Step1Mapper mapper = new MockStep1Mapper(getTreeBuilder(), dataset, getSeed(), hp, splits.length,
                numTrees);

        long time = System.currentTimeMillis();

        firstIds[hp] = firstId;

        while (reader.next(key, value)) {
            mapper.map(key, value, firstOutput, reporter);
            firstId++;
            sizes[hp]++;
        }

        mapper.close();

        time = System.currentTimeMillis() - time;
        log.info("Duration : {}", DFUtils.elapsedTime(time));

        if (time > slowest) {
            slowest = time;
        }
    }

    log.info("Longest duration : {}", DFUtils.elapsedTime(slowest));
}
From source file:org.apache.mahout.df.mapred.partial.PartialSequentialBuilder.java
License:Apache License
/**
 * The second step uses the trees to predict the rest of the instances outside
 * their own partition
 *
 * @throws IOException
 */
void secondStep(JobConf job, Path forestPath, PredictionCallback callback) throws IOException {
    // retrieve the splits
    TextInputFormat input = (TextInputFormat) job.getInputFormat();
    InputSplit[] splits = input.getSplits(job, job.getNumMapTasks());
    log.debug("Nb splits : {}", splits.length);

    Builder.sortSplits(splits);

    int numTrees = Builder.getNbTrees(job); // total number of trees

    // compute the expected number of outputs
    int total = 0;
    for (int p = 0; p < splits.length; p++) {
        total += Step2Mapper.nbConcerned(splits.length, numTrees, p);
    }

    secondOutput = new PartialOutputCollector(total);
    Reporter reporter = Reporter.NULL;
    long slowest = 0; // duration of slowest map

    for (int partition = 0; partition < splits.length; partition++) {
        InputSplit split = splits[partition];
        RecordReader<LongWritable, Text> reader = input.getRecordReader(split, job, reporter);

        LongWritable key = reader.createKey();
        Text value = reader.createValue();

        // load the output of the 1st step
        int nbConcerned = Step2Mapper.nbConcerned(splits.length, numTrees, partition);
        TreeID[] fsKeys = new TreeID[nbConcerned];
        Node[] fsTrees = new Node[nbConcerned];

        FileSystem fs = forestPath.getFileSystem(job);
        int numInstances = InterResults.load(fs, forestPath, splits.length, numTrees, partition, fsKeys,
                fsTrees);

        Step2Mapper mapper = new Step2Mapper();
        mapper.configure(partition, dataset, fsKeys, fsTrees, numInstances);

        long time = System.currentTimeMillis();

        while (reader.next(key, value)) {
            mapper.map(key, value, secondOutput, reporter);
        }

        mapper.close();

        time = System.currentTimeMillis() - time;
        log.info("Duration : {}", DFUtils.elapsedTime(time));

        if (time > slowest) {
            slowest = time;
        }
    }

    log.info("Longest duration : {}", DFUtils.elapsedTime(slowest));
}
From source file:org.apache.mahout.df.mapred.partial.Step0Job.java
License:Apache License
/**
 * Extracts the output and processes it
 *
 * @param job
 *
 * @return firstIds for each partition in Hadoop's order
 * @throws IOException
 */
protected Step0Output[] parseOutput(JobConf job) throws IOException {
    int numMaps = job.getNumMapTasks();
    FileSystem fs = outputPath.getFileSystem(job);

    Path[] outfiles = DFUtils.listOutputFiles(fs, outputPath);

    int[] keys = new int[numMaps];
    Step0Output[] values = new Step0Output[numMaps];

    // read all the outputs
    IntWritable key = new IntWritable();
    Step0Output value = new Step0Output(0L, 0);

    int index = 0;
    for (Path path : outfiles) {
        Reader reader = new Reader(fs, path, job);

        try {
            while (reader.next(key, value)) {
                keys[index] = key.get();
                values[index] = value.clone();

                index++;
            }
        } finally {
            reader.close();
        }
    }

    return processOutput(keys, values);
}
From source file:org.apache.mahout.df.mapred.partial.Step1Mapper.java
License:Apache License
@Override
public void configure(JobConf job) {
    super.configure(job);

    configure(Builder.getRandomSeed(job), job.getInt("mapred.task.partition", -1), job.getNumMapTasks(),
            Builder.getNbTrees(job));
}
From source file:org.apache.mahout.df.mapred.partial.Step2Job.java
License:Apache License
/**
 * Extracts the output and processes it
 *
 * @param job
 * @param callback
 * @throws IOException
 */
protected void parseOutput(JobConf job, PredictionCallback callback) throws IOException {
    int numMaps = job.getNumMapTasks();
    int numTrees = Builder.getNbTrees(job);

    // compute the total number of output values
    //int total = 0;
    for (int partition = 0; partition < numMaps; partition++) {
        //total += Step2Mapper.nbConcerned(numMaps, numTrees, partition);
        Step2Mapper.nbConcerned(numMaps, numTrees, partition);
    }

    int[] firstIds = Step0Output.extractFirstIds(partitions);
    PartialBuilder.processOutput(job, outputPath, firstIds, null, null, callback);
}