Example usage for org.apache.hadoop.mapreduce.lib.map MultithreadedMapper setNumberOfThreads

Introduction

On this page you can find example usage of org.apache.hadoop.mapreduce.lib.map.MultithreadedMapper.setNumberOfThreads.

Prototype

public static void setNumberOfThreads(Job job, int threads) 

Document

Set the number of threads in the pool for running maps.
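
MultithreadedMapper runs the configured mapper on a pool of threads inside a single map task; per the Hadoop javadoc it is intended for map operations that are not CPU bound, and mapper implementations used with it must be thread-safe. The usual pattern, repeated in all of the examples below, is to set MultithreadedMapper as the job's mapper class, register the real mapper with MultithreadedMapper.setMapperClass(job, ...), and then size the pool with setNumberOfThreads(job, n). The following is a minimal, self-contained sketch of that pattern; the mapper class, job name, thread count, and paths are illustrative and not taken from any of the projects listed under Usage.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.map.MultithreadedMapper;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class MultithreadedLineLengthJob {

    // Hypothetical mapper used only for illustration. Mappers run under
    // MultithreadedMapper execute concurrently and must be thread-safe.
    public static class LineLengthMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            context.write(new Text("length"), new LongWritable(value.getLength()));
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "MultithreadedMapper sketch");
        job.setJarByClass(MultithreadedLineLengthJob.class);

        // Run LineLengthMapper inside MultithreadedMapper's thread pool.
        job.setMapperClass(MultithreadedMapper.class);
        MultithreadedMapper.setMapperClass(job, LineLengthMapper.class);
        MultithreadedMapper.setNumberOfThreads(job, 8); // threads per map task

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}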

Usage

From source file:ml.shifu.shifu.core.processor.VarSelectModelProcessor.java

License:Apache License
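
In this example, Shifu's variable-selection wrapper job optionally runs VarSelectMapper inside a MultithreadedMapper: when multi-threaded SE variable selection is enabled, the thread count is parsed from configuration and falls back to a default if the value is not a valid integer.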

private Job createSEMapReduceJob(SourceType source, Configuration conf, String varSelectMSEOutputPath)
        throws IOException {
    @SuppressWarnings("deprecation")
    Job job = new Job(conf, "Shifu: Variable Selection Wrapper Job : " + this.modelConfig.getModelSetName());
    job.setJarByClass(getClass());
    boolean isSEVarSelMulti = Boolean.TRUE.toString().equalsIgnoreCase(
            Environment.getProperty(Constants.SHIFU_VARSEL_SE_MULTI, Constants.SHIFU_DEFAULT_VARSEL_SE_MULTI));
    if (isSEVarSelMulti) {
        job.setMapperClass(MultithreadedMapper.class);
        MultithreadedMapper.setMapperClass(job, VarSelectMapper.class);
        int threads;
        try {
            threads = Integer.parseInt(Environment.getProperty(Constants.SHIFU_VARSEL_SE_MULTI_THREAD,
                    Constants.SHIFU_DEFAULT_VARSEL_SE_MULTI_THREAD + ""));
        } catch (Exception e) {
            Log.warn("'shifu.varsel.se.multi.thread' should be an int value, set default value: {}",
                    Constants.SHIFU_DEFAULT_VARSEL_SE_MULTI_THREAD);
            threads = Constants.SHIFU_DEFAULT_VARSEL_SE_MULTI_THREAD;
        }
        MultithreadedMapper.setNumberOfThreads(job, threads);
    } else {
        job.setMapperClass(VarSelectMapper.class);
    }

    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(ColumnInfo.class);
    job.setInputFormatClass(TextInputFormat.class);
    FileInputFormat.setInputPaths(job, ShifuFileUtils.getFileSystemBySourceType(source)
            .makeQualified(new Path(super.getPathFinder().getNormalizedDataPath())));

    job.setReducerClass(VarSelectReducer.class);
    // Only one reducer; no need to set a combiner because map output keys are distinct.
    job.setNumReduceTasks(1);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    FileOutputFormat.setOutputPath(job, new Path(varSelectMSEOutputPath));
    MultipleOutputs.addNamedOutput(job, Constants.SHIFU_VARSELECT_SE_OUTPUT_NAME, TextOutputFormat.class,
            Text.class, Text.class);
    return job;
}

From source file:org.apache.mahout.cf.taste.hadoop.als.ParallelALSFactorizationJob.java

License:Apache License
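
Here, Mahout's parallel ALS factorization job recomputes the user or item feature matrix with a MultithreadedSharingMapper, sizing the thread pool from the numThreadsPerSolver setting.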

private void runSolver(Path ratings, Path output, Path pathToUorM, int currentIteration, String matrixName,
        int numEntities) throws ClassNotFoundException, IOException, InterruptedException {

    // necessary for local execution in the same JVM only
    SharingMapper.reset();

    Class<? extends Mapper<IntWritable, VectorWritable, IntWritable, VectorWritable>> solverMapperClassInternal;
    String name;

    if (implicitFeedback) {
        solverMapperClassInternal = SolveImplicitFeedbackMapper.class;
        name = "Recompute " + matrixName + ", iteration (" + currentIteration + '/' + numIterations + "), "
                + '(' + numThreadsPerSolver + " threads, " + numFeatures + " features, implicit feedback)";
    } else {
        solverMapperClassInternal = SolveExplicitFeedbackMapper.class;
        name = "Recompute " + matrixName + ", iteration (" + currentIteration + '/' + numIterations + "), "
                + '(' + numThreadsPerSolver + " threads, " + numFeatures + " features, explicit feedback)";
    }

    Job solverForUorI = prepareJob(ratings, output, SequenceFileInputFormat.class,
            MultithreadedSharingMapper.class, IntWritable.class, VectorWritable.class,
            SequenceFileOutputFormat.class, name);
    Configuration solverConf = solverForUorI.getConfiguration();
    solverConf.set(LAMBDA, String.valueOf(lambda));
    solverConf.set(ALPHA, String.valueOf(alpha));
    solverConf.setInt(NUM_FEATURES, numFeatures);
    solverConf.set(NUM_ENTITIES, String.valueOf(numEntities));

    FileSystem fs = FileSystem.get(pathToUorM.toUri(), solverConf);
    FileStatus[] parts = fs.listStatus(pathToUorM, PathFilters.partFilter());
    for (FileStatus part : parts) {
        if (log.isDebugEnabled()) {
            log.debug("Adding {} to distributed cache", part.getPath().toString());
        }
        DistributedCache.addCacheFile(part.getPath().toUri(), solverConf);
    }

    MultithreadedMapper.setMapperClass(solverForUorI, solverMapperClassInternal);
    MultithreadedMapper.setNumberOfThreads(solverForUorI, numThreadsPerSolver);

    boolean succeeded = solverForUorI.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }
}

From source file:org.apache.mahout.cf.taste.hadoop.als.RecommenderJob.java

License:Apache License
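
Mahout's ALS RecommenderJob runs PredictionMapper inside a MultithreadedSharingMapper, with the pool size taken from the numThreads command-line option.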

@Override
public int run(String[] args) throws Exception {

    addInputOption();
    addOption("userFeatures", null, "path to the user feature matrix", true);
    addOption("itemFeatures", null, "path to the item feature matrix", true);
    addOption("numRecommendations", null, "number of recommendations per user",
            String.valueOf(DEFAULT_NUM_RECOMMENDATIONS));
    addOption("maxRating", null, "maximum rating available", true);
    addOption("numThreads", null, "threads per mapper", String.valueOf(1));
    addOption("usesLongIDs", null, "input contains long IDs that need to be translated");
    addOption("userIDIndex", null, "index for user long IDs (necessary if usesLongIDs is true)");
    addOption("itemIDIndex", null, "index for user long IDs (necessary if usesLongIDs is true)");
    addOutputOption();

    Map<String, List<String>> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    Job prediction = prepareJob(getInputPath(), getOutputPath(), SequenceFileInputFormat.class,
            MultithreadedSharingMapper.class, IntWritable.class, RecommendedItemsWritable.class,
            TextOutputFormat.class);
    Configuration conf = prediction.getConfiguration();

    int numThreads = Integer.parseInt(getOption("numThreads"));

    conf.setInt(NUM_RECOMMENDATIONS, Integer.parseInt(getOption("numRecommendations")));
    conf.set(USER_FEATURES_PATH, getOption("userFeatures"));
    conf.set(ITEM_FEATURES_PATH, getOption("itemFeatures"));
    conf.set(MAX_RATING, getOption("maxRating"));

    boolean usesLongIDs = Boolean.parseBoolean(getOption("usesLongIDs"));
    if (usesLongIDs) {
        conf.set(ParallelALSFactorizationJob.USES_LONG_IDS, String.valueOf(true));
        conf.set(USER_INDEX_PATH, getOption("userIDIndex"));
        conf.set(ITEM_INDEX_PATH, getOption("itemIDIndex"));
    }

    MultithreadedMapper.setMapperClass(prediction, PredictionMapper.class);
    MultithreadedMapper.setNumberOfThreads(prediction, numThreads);

    boolean succeeded = prediction.waitForCompletion(true);
    if (!succeeded) {
        return -1;
    }

    return 0;
}

From source file:org.apache.nutch.util.SitemapProcessor.java

License:Apache License
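
Nutch's SitemapProcessor sets MultithreadedMapper as the job's mapper, delegates the actual work to SitemapMapper, and sizes the thread pool from the threads argument passed by the caller.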

public void sitemap(Path crawldb, Path hostdb, Path sitemapUrlDir, boolean strict, boolean filter,
        boolean normalize, int threads) throws Exception {
    long start = System.currentTimeMillis();
    if (LOG.isInfoEnabled()) {
        LOG.info("SitemapProcessor: Starting at {}", sdf.format(start));
    }

    FileSystem fs = crawldb.getFileSystem(getConf());
    Path old = new Path(crawldb, "old");
    Path current = new Path(crawldb, "current");
    Path tempCrawlDb = new Path(crawldb,
            "crawldb-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    // lock an existing crawldb to prevent multiple simultaneous updates
    Path lock = new Path(crawldb, LOCK_NAME);
    if (!fs.exists(current))
        fs.mkdirs(current);

    LockUtil.createLockFile(fs, lock, false);

    Configuration conf = getConf();
    conf.setBoolean(SITEMAP_STRICT_PARSING, strict);
    conf.setBoolean(SITEMAP_URL_FILTERING, filter);
    conf.setBoolean(SITEMAP_URL_NORMALIZING, normalize);
    conf.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

    Job job = Job.getInstance(conf, "SitemapProcessor_" + crawldb.toString());
    job.setJarByClass(SitemapProcessor.class);

    // add crawlDb, sitemap url directory and hostDb to input paths
    MultipleInputs.addInputPath(job, current, SequenceFileInputFormat.class);

    if (sitemapUrlDir != null)
        MultipleInputs.addInputPath(job, sitemapUrlDir, KeyValueTextInputFormat.class);

    if (hostdb != null)
        MultipleInputs.addInputPath(job, new Path(hostdb, CURRENT_NAME), SequenceFileInputFormat.class);

    FileOutputFormat.setOutputPath(job, tempCrawlDb);

    job.setOutputFormatClass(MapFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);

    job.setMapperClass(MultithreadedMapper.class);
    MultithreadedMapper.setMapperClass(job, SitemapMapper.class);
    MultithreadedMapper.setNumberOfThreads(job, threads);
    job.setReducerClass(SitemapReducer.class);

    try {
        boolean success = job.waitForCompletion(true);
        if (!success) {
            String message = "SitemapProcessor_" + crawldb.toString() + " job did not succeed, job status: "
                    + job.getStatus().getState() + ", reason: " + job.getStatus().getFailureInfo();
            LOG.error(message);
            NutchJob.cleanupAfterFailure(tempCrawlDb, lock, fs);
            // throw exception so that calling routine can exit with error
            throw new RuntimeException(message);
        }

        boolean preserveBackup = conf.getBoolean("db.preserve.backup", true);
        if (!preserveBackup && fs.exists(old))
            fs.delete(old, true);
        else
            FSUtils.replace(fs, old, current, true);

        FSUtils.replace(fs, current, tempCrawlDb, true);
        LockUtil.removeLockFile(fs, lock);

        if (LOG.isInfoEnabled()) {
            long filteredRecords = job.getCounters().findCounter("Sitemap", "filtered_records").getValue();
            long fromHostname = job.getCounters().findCounter("Sitemap", "sitemaps_from_hostname").getValue();
            long fromSeeds = job.getCounters().findCounter("Sitemap", "sitemap_seeds").getValue();
            long failedFetches = job.getCounters().findCounter("Sitemap", "failed_fetches").getValue();
            long newSitemapEntries = job.getCounters().findCounter("Sitemap", "new_sitemap_entries").getValue();

            LOG.info("SitemapProcessor: Total records rejected by filters: {}", filteredRecords);
            LOG.info("SitemapProcessor: Total sitemaps from host name: {}", fromHostname);
            LOG.info("SitemapProcessor: Total sitemaps from seed urls: {}", fromSeeds);
            LOG.info("SitemapProcessor: Total failed sitemap fetches: {}", failedFetches);
            LOG.info("SitemapProcessor: Total new sitemap entries added: {}", newSitemapEntries);

            long end = System.currentTimeMillis();
            LOG.info("SitemapProcessor: Finished at {}, elapsed: {}", sdf.format(end),
                    TimingUtil.elapsedTime(start, end));
        }
    } catch (IOException | InterruptedException | ClassNotFoundException e) {
        LOG.error("SitemapProcessor_" + crawldb.toString(), e);
        NutchJob.cleanupAfterFailure(tempCrawlDb, lock, fs);
        throw e;
    }
}

From source file:org.huahinframework.tools.util.ToolsTool.java

License:Apache License
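
In this helper from the huahinframework tools, the given Filter class is wrapped in a MultithreadedMapper when running in local mode, with the pool sized from the tool's thread-number option.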

protected void setFilter(SimpleJob job, Class<? extends Filter> clazz) {
    if (!opt.isLocalMode()) {
        job.setFilter(clazz);
    } else {
        job.setMapperClass(MultithreadedMapper.class);
        MultithreadedMapper.setMapperClass(job, clazz);
        MultithreadedMapper.setNumberOfThreads(job, opt.getThreadNumber());

        SimpleTextInputFormat.setMinInputSplitSize(job, opt.getSplitSize());
        SimpleTextInputFormat.setMaxInputSplitSize(job, opt.getSplitSize());
    }
}

From source file:org.imageterrier.indexers.hadoop.HadoopIndexer.java

License:Mozilla Public License
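
ImageTerrier's Hadoop indexer uses MultithreadedMapper for image indexing when the shard-per-thread option is enabled, running MTImageIndexerMapper with the configured number of threads.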

protected Job createJob(HadoopIndexerOptions options) throws IOException {
    final Job job = new Job(getConf());
    job.setJobName("terrierIndexing");

    if (options.getInputMode() == InputMode.QUANTISED_FEATURES) {
        job.setMapperClass(QFIndexerMapper.class);
    } else {
        if (options.shardPerThread) {
            job.setMapperClass(MultithreadedMapper.class);
            MultithreadedMapper.setMapperClass(job, MTImageIndexerMapper.class);
            MultithreadedMapper.setNumberOfThreads(job, options.getMultithread());
        } else {
            job.setMapperClass(ImageIndexerMapper.class);
        }
    }
    // Load quantiser (if it exists), extract header, count codebook size
    if (options.getInputModeOptions().hasQuantiserFile()) {
        final String quantFile = options.getInputModeOptions().getQuantiserFile();
        System.out.println("Loading codebook to see its size");
        final SpatialClusters<?> quantiser = readClusters(options);
        System.out.println("Setting codebook size: " + quantiser.numClusters());
        job.getConfiguration().setInt(QUANTISER_SIZE, quantiser.numClusters());
        if (quantiser.numClusters() < options.getNumReducers())
            options.setNumReducers(quantiser.numClusters());
    }
    job.setReducerClass(IndexerReducer.class);

    FileOutputFormat.setOutputPath(job, options.getOutputPath());
    job.setMapOutputKeyClass(NewSplitEmittedTerm.class);
    job.setMapOutputValueClass(MapEmittedPostingList.class);
    job.getConfiguration().setBoolean("indexing.hadoop.multiple.indices", options.isDocumentPartitionMode());

    // if
    // (!job.getConfiguration().get("mapred.job.tracker").equals("local")) {
    // job.getConfiguration().set("mapred.map.output.compression.codec",
    // GzipCodec.class.getCanonicalName());
    // job.getConfiguration().setBoolean("mapred.compress.map.output",
    // true);
    // } else {
    job.getConfiguration().setBoolean("mapred.compress.map.output", false);
    // }

    job.setInputFormatClass(PositionAwareSequenceFileInputFormat.class); // important

    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setSortComparatorClass(NewSplitEmittedTerm.SETRawComparatorTermSplitFlush.class);
    job.setGroupingComparatorClass(NewSplitEmittedTerm.SETRawComparatorTerm.class);

    job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);

    SequenceFileInputFormat.setInputPaths(job, options.getInputPaths());

    job.setNumReduceTasks(options.getNumReducers());
    if (options.getNumReducers() > 1) {
        if (options.isDocumentPartitionMode()) {
            job.setPartitionerClass(NewSplitEmittedTerm.SETPartitioner.class);
        } else {
            // job.setPartitionerClass(NewSplitEmittedTerm.SETPartitionerLowercaseAlphaTerm.class);
            if (job.getConfiguration().getInt(QUANTISER_SIZE, -1) == -1) {
                job.setPartitionerClass(NewSplitEmittedTerm.SETPartitionerHashedTerm.class);
            } else {
                job.setPartitionerClass(NewSplitEmittedTerm.SETPartitionerCodebookAwareTerm.class);
            }

        }
    } else {
        // for JUnit tests, we seem to need to restore the original
        // partitioner class
        job.setPartitionerClass(HashPartitioner.class);
    }

    job.setJarByClass(this.getClass());

    return job;
}