List of usage examples for org.apache.hadoop.mapreduce.lib.map MultithreadedMapper setNumberOfThreads
public static void setNumberOfThreads(Job job, int threads)
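Before the examples, a minimal sketch of the typical wiring (the job name, MyDriver and MyMapper are placeholders, not taken from any example below): the job's mapper is set to MultithreadedMapper itself, the real map logic is registered with MultithreadedMapper.setMapperClass, and setNumberOfThreads sizes the thread pool. Per the Hadoop javadoc, the wrapped Mapper implementation must be thread-safe.

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.map.MultithreadedMapper;

    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "multithreaded-example"); // hypothetical job name
    job.setJarByClass(MyDriver.class);                        // MyDriver is a placeholder driver class
    job.setMapperClass(MultithreadedMapper.class);            // the wrapper becomes the task's mapper
    MultithreadedMapper.setMapperClass(job, MyMapper.class);  // MyMapper is a placeholder for the real map logic
    MultithreadedMapper.setNumberOfThreads(job, 8);           // threads per map task

As the examples below show, the same three calls recur in every job; what varies is only where the thread count comes from (a configuration property, a command-line option, or a caller-supplied argument).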
From source file: ml.shifu.shifu.core.processor.VarSelectModelProcessor.java
License: Apache License
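Shifu's variable-selection wrapper job switches to MultithreadedMapper only when the shifu.varsel.se.multi flag is enabled, parsing the thread count from the shifu.varsel.se.multi.thread property and falling back to a default when the value is not a valid integer.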
private Job createSEMapReduceJob(SourceType source, Configuration conf, String varSelectMSEOutputPath)
        throws IOException {
    @SuppressWarnings("deprecation")
    Job job = new Job(conf, "Shifu: Variable Selection Wrapper Job : " + this.modelConfig.getModelSetName());
    job.setJarByClass(getClass());

    boolean isSEVarSelMulti = Boolean.TRUE.toString().equalsIgnoreCase(
            Environment.getProperty(Constants.SHIFU_VARSEL_SE_MULTI, Constants.SHIFU_DEFAULT_VARSEL_SE_MULTI));
    if (isSEVarSelMulti) {
        job.setMapperClass(MultithreadedMapper.class);
        MultithreadedMapper.setMapperClass(job, VarSelectMapper.class);
        int threads;
        try {
            threads = Integer.parseInt(Environment.getProperty(Constants.SHIFU_VARSEL_SE_MULTI_THREAD,
                    Constants.SHIFU_DEFAULT_VARSEL_SE_MULTI_THREAD + ""));
        } catch (Exception e) {
            Log.warn("'shifu.varsel.se.multi.thread' should be a int value, set default value: {}",
                    Constants.SHIFU_DEFAULT_VARSEL_SE_MULTI_THREAD);
            threads = Constants.SHIFU_DEFAULT_VARSEL_SE_MULTI_THREAD;
        }
        MultithreadedMapper.setNumberOfThreads(job, threads);
    } else {
        job.setMapperClass(VarSelectMapper.class);
    }

    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(ColumnInfo.class);
    job.setInputFormatClass(TextInputFormat.class);
    FileInputFormat.setInputPaths(job, ShifuFileUtils.getFileSystemBySourceType(source)
            .makeQualified(new Path(super.getPathFinder().getNormalizedDataPath())));

    job.setReducerClass(VarSelectReducer.class);
    // Only one reducer, no need set combiner because of distinct keys in map outputs.
    job.setNumReduceTasks(1);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    FileOutputFormat.setOutputPath(job, new Path(varSelectMSEOutputPath));
    MultipleOutputs.addNamedOutput(job, Constants.SHIFU_VARSELECT_SE_OUTPUT_NAME, TextOutputFormat.class,
            Text.class, Text.class);
    return job;
}
From source file: org.apache.mahout.cf.taste.hadoop.als.ParallelALSFactorizationJob.java
License: Apache License
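Mahout's ALS factorization recomputes the user and item feature matrices with a MultithreadedSharingMapper, registering the concrete solver mapper and the threads-per-solver count through the MultithreadedMapper static setters.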
private void runSolver(Path ratings, Path output, Path pathToUorM, int currentIteration, String matrixName,
        int numEntities) throws ClassNotFoundException, IOException, InterruptedException {

    // necessary for local execution in the same JVM only
    SharingMapper.reset();

    Class<? extends Mapper<IntWritable, VectorWritable, IntWritable, VectorWritable>> solverMapperClassInternal;
    String name;

    if (implicitFeedback) {
        solverMapperClassInternal = SolveImplicitFeedbackMapper.class;
        name = "Recompute " + matrixName + ", iteration (" + currentIteration + '/' + numIterations + "), "
                + '(' + numThreadsPerSolver + " threads, " + numFeatures + " features, implicit feedback)";
    } else {
        solverMapperClassInternal = SolveExplicitFeedbackMapper.class;
        name = "Recompute " + matrixName + ", iteration (" + currentIteration + '/' + numIterations + "), "
                + '(' + numThreadsPerSolver + " threads, " + numFeatures + " features, explicit feedback)";
    }

    Job solverForUorI = prepareJob(ratings, output, SequenceFileInputFormat.class,
            MultithreadedSharingMapper.class, IntWritable.class, VectorWritable.class,
            SequenceFileOutputFormat.class, name);
    Configuration solverConf = solverForUorI.getConfiguration();
    solverConf.set(LAMBDA, String.valueOf(lambda));
    solverConf.set(ALPHA, String.valueOf(alpha));
    solverConf.setInt(NUM_FEATURES, numFeatures);
    solverConf.set(NUM_ENTITIES, String.valueOf(numEntities));

    FileSystem fs = FileSystem.get(pathToUorM.toUri(), solverConf);
    FileStatus[] parts = fs.listStatus(pathToUorM, PathFilters.partFilter());
    for (FileStatus part : parts) {
        if (log.isDebugEnabled()) {
            log.debug("Adding {} to distributed cache", part.getPath().toString());
        }
        DistributedCache.addCacheFile(part.getPath().toUri(), solverConf);
    }

    MultithreadedMapper.setMapperClass(solverForUorI, solverMapperClassInternal);
    MultithreadedMapper.setNumberOfThreads(solverForUorI, numThreadsPerSolver);

    boolean succeeded = solverForUorI.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }
}
From source file: org.apache.mahout.cf.taste.hadoop.als.RecommenderJob.java
License: Apache License
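The companion ALS recommender job exposes the thread count as a numThreads command-line option and forwards it directly to setNumberOfThreads.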
@Override
public int run(String[] args) throws Exception {

    addInputOption();
    addOption("userFeatures", null, "path to the user feature matrix", true);
    addOption("itemFeatures", null, "path to the item feature matrix", true);
    addOption("numRecommendations", null, "number of recommendations per user",
            String.valueOf(DEFAULT_NUM_RECOMMENDATIONS));
    addOption("maxRating", null, "maximum rating available", true);
    addOption("numThreads", null, "threads per mapper", String.valueOf(1));
    addOption("usesLongIDs", null, "input contains long IDs that need to be translated");
    addOption("userIDIndex", null, "index for user long IDs (necessary if usesLongIDs is true)");
    addOption("itemIDIndex", null, "index for user long IDs (necessary if usesLongIDs is true)");
    addOutputOption();

    Map<String, List<String>> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    Job prediction = prepareJob(getInputPath(), getOutputPath(), SequenceFileInputFormat.class,
            MultithreadedSharingMapper.class, IntWritable.class, RecommendedItemsWritable.class,
            TextOutputFormat.class);

    Configuration conf = prediction.getConfiguration();

    int numThreads = Integer.parseInt(getOption("numThreads"));

    conf.setInt(NUM_RECOMMENDATIONS, Integer.parseInt(getOption("numRecommendations")));
    conf.set(USER_FEATURES_PATH, getOption("userFeatures"));
    conf.set(ITEM_FEATURES_PATH, getOption("itemFeatures"));
    conf.set(MAX_RATING, getOption("maxRating"));

    boolean usesLongIDs = Boolean.parseBoolean(getOption("usesLongIDs"));
    if (usesLongIDs) {
        conf.set(ParallelALSFactorizationJob.USES_LONG_IDS, String.valueOf(true));
        conf.set(USER_INDEX_PATH, getOption("userIDIndex"));
        conf.set(ITEM_INDEX_PATH, getOption("itemIDIndex"));
    }

    MultithreadedMapper.setMapperClass(prediction, PredictionMapper.class);
    MultithreadedMapper.setNumberOfThreads(prediction, numThreads);

    boolean succeeded = prediction.waitForCompletion(true);
    if (!succeeded) {
        return -1;
    }

    return 0;
}
From source file: org.apache.nutch.util.SitemapProcessor.java
License: Apache License
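Nutch's sitemap processor runs SitemapMapper under MultithreadedMapper so that sitemap fetching is parallelized within each map task; the thread count is supplied by the caller.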
public void sitemap(Path crawldb, Path hostdb, Path sitemapUrlDir, boolean strict, boolean filter,
        boolean normalize, int threads) throws Exception {
    long start = System.currentTimeMillis();
    if (LOG.isInfoEnabled()) {
        LOG.info("SitemapProcessor: Starting at {}", sdf.format(start));
    }

    FileSystem fs = crawldb.getFileSystem(getConf());
    Path old = new Path(crawldb, "old");
    Path current = new Path(crawldb, "current");
    Path tempCrawlDb = new Path(crawldb,
            "crawldb-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    // lock an existing crawldb to prevent multiple simultaneous updates
    Path lock = new Path(crawldb, LOCK_NAME);
    if (!fs.exists(current))
        fs.mkdirs(current);

    LockUtil.createLockFile(fs, lock, false);

    Configuration conf = getConf();
    conf.setBoolean(SITEMAP_STRICT_PARSING, strict);
    conf.setBoolean(SITEMAP_URL_FILTERING, filter);
    conf.setBoolean(SITEMAP_URL_NORMALIZING, normalize);
    conf.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

    Job job = Job.getInstance(conf, "SitemapProcessor_" + crawldb.toString());
    job.setJarByClass(SitemapProcessor.class);

    // add crawlDb, sitemap url directory and hostDb to input paths
    MultipleInputs.addInputPath(job, current, SequenceFileInputFormat.class);

    if (sitemapUrlDir != null)
        MultipleInputs.addInputPath(job, sitemapUrlDir, KeyValueTextInputFormat.class);

    if (hostdb != null)
        MultipleInputs.addInputPath(job, new Path(hostdb, CURRENT_NAME), SequenceFileInputFormat.class);

    FileOutputFormat.setOutputPath(job, tempCrawlDb);
    job.setOutputFormatClass(MapFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);

    job.setMapperClass(MultithreadedMapper.class);
    MultithreadedMapper.setMapperClass(job, SitemapMapper.class);
    MultithreadedMapper.setNumberOfThreads(job, threads);
    job.setReducerClass(SitemapReducer.class);

    try {
        boolean success = job.waitForCompletion(true);
        if (!success) {
            String message = "SitemapProcessor_" + crawldb.toString() + " job did not succeed, job status: "
                    + job.getStatus().getState() + ", reason: " + job.getStatus().getFailureInfo();
            LOG.error(message);
            NutchJob.cleanupAfterFailure(tempCrawlDb, lock, fs);
            // throw exception so that calling routine can exit with error
            throw new RuntimeException(message);
        }

        boolean preserveBackup = conf.getBoolean("db.preserve.backup", true);
        if (!preserveBackup && fs.exists(old))
            fs.delete(old, true);
        else
            FSUtils.replace(fs, old, current, true);

        FSUtils.replace(fs, current, tempCrawlDb, true);
        LockUtil.removeLockFile(fs, lock);

        if (LOG.isInfoEnabled()) {
            long filteredRecords = job.getCounters().findCounter("Sitemap", "filtered_records").getValue();
            long fromHostname = job.getCounters().findCounter("Sitemap", "sitemaps_from_hostname").getValue();
            long fromSeeds = job.getCounters().findCounter("Sitemap", "sitemap_seeds").getValue();
            long failedFetches = job.getCounters().findCounter("Sitemap", "failed_fetches").getValue();
            long newSitemapEntries = job.getCounters().findCounter("Sitemap", "new_sitemap_entries").getValue();

            LOG.info("SitemapProcessor: Total records rejected by filters: {}", filteredRecords);
            LOG.info("SitemapProcessor: Total sitemaps from host name: {}", fromHostname);
            LOG.info("SitemapProcessor: Total sitemaps from seed urls: {}", fromSeeds);
            LOG.info("SitemapProcessor: Total failed sitemap fetches: {}", failedFetches);
            LOG.info("SitemapProcessor: Total new sitemap entries added: {}", newSitemapEntries);

            long end = System.currentTimeMillis();
            LOG.info("SitemapProcessor: Finished at {}, elapsed: {}", sdf.format(end),
                    TimingUtil.elapsedTime(start, end));
        }
    } catch (IOException | InterruptedException | ClassNotFoundException e) {
        LOG.error("SitemapProcessor_" + crawldb.toString(), e);
        NutchJob.cleanupAfterFailure(tempCrawlDb, lock, fs);
        throw e;
    }
}
From source file: org.huahinframework.tools.util.ToolsTool.java
License: Apache License
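In this helper, the filter class is wrapped in MultithreadedMapper only when running in local mode, with the thread count (opt.getThreadNumber()) and the input split size taken from the tool options.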
protected void setFilter(SimpleJob job, Class<? extends Filter> clazz) {
    if (!opt.isLocalMode()) {
        job.setFilter(clazz);
    } else {
        job.setMapperClass(MultithreadedMapper.class);
        MultithreadedMapper.setMapperClass(job, clazz);
        MultithreadedMapper.setNumberOfThreads(job, opt.getThreadNumber());
        SimpleTextInputFormat.setMinInputSplitSize(job, opt.getSplitSize());
        SimpleTextInputFormat.setMaxInputSplitSize(job, opt.getSplitSize());
    }
}
From source file: org.imageterrier.indexers.hadoop.HadoopIndexer.java
License: Mozilla Public License
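The ImageTerrier indexer wraps its image-indexing mapper in MultithreadedMapper when one index shard per thread is requested (shardPerThread), sizing the pool from options.getMultithread().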
protected Job createJob(HadoopIndexerOptions options) throws IOException {
    final Job job = new Job(getConf());
    job.setJobName("terrierIndexing");

    if (options.getInputMode() == InputMode.QUANTISED_FEATURES) {
        job.setMapperClass(QFIndexerMapper.class);
    } else {
        if (options.shardPerThread) {
            job.setMapperClass(MultithreadedMapper.class);
            MultithreadedMapper.setMapperClass(job, MTImageIndexerMapper.class);
            MultithreadedMapper.setNumberOfThreads(job, options.getMultithread());
        } else {
            job.setMapperClass(ImageIndexerMapper.class);
        }
    }

    // Load quantiser (if it exists), extract header, count codebook size
    if (options.getInputModeOptions().hasQuantiserFile()) {
        final String quantFile = options.getInputModeOptions().getQuantiserFile();
        System.out.println("Loading codebook to see its size");
        final SpatialClusters<?> quantiser = readClusters(options);
        System.out.println("Setting codebook size: " + quantiser.numClusters());
        job.getConfiguration().setInt(QUANTISER_SIZE, quantiser.numClusters());

        if (quantiser.numClusters() < options.getNumReducers())
            options.setNumReducers(quantiser.numClusters());
    }

    job.setReducerClass(IndexerReducer.class);

    FileOutputFormat.setOutputPath(job, options.getOutputPath());
    job.setMapOutputKeyClass(NewSplitEmittedTerm.class);
    job.setMapOutputValueClass(MapEmittedPostingList.class);
    job.getConfiguration().setBoolean("indexing.hadoop.multiple.indices", options.isDocumentPartitionMode());

    // if (!job.getConfiguration().get("mapred.job.tracker").equals("local")) {
    //     job.getConfiguration().set("mapred.map.output.compression.codec",
    //             GzipCodec.class.getCanonicalName());
    //     job.getConfiguration().setBoolean("mapred.compress.map.output", true);
    // } else {
    job.getConfiguration().setBoolean("mapred.compress.map.output", false);
    // }

    job.setInputFormatClass(PositionAwareSequenceFileInputFormat.class); // important
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setSortComparatorClass(NewSplitEmittedTerm.SETRawComparatorTermSplitFlush.class);
    job.setGroupingComparatorClass(NewSplitEmittedTerm.SETRawComparatorTerm.class);

    job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);

    SequenceFileInputFormat.setInputPaths(job, options.getInputPaths());

    job.setNumReduceTasks(options.getNumReducers());

    if (options.getNumReducers() > 1) {
        if (options.isDocumentPartitionMode()) {
            job.setPartitionerClass(NewSplitEmittedTerm.SETPartitioner.class);
        } else {
            // job.setPartitionerClass(NewSplitEmittedTerm.SETPartitionerLowercaseAlphaTerm.class);
            if (job.getConfiguration().getInt(QUANTISER_SIZE, -1) == -1) {
                job.setPartitionerClass(NewSplitEmittedTerm.SETPartitionerHashedTerm.class);
            } else {
                job.setPartitionerClass(NewSplitEmittedTerm.SETPartitionerCodebookAwareTerm.class);
            }
        }
    } else {
        // for JUnit tests, we seem to need to restore the original partitioner class
        job.setPartitionerClass(HashPartitioner.class);
    }

    job.setJarByClass(this.getClass());

    return job;
}