List of usage examples for org.apache.hadoop.mapred JobConf setNumReduceTasks
public void setNumReduceTasks(int n)
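Before the project-specific examples below, here is a minimal sketch of the call in isolation: an old-API (org.apache.hadoop.mapred) driver that sets an explicit reducer count before submitting the job. The class name PassThroughJob, the use of the identity mapper/reducer, the argument-based paths, and the choice of four reducers are illustrative assumptions, not taken from any of the projects listed.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.IdentityMapper;
import org.apache.hadoop.mapred.lib.IdentityReducer;

public class PassThroughJob {
    public static void main(String[] args) throws Exception {
        JobConf job = new JobConf(PassThroughJob.class);
        job.setJobName("pass-through");

        // Identity mapper/reducer simply copy records, keeping the example self-contained.
        job.setMapperClass(IdentityMapper.class);
        job.setReducerClass(IdentityReducer.class);

        // TextInputFormat produces LongWritable offsets and Text lines.
        job.setInputFormat(TextInputFormat.class);
        job.setOutputFormat(TextOutputFormat.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // The call this page documents: request four reduce tasks (an arbitrary choice here).
        // setNumReduceTasks(0) would skip the reduce phase entirely, making this a map-only job.
        job.setNumReduceTasks(4);

        JobClient.runJob(job);
    }
}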
From source file:SleepJobWithArray.java
License:Apache License
public JobConf setupJobConf(int numMapper, int numReducer, long mapSleepTime, int mapSleepCount,
        long reduceSleepTime, int reduceSleepCount) {
    JobConf job = new JobConf(getConf(), SleepJobWithArray.class);
    job.setNumMapTasks(numMapper);
    job.setNumReduceTasks(numReducer);
    job.setMapperClass(SleepJobWithArray.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(NullWritable.class);
    job.setReducerClass(SleepJobWithArray.class);
    job.setOutputFormat(NullOutputFormat.class);
    job.setInputFormat(SleepInputFormat.class);
    job.setPartitionerClass(SleepJobWithArray.class);
    job.setSpeculativeExecution(false);
    FileInputFormat.addInputPath(job, new Path("ignored"));
    job.setLong("sleep.job.map.sleep.time", mapSleepTime);
    job.setLong("sleep.job.reduce.sleep.time", reduceSleepTime);
    job.setInt("sleep.job.map.sleep.count", mapSleepCount);
    job.setInt("sleep.job.reduce.sleep.count", reduceSleepCount);
    return job;
}
From source file:alluxio.client.hadoop.DFSIOIntegrationTest.java
License:Apache License
private void runIOTest(Class<? extends Mapper<Text, LongWritable, Text, Text>> mapperClass, Path outputDir)
        throws IOException {
    JobConf job = new JobConf(mConfig, DFSIOIntegrationTest.class);

    FileInputFormat.setInputPaths(job, getControlDir(mConfig));
    job.setInputFormat(SequenceFileInputFormat.class);

    job.setMapperClass(mapperClass);
    job.setReducerClass(AccumulatingReducer.class);

    FileOutputFormat.setOutputPath(job, outputDir);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setNumReduceTasks(1);
    JobClient.runJob(job);
}
From source file:at.illecker.hadoop.rootbeer.examples.matrixmultiplication.gpu.MatrixMultiplicationGpu.java
License:Apache License
public static Configuration createMatrixMultiplicationGpuConf(Configuration initialConf, Path aPath, Path bPath,
        Path outPath, int outCardinality, int tileWidth, boolean isDebugging) {
    JobConf conf = new JobConf(initialConf, MatrixMultiplicationGpu.class);
    conf.setJobName("MatrixMultiplicationGPU: " + aPath + " x " + bPath + " = " + outPath);

    conf.setInt(CONF_OUT_CARD, outCardinality);
    conf.setInt(CONF_TILE_WIDTH, tileWidth);
    conf.setBoolean(CONF_DEBUG, isDebugging);

    conf.setInputFormat(CompositeInputFormat.class);
    conf.set("mapred.join.expr",
            CompositeInputFormat.compose("inner", SequenceFileInputFormat.class, aPath, bPath));

    conf.setOutputFormat(SequenceFileOutputFormat.class);
    FileOutputFormat.setOutputPath(conf, outPath);

    conf.setMapperClass(MatrixMultiplyGpuMapper.class);

    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(VectorWritable.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(VectorWritable.class);

    // Increase client heap size for GPU Rootbeer execution
    conf.set("mapred.child.java.opts", "-Xms8G -Xmx8G");

    // No reduce step is needed:
    // -> 0 reducers means the reduce step is skipped and the mapper output is the final output
    // -> an identity reducer would mean shuffling/sorting still takes place
    conf.setNumReduceTasks(0);

    return conf;
}
From source file:azkaban.jobtype.javautils.AbstractHadoopJob.java
License:Apache License
@SuppressWarnings("rawtypes") public JobConf createJobConf(Class<? extends Mapper> mapperClass) throws IOException, URISyntaxException { JobConf conf = createJobConf(mapperClass, null); conf.setNumReduceTasks(0); return conf;//from w ww . ja v a2s . c o m }
From source file:babel.prep.langidtime.LangAndTimeExtractor.java
License:Apache License
/**
 * Configures a map-only language id job.
 */
protected JobConf createJobConf(String crawlDir, String pagesSubDir, String referrer) throws IOException {
    JobConf job = new JobConf(getConf());
    job.setJobName("identify languages and collect time for pages in " + pagesSubDir);

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(LangAndTimeMapper.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Page.class);

    //ANNI EDIT
    job.setNumMapTasks(2);
    job.setNumReduceTasks(2);
    //END ANNI EDIT

    FileInputFormat.addInputPath(job, new Path(crawlDir, pagesSubDir));

    Path outDir = new Path(new Path(crawlDir, PAGES_SUBDIR), "pages.langidtime." + getCurTimeStamp());
    m_fs.delete(outDir, true);
    FileOutputFormat.setOutputPath(job, outDir);

    setUniqueTempDir(job);

    job.set(JOB_PROP_JOB_REFERRER, referrer);

    return job;
}
From source file:bixo.examples.crawl.JDBCCrawlWorkflow.java
License:Apache License
public static Flow createFlow(Path inputDir, Path curLoopDirPath, UserAgent userAgent,
        FetcherPolicy fetcherPolicy, BaseUrlFilter urlFilter, int maxThreads, boolean debug,
        String persistentDbLocation) throws Throwable {
    JobConf conf = HadoopUtils.getDefaultJobConf(CrawlConfig.CRAWL_STACKSIZE_KB);
    int numReducers = HadoopUtils.getNumReducers(conf);
    conf.setNumReduceTasks(numReducers);

    FileSystem fs = curLoopDirPath.getFileSystem(conf);
    if (!fs.exists(inputDir)) {
        throw new IllegalStateException(String.format("Input directory %s doesn't exist", inputDir));
    }

    Tap inputSource = JDBCTapFactory.createUrlsSourceJDBCTap(persistentDbLocation);

    // Read _everything_ in initially
    // Group on the url, and select the best urls to fetch
    Pipe importPipe = new Pipe("url importer");
    importPipe = new GroupBy(importPipe, new Fields(CrawlDbDatum.URL_FIELD));
    importPipe = new Every(importPipe, new BestUrlToFetchBuffer(), Fields.RESULTS);

    Path contentPath = new Path(curLoopDirPath, CrawlConfig.CONTENT_SUBDIR_NAME);
    Tap contentSink = new Hfs(new SequenceFile(FetchedDatum.FIELDS), contentPath.toString());

    Path parsePath = new Path(curLoopDirPath, CrawlConfig.PARSE_SUBDIR_NAME);
    Tap parseSink = new Hfs(new SequenceFile(ParsedDatum.FIELDS), parsePath.toString());

    Path statusDirPath = new Path(curLoopDirPath, CrawlConfig.STATUS_SUBDIR_NAME);
    Tap statusSink = new Hfs(new TextLine(), statusDirPath.toString());

    // NOTE: The source and sink for CrawlDbDatums is essentially the same database -
    // since Cascading doesn't allow you to use the same tap for source and
    // sink we fake it by creating two separate taps.
    Tap urlSink = JDBCTapFactory.createUrlsSinkJDBCTap(persistentDbLocation);

    // Create the sub-assembly that runs the fetch job
    BaseFetcher fetcher = new SimpleHttpFetcher(maxThreads, fetcherPolicy, userAgent);
    BaseScoreGenerator scorer = new FixedScoreGenerator();
    FetchPipe fetchPipe = new FetchPipe(importPipe, scorer, fetcher, numReducers);

    Pipe statusPipe = new Pipe("status pipe", fetchPipe.getStatusTailPipe());

    // Take content and split it into content output plus parse to extract URLs.
    ParsePipe parsePipe = new ParsePipe(fetchPipe.getContentTailPipe(), new SimpleParser());
    Pipe urlFromOutlinksPipe = new Pipe("url from outlinks", parsePipe.getTailPipe());
    urlFromOutlinksPipe = new Each(urlFromOutlinksPipe,
            new CreateUrlDatumFromOutlinksFunction(new SimpleUrlNormalizer(), new SimpleUrlValidator()));
    urlFromOutlinksPipe = new Each(urlFromOutlinksPipe, new UrlFilter(urlFilter));
    urlFromOutlinksPipe = new Each(urlFromOutlinksPipe, new NormalizeUrlFunction(new SimpleUrlNormalizer()));

    // Take status and output updated UrlDatums. Again, since we are using
    // the same database we need to create a new tap.
    Pipe urlFromFetchPipe = new Pipe("url from fetch", fetchPipe.getStatusTailPipe());
    urlFromFetchPipe = new Each(urlFromFetchPipe, new CreateUrlDatumFromStatusFunction());

    // Now we need to join the URLs we get from parsing content with the
    // URLs we got from the status output, so we have a unified stream
    // of all known URLs.
    Pipe urlPipe = new GroupBy("url pipe", Pipe.pipes(urlFromFetchPipe, urlFromOutlinksPipe),
            new Fields(UrlDatum.URL_FN));
    urlPipe = new Every(urlPipe, new LatestUrlDatumBuffer(), Fields.RESULTS);

    Pipe outputPipe = new Pipe("output pipe");
    outputPipe = new Each(urlPipe, new CreateCrawlDbDatumFromUrlFunction());

    // Create the output map that connects each tail pipe to the appropriate sink.
    Map<String, Tap> sinkMap = new HashMap<String, Tap>();
    sinkMap.put(statusPipe.getName(), statusSink);
    sinkMap.put(FetchPipe.CONTENT_PIPE_NAME, contentSink);
    sinkMap.put(ParsePipe.PARSE_PIPE_NAME, parseSink);
    sinkMap.put(outputPipe.getName(), urlSink);

    // Finally we can run it.
    FlowConnector flowConnector = new FlowConnector(
            HadoopUtils.getDefaultProperties(JDBCCrawlWorkflow.class, debug, conf));
    return flowConnector.connect(inputSource, sinkMap, statusPipe, fetchPipe.getContentTailPipe(),
            parsePipe.getTailPipe(), outputPipe);
}
From source file:bixo.examples.crawl.SimpleCrawlWorkflow.java
License:Apache License
public static Flow createFlow(Path curWorkingDirPath, Path crawlDbPath, FetcherPolicy fetcherPolicy,
        UserAgent userAgent, BaseUrlFilter urlFilter, SimpleCrawlToolOptions options) throws Throwable {
    JobConf conf = HadoopUtils.getDefaultJobConf(CrawlConfig.CRAWL_STACKSIZE_KB);
    int numReducers = HadoopUtils.getNumReducers(conf);
    conf.setNumReduceTasks(numReducers);
    Properties props = HadoopUtils.getDefaultProperties(SimpleCrawlWorkflow.class, options.isDebugLogging(), conf);
    FileSystem fs = curWorkingDirPath.getFileSystem(conf);

    // Input : the crawldb
    if (!fs.exists(crawlDbPath)) {
        throw new RuntimeException("CrawlDb not found");
    }

    // Our crawl db is defined by the CrawlDbDatum
    Tap inputSource = new Hfs(new SequenceFile(CrawlDbDatum.FIELDS), crawlDbPath.toString());
    Pipe importPipe = new Pipe("import pipe");

    // Split into tuples that are to be fetched and that have already been fetched
    SplitterAssembly splitter = new SplitterAssembly(importPipe, new SplitFetchedUnfetchedCrawlDatums());

    Pipe finishedDatumsFromDb = splitter.getRHSPipe();
    Pipe urlsToFetchPipe = new Pipe("urls to Fetch", splitter.getLHSPipe());

    // Convert the urlsToFetchPipe so that we now deal with UrlDatums.
    urlsToFetchPipe = new Each(urlsToFetchPipe, new CreateUrlDatumFromCrawlDbFunction());

    // A TupleLogger is a good way to follow the tuples around in a flow. You can enable the output
    // of tuples by setting options.setDebugLogging() to true.
    urlsToFetchPipe = TupleLogger.makePipe(urlsToFetchPipe, true);

    // Create the output sinks :
    //      crawldb
    //      content
    //      parse
    //      status
    Path outCrawlDbPath = new Path(curWorkingDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
    Tap loopCrawldbSink = new Hfs(new SequenceFile(CrawlDbDatum.FIELDS), outCrawlDbPath.toString());

    Path contentDirPath = new Path(curWorkingDirPath, CrawlConfig.CONTENT_SUBDIR_NAME);
    Tap contentSink = new Hfs(new SequenceFile(FetchedDatum.FIELDS), contentDirPath.toString());

    Path parseDirPath = new Path(curWorkingDirPath, CrawlConfig.PARSE_SUBDIR_NAME);
    Tap parseSink = new Hfs(new SequenceFile(ParsedDatum.FIELDS), parseDirPath.toString());

    Path statusDirPath = new Path(curWorkingDirPath, CrawlConfig.STATUS_SUBDIR_NAME);
    Tap statusSink = new Hfs(new TextLine(), statusDirPath.toString());

    Path productsDirPath = new Path(curWorkingDirPath, CrawlConfig.PRODUCTS_SUBDIR_NAME);
    Tap productsSink = new Hfs(new TextLine(), productsDirPath.toString());
    // Tap productsSink = new Hfs(new TextLine(ProductDatum.FIELDS), productsDirPath.toString());

    // Create the sub-assembly that runs the fetch job
    SimpleHttpFetcher fetcher = new SimpleHttpFetcher(options.getMaxThreads(), fetcherPolicy, userAgent);
    fetcher.setMaxRetryCount(CrawlConfig.MAX_RETRIES);
    fetcher.setSocketTimeout(CrawlConfig.SOCKET_TIMEOUT);
    fetcher.setConnectionTimeout(CrawlConfig.CONNECTION_TIMEOUT);

    // You can also provide a set of mime types to restrict what content types you
    // want to deal with - for now keep it simple.
    Set<String> validMimeTypes = new HashSet<String>();
    validMimeTypes.add("text/plain");
    validMimeTypes.add("text/html");
    fetcherPolicy.setValidMimeTypes(validMimeTypes);

    // The scorer is used by the FetchPipe to assign a score to every URL that passes the
    // robots.txt processing. The score is used to sort URLs such that higher scoring URLs
    // are fetched first. If URLs are skipped for any reason(s), lower scoring URLs are skipped.
    BaseScoreGenerator scorer = new FixedScoreGenerator();

    FetchPipe fetchPipe = new FetchPipe(urlsToFetchPipe, scorer, fetcher, numReducers);
    Pipe statusPipe = new Pipe("status pipe", fetchPipe.getStatusTailPipe());
    Pipe contentPipe = new Pipe("content pipe", fetchPipe.getContentTailPipe());
    contentPipe = TupleLogger.makePipe(contentPipe, true);

    // Take content and split it into content output plus parse to extract URLs.
    SimpleParser parser = new SimpleParser();
    parser.setExtractLanguage(false);
    ParsePipe parsePipe = new ParsePipe(contentPipe, parser);

    Pipe productsPipe = new Pipe("products pipe", parsePipe);
    // PRECIOUS Pipe productsPipe = new Pipe("products pipe", fetchPipe.getContentTailPipe());

    String regex = "[a-z]+@[a-z]+.[a-z]+";
    // WAS: String regex = "[\\w\\-]([\\.\\w])+[\\w]+@([\\w\\-]+\\.)+[A-Z]{2,4}";
    Function emailExtractor = new RegexGenerator(new Fields("email"), regex);
    productsPipe = new Each(productsPipe, emailExtractor);
    // PRECIOUS productsPipe = new Each(productsPipe, new CreateProductDatumsFunction());
    productsPipe = TupleLogger.makePipe(productsPipe, true);

    Pipe urlFromOutlinksPipe = new Pipe("url from outlinks", parsePipe.getTailPipe());
    urlFromOutlinksPipe = new Each(urlFromOutlinksPipe, new CreateUrlDatumFromOutlinksFunction());
    if (urlFilter != null) {
        urlFromOutlinksPipe = new Each(urlFromOutlinksPipe, new UrlFilter(urlFilter));
    }
    urlFromOutlinksPipe = new Each(urlFromOutlinksPipe, new NormalizeUrlFunction(new SimpleUrlNormalizer()));
    urlFromOutlinksPipe = TupleLogger.makePipe(urlFromOutlinksPipe, true);

    // Take status and output urls from it
    Pipe urlFromFetchPipe = new Pipe("url from fetch");
    urlFromFetchPipe = new Each(statusPipe, new CreateUrlDatumFromStatusFunction());
    urlFromFetchPipe = TupleLogger.makePipe(urlFromFetchPipe, true);

    // Finally join the URLs we get from parsing content with the URLs we got
    // from the status output, and the urls we didn't process from the db, so that
    // we have a unified stream of all known URLs for the crawldb.
    Pipe finishedUrlsFromDbPipe = new Each(finishedDatumsFromDb, new CreateUrlDatumFromCrawlDbFunction());
    finishedUrlsFromDbPipe = TupleLogger.makePipe(finishedUrlsFromDbPipe, true);

    // NOTE : Ideally you would just do a CoGroup instead of converting all the pipes to emit UrlDatums
    // and then doing the extra step of converting from UrlDatum to CrawlDbDatum.
    // The reason this isn't being done here is because we are sharing LatestUrlDatumBuffer() with JDBCCrawlTool.
    Pipe crawlDbPipe = new GroupBy("crawldb pipe",
            Pipe.pipes(urlFromFetchPipe, urlFromOutlinksPipe, finishedUrlsFromDbPipe),
            new Fields(UrlDatum.URL_FN));
    crawlDbPipe = new Every(crawlDbPipe, new LatestUrlDatumBuffer(), Fields.RESULTS);

    Pipe outputPipe = new Pipe("output pipe");
    outputPipe = new Each(crawlDbPipe, new CreateCrawlDbDatumFromUrlFunction());

    // Create the output map that connects each tail pipe to the appropriate sink.
    Map<String, Tap> sinkMap = new HashMap<String, Tap>();
    sinkMap.put(statusPipe.getName(), statusSink);
    sinkMap.put(contentPipe.getName(), contentSink);
    sinkMap.put(ParsePipe.PARSE_PIPE_NAME, parseSink);
    sinkMap.put(crawlDbPipe.getName(), loopCrawldbSink);
    sinkMap.put(productsPipe.getName(), productsSink);

    FlowConnector flowConnector = new FlowConnector(props);
    Flow flow = flowConnector.connect(inputSource, sinkMap, statusPipe, contentPipe, parsePipe.getTailPipe(),
            outputPipe);

    return flow;
}
From source file:bixo.examples.webmining.DemoWebMiningWorkflow.java
License:Apache License
public static Flow createWebMiningWorkflow(Path crawlDbPath, Path curLoopDirPath, FetcherPolicy fetcherPolicy,
        UserAgent userAgent, DemoWebMiningOptions options) throws IOException, InterruptedException {
    // Fetch at most 200 pages, max size of 128K, complete mode, from the current dir.
    // HTML only.
    // We want to extract the cleaned up HTML, and pass that to the parser, which will
    // be specified via options.getAnalyzer. From this we'll get outlinks, page score, and
    // any results.
    JobConf conf = HadoopUtils.getDefaultJobConf(CrawlConfig.CRAWL_STACKSIZE_KB);
    boolean isLocal = HadoopUtils.isJobLocal(conf);
    int numReducers = HadoopUtils.getNumReducers(conf);
    conf.setNumReduceTasks(numReducers);
    conf.setInt("mapred.min.split.size", 64 * 1024 * 1024);
    Properties props = HadoopUtils.getDefaultProperties(DemoWebMiningWorkflow.class, false, conf);
    FileSystem fs = crawlDbPath.getFileSystem(conf);

    // Input : the crawldb
    if (!fs.exists(crawlDbPath)) {
        throw new RuntimeException("CrawlDb not found");
    }

    Tap inputSource = new Hfs(new TextDelimited(CrawlDbDatum.FIELDS, "\t", CrawlDbDatum.TYPES),
            crawlDbPath.toString());
    Pipe importPipe = new Pipe("import pipe");

    // Split into tuples that are to be fetched and that have already been fetched
    SplitterAssembly splitter = new SplitterAssembly(importPipe, new SplitFetchedUnfetchedSSCrawlDatums());

    Pipe finishedDatumsFromDb = new Pipe("finished datums from db", splitter.getRHSPipe());
    Pipe urlsToFetchPipe = splitter.getLHSPipe();

    // Limit to MAX_DISTRIBUTED_FETCH if running in real cluster,
    // or MAX_LOCAL_FETCH if running locally. So first we sort the entries
    // from high to low by links score.
    // TODO add unit test
    urlsToFetchPipe = new GroupBy(urlsToFetchPipe, new Fields(CrawlDbDatum.LINKS_SCORE_FIELD), true);
    long maxToFetch = HadoopUtils.isJobLocal(conf) ? MAX_LOCAL_FETCH : MAX_DISTRIBUTED_FETCH;
    urlsToFetchPipe = new Each(urlsToFetchPipe, new CreateUrlDatumFromCrawlDbDatum(maxToFetch));

    BaseScoreGenerator scorer = new LinkScoreGenerator();

    // Create the sub-assembly that runs the fetch job
    int maxThreads = isLocal ? CrawlConfig.DEFAULT_NUM_THREADS_LOCAL : CrawlConfig.DEFAULT_NUM_THREADS_CLUSTER;
    SimpleHttpFetcher fetcher = new SimpleHttpFetcher(maxThreads, fetcherPolicy, userAgent);
    fetcher.setMaxRetryCount(CrawlConfig.MAX_RETRIES);
    fetcher.setSocketTimeout(CrawlConfig.SOCKET_TIMEOUT);
    fetcher.setConnectionTimeout(CrawlConfig.CONNECTION_TIMEOUT);

    FetchPipe fetchPipe = new FetchPipe(urlsToFetchPipe, scorer, fetcher, numReducers);
    Pipe statusPipe = new Pipe("status pipe", fetchPipe.getStatusTailPipe());
    Pipe contentPipe = new Pipe("content pipe", fetchPipe.getContentTailPipe());
    contentPipe = TupleLogger.makePipe(contentPipe, true);

    // Create a parser that returns back the raw HTML (cleaned up by Tika) as the parsed content.
    SimpleParser parser = new SimpleParser(new ParserPolicy(), true);
    ParsePipe parsePipe = new ParsePipe(fetchPipe.getContentTailPipe(), parser);

    Pipe analyzerPipe = new Pipe("analyzer pipe");
    analyzerPipe = new Each(parsePipe.getTailPipe(), new AnalyzeHtml());

    Pipe outlinksPipe = new Pipe("outlinks pipe", analyzerPipe);
    outlinksPipe = new Each(outlinksPipe, new CreateLinkDatumFromOutlinksFunction());

    Pipe resultsPipe = new Pipe("results pipe", analyzerPipe);
    resultsPipe = new Each(resultsPipe, new CreateResultsFunction());

    // Group the finished datums, the skipped datums, status, outlinks
    Pipe updatePipe = new CoGroup("update pipe",
            Pipe.pipes(finishedDatumsFromDb, statusPipe, analyzerPipe, outlinksPipe),
            Fields.fields(new Fields(CrawlDbDatum.URL_FIELD), new Fields(StatusDatum.URL_FN),
                    new Fields(AnalyzedDatum.URL_FIELD), new Fields(LinkDatum.URL_FN)),
            null, new OuterJoin());
    updatePipe = new Every(updatePipe, new UpdateCrawlDbBuffer(), Fields.RESULTS);

    // output : loop dir specific crawldb
    Path outCrawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
    Tap crawlDbSink = new Hfs(new TextLine(), outCrawlDbPath.toString());
    // Status
    Path statusDirPath = new Path(curLoopDirPath, CrawlConfig.STATUS_SUBDIR_NAME);
    Tap statusSink = new Hfs(new TextLine(), statusDirPath.toString());
    // Content
    Path contentDirPath = new Path(curLoopDirPath, CrawlConfig.CONTENT_SUBDIR_NAME);
    Tap contentSink = new Hfs(new SequenceFile(FetchedDatum.FIELDS), contentDirPath.toString());
    // PageResults
    Path resultsDirPath = new Path(curLoopDirPath, CrawlConfig.RESULTS_SUBDIR_NAME);
    Tap resultsSink = new Hfs(new TextLine(), resultsDirPath.toString());

    // Create the output map that connects each tail pipe to the appropriate sink.
    Map<String, Tap> sinkMap = new HashMap<String, Tap>();
    sinkMap.put(updatePipe.getName(), crawlDbSink);
    sinkMap.put(statusPipe.getName(), statusSink);
    sinkMap.put(contentPipe.getName(), contentSink);
    sinkMap.put(resultsPipe.getName(), resultsSink);

    FlowConnector flowConnector = new FlowConnector(props);
    Flow flow = flowConnector.connect(inputSource, sinkMap, updatePipe, statusPipe, contentPipe, resultsPipe);

    return flow;
}
From source file:bixo.examples.webmining.WebMiningWorkflow.java
License:Apache License
public static Flow createWebMiningWorkflow(Path crawlDbPath, Path curLoopDirPath, FetcherPolicy fetcherPolicy,
        UserAgent userAgent, WebMiningOptions options, boolean resetSolr)
        throws IOException, InterruptedException {
    // Fetch at most 200 pages, max size of 128K, complete mode, from the current dir.
    // HTML only.
    // We want to extract the cleaned up HTML, and pass that to the parser, which will
    // be specified via options.getAnalyzer. From this we'll get outlinks, page score, and
    // any results.
    JobConf conf = HadoopUtils.getDefaultJobConf(CrawlConfig.CRAWL_STACKSIZE_KB);
    boolean isLocal = HadoopUtils.isJobLocal(conf);
    int numReducers = 1; // we always want to use a single reducer, to avoid contention
    conf.setNumReduceTasks(numReducers);
    conf.setInt("mapred.min.split.size", 64 * 1024 * 1024);
    Properties props = HadoopUtils.getDefaultProperties(WebMiningWorkflow.class, false, conf);
    FileSystem fs = crawlDbPath.getFileSystem(conf);

    // Input : the crawldb
    if (!fs.exists(crawlDbPath)) {
        throw new RuntimeException("CrawlDb not found");
    }

    Tap inputSource = new Hfs(new TextDelimited(CrawlDbDatum.FIELDS, "\t", CrawlDbDatum.TYPES),
            crawlDbPath.toString());
    Pipe importPipe = new Pipe("import pipe");

    // Split into tuples that are to be fetched and that have already been fetched
    SplitterAssembly splitter = new SplitterAssembly(importPipe, new SplitFetchedUnfetchedSSCrawlDatums());

    Pipe finishedDatumsFromDb = new Pipe("finished datums from db", splitter.getRHSPipe());
    Pipe urlsToFetchPipe = splitter.getLHSPipe();

    // Limit to MAX_DISTRIBUTED_FETCH if running in real cluster,
    // or MAX_LOCAL_FETCH if running locally. So first we sort the entries
    // from high to low by links score.
    // TODO add unit test
    urlsToFetchPipe = new GroupBy(urlsToFetchPipe, new Fields(CrawlDbDatum.LINKS_SCORE_FIELD), true);
    long maxToFetch = HadoopUtils.isJobLocal(conf) ? MAX_LOCAL_FETCH : MAX_DISTRIBUTED_FETCH;
    urlsToFetchPipe = new Each(urlsToFetchPipe, new CreateUrlDatumFromCrawlDbDatum(maxToFetch));

    BaseScoreGenerator scorer = new LinkScoreGenerator();

    // Create the sub-assembly that runs the fetch job
    int maxThreads = isLocal ? CrawlConfig.DEFAULT_NUM_THREADS_LOCAL : CrawlConfig.DEFAULT_NUM_THREADS_CLUSTER;
    SimpleHttpFetcher fetcher = new SimpleHttpFetcher(maxThreads, fetcherPolicy, userAgent);
    fetcher.setMaxRetryCount(CrawlConfig.MAX_RETRIES);
    fetcher.setSocketTimeout(CrawlConfig.SOCKET_TIMEOUT);
    fetcher.setConnectionTimeout(CrawlConfig.CONNECTION_TIMEOUT);

    FetchPipe fetchPipe = new FetchPipe(urlsToFetchPipe, scorer, fetcher, numReducers);
    Pipe statusPipe = new Pipe("status pipe", fetchPipe.getStatusTailPipe());
    Pipe contentPipe = new Pipe("content pipe", fetchPipe.getContentTailPipe());
    contentPipe = TupleLogger.makePipe(contentPipe, true);

    // Create a parser that returns back the raw HTML (cleaned up by Tika) as the parsed content.
    SimpleParser parser = new SimpleParser(new ParserPolicy(), true);
    ParsePipe parsePipe = new ParsePipe(fetchPipe.getContentTailPipe(), parser);

    Pipe analyzerPipe = new Pipe("analyzer pipe");
    analyzerPipe = new Each(parsePipe.getTailPipe(), new AnalyzeHtml());

    Pipe outlinksPipe = new Pipe("outlinks pipe", analyzerPipe);
    outlinksPipe = new Each(outlinksPipe, new CreateLinkDatumFromOutlinksFunction());

    Pipe resultsPipe = new Pipe("results pipe", analyzerPipe);
    resultsPipe = new Each(resultsPipe, new CreateResultsFunction());

    // Group the finished datums, the skipped datums, status, outlinks
    Pipe updatePipe = new CoGroup("update pipe",
            Pipe.pipes(finishedDatumsFromDb, statusPipe, analyzerPipe, outlinksPipe),
            Fields.fields(new Fields(CrawlDbDatum.URL_FIELD), new Fields(StatusDatum.URL_FN),
                    new Fields(AnalyzedDatum.URL_FIELD), new Fields(LinkDatum.URL_FN)),
            null, new OuterJoin());
    updatePipe = new Every(updatePipe, new UpdateCrawlDbBuffer(), Fields.RESULTS);

    // output : loop dir specific crawldb
    Path outCrawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
    Tap crawlDbSink = new Hfs(new TextLine(), outCrawlDbPath.toString());
    // Status
    Path statusDirPath = new Path(curLoopDirPath, CrawlConfig.STATUS_SUBDIR_NAME);
    Tap statusSink = new Hfs(new TextLine(), statusDirPath.toString());
    // Content
    Path contentDirPath = new Path(curLoopDirPath, CrawlConfig.CONTENT_SUBDIR_NAME);
    Tap contentSink = new Hfs(new SequenceFile(FetchedDatum.FIELDS), contentDirPath.toString());
    // PageResults
    Path resultsDirPath = new Path(curLoopDirPath, CrawlConfig.RESULTS_SUBDIR_NAME);
    Tap resultsSink = new Hfs(new TextLine(), resultsDirPath.toString());

    // Create the output map that connects each tail pipe to the appropriate sink.
    Map<String, Tap> sinkMap = new HashMap<String, Tap>();
    sinkMap.put(updatePipe.getName(), crawlDbSink);
    sinkMap.put(statusPipe.getName(), statusSink);
    sinkMap.put(contentPipe.getName(), contentSink);
    sinkMap.put(resultsPipe.getName(), resultsSink);

    FlowConnector flowConnector = new FlowConnector(props);
    Flow flow = flowConnector.connect(inputSource, sinkMap, updatePipe, statusPipe, contentPipe, resultsPipe);

    return flow;
}
From source file:boa.datagen.SeqSort.java
License:Apache License
/**
 * The main driver for the sort program.
 * Invoke this method to submit the map/reduce job.
 * @throws IOException When there are communication problems with the
 *                     job tracker.
 */
@Override
public int run(String[] args) throws Exception {
    System.out.println(inPath);

    JobConf jobConf = new JobConf(getConf(), SeqSort.class);
    jobConf.setJobName("sorter");

    jobConf.setMapperClass(IdentityMapper.class);
    jobConf.setReducerClass(IdentityReducer.class);

    JobClient client = new JobClient(jobConf);
    ClusterStatus cluster = client.getClusterStatus();
    int num_reduces = (int) (cluster.getMaxReduceTasks() * 0.9);
    String sort_reduces = jobConf.get("test.sort.reduces_per_host");
    if (sort_reduces != null) {
        num_reduces = cluster.getTaskTrackers() * Integer.parseInt(sort_reduces);
    }

    // Set user-supplied (possibly default) job configs
    jobConf.setNumReduceTasks(num_reduces);

    jobConf.setInputFormat(SequenceFileInputFormat.class);
    jobConf.setOutputFormat(SequenceFileOutputFormat.class);

    jobConf.setOutputKeyClass(Text.class);
    jobConf.setOutputValueClass(BytesWritable.class);

    SequenceFileOutputFormat.setCompressOutput(jobConf, true);
    SequenceFileOutputFormat.setOutputCompressorClass(jobConf, SnappyCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(jobConf, CompressionType.BLOCK);

    // Make sure there are exactly 2 parameters left.
    FileInputFormat.setInputPaths(jobConf, inPath);
    FileOutputFormat.setOutputPath(jobConf, new Path(outPath));

    System.out.println("Running on " + cluster.getTaskTrackers() + " nodes to sort from "
            + FileInputFormat.getInputPaths(jobConf)[0] + " into " + FileOutputFormat.getOutputPath(jobConf)
            + " with " + num_reduces + " reduces.");
    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    jobResult = JobClient.runJob(jobConf);
    Date end_time = new Date();
    System.out.println("Job ended: " + end_time);
    System.out.println("The job took " + (end_time.getTime() - startTime.getTime()) / 1000 + " seconds.");
    return 0;
}