Usage examples for the org.apache.hadoop.mapred.JobConf no-argument constructor:
public JobConf()
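All of the examples below follow the same basic pattern: construct a default JobConf, resolve a Hadoop FileSystem from a Path against that configuration, and hand the conf on to taps, tools, or workflows. The following minimal sketch isolates just that pattern; the output path is a placeholder, and the Bixo/Cascading-specific classes used in the full examples are omitted.

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;

public class JobConfExample {
    public static void main(String[] args) throws Exception {
        // The no-argument constructor loads the default Hadoop configuration
        // (core-site.xml, mapred-site.xml) from the classpath.
        JobConf conf = new JobConf();

        // Typical usage seen in the examples below: resolve a FileSystem for a
        // path and prepare an output directory before running a crawl or flow.
        Path outputPath = new Path("crawl-output"); // placeholder path
        FileSystem fs = outputPath.getFileSystem(conf);
        if (!fs.exists(outputPath)) {
            fs.mkdirs(outputPath);
        }
    }
}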
From source file:bixo.examples.crawl.SimpleCrawlTool.java
License:Apache License
public static void main(String[] args) {
    SimpleCrawlToolOptions options = new SimpleCrawlToolOptions();
    CmdLineParser parser = new CmdLineParser(options);

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        printUsageAndExit(parser);
    }

    // Before we get too far along, see if the domain looks valid.
    String domain = options.getDomain();
    String urlsFile = options.getUrlsFile();
    if (domain != null) {
        validateDomain(domain, parser);
    } else {
        if (urlsFile == null) {
            System.err.println(
                    "Either a target domain should be specified or a file with a list of urls needs to be provided");
            printUsageAndExit(parser);
        }
    }

    if (domain != null && urlsFile != null) {
        System.out.println("Warning: Both domain and urls file list provided - using domain");
    }

    String outputDirName = options.getOutputDir();
    if (options.isDebugLogging()) {
        System.setProperty("bixo.root.level", "DEBUG");
    } else {
        System.setProperty("bixo.root.level", "INFO");
    }

    if (options.getLoggingAppender() != null) {
        // Set console vs. DRFA vs. something else
        System.setProperty("bixo.appender", options.getLoggingAppender());
    }

    try {
        JobConf conf = new JobConf();
        Path outputPath = new Path(outputDirName);
        FileSystem fs = outputPath.getFileSystem(conf);

        // See if the user isn't starting from scratch then set up the
        // output directory and create an initial urls subdir.
        if (!fs.exists(outputPath)) {
            fs.mkdirs(outputPath);

            // Create a "0-<timestamp>" sub-directory with just a /urls subdir
            // In the /urls dir the input file will have a single URL for the target domain.
            Path curLoopDir = CrawlDirUtils.makeLoopDir(fs, outputPath, 0);
            String curLoopDirName = curLoopDir.toUri().toString();
            setLoopLoggerFile(curLoopDirName, 0);

            Path crawlDbPath = new Path(curLoopDir, CrawlConfig.CRAWLDB_SUBDIR_NAME);
            if (domain != null) {
                importOneDomain(domain, crawlDbPath, conf);
            } else {
                importUrls(urlsFile, crawlDbPath);
            }
        }

        Path latestDirPath = CrawlDirUtils.findLatestLoopDir(fs, outputPath);
        if (latestDirPath == null) {
            System.err.println("No previous cycle output dirs exist in " + outputDirName);
            printUsageAndExit(parser);
        }

        Path crawlDbPath = new Path(latestDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

        // Set up the start and end loop counts.
        int startLoop = CrawlDirUtils.extractLoopNumber(latestDirPath);
        int endLoop = startLoop + options.getNumLoops();

        // Set up the UserAgent for the fetcher.
        UserAgent userAgent = new UserAgent(options.getAgentName(), CrawlConfig.EMAIL_ADDRESS,
                CrawlConfig.WEB_ADDRESS);

        // You also get to customize the FetcherPolicy
        FetcherPolicy defaultPolicy = new FetcherPolicy();
        defaultPolicy.setCrawlDelay(CrawlConfig.DEFAULT_CRAWL_DELAY);
        defaultPolicy.setMaxContentSize(CrawlConfig.MAX_CONTENT_SIZE);
        defaultPolicy.setFetcherMode(FetcherMode.EFFICIENT);

        // It is a good idea to set up a crawl duration when running long crawls as you may
        // end up in situations where the fetch slows down due to a 'long tail' and by
        // specifying a crawl duration you know exactly when the crawl will end.
        int crawlDurationInMinutes = options.getCrawlDuration();
        boolean hasEndTime = crawlDurationInMinutes != SimpleCrawlToolOptions.NO_CRAWL_DURATION;
        long targetEndTime = hasEndTime
                ? System.currentTimeMillis() + (crawlDurationInMinutes * CrawlConfig.MILLISECONDS_PER_MINUTE)
                : FetcherPolicy.NO_CRAWL_END_TIME;

        // By setting up a url filter we only deal with urls that we want to
        // instead of all the urls that we extract.
        BaseUrlFilter urlFilter = null;
        if (domain != null) {
            urlFilter = new DomainUrlFilter(domain);
        }

        // OK, now we're ready to start looping, since we've got our current settings
        for (int curLoop = startLoop + 1; curLoop <= endLoop; curLoop++) {

            // Adjust target end time, if appropriate.
            if (hasEndTime) {
                int remainingLoops = (endLoop - curLoop) + 1;
                long now = System.currentTimeMillis();
                long perLoopTime = (targetEndTime - now) / remainingLoops;
                defaultPolicy.setCrawlEndTime(now + perLoopTime);
            }

            Path curLoopDirPath = CrawlDirUtils.makeLoopDir(fs, outputPath, curLoop);
            String curLoopDirName = curLoopDirPath.toUri().toString();
            setLoopLoggerFile(curLoopDirName, curLoop);

            Flow flow = SimpleCrawlWorkflow.createFlow(curLoopDirPath, crawlDbPath, defaultPolicy, userAgent,
                    urlFilter, options);
            flow.complete();

            // Writing out .dot files is a good way to verify your flows.
            // flow.writeDOT("build/valid-flow.dot");

            // Update crawlDbPath to point to the latest crawl db
            crawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
        }
    } catch (PlannerException e) {
        e.writeDOT("build/failed-flow.dot");
        System.err.println("PlannerException: " + e.getMessage());
        e.printStackTrace(System.err);
        System.exit(-1);
    } catch (Throwable t) {
        System.err.println("Exception running tool: " + t.getMessage());
        t.printStackTrace(System.err);
        System.exit(-1);
    }
}
From source file:bixo.examples.crawl.SimpleCrawlWorkflowLRTest.java
License:Apache License
@Test
public void testNotLosingFetchedUrls() throws Throwable {
    String baseDirName = "build/test/SimpleCrawlWorkflowLRTest/output";
    JobConf conf = new JobConf();
    Path baseDirPath = new Path(baseDirName);
    FileSystem fs = baseDirPath.getFileSystem(conf);
    HadoopUtils.safeRemove(fs, baseDirPath);

    Path curLoopDirPath = CrawlDirUtils.makeLoopDir(fs, baseDirPath, 0);
    Path crawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

    SimpleCrawlTool.importOneDomain("localhost:8089", crawlDbPath, conf);
    curLoopDirPath = CrawlDirUtils.makeLoopDir(fs, baseDirPath, 1);

    FetcherPolicy defaultPolicy = new FetcherPolicy();
    defaultPolicy.setCrawlDelay(1);
    defaultPolicy.setFetcherMode(FetcherMode.COMPLETE);

    BaseUrlFilter urlFilter = new BaseUrlFilter() {
        @Override
        public boolean isRemove(UrlDatum datum) {
            return false;
        }
    };

    SimpleCrawlToolOptions options = new SimpleCrawlToolOptions();
    UserAgent userAgent = new UserAgent("test", "test@domain.com", "http://test.domain.com");
    Server server = null;
    try {
        server = startServer(new FakeWebSiteHandler(), 8089);
        Flow flow = SimpleCrawlWorkflow.createFlow(curLoopDirPath, crawlDbPath, defaultPolicy, userAgent,
                urlFilter, options);
        flow.complete();

        // Update the crawlDb path
        crawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

        // Now we should have an output/1-<timestamp>/ directory, where the /urls dir has 11 entries with
        // one being previously crawled, and the other 10 being pending.
        Hfs crawldbTap = new Hfs(new SequenceFile(CrawlDbDatum.FIELDS), crawlDbPath.toString());
        TupleEntryIterator iter = crawldbTap.openForRead(conf);

        int numFetched = 0;
        int numPending = 0;
        while (iter.hasNext()) {
            CrawlDbDatum datum = new CrawlDbDatum(iter.next());
            UrlStatus status = datum.getLastStatus();
            int crawlDepth = datum.getCrawlDepth();
            if (datum.getLastFetched() != 0) {
                numFetched += 1;
                assertEquals(UrlStatus.FETCHED, status);
                assertEquals(0, crawlDepth);
            } else {
                numPending += 1;
                assertEquals(UrlStatus.UNFETCHED, status);
                assertEquals(1, crawlDepth);
            }
        }

        assertEquals(1, numFetched);
        assertEquals(10, numPending);

        // Do it one more time, to verify status gets propagated forward.
        curLoopDirPath = CrawlDirUtils.makeLoopDir(fs, baseDirPath, 2);

        flow = SimpleCrawlWorkflow.createFlow(curLoopDirPath, crawlDbPath, defaultPolicy, userAgent, urlFilter,
                options);
        flow.complete();

        // Update crawldb path
        crawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
        crawldbTap = new Hfs(new SequenceFile(CrawlDbDatum.FIELDS), crawlDbPath.toString());
        iter = crawldbTap.openForRead(conf);

        numFetched = 0;
        numPending = 0;
        int numDepth0 = 0;
        int numDepth1 = 0;
        int numDepth2 = 0;
        while (iter.hasNext()) {
            CrawlDbDatum datum = new CrawlDbDatum(iter.next());
            UrlStatus status = datum.getLastStatus();
            int depth = datum.getCrawlDepth();

            if (datum.getLastFetched() != 0) {
                numFetched += 1;
                assertEquals("URL has incorrect status: " + datum.getUrl(), UrlStatus.FETCHED, status);
            } else {
                numPending += 1;
                assertEquals("URL has incorrect status: " + datum.getUrl(), UrlStatus.UNFETCHED, status);
            }

            if (depth == 0) {
                numDepth0 += 1;
            } else if (depth == 1) {
                numDepth1 += 1;
            } else if (depth == 2) {
                numDepth2 += 1;
            } else {
                fail("Invalid crawl depth for " + datum.getUrl());
            }

            // System.out.println(String.format("URL %s has status %s, last fetch %d, and depth %d",
            //         datum.getUrl(), datum.getLastStatus(), datum.getLastFetched(), depth));
        }

        assertEquals(11, numFetched);
        assertEquals(100, numPending);

        assertEquals(1, numDepth0);
        assertEquals(10, numDepth1);
        assertEquals(100, numDepth2);
    } catch (Throwable t) {
        fail(t.getMessage());
    } finally {
        if (server != null) {
            server.stop();
        }
    }
}
From source file:bixo.examples.crawl.SimpleStatusTool.java
License:Apache License
public static void main(String[] args) {
    SimpleStatusToolOptions options = new SimpleStatusToolOptions();
    CmdLineParser parser = new CmdLineParser(options);

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        printUsageAndExit(parser);
    }

    String crawlDirName = options.getCrawlDir();

    try {
        JobConf conf = new JobConf();
        Path crawlDirPath = new Path(crawlDirName);
        FileSystem fs = crawlDirPath.getFileSystem(conf);

        if (!fs.exists(crawlDirPath)) {
            System.err.println("Prior crawl output directory does not exist: " + crawlDirName);
            System.exit(-1);
        }

        // Skip Hadoop/Cascading DEBUG messages.
        Logger.getRootLogger().setLevel(Level.INFO);

        boolean exportDb = options.isExportDb();
        if (exportDb) {
            Path latestCrawlDirPath = CrawlDirUtils.findLatestLoopDir(fs, crawlDirPath);
            processCrawlDb(conf, latestCrawlDirPath, exportDb);
        } else {
            int prevLoop = -1;
            Path curDirPath = null;
            while ((curDirPath = CrawlDirUtils.findNextLoopDir(fs, crawlDirPath, prevLoop)) != null) {
                String curDirName = curDirPath.toUri().toString();
                LOGGER.info("");
                LOGGER.info("================================================================");
                LOGGER.info("Processing " + curDirName);
                LOGGER.info("================================================================");

                int curLoop = CrawlDirUtils.extractLoopNumber(curDirPath);
                if (curLoop != prevLoop + 1) {
                    LOGGER.warn(String.format("Missing directories between %d and %d", prevLoop, curLoop));
                }

                prevLoop = curLoop;

                // Process the status and crawldb in curPath
                processStatus(conf, curDirPath);
                processCrawlDb(conf, curDirPath, exportDb);
            }
        }
    } catch (Throwable t) {
        LOGGER.error("Exception running tool", t);
        System.exit(-1);
    }
}
From source file:bixo.examples.JDBCCrawlTool.java
License:Open Source License
public static void main(String[] args) {
    JDBCCrawlToolOptions options = new JDBCCrawlToolOptions();
    CmdLineParser parser = new CmdLineParser(options);

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        printUsageAndExit(parser);
    }

    // Before we get too far along, see if the domain looks valid.
    String domain = options.getDomain();
    if (domain.startsWith("http")) {
        System.err.println(
                "The target domain should be specified as just the host, without the http protocol: " + domain);
        printUsageAndExit(parser);
    }

    if (!domain.equals("localhost") && (domain.split("\\.").length < 2)) {
        System.err.println(
                "The target domain should be a valid paid-level domain or subdomain of the same: " + domain);
        printUsageAndExit(parser);
    }

    String outputDirName = options.getOutputDir();
    if (options.isDebugLogging()) {
        System.setProperty("bixo.root.level", "DEBUG");
    } else {
        System.setProperty("bixo.root.level", "INFO");
    }

    if (options.getLoggingAppender() != null) {
        // Set console vs. DRFA vs. something else
        System.setProperty("bixo.appender", options.getLoggingAppender());
    }

    try {
        JobConf conf = new JobConf();
        Path outputPath = new Path(outputDirName);
        FileSystem fs = outputPath.getFileSystem(conf);

        // See if the user is starting from scratch
        if (options.getDbLocation() == null) {
            if (fs.exists(outputPath)) {
                System.out.println("Warning: Previous cycle output dirs exist in : " + outputDirName);
                System.out.println("Warning: Delete the output dir before running");
                fs.delete(outputPath, true);
            }
        } else {
            Path dbLocationPath = new Path(options.getDbLocation());
            if (!fs.exists(dbLocationPath)) {
                fs.mkdirs(dbLocationPath);
            }
        }

        if (!fs.exists(outputPath)) {
            fs.mkdirs(outputPath);

            Path curLoopDir = CrawlDirUtils.makeLoopDir(fs, outputPath, 0);
            String curLoopDirName = curLoopDir.toUri().toString();
            setLoopLoggerFile(curLoopDirName, 0);

            importOneDomain(domain, JDBCTapFactory.createUrlsSinkJDBCTap(options.getDbLocation()), conf);
        }

        Path inputPath = CrawlDirUtils.findLatestLoopDir(fs, outputPath);
        if (inputPath == null) {
            System.err.println("No previous cycle output dirs exist in " + outputDirName);
            printUsageAndExit(parser);
        }

        int startLoop = CrawlDirUtils.extractLoopNumber(inputPath);
        int endLoop = startLoop + options.getNumLoops();

        UserAgent userAgent = new UserAgent(options.getAgentName(), CrawlConfig.EMAIL_ADDRESS,
                CrawlConfig.WEB_ADDRESS);

        FetcherPolicy defaultPolicy = new FetcherPolicy();
        defaultPolicy.setCrawlDelay(CrawlConfig.DEFAULT_CRAWL_DELAY);
        defaultPolicy.setMaxContentSize(CrawlConfig.MAX_CONTENT_SIZE);
        defaultPolicy.setFetcherMode(FetcherMode.EFFICIENT);

        int crawlDurationInMinutes = options.getCrawlDuration();
        boolean hasEndTime = crawlDurationInMinutes != JDBCCrawlToolOptions.NO_CRAWL_DURATION;
        long targetEndTime = hasEndTime
                ? System.currentTimeMillis() + (crawlDurationInMinutes * CrawlConfig.MILLISECONDS_PER_MINUTE)
                : FetcherPolicy.NO_CRAWL_END_TIME;

        BaseUrlFilter urlFilter = new DomainUrlFilter(domain);

        // Now we're ready to start looping, since we've got our current settings
        for (int curLoop = startLoop + 1; curLoop <= endLoop; curLoop++) {

            // Adjust target end time, if appropriate.
            if (hasEndTime) {
                int remainingLoops = (endLoop - curLoop) + 1;
                long now = System.currentTimeMillis();
                long perLoopTime = (targetEndTime - now) / remainingLoops;
                defaultPolicy.setCrawlEndTime(now + perLoopTime);
            }

            Path curLoopDir = CrawlDirUtils.makeLoopDir(fs, outputPath, curLoop);
            String curLoopDirName = curLoopDir.toUri().toString();
            setLoopLoggerFile(curLoopDirName, curLoop);

            Flow flow = JDBCCrawlWorkflow.createFlow(inputPath, curLoopDir, userAgent, defaultPolicy, urlFilter,
                    options.getMaxThreads(), options.isDebugLogging(), options.getDbLocation());
            flow.complete();
            // flow.writeDOT("build/valid-flow.dot");

            // Input for the next round is our current output
            inputPath = curLoopDir;
        }
    } catch (PlannerException e) {
        e.writeDOT("build/failed-flow.dot");
        System.err.println("PlannerException: " + e.getMessage());
        e.printStackTrace(System.err);
        System.exit(-1);
    } catch (Throwable t) {
        System.err.println("Exception running tool: " + t.getMessage());
        t.printStackTrace(System.err);
        System.exit(-1);
    }

    JDBCTapFactory.shutdown();
}
From source file:bixo.examples.SimpleCrawlTool.java
License:Open Source License
public static void main(String[] args) {
    SimpleCrawlToolOptions options = new SimpleCrawlToolOptions();
    CmdLineParser parser = new CmdLineParser(options);

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        printUsageAndExit(parser);
    }

    // Before we get too far along, see if the domain looks valid.
    String domain = options.getDomain();
    if (domain.startsWith("http")) {
        System.err.println(
                "The target domain should be specified as just the host, without the http protocol: " + domain);
        printUsageAndExit(parser);
    }

    if (!domain.equals("localhost") && (domain.split("\\.").length < 2)) {
        System.err.println(
                "The target domain should be a valid paid-level domain or subdomain of the same: " + domain);
        printUsageAndExit(parser);
    }

    String outputDirName = options.getOutputDir();
    if (options.isDebugLogging()) {
        System.setProperty("bixo.root.level", "DEBUG");
    } else {
        System.setProperty("bixo.root.level", "INFO");
    }

    if (options.getLoggingAppender() != null) {
        // Set console vs. DRFA vs. something else
        System.setProperty("bixo.appender", options.getLoggingAppender());
    }

    try {
        JobConf conf = new JobConf();
        Path outputPath = new Path(outputDirName);
        FileSystem fs = outputPath.getFileSystem(conf);

        // See if the user isn't starting from scratch then set up the
        // output directory and create an initial urls subdir.
        if (!fs.exists(outputPath)) {
            fs.mkdirs(outputPath);

            // Create a "0-<timestamp>" sub-directory with just a /urls subdir
            // In the /urls dir the input file will have a single URL for the target domain.
            Path curLoopDir = CrawlDirUtils.makeLoopDir(fs, outputPath, 0);
            String curLoopDirName = curLoopDir.toUri().toString();
            setLoopLoggerFile(curLoopDirName, 0);

            Path crawlDbPath = new Path(curLoopDir, CrawlConfig.CRAWLDB_SUBDIR_NAME);
            importOneDomain(domain, crawlDbPath, conf);
        }

        Path latestDirPath = CrawlDirUtils.findLatestLoopDir(fs, outputPath);
        if (latestDirPath == null) {
            System.err.println("No previous cycle output dirs exist in " + outputDirName);
            printUsageAndExit(parser);
        }

        Path crawlDbPath = new Path(latestDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

        // Set up the start and end loop counts.
        int startLoop = CrawlDirUtils.extractLoopNumber(latestDirPath);
        int endLoop = startLoop + options.getNumLoops();

        // Set up the UserAgent for the fetcher.
        UserAgent userAgent = new UserAgent(options.getAgentName(), CrawlConfig.EMAIL_ADDRESS,
                CrawlConfig.WEB_ADDRESS);

        // You also get to customize the FetcherPolicy
        FetcherPolicy defaultPolicy = new FetcherPolicy();
        defaultPolicy.setCrawlDelay(CrawlConfig.DEFAULT_CRAWL_DELAY);
        defaultPolicy.setMaxContentSize(CrawlConfig.MAX_CONTENT_SIZE);
        defaultPolicy.setFetcherMode(FetcherMode.EFFICIENT);

        // It is a good idea to set up a crawl duration when running long crawls as you may
        // end up in situations where the fetch slows down due to a 'long tail' and by
        // specifying a crawl duration you know exactly when the crawl will end.
        int crawlDurationInMinutes = options.getCrawlDuration();
        boolean hasEndTime = crawlDurationInMinutes != SimpleCrawlToolOptions.NO_CRAWL_DURATION;
        long targetEndTime = hasEndTime
                ? System.currentTimeMillis() + (crawlDurationInMinutes * CrawlConfig.MILLISECONDS_PER_MINUTE)
                : FetcherPolicy.NO_CRAWL_END_TIME;

        // By setting up a url filter we only deal with urls that we want to
        // instead of all the urls that we extract.
        BaseUrlFilter urlFilter = new DomainUrlFilter(domain);

        // OK, now we're ready to start looping, since we've got our current settings
        for (int curLoop = startLoop + 1; curLoop <= endLoop; curLoop++) {

            // Adjust target end time, if appropriate.
            if (hasEndTime) {
                int remainingLoops = (endLoop - curLoop) + 1;
                long now = System.currentTimeMillis();
                long perLoopTime = (targetEndTime - now) / remainingLoops;
                defaultPolicy.setCrawlEndTime(now + perLoopTime);
            }

            Path curLoopDirPath = CrawlDirUtils.makeLoopDir(fs, outputPath, curLoop);
            String curLoopDirName = curLoopDirPath.toUri().toString();
            setLoopLoggerFile(curLoopDirName, curLoop);

            Flow flow = SimpleCrawlWorkflow.createFlow(curLoopDirPath, crawlDbPath, defaultPolicy, userAgent,
                    urlFilter, options);
            flow.complete();

            // Writing out .dot files is a good way to verify your flows.
            // flow.writeDOT("build/valid-flow.dot");

            // Update crawlDbPath to point to the latest crawl db
            crawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
        }
    } catch (PlannerException e) {
        e.writeDOT("build/failed-flow.dot");
        System.err.println("PlannerException: " + e.getMessage());
        e.printStackTrace(System.err);
        System.exit(-1);
    } catch (Throwable t) {
        System.err.println("Exception running tool: " + t.getMessage());
        t.printStackTrace(System.err);
        System.exit(-1);
    }
}
From source file:bixo.examples.webmining.DemoWebMiningTool.java
License:Apache License
public static void main(String[] args) throws IOException {
    DemoWebMiningOptions options = new DemoWebMiningOptions();
    CmdLineParser parser = new CmdLineParser(options);

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        printUsageAndExit(parser);
    }

    // Build and run the flow.
    try {
        Path workingDirPath = new Path(options.getWorkingDir());

        JobConf conf = new JobConf();
        FileSystem fs = workingDirPath.getFileSystem(conf);
        setupWorkingDir(fs, workingDirPath, CrawlConfig.SEED_URLS_FILENAME);

        Path latestDirPath = CrawlDirUtils.findLatestLoopDir(fs, workingDirPath);
        if (latestDirPath == null) {
            error("No previous cycle output dirs exist in " + workingDirPath, parser);
        }

        Path crawlDbPath = new Path(latestDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

        UserAgent userAgent = new UserAgent(options.getAgentName(), CrawlConfig.EMAIL_ADDRESS,
                CrawlConfig.WEB_ADDRESS);

        FetcherPolicy fetcherPolicy = new FetcherPolicy();
        fetcherPolicy.setCrawlDelay(CrawlConfig.DEFAULT_CRAWL_DELAY);
        fetcherPolicy.setMaxContentSize(CrawlConfig.MAX_CONTENT_SIZE);
        fetcherPolicy.setFetcherMode(FetcherMode.EFFICIENT);

        // We only care about mime types that the Tika HTML parser can handle,
        // so restrict it to the same.
        Set<String> validMimeTypes = new HashSet<String>();
        Set<MediaType> supportedTypes = new HtmlParser().getSupportedTypes(new ParseContext());
        for (MediaType supportedType : supportedTypes) {
            validMimeTypes.add(String.format("%s/%s", supportedType.getType(), supportedType.getSubtype()));
        }
        fetcherPolicy.setValidMimeTypes(validMimeTypes);

        // Let's limit our crawl to two loops
        for (int curLoop = 1; curLoop <= 2; curLoop++) {
            Path curLoopDirPath = CrawlDirUtils.makeLoopDir(fs, workingDirPath, curLoop);
            Flow flow = DemoWebMiningWorkflow.createWebMiningWorkflow(crawlDbPath, curLoopDirPath, fetcherPolicy,
                    userAgent, options);
            flow.complete();

            // Update crawlDbPath to point to the latest crawl db
            crawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
        }
    } catch (Exception e) {
        System.err.println("Exception running job: " + e.getMessage());
        e.printStackTrace(System.err);
        System.exit(-1);
    }
}
From source file:bixo.examples.webmining.WebMiningTool.java
License:Apache License
public static void main(String[] args) throws IOException {
    WebMiningOptions options = new WebMiningOptions();
    CmdLineParser parser = new CmdLineParser(options);

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        printUsageAndExit(parser);
    }

    // Build and run the flow.
    try {
        Path workingDirPath = new Path(options.getWorkingDir());

        JobConf conf = new JobConf();
        FileSystem fs = workingDirPath.getFileSystem(conf);
        setupWorkingDir(fs, workingDirPath, CrawlConfig.SEED_URLS_FILENAME);

        Path latestDirPath = CrawlDirUtils.findLatestLoopDir(fs, workingDirPath);
        if (latestDirPath == null) {
            error("No previous cycle output dirs exist in " + workingDirPath, parser);
        }

        Path crawlDbPath = new Path(latestDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

        UserAgent userAgent = new UserAgent(options.getAgentName(), CrawlConfig.EMAIL_ADDRESS,
                CrawlConfig.WEB_ADDRESS);

        FetcherPolicy fetcherPolicy = new FetcherPolicy();
        fetcherPolicy.setCrawlDelay(CrawlConfig.DEFAULT_CRAWL_DELAY);
        fetcherPolicy.setMaxContentSize(CrawlConfig.MAX_CONTENT_SIZE);
        fetcherPolicy.setFetcherMode(FetcherMode.EFFICIENT);

        // We only care about mime types that the Tika HTML parser can handle,
        // so restrict it to the same.
        Set<String> validMimeTypes = new HashSet<String>();
        Set<MediaType> supportedTypes = new HtmlParser().getSupportedTypes(new ParseContext());
        for (MediaType supportedType : supportedTypes) {
            validMimeTypes.add(String.format("%s/%s", supportedType.getType(), supportedType.getSubtype()));
        }
        fetcherPolicy.setValidMimeTypes(validMimeTypes);

        // Let's limit our crawl to two loops
        for (int curLoop = 1; curLoop <= 2; curLoop++) {
            Path curLoopDirPath = CrawlDirUtils.makeLoopDir(fs, workingDirPath, curLoop);
            Flow flow = WebMiningWorkflow.createWebMiningWorkflow(crawlDbPath, curLoopDirPath, fetcherPolicy,
                    userAgent, options, curLoop == 1);
            flow.complete();

            // Update crawlDbPath to point to the latest crawl db
            crawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
        }
    } catch (Exception e) {
        System.err.println("Exception running job: " + e.getMessage());
        e.printStackTrace(System.err);
        System.exit(-1);
    }
}
From source file:bixo.fetcher.FetcherTest.java
License:Open Source License
@Test
public void testStaleConnection() throws Exception {
    System.setProperty("bixo.root.level", "TRACE");

    String workingFolder = "build/it/FetcherTest/testStaleConnection/working";
    String inputPath = makeCrawlDb(workingFolder, "src/it/resources/apple-pages.txt");
    Lfs in = new Lfs(new SequenceFile(UrlDatum.FIELDS), inputPath, true);

    String outPath = "build/it/FetcherTest/testStaleConnection/out";
    Lfs content = new Lfs(new SequenceFile(FetchedDatum.FIELDS), outPath + "/content", true);
    Lfs status = new Lfs(new SequenceFile(StatusDatum.FIELDS), outPath + "/status", true);

    Pipe pipe = new Pipe("urlSource");

    UserAgent userAgent = new FirefoxUserAgent();
    FetcherPolicy fetcherPolicy = new FetcherPolicy();
    fetcherPolicy.setMaxRequestsPerConnection(1);
    fetcherPolicy.setCrawlDelay(5 * 1000L);
    BaseFetcher fetcher = new SimpleHttpFetcher(2, fetcherPolicy, userAgent);
    BaseScoreGenerator scorer = new FixedScoreGenerator();
    FetchPipe fetchPipe = new FetchPipe(pipe, scorer, fetcher, 1);

    FlowConnector flowConnector = new FlowConnector();
    Flow flow = flowConnector.connect(in, FetchPipe.makeSinkMap(status, content), fetchPipe);
    flow.complete();

    // Test for all valid fetches.
    Lfs validate = new Lfs(new SequenceFile(StatusDatum.FIELDS), outPath + "/status");
    TupleEntryIterator tupleEntryIterator = validate.openForRead(new JobConf());
    while (tupleEntryIterator.hasNext()) {
        TupleEntry entry = tupleEntryIterator.next();
        StatusDatum sd = new StatusDatum(entry);
        if (sd.getStatus() != UrlStatus.FETCHED) {
            LOGGER.error(String.format("Fetched failed! Status is %s for %s", sd.getStatus(), sd.getUrl()));
            BaseFetchException e = sd.getException();
            if (e != null) {
                LOGGER.error("Fetched failed due to exception", e);
            }

            Assert.fail("Status not equal to FETCHED");
        }
    }
}
From source file:bixo.fetcher.FetcherTest.java
License:Open Source License
@Test
public void testRunFetcher() throws Exception {
    System.setProperty("bixo.root.level", "TRACE");

    String workingFolder = "build/test-it/FetcherTest/testRunFetcher";
    String inputPath = makeCrawlDb(workingFolder, "src/it/resources/top10urls.txt");
    Lfs in = new Lfs(new SequenceFile(UrlDatum.FIELDS), inputPath, true);

    Lfs content = new Lfs(new SequenceFile(FetchedDatum.FIELDS), workingFolder + "/content", true);
    Lfs status = new Lfs(new TextLine(), workingFolder + "/status", true);

    Pipe pipe = new Pipe("urlSource");

    UserAgent userAgent = new FirefoxUserAgent();
    BaseFetcher fetcher = new SimpleHttpFetcher(10, userAgent);
    BaseScoreGenerator scorer = new FixedScoreGenerator();
    FetchPipe fetchPipe = new FetchPipe(pipe, scorer, fetcher, 1);

    FlowConnector flowConnector = new FlowConnector();
    Flow flow = flowConnector.connect(in, FetchPipe.makeSinkMap(status, content), fetchPipe);
    flow.complete();

    // Test for 10 good fetches.
    Lfs validate = new Lfs(new SequenceFile(FetchedDatum.FIELDS), workingFolder + "/content");
    TupleEntryIterator tupleEntryIterator = validate.openForRead(new JobConf());
    int fetchedPages = 0;
    while (tupleEntryIterator.hasNext()) {
        TupleEntry entry = tupleEntryIterator.next();
        new FetchedDatum(entry);
        fetchedPages += 1;
    }

    Assert.assertEquals(10, fetchedPages);
}
From source file:cascading.avro.AvroSchemeTest.java
License:Apache License
@Test
public void listOrMapInsideListTest() throws Exception {
    final Schema schema = new Schema.Parser().parse(getClass().getResourceAsStream("test4.avsc"));
    final AvroScheme scheme = new AvroScheme(schema);
    final Fields fields = new Fields("aListOfListOfInt", "aListOfMapToLong");
    final Lfs lfs = new Lfs(scheme, tempDir.getRoot().toString());

    HadoopFlowProcess writeProcess = new HadoopFlowProcess(new JobConf());
    final TupleEntryCollector collector = lfs.openForWrite(writeProcess);

    List<Map<String, Long>> aListOfMapToLong = new ArrayList<Map<String, Long>>();
    Map<String, Long> aMapToLong = new HashMap<String, Long>();
    aMapToLong.put("one", 1L);
    aMapToLong.put("two", 2L);
    aListOfMapToLong.add(aMapToLong);

    List<List<Integer>> aListOfListOfInt = new ArrayList<List<Integer>>();
    List<Integer> aListOfInt = new LinkedList<Integer>();
    aListOfInt.add(0);
    aListOfInt.add(1);
    aListOfListOfInt.add(aListOfInt);

    write(scheme, collector, new TupleEntry(fields, new Tuple(aListOfListOfInt, aListOfMapToLong)));
    collector.close();

    HadoopFlowProcess readProcess = new HadoopFlowProcess(new JobConf());
    final TupleEntryIterator iterator = lfs.openForRead(readProcess);
    assertTrue(iterator.hasNext());
    final TupleEntry readEntry1 = iterator.next();

    List<Integer> outListOfInt = (List) ((List) readEntry1.getObject("aListOfListOfInt")).get(0);
    Map<Utf8, Long> outMapToLong = (Map) ((List) readEntry1.getObject("aListOfMapToLong")).get(0);

    assertEquals(Integer.valueOf(0), outListOfInt.get(0));
    assertEquals(Integer.valueOf(1), outListOfInt.get(1));
    assertEquals(Long.valueOf(1L), outMapToLong.get("one"));
    assertEquals(Long.valueOf(2L), outMapToLong.get("two"));

    assertTrue(!iterator.hasNext());
}