List of usage examples for org.apache.hadoop.fs.FileSystem.exists
public boolean exists(Path f) throws IOException
From source file:be_uclouvain_ingi2145_lab05.GiraphJobRunner.java
@Override public int run(String[] strings) throws Exception { GiraphConfiguration gconf = new GiraphConfiguration(conf); //gconf.setVertexClass(SimpleShortestPathsComputation.class); /*gconf.setVertexInputFormatClass( SimpleShortestPathsVertexInputFormat.class); gconf.setVertexOutputFormatClass(/* w w w. j a va 2 s .c o m*/ SimpleShortestPathsVertexOutputFormat.class); */ CommandLine cmd = ConfigurationUtils.parseArgs(gconf, strings); if (null == cmd) { return 0; } //GiraphYarnClient job = new GiraphYarnClient(gconf,gconf.getClass().getName()); GiraphJob job = new GiraphJob(gconf, getClass().getName()); job.getInternalJob().setJarByClass(getClass()); if (cmd.hasOption("vof") || cmd.hasOption("eof")) { if (cmd.hasOption("op")) { Path outputPath = new Path(cmd.getOptionValue("op")); FileSystem fs = FileSystem.get(outputPath.toUri(), conf); /*Check if output path (args[1])exist or not*/ if (fs.exists(outputPath)) { /*If exist delete the output path*/ fs.delete(outputPath, true); } FileOutputFormat.setOutputPath(job.getInternalJob(), outputPath); } } /* if (cmd.hasOption("vif") || cmd.hasOption("eif")) { if (cmd.hasOption("vip")) { FileInputFormat.addInputPath(job.getInternalJob(), new Path(cmd.getOptionValue("op"))); } }*/ //If there is a custom option specified if (cmd.hasOption("ca")) { String[] args = cmd.getOptionValues("ca"); LOG.fatal("" + Arrays.toString(args)); gconf.set("ca", args[0].split("=")[1]); LOG.fatal("" + gconf.get("ca")); gconf.setWorkerConfiguration(Integer.parseInt(cmd.getOptionValue("w")), Integer.parseInt(cmd.getOptionValue("w")), 100.0f); } /* if (cmd.hasOption("cf")) { DistributedCache.addCacheFile(new URI(cmd.getOptionValue("cf")), job.getConfiguration()); } */ return job.run(true) ? 0 : -1; }
From source file:bigimp.BuildForest.java
License:Apache License
private void buildForest() throws IOException, ClassNotFoundException, InterruptedException { // make sure the output path does not exist FileSystem ofs = outputPath.getFileSystem(getConf()); if (ofs.exists(outputPath)) { log.error("Output path already exists"); return;//from w w w . j a v a 2 s . co m } DecisionTreeBuilder treeBuilder = new DecisionTreeBuilder(); if (m != null) { treeBuilder.setM(m); } treeBuilder.setComplemented(complemented); if (minSplitNum != null) { treeBuilder.setMinSplitNum(minSplitNum); } if (minVarianceProportion != null) { treeBuilder.setMinVarianceProportion(minVarianceProportion); } Builder forestBuilder; if (isPartial) { log.info("Partial Mapred implementation"); forestBuilder = new PartialBuilder(treeBuilder, dataPath, datasetPath, seed, getConf()); } else { log.info("InMem Mapred implementation"); forestBuilder = new InMemBuilder(treeBuilder, dataPath, datasetPath, seed, getConf()); } forestBuilder.setOutputDirName(outputPath.getName()); log.info("Building the forest..."); long time = System.currentTimeMillis(); DecisionForest forest = forestBuilder.build(nbTrees); time = System.currentTimeMillis() - time; log.info("Build Time: {}", DFUtils.elapsedTime(time)); log.info("Forest num Nodes: {}", forest.nbNodes()); log.info("Forest mean num Nodes: {}", forest.meanNbNodes()); log.info("Forest mean max Depth: {}", forest.meanMaxDepth()); // store the decision forest in the output path Path forestPath = new Path(outputPath, "forest.seq"); log.info("Storing the forest in: {}", forestPath); DFUtils.storeWritable(getConf(), forestPath, forest); }
From source file:bixo.examples.crawl.DemoCrawlTool.java
License:Apache License
/**
 * Command-line entry point for the demo crawl.
 *
 * <p>Parses the options, requires either a domain or a urls file, sets the
 * {@code bixo.root.level} / {@code bixo.appender} system properties, prepares the output
 * directory (optionally deleting it first), seeds a loop-0 crawl db when the output
 * directory does not exist, and then runs {@code options.getNumLoops()} crawl loops.
 * Each loop builds and completes a {@code DemoCrawlWorkflow} flow and feeds its crawl db
 * into the next loop. Any failure is printed to stderr and the JVM exits with -1.
 *
 * @param args command-line arguments parsed into {@code DemoCrawlToolOptions}
 */
public static void main(String[] args) {
    DemoCrawlToolOptions options = new DemoCrawlToolOptions();
    CmdLineParser parser = new CmdLineParser(options);

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        // presumably terminates the JVM (per its name) - TODO confirm
        printUsageAndExit(parser);
    }

    // Before we get too far along, see if the domain looks valid.
    String domain = options.getDomain();
    String urlsFile = options.getUrlsFile();
    if (domain != null) {
        validateDomain(domain, parser);
    } else {
        if (urlsFile == null) {
            System.err.println(
                    "Either a target domain should be specified or a file with a list of urls needs to be provided");
            printUsageAndExit(parser);
        }
    }

    // The domain wins when both inputs are supplied (see the seeding branch below).
    if (domain != null && urlsFile != null) {
        System.out.println("Warning: Both domain and urls file list provided - using domain");
    }

    String outputDirName = options.getOutputDir();
    if (options.isDebugLogging()) {
        System.setProperty("bixo.root.level", "DEBUG");
    } else {
        System.setProperty("bixo.root.level", "INFO");
    }

    if (options.getLoggingAppender() != null) {
        // Set console vs. DRFA vs. something else
        System.setProperty("bixo.appender", options.getLoggingAppender());
    }

    // Normalize the logs directory so a loop directory name can be appended directly.
    // NOTE(review): dereferences logsDir without a null check - confirm getLogsDir() has a default.
    String logsDir = options.getLogsDir();
    if (!logsDir.endsWith("/")) {
        logsDir = logsDir + "/";
    }

    try {
        JobConf conf = new JobConf();
        Path outputPath = new Path(outputDirName);
        FileSystem fs = outputPath.getFileSystem(conf);

        // First check if the user wants to clean the output directory.
        if (options.isCleanOutputDir()) {
            if (fs.exists(outputPath)) {
                fs.delete(outputPath, true);
            }
        }

        // If the output directory does not exist the user is starting from scratch:
        // set up the output directory and create an initial crawl db.
        if (!fs.exists(outputPath)) {
            fs.mkdirs(outputPath);

            // Create a "0-<timestamp>" sub-directory with just a /crawldb subdir
            // In the /crawldb dir the input file will have a single URL for the target domain.
            Path curLoopDir = CrawlDirUtils.makeLoopDir(fs, outputPath, 0);
            String curLoopDirName = curLoopDir.getName();
            setLoopLoggerFile(logsDir + curLoopDirName, 0);

            Path crawlDbPath = new Path(curLoopDir, CrawlConfig.CRAWLDB_SUBDIR_NAME);
            if (domain != null) {
                importOneDomain(domain, crawlDbPath, conf);
            } else {
                importUrls(urlsFile, crawlDbPath);
            }
        }

        // Resume from the most recent loop directory (the one just created on a fresh start).
        Path latestDirPath = CrawlDirUtils.findLatestLoopDir(fs, outputPath);
        if (latestDirPath == null) {
            System.err.println("No previous cycle output dirs exist in " + outputDirName);
            printUsageAndExit(parser);
        }

        Path crawlDbPath = new Path(latestDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

        // Set up the start and end loop counts.
        int startLoop = CrawlDirUtils.extractLoopNumber(latestDirPath);
        int endLoop = startLoop + options.getNumLoops();

        // Set up the UserAgent for the fetcher.
        UserAgent userAgent = new UserAgent(options.getAgentName(), CrawlConfig.EMAIL_ADDRESS,
                CrawlConfig.WEB_ADDRESS);

        // You also get to customize the FetcherPolicy
        FetcherPolicy defaultPolicy = new FetcherPolicy();
        defaultPolicy.setCrawlDelay(CrawlConfig.DEFAULT_CRAWL_DELAY);
        defaultPolicy.setMaxContentSize(CrawlConfig.MAX_CONTENT_SIZE);
        // defaultPolicy.setFetcherMode(FetcherPolicy.FetcherMode.IMPOLITE);
        defaultPolicy.setFetcherMode(FetcherPolicy.FetcherMode.EFFICIENT);
        // this is to cause Bixo to block waiting for next time it can fetch from a particular site.
        // todo: may not be necessary in future versions of Bixo
        // defaultPolicy.setFetcherMode(FetcherPolicy.FetcherMode.COMPLETE);

        // It is a good idea to set up a crawl duration when running long crawls as you may
        // end up in situations where the fetch slows down due to a 'long tail' and by
        // specifying a crawl duration you know exactly when the crawl will end.
        int crawlDurationInMinutes = options.getCrawlDuration();
        boolean hasEndTime = crawlDurationInMinutes != DemoCrawlToolOptions.NO_CRAWL_DURATION;
        long targetEndTime = hasEndTime ? 
                System.currentTimeMillis() + (crawlDurationInMinutes * CrawlConfig.MILLISECONDS_PER_MINUTE)
                : FetcherPolicy.NO_CRAWL_END_TIME;

        // By setting up a url filter we only deal with urls that we want to
        // instead of all the urls that we extract.
        BaseUrlFilter urlFilter = null;
        List<String> patterns = null;
        String regexUrlFiltersFile = options.getRegexUrlFiltersFile();
        if (regexUrlFiltersFile != null) {
            patterns = RegexUrlFilter.getUrlFilterPatterns(regexUrlFiltersFile);
        } else {
            // No filter file: start from the defaults and add one include pattern,
            // either for the target domain or for any http/https url.
            patterns = RegexUrlFilter.getDefaultUrlFilterPatterns();
            if (domain != null) {
                String domainPatterStr = "+(?i)^(http|https)://([a-z0-9]*\\.)*" + domain;
                patterns.add(domainPatterStr);
            } else {
                String protocolPatterStr = "+(?i)^(http|https)://*";
                patterns.add(protocolPatterStr);
                //Log.warn("Defaulting to basic url regex filtering (just suffix and protocol");
            }
        }
        urlFilter = new RegexUrlFilter(patterns.toArray(new String[patterns.size()]));

        // OK, now we're ready to start looping, since we've got our current
        // settings
        for (int curLoop = startLoop + 1; curLoop <= endLoop; curLoop++) {

            // Adjust target end time, if appropriate: the remaining time is split
            // evenly over the loops that are still to run.
            if (hasEndTime) {
                int remainingLoops = (endLoop - curLoop) + 1;
                long now = System.currentTimeMillis();
                long perLoopTime = (targetEndTime - now) / remainingLoops;
                defaultPolicy.setCrawlEndTime(now + perLoopTime);
            }

            Path curLoopDirPath = CrawlDirUtils.makeLoopDir(fs, outputPath, curLoop);
            String curLoopDirName = curLoopDirPath.getName();
            setLoopLoggerFile(logsDir + curLoopDirName, curLoop);

            Flow flow = DemoCrawlWorkflow.createFlow(curLoopDirPath, crawlDbPath, defaultPolicy, userAgent,
                    urlFilter, options);
            flow.complete();

            // Writing out .dot files is a good way to verify your flows.
            // flow.writeDOT("build/valid-flow.dot");

            // Update crawlDbPath to point to the latest crawl db
            crawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
        }
    } catch (PlannerException e) {
        // Dump the failed plan so it can be inspected, then exit with an error status.
        e.writeDOT("build/failed-flow.dot");
        System.err.println("PlannerException: " + e.getMessage());
        e.printStackTrace(System.err);
        System.exit(-1);
    } catch (Throwable t) {
        System.err.println("Exception running tool: " + t.getMessage());
        t.printStackTrace(System.err);
        System.exit(-1);
    }
}
From source file:bixo.examples.crawl.DemoStatusTool.java
License:Apache License
public static void main(String[] args) { DemoStatusToolOptions options = new DemoStatusToolOptions(); CmdLineParser parser = new CmdLineParser(options); try {/*www . j a v a 2 s . co m*/ parser.parseArgument(args); } catch (CmdLineException e) { System.err.println(e.getMessage()); printUsageAndExit(parser); } String crawlDirName = options.getWorkingDir(); try { JobConf conf = new JobConf(); Path crawlDirPath = new Path(crawlDirName); FileSystem fs = crawlDirPath.getFileSystem(conf); if (!fs.exists(crawlDirPath)) { System.err.println("Prior crawl output directory does not exist: " + crawlDirName); System.exit(-1); } // Skip Hadoop/Cascading DEBUG messages. Logger.getRootLogger().setLevel(Level.INFO); boolean exportDb = options.isExportDb(); if (exportDb) { Path latestCrawlDirPath = CrawlDirUtils.findLatestLoopDir(fs, crawlDirPath); processCrawlDb(conf, latestCrawlDirPath, exportDb); } else { int prevLoop = -1; Path curDirPath = null; while ((curDirPath = CrawlDirUtils.findNextLoopDir(fs, crawlDirPath, prevLoop)) != null) { String curDirName = curDirPath.toUri().toString(); LOGGER.info(""); LOGGER.info("================================================================"); LOGGER.info("Processing " + curDirName); LOGGER.info("================================================================"); int curLoop = CrawlDirUtils.extractLoopNumber(curDirPath); if (curLoop != prevLoop + 1) { LOGGER.warn(String.format("Missing directories between %d and %d", prevLoop, curLoop)); } prevLoop = curLoop; // Process the status and crawldb in curPath processStatus(conf, curDirPath); processCrawlDb(conf, curDirPath, exportDb); } } } catch (Throwable t) { LOGGER.error("Exception running tool", t); System.exit(-1); } }
From source file:bixo.examples.crawl.JDBCCrawlTool.java
License:Apache License
public static void main(String[] args) { JDBCCrawlToolOptions options = new JDBCCrawlToolOptions(); CmdLineParser parser = new CmdLineParser(options); try {//w w w . j ava 2 s . co m parser.parseArgument(args); } catch (CmdLineException e) { System.err.println(e.getMessage()); printUsageAndExit(parser); } // Before we get too far along, see if the domain looks valid. String domain = options.getDomain(); if (domain != null) { validateDomain(domain, parser); } String outputDirName = options.getOutputDir(); if (options.isDebugLogging()) { System.setProperty("bixo.root.level", "DEBUG"); } else { System.setProperty("bixo.root.level", "INFO"); } if (options.getLoggingAppender() != null) { // Set console vs. DRFA vs. something else System.setProperty("bixo.appender", options.getLoggingAppender()); } String logsDir = options.getLogsDir(); if (!logsDir.endsWith("/")) { logsDir = logsDir + "/"; } try { JobConf conf = new JobConf(); Path outputPath = new Path(outputDirName); FileSystem fs = outputPath.getFileSystem(conf); // See if the user is starting from scratch if (options.getDbLocation() == null) { if (fs.exists(outputPath)) { System.out.println("Warning: Previous cycle output dirs exist in : " + outputDirName); System.out.println("Warning: Delete the output dir before running"); fs.delete(outputPath, true); } } else { Path dbLocationPath = new Path(options.getDbLocation()); if (!fs.exists(dbLocationPath)) { fs.mkdirs(dbLocationPath); } } if (!fs.exists(outputPath)) { fs.mkdirs(outputPath); Path curLoopDir = CrawlDirUtils.makeLoopDir(fs, outputPath, 0); String curLoopDirName = curLoopDir.getName(); setLoopLoggerFile(logsDir + curLoopDirName, 0); if (domain == null) { System.err.println("For a new crawl the domain needs to be specified" + domain); printUsageAndExit(parser); } importOneDomain(domain, JDBCTapFactory.createUrlsSinkJDBCTap(options.getDbLocation()), conf); } Path inputPath = CrawlDirUtils.findLatestLoopDir(fs, outputPath); if (inputPath == null) { 
System.err.println("No previous cycle output dirs exist in " + outputDirName); printUsageAndExit(parser); } int startLoop = CrawlDirUtils.extractLoopNumber(inputPath); int endLoop = startLoop + options.getNumLoops(); UserAgent userAgent = new UserAgent(options.getAgentName(), CrawlConfig.EMAIL_ADDRESS, CrawlConfig.WEB_ADDRESS); FetcherPolicy defaultPolicy = new FetcherPolicy(); defaultPolicy.setCrawlDelay(CrawlConfig.DEFAULT_CRAWL_DELAY); defaultPolicy.setMaxContentSize(CrawlConfig.MAX_CONTENT_SIZE); defaultPolicy.setFetcherMode(FetcherMode.EFFICIENT); int crawlDurationInMinutes = options.getCrawlDuration(); boolean hasEndTime = crawlDurationInMinutes != JDBCCrawlToolOptions.NO_CRAWL_DURATION; long targetEndTime = hasEndTime ? System.currentTimeMillis() + (crawlDurationInMinutes * CrawlConfig.MILLISECONDS_PER_MINUTE) : FetcherPolicy.NO_CRAWL_END_TIME; // By setting up a url filter we only deal with urls that we want to // instead of all the urls that we extract. BaseUrlFilter urlFilter = null; List<String> patterns = null; String regexUrlFiltersFile = options.getRegexUrlFiltersFile(); if (regexUrlFiltersFile != null) { patterns = RegexUrlFilter.getUrlFilterPatterns(regexUrlFiltersFile); } else { patterns = RegexUrlFilter.getDefaultUrlFilterPatterns(); if (domain != null) { String domainPatterStr = "+(?i)^(http|https)://([a-z0-9]*\\.)*" + domain; patterns.add(domainPatterStr); } else { String protocolPatterStr = "+(?i)^(http|https)://*"; patterns.add(protocolPatterStr); //Log.warn("Defaulting to basic url regex filtering (just suffix and protocol"); } } urlFilter = new RegexUrlFilter(patterns.toArray(new String[patterns.size()])); // Now we're ready to start looping, since we've got our current settings for (int curLoop = startLoop + 1; curLoop <= endLoop; curLoop++) { // Adjust target end time, if appropriate. 
if (hasEndTime) { int remainingLoops = (endLoop - curLoop) + 1; long now = System.currentTimeMillis(); long perLoopTime = (targetEndTime - now) / remainingLoops; defaultPolicy.setCrawlEndTime(now + perLoopTime); } Path curLoopDir = CrawlDirUtils.makeLoopDir(fs, outputPath, curLoop); String curLoopDirName = curLoopDir.getName(); setLoopLoggerFile(logsDir + curLoopDirName, curLoop); Flow flow = JDBCCrawlWorkflow.createFlow(inputPath, curLoopDir, userAgent, defaultPolicy, urlFilter, options.getMaxThreads(), options.isDebugLogging(), options.getDbLocation()); flow.complete(); // flow.writeDOT("build/valid-flow.dot"); // Input for the next round is our current output inputPath = curLoopDir; } } catch (PlannerException e) { e.writeDOT("build/failed-flow.dot"); System.err.println("PlannerException: " + e.getMessage()); e.printStackTrace(System.err); System.exit(-1); } catch (Throwable t) { System.err.println("Exception running tool: " + t.getMessage()); t.printStackTrace(System.err); System.exit(-1); } JDBCTapFactory.shutdown(); }
From source file:bixo.examples.crawl.JDBCCrawlWorkflow.java
License:Apache License
/**
 * Assembles the Cascading flow for one loop of the JDBC-backed crawl.
 *
 * <p>Urls are read from the JDBC source tap, grouped by url and reduced with
 * {@code BestUrlToFetchBuffer}, then fetched and parsed. Fetched content, parse output
 * and fetch status are written to sub-directories of {@code curLoopDirPath}; the urls
 * derived from fetch status and from outlinks are merged and written to the JDBC sink tap.
 *
 * @param inputDir directory that must exist before the flow is built
 * @param curLoopDirPath directory that receives this loop's content, parse and status output
 * @param userAgent user agent passed to the fetcher
 * @param fetcherPolicy policy passed to the fetcher
 * @param urlFilter filter applied to urls extracted from outlinks
 * @param maxThreads thread count passed to the fetcher
 * @param debug passed to {@code HadoopUtils.getDefaultProperties}
 * @param persistentDbLocation location passed to {@code JDBCTapFactory} for both taps
 * @return the connected, not yet completed, flow
 * @throws IllegalStateException if {@code inputDir} does not exist
 * @throws Throwable anything thrown while building the flow
 */
public static Flow createFlow(Path inputDir, Path curLoopDirPath, UserAgent userAgent,
        FetcherPolicy fetcherPolicy, BaseUrlFilter urlFilter, int maxThreads, boolean debug,
        String persistentDbLocation) throws Throwable {
    JobConf conf = HadoopUtils.getDefaultJobConf(CrawlConfig.CRAWL_STACKSIZE_KB);
    int numReducers = HadoopUtils.getNumReducers(conf);
    conf.setNumReduceTasks(numReducers);

    // NOTE(review): the file system is resolved from curLoopDirPath but used to check
    // inputDir - assumes both live on the same file system; confirm.
    FileSystem fs = curLoopDirPath.getFileSystem(conf);
    if (!fs.exists(inputDir)) {
        throw new IllegalStateException(String.format("Input directory %s doesn't exist", inputDir));
    }

    Tap inputSource = JDBCTapFactory.createUrlsSourceJDBCTap(persistentDbLocation);

    // Read _everything_ in initially
    // Group on the url, and select the best urls to fetch
    Pipe importPipe = new Pipe("url importer");
    importPipe = new GroupBy(importPipe, new Fields(CrawlDbDatum.URL_FIELD));
    importPipe = new Every(importPipe, new BestUrlToFetchBuffer(), Fields.RESULTS);

    // Per-loop file sinks for fetched content, parse output and fetch status.
    Path contentPath = new Path(curLoopDirPath, CrawlConfig.CONTENT_SUBDIR_NAME);
    Tap contentSink = new Hfs(new SequenceFile(FetchedDatum.FIELDS), contentPath.toString());

    Path parsePath = new Path(curLoopDirPath, CrawlConfig.PARSE_SUBDIR_NAME);
    Tap parseSink = new Hfs(new SequenceFile(ParsedDatum.FIELDS), parsePath.toString());

    Path statusDirPath = new Path(curLoopDirPath, CrawlConfig.STATUS_SUBDIR_NAME);
    Tap statusSink = new Hfs(new TextLine(), statusDirPath.toString());

    // NOTE: The source and sink for CrawlDbDatums is essentially the same database -
    // since cascading doesn't allow you to use the same tap for source and
    // sink we fake it by creating two separate taps.
    Tap urlSink = JDBCTapFactory.createUrlsSinkJDBCTap(persistentDbLocation);

    // Create the sub-assembly that runs the fetch job
    BaseFetcher fetcher = new SimpleHttpFetcher(maxThreads, fetcherPolicy, userAgent);
    BaseScoreGenerator scorer = new FixedScoreGenerator();
    FetchPipe fetchPipe = new FetchPipe(importPipe, scorer, fetcher, numReducers);

    Pipe statusPipe = new Pipe("status pipe", fetchPipe.getStatusTailPipe());

    // Take content and split it into content output plus parse to extract URLs.
    ParsePipe parsePipe = new ParsePipe(fetchPipe.getContentTailPipe(), new SimpleParser());
    Pipe urlFromOutlinksPipe = new Pipe("url from outlinks", parsePipe.getTailPipe());
    urlFromOutlinksPipe = new Each(urlFromOutlinksPipe,
            new CreateUrlDatumFromOutlinksFunction(new SimpleUrlNormalizer(), new SimpleUrlValidator()));
    urlFromOutlinksPipe = new Each(urlFromOutlinksPipe, new UrlFilter(urlFilter));
    urlFromOutlinksPipe = new Each(urlFromOutlinksPipe, new NormalizeUrlFunction(new SimpleUrlNormalizer()));

    // Take status and output updated UrlDatum's. Again, since we are using
    // the same database we need to create a new tap.
    Pipe urlFromFetchPipe = new Pipe("url from fetch", fetchPipe.getStatusTailPipe());
    urlFromFetchPipe = new Each(urlFromFetchPipe, new CreateUrlDatumFromStatusFunction());

    // Now we need to join the URLs we get from parsing content with the
    // URLs we got from the status output, so we have a unified stream
    // of all known URLs.
    Pipe urlPipe = new GroupBy("url pipe", Pipe.pipes(urlFromFetchPipe, urlFromOutlinksPipe),
            new Fields(UrlDatum.URL_FN));
    urlPipe = new Every(urlPipe, new LatestUrlDatumBuffer(), Fields.RESULTS);

    // NOTE(review): the "output pipe" instance is overwritten on the next line before
    // it is used, so the sink below is keyed by whatever name the Each reports - confirm intent.
    Pipe outputPipe = new Pipe("output pipe");
    outputPipe = new Each(urlPipe, new CreateCrawlDbDatumFromUrlFunction());

    // Create the output map that connects each tail pipe to the appropriate sink.
    Map<String, Tap> sinkMap = new HashMap<String, Tap>();
    sinkMap.put(statusPipe.getName(), statusSink);
    sinkMap.put(FetchPipe.CONTENT_PIPE_NAME, contentSink);
    sinkMap.put(ParsePipe.PARSE_PIPE_NAME, parseSink);
    sinkMap.put(outputPipe.getName(), urlSink);

    // Finally we can connect the flow; the caller is responsible for running it.
    FlowConnector flowConnector = new FlowConnector(
            HadoopUtils.getDefaultProperties(JDBCCrawlWorkflow.class, debug, conf));
    return flowConnector.connect(inputSource, sinkMap, statusPipe, fetchPipe.getContentTailPipe(),
            parsePipe.getTailPipe(), outputPipe);
}
From source file:bixo.examples.crawl.MultiDomainUrlFilter.java
License:Apache License
public MultiDomainUrlFilter(Path filterFile) throws Exception { //we could require a filter file and put these in all urls or leave them here _suffixExclusionPattern = Pattern.compile("(?i)\\.(pdf|zip|gzip|gz|sit|bz|bz2|tar|tgz|exe)$"); _protocolInclusionPattern = Pattern.compile("(?i)^(http|https)://"); JobConf conf = HadoopUtils.getDefaultJobConf(); try {//process the file passed in if (filterFile != null) { FileSystem fs = filterFile.getFileSystem(conf); if (fs.exists(filterFile)) { FSDataInputStream in = fs.open(filterFile); LineReader lr = new LineReader(in); Text tmpStr = new Text(); while (lr.readLine(tmpStr) > 0 && !tmpStr.toString().equals("")) {//skip blank lines String p = tmpStr.toString().trim();//remove whitespace if (p.substring(0, 1).equals("+")) {// '+' means do-crawl ArrayList filterPair = new ArrayList(); filterPair.add((Boolean) true); filterPair.add(Pattern.compile(p.substring(1, p.length()))); _filters.add(filterPair); } else if (p.substring(0, 1).equals("-")) {// '-' means filter out ArrayList filterPair = new ArrayList(); filterPair.add(new Boolean(false)); filterPair.add(Pattern.compile(p.substring(1, p.length()))); _filters.add(filterPair); } // otherwise a comment or malformed filter pattern }/*from ww w . j ava2 s.co m*/ } } } catch (Exception e) { //any cleanup here? This would indicate a file system error, most likely throw e; } }
From source file:bixo.examples.crawl.RegexUrlFilter.java
License:Apache License
public static List<String> getUrlFilterPatterns(String urlFiltersFile) throws IOException, InterruptedException { //this reads regex filters from a file in HDFS or the native file system JobConf conf = HadoopUtils.getDefaultJobConf(); Path filterFile = new Path(urlFiltersFile); FileSystem fs = filterFile.getFileSystem(conf); List<String> filterList = new ArrayList<String>(); LOGGER.info("Looking for file: " + urlFiltersFile); if (fs.exists(filterFile)) { FSDataInputStream in = fs.open(filterFile); LineReader reader = new LineReader(in); Text tLine = new Text(); while (reader.readLine(tLine) > 0) { String line = tLine.toString(); if (StringUtils.isNotBlank(line) && (line.startsWith(INCLUDE_CHAR) || line.startsWith(EXCLUDE_CHAR))) { filterList.add(line.trim()); }/*from w w w. j a va2 s . c o m*/ } in.close(); } else { LOGGER.info("Can't find file: " + urlFiltersFile); } return filterList; }
From source file:bixo.examples.crawl.SimpleCrawlTool.java
License:Apache License
/**
 * Command-line entry point for the simple crawl.
 *
 * <p>Parses the options, requires either a domain or a urls file, sets the
 * {@code bixo.root.level} / {@code bixo.appender} system properties, seeds a loop-0
 * crawl db when the output directory does not exist, and then runs
 * {@code options.getNumLoops()} crawl loops. Each loop builds and completes a
 * {@code SimpleCrawlWorkflow} flow and feeds its crawl db into the next loop.
 * Any failure is printed to stderr and the JVM exits with -1.
 *
 * @param args command-line arguments parsed into {@code SimpleCrawlToolOptions}
 */
public static void main(String[] args) {
    SimpleCrawlToolOptions options = new SimpleCrawlToolOptions();
    CmdLineParser parser = new CmdLineParser(options);

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        // presumably terminates the JVM (per its name) - TODO confirm
        printUsageAndExit(parser);
    }

    // Before we get too far along, see if the domain looks valid.
    String domain = options.getDomain();
    String urlsFile = options.getUrlsFile();
    if (domain != null) {
        validateDomain(domain, parser);
    } else {
        if (urlsFile == null) {
            System.err.println(
                    "Either a target domain should be specified or a file with a list of urls needs to be provided");
            printUsageAndExit(parser);
        }
    }

    // The domain wins when both inputs are supplied (see the seeding branch below).
    if (domain != null && urlsFile != null) {
        System.out.println("Warning: Both domain and urls file list provided - using domain");
    }

    String outputDirName = options.getOutputDir();
    if (options.isDebugLogging()) {
        System.setProperty("bixo.root.level", "DEBUG");
    } else {
        System.setProperty("bixo.root.level", "INFO");
    }

    if (options.getLoggingAppender() != null) {
        // Set console vs. DRFA vs. something else
        System.setProperty("bixo.appender", options.getLoggingAppender());
    }

    try {
        JobConf conf = new JobConf();
        Path outputPath = new Path(outputDirName);
        FileSystem fs = outputPath.getFileSystem(conf);

        // If the output directory does not exist the user is starting from scratch:
        // set up the output directory and create an initial crawl db.
        if (!fs.exists(outputPath)) {
            fs.mkdirs(outputPath);

            // Create a "0-<timestamp>" sub-directory with just a crawl db subdir.
            // The crawl db is seeded from the target domain or from the urls file.
            Path curLoopDir = CrawlDirUtils.makeLoopDir(fs, outputPath, 0);
            String curLoopDirName = curLoopDir.toUri().toString();
            setLoopLoggerFile(curLoopDirName, 0);

            Path crawlDbPath = new Path(curLoopDir, CrawlConfig.CRAWLDB_SUBDIR_NAME);
            if (domain != null) {
                importOneDomain(domain, crawlDbPath, conf);
            } else {
                importUrls(urlsFile, crawlDbPath);
            }
        }

        // Resume from the most recent loop directory (the one just created on a fresh start).
        Path latestDirPath = CrawlDirUtils.findLatestLoopDir(fs, outputPath);
        if (latestDirPath == null) {
            System.err.println("No previous cycle output dirs exist in " + outputDirName);
            printUsageAndExit(parser);
        }

        Path crawlDbPath = new Path(latestDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

        // Set up the start and end loop counts.
        int startLoop = CrawlDirUtils.extractLoopNumber(latestDirPath);
        int endLoop = startLoop + options.getNumLoops();

        // Set up the UserAgent for the fetcher.
        UserAgent userAgent = new UserAgent(options.getAgentName(), CrawlConfig.EMAIL_ADDRESS,
                CrawlConfig.WEB_ADDRESS);

        // You also get to customize the FetcherPolicy
        FetcherPolicy defaultPolicy = new FetcherPolicy();
        defaultPolicy.setCrawlDelay(CrawlConfig.DEFAULT_CRAWL_DELAY);
        defaultPolicy.setMaxContentSize(CrawlConfig.MAX_CONTENT_SIZE);
        defaultPolicy.setFetcherMode(FetcherMode.EFFICIENT);

        // It is a good idea to set up a crawl duration when running long crawls as you may
        // end up in situations where the fetch slows down due to a 'long tail' and by
        // specifying a crawl duration you know exactly when the crawl will end.
        int crawlDurationInMinutes = options.getCrawlDuration();
        boolean hasEndTime = crawlDurationInMinutes != SimpleCrawlToolOptions.NO_CRAWL_DURATION;
        long targetEndTime = hasEndTime
                ? System.currentTimeMillis() + (crawlDurationInMinutes * CrawlConfig.MILLISECONDS_PER_MINUTE)
                : FetcherPolicy.NO_CRAWL_END_TIME;

        // By setting up a url filter we only deal with urls that we want to
        // instead of all the urls that we extract. No filter is used without a domain.
        BaseUrlFilter urlFilter = null;
        if (domain != null) {
            urlFilter = new DomainUrlFilter(domain);
        }

        // OK, now we're ready to start looping, since we've got our current
        // settings
        for (int curLoop = startLoop + 1; curLoop <= endLoop; curLoop++) {

            // Adjust target end time, if appropriate: the remaining time is split
            // evenly over the loops that are still to run.
            if (hasEndTime) {
                int remainingLoops = (endLoop - curLoop) + 1;
                long now = System.currentTimeMillis();
                long perLoopTime = (targetEndTime - now) / remainingLoops;
                defaultPolicy.setCrawlEndTime(now + perLoopTime);
            }

            Path curLoopDirPath = CrawlDirUtils.makeLoopDir(fs, outputPath, curLoop);
            String curLoopDirName = curLoopDirPath.toUri().toString();
            setLoopLoggerFile(curLoopDirName, curLoop);

            Flow flow = SimpleCrawlWorkflow.createFlow(curLoopDirPath, crawlDbPath, defaultPolicy, userAgent,
                    urlFilter, options);
            flow.complete();

            // Writing out .dot files is a good way to verify your flows.
            // flow.writeDOT("build/valid-flow.dot");

            // Update crawlDbPath to point to the latest crawl db
            crawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
        }
    } catch (PlannerException e) {
        // Dump the failed plan so it can be inspected, then exit with an error status.
        e.writeDOT("build/failed-flow.dot");
        System.err.println("PlannerException: " + e.getMessage());
        e.printStackTrace(System.err);
        System.exit(-1);
    } catch (Throwable t) {
        System.err.println("Exception running tool: " + t.getMessage());
        t.printStackTrace(System.err);
        System.exit(-1);
    }
}
From source file:bixo.examples.crawl.SimpleCrawlWorkflow.java
License:Apache License
public static Flow createFlow(Path curWorkingDirPath, Path crawlDbPath, FetcherPolicy fetcherPolicy, UserAgent userAgent, BaseUrlFilter urlFilter, SimpleCrawlToolOptions options) throws Throwable { JobConf conf = HadoopUtils.getDefaultJobConf(CrawlConfig.CRAWL_STACKSIZE_KB); int numReducers = HadoopUtils.getNumReducers(conf); conf.setNumReduceTasks(numReducers); Properties props = HadoopUtils.getDefaultProperties(SimpleCrawlWorkflow.class, options.isDebugLogging(), conf);// www.j a v a 2s. c o m FileSystem fs = curWorkingDirPath.getFileSystem(conf); // Input : the crawldb if (!fs.exists(crawlDbPath)) { throw new RuntimeException("CrawlDb not found"); } // Our crawl db is defined by the CrawlDbDatum Tap inputSource = new Hfs(new SequenceFile(CrawlDbDatum.FIELDS), crawlDbPath.toString()); Pipe importPipe = new Pipe("import pipe"); // Split into tuples that are to be fetched and that have already been fetched SplitterAssembly splitter = new SplitterAssembly(importPipe, new SplitFetchedUnfetchedCrawlDatums()); Pipe finishedDatumsFromDb = splitter.getRHSPipe(); Pipe urlsToFetchPipe = new Pipe("urls to Fetch", splitter.getLHSPipe()); // Convert the urlsToFetchPipe so that we now deal with UrlDatums. urlsToFetchPipe = new Each(urlsToFetchPipe, new CreateUrlDatumFromCrawlDbFunction()); // A TupleLogger is a good way to follow the tuples around in a flow. You can enable the output // of tuples by setting options.setDebugLogging() to true. 
urlsToFetchPipe = TupleLogger.makePipe(urlsToFetchPipe, true); // Create the output sinks : // crawldb // content // parse // status Path outCrawlDbPath = new Path(curWorkingDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME); Tap loopCrawldbSink = new Hfs(new SequenceFile(CrawlDbDatum.FIELDS), outCrawlDbPath.toString()); Path contentDirPath = new Path(curWorkingDirPath, CrawlConfig.CONTENT_SUBDIR_NAME); Tap contentSink = new Hfs(new SequenceFile(FetchedDatum.FIELDS), contentDirPath.toString()); Path parseDirPath = new Path(curWorkingDirPath, CrawlConfig.PARSE_SUBDIR_NAME); Tap parseSink = new Hfs(new SequenceFile(ParsedDatum.FIELDS), parseDirPath.toString()); Path statusDirPath = new Path(curWorkingDirPath, CrawlConfig.STATUS_SUBDIR_NAME); Tap statusSink = new Hfs(new TextLine(), statusDirPath.toString()); Path productsDirPath = new Path(curWorkingDirPath, CrawlConfig.PRODUCTS_SUBDIR_NAME); Tap productsSink = new Hfs(new TextLine(), productsDirPath.toString()); // Tap productsSink = new Hfs(new TextLine(ProductDatum.FIELDS), productsDirPath.toString()); // Create the sub-assembly that runs the fetch job SimpleHttpFetcher fetcher = new SimpleHttpFetcher(options.getMaxThreads(), fetcherPolicy, userAgent); fetcher.setMaxRetryCount(CrawlConfig.MAX_RETRIES); fetcher.setSocketTimeout(CrawlConfig.SOCKET_TIMEOUT); fetcher.setConnectionTimeout(CrawlConfig.CONNECTION_TIMEOUT); // You can also provide a set of mime types you want to restrict what content type you // want to deal with - for now keep it simple. Set<String> validMimeTypes = new HashSet<String>(); validMimeTypes.add("text/plain"); validMimeTypes.add("text/html"); fetcherPolicy.setValidMimeTypes(validMimeTypes); // The scorer is used by the FetchPipe to assign a score to every URL that passes the // robots.txt processing. The score is used to sort URLs such that higher scoring URLs // are fetched first. If URLs are skipped for any reason(s) lower scoring URLs are skipped. 
BaseScoreGenerator scorer = new FixedScoreGenerator(); FetchPipe fetchPipe = new FetchPipe(urlsToFetchPipe, scorer, fetcher, numReducers); Pipe statusPipe = new Pipe("status pipe", fetchPipe.getStatusTailPipe()); Pipe contentPipe = new Pipe("content pipe", fetchPipe.getContentTailPipe()); contentPipe = TupleLogger.makePipe(contentPipe, true); // Take content and split it into content output plus parse to extract URLs. SimpleParser parser = new SimpleParser(); parser.setExtractLanguage(false); ParsePipe parsePipe = new ParsePipe(contentPipe, parser); Pipe productsPipe = new Pipe("products pipe", parsePipe); // PRECIOUS Pipe productsPipe = new Pipe("products pipe", fetchPipe.getContentTailPipe()); String regex = "[a-z]+@[a-z]+.[a-z]+"; // WAS: String regex = "[\\w\\-]([\\.\\w])+[\\w]+@([\\w\\-]+\\.)+[A-Z]{2,4}"; Function emailExtractor = new RegexGenerator(new Fields("email"), regex); productsPipe = new Each(productsPipe, emailExtractor); // PRECIOUS productsPipe = new Each(productsPipe, new CreateProductDatumsFunction()); productsPipe = TupleLogger.makePipe(productsPipe, true); Pipe urlFromOutlinksPipe = new Pipe("url from outlinks", parsePipe.getTailPipe()); urlFromOutlinksPipe = new Each(urlFromOutlinksPipe, new CreateUrlDatumFromOutlinksFunction()); if (urlFilter != null) { urlFromOutlinksPipe = new Each(urlFromOutlinksPipe, new UrlFilter(urlFilter)); } urlFromOutlinksPipe = new Each(urlFromOutlinksPipe, new NormalizeUrlFunction(new SimpleUrlNormalizer())); urlFromOutlinksPipe = TupleLogger.makePipe(urlFromOutlinksPipe, true); // Take status and output urls from it Pipe urlFromFetchPipe = new Pipe("url from fetch"); urlFromFetchPipe = new Each(statusPipe, new CreateUrlDatumFromStatusFunction()); urlFromFetchPipe = TupleLogger.makePipe(urlFromFetchPipe, true); // Finally join the URLs we get from parsing content with the URLs we got // from the status ouput, and the urls we didn't process from the db so that // we have a unified stream of all known URLs for the 
crawldb. Pipe finishedUrlsFromDbPipe = new Each(finishedDatumsFromDb, new CreateUrlDatumFromCrawlDbFunction()); finishedUrlsFromDbPipe = TupleLogger.makePipe(finishedUrlsFromDbPipe, true); // NOTE : Ideally you would just do a CoGroup instead of converting all the pipes to emit UrlDatums // and then doing the extra step of converting from UrlDatum to CrawlDbDatum. // The reason this isn't being done here is because we are sharing LatestUrlDatumBuffer() with JDBCCrawlTool Pipe crawlDbPipe = new GroupBy("crawldb pipe", Pipe.pipes(urlFromFetchPipe, urlFromOutlinksPipe, finishedUrlsFromDbPipe), new Fields(UrlDatum.URL_FN)); crawlDbPipe = new Every(crawlDbPipe, new LatestUrlDatumBuffer(), Fields.RESULTS); Pipe outputPipe = new Pipe("output pipe"); outputPipe = new Each(crawlDbPipe, new CreateCrawlDbDatumFromUrlFunction()); // Create the output map that connects each tail pipe to the appropriate sink. Map<String, Tap> sinkMap = new HashMap<String, Tap>(); sinkMap.put(statusPipe.getName(), statusSink); sinkMap.put(contentPipe.getName(), contentSink); sinkMap.put(ParsePipe.PARSE_PIPE_NAME, parseSink); sinkMap.put(crawlDbPipe.getName(), loopCrawldbSink); sinkMap.put(productsPipe.getName(), productsSink); FlowConnector flowConnector = new FlowConnector(props); Flow flow = flowConnector.connect(inputSource, sinkMap, statusPipe, contentPipe, parsePipe.getTailPipe(), outputPipe); return flow; }