List of usage examples for org.apache.hadoop.fs.FileSystem exists
public boolean exists(Path f) throws IOException
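Before the examples from real projects, here is a minimal sketch of the typical check-before-use pattern. The class name HdfsExistsExample and the path /tmp/example.txt are illustrative assumptions, not taken from any of the examples below.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class HdfsExistsExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        // Resolves to the file system named by fs.defaultFS (HDFS on a cluster, local FS otherwise)
        FileSystem fs = FileSystem.get(conf);
        Path path = new Path("/tmp/example.txt");   // illustrative path

        if (fs.exists(path)) {
            System.out.println(path + " exists");
        } else {
            System.out.println(path + " does not exist");
        }
    }
}

Note that exists() is only a point-in-time probe: another process can create or delete the path between the check and the following operation, which is why the deleteIfExists example below also catches FileNotFoundException instead of relying on the check alone.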
From source file:com.facebook.presto.hive.metastore.SemiTransactionalHiveMetastore.java
License:Apache License
/**
 * Attempt to recursively remove eligible files and/or directories in {@code directory}.
 *
 * When {@code filePrefixes} is not present, all files (but not necessarily directories) will be
 * ineligible. If all files shall be deleted, you can use an empty string as {@code filePrefixes}.
 *
 * When {@code deleteEmptyDirectories} is true, any empty directory (including directories that
 * were originally empty, and directories that become empty after files prefixed with
 * {@code filePrefixes} are deleted) will be eligible.
 *
 * This method will not delete anything that's neither a directory nor a file.
 *
 * @param filePrefixes prefixes of files that should be deleted
 * @param deleteEmptyDirectories whether empty directories should be deleted
 */
private static RecursiveDeleteResult recursiveDeleteFiles(HdfsEnvironment hdfsEnvironment, String user,
        Path directory, List<String> filePrefixes, boolean deleteEmptyDirectories) {
    FileSystem fileSystem;
    try {
        fileSystem = hdfsEnvironment.getFileSystem(user, directory);

        if (!fileSystem.exists(directory)) {
            return new RecursiveDeleteResult(true, ImmutableList.of());
        }
    } catch (IOException e) {
        ImmutableList.Builder<String> notDeletedItems = ImmutableList.builder();
        notDeletedItems.add(directory.toString() + "/**");
        return new RecursiveDeleteResult(false, notDeletedItems.build());
    }

    return doRecursiveDeleteFiles(fileSystem, directory, filePrefixes, deleteEmptyDirectories);
}
From source file:com.facebook.presto.hive.metastore.SemiTransactionalHiveMetastore.java
License:Apache License
/**
 * Attempts to remove the file or empty directory.
 *
 * @return true if the location no longer exists
 */
private static boolean deleteIfExists(FileSystem fileSystem, Path path, boolean recursive) {
    try {
        // attempt to delete the path
        if (fileSystem.delete(path, recursive)) {
            return true;
        }

        // delete failed
        // check if path still exists
        return !fileSystem.exists(path);
    } catch (FileNotFoundException ignored) {
        // path was already removed or never existed
        return true;
    } catch (IOException ignored) {
    }
    return false;
}
From source file:com.fanlehai.hadoop.serialize.json.multiline.ExampleJob.java
License:Apache License
/**
 * Writes the contents of {@link #JSON} into a file in the job input
 * directory in HDFS.
 *
 * @param conf
 *            the Hadoop config
 * @param inputDir
 *            the HDFS input directory where we'll write a file
 * @throws IOException
 *             if something goes wrong
 */
public static void writeInput(Configuration conf, Path inputDir) throws IOException {
    FileSystem fs = FileSystem.get(conf);

    if (fs.exists(inputDir)) {
        // throw new IOException(
        //         String.format("Input directory '%s' exists - please remove and rerun this example", inputDir));
        fs.delete(inputDir, true);
    }

    OutputStreamWriter writer = new OutputStreamWriter(fs.create(new Path(inputDir, "input.txt")));
    writer.write(JSON);
    IOUtils.closeStream(writer);
}
From source file:com.finderbots.miner.MinerWorkflow.java
License:Apache License
public static Flow createWebMiningWorkflow(Path crawlDbPath, Path curLoopDirPath, FetcherPolicy fetcherPolicy,
        UserAgent userAgent, MinerOptions options, BaseUrlFilter crawlUrlFilter, BaseUrlFilter mineUrlFilter)
        throws IOException, InterruptedException {

    // Fetch at most 200 pages, max size of 128K, complete mode, from the current dir.
    // HTML only.

    // We want to extract the cleaned up HTML, and pass that to the parser, which will
    // be specified via options.getAnalyzer. From this we'll get outlinks, page score, and
    // any results.
    JobConf conf = HadoopUtils.getDefaultJobConf(CrawlConfig.CRAWL_STACKSIZE_KB);
    boolean isLocal = HadoopUtils.isJobLocal(conf);
    int numReducers = HadoopUtils.getNumReducers(conf);
    conf.setNumReduceTasks(numReducers);
    conf.setInt("mapred.min.split.size", 64 * 1024 * 1024);
    Properties props = HadoopUtils.getDefaultProperties(MinerWorkflow.class, false, conf);
    FileSystem fs = crawlDbPath.getFileSystem(conf);

    // Input : the crawldb
    if (!fs.exists(crawlDbPath)) {
        throw new RuntimeException("CrawlDb not found");
    }

    // Tap inputSource = new Hfs(new TextDelimited(CrawlDbDatum.FIELDS, "\t", CrawlDbDatum.TYPES), crawlDbPath.toString());
    Tap inputSource = new Hfs(new SequenceFile(CrawlDbDatum.FIELDS), crawlDbPath.toString(), true);
    Pipe importPipe = new Pipe("import pipe");

    // Split into tuples that are to be fetched and that have already been fetched
    SplitterAssembly splitter = new SplitterAssembly(importPipe, new SplitFetchedUnfetchedSSCrawlDatums());

    Pipe finishedDatumsFromDb = new Pipe("finished datums from db", splitter.getRHSPipe());
    Pipe urlsToFetchPipe = splitter.getLHSPipe();

    // Limit to MAX_DISTRIBUTED_FETCH if running in real cluster,
    // or MAX_LOCAL_FETCH if running locally. So first we sort the entries
    // from high to low by links score.
    // TODO add unit test
    urlsToFetchPipe = new GroupBy(urlsToFetchPipe, new Fields(CrawlDbDatum.LINKS_SCORE_FIELD), true);
    long maxToFetch = HadoopUtils.isJobLocal(conf) ? MAX_LOCAL_FETCH : MAX_DISTRIBUTED_FETCH;
    urlsToFetchPipe = new Each(urlsToFetchPipe, new CreateUrlDatumFromCrawlDbDatum(maxToFetch));

    // Create the sub-assembly that runs the fetch job
    int maxThreads = isLocal ? CrawlConfig.DEFAULT_NUM_THREADS_LOCAL : CrawlConfig.DEFAULT_NUM_THREADS_CLUSTER;
    SimpleHttpFetcher fetcher = new SimpleHttpFetcher(maxThreads, fetcherPolicy, userAgent);
    fetcher.setMaxRetryCount(CrawlConfig.MAX_RETRIES);
    fetcher.setSocketTimeout(CrawlConfig.SOCKET_TIMEOUT);
    fetcher.setConnectionTimeout(CrawlConfig.CONNECTION_TIMEOUT);

    // The scorer is used by the FetchPipe to assign a score to every URL that passes the
    // robots.txt processing. The score is used to sort URLs such that higher scoring URLs
    // are fetched first. If URLs are skipped for any reason(s) lower scoring URLs are skipped.
    BaseScoreGenerator scorer = new FixedScoreGenerator();

    FetchPipe fetchPipe = new FetchPipe(urlsToFetchPipe, scorer, fetcher, numReducers);
    Pipe statusPipe = new Pipe("status pipe", fetchPipe.getStatusTailPipe());
    Pipe contentPipe = new Pipe("content pipe", fetchPipe.getContentTailPipe());
    contentPipe = TupleLogger.makePipe(contentPipe, true);

    // Create a parser that returns back the raw HTML (cleaned up by Tika) as the parsed content.
    SimpleParser parser = new SimpleParser(new ParserPolicy(), true);
    ParsePipe parsePipe = new ParsePipe(fetchPipe.getContentTailPipe(), parser);

    Pipe analyzerPipe = new Pipe("analyzer pipe");
    analyzerPipe = new Each(parsePipe.getTailPipe(), new AnalyzeHtml());

    // add a regex url filter to filter outlinks
    Pipe outlinksPipe = new Pipe("outlinks pipe", analyzerPipe);
    outlinksPipe = new Each(outlinksPipe, new CreateLinkDatumFromOutlinksFunction());
    if (crawlUrlFilter != null) {
        outlinksPipe = new Each(outlinksPipe, new UrlFilter(crawlUrlFilter));
    }

    Pipe resultsPipe = new Pipe("results pipe", analyzerPipe);
    resultsPipe = new Each(resultsPipe, new CreateResultsFunction());

    // Group the finished datums, the skipped datums, status, outlinks
    Pipe updatePipe = new CoGroup("update pipe",
            Pipe.pipes(finishedDatumsFromDb, statusPipe, analyzerPipe, outlinksPipe),
            Fields.fields(new Fields(CrawlDbDatum.URL_FIELD), new Fields(StatusDatum.URL_FN),
                    new Fields(AnalyzedDatum.URL_FIELD), new Fields(LinkDatum.URL_FN)),
            null, new OuterJoin());
    updatePipe = new Every(updatePipe, new UpdateCrawlDbBuffer(), Fields.RESULTS);

    // output : loop dir specific crawldb
    Path outCrawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
    Tap crawlDbSink = new Hfs(new SequenceFile(CrawlDbDatum.FIELDS), outCrawlDbPath.toString());
    // Status
    Path statusDirPath = new Path(curLoopDirPath, CrawlConfig.STATUS_SUBDIR_NAME);
    Tap statusSink = new Hfs(new TextLine(), statusDirPath.toString());
    // Content
    Path contentDirPath = new Path(curLoopDirPath, CrawlConfig.CONTENT_SUBDIR_NAME);
    Tap contentSink = new Hfs(new SequenceFile(FetchedDatum.FIELDS), contentDirPath.toString());
    // PageResults
    Path resultsDirPath = new Path(curLoopDirPath, CrawlConfig.RESULTS_SUBDIR_NAME);
    Tap resultsSink = new Hfs(new TextLine(), resultsDirPath.toString());

    // Create the output map that connects each tail pipe to the appropriate sink.
    Map<String, Tap> sinkMap = new HashMap<String, Tap>();
    sinkMap.put(updatePipe.getName(), crawlDbSink);
    sinkMap.put(statusPipe.getName(), statusSink);
    sinkMap.put(contentPipe.getName(), contentSink);
    sinkMap.put(resultsPipe.getName(), resultsSink);

    FlowConnector flowConnector = new FlowConnector(props);
    Flow flow = flowConnector.connect(inputSource, sinkMap, updatePipe, statusPipe, contentPipe, resultsPipe);

    return flow;
}
From source file:com.finderbots.miner.RegexUrlFilter.java
License:Apache License
public static List<String> getUrlFilterPatterns(String urlFiltersFile)
        throws IOException, InterruptedException {
    // this reads regex filters from a file in HDFS or the native file system
    JobConf conf = HadoopUtils.getDefaultJobConf();
    Path filterFile = new Path(urlFiltersFile);
    FileSystem fs = filterFile.getFileSystem(conf);
    List<String> filterList = new ArrayList<String>();

    if (fs.exists(filterFile)) {
        FSDataInputStream in = fs.open(filterFile);
        LineReader reader = new LineReader(in);
        Text tLine = new Text();

        while (reader.readLine(tLine) > 0) {
            String line = tLine.toString();
            if (StringUtils.isNotBlank(line)
                    && (line.startsWith(INCLUDE_CHAR) || line.startsWith(EXCLUDE_CHAR))) {
                filterList.add(line.trim());
            }
        }
        in.close();
    }

    return filterList;
}
From source file:com.finderbots.miner2.pinterest.PinterestCrawlAndMinerTool.java
License:Apache License
public static void main(String[] args) {
    Options options = new Options();
    CmdLineParser parser = new CmdLineParser(options);

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        printUsageAndExit(parser);
    }

    // Before we get too far along, see if the domain looks valid.
    String domain = options.getDomain();
    String urlsFile = options.getUrlsFile();
    if (domain != null) {
        validateDomain(domain, parser);
    } else {
        if (urlsFile == null) {
            System.err.println(
                    "Either a target domain should be specified or a file with a list of urls needs to be provided");
            printUsageAndExit(parser);
        }
    }

    if (domain != null && urlsFile != null) {
        System.out.println("Warning: Both domain and urls file list provided - using domain");
    }

    String outputDirName = options.getOutputDir();
    if (options.isDebugLogging()) {
        System.setProperty("bixo.root.level", "DEBUG");
    } else {
        System.setProperty("bixo.root.level", "INFO");
    }

    if (options.getLoggingAppender() != null) {
        // Set console vs. DRFA vs. something else
        System.setProperty("bixo.appender", options.getLoggingAppender());
    }

    String logsDir = options.getLogsDir();
    if (!logsDir.endsWith("/")) {
        logsDir = logsDir + "/";
    }

    try {
        JobConf conf = new JobConf();
        Path outputPath = new Path(outputDirName);
        FileSystem fs = outputPath.getFileSystem(conf);

        // First check if the user wants to clean
        if (options.isCleanOutputDir()) {
            if (fs.exists(outputPath)) {
                fs.delete(outputPath, true);
            }
        }

        // See if the user isn't starting from scratch then set up the
        // output directory and create an initial urls subdir.
        if (!fs.exists(outputPath)) {
            fs.mkdirs(outputPath);

            // Create a "0-<timestamp>" sub-directory with just a /crawldb subdir
            // In the /crawldb dir the input file will have a single URL for the target domain.
            Path curLoopDir = CrawlDirUtils.makeLoopDir(fs, outputPath, 0);
            String curLoopDirName = curLoopDir.getName();
            setLoopLoggerFile(logsDir + curLoopDirName, 0);
            Path crawlDbPath = new Path(curLoopDir, CrawlConfig.CRAWLDB_SUBDIR_NAME);

            if (domain != null) {
                importOneDomain(domain, crawlDbPath, conf);
            } else {
                importUrls(urlsFile, crawlDbPath);
            }
        }

        Path latestDirPath = CrawlDirUtils.findLatestLoopDir(fs, outputPath);

        if (latestDirPath == null) {
            System.err.println("No previous cycle output dirs exist in " + outputDirName);
            printUsageAndExit(parser);
        }

        Path crawlDbPath = new Path(latestDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

        // Set up the start and end loop counts.
        int startLoop = CrawlDirUtils.extractLoopNumber(latestDirPath);
        int endLoop = startLoop + options.getNumLoops();

        // Set up the UserAgent for the fetcher.
        UserAgent userAgent = new UserAgent(options.getAgentName(), CrawlConfig.EMAIL_ADDRESS,
                CrawlConfig.WEB_ADDRESS);

        // You also get to customize the FetcherPolicy
        FetcherPolicy defaultPolicy;
        if (options.getCrawlDuration() != 0) {
            defaultPolicy = new AdaptiveFetcherPolicy(options.getEndCrawlTime(), options.getCrawlDelay());
        } else {
            defaultPolicy = new FetcherPolicy();
        }
        defaultPolicy.setMaxContentSize(CrawlConfig.MAX_CONTENT_SIZE);
        defaultPolicy.setRequestTimeout(10L * 1000L); // 10 seconds

        // COMPLETE for crawling a single site, EFFICIENT for many sites
        if (options.getCrawlPolicy().equals(Options.IMPOLITE_CRAWL_POLICY)) {
            defaultPolicy.setFetcherMode(FetcherPolicy.FetcherMode.IMPOLITE);
        } else if (options.getCrawlPolicy().equals(Options.EFFICIENT_CRAWL_POLICY)) {
            defaultPolicy.setFetcherMode(FetcherPolicy.FetcherMode.EFFICIENT);
        } else if (options.getCrawlPolicy().equals(Options.COMPLETE_CRAWL_POLICY)) {
            defaultPolicy.setFetcherMode(FetcherPolicy.FetcherMode.COMPLETE);
        }

        // It is a good idea to set up a crawl duration when running long crawls as you may
        // end up in situations where the fetch slows down due to a 'long tail' and by
        // specifying a crawl duration you know exactly when the crawl will end.
        int crawlDurationInMinutes = options.getCrawlDuration();
        boolean hasEndTime = crawlDurationInMinutes != Options.NO_CRAWL_DURATION;
        long targetEndTime = hasEndTime
                ? System.currentTimeMillis() + (crawlDurationInMinutes * CrawlConfig.MILLISECONDS_PER_MINUTE)
                : FetcherPolicy.NO_CRAWL_END_TIME;

        // By setting up a url filter we only deal with urls that we want to
        // instead of all the urls that we extract.
        BaseUrlFilter urlFilter = null;
        List<String> patterns = null;
        String regexUrlFiltersFile = options.getRegexUrlFiltersFile();
        if (regexUrlFiltersFile != null) {
            patterns = RegexUrlDatumFilter.getUrlFilterPatterns(regexUrlFiltersFile);
        } else {
            patterns = RegexUrlDatumFilter.getDefaultUrlFilterPatterns();
            if (domain != null) {
                String domainPatterStr = "+(?i)^(http|https)://([a-z0-9]*\\.)*" + domain;
                patterns.add(domainPatterStr);
            } else {
                String protocolPatterStr = "+(?i)^(http|https)://*";
                patterns.add(protocolPatterStr);
                // Log.warn("Defaulting to basic url regex filtering (just suffix and protocol");
            }
        }
        urlFilter = new RegexUrlDatumFilter(patterns.toArray(new String[patterns.size()]));

        // get a list of patterns which tell the miner which URLs to include or exclude.
        patterns.clear();
        RegexUrlStringFilter urlsToMineFilter = null;
        String regexUrlsToMineFiltersFile = options.getRegexUrlToMineFile();
        AnalyzeHtml analyzer = null;
        if (regexUrlsToMineFiltersFile != null) {
            patterns = RegexUrlDatumFilter.getUrlFilterPatterns(regexUrlsToMineFiltersFile);
            urlsToMineFilter = new RegexUrlStringFilter(patterns.toArray(new String[patterns.size()]));
            analyzer = new AnalyzeHtml(urlsToMineFilter);
        }

        // OK, now we're ready to start looping, since we've got our current settings
        for (int curLoop = startLoop + 1; curLoop <= endLoop; curLoop++) {

            // Adjust target end time, if appropriate.
            if (hasEndTime) {
                int remainingLoops = (endLoop - curLoop) + 1;
                long now = System.currentTimeMillis();
                long perLoopTime = (targetEndTime - now) / remainingLoops;
                defaultPolicy.setCrawlEndTime(now + perLoopTime);
            }

            Path curLoopDirPath = CrawlDirUtils.makeLoopDir(fs, outputPath, curLoop);
            String curLoopDirName = curLoopDirPath.getName();
            setLoopLoggerFile(logsDir + curLoopDirName, curLoop);

            Flow flow = PinterestCrawlAndMinerWorkflow.createFlow(curLoopDirPath, crawlDbPath, defaultPolicy,
                    userAgent, urlFilter, analyzer, options);
            flow.complete();

            // Writing out .dot files is a good way to verify your flows.
            flow.writeDOT("valid-flow.dot");

            // Update crawlDbPath to point to the latest crawl db
            crawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
        }
    } catch (PlannerException e) {
        e.writeDOT("failed-flow.dot");
        System.err.println("PlannerException: " + e.getMessage());
        e.printStackTrace(System.err);
        System.exit(-1);
    } catch (Throwable t) {
        System.err.println("Exception running tool: " + t.getMessage());
        t.printStackTrace(System.err);
        System.exit(-1);
    }
}
From source file:com.finderbots.miner2.pinterest.PinterestCrawlAndMinerWorkflow.java
License:Apache License
public static Flow createFlow(Path curWorkingDirPath, Path crawlDbPath, FetcherPolicy fetcherPolicy,
        UserAgent userAgent, BaseUrlFilter urlFilter, AnalyzeHtml analyzer,
        PinterestCrawlAndMinerTool.Options options) throws Throwable {
    JobConf conf = HadoopUtils.getDefaultJobConf(CrawlConfig.CRAWL_STACKSIZE_KB);
    int numReducers = HadoopUtils.getNumReducers(conf);
    conf.setNumReduceTasks(numReducers);
    Properties props = HadoopUtils.getDefaultProperties(PinterestCrawlAndMinerWorkflow.class,
            options.isDebugLogging(), conf);
    FileSystem fs = curWorkingDirPath.getFileSystem(conf);

    // Input : the crawldb
    if (!fs.exists(crawlDbPath)) {
        throw new RuntimeException("CrawlDb doesn't exist at " + crawlDbPath);
    }

    // Our crawl db is defined by the CrawlDbDatum
    Tap inputSource = new Hfs(new SequenceFile(CrawlDbDatum.FIELDS), crawlDbPath.toString());
    Pipe importPipe = new Pipe("import pipe");

    // Split into tuples that are to be fetched and that have already been fetched
    SplitterAssembly splitter = new SplitterAssembly(importPipe, new SplitFetchedUnfetchedCrawlDatums());

    Pipe finishedDatumsFromDb = splitter.getRHSPipe();
    Pipe urlsToFetchPipe = new Pipe("urls to Fetch", splitter.getLHSPipe());

    // Convert the urlsToFetchPipe so that we now deal with UrlDatums.
    urlsToFetchPipe = new Each(urlsToFetchPipe, new CreateUrlDatumFromCrawlDbFunction());

    // A TupleLogger is a good way to follow the tuples around in a flow. You can enable the output
    // of tuples by setting options.setDebugLogging() to true.
    urlsToFetchPipe = TupleLogger.makePipe(urlsToFetchPipe, true);

    // Create the output sinks :
    //      crawldb
    //      content
    //      parse
    //      status
    Path outCrawlDbPath = new Path(curWorkingDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
    Tap loopCrawldbSink = new Hfs(new SequenceFile(CrawlDbDatum.FIELDS), outCrawlDbPath.toString());

    Path contentDirPath = new Path(curWorkingDirPath, CrawlConfig.CONTENT_SUBDIR_NAME);
    Tap contentSink = new Hfs(new SequenceFile(FetchedDatum.FIELDS), contentDirPath.toString());

    Path parseDirPath = new Path(curWorkingDirPath, CrawlConfig.PARSE_SUBDIR_NAME);
    Tap parseSink = new Hfs(new SequenceFile(ParsedDatum.FIELDS), parseDirPath.toString());

    Path statusDirPath = new Path(curWorkingDirPath, CrawlConfig.STATUS_SUBDIR_NAME);
    Tap statusSink = new Hfs(new TextLine(), statusDirPath.toString());

    // Create the sub-assembly that runs the fetch job
    SimpleHttpFetcher fetcher = new SimpleHttpFetcher(options.getMaxThreads(), fetcherPolicy, userAgent);
    fetcher.setMaxRetryCount(CrawlConfig.MAX_RETRIES);        // set to two tries
    fetcher.setSocketTimeout(CrawlConfig.SOCKET_TIMEOUT);     // and 10 sec timeout
    fetcher.setConnectionTimeout(CrawlConfig.CONNECTION_TIMEOUT);

    // You can also provide a set of mime types you want to restrict what content type you
    // want to deal with - for now keep it simple.
    Set<String> validMimeTypes = new HashSet<String>();
    validMimeTypes.add("text/plain");
    validMimeTypes.add("text/html");
    fetcherPolicy.setValidMimeTypes(validMimeTypes);

    // The scorer is used by the FetchPipe to assign a score to every URL that passes the
    // robots.txt processing. The score is used to sort URLs such that higher scoring URLs
    // are fetched first. If URLs are skipped for any reason(s) lower scoring URLs are skipped.
    BaseScoreGenerator scorer = new FixedScoreGenerator();

    FetchPipe fetchPipe = new FetchPipe(urlsToFetchPipe, scorer, fetcher, numReducers);
    Pipe statusPipe = new Pipe("status pipe", fetchPipe.getStatusTailPipe());
    Pipe contentPipe = new Pipe("content pipe", fetchPipe.getContentTailPipe());
    contentPipe = TupleLogger.makePipe(contentPipe, true);

    // Take content and split it into content output plus parse to extract URLs.
    // BEWARE: The SimpleParser will discard HTML unless you pass in true as last arg! So for mining
    // always pass in true!!!
    SimpleParser parser;
    if (options.isUseBoilerpipe()) {
        parser = new SimpleParser(new BoilerpipeContentExtractor(), new SimpleLinkExtractor(), new ParserPolicy());
    } else if (options.isGenerateHTML()) {
        parser = new SimpleParser(new HtmlContentExtractor(), new SimpleLinkExtractor(), new ParserPolicy(), true);
    } else if (options.isEnableMiner()) {
        parser = new SimpleParser(new HtmlContentExtractor(), new SimpleLinkExtractor(), new ParserPolicy(), true);
    } else {
        parser = new SimpleParser();
    }
    parser.setExtractLanguage(false);
    ParsePipe parsePipe = new ParsePipe(contentPipe, parser);

    Tap writableSeqFileSink = null;
    Pipe writableSeqFileDataPipe = null;

    // Create the output map that connects each tail pipe to the appropriate sink, and the
    // list of tail pipes.
    Map<String, Tap> sinkMap = new HashMap<String, Tap>();
    List<Pipe> tailPipes = new ArrayList<Pipe>();

    if (options.isGenerateHTML()) {
        // Let's write out the parse as text:
        Pipe textParsePipe = new Pipe("text parse data", parsePipe.getTailPipe());
        textParsePipe = new Each(textParsePipe, new Fields(ParsedDatum.PARSED_TEXT_FN),
                new RegexReplace(new Fields(ParsedDatum.PARSED_TEXT_FN), "[\\r\\n\\t]+", " ", true),
                Fields.REPLACE);
        textParsePipe = new Each(textParsePipe, new Fields(ParsedDatum.URL_FN, ParsedDatum.PARSED_TEXT_FN),
                new Identity());
        Path textParsePath = new Path(curWorkingDirPath, CrawlConfig.HTML_SUBDIR_NAME);
        Tap textParseTap = new Hfs(new TextLine(), textParsePath.toString(), true);
        sinkMap.put(textParsePipe.getName(), textParseTap);
        tailPipes.add(textParsePipe);
    }

    if (options.isEnableMiner()) {
        Pipe analyzerPipe = new Pipe("analyzer pipe", parsePipe.getTailPipe());
        analyzerPipe = new Each(analyzerPipe, analyzer);

        Pipe resultsPipe = new Pipe("results pipe", analyzerPipe);
        resultsPipe = new Each(resultsPipe, new CreateBooleanPreferenceFunction());

        Path minerOutputPath = new Path(curWorkingDirPath, CrawlConfig.MINER_SUBDIR_NAME);
        Tap minerOutputTap = new Hfs(new TextLine(), minerOutputPath.toString(), true);
        sinkMap.put(resultsPipe.getName(), minerOutputTap);
        tailPipes.add(resultsPipe);
    }

    // Let's output a WritableSequenceFile as an example - this file can
    // then be used as input when working with Mahout.
    writableSeqFileDataPipe = new Pipe("writable seqfile data",
            new Each(parsePipe.getTailPipe(), new CreateWritableSeqFileData()));

    Path writableSeqFileDataPath = new Path(curWorkingDirPath, CrawlConfig.EXTRACTED_TEXT_SUBDIR_NAME);
    writableSeqFileSink = new Hfs(new WritableSequenceFile(
            new Fields(CrawlConfig.WRITABLE_SEQ_FILE_KEY_FN, CrawlConfig.WRITABLE_SEQ_FILE_VALUE_FN),
            Text.class, Text.class), writableSeqFileDataPath.toString());

    Pipe urlFromOutlinksPipe = new Pipe("url from outlinks", parsePipe.getTailPipe());
    urlFromOutlinksPipe = new Each(urlFromOutlinksPipe,
            new CreateUrlDatumFromOutlinksFunction(new SimpleUrlNormalizer(), new SimpleUrlValidator()));
    if (urlFilter != null) {
        urlFromOutlinksPipe = new Each(urlFromOutlinksPipe, new UrlFilter(urlFilter));
    }

    urlFromOutlinksPipe = new Each(urlFromOutlinksPipe, new NormalizeUrlFunction(new SimpleUrlNormalizer()));
    urlFromOutlinksPipe = TupleLogger.makePipe(urlFromOutlinksPipe, true);

    // Take status and output urls from it
    Pipe urlFromFetchPipe = new Pipe("url from fetch", statusPipe);
    urlFromFetchPipe = new Each(urlFromFetchPipe, new CreateUrlDatumFromStatusFunction());
    urlFromFetchPipe = TupleLogger.makePipe(urlFromFetchPipe, true);

    // Finally join the URLs we get from parsing content with the URLs we got
    // from the status output, and the urls we didn't process from the db so that
    // we have a unified stream of all known URLs for the crawldb.
    Pipe finishedUrlsFromDbPipe = new Each(finishedDatumsFromDb, new CreateUrlDatumFromCrawlDbFunction());
    finishedUrlsFromDbPipe = TupleLogger.makePipe(finishedUrlsFromDbPipe, true);

    // NOTE : Ideally you would just do a CoGroup instead of converting all the pipes to emit UrlDatums
    // and then doing the extra step of converting from UrlDatum to CrawlDbDatum.
    // The reason this isn't being done here is because we are sharing LatestUrlDatumBuffer() with JDBCCrawlTool
    Pipe crawlDbPipe = new GroupBy("crawldb pipe",
            Pipe.pipes(urlFromFetchPipe, urlFromOutlinksPipe, finishedUrlsFromDbPipe),
            new Fields(UrlDatum.URL_FN));
    crawlDbPipe = new Every(crawlDbPipe, new LatestUrlDatumBuffer(), Fields.RESULTS);

    Pipe outputPipe = new Pipe("output pipe");
    outputPipe = new Each(crawlDbPipe, new CreateCrawlDbDatumFromUrlFunction());

    // Create the output map that connects each tail pipe to the appropriate sink.
    sinkMap.put(statusPipe.getName(), statusSink);
    tailPipes.add(statusPipe);

    sinkMap.put(contentPipe.getName(), contentSink);
    tailPipes.add(contentPipe);

    sinkMap.put(parsePipe.getTailPipe().getName(), parseSink);
    tailPipes.add(parsePipe.getTailPipe());

    sinkMap.put(outputPipe.getName(), loopCrawldbSink);
    tailPipes.add(outputPipe);

    sinkMap.put(writableSeqFileDataPipe.getName(), writableSeqFileSink);
    tailPipes.add(writableSeqFileDataPipe);

    FlowConnector flowConnector = new FlowConnector(props);
    Flow flow = flowConnector.connect(inputSource, sinkMap, tailPipes.toArray(new Pipe[tailPipes.size()]));

    return flow;
}
From source file:com.finderbots.miner2.tomatoes.RTCriticsCrawlAndMinerTool.java
License:Apache License
public static void main(String[] args) {
    Options options = new Options();
    CmdLineParser parser = new CmdLineParser(options);

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        printUsageAndExit(parser);
    }

    // Before we get too far along, see if the domain looks valid.
    String domain = options.getDomain();
    String urlsFile = options.getUrlsFile();
    if (domain != null) {
        validateDomain(domain, parser);
    } else {
        if (urlsFile == null) {
            System.err.println(
                    "Either a target domain should be specified or a file with a list of urls needs to be provided");
            printUsageAndExit(parser);
        }
    }

    if (domain != null && urlsFile != null) {
        System.out.println("Warning: Both domain and urls file list provided - using domain");
    }

    String outputDirName = options.getOutputDir();
    if (options.isDebugLogging()) {
        System.setProperty("bixo.root.level", "DEBUG");
    } else {
        System.setProperty("bixo.root.level", "INFO");
    }

    if (options.getLoggingAppender() != null) {
        // Set console vs. DRFA vs. something else
        System.setProperty("bixo.appender", options.getLoggingAppender());
    }

    String logsDir = options.getLogsDir();
    if (!logsDir.endsWith("/")) {
        logsDir = logsDir + "/";
    }

    try {
        JobConf conf = new JobConf();
        Path outputPath = new Path(outputDirName);
        FileSystem fs = outputPath.getFileSystem(conf);

        // First check if the user wants to clean
        if (options.isCleanOutputDir()) {
            if (fs.exists(outputPath)) {
                fs.delete(outputPath, true);
            }
        }

        // See if the user isn't starting from scratch then set up the
        // output directory and create an initial urls subdir.
        if (!fs.exists(outputPath)) {
            fs.mkdirs(outputPath);

            // Create a "0-<timestamp>" sub-directory with just a /crawldb subdir
            // In the /crawldb dir the input file will have a single URL for the target domain.
            Path curLoopDir = CrawlDirUtils.makeLoopDir(fs, outputPath, 0);
            String curLoopDirName = curLoopDir.getName();
            setLoopLoggerFile(logsDir + curLoopDirName, 0);
            Path crawlDbPath = new Path(curLoopDir, CrawlConfig.CRAWLDB_SUBDIR_NAME);

            if (domain != null) {
                importOneDomain(domain, crawlDbPath, conf);
            } else {
                importUrls(urlsFile, crawlDbPath);
            }
        }

        Path latestDirPath = CrawlDirUtils.findLatestLoopDir(fs, outputPath);

        if (latestDirPath == null) {
            System.err.println("No previous cycle output dirs exist in " + outputDirName);
            printUsageAndExit(parser);
        }

        Path crawlDbPath = new Path(latestDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

        // Set up the start and end loop counts.
        int startLoop = CrawlDirUtils.extractLoopNumber(latestDirPath);
        int endLoop = startLoop + options.getNumLoops();

        // Set up the UserAgent for the fetcher.
        UserAgent userAgent = new UserAgent(options.getAgentName(), CrawlConfig.EMAIL_ADDRESS,
                CrawlConfig.WEB_ADDRESS);

        // You also get to customize the FetcherPolicy
        FetcherPolicy defaultPolicy;
        if (options.getCrawlDuration() != 0) {
            defaultPolicy = new AdaptiveFetcherPolicy(options.getEndCrawlTime(), options.getCrawlDelay());
        } else {
            defaultPolicy = new FetcherPolicy();
        }
        defaultPolicy.setMaxContentSize(CrawlConfig.MAX_CONTENT_SIZE);
        defaultPolicy.setRequestTimeout(10L * 1000L); // 10 seconds

        // COMPLETE for crawling a single site, EFFICIENT for many sites
        if (options.getCrawlPolicy().equals(Options.IMPOLITE_CRAWL_POLICY)) {
            defaultPolicy.setFetcherMode(FetcherPolicy.FetcherMode.IMPOLITE);
        } else if (options.getCrawlPolicy().equals(Options.EFFICIENT_CRAWL_POLICY)) {
            defaultPolicy.setFetcherMode(FetcherPolicy.FetcherMode.EFFICIENT);
        } else if (options.getCrawlPolicy().equals(Options.COMPLETE_CRAWL_POLICY)) {
            defaultPolicy.setFetcherMode(FetcherPolicy.FetcherMode.COMPLETE);
        }

        // It is a good idea to set up a crawl duration when running long crawls as you may
        // end up in situations where the fetch slows down due to a 'long tail' and by
        // specifying a crawl duration you know exactly when the crawl will end.
        int crawlDurationInMinutes = options.getCrawlDuration();
        boolean hasEndTime = crawlDurationInMinutes != Options.NO_CRAWL_DURATION;
        long targetEndTime = hasEndTime
                ? System.currentTimeMillis() + (crawlDurationInMinutes * CrawlConfig.MILLISECONDS_PER_MINUTE)
                : FetcherPolicy.NO_CRAWL_END_TIME;

        // By setting up a url filter we only deal with urls that we want to
        // instead of all the urls that we extract.
        BaseUrlFilter urlFilter = null;
        List<String> patterns = null;
        String regexUrlFiltersFile = options.getRegexUrlFiltersFile();
        if (regexUrlFiltersFile != null) {
            patterns = RegexUrlDatumFilter.getUrlFilterPatterns(regexUrlFiltersFile);
        } else {
            patterns = RegexUrlDatumFilter.getDefaultUrlFilterPatterns();
            if (domain != null) {
                String domainPatterStr = "+(?i)^(http|https)://([a-z0-9]*\\.)*" + domain;
                patterns.add(domainPatterStr);
            } else {
                String protocolPatterStr = "+(?i)^(http|https)://*";
                patterns.add(protocolPatterStr);
                // Log.warn("Defaulting to basic url regex filtering (just suffix and protocol");
            }
        }
        urlFilter = new RegexUrlDatumFilter(patterns.toArray(new String[patterns.size()]));

        // get a list of patterns which tell the miner which URLs to include or exclude.
        patterns.clear();
        RegexUrlStringFilter urlsToMineFilter = null;
        String regexUrlsToMineFiltersFile = options.getRegexUrlToMineFile();
        MineRTCriticsPreferences prefsAnalyzer = null;
        if (regexUrlsToMineFiltersFile != null) {
            patterns = RegexUrlDatumFilter.getUrlFilterPatterns(regexUrlsToMineFiltersFile);
            urlsToMineFilter = new RegexUrlStringFilter(patterns.toArray(new String[patterns.size()]));
            prefsAnalyzer = new MineRTCriticsPreferences(urlsToMineFilter);
        }

        // OK, now we're ready to start looping, since we've got our current settings
        for (int curLoop = startLoop + 1; curLoop <= endLoop; curLoop++) {

            // Adjust target end time, if appropriate.
            if (hasEndTime) {
                int remainingLoops = (endLoop - curLoop) + 1;
                long now = System.currentTimeMillis();
                long perLoopTime = (targetEndTime - now) / remainingLoops;
                defaultPolicy.setCrawlEndTime(now + perLoopTime);
            }

            Path curLoopDirPath = CrawlDirUtils.makeLoopDir(fs, outputPath, curLoop);
            String curLoopDirName = curLoopDirPath.getName();
            setLoopLoggerFile(logsDir + curLoopDirName, curLoop);

            Flow flow = RTCriticsCrawlAndMinerWorkflow.createFlow(curLoopDirPath, crawlDbPath, defaultPolicy,
                    userAgent, urlFilter, prefsAnalyzer, options);
            flow.complete();

            // Writing out .dot files is a good way to verify your flows.
            flow.writeDOT("valid-flow.dot");

            // Update crawlDbPath to point to the latest crawl db
            crawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
        }
    } catch (PlannerException e) {
        e.writeDOT("failed-flow.dot");
        System.err.println("PlannerException: " + e.getMessage());
        e.printStackTrace(System.err);
        System.exit(-1);
    } catch (Throwable t) {
        System.err.println("Exception running tool: " + t.getMessage());
        t.printStackTrace(System.err);
        System.exit(-1);
    }
}
From source file:com.finderbots.miner2.tomatoes.RTCriticsCrawlAndMinerWorkflow.java
License:Apache License
public static Flow createFlow(Path curWorkingDirPath, Path crawlDbPath, FetcherPolicy fetcherPolicy,
        UserAgent userAgent, BaseUrlFilter urlFilter, MineRTCriticsPreferences prefsAnalyzer,
        RTCriticsCrawlAndMinerTool.Options options) throws Throwable {
    JobConf conf = HadoopUtils.getDefaultJobConf(CrawlConfig.CRAWL_STACKSIZE_KB);
    int numReducers = HadoopUtils.getNumReducers(conf);
    conf.setNumReduceTasks(numReducers);
    Properties props = HadoopUtils.getDefaultProperties(RTCriticsCrawlAndMinerWorkflow.class,
            options.isDebugLogging(), conf);
    FileSystem fs = curWorkingDirPath.getFileSystem(conf);

    // Input : the crawldb
    if (!fs.exists(crawlDbPath)) {
        throw new RuntimeException("CrawlDb doesn't exist at " + crawlDbPath);
    }

    // Our crawl db is defined by the CrawlDbDatum
    Tap inputSource = new Hfs(new SequenceFile(CrawlDbDatum.FIELDS), crawlDbPath.toString());
    Pipe importPipe = new Pipe("import pipe");

    // Split into tuples that are to be fetched and that have already been fetched
    SplitterAssembly splitter = new SplitterAssembly(importPipe, new SplitFetchedUnfetchedCrawlDatums());

    Pipe finishedDatumsFromDb = splitter.getRHSPipe();
    Pipe urlsToFetchPipe = new Pipe("urls to Fetch", splitter.getLHSPipe());

    // Convert the urlsToFetchPipe so that we now deal with UrlDatums.
    urlsToFetchPipe = new Each(urlsToFetchPipe, new CreateUrlDatumFromCrawlDbFunction());

    // A TupleLogger is a good way to follow the tuples around in a flow. You can enable the output
    // of tuples by setting options.setDebugLogging() to true.
    urlsToFetchPipe = TupleLogger.makePipe(urlsToFetchPipe, true);

    // Create the output sinks :
    //      crawldb
    //      content
    //      parse
    //      status
    Path outCrawlDbPath = new Path(curWorkingDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
    Tap loopCrawldbSink = new Hfs(new SequenceFile(CrawlDbDatum.FIELDS), outCrawlDbPath.toString());

    Path contentDirPath = new Path(curWorkingDirPath, CrawlConfig.CONTENT_SUBDIR_NAME);
    Tap contentSink = new Hfs(new SequenceFile(FetchedDatum.FIELDS), contentDirPath.toString());

    Path parseDirPath = new Path(curWorkingDirPath, CrawlConfig.PARSE_SUBDIR_NAME);
    Tap parseSink = new Hfs(new SequenceFile(ParsedDatum.FIELDS), parseDirPath.toString());

    Path statusDirPath = new Path(curWorkingDirPath, CrawlConfig.STATUS_SUBDIR_NAME);
    Tap statusSink = new Hfs(new TextLine(), statusDirPath.toString());

    // Create the sub-assembly that runs the fetch job
    SimpleHttpFetcher fetcher = new SimpleHttpFetcher(options.getMaxThreads(), fetcherPolicy, userAgent);
    fetcher.setMaxRetryCount(CrawlConfig.MAX_RETRIES);        // set to two tries
    fetcher.setSocketTimeout(CrawlConfig.SOCKET_TIMEOUT);     // and 10 sec timeout
    fetcher.setConnectionTimeout(CrawlConfig.CONNECTION_TIMEOUT);

    // You can also provide a set of mime types you want to restrict what content type you
    // want to deal with - for now keep it simple.
    Set<String> validMimeTypes = new HashSet<String>();
    validMimeTypes.add("text/plain");
    validMimeTypes.add("text/html");
    fetcherPolicy.setValidMimeTypes(validMimeTypes);

    // The scorer is used by the FetchPipe to assign a score to every URL that passes the
    // robots.txt processing. The score is used to sort URLs such that higher scoring URLs
    // are fetched first. If URLs are skipped for any reason(s) lower scoring URLs are skipped.
    BaseScoreGenerator scorer = new FixedScoreGenerator();

    FetchPipe fetchPipe = new FetchPipe(urlsToFetchPipe, scorer, fetcher, numReducers);
    Pipe statusPipe = new Pipe("status pipe", fetchPipe.getStatusTailPipe());
    Pipe contentPipe = new Pipe("content pipe", fetchPipe.getContentTailPipe());
    contentPipe = TupleLogger.makePipe(contentPipe, true);

    // Take content and split it into content output plus parse to extract URLs.
    // BEWARE: The SimpleParser will discard HTML unless you pass in true as last arg! So for mining
    // always pass in true!!!
    SimpleParser parser;
    if (options.isUseBoilerpipe()) {
        parser = new SimpleParser(new BoilerpipeContentExtractor(), new SimpleLinkExtractor(), new ParserPolicy());
    } else if (options.isGenerateHTML()) {
        parser = new SimpleParser(new HtmlContentExtractor(), new SimpleLinkExtractor(), new ParserPolicy(), true);
    } else if (options.isEnableMiner()) {
        parser = new SimpleParser(new HtmlContentExtractor(), new SimpleLinkExtractor(), new ParserPolicy(), true);
    } else {
        parser = new SimpleParser();
    }
    parser.setExtractLanguage(false);
    ParsePipe parsePipe = new ParsePipe(contentPipe, parser);

    Tap writableSeqFileSink = null;
    Pipe writableSeqFileDataPipe = null;

    // Create the output map that connects each tail pipe to the appropriate sink, and the
    // list of tail pipes.
    Map<String, Tap> sinkMap = new HashMap<String, Tap>();
    List<Pipe> tailPipes = new ArrayList<Pipe>();

    if (options.isGenerateHTML()) {
        // Let's write out the parse as text:
        Pipe textParsePipe = new Pipe("text parse data", parsePipe.getTailPipe());
        textParsePipe = new Each(textParsePipe, new Fields(ParsedDatum.PARSED_TEXT_FN),
                new RegexReplace(new Fields(ParsedDatum.PARSED_TEXT_FN), "[\\r\\n\\t]+", " ", true),
                Fields.REPLACE);
        textParsePipe = new Each(textParsePipe, new Fields(ParsedDatum.URL_FN, ParsedDatum.PARSED_TEXT_FN),
                new Identity());
        Path textParsePath = new Path(curWorkingDirPath, CrawlConfig.HTML_SUBDIR_NAME);
        Tap textParseTap = new Hfs(new TextLine(), textParsePath.toString(), true);
        sinkMap.put(textParsePipe.getName(), textParseTap);
        tailPipes.add(textParsePipe);
    }

    if (options.isEnableMiner()) { // all the miner assembly happens here
        // analyze all pages that are to be mined, create an RTPageDatum
        // that will have data for /m/ page OR /critic/ page but not both
        // todo: in a perfect world there would be two datum types and we would
        // split them before analysis but it's nice to have all analysis in a single
        // function--maybe?
        Pipe prefsAnalyzerPipe = new Pipe("RT critics analyzer pipe", parsePipe.getTailPipe());
        prefsAnalyzerPipe = new Each(prefsAnalyzerPipe, prefsAnalyzer);

        // take all RTPageDatum, create a text line TSV then write to an output Tap
        Pipe prefsPipe = new Pipe("prefs pipe", prefsAnalyzerPipe);
        prefsPipe = new Each(prefsPipe, new CreateRTCriticsPrefsFunction());
        // todo: should we run through Unique?
        Path outPrefsPath = new Path(curWorkingDirPath, "prefs");
        Tap outPrefsTap = new Hfs(new TextLine(), outPrefsPath.toString(), true);
        sinkMap.put(prefsPipe.getName(), outPrefsTap);
        tailPipes.add(prefsPipe);

        // take all RTPageDatum, filter out all but /m/ pages
        // make sure they are unique, create a TSV line per datum,
        // write to an output Tap
        Pipe filterMedia = new Pipe("filter_out_all_but_media_datum", prefsAnalyzerPipe);
        filterMedia = new Each(filterMedia, new FilterMediaDatumFunction());
        Pipe mediaPipe = new Pipe("create_media_records", filterMedia);
        mediaPipe = new Each(mediaPipe, new CreateRTMediaRecordsFunction());
        Pipe uniqueMedia = new Unique("uniquify_media_records", mediaPipe, new Fields("line"));

        Path outMediaPath = new Path(curWorkingDirPath, "media");
        Tap outMediaTap = new Hfs(new TextLine(), outMediaPath.toString(), true);
        sinkMap.put(uniqueMedia.getName(), outMediaTap);
        tailPipes.add(uniqueMedia);
    }

    // Let's output a WritableSequenceFile as an example - this file can
    // then be used as input when working with Mahout.
    writableSeqFileDataPipe = new Pipe("writable seqfile data",
            new Each(parsePipe.getTailPipe(), new CreateWritableSeqFileData()));

    Path writableSeqFileDataPath = new Path(curWorkingDirPath, CrawlConfig.EXTRACTED_TEXT_SUBDIR_NAME);
    writableSeqFileSink = new Hfs(new WritableSequenceFile(
            new Fields(CrawlConfig.WRITABLE_SEQ_FILE_KEY_FN, CrawlConfig.WRITABLE_SEQ_FILE_VALUE_FN),
            Text.class, Text.class), writableSeqFileDataPath.toString());

    Pipe urlFromOutlinksPipe = new Pipe("url from outlinks", parsePipe.getTailPipe());
    urlFromOutlinksPipe = new Each(urlFromOutlinksPipe,
            new CreateUrlDatumFromOutlinksFunction(new SimpleUrlNormalizer(), new SimpleUrlValidator()));
    if (urlFilter != null) {
        urlFromOutlinksPipe = new Each(urlFromOutlinksPipe, new UrlFilter(urlFilter));
    }

    urlFromOutlinksPipe = new Each(urlFromOutlinksPipe, new NormalizeUrlFunction(new SimpleUrlNormalizer()));
    urlFromOutlinksPipe = TupleLogger.makePipe(urlFromOutlinksPipe, true);

    // Take status and output urls from it
    Pipe urlFromFetchPipe = new Pipe("url from fetch", statusPipe);
    urlFromFetchPipe = new Each(urlFromFetchPipe, new CreateUrlDatumFromStatusFunction());
    urlFromFetchPipe = TupleLogger.makePipe(urlFromFetchPipe, true);

    // Finally join the URLs we get from parsing content with the URLs we got
    // from the status output, and the urls we didn't process from the db so that
    // we have a unified stream of all known URLs for the crawldb.
    Pipe finishedUrlsFromDbPipe = new Each(finishedDatumsFromDb, new CreateUrlDatumFromCrawlDbFunction());
    finishedUrlsFromDbPipe = TupleLogger.makePipe(finishedUrlsFromDbPipe, true);

    // NOTE : Ideally you would just do a CoGroup instead of converting all the pipes to emit UrlDatums
    // and then doing the extra step of converting from UrlDatum to CrawlDbDatum.
    // The reason this isn't being done here is because we are sharing LatestUrlDatumBuffer() with JDBCCrawlTool
    Pipe crawlDbPipe = new GroupBy("crawldb pipe",
            Pipe.pipes(urlFromFetchPipe, urlFromOutlinksPipe, finishedUrlsFromDbPipe),
            new Fields(UrlDatum.URL_FN));
    crawlDbPipe = new Every(crawlDbPipe, new LatestUrlDatumBuffer(), Fields.RESULTS);

    Pipe outputPipe = new Pipe("output pipe");
    outputPipe = new Each(crawlDbPipe, new CreateCrawlDbDatumFromUrlFunction());

    // Create the output map that connects each tail pipe to the appropriate sink.
    sinkMap.put(statusPipe.getName(), statusSink);
    tailPipes.add(statusPipe);

    sinkMap.put(contentPipe.getName(), contentSink);
    tailPipes.add(contentPipe);

    sinkMap.put(parsePipe.getTailPipe().getName(), parseSink);
    tailPipes.add(parsePipe.getTailPipe());

    sinkMap.put(outputPipe.getName(), loopCrawldbSink);
    tailPipes.add(outputPipe);

    sinkMap.put(writableSeqFileDataPipe.getName(), writableSeqFileSink);
    tailPipes.add(writableSeqFileDataPipe);

    FlowConnector flowConnector = new FlowConnector(props);
    Flow flow = flowConnector.connect(inputSource, sinkMap, tailPipes.toArray(new Pipe[tailPipes.size()]));

    return flow;
}
From source file:com.firewallid.util.FIFile.java
public static boolean isExistsHDFSPath(String fullPath) throws IOException {
    Configuration hadoopConf = new Configuration();
    FileSystem fileSystem = FileSystem.get(hadoopConf);
    Path path = new Path(fullPath);

    return fileSystem.exists(path);
}