List of usage examples for org.apache.hadoop.fs.Path#getFileSystem(Configuration)
public FileSystem getFileSystem(Configuration conf) throws IOException
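All of the examples below follow the same basic pattern: build a Path, resolve the FileSystem that owns it from a Hadoop Configuration (or JobConf), and then use that handle for file operations. A minimal, self-contained sketch of just that pattern is shown here; the class name and the default path are illustrative assumptions, not taken from any of the projects below.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GetFileSystemExample {
    public static void main(String[] args) throws IOException {
        // The scheme of the Path (hdfs://, file://, s3a://, ...) decides which
        // FileSystem implementation getFileSystem() returns.
        Configuration conf = new Configuration();
        Path workingDir = new Path(args.length > 0 ? args[0] : "/tmp/working-dir"); // hypothetical path
        FileSystem fs = workingDir.getFileSystem(conf);

        System.out.println("Resolved filesystem: " + fs.getUri());
        System.out.println(workingDir + " exists? " + fs.exists(workingDir));
    }
}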
From source file: com.finderbots.miner.MinerTool.java
License: Apache License
public static void main(String[] args) throws IOException {
    MinerOptions options = new MinerOptions();
    CmdLineParser parser = new CmdLineParser(options);

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        printUsageAndExit(parser);
    }

    // Build and run the flow.
    try {
        Path workingDirPath = new Path(options.getWorkingDir());
        JobConf conf = new JobConf();
        FileSystem fs = workingDirPath.getFileSystem(conf);
        setupWorkingDir(fs, workingDirPath, options.getUrlsFile());

        Path latestDirPath = CrawlDirUtils.findLatestLoopDir(fs, workingDirPath);
        if (latestDirPath == null) {
            error("No previous cycle output dirs exist in " + workingDirPath, parser);
        }
        Path crawlDbPath = new Path(latestDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

        UserAgent userAgent = new UserAgent(options.getAgentName(), CrawlConfig.EMAIL_ADDRESS,
                CrawlConfig.WEB_ADDRESS);

        FetcherPolicy fetcherPolicy = new FetcherPolicy();
        fetcherPolicy.setCrawlDelay(CrawlConfig.DEFAULT_CRAWL_DELAY);
        fetcherPolicy.setMaxContentSize(CrawlConfig.MAX_CONTENT_SIZE);
        fetcherPolicy.setFetcherMode(FetcherMode.EFFICIENT);

        // We only care about mime types that the Tika HTML parser can handle,
        // so restrict it to the same.
        Set<String> validMimeTypes = new HashSet<String>();
        Set<MediaType> supportedTypes = new HtmlParser().getSupportedTypes(new ParseContext());
        for (MediaType supportedType : supportedTypes) {
            validMimeTypes.add(String.format("%s/%s", supportedType.getType(), supportedType.getSubtype()));
        }
        fetcherPolicy.setValidMimeTypes(validMimeTypes);

        // By setting up a url filter we only deal with urls that we want to,
        // instead of all the urls that we extract.
        String crawlUrlFiltersFile = options.getRegexUrlFiltersFile();
        List<String> crawlUrlPatterns = RegexUrlFilter.getUrlFilterPatterns(crawlUrlFiltersFile);
        BaseUrlFilter crawlUrlFilter = new RegexUrlFilter(
                crawlUrlPatterns.toArray(new String[crawlUrlPatterns.size()]));

        // By setting up a miner filter we will mine only pages that match one of the urls.
        String regexUrlFiltersFile = options.getRegexUrlFiltersFile();
        List<String> mineUrlPatterns = RegexUrlFilter.getUrlFilterPatterns(regexUrlFiltersFile);
        BaseUrlFilter mineUrlFilter = new RegexUrlFilter(
                mineUrlPatterns.toArray(new String[mineUrlPatterns.size()]));

        // Limit the crawl to the configured number of loops.
        for (int curLoop = 1; curLoop <= options.getNumLoops(); curLoop++) {
            Path curLoopDirPath = CrawlDirUtils.makeLoopDir(fs, workingDirPath, curLoop);

            Flow flow = MinerWorkflow.createWebMiningWorkflow(crawlDbPath, curLoopDirPath, fetcherPolicy,
                    userAgent, options, crawlUrlFilter, mineUrlFilter);
            flow.complete();

            // Update crawlDbPath to point to the latest crawl db.
            crawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
        }
    } catch (Exception e) {
        System.err.println("Exception running job: " + e.getMessage());
        e.printStackTrace(System.err);
        System.exit(-1);
    }
}
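setupWorkingDir and CrawlDirUtils are helpers specific to this project. As a rough, hedged sketch of the same idea using only the plain FileSystem API returned by getFileSystem, the working directory can be created if missing and the most recently modified loop directory located like this (the directory layout and the "newest by modification time" rule are assumptions, not the project's actual logic):

import java.io.IOException;

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;

// Hedged sketch: ensure the working dir exists and return the newest loop subdir, or null.
static Path findNewestLoopDir(Path workingDirPath) throws IOException {
    FileSystem fs = workingDirPath.getFileSystem(new JobConf());
    if (!fs.exists(workingDirPath)) {
        fs.mkdirs(workingDirPath);
        return null; // nothing to find in a fresh working dir
    }
    Path newest = null;
    long newestTime = Long.MIN_VALUE;
    for (FileStatus status : fs.listStatus(workingDirPath)) {
        if (status.isDir() && status.getModificationTime() > newestTime) {
            newestTime = status.getModificationTime();
            newest = status.getPath();
        }
    }
    return newest;
}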
From source file: com.finderbots.miner.MinerWorkflow.java
License: Apache License
public static void importSeedUrls(Path crawlDbPath, String fileName) throws IOException, InterruptedException {
    SimpleUrlNormalizer normalizer = new SimpleUrlNormalizer();
    JobConf defaultJobConf = HadoopUtils.getDefaultJobConf();

    InputStream is = null;
    TupleEntryCollector writer = null;
    try {
        Tap urlSink = new Hfs(new TextLine(), crawlDbPath.toString(), true);
        writer = urlSink.openForWrite(defaultJobConf);

        // TODO: this should be an HDFS file of seeds we iterate through.
        is = MinerWorkflow.class.getResourceAsStream(fileName);
        if (is == null) {
            throw new FileNotFoundException("The seed urls file doesn't exist");
        }

        // Read all the lines from the seed file, skipping comments.
        List<String> lines = IOUtils.readLines(is);
        for (String line : lines) {
            line = line.trim();
            if (line.startsWith("#")) {
                continue;
            }
            CrawlDbDatum datum = new CrawlDbDatum(normalizer.normalize(line), 0, UrlStatus.UNFETCHED, 0.0f, 0.0f);
            writer.add(datum.getTuple());
        }
    } catch (IOException e) {
        HadoopUtils.safeRemove(crawlDbPath.getFileSystem(defaultJobConf), crawlDbPath);
        throw e;
    } finally {
        IoUtils.safeClose(is);
        if (writer != null) {
            writer.close();
        }
    }
}
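The Cascading Tap/TupleEntryCollector above writes the seeds as CrawlDbDatum tuples. As a hedged alternative sketch that stays at the Hadoop level, the normalized seed lines could be written straight to HDFS with the FileSystem obtained from getFileSystem, with the same delete-on-failure cleanup; the part-file name is a hypothetical choice:

import java.io.BufferedWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.List;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;

// Hedged sketch: write seed URLs as plain text lines under the crawl db path.
static void writeSeedUrls(Path crawlDbPath, List<String> seedUrls) throws IOException {
    FileSystem fs = crawlDbPath.getFileSystem(new JobConf());
    Path seedFile = new Path(crawlDbPath, "part-00000"); // hypothetical file name
    BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(fs.create(seedFile, true), "UTF-8"));
    boolean ok = false;
    try {
        for (String url : seedUrls) {
            writer.write(url);
            writer.newLine();
        }
        ok = true;
    } finally {
        writer.close();
        if (!ok) {
            // Mirror the safeRemove() cleanup above: drop the partial output.
            fs.delete(crawlDbPath, true);
        }
    }
}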
From source file: com.finderbots.miner.MinerWorkflow.java
License: Apache License
public static Flow createWebMiningWorkflow(Path crawlDbPath, Path curLoopDirPath, FetcherPolicy fetcherPolicy,
        UserAgent userAgent, MinerOptions options, BaseUrlFilter crawlUrlFilter, BaseUrlFilter mineUrlFilter)
        throws IOException, InterruptedException {

    // Fetch at most 200 pages, max size of 128K, complete mode, from the current dir. HTML only.
    // We want to extract the cleaned up HTML, and pass that to the parser, which will
    // be specified via options.getAnalyzer. From this we'll get outlinks, page score, and any results.
    JobConf conf = HadoopUtils.getDefaultJobConf(CrawlConfig.CRAWL_STACKSIZE_KB);
    boolean isLocal = HadoopUtils.isJobLocal(conf);
    int numReducers = HadoopUtils.getNumReducers(conf);
    conf.setNumReduceTasks(numReducers);
    conf.setInt("mapred.min.split.size", 64 * 1024 * 1024);
    Properties props = HadoopUtils.getDefaultProperties(MinerWorkflow.class, false, conf);
    FileSystem fs = crawlDbPath.getFileSystem(conf);

    // Input : the crawldb
    if (!fs.exists(crawlDbPath)) {
        throw new RuntimeException("CrawlDb not found");
    }

    //Tap inputSource = new Hfs(new TextDelimited(CrawlDbDatum.FIELDS, "\t", CrawlDbDatum.TYPES), crawlDbPath.toString());
    Tap inputSource = new Hfs(new SequenceFile(CrawlDbDatum.FIELDS), crawlDbPath.toString(), true);
    Pipe importPipe = new Pipe("import pipe");

    // Split into tuples that are to be fetched and that have already been fetched.
    SplitterAssembly splitter = new SplitterAssembly(importPipe, new SplitFetchedUnfetchedSSCrawlDatums());

    Pipe finishedDatumsFromDb = new Pipe("finished datums from db", splitter.getRHSPipe());
    Pipe urlsToFetchPipe = splitter.getLHSPipe();

    // Limit to MAX_DISTRIBUTED_FETCH if running in a real cluster,
    // or MAX_LOCAL_FETCH if running locally. So first we sort the entries
    // from high to low by links score.
    // TODO add unit test
    urlsToFetchPipe = new GroupBy(urlsToFetchPipe, new Fields(CrawlDbDatum.LINKS_SCORE_FIELD), true);
    long maxToFetch = HadoopUtils.isJobLocal(conf) ? MAX_LOCAL_FETCH : MAX_DISTRIBUTED_FETCH;
    urlsToFetchPipe = new Each(urlsToFetchPipe, new CreateUrlDatumFromCrawlDbDatum(maxToFetch));

    // Create the sub-assembly that runs the fetch job.
    int maxThreads = isLocal ? CrawlConfig.DEFAULT_NUM_THREADS_LOCAL : CrawlConfig.DEFAULT_NUM_THREADS_CLUSTER;
    SimpleHttpFetcher fetcher = new SimpleHttpFetcher(maxThreads, fetcherPolicy, userAgent);
    fetcher.setMaxRetryCount(CrawlConfig.MAX_RETRIES);
    fetcher.setSocketTimeout(CrawlConfig.SOCKET_TIMEOUT);
    fetcher.setConnectionTimeout(CrawlConfig.CONNECTION_TIMEOUT);

    // The scorer is used by the FetchPipe to assign a score to every URL that passes the
    // robots.txt processing. The score is used to sort URLs such that higher scoring URLs
    // are fetched first. If URLs are skipped for any reason, lower scoring URLs are skipped.
    BaseScoreGenerator scorer = new FixedScoreGenerator();
    FetchPipe fetchPipe = new FetchPipe(urlsToFetchPipe, scorer, fetcher, numReducers);
    Pipe statusPipe = new Pipe("status pipe", fetchPipe.getStatusTailPipe());
    Pipe contentPipe = new Pipe("content pipe", fetchPipe.getContentTailPipe());
    contentPipe = TupleLogger.makePipe(contentPipe, true);

    // Create a parser that returns back the raw HTML (cleaned up by Tika) as the parsed content.
    SimpleParser parser = new SimpleParser(new ParserPolicy(), true);
    ParsePipe parsePipe = new ParsePipe(fetchPipe.getContentTailPipe(), parser);

    Pipe analyzerPipe = new Pipe("analyzer pipe");
    analyzerPipe = new Each(parsePipe.getTailPipe(), new AnalyzeHtml());

    // Add a regex url filter to filter outlinks.
    Pipe outlinksPipe = new Pipe("outlinks pipe", analyzerPipe);
    outlinksPipe = new Each(outlinksPipe, new CreateLinkDatumFromOutlinksFunction());
    if (crawlUrlFilter != null) {
        outlinksPipe = new Each(outlinksPipe, new UrlFilter(crawlUrlFilter));
    }

    Pipe resultsPipe = new Pipe("results pipe", analyzerPipe);
    resultsPipe = new Each(resultsPipe, new CreateResultsFunction());

    // Group the finished datums, the skipped datums, status, and outlinks.
    Pipe updatePipe = new CoGroup("update pipe",
            Pipe.pipes(finishedDatumsFromDb, statusPipe, analyzerPipe, outlinksPipe),
            Fields.fields(new Fields(CrawlDbDatum.URL_FIELD), new Fields(StatusDatum.URL_FN),
                    new Fields(AnalyzedDatum.URL_FIELD), new Fields(LinkDatum.URL_FN)),
            null, new OuterJoin());
    updatePipe = new Every(updatePipe, new UpdateCrawlDbBuffer(), Fields.RESULTS);

    // Output : loop dir specific crawldb
    Path outCrawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
    Tap crawlDbSink = new Hfs(new SequenceFile(CrawlDbDatum.FIELDS), outCrawlDbPath.toString());

    // Status
    Path statusDirPath = new Path(curLoopDirPath, CrawlConfig.STATUS_SUBDIR_NAME);
    Tap statusSink = new Hfs(new TextLine(), statusDirPath.toString());

    // Content
    Path contentDirPath = new Path(curLoopDirPath, CrawlConfig.CONTENT_SUBDIR_NAME);
    Tap contentSink = new Hfs(new SequenceFile(FetchedDatum.FIELDS), contentDirPath.toString());

    // PageResults
    Path resultsDirPath = new Path(curLoopDirPath, CrawlConfig.RESULTS_SUBDIR_NAME);
    Tap resultsSink = new Hfs(new TextLine(), resultsDirPath.toString());

    // Create the output map that connects each tail pipe to the appropriate sink.
    Map<String, Tap> sinkMap = new HashMap<String, Tap>();
    sinkMap.put(updatePipe.getName(), crawlDbSink);
    sinkMap.put(statusPipe.getName(), statusSink);
    sinkMap.put(contentPipe.getName(), contentSink);
    sinkMap.put(resultsPipe.getName(), resultsSink);

    FlowConnector flowConnector = new FlowConnector(props);
    Flow flow = flowConnector.connect(inputSource, sinkMap, updatePipe, statusPipe, contentPipe, resultsPipe);

    return flow;
}
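Each loop writes its crawldb, status, content, and results under one loop directory. A hedged sketch of cleaning up a loop directory whose flow failed, so that a re-run does not treat it as the latest loop; this error-handling policy is an assumption, not part of MinerWorkflow:

import java.io.IOException;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;

// Hedged sketch: remove a partially written loop directory after a failed flow run.
static void removeFailedLoopDir(Path curLoopDirPath) throws IOException {
    FileSystem fs = curLoopDirPath.getFileSystem(new JobConf());
    if (fs.exists(curLoopDirPath)) {
        // Recursive delete, so the crawldb/status/content/results subdirs go with it.
        fs.delete(curLoopDirPath, true);
    }
}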
From source file: com.finderbots.miner.RegexUrlFilter.java
License: Apache License
public static List<String> getUrlFilterPatterns(String urlFiltersFile)
        throws IOException, InterruptedException {
    // This reads regex filters from a file in HDFS or the native file system.
    JobConf conf = HadoopUtils.getDefaultJobConf();
    Path filterFile = new Path(urlFiltersFile);
    FileSystem fs = filterFile.getFileSystem(conf);
    List<String> filterList = new ArrayList<String>();
    if (fs.exists(filterFile)) {
        FSDataInputStream in = fs.open(filterFile);
        LineReader reader = new LineReader(in);
        Text tLine = new Text();
        while (reader.readLine(tLine) > 0) {
            String line = tLine.toString();
            if (StringUtils.isNotBlank(line)
                    && (line.startsWith(INCLUDE_CHAR) || line.startsWith(EXCLUDE_CHAR))) {
                filterList.add(line.trim());
            }
        }
        in.close();
    }
    return filterList;
}
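The same filter file could be read with a plain BufferedReader instead of Hadoop's LineReader. A hedged sketch follows; the "+" and "-" include/exclude markers are assumed from the regex patterns used elsewhere in this tool and may differ from the actual INCLUDE_CHAR/EXCLUDE_CHAR constants:

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;

// Hedged sketch: read include/exclude regex lines from a filter file in HDFS or locally.
static List<String> readFilterLines(String urlFiltersFile) throws IOException {
    Path filterFile = new Path(urlFiltersFile);
    FileSystem fs = filterFile.getFileSystem(new JobConf());
    List<String> lines = new ArrayList<String>();
    if (!fs.exists(filterFile)) {
        return lines;
    }
    BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(filterFile), "UTF-8"));
    try {
        String line;
        while ((line = reader.readLine()) != null) {
            if (line.startsWith("+") || line.startsWith("-")) { // assumed include/exclude markers
                lines.add(line.trim());
            }
        }
    } finally {
        reader.close();
    }
    return lines;
}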
From source file: com.finderbots.miner2.pinterest.PinterestCrawlAndMinerTool.java
License: Apache License
public static void main(String[] args) {
    Options options = new Options();
    CmdLineParser parser = new CmdLineParser(options);

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        printUsageAndExit(parser);
    }

    // Before we get too far along, see if the domain looks valid.
    String domain = options.getDomain();
    String urlsFile = options.getUrlsFile();
    if (domain != null) {
        validateDomain(domain, parser);
    } else {
        if (urlsFile == null) {
            System.err.println(
                    "Either a target domain should be specified or a file with a list of urls needs to be provided");
            printUsageAndExit(parser);
        }
    }

    if (domain != null && urlsFile != null) {
        System.out.println("Warning: Both domain and urls file list provided - using domain");
    }

    String outputDirName = options.getOutputDir();
    if (options.isDebugLogging()) {
        System.setProperty("bixo.root.level", "DEBUG");
    } else {
        System.setProperty("bixo.root.level", "INFO");
    }

    if (options.getLoggingAppender() != null) {
        // Set console vs. DRFA vs. something else.
        System.setProperty("bixo.appender", options.getLoggingAppender());
    }

    String logsDir = options.getLogsDir();
    if (!logsDir.endsWith("/")) {
        logsDir = logsDir + "/";
    }

    try {
        JobConf conf = new JobConf();
        Path outputPath = new Path(outputDirName);
        FileSystem fs = outputPath.getFileSystem(conf);

        // First check if the user wants to clean.
        if (options.isCleanOutputDir()) {
            if (fs.exists(outputPath)) {
                fs.delete(outputPath, true);
            }
        }

        // If the user is starting from scratch, set up the output directory
        // and create an initial urls subdir.
        if (!fs.exists(outputPath)) {
            fs.mkdirs(outputPath);

            // Create a "0-<timestamp>" sub-directory with just a /crawldb subdir.
            // In the /crawldb dir the input file will have a single URL for the target domain.
            Path curLoopDir = CrawlDirUtils.makeLoopDir(fs, outputPath, 0);
            String curLoopDirName = curLoopDir.getName();
            setLoopLoggerFile(logsDir + curLoopDirName, 0);
            Path crawlDbPath = new Path(curLoopDir, CrawlConfig.CRAWLDB_SUBDIR_NAME);

            if (domain != null) {
                importOneDomain(domain, crawlDbPath, conf);
            } else {
                importUrls(urlsFile, crawlDbPath);
            }
        }

        Path latestDirPath = CrawlDirUtils.findLatestLoopDir(fs, outputPath);
        if (latestDirPath == null) {
            System.err.println("No previous cycle output dirs exist in " + outputDirName);
            printUsageAndExit(parser);
        }

        Path crawlDbPath = new Path(latestDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

        // Set up the start and end loop counts.
        int startLoop = CrawlDirUtils.extractLoopNumber(latestDirPath);
        int endLoop = startLoop + options.getNumLoops();

        // Set up the UserAgent for the fetcher.
        UserAgent userAgent = new UserAgent(options.getAgentName(), CrawlConfig.EMAIL_ADDRESS,
                CrawlConfig.WEB_ADDRESS);

        // You also get to customize the FetcherPolicy.
        FetcherPolicy defaultPolicy;
        if (options.getCrawlDuration() != 0) {
            defaultPolicy = new AdaptiveFetcherPolicy(options.getEndCrawlTime(), options.getCrawlDelay());
        } else {
            defaultPolicy = new FetcherPolicy();
        }
        defaultPolicy.setMaxContentSize(CrawlConfig.MAX_CONTENT_SIZE);
        defaultPolicy.setRequestTimeout(10L * 1000L); // 10 seconds

        // COMPLETE for crawling a single site, EFFICIENT for many sites.
        if (options.getCrawlPolicy().equals(Options.IMPOLITE_CRAWL_POLICY)) {
            defaultPolicy.setFetcherMode(FetcherPolicy.FetcherMode.IMPOLITE);
        } else if (options.getCrawlPolicy().equals(Options.EFFICIENT_CRAWL_POLICY)) {
            defaultPolicy.setFetcherMode(FetcherPolicy.FetcherMode.EFFICIENT);
        } else if (options.getCrawlPolicy().equals(Options.COMPLETE_CRAWL_POLICY)) {
            defaultPolicy.setFetcherMode(FetcherPolicy.FetcherMode.COMPLETE);
        }

        // It is a good idea to set up a crawl duration when running long crawls, as you may
        // end up in situations where the fetch slows down due to a 'long tail', and by
        // specifying a crawl duration you know exactly when the crawl will end.
        int crawlDurationInMinutes = options.getCrawlDuration();
        boolean hasEndTime = crawlDurationInMinutes != Options.NO_CRAWL_DURATION;
        long targetEndTime = hasEndTime
                ? System.currentTimeMillis() + (crawlDurationInMinutes * CrawlConfig.MILLISECONDS_PER_MINUTE)
                : FetcherPolicy.NO_CRAWL_END_TIME;

        // By setting up a url filter we only deal with urls that we want to,
        // instead of all the urls that we extract.
        BaseUrlFilter urlFilter = null;
        List<String> patterns = null;
        String regexUrlFiltersFile = options.getRegexUrlFiltersFile();
        if (regexUrlFiltersFile != null) {
            patterns = RegexUrlDatumFilter.getUrlFilterPatterns(regexUrlFiltersFile);
        } else {
            patterns = RegexUrlDatumFilter.getDefaultUrlFilterPatterns();
            if (domain != null) {
                String domainPatterStr = "+(?i)^(http|https)://([a-z0-9]*\\.)*" + domain;
                patterns.add(domainPatterStr);
            } else {
                String protocolPatterStr = "+(?i)^(http|https)://*";
                patterns.add(protocolPatterStr);
                //Log.warn("Defaulting to basic url regex filtering (just suffix and protocol");
            }
        }
        urlFilter = new RegexUrlDatumFilter(patterns.toArray(new String[patterns.size()]));

        // Get a list of patterns which tell the miner which URLs to include or exclude.
        patterns.clear();
        RegexUrlStringFilter urlsToMineFilter = null;
        String regexUrlsToMineFiltersFile = options.getRegexUrlToMineFile();
        AnalyzeHtml analyzer = null;
        if (regexUrlsToMineFiltersFile != null) {
            patterns = RegexUrlDatumFilter.getUrlFilterPatterns(regexUrlsToMineFiltersFile);
            urlsToMineFilter = new RegexUrlStringFilter(patterns.toArray(new String[patterns.size()]));
            analyzer = new AnalyzeHtml(urlsToMineFilter);
        }

        // OK, now we're ready to start looping, since we've got our current settings.
        for (int curLoop = startLoop + 1; curLoop <= endLoop; curLoop++) {

            // Adjust target end time, if appropriate.
            if (hasEndTime) {
                int remainingLoops = (endLoop - curLoop) + 1;
                long now = System.currentTimeMillis();
                long perLoopTime = (targetEndTime - now) / remainingLoops;
                defaultPolicy.setCrawlEndTime(now + perLoopTime);
            }

            Path curLoopDirPath = CrawlDirUtils.makeLoopDir(fs, outputPath, curLoop);
            String curLoopDirName = curLoopDirPath.getName();
            setLoopLoggerFile(logsDir + curLoopDirName, curLoop);

            Flow flow = PinterestCrawlAndMinerWorkflow.createFlow(curLoopDirPath, crawlDbPath, defaultPolicy,
                    userAgent, urlFilter, analyzer, options);
            flow.complete();

            // Writing out .dot files is a good way to verify your flows.
            flow.writeDOT("valid-flow.dot");

            // Update crawlDbPath to point to the latest crawl db.
            crawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
        }
    } catch (PlannerException e) {
        e.writeDOT("failed-flow.dot");
        System.err.println("PlannerException: " + e.getMessage());
        e.printStackTrace(System.err);
        System.exit(-1);
    } catch (Throwable t) {
        System.err.println("Exception running tool: " + t.getMessage());
        t.printStackTrace(System.err);
        System.exit(-1);
    }
}
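The clean-or-create handling of the output directory at the top of main() is a pattern worth isolating. A hedged sketch, pulled out into a hypothetical helper that is not part of the tool itself:

import java.io.IOException;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;

// Hedged sketch of the clean-or-create output dir pattern used at the top of main().
static FileSystem prepareOutputDir(String outputDirName, boolean cleanOutputDir) throws IOException {
    Path outputPath = new Path(outputDirName);
    FileSystem fs = outputPath.getFileSystem(new JobConf());
    if (cleanOutputDir && fs.exists(outputPath)) {
        fs.delete(outputPath, true); // wipe previous crawl state
    }
    if (!fs.exists(outputPath)) {
        fs.mkdirs(outputPath); // fresh start: the tool then seeds a "0-<timestamp>" loop dir
    }
    return fs;
}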
From source file: com.finderbots.miner2.pinterest.PinterestCrawlAndMinerWorkflow.java
License: Apache License
public static Flow createFlow(Path curWorkingDirPath, Path crawlDbPath, FetcherPolicy fetcherPolicy,
        UserAgent userAgent, BaseUrlFilter urlFilter, AnalyzeHtml analyzer,
        PinterestCrawlAndMinerTool.Options options) throws Throwable {

    JobConf conf = HadoopUtils.getDefaultJobConf(CrawlConfig.CRAWL_STACKSIZE_KB);
    int numReducers = HadoopUtils.getNumReducers(conf);
    conf.setNumReduceTasks(numReducers);
    Properties props = HadoopUtils.getDefaultProperties(PinterestCrawlAndMinerWorkflow.class,
            options.isDebugLogging(), conf);
    FileSystem fs = curWorkingDirPath.getFileSystem(conf);

    // Input : the crawldb
    if (!fs.exists(crawlDbPath)) {
        throw new RuntimeException("CrawlDb doesn't exist at " + crawlDbPath);
    }

    // Our crawl db is defined by the CrawlDbDatum.
    Tap inputSource = new Hfs(new SequenceFile(CrawlDbDatum.FIELDS), crawlDbPath.toString());
    Pipe importPipe = new Pipe("import pipe");

    // Split into tuples that are to be fetched and that have already been fetched.
    SplitterAssembly splitter = new SplitterAssembly(importPipe, new SplitFetchedUnfetchedCrawlDatums());

    Pipe finishedDatumsFromDb = splitter.getRHSPipe();
    Pipe urlsToFetchPipe = new Pipe("urls to Fetch", splitter.getLHSPipe());

    // Convert the urlsToFetchPipe so that we now deal with UrlDatums.
    urlsToFetchPipe = new Each(urlsToFetchPipe, new CreateUrlDatumFromCrawlDbFunction());

    // A TupleLogger is a good way to follow the tuples around in a flow. You can enable the output
    // of tuples by setting options.setDebugLogging() to true.
    urlsToFetchPipe = TupleLogger.makePipe(urlsToFetchPipe, true);

    // Create the output sinks:
    //      crawldb
    //      content
    //      parse
    //      status
    Path outCrawlDbPath = new Path(curWorkingDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
    Tap loopCrawldbSink = new Hfs(new SequenceFile(CrawlDbDatum.FIELDS), outCrawlDbPath.toString());

    Path contentDirPath = new Path(curWorkingDirPath, CrawlConfig.CONTENT_SUBDIR_NAME);
    Tap contentSink = new Hfs(new SequenceFile(FetchedDatum.FIELDS), contentDirPath.toString());

    Path parseDirPath = new Path(curWorkingDirPath, CrawlConfig.PARSE_SUBDIR_NAME);
    Tap parseSink = new Hfs(new SequenceFile(ParsedDatum.FIELDS), parseDirPath.toString());

    Path statusDirPath = new Path(curWorkingDirPath, CrawlConfig.STATUS_SUBDIR_NAME);
    Tap statusSink = new Hfs(new TextLine(), statusDirPath.toString());

    // Create the sub-assembly that runs the fetch job.
    SimpleHttpFetcher fetcher = new SimpleHttpFetcher(options.getMaxThreads(), fetcherPolicy, userAgent);
    fetcher.setMaxRetryCount(CrawlConfig.MAX_RETRIES);            // set to two tries
    fetcher.setSocketTimeout(CrawlConfig.SOCKET_TIMEOUT);         // and 10 sec timeout
    fetcher.setConnectionTimeout(CrawlConfig.CONNECTION_TIMEOUT);

    // You can also provide a set of mime types to restrict what content types you
    // want to deal with - for now keep it simple.
    Set<String> validMimeTypes = new HashSet<String>();
    validMimeTypes.add("text/plain");
    validMimeTypes.add("text/html");
    fetcherPolicy.setValidMimeTypes(validMimeTypes);

    // The scorer is used by the FetchPipe to assign a score to every URL that passes the
    // robots.txt processing. The score is used to sort URLs such that higher scoring URLs
    // are fetched first. If URLs are skipped for any reason, lower scoring URLs are skipped.
    BaseScoreGenerator scorer = new FixedScoreGenerator();
    FetchPipe fetchPipe = new FetchPipe(urlsToFetchPipe, scorer, fetcher, numReducers);
    Pipe statusPipe = new Pipe("status pipe", fetchPipe.getStatusTailPipe());
    Pipe contentPipe = new Pipe("content pipe", fetchPipe.getContentTailPipe());
    contentPipe = TupleLogger.makePipe(contentPipe, true);

    // Take content and split it into content output plus parse to extract URLs.
    // BEWARE: The SimpleParser will discard HTML unless you pass in true as the last arg! So for mining
    // always pass in true!
    SimpleParser parser;
    if (options.isUseBoilerpipe()) {
        parser = new SimpleParser(new BoilerpipeContentExtractor(), new SimpleLinkExtractor(), new ParserPolicy());
    } else if (options.isGenerateHTML()) {
        parser = new SimpleParser(new HtmlContentExtractor(), new SimpleLinkExtractor(), new ParserPolicy(), true);
    } else if (options.isEnableMiner()) {
        parser = new SimpleParser(new HtmlContentExtractor(), new SimpleLinkExtractor(), new ParserPolicy(), true);
    } else {
        parser = new SimpleParser();
    }
    parser.setExtractLanguage(false);
    ParsePipe parsePipe = new ParsePipe(contentPipe, parser);

    Tap writableSeqFileSink = null;
    Pipe writableSeqFileDataPipe = null;

    // Create the output map that connects each tail pipe to the appropriate sink, and the
    // list of tail pipes.
    Map<String, Tap> sinkMap = new HashMap<String, Tap>();
    List<Pipe> tailPipes = new ArrayList<Pipe>();

    if (options.isGenerateHTML()) {
        // Let's write out the parse as text.
        Pipe textParsePipe = new Pipe("text parse data", parsePipe.getTailPipe());
        textParsePipe = new Each(textParsePipe, new Fields(ParsedDatum.PARSED_TEXT_FN),
                new RegexReplace(new Fields(ParsedDatum.PARSED_TEXT_FN), "[\\r\\n\\t]+", " ", true),
                Fields.REPLACE);
        textParsePipe = new Each(textParsePipe, new Fields(ParsedDatum.URL_FN, ParsedDatum.PARSED_TEXT_FN),
                new Identity());
        Path textParsePath = new Path(curWorkingDirPath, CrawlConfig.HTML_SUBDIR_NAME);
        Tap textParseTap = new Hfs(new TextLine(), textParsePath.toString(), true);
        sinkMap.put(textParsePipe.getName(), textParseTap);
        tailPipes.add(textParsePipe);
    }

    if (options.isEnableMiner()) {
        Pipe analyzerPipe = new Pipe("analyzer pipe", parsePipe.getTailPipe());
        analyzerPipe = new Each(analyzerPipe, analyzer);

        Pipe resultsPipe = new Pipe("results pipe", analyzerPipe);
        resultsPipe = new Each(resultsPipe, new CreateBooleanPreferenceFunction());

        Path minerOutputPath = new Path(curWorkingDirPath, CrawlConfig.MINER_SUBDIR_NAME);
        Tap minerOutputTap = new Hfs(new TextLine(), minerOutputPath.toString(), true);
        sinkMap.put(resultsPipe.getName(), minerOutputTap);
        tailPipes.add(resultsPipe);
    }

    // Let's output a WritableSequenceFile as an example - this file can
    // then be used as input when working with Mahout.
    writableSeqFileDataPipe = new Pipe("writable seqfile data",
            new Each(parsePipe.getTailPipe(), new CreateWritableSeqFileData()));

    Path writableSeqFileDataPath = new Path(curWorkingDirPath, CrawlConfig.EXTRACTED_TEXT_SUBDIR_NAME);
    writableSeqFileSink = new Hfs(new WritableSequenceFile(
            new Fields(CrawlConfig.WRITABLE_SEQ_FILE_KEY_FN, CrawlConfig.WRITABLE_SEQ_FILE_VALUE_FN),
            Text.class, Text.class), writableSeqFileDataPath.toString());

    Pipe urlFromOutlinksPipe = new Pipe("url from outlinks", parsePipe.getTailPipe());
    urlFromOutlinksPipe = new Each(urlFromOutlinksPipe,
            new CreateUrlDatumFromOutlinksFunction(new SimpleUrlNormalizer(), new SimpleUrlValidator()));
    if (urlFilter != null) {
        urlFromOutlinksPipe = new Each(urlFromOutlinksPipe, new UrlFilter(urlFilter));
    }
    urlFromOutlinksPipe = new Each(urlFromOutlinksPipe, new NormalizeUrlFunction(new SimpleUrlNormalizer()));
    urlFromOutlinksPipe = TupleLogger.makePipe(urlFromOutlinksPipe, true);

    // Take status and output urls from it.
    Pipe urlFromFetchPipe = new Pipe("url from fetch", statusPipe);
    urlFromFetchPipe = new Each(urlFromFetchPipe, new CreateUrlDatumFromStatusFunction());
    urlFromFetchPipe = TupleLogger.makePipe(urlFromFetchPipe, true);

    // Finally join the URLs we get from parsing content with the URLs we got
    // from the status output, and the urls we didn't process from the db, so that
    // we have a unified stream of all known URLs for the crawldb.
    Pipe finishedUrlsFromDbPipe = new Each(finishedDatumsFromDb, new CreateUrlDatumFromCrawlDbFunction());
    finishedUrlsFromDbPipe = TupleLogger.makePipe(finishedUrlsFromDbPipe, true);

    // NOTE : Ideally you would just do a CoGroup instead of converting all the pipes to emit UrlDatums
    // and then doing the extra step of converting from UrlDatum to CrawlDbDatum.
    // The reason this isn't being done here is because we are sharing LatestUrlDatumBuffer() with JDBCCrawlTool.
    Pipe crawlDbPipe = new GroupBy("crawldb pipe",
            Pipe.pipes(urlFromFetchPipe, urlFromOutlinksPipe, finishedUrlsFromDbPipe),
            new Fields(UrlDatum.URL_FN));
    crawlDbPipe = new Every(crawlDbPipe, new LatestUrlDatumBuffer(), Fields.RESULTS);

    Pipe outputPipe = new Pipe("output pipe");
    outputPipe = new Each(crawlDbPipe, new CreateCrawlDbDatumFromUrlFunction());

    // Create the output map that connects each tail pipe to the appropriate sink.
    sinkMap.put(statusPipe.getName(), statusSink);
    tailPipes.add(statusPipe);

    sinkMap.put(contentPipe.getName(), contentSink);
    tailPipes.add(contentPipe);

    sinkMap.put(parsePipe.getTailPipe().getName(), parseSink);
    tailPipes.add(parsePipe.getTailPipe());

    sinkMap.put(outputPipe.getName(), loopCrawldbSink);
    tailPipes.add(outputPipe);

    sinkMap.put(writableSeqFileDataPipe.getName(), writableSeqFileSink);
    tailPipes.add(writableSeqFileDataPipe);

    FlowConnector flowConnector = new FlowConnector(props);
    Flow flow = flowConnector.connect(inputSource, sinkMap, tailPipes.toArray(new Pipe[tailPipes.size()]));

    return flow;
}
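The workflow's last sink is a Text/Text WritableSequenceFile meant as Mahout input. A hedged sketch of reading one of those part files back with Hadoop's SequenceFile.Reader, e.g. to spot-check the extracted text; the part-file name is an assumption, and a real reader would iterate over fs.listStatus() of the directory:

import java.io.IOException;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;

// Hedged sketch: read back one Text/Text sequence file written by the workflow above.
static void dumpExtractedText(Path extractedTextDirPath) throws IOException {
    JobConf conf = new JobConf();
    FileSystem fs = extractedTextDirPath.getFileSystem(conf);
    Path partFile = new Path(extractedTextDirPath, "part-00000"); // hypothetical part-file name
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, partFile, conf);
    try {
        Text key = new Text();
        Text value = new Text();
        while (reader.next(key, value)) {
            System.out.println(key + "\t" + value.toString().length() + " chars");
        }
    } finally {
        reader.close();
    }
}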
From source file: com.finderbots.miner2.tomatoes.RTCriticsCrawlAndMinerTool.java
License: Apache License
public static void main(String[] args) {
    Options options = new Options();
    CmdLineParser parser = new CmdLineParser(options);

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        printUsageAndExit(parser);
    }

    // Before we get too far along, see if the domain looks valid.
    String domain = options.getDomain();
    String urlsFile = options.getUrlsFile();
    if (domain != null) {
        validateDomain(domain, parser);
    } else {
        if (urlsFile == null) {
            System.err.println(
                    "Either a target domain should be specified or a file with a list of urls needs to be provided");
            printUsageAndExit(parser);
        }
    }

    if (domain != null && urlsFile != null) {
        System.out.println("Warning: Both domain and urls file list provided - using domain");
    }

    String outputDirName = options.getOutputDir();
    if (options.isDebugLogging()) {
        System.setProperty("bixo.root.level", "DEBUG");
    } else {
        System.setProperty("bixo.root.level", "INFO");
    }

    if (options.getLoggingAppender() != null) {
        // Set console vs. DRFA vs. something else.
        System.setProperty("bixo.appender", options.getLoggingAppender());
    }

    String logsDir = options.getLogsDir();
    if (!logsDir.endsWith("/")) {
        logsDir = logsDir + "/";
    }

    try {
        JobConf conf = new JobConf();
        Path outputPath = new Path(outputDirName);
        FileSystem fs = outputPath.getFileSystem(conf);

        // First check if the user wants to clean.
        if (options.isCleanOutputDir()) {
            if (fs.exists(outputPath)) {
                fs.delete(outputPath, true);
            }
        }

        // If the user is starting from scratch, set up the output directory
        // and create an initial urls subdir.
        if (!fs.exists(outputPath)) {
            fs.mkdirs(outputPath);

            // Create a "0-<timestamp>" sub-directory with just a /crawldb subdir.
            // In the /crawldb dir the input file will have a single URL for the target domain.
            Path curLoopDir = CrawlDirUtils.makeLoopDir(fs, outputPath, 0);
            String curLoopDirName = curLoopDir.getName();
            setLoopLoggerFile(logsDir + curLoopDirName, 0);
            Path crawlDbPath = new Path(curLoopDir, CrawlConfig.CRAWLDB_SUBDIR_NAME);

            if (domain != null) {
                importOneDomain(domain, crawlDbPath, conf);
            } else {
                importUrls(urlsFile, crawlDbPath);
            }
        }

        Path latestDirPath = CrawlDirUtils.findLatestLoopDir(fs, outputPath);
        if (latestDirPath == null) {
            System.err.println("No previous cycle output dirs exist in " + outputDirName);
            printUsageAndExit(parser);
        }

        Path crawlDbPath = new Path(latestDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

        // Set up the start and end loop counts.
        int startLoop = CrawlDirUtils.extractLoopNumber(latestDirPath);
        int endLoop = startLoop + options.getNumLoops();

        // Set up the UserAgent for the fetcher.
        UserAgent userAgent = new UserAgent(options.getAgentName(), CrawlConfig.EMAIL_ADDRESS,
                CrawlConfig.WEB_ADDRESS);

        // You also get to customize the FetcherPolicy.
        FetcherPolicy defaultPolicy;
        if (options.getCrawlDuration() != 0) {
            defaultPolicy = new AdaptiveFetcherPolicy(options.getEndCrawlTime(), options.getCrawlDelay());
        } else {
            defaultPolicy = new FetcherPolicy();
        }
        defaultPolicy.setMaxContentSize(CrawlConfig.MAX_CONTENT_SIZE);
        defaultPolicy.setRequestTimeout(10L * 1000L); // 10 seconds

        // COMPLETE for crawling a single site, EFFICIENT for many sites.
        if (options.getCrawlPolicy().equals(Options.IMPOLITE_CRAWL_POLICY)) {
            defaultPolicy.setFetcherMode(FetcherPolicy.FetcherMode.IMPOLITE);
        } else if (options.getCrawlPolicy().equals(Options.EFFICIENT_CRAWL_POLICY)) {
            defaultPolicy.setFetcherMode(FetcherPolicy.FetcherMode.EFFICIENT);
        } else if (options.getCrawlPolicy().equals(Options.COMPLETE_CRAWL_POLICY)) {
            defaultPolicy.setFetcherMode(FetcherPolicy.FetcherMode.COMPLETE);
        }

        // It is a good idea to set up a crawl duration when running long crawls, as you may
        // end up in situations where the fetch slows down due to a 'long tail', and by
        // specifying a crawl duration you know exactly when the crawl will end.
        int crawlDurationInMinutes = options.getCrawlDuration();
        boolean hasEndTime = crawlDurationInMinutes != Options.NO_CRAWL_DURATION;
        long targetEndTime = hasEndTime
                ? System.currentTimeMillis() + (crawlDurationInMinutes * CrawlConfig.MILLISECONDS_PER_MINUTE)
                : FetcherPolicy.NO_CRAWL_END_TIME;

        // By setting up a url filter we only deal with urls that we want to,
        // instead of all the urls that we extract.
        BaseUrlFilter urlFilter = null;
        List<String> patterns = null;
        String regexUrlFiltersFile = options.getRegexUrlFiltersFile();
        if (regexUrlFiltersFile != null) {
            patterns = RegexUrlDatumFilter.getUrlFilterPatterns(regexUrlFiltersFile);
        } else {
            patterns = RegexUrlDatumFilter.getDefaultUrlFilterPatterns();
            if (domain != null) {
                String domainPatterStr = "+(?i)^(http|https)://([a-z0-9]*\\.)*" + domain;
                patterns.add(domainPatterStr);
            } else {
                String protocolPatterStr = "+(?i)^(http|https)://*";
                patterns.add(protocolPatterStr);
                //Log.warn("Defaulting to basic url regex filtering (just suffix and protocol");
            }
        }
        urlFilter = new RegexUrlDatumFilter(patterns.toArray(new String[patterns.size()]));

        // Get a list of patterns which tell the miner which URLs to include or exclude.
        patterns.clear();
        RegexUrlStringFilter urlsToMineFilter = null;
        String regexUrlsToMineFiltersFile = options.getRegexUrlToMineFile();
        MineRTCriticsPreferences prefsAnalyzer = null;
        if (regexUrlsToMineFiltersFile != null) {
            patterns = RegexUrlDatumFilter.getUrlFilterPatterns(regexUrlsToMineFiltersFile);
            urlsToMineFilter = new RegexUrlStringFilter(patterns.toArray(new String[patterns.size()]));
            prefsAnalyzer = new MineRTCriticsPreferences(urlsToMineFilter);
        }

        // OK, now we're ready to start looping, since we've got our current settings.
        for (int curLoop = startLoop + 1; curLoop <= endLoop; curLoop++) {

            // Adjust target end time, if appropriate.
            if (hasEndTime) {
                int remainingLoops = (endLoop - curLoop) + 1;
                long now = System.currentTimeMillis();
                long perLoopTime = (targetEndTime - now) / remainingLoops;
                defaultPolicy.setCrawlEndTime(now + perLoopTime);
            }

            Path curLoopDirPath = CrawlDirUtils.makeLoopDir(fs, outputPath, curLoop);
            String curLoopDirName = curLoopDirPath.getName();
            setLoopLoggerFile(logsDir + curLoopDirName, curLoop);

            Flow flow = RTCriticsCrawlAndMinerWorkflow.createFlow(curLoopDirPath, crawlDbPath, defaultPolicy,
                    userAgent, urlFilter, prefsAnalyzer, options);
            flow.complete();

            // Writing out .dot files is a good way to verify your flows.
            flow.writeDOT("valid-flow.dot");

            // Update crawlDbPath to point to the latest crawl db.
            crawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
        }
    } catch (PlannerException e) {
        e.writeDOT("failed-flow.dot");
        System.err.println("PlannerException: " + e.getMessage());
        e.printStackTrace(System.err);
        System.exit(-1);
    } catch (Throwable t) {
        System.err.println("Exception running tool: " + t.getMessage());
        t.printStackTrace(System.err);
        System.exit(-1);
    }
}
From source file: com.finderbots.miner2.tomatoes.RTCriticsCrawlAndMinerWorkflow.java
License: Apache License
public static Flow createFlow(Path curWorkingDirPath, Path crawlDbPath, FetcherPolicy fetcherPolicy,
        UserAgent userAgent, BaseUrlFilter urlFilter, MineRTCriticsPreferences prefsAnalyzer,
        RTCriticsCrawlAndMinerTool.Options options) throws Throwable {

    JobConf conf = HadoopUtils.getDefaultJobConf(CrawlConfig.CRAWL_STACKSIZE_KB);
    int numReducers = HadoopUtils.getNumReducers(conf);
    conf.setNumReduceTasks(numReducers);
    Properties props = HadoopUtils.getDefaultProperties(RTCriticsCrawlAndMinerWorkflow.class,
            options.isDebugLogging(), conf);
    FileSystem fs = curWorkingDirPath.getFileSystem(conf);

    // Input : the crawldb
    if (!fs.exists(crawlDbPath)) {
        throw new RuntimeException("CrawlDb doesn't exist at " + crawlDbPath);
    }

    // Our crawl db is defined by the CrawlDbDatum.
    Tap inputSource = new Hfs(new SequenceFile(CrawlDbDatum.FIELDS), crawlDbPath.toString());
    Pipe importPipe = new Pipe("import pipe");

    // Split into tuples that are to be fetched and that have already been fetched.
    SplitterAssembly splitter = new SplitterAssembly(importPipe, new SplitFetchedUnfetchedCrawlDatums());

    Pipe finishedDatumsFromDb = splitter.getRHSPipe();
    Pipe urlsToFetchPipe = new Pipe("urls to Fetch", splitter.getLHSPipe());

    // Convert the urlsToFetchPipe so that we now deal with UrlDatums.
    urlsToFetchPipe = new Each(urlsToFetchPipe, new CreateUrlDatumFromCrawlDbFunction());

    // A TupleLogger is a good way to follow the tuples around in a flow. You can enable the output
    // of tuples by setting options.setDebugLogging() to true.
    urlsToFetchPipe = TupleLogger.makePipe(urlsToFetchPipe, true);

    // Create the output sinks:
    //      crawldb
    //      content
    //      parse
    //      status
    Path outCrawlDbPath = new Path(curWorkingDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
    Tap loopCrawldbSink = new Hfs(new SequenceFile(CrawlDbDatum.FIELDS), outCrawlDbPath.toString());

    Path contentDirPath = new Path(curWorkingDirPath, CrawlConfig.CONTENT_SUBDIR_NAME);
    Tap contentSink = new Hfs(new SequenceFile(FetchedDatum.FIELDS), contentDirPath.toString());

    Path parseDirPath = new Path(curWorkingDirPath, CrawlConfig.PARSE_SUBDIR_NAME);
    Tap parseSink = new Hfs(new SequenceFile(ParsedDatum.FIELDS), parseDirPath.toString());

    Path statusDirPath = new Path(curWorkingDirPath, CrawlConfig.STATUS_SUBDIR_NAME);
    Tap statusSink = new Hfs(new TextLine(), statusDirPath.toString());

    // Create the sub-assembly that runs the fetch job.
    SimpleHttpFetcher fetcher = new SimpleHttpFetcher(options.getMaxThreads(), fetcherPolicy, userAgent);
    fetcher.setMaxRetryCount(CrawlConfig.MAX_RETRIES);            // set to two tries
    fetcher.setSocketTimeout(CrawlConfig.SOCKET_TIMEOUT);         // and 10 sec timeout
    fetcher.setConnectionTimeout(CrawlConfig.CONNECTION_TIMEOUT);

    // You can also provide a set of mime types to restrict what content types you
    // want to deal with - for now keep it simple.
    Set<String> validMimeTypes = new HashSet<String>();
    validMimeTypes.add("text/plain");
    validMimeTypes.add("text/html");
    fetcherPolicy.setValidMimeTypes(validMimeTypes);

    // The scorer is used by the FetchPipe to assign a score to every URL that passes the
    // robots.txt processing. The score is used to sort URLs such that higher scoring URLs
    // are fetched first. If URLs are skipped for any reason, lower scoring URLs are skipped.
    BaseScoreGenerator scorer = new FixedScoreGenerator();
    FetchPipe fetchPipe = new FetchPipe(urlsToFetchPipe, scorer, fetcher, numReducers);
    Pipe statusPipe = new Pipe("status pipe", fetchPipe.getStatusTailPipe());
    Pipe contentPipe = new Pipe("content pipe", fetchPipe.getContentTailPipe());
    contentPipe = TupleLogger.makePipe(contentPipe, true);

    // Take content and split it into content output plus parse to extract URLs.
    // BEWARE: The SimpleParser will discard HTML unless you pass in true as the last arg! So for mining
    // always pass in true!
    SimpleParser parser;
    if (options.isUseBoilerpipe()) {
        parser = new SimpleParser(new BoilerpipeContentExtractor(), new SimpleLinkExtractor(), new ParserPolicy());
    } else if (options.isGenerateHTML()) {
        parser = new SimpleParser(new HtmlContentExtractor(), new SimpleLinkExtractor(), new ParserPolicy(), true);
    } else if (options.isEnableMiner()) {
        parser = new SimpleParser(new HtmlContentExtractor(), new SimpleLinkExtractor(), new ParserPolicy(), true);
    } else {
        parser = new SimpleParser();
    }
    parser.setExtractLanguage(false);
    ParsePipe parsePipe = new ParsePipe(contentPipe, parser);

    Tap writableSeqFileSink = null;
    Pipe writableSeqFileDataPipe = null;

    // Create the output map that connects each tail pipe to the appropriate sink, and the
    // list of tail pipes.
    Map<String, Tap> sinkMap = new HashMap<String, Tap>();
    List<Pipe> tailPipes = new ArrayList<Pipe>();

    if (options.isGenerateHTML()) {
        // Let's write out the parse as text.
        Pipe textParsePipe = new Pipe("text parse data", parsePipe.getTailPipe());
        textParsePipe = new Each(textParsePipe, new Fields(ParsedDatum.PARSED_TEXT_FN),
                new RegexReplace(new Fields(ParsedDatum.PARSED_TEXT_FN), "[\\r\\n\\t]+", " ", true),
                Fields.REPLACE);
        textParsePipe = new Each(textParsePipe, new Fields(ParsedDatum.URL_FN, ParsedDatum.PARSED_TEXT_FN),
                new Identity());
        Path textParsePath = new Path(curWorkingDirPath, CrawlConfig.HTML_SUBDIR_NAME);
        Tap textParseTap = new Hfs(new TextLine(), textParsePath.toString(), true);
        sinkMap.put(textParsePipe.getName(), textParseTap);
        tailPipes.add(textParsePipe);
    }

    if (options.isEnableMiner()) {
        // All the miner assembly happens here.
        // Analyze all pages that are to be mined, creating an RTPageDatum
        // that will have data for an /m/ page OR a /critic/ page, but not both.
        // TODO: in a perfect world there would be two datum types and we would
        // split them before analysis, but it's nice to have all analysis in a single
        // function--maybe?
        Pipe prefsAnalyzerPipe = new Pipe("RT critics analyzer pipe", parsePipe.getTailPipe());
        prefsAnalyzerPipe = new Each(prefsAnalyzerPipe, prefsAnalyzer);

        // Take all RTPageDatum, create a text line TSV, then write to an output Tap.
        Pipe prefsPipe = new Pipe("prefs pipe", prefsAnalyzerPipe);
        prefsPipe = new Each(prefsPipe, new CreateRTCriticsPrefsFunction());
        // TODO: should we run through Unique?
        Path outPrefsPath = new Path(curWorkingDirPath, "prefs");
        Tap outPrefsTap = new Hfs(new TextLine(), outPrefsPath.toString(), true);
        sinkMap.put(prefsPipe.getName(), outPrefsTap);
        tailPipes.add(prefsPipe);

        // Take all RTPageDatum, filter out all but /m/ pages,
        // make sure they are unique, create a TSV line per datum,
        // and write to an output Tap.
        Pipe filterMedia = new Pipe("filter_out_all_but_media_datum", prefsAnalyzerPipe);
        filterMedia = new Each(filterMedia, new FilterMediaDatumFunction());
        Pipe mediaPipe = new Pipe("create_media_records", filterMedia);
        mediaPipe = new Each(mediaPipe, new CreateRTMediaRecordsFunction());
        Pipe uniqueMedia = new Unique("uniquify_media_records", mediaPipe, new Fields("line"));

        Path outMediaPath = new Path(curWorkingDirPath, "media");
        Tap outMediaTap = new Hfs(new TextLine(), outMediaPath.toString(), true);
        sinkMap.put(uniqueMedia.getName(), outMediaTap);
        tailPipes.add(uniqueMedia);
    }

    // Let's output a WritableSequenceFile as an example - this file can
    // then be used as input when working with Mahout.
    writableSeqFileDataPipe = new Pipe("writable seqfile data",
            new Each(parsePipe.getTailPipe(), new CreateWritableSeqFileData()));

    Path writableSeqFileDataPath = new Path(curWorkingDirPath, CrawlConfig.EXTRACTED_TEXT_SUBDIR_NAME);
    writableSeqFileSink = new Hfs(new WritableSequenceFile(
            new Fields(CrawlConfig.WRITABLE_SEQ_FILE_KEY_FN, CrawlConfig.WRITABLE_SEQ_FILE_VALUE_FN),
            Text.class, Text.class), writableSeqFileDataPath.toString());

    Pipe urlFromOutlinksPipe = new Pipe("url from outlinks", parsePipe.getTailPipe());
    urlFromOutlinksPipe = new Each(urlFromOutlinksPipe,
            new CreateUrlDatumFromOutlinksFunction(new SimpleUrlNormalizer(), new SimpleUrlValidator()));
    if (urlFilter != null) {
        urlFromOutlinksPipe = new Each(urlFromOutlinksPipe, new UrlFilter(urlFilter));
    }
    urlFromOutlinksPipe = new Each(urlFromOutlinksPipe, new NormalizeUrlFunction(new SimpleUrlNormalizer()));
    urlFromOutlinksPipe = TupleLogger.makePipe(urlFromOutlinksPipe, true);

    // Take status and output urls from it.
    Pipe urlFromFetchPipe = new Pipe("url from fetch", statusPipe);
    urlFromFetchPipe = new Each(urlFromFetchPipe, new CreateUrlDatumFromStatusFunction());
    urlFromFetchPipe = TupleLogger.makePipe(urlFromFetchPipe, true);

    // Finally join the URLs we get from parsing content with the URLs we got
    // from the status output, and the urls we didn't process from the db, so that
    // we have a unified stream of all known URLs for the crawldb.
    Pipe finishedUrlsFromDbPipe = new Each(finishedDatumsFromDb, new CreateUrlDatumFromCrawlDbFunction());
    finishedUrlsFromDbPipe = TupleLogger.makePipe(finishedUrlsFromDbPipe, true);

    // NOTE : Ideally you would just do a CoGroup instead of converting all the pipes to emit UrlDatums
    // and then doing the extra step of converting from UrlDatum to CrawlDbDatum.
    // The reason this isn't being done here is because we are sharing LatestUrlDatumBuffer() with JDBCCrawlTool.
    Pipe crawlDbPipe = new GroupBy("crawldb pipe",
            Pipe.pipes(urlFromFetchPipe, urlFromOutlinksPipe, finishedUrlsFromDbPipe),
            new Fields(UrlDatum.URL_FN));
    crawlDbPipe = new Every(crawlDbPipe, new LatestUrlDatumBuffer(), Fields.RESULTS);

    Pipe outputPipe = new Pipe("output pipe");
    outputPipe = new Each(crawlDbPipe, new CreateCrawlDbDatumFromUrlFunction());

    // Create the output map that connects each tail pipe to the appropriate sink.
    sinkMap.put(statusPipe.getName(), statusSink);
    tailPipes.add(statusPipe);

    sinkMap.put(contentPipe.getName(), contentSink);
    tailPipes.add(contentPipe);

    sinkMap.put(parsePipe.getTailPipe().getName(), parseSink);
    tailPipes.add(parsePipe.getTailPipe());

    sinkMap.put(outputPipe.getName(), loopCrawldbSink);
    tailPipes.add(outputPipe);

    sinkMap.put(writableSeqFileDataPipe.getName(), writableSeqFileSink);
    tailPipes.add(writableSeqFileDataPipe);

    FlowConnector flowConnector = new FlowConnector(props);
    Flow flow = flowConnector.connect(inputSource, sinkMap, tailPipes.toArray(new Pipe[tailPipes.size()]));

    return flow;
}
From source file: com.finderbots.utilities.ExportPinterestPrefsTool.java
License: Apache License
public static void main(String[] args) {
    ExportToolOptions options = new ExportToolOptions();
    CmdLineParser parser = new CmdLineParser(options);
    String outputDirName;

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        printUsageAndExit(parser);
    }

    try {
        outputDirName = options.getOutputDir();
        JobConf conf = new JobConf();
        Path outputPath = new Path(outputDirName);
        Path crawlPath = new Path(options.getCrawlDir());
        FileSystem fs = outputPath.getFileSystem(conf);

        // Get the urls of users and the urls of followed people, make sure they are unique,
        // create an index, and write the ids out as a CSV file of prefs for Mahout input.
        Flow exportPinterestPrefsWorkFlow = ExportPinterestPrefsWorkflow.createFlow(crawlPath, options);
        exportPinterestPrefsWorkFlow.complete();
    } catch (PlannerException e) {
        e.writeDOT("failed-flow.dot");
        System.err.println("PlannerException: " + e.getMessage());
        e.printStackTrace(System.err);
        System.exit(-1);
    } catch (Throwable t) {
        System.err.println("Exception running tool: " + t.getMessage());
        t.printStackTrace(System.err);
        System.exit(-1);
    }
}
From source file: com.finderbots.utilities.ExportTool.java
License: Apache License
public static void main(String[] args) {
    ExportToolOptions options = new ExportToolOptions();
    CmdLineParser parser = new CmdLineParser(options);
    String outputDirName;

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        printUsageAndExit(parser);
    }

    try {
        outputDirName = options.getOutputDir();
        JobConf conf = new JobConf();
        Path outputPath = new Path(outputDirName);
        Path crawlPath = new Path(options.getCrawlDir());
        FileSystem fs = outputPath.getFileSystem(conf);

        // Create a flow that takes all parsed text and accumulates it into a single sink in Mahout format.
        Flow exportToMahoutFlow = ExportAllToMahoutWorkflow.createFlow(crawlPath, options);
        exportToMahoutFlow.complete();
    } catch (PlannerException e) {
        e.writeDOT("failed-flow.dot");
        System.err.println("PlannerException: " + e.getMessage());
        e.printStackTrace(System.err);
        System.exit(-1);
    } catch (Throwable t) {
        System.err.println("Exception running tool: " + t.getMessage());
        t.printStackTrace(System.err);
        System.exit(-1);
    }
}
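Both export tools resolve a FileSystem for the output path even though the Cascading flow performs the actual writes. A hedged sketch of putting that handle to use by validating the crawl and output paths before connecting the flow; these checks are an assumption, not part of ExportTool:

import java.io.IOException;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;

// Hedged sketch: sanity-check the export paths before running the flow.
static void validateExportPaths(Path crawlPath, Path outputPath) throws IOException {
    JobConf conf = new JobConf();
    FileSystem crawlFs = crawlPath.getFileSystem(conf);
    FileSystem outFs = outputPath.getFileSystem(conf);
    if (!crawlFs.exists(crawlPath)) {
        throw new IOException("Crawl dir doesn't exist: " + crawlPath);
    }
    if (outFs.exists(outputPath)) {
        throw new IOException("Output dir already exists, refusing to overwrite: " + outputPath);
    }
}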