List of usage examples for org.apache.hadoop.fs.Path#toString()
@Override
public String toString()
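Path.toString() returns the string form of the path, including the scheme and authority when the path is fully qualified. That string form is what the examples below pass to string-based APIs such as Configuration properties and Cascading Hfs taps. The following is a minimal, self-contained sketch (not drawn from any of the source files listed below) illustrating the basic behavior; the namenode address and directory names are made up:

import org.apache.hadoop.fs.Path;

public class PathToStringExample {
    public static void main(String[] args) {
        // A fully qualified path keeps its scheme and authority in toString().
        Path qualified = new Path("hdfs://namenode:8020/user/data/input");
        System.out.println(qualified.toString()); // hdfs://namenode:8020/user/data/input

        // A child path resolved against a parent directory.
        Path child = new Path(qualified, "part-00000");
        System.out.println(child.toString());     // hdfs://namenode:8020/user/data/input/part-00000
    }
}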
From source file:be.uantwerpen.adrem.disteclat.PrefixComputerMapper.java
License:Apache License
@Override
public void setup(Context context) throws IOException {
    try {
        Configuration conf = context.getConfiguration();
        minSup = conf.getInt(MIN_SUP_KEY, -1);
        prefixLength = conf.getInt(PREFIX_LENGTH_KEY, 1);

        Path[] localCacheFiles = getLocalCacheFiles(conf);
        for (Path path : localCacheFiles) {
            String pathString = path.toString();
            if (pathString.contains(OSingletonsTids)) {
                System.out.println("[PrefixComputerMapper]: Reading singletons");
                singletons = readTidLists(conf, path);
            } else if (pathString.contains(OSingletonsOrder)) {
                System.out.println("[PrefixComputerMapper]: Reading singleton orders");
                orderMap = readSingletonsOrder(path);
            }
        }
        sortSingletons();
    } catch (Exception e) {
        e.printStackTrace();
    }
}
From source file:be.uantwerpen.adrem.disteclat.PrefixComputerMapper.java
License:Apache License
/**
 * Reads the singletons ordering from file.
 *
 * @param path
 * @return
 * @throws IOException
 */
private static Map<Integer, Integer> readSingletonsOrder(Path path) throws IOException {
    BufferedReader reader = new BufferedReader(new FileReader(path.toString()));
    String order = reader.readLine().trim();
    reader.close();

    Map<Integer, Integer> orderMap = newHashMap();
    String[] split = order.split(" ");
    int ix = 0;
    for (String item : split) {
        orderMap.put(valueOf(item), ix++);
    }
    return orderMap;
}
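To make the produced ordering concrete: a single-line order file containing "7 3 12" yields the map {7=0, 3=1, 12=2}, i.e. item to position. The short check below is a hypothetical illustration (the temp file and its contents are made up, and it assumes it runs inside the same class so the private method is visible):

// Hypothetical: write a one-line order file and read it back with the method above.
java.nio.file.Path tmp = java.nio.file.Files.createTempFile("order", ".txt");
java.nio.file.Files.write(tmp, "7 3 12".getBytes());
Map<Integer, Integer> order = readSingletonsOrder(new Path(tmp.toString()));
// order.get(7) == 0, order.get(3) == 1, order.get(12) == 2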
From source file:be.uantwerpen.adrem.hadoop.util.Tools.java
License:Apache License
@SuppressWarnings("rawtypes")
public static Job prepareJob(Path inputPath, Path outputPath, Class<? extends InputFormat> inputFormat,
        Class<? extends Mapper> mapper, Class<? extends Writable> mapperKey,
        Class<? extends Writable> mapperValue, Class<? extends Reducer> reducer,
        Class<? extends Writable> reducerKey, Class<? extends Writable> reducerValue,
        Class<? extends OutputFormat> outputFormat) throws IOException {
    Job job = new Job(new Configuration());
    Configuration jobConf = job.getConfiguration();

    if (reducer.equals(Reducer.class)) {
        if (mapper.equals(Mapper.class)) {
            throw new IllegalStateException("Can't figure out the user class jar file from mapper/reducer");
        }
        job.setJarByClass(mapper);
    } else {
        job.setJarByClass(reducer);
    }

    job.setInputFormatClass(inputFormat);
    jobConf.set("mapred.input.dir", inputPath.toString());

    job.setMapperClass(mapper);
    if (mapperKey != null) {
        job.setMapOutputKeyClass(mapperKey);
    }
    if (mapperValue != null) {
        job.setMapOutputValueClass(mapperValue);
    }

    jobConf.setBoolean("mapred.compress.map.output", true);

    job.setReducerClass(reducer);
    job.setOutputKeyClass(reducerKey);
    job.setOutputValueClass(reducerValue);
    job.setOutputFormatClass(outputFormat);
    jobConf.set("mapred.output.dir", outputPath.toString());

    return job;
}
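For context, a call to this helper might look like the sketch below. The mapper, reducer, and directory names are illustrative placeholders (not part of the original source file); the input/output format classes are the standard ones from org.apache.hadoop.mapreduce.lib:

// Hypothetical usage of the prepareJob helper above; MyItemMapper, MyCountReducer,
// and the paths are placeholders.
Path input = new Path("/data/transactions");
Path output = new Path("/data/frequent-itemsets");
Job job = prepareJob(input, output,
        TextInputFormat.class,
        MyItemMapper.class, Text.class, IntWritable.class,
        MyCountReducer.class, Text.class, IntWritable.class,
        TextOutputFormat.class);
job.waitForCompletion(true);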
From source file:bigfat.hadoop.HDFSDirInputStream.java
License:Apache License
/**
 * Create an input stream that will read through all the files in one
 * directory. Note that the files will be sorted by name, using the
 * comparator.
 *
 * @param fs
 * @param dir
 * @param comp
 * @throws IOException
 */
public HDFSDirInputStream(FileSystem fs, String dir, Comparator<String> comp) throws IOException {
    this.fs = fs;
    Path p = new Path(dir);

    FileStatus fstate = fs.getFileStatus(p);
    if (fstate.isDir()) {
        FileStatus[] child = fs.globStatus(new Path(dir + "/*"));
        LinkedList<String> s = new LinkedList<String>();
        Map<String, Path> map = new HashMap<String, Path>();
        for (FileStatus c : child) {
            if (c.isDir())
                continue;
            map.put(c.getPath().getName(), c.getPath());
            s.add(c.getPath().getName());
        }
        if (comp != null)
            Collections.sort(s, comp);
        else
            Collections.sort(s);

        Iterator<String> it = s.iterator();
        while (it.hasNext()) {
            String n = it.next();
            Path pr = map.get(n);
            this.appendFile(pr.toString());
        }
    } else {
        this.appendFile(dir);
    }
}
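A hypothetical usage sketch, assuming HDFSDirInputStream exposes the standard java.io.InputStream contract (the directory path is a placeholder and the natural name ordering is used by passing a null comparator):

FileSystem fs = FileSystem.get(new Configuration());
// Read all files under /data/logs back to back, in natural name order.
InputStream in = new HDFSDirInputStream(fs, "/data/logs", null);
BufferedReader reader = new BufferedReader(new InputStreamReader(in));
String line;
while ((line = reader.readLine()) != null) {
    System.out.println(line);
}
reader.close();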
From source file:bixo.examples.crawl.DemoCrawlWorkflowLRTest.java
License:Apache License
@Test
public void testNotLosingFetchedUrls() throws Throwable {
    String baseDirName = "build/test/SimpleCrawlWorkflowLRTest/output";
    JobConf conf = new JobConf();
    Path baseDirPath = new Path(baseDirName);
    FileSystem fs = baseDirPath.getFileSystem(conf);

    HadoopUtils.safeRemove(fs, baseDirPath);
    Path curLoopDirPath = CrawlDirUtils.makeLoopDir(fs, baseDirPath, 0);
    Path crawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

    DemoCrawlTool.importOneDomain("localhost:8089", crawlDbPath, conf);
    curLoopDirPath = CrawlDirUtils.makeLoopDir(fs, baseDirPath, 1);

    FetcherPolicy defaultPolicy = new FetcherPolicy();
    defaultPolicy.setCrawlDelay(1);
    defaultPolicy.setFetcherMode(FetcherMode.COMPLETE);
    BaseUrlFilter urlFilter = new BaseUrlFilter() {

        @Override
        public boolean isRemove(UrlDatum datum) {
            return false;
        }
    };

    DemoCrawlToolOptions options = new DemoCrawlToolOptions();
    options.setUseBoilerpipe(true);
    UserAgent userAgent = new UserAgent("test", "test@domain.com", "http://test.domain.com");
    Server server = null;
    try {
        server = startServer(new FakeWebSiteHandler(), 8089);
        Flow flow = DemoCrawlWorkflow.createFlow(curLoopDirPath, crawlDbPath, defaultPolicy, userAgent,
                urlFilter, options);
        flow.complete();

        // Update the crawlDb path
        crawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

        // Now we should have an output/1-<timestamp>/ directory, where the /urls dir has 11 entries,
        // with one being previously crawled and the other 10 being pending.
        Hfs crawldbTap = new Hfs(new SequenceFile(CrawlDbDatum.FIELDS), crawlDbPath.toString());
        TupleEntryIterator iter = crawldbTap.openForRead(conf);

        int numFetched = 0;
        int numPending = 0;
        while (iter.hasNext()) {
            CrawlDbDatum datum = new CrawlDbDatum(iter.next());
            UrlStatus status = datum.getLastStatus();
            int crawlDepth = datum.getCrawlDepth();
            if (datum.getLastFetched() != 0) {
                numFetched += 1;
                assertEquals(UrlStatus.FETCHED, status);
                assertEquals(0, crawlDepth);
            } else {
                numPending += 1;
                assertEquals(UrlStatus.UNFETCHED, status);
                assertEquals(1, crawlDepth);
            }
        }

        assertEquals(1, numFetched);
        assertEquals(10, numPending);

        // Do it one more time, to verify status gets propagated forward.
        curLoopDirPath = CrawlDirUtils.makeLoopDir(fs, baseDirPath, 2);
        flow = DemoCrawlWorkflow.createFlow(curLoopDirPath, crawlDbPath, defaultPolicy, userAgent,
                urlFilter, options);
        flow.complete();

        // Update crawldb path
        crawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
        crawldbTap = new Hfs(new SequenceFile(CrawlDbDatum.FIELDS), crawlDbPath.toString());
        iter = crawldbTap.openForRead(conf);

        numFetched = 0;
        numPending = 0;
        int numDepth0 = 0;
        int numDepth1 = 0;
        int numDepth2 = 0;
        while (iter.hasNext()) {
            CrawlDbDatum datum = new CrawlDbDatum(iter.next());
            UrlStatus status = datum.getLastStatus();
            int depth = datum.getCrawlDepth();

            if (datum.getLastFetched() != 0) {
                numFetched += 1;
                assertEquals("URL has incorrect status: " + datum.getUrl(), UrlStatus.FETCHED, status);
            } else {
                numPending += 1;
                assertEquals("URL has incorrect status: " + datum.getUrl(), UrlStatus.UNFETCHED, status);
            }

            if (depth == 0) {
                numDepth0 += 1;
            } else if (depth == 1) {
                numDepth1 += 1;
            } else if (depth == 2) {
                numDepth2 += 1;
            } else {
                fail("Invalid crawl depth for " + datum.getUrl());
            }

            // System.out.println(String.format("URL %s has status %s, last fetch %d, and depth %d",
            //     datum.getUrl(), datum.getLastStatus(), datum.getLastFetched(), depth));
        }

        assertEquals(11, numFetched);
        assertEquals(100, numPending);
        assertEquals(1, numDepth0);
        assertEquals(10, numDepth1);
        assertEquals(100, numDepth2);
    } catch (Throwable t) {
        fail(t.getMessage());
    } finally {
        if (server != null) {
            server.stop();
        }
    }
}
From source file:bixo.examples.crawl.JDBCCrawlWorkflow.java
License:Apache License
public static Flow createFlow(Path inputDir, Path curLoopDirPath, UserAgent userAgent,
        FetcherPolicy fetcherPolicy, BaseUrlFilter urlFilter, int maxThreads, boolean debug,
        String persistentDbLocation) throws Throwable {
    JobConf conf = HadoopUtils.getDefaultJobConf(CrawlConfig.CRAWL_STACKSIZE_KB);
    int numReducers = HadoopUtils.getNumReducers(conf);
    conf.setNumReduceTasks(numReducers);

    FileSystem fs = curLoopDirPath.getFileSystem(conf);

    if (!fs.exists(inputDir)) {
        throw new IllegalStateException(String.format("Input directory %s doesn't exist", inputDir));
    }

    Tap inputSource = JDBCTapFactory.createUrlsSourceJDBCTap(persistentDbLocation);

    // Read _everything_ in initially.
    // Group on the url, and select the best urls to fetch.
    Pipe importPipe = new Pipe("url importer");
    importPipe = new GroupBy(importPipe, new Fields(CrawlDbDatum.URL_FIELD));
    importPipe = new Every(importPipe, new BestUrlToFetchBuffer(), Fields.RESULTS);

    Path contentPath = new Path(curLoopDirPath, CrawlConfig.CONTENT_SUBDIR_NAME);
    Tap contentSink = new Hfs(new SequenceFile(FetchedDatum.FIELDS), contentPath.toString());

    Path parsePath = new Path(curLoopDirPath, CrawlConfig.PARSE_SUBDIR_NAME);
    Tap parseSink = new Hfs(new SequenceFile(ParsedDatum.FIELDS), parsePath.toString());

    Path statusDirPath = new Path(curLoopDirPath, CrawlConfig.STATUS_SUBDIR_NAME);
    Tap statusSink = new Hfs(new TextLine(), statusDirPath.toString());

    // NOTE: The source and sink for CrawlDbDatums is essentially the same database -
    // since cascading doesn't allow you to use the same tap for source and
    // sink we fake it by creating two separate taps.
    Tap urlSink = JDBCTapFactory.createUrlsSinkJDBCTap(persistentDbLocation);

    // Create the sub-assembly that runs the fetch job
    BaseFetcher fetcher = new SimpleHttpFetcher(maxThreads, fetcherPolicy, userAgent);
    BaseScoreGenerator scorer = new FixedScoreGenerator();
    FetchPipe fetchPipe = new FetchPipe(importPipe, scorer, fetcher, numReducers);

    Pipe statusPipe = new Pipe("status pipe", fetchPipe.getStatusTailPipe());

    // Take content and split it into content output plus parse to extract URLs.
    ParsePipe parsePipe = new ParsePipe(fetchPipe.getContentTailPipe(), new SimpleParser());
    Pipe urlFromOutlinksPipe = new Pipe("url from outlinks", parsePipe.getTailPipe());
    urlFromOutlinksPipe = new Each(urlFromOutlinksPipe,
            new CreateUrlDatumFromOutlinksFunction(new SimpleUrlNormalizer(), new SimpleUrlValidator()));
    urlFromOutlinksPipe = new Each(urlFromOutlinksPipe, new UrlFilter(urlFilter));
    urlFromOutlinksPipe = new Each(urlFromOutlinksPipe, new NormalizeUrlFunction(new SimpleUrlNormalizer()));

    // Take status and output updated UrlDatums. Again, since we are using
    // the same database we need to create a new tap.
    Pipe urlFromFetchPipe = new Pipe("url from fetch", fetchPipe.getStatusTailPipe());
    urlFromFetchPipe = new Each(urlFromFetchPipe, new CreateUrlDatumFromStatusFunction());

    // Now we need to join the URLs we get from parsing content with the
    // URLs we got from the status output, so we have a unified stream
    // of all known URLs.
    Pipe urlPipe = new GroupBy("url pipe", Pipe.pipes(urlFromFetchPipe, urlFromOutlinksPipe),
            new Fields(UrlDatum.URL_FN));
    urlPipe = new Every(urlPipe, new LatestUrlDatumBuffer(), Fields.RESULTS);

    Pipe outputPipe = new Pipe("output pipe");
    outputPipe = new Each(urlPipe, new CreateCrawlDbDatumFromUrlFunction());

    // Create the output map that connects each tail pipe to the appropriate sink.
    Map<String, Tap> sinkMap = new HashMap<String, Tap>();
    sinkMap.put(statusPipe.getName(), statusSink);
    sinkMap.put(FetchPipe.CONTENT_PIPE_NAME, contentSink);
    sinkMap.put(ParsePipe.PARSE_PIPE_NAME, parseSink);
    sinkMap.put(outputPipe.getName(), urlSink);

    // Finally we can run it.
    FlowConnector flowConnector = new FlowConnector(
            HadoopUtils.getDefaultProperties(JDBCCrawlWorkflow.class, debug, conf));
    return flowConnector.connect(inputSource, sinkMap, statusPipe, fetchPipe.getContentTailPipe(),
            parsePipe.getTailPipe(), outputPipe);
}
From source file:bixo.examples.crawl.LatestUrlDatumBufferTest.java
License:Apache License
@Test
public void testOperateWithGroupBy() throws IOException {
    // Create a temp file with a fetched url
    Path fetchedDatumsPath = new Path(_workingDirPath, "fetched");
    ArrayList<UrlDatum> fetchedDatums = new ArrayList<UrlDatum>();
    UrlDatum fetchedDatum1 = new UrlDatum("http://foo.com");
    fetchedDatum1.setPayloadValue(CrawlDbDatum.LAST_FETCHED_FIELD, 2L);
    fetchedDatums.add(fetchedDatum1);
    createDataFile(fetchedDatumsPath.toString(), fetchedDatums);

    // And another with unfetched urls
    Path unfetchedDatumsPath = new Path(_workingDirPath, "unfetched");
    ArrayList<UrlDatum> unfetchedDatums = new ArrayList<UrlDatum>();
    UrlDatum unfetchedDatum1 = new UrlDatum("http://foo.com");
    unfetchedDatum1.setPayloadValue(CrawlDbDatum.LAST_FETCHED_FIELD, 0L);
    unfetchedDatums.add(unfetchedDatum1);
    UrlDatum unfetchedDatum2 = new UrlDatum("http://foo.com");
    unfetchedDatum2.setPayloadValue(CrawlDbDatum.LAST_FETCHED_FIELD, 0L);
    unfetchedDatums.add(unfetchedDatum2);
    createDataFile(unfetchedDatumsPath.toString(), unfetchedDatums);

    // create a workflow
    Tap inputSource1 = new Hfs(new SequenceFile(UrlDatum.FIELDS), fetchedDatumsPath.toString());
    Pipe fetchedPipe = new Pipe("fetched");
    Tap inputSource2 = new Hfs(new SequenceFile(UrlDatum.FIELDS), unfetchedDatumsPath.toString());
    Pipe unfetchedPipe = new Pipe("unfetched");

    Map<String, Tap> sources = new HashMap<String, Tap>();
    sources.put(fetchedPipe.getName(), inputSource1);
    sources.put(unfetchedPipe.getName(), inputSource2);

    Path resultsPath = new Path(_workingDirPath, "results");
    Tap resultSink = new Hfs(new SequenceFile(UrlDatum.FIELDS), resultsPath.toString(), true);

    Pipe resultsPipe = new GroupBy("results pipe", Pipe.pipes(fetchedPipe, unfetchedPipe),
            new Fields(UrlDatum.URL_FN));
    resultsPipe = new Every(resultsPipe, new LatestUrlDatumBuffer(), Fields.RESULTS);

    Properties props = HadoopUtils.getDefaultProperties(LatestUrlDatumBufferTest.class, false, _conf);
    FlowConnector flowConnector = new FlowConnector(props);
    Flow flow = flowConnector.connect(sources, resultSink, resultsPipe);
    flow.complete();

    // verify that the resulting pipe has the latest tuple
    Tap testSink = new Hfs(new SequenceFile(UrlDatum.FIELDS), resultsPath.toString(), false);
    TupleEntryIterator reader = testSink.openForRead(_conf);
    int count = 0;
    long latest = 0;
    while (reader.hasNext()) {
        TupleEntry next = reader.next();
        UrlDatum datum = new UrlDatum(next);
        latest = (Long) datum.getPayloadValue(CrawlDbDatum.LAST_FETCHED_FIELD);
        count++;
    }

    assertEquals(1, count);
    assertEquals(2, latest);
}
From source file:bixo.examples.crawl.SimpleCrawlWorkflow.java
License:Apache License
public static Flow createFlow(Path curWorkingDirPath, Path crawlDbPath, FetcherPolicy fetcherPolicy,
        UserAgent userAgent, BaseUrlFilter urlFilter, SimpleCrawlToolOptions options) throws Throwable {
    JobConf conf = HadoopUtils.getDefaultJobConf(CrawlConfig.CRAWL_STACKSIZE_KB);
    int numReducers = HadoopUtils.getNumReducers(conf);
    conf.setNumReduceTasks(numReducers);
    Properties props = HadoopUtils.getDefaultProperties(SimpleCrawlWorkflow.class, options.isDebugLogging(),
            conf);
    FileSystem fs = curWorkingDirPath.getFileSystem(conf);

    // Input : the crawldb
    if (!fs.exists(crawlDbPath)) {
        throw new RuntimeException("CrawlDb not found");
    }

    // Our crawl db is defined by the CrawlDbDatum
    Tap inputSource = new Hfs(new SequenceFile(CrawlDbDatum.FIELDS), crawlDbPath.toString());
    Pipe importPipe = new Pipe("import pipe");

    // Split into tuples that are to be fetched and that have already been fetched
    SplitterAssembly splitter = new SplitterAssembly(importPipe, new SplitFetchedUnfetchedCrawlDatums());

    Pipe finishedDatumsFromDb = splitter.getRHSPipe();
    Pipe urlsToFetchPipe = new Pipe("urls to Fetch", splitter.getLHSPipe());

    // Convert the urlsToFetchPipe so that we now deal with UrlDatums.
    urlsToFetchPipe = new Each(urlsToFetchPipe, new CreateUrlDatumFromCrawlDbFunction());
    // A TupleLogger is a good way to follow the tuples around in a flow. You can enable the output
    // of tuples by setting options.setDebugLogging() to true.
    urlsToFetchPipe = TupleLogger.makePipe(urlsToFetchPipe, true);

    // Create the output sinks:
    //      crawldb
    //      content
    //      parse
    //      status
    Path outCrawlDbPath = new Path(curWorkingDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
    Tap loopCrawldbSink = new Hfs(new SequenceFile(CrawlDbDatum.FIELDS), outCrawlDbPath.toString());

    Path contentDirPath = new Path(curWorkingDirPath, CrawlConfig.CONTENT_SUBDIR_NAME);
    Tap contentSink = new Hfs(new SequenceFile(FetchedDatum.FIELDS), contentDirPath.toString());

    Path parseDirPath = new Path(curWorkingDirPath, CrawlConfig.PARSE_SUBDIR_NAME);
    Tap parseSink = new Hfs(new SequenceFile(ParsedDatum.FIELDS), parseDirPath.toString());

    Path statusDirPath = new Path(curWorkingDirPath, CrawlConfig.STATUS_SUBDIR_NAME);
    Tap statusSink = new Hfs(new TextLine(), statusDirPath.toString());

    Path productsDirPath = new Path(curWorkingDirPath, CrawlConfig.PRODUCTS_SUBDIR_NAME);
    Tap productsSink = new Hfs(new TextLine(), productsDirPath.toString());
    // Tap productsSink = new Hfs(new TextLine(ProductDatum.FIELDS), productsDirPath.toString());

    // Create the sub-assembly that runs the fetch job
    SimpleHttpFetcher fetcher = new SimpleHttpFetcher(options.getMaxThreads(), fetcherPolicy, userAgent);
    fetcher.setMaxRetryCount(CrawlConfig.MAX_RETRIES);
    fetcher.setSocketTimeout(CrawlConfig.SOCKET_TIMEOUT);
    fetcher.setConnectionTimeout(CrawlConfig.CONNECTION_TIMEOUT);

    // You can also provide a set of mime types to restrict what content types you
    // want to deal with - for now keep it simple.
    Set<String> validMimeTypes = new HashSet<String>();
    validMimeTypes.add("text/plain");
    validMimeTypes.add("text/html");
    fetcherPolicy.setValidMimeTypes(validMimeTypes);

    // The scorer is used by the FetchPipe to assign a score to every URL that passes the
    // robots.txt processing. The score is used to sort URLs such that higher scoring URLs
    // are fetched first. If URLs are skipped for any reason(s), lower scoring URLs are skipped.
    BaseScoreGenerator scorer = new FixedScoreGenerator();

    FetchPipe fetchPipe = new FetchPipe(urlsToFetchPipe, scorer, fetcher, numReducers);
    Pipe statusPipe = new Pipe("status pipe", fetchPipe.getStatusTailPipe());
    Pipe contentPipe = new Pipe("content pipe", fetchPipe.getContentTailPipe());
    contentPipe = TupleLogger.makePipe(contentPipe, true);

    // Take content and split it into content output plus parse to extract URLs.
    SimpleParser parser = new SimpleParser();
    parser.setExtractLanguage(false);
    ParsePipe parsePipe = new ParsePipe(contentPipe, parser);

    Pipe productsPipe = new Pipe("products pipe", parsePipe);
    // PRECIOUS Pipe productsPipe = new Pipe("products pipe", fetchPipe.getContentTailPipe());

    String regex = "[a-z]+@[a-z]+.[a-z]+";
    // WAS: String regex = "[\\w\\-]([\\.\\w])+[\\w]+@([\\w\\-]+\\.)+[A-Z]{2,4}";
    Function emailExtractor = new RegexGenerator(new Fields("email"), regex);
    productsPipe = new Each(productsPipe, emailExtractor);
    // PRECIOUS productsPipe = new Each(productsPipe, new CreateProductDatumsFunction());
    productsPipe = TupleLogger.makePipe(productsPipe, true);

    Pipe urlFromOutlinksPipe = new Pipe("url from outlinks", parsePipe.getTailPipe());
    urlFromOutlinksPipe = new Each(urlFromOutlinksPipe, new CreateUrlDatumFromOutlinksFunction());
    if (urlFilter != null) {
        urlFromOutlinksPipe = new Each(urlFromOutlinksPipe, new UrlFilter(urlFilter));
    }

    urlFromOutlinksPipe = new Each(urlFromOutlinksPipe, new NormalizeUrlFunction(new SimpleUrlNormalizer()));
    urlFromOutlinksPipe = TupleLogger.makePipe(urlFromOutlinksPipe, true);

    // Take status and output urls from it
    Pipe urlFromFetchPipe = new Pipe("url from fetch");
    urlFromFetchPipe = new Each(statusPipe, new CreateUrlDatumFromStatusFunction());
    urlFromFetchPipe = TupleLogger.makePipe(urlFromFetchPipe, true);

    // Finally join the URLs we get from parsing content with the URLs we got
    // from the status output, and the urls we didn't process from the db, so that
    // we have a unified stream of all known URLs for the crawldb.
    Pipe finishedUrlsFromDbPipe = new Each(finishedDatumsFromDb, new CreateUrlDatumFromCrawlDbFunction());
    finishedUrlsFromDbPipe = TupleLogger.makePipe(finishedUrlsFromDbPipe, true);

    // NOTE: Ideally you would just do a CoGroup instead of converting all the pipes to emit UrlDatums
    // and then doing the extra step of converting from UrlDatum to CrawlDbDatum.
    // The reason this isn't being done here is because we are sharing LatestUrlDatumBuffer() with JDBCCrawlTool.
    Pipe crawlDbPipe = new GroupBy("crawldb pipe",
            Pipe.pipes(urlFromFetchPipe, urlFromOutlinksPipe, finishedUrlsFromDbPipe),
            new Fields(UrlDatum.URL_FN));
    crawlDbPipe = new Every(crawlDbPipe, new LatestUrlDatumBuffer(), Fields.RESULTS);

    Pipe outputPipe = new Pipe("output pipe");
    outputPipe = new Each(crawlDbPipe, new CreateCrawlDbDatumFromUrlFunction());

    // Create the output map that connects each tail pipe to the appropriate sink.
    Map<String, Tap> sinkMap = new HashMap<String, Tap>();
    sinkMap.put(statusPipe.getName(), statusSink);
    sinkMap.put(contentPipe.getName(), contentSink);
    sinkMap.put(ParsePipe.PARSE_PIPE_NAME, parseSink);
    sinkMap.put(crawlDbPipe.getName(), loopCrawldbSink);
    sinkMap.put(productsPipe.getName(), productsSink);

    FlowConnector flowConnector = new FlowConnector(props);
    Flow flow = flowConnector.connect(inputSource, sinkMap, statusPipe, contentPipe, parsePipe.getTailPipe(),
            outputPipe);

    return flow;
}
From source file:bixo.examples.crawl.SimpleCrawlWorkflowLRTest.java
License:Apache License
@Test
public void testNotLosingFetchedUrls() throws Throwable {
    String baseDirName = "build/test/SimpleCrawlWorkflowLRTest/output";
    JobConf conf = new JobConf();
    Path baseDirPath = new Path(baseDirName);
    FileSystem fs = baseDirPath.getFileSystem(conf);

    HadoopUtils.safeRemove(fs, baseDirPath);
    Path curLoopDirPath = CrawlDirUtils.makeLoopDir(fs, baseDirPath, 0);
    Path crawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

    SimpleCrawlTool.importOneDomain("localhost:8089", crawlDbPath, conf);
    curLoopDirPath = CrawlDirUtils.makeLoopDir(fs, baseDirPath, 1);

    FetcherPolicy defaultPolicy = new FetcherPolicy();
    defaultPolicy.setCrawlDelay(1);
    defaultPolicy.setFetcherMode(FetcherMode.COMPLETE);
    BaseUrlFilter urlFilter = new BaseUrlFilter() {

        @Override
        public boolean isRemove(UrlDatum datum) {
            return false;
        }
    };

    SimpleCrawlToolOptions options = new SimpleCrawlToolOptions();
    UserAgent userAgent = new UserAgent("test", "test@domain.com", "http://test.domain.com");
    Server server = null;
    try {
        server = startServer(new FakeWebSiteHandler(), 8089);
        Flow flow = SimpleCrawlWorkflow.createFlow(curLoopDirPath, crawlDbPath, defaultPolicy, userAgent,
                urlFilter, options);
        flow.complete();

        // Update the crawlDb path
        crawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

        // Now we should have an output/1-<timestamp>/ directory, where the /urls dir has 11 entries,
        // with one being previously crawled and the other 10 being pending.
        Hfs crawldbTap = new Hfs(new SequenceFile(CrawlDbDatum.FIELDS), crawlDbPath.toString());
        TupleEntryIterator iter = crawldbTap.openForRead(conf);

        int numFetched = 0;
        int numPending = 0;
        while (iter.hasNext()) {
            CrawlDbDatum datum = new CrawlDbDatum(iter.next());
            UrlStatus status = datum.getLastStatus();
            int crawlDepth = datum.getCrawlDepth();
            if (datum.getLastFetched() != 0) {
                numFetched += 1;
                assertEquals(UrlStatus.FETCHED, status);
                assertEquals(0, crawlDepth);
            } else {
                numPending += 1;
                assertEquals(UrlStatus.UNFETCHED, status);
                assertEquals(1, crawlDepth);
            }
        }

        assertEquals(1, numFetched);
        assertEquals(10, numPending);

        // Do it one more time, to verify status gets propagated forward.
        curLoopDirPath = CrawlDirUtils.makeLoopDir(fs, baseDirPath, 2);
        flow = SimpleCrawlWorkflow.createFlow(curLoopDirPath, crawlDbPath, defaultPolicy, userAgent,
                urlFilter, options);
        flow.complete();

        // Update crawldb path
        crawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
        crawldbTap = new Hfs(new SequenceFile(CrawlDbDatum.FIELDS), crawlDbPath.toString());
        iter = crawldbTap.openForRead(conf);

        numFetched = 0;
        numPending = 0;
        int numDepth0 = 0;
        int numDepth1 = 0;
        int numDepth2 = 0;
        while (iter.hasNext()) {
            CrawlDbDatum datum = new CrawlDbDatum(iter.next());
            UrlStatus status = datum.getLastStatus();
            int depth = datum.getCrawlDepth();

            if (datum.getLastFetched() != 0) {
                numFetched += 1;
                assertEquals("URL has incorrect status: " + datum.getUrl(), UrlStatus.FETCHED, status);
            } else {
                numPending += 1;
                assertEquals("URL has incorrect status: " + datum.getUrl(), UrlStatus.UNFETCHED, status);
            }

            if (depth == 0) {
                numDepth0 += 1;
            } else if (depth == 1) {
                numDepth1 += 1;
            } else if (depth == 2) {
                numDepth2 += 1;
            } else {
                fail("Invalid crawl depth for " + datum.getUrl());
            }

            // System.out.println(String.format("URL %s has status %s, last fetch %d, and depth %d",
            //     datum.getUrl(), datum.getLastStatus(), datum.getLastFetched(), depth));
        }

        assertEquals(11, numFetched);
        assertEquals(100, numPending);
        assertEquals(1, numDepth0);
        assertEquals(10, numDepth1);
        assertEquals(100, numDepth2);
    } catch (Throwable t) {
        fail(t.getMessage());
    } finally {
        if (server != null) {
            server.stop();
        }
    }
}
From source file:bixo.examples.JDBCCrawlWorkflow.java
License:Open Source License
public static Flow createFlow(Path inputDir, Path curLoopDirPath, UserAgent userAgent,
        FetcherPolicy fetcherPolicy, BaseUrlFilter urlFilter, int maxThreads, boolean debug,
        String persistentDbLocation) throws Throwable {
    JobConf conf = HadoopUtils.getDefaultJobConf(CrawlConfig.CRAWL_STACKSIZE_KB);
    int numReducers = conf.getNumReduceTasks() * HadoopUtils.getTaskTrackers(conf);

    FileSystem fs = curLoopDirPath.getFileSystem(conf);

    if (!fs.exists(inputDir)) {
        throw new IllegalStateException(String.format("Input directory %s doesn't exist", inputDir));
    }

    Tap inputSource = JDBCTapFactory.createUrlsSourceJDBCTap(persistentDbLocation);

    // Read _everything_ in initially
    // Split that pipe into URLs we want to fetch for the fetch pipe
    Pipe importPipe = new Pipe("url importer");
    importPipe = new GroupBy(importPipe, new Fields(CrawlDbDatum.URL_FIELD));
    importPipe = new Every(importPipe, new BestUrlToFetchBuffer(), Fields.RESULTS);

    Path contentPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
    Tap contentSink = new Hfs(new SequenceFile(FetchedDatum.FIELDS), contentPath.toString());

    Path parsePath = new Path(curLoopDirPath, CrawlConfig.PARSE_SUBDIR_NAME);
    Tap parseSink = new Hfs(new SequenceFile(ParsedDatum.FIELDS), parsePath.toString());

    Path statusDirPath = new Path(curLoopDirPath, CrawlConfig.STATUS_SUBDIR_NAME);
    Tap statusSink = new Hfs(new TextLine(), statusDirPath.toString());

    // NOTE: The source and sink for CrawlDbDatums is essentially the same database -
    // since cascading doesn't allow you to use the same tap for source and
    // sink we fake it by creating two separate taps.
    Tap urlSink = JDBCTapFactory.createUrlsSinkJDBCTap(persistentDbLocation);

    // Create the sub-assembly that runs the fetch job
    BaseFetcher fetcher = new SimpleHttpFetcher(maxThreads, fetcherPolicy, userAgent);
    BaseScoreGenerator scorer = new FixedScoreGenerator();
    FetchPipe fetchPipe = new FetchPipe(importPipe, scorer, fetcher, numReducers);

    Pipe statusPipe = new Pipe("status pipe", fetchPipe.getStatusTailPipe());

    // Take content and split it into content output plus parse to extract URLs.
    ParsePipe parsePipe = new ParsePipe(fetchPipe.getContentTailPipe(), new SimpleParser());
    Pipe urlFromOutlinksPipe = new Pipe("url from outlinks", parsePipe.getTailPipe());
    urlFromOutlinksPipe = new Each(urlFromOutlinksPipe, new CreateUrlDatumFromOutlinksFunction());
    urlFromOutlinksPipe = new Each(urlFromOutlinksPipe, new UrlFilter(urlFilter));
    urlFromOutlinksPipe = new Each(urlFromOutlinksPipe, new NormalizeUrlFunction(new SimpleUrlNormalizer()));

    // Take status and output updated UrlDatums. Again, since we are using
    // the same database we need to create a new tap.
    Pipe urlFromFetchPipe = new Pipe("url from fetch", fetchPipe.getStatusTailPipe());
    urlFromFetchPipe = new Each(urlFromFetchPipe, new CreateUrlDatumFromStatusFunction());

    // Now we need to join the URLs we get from parsing content with the
    // URLs we got from the status output, so we have a unified stream
    // of all known URLs.
    Pipe urlPipe = new GroupBy("url pipe", Pipe.pipes(urlFromFetchPipe, urlFromOutlinksPipe),
            new Fields(UrlDatum.URL_FN));
    urlPipe = new Every(urlPipe, new LatestUrlDatumBuffer(), Fields.RESULTS);

    Pipe outputPipe = new Pipe("output pipe");
    outputPipe = new Each(urlPipe, new CreateCrawlDbDatumFromUrlFunction());

    // Create the output map that connects each tail pipe to the appropriate sink.
    Map<String, Tap> sinkMap = new HashMap<String, Tap>();
    sinkMap.put(statusPipe.getName(), statusSink);
    sinkMap.put(FetchPipe.CONTENT_PIPE_NAME, contentSink);
    sinkMap.put(ParsePipe.PARSE_PIPE_NAME, parseSink);
    sinkMap.put(outputPipe.getName(), urlSink);

    // Finally we can run it.
    FlowConnector flowConnector = new FlowConnector(
            HadoopUtils.getDefaultProperties(JDBCCrawlWorkflow.class, debug, conf));
    return flowConnector.connect(inputSource, sinkMap, statusPipe, fetchPipe.getContentTailPipe(),
            parsePipe.getTailPipe(), outputPipe);
}