List of usage examples for org.apache.commons.collections.Bag.removeAll
boolean removeAll(Collection coll);
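Note that Bag.removeAll deliberately violates the java.util.Collection contract: rather than removing every occurrence of each element, it respects cardinality, removing one occurrence from the bag per occurrence in the argument collection. Here is a minimal, self-contained sketch of that behavior (an illustration assuming commons-collections 3.x; BagRemoveAllDemo is a hypothetical class name, not taken from the examples below):

import java.util.Arrays;

import org.apache.commons.collections.Bag;
import org.apache.commons.collections.bag.HashBag;

public class BagRemoveAllDemo {
  public static void main(String[] args) {
    // Hypothetical demo, assuming commons-collections 3.x on the classpath.
    Bag bag = new HashBag(Arrays.asList("a", "a", "a", "b"));
    // One "a" in the argument removes exactly one "a" from the bag,
    // unlike java.util.Collection.removeAll(), which would remove all three.
    bag.removeAll(Arrays.asList("a"));
    System.out.println(bag.getCount("a")); // 2
    System.out.println(bag.getCount("b")); // 1
  }
}

This cardinality-respecting behavior is what the LOCKSS tests below rely on: after decrementing each unique URL once, b.removeAll(sau.getPermissionUrls()) forgives one extra visit per permission page instead of wiping out every occurrence of those URLs.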
From source file:org.lockss.crawler.FuncArcExploder.java
public void runTest(boolean good) throws Exception {
  log.debug3("About to create content");
  createContent();
  // get the root of the simContent
  String simDir = sau.getSimRoot();
  log.debug3("About to crawl content");
  boolean res = crawlContent(good ? null : url[url.length - 1]);
  if (good) {
    assertTrue("Crawl failed", res);
  } else {
    assertFalse("Crawl succeeded", res);
    return;
  }
  // read all the files links from the root of the simcontent
  // check the link level of the file and see if it contains
  // in myCUS (check if the crawler crawl within the max. depth)
  CachedUrlSet myCUS = sau.getAuCachedUrlSet();
  File dir = new File(simDir);
  if (dir.isDirectory()) {
    File f[] = dir.listFiles();
    log.debug("Checking simulated content.");
    checkThruFileTree(f, myCUS);
    log.debug("Checking simulated content done.");
    checkExplodedUrls();
    checkUnExplodedUrls();
    log.debug("Check finished.");
  } else {
    log.error("Error: The root path of the simulated" + " content [" + dir + "] is not a directory");
  }
  // Test PluginManager.getAuContentSize(), just because this is a
  // convenient place to do it. If the simulated AU params are changed, or
  // SimulatedContentGenerator is changed, this number may have to
  // change. NB - because the ARC files are compressed, their
  // size varies randomly by a small amount.
  long expected = 5579;
  long actual = AuUtil.getAuContentSize(sau, true);
  long error = expected - actual;
  log.debug("Expected " + expected + " actual " + actual);
  long absError = (error < 0 ? -error : error);
  assertTrue("size mismatch " + expected + " vs. " + actual, absError < 100);
  List sbc = ((MySimulatedArchivalUnit) sau).sbc;
  Bag b = new HashBag(sbc);
  Set uniq = new HashSet(b.uniqueSet());
  for (Iterator iter = uniq.iterator(); iter.hasNext();) {
    b.remove(iter.next(), 1);
  }
  // Permission pages get checked twice. Hard to avoid that, so allow it
  b.removeAll(sau.getPermissionUrls());
  // archives get checked twice - from checkThruFileTree & checkExplodedUrls
  b.remove(url2[url2.length - 1]);
  // This test is screwed up by the use of shouldBeCached() in
  // ArcExploder() to find the AU to store the URL in.
  //assertEmpty("shouldBeCached() called multiple times on same URLs.", b);
  // Test getUrlStems
  checkGetUrlStems();
  // Test crawl rules
  checkCrawlRules();
  // Test getPermissionPage
  //checkGetPermissionPages();
}
From source file:org.lockss.crawler.FuncArcExploder2.java
public void runTest(boolean good) throws Exception {
  log.debug3("About to create content");
  createContent();
  // get the root of the simContent
  String simDir = sau.getSimRoot();
  log.debug3("About to crawl content");
  boolean res = crawlContent(good ? null : url[url.length - 1]);
  if (good) {
    assertTrue("Crawl failed", res);
  } else {
    assertFalse("Crawl succeeded", res);
    return;
  }
  // read all the files links from the root of the simcontent
  // check the link level of the file and see if it contains
  // in myCUS (check if the crawler crawl within the max. depth)
  CachedUrlSet myCUS = sau.getAuCachedUrlSet();
  File dir = new File(simDir);
  if (dir.isDirectory()) {
    checkExplodedUrls();
    checkUnExplodedUrls();
    log.debug("Check finished.");
  } else {
    log.error("Error: The root path of the simulated" + " content [" + dir + "] is not a directory");
  }
  // Test PluginManager.getAuContentSize(), just because this is a
  // convenient place to do it. If the simulated AU params are changed, or
  // SimulatedContentGenerator is changed, this number may have to
  // change. NB - because the ARC files are compressed, their
  // size varies randomly by a small amount.
  long expected = 2671;
  long actual = AuUtil.getAuContentSize(sau, true);
  long error = expected - actual;
  long absError = (error < 0 ? -error : error);
  assertTrue("size mismatch " + expected + " vs. " + actual, absError < 60);
  List sbc = ((MySimulatedArchivalUnit) sau).sbc;
  Bag b = new HashBag(sbc);
  Set uniq = new HashSet(b.uniqueSet());
  for (Iterator iter = uniq.iterator(); iter.hasNext();) {
    b.remove(iter.next(), 1);
  }
  // Permission pages get checked twice. Hard to avoid that, so allow it
  b.removeAll(sau.getPermissionUrls());
  // archives get checked twice - from checkThruFileTree & checkExplodedUrls
  b.remove(url2[url2.length - 1]);
  // This test is screwed up by the use of shouldBeCached() in
  // ArcExploder() to find the AU to store the URL in.
  //assertEmpty("shouldBeCached() called multiple times on same URLs.", b);
}
From source file:org.lockss.crawler.FuncNewContentCrawler.java
public void testRunSelf() throws Exception {
  createContent();
  // get the root of the simContent
  String simDir = sau.getSimRoot();
  NoCrawlEndActionsFollowLinkCrawler crawler = crawlContent();
  // read all the files links from the root of the simcontent
  // check the link level of the file and see if it contains
  // in myCUS (check if the crawler crawl within the max. depth)
  CachedUrlSet myCUS = sau.getAuCachedUrlSet();
  File dir = new File(simDir);
  if (dir.isDirectory()) {
    File f[] = dir.listFiles();
    log.debug("Checking simulated content.");
    checkThruFileTree(f, myCUS);
    log.debug("Check finished.");
  } else {
    log.error("Error: The root path of the simulated" + " content [" + dir + "] is not a directory");
  }
  // Test PluginManager.getAuContentSize(), just because this is a
  // convenient place to do it. (And NewContentCrawler calls it at the
  // end.) If the simulated AU params are changed, or
  // SimulatedContentGenerator is changed, this number may have to
  // change.
  assertEquals(19262, AuUtil.getAuContentSize(sau, true));
  List sbc = ((MySimulatedArchivalUnit) sau).sbc;
  Bag b = new HashBag(sbc);
  Set uniq = new HashSet(b.uniqueSet());
  for (Iterator iter = uniq.iterator(); iter.hasNext();) {
    b.remove(iter.next(), 1);
  }
  // Permission pages get checked twice. Hard to avoid that, so allow it
  b.removeAll(sau.getPermissionUrls());
  assertEmpty("shouldBeCached() called multiple times on same URLs.", b);
  String th = "text/html";
  String tp = "text/plain";
  String[] ct = { null, null, null, tp, tp, th, th, tp, tp, th, tp };
  Bag ctb = new HashBag(ListUtil.fromArray(ct));
  CrawlRateLimiter crl = crawlMgr.getCrawlRateLimiter(crawler);
  assertEquals(ctb, new HashBag(crawlMgr.getPauseContentTypes(crawler)));
}
From source file:org.lockss.crawler.FuncTarExploder.java
public void runTest(boolean good) throws Exception {
  log.debug3("About to create content");
  createContent();
  // get the root of the simContent
  String simDir = sau.getSimRoot();
  log.debug3("About to crawl content");
  boolean res = crawlContent(good ? null : "002file.bin");
  if (good) {
    assertTrue("Crawl failed", res);
    if (false)
      assertTrue("Crawl should succeed but got " + lastCrawlResult
          + (lastCrawlMessage == null ? "" : " with " + lastCrawlMessage),
          lastCrawlResult == Crawler.STATUS_SUCCESSFUL);
  } else {
    assertFalse("Crawl succeeded", res);
    if (false)
      assertTrue("Crawl should get STATUS_PLUGIN_ERROR but got " + lastCrawlResult
          + (lastCrawlMessage == null ? "" : " with " + lastCrawlMessage),
          lastCrawlResult == Crawler.STATUS_PLUGIN_ERROR);
    return;
  }
  // read all the files links from the root of the simcontent
  // check the link level of the file and see if it contains
  // in myCUS (check if the crawler crawl within the max. depth)
  CachedUrlSet myCUS = sau.getAuCachedUrlSet();
  File dir = new File(simDir);
  if (dir.isDirectory()) {
    File f[] = dir.listFiles();
    log.debug("Checking simulated content.");
    checkThruFileTree(f, myCUS);
    log.debug("Checking simulated content done.");
    checkExplodedUrls();
    checkUnExplodedUrls();
    log.debug("Check finished.");
  } else {
    log.error("Error: The root path of the simulated" + " content [" + dir + "] is not a directory");
  }
  // Test PluginManager.getAuContentSize(), just because this is a
  // convenient place to do it. If the simulated AU params are changed, or
  // SimulatedContentGenerator is changed, this number may have to
  // change. NB - because the TAR files are compressed, their
  // size varies randomly by a small amount.
  long expected = 41399;
  long actual = AuUtil.getAuContentSize(sau, true);
  long error = expected - actual;
  long absError = (error < 0 ? -error : error);
  assertTrue("size mismatch " + expected + " vs. " + actual, absError < 60);
  List sbc = ((MySimulatedArchivalUnit) sau).sbc;
  Bag b = new HashBag(sbc);
  Set uniq = new HashSet(b.uniqueSet());
  for (Iterator iter = uniq.iterator(); iter.hasNext();) {
    b.remove(iter.next(), 1);
  }
  // Permission pages get checked twice. Hard to avoid that, so allow it
  b.removeAll(sau.getPermissionUrls());
  // archives get checked twice - from checkThruFileTree & checkExplodedUrls
  b.remove("http://www.example.com/content.tar");
  // This test is screwed up by the use of shouldBeCached() in
  // TarExploder() to find the AU to store the URL in.
  //assertEmpty("shouldBeCached() called multiple times on same URLs.", b);
}
From source file:org.lockss.crawler.FuncTarExploder2.java
public void testRunSelf() throws Exception {
  log.debug3("About to create content");
  createContent();
  // get the root of the simContent
  String simDir = sau.getSimRoot();
  assertTrue("No simulated content", simDir != null);
  log.debug3("About to crawl content");
  crawlContent();
  // read all the files links from the root of the simcontent
  // check the link level of the file and see if it contains
  // in myCUS (check if the crawler crawl within the max. depth)
  CachedUrlSet myCUS = sau.getAuCachedUrlSet();
  File dir = new File(simDir);
  if (dir.isDirectory()) {
    File f[] = dir.listFiles();
    log.debug("Checking simulated content.");
    checkThruFileTree(f, myCUS);
    log.debug("Checking simulated content done.");
    checkUnExplodedUrls();
    checkExplodedUrls();
    log.debug("Check finished.");
  } else {
    log.error("Error: The root path of the simulated" + " content [" + dir + "] is not a directory");
  }
  // Test PluginManager.getAuContentSize(), just because this is a
  // convenient place to do it. If the simulated AU params are changed, or
  // SimulatedContentGenerator is changed, this number may have to
  // change. NB - because the TAR files are compressed, their
  // size varies randomly by a small amount.
  long expected = 261173;
  long actual = AuUtil.getAuContentSize(sau, true);
  long error = expected - actual;
  long absError = (error < 0 ? -error : error);
  assertTrue("size mismatch " + expected + " vs. " + actual, absError < 60);
  if (false) {
    List sbc = ((MySimulatedArchivalUnit) sau).sbc;
    Bag b = new HashBag(sbc);
    Set uniq = new HashSet(b.uniqueSet());
    for (Iterator iter = uniq.iterator(); iter.hasNext();) {
      b.remove(iter.next(), 1);
    }
    // Permission pages get checked twice. Hard to avoid that, so allow it
    b.removeAll(sau.getPermissionUrls());
    // archives get checked twice - from checkThruFileTree & checkExplodedUrls
    b.remove("http://www.example.com/issn.tar");
    // This test is screwed up by the use of shouldBeCached() in
    // TarExploder() to find the AU to store the URL in.
    //assertEmpty("shouldBeCached() called multiple times on same URLs.", b);
  }
  // Now check the DOIs
  checkDOIs();
}
From source file:org.lockss.crawler.FuncWarcExploder.java
public void runTest(boolean good) throws Exception {
  log.debug3("About to create content");
  createContent();
  // get the root of the simContent
  String simDir = sau.getSimRoot();
  log.debug3("About to crawl content");
  boolean res = crawlContent(good ? null : url[url.length - 1]);
  if (good) {
    assertTrue("Crawl failed", res);
    if (false)
      assertTrue("Crawl should succeed but got " + lastCrawlResult
          + (lastCrawlMessage == null ? "" : " with " + lastCrawlMessage),
          lastCrawlResult == Crawler.STATUS_SUCCESSFUL);
  } else {
    assertFalse("Crawl succeeded", res);
    if (false)
      assertTrue("Crawl should get STATUS_PLUGIN_ERROR but got " + lastCrawlResult
          + (lastCrawlMessage == null ? "" : " with " + lastCrawlMessage),
          lastCrawlResult == Crawler.STATUS_PLUGIN_ERROR);
    return;
  }
  // read all the files links from the root of the simcontent
  // check the link level of the file and see if it contains
  // in myCUS (check if the crawler crawl within the max. depth)
  CachedUrlSet myCUS = sau.getAuCachedUrlSet();
  File dir = new File(simDir);
  if (dir.isDirectory()) {
    File f[] = dir.listFiles();
    log.debug("Checking simulated content.");
    checkThruFileTree(f, myCUS);
    log.debug("Checking simulated content done.");
    checkExplodedUrls();
    checkUnExplodedUrls();
    log.debug("Check finished.");
  } else {
    log.error("Error: The root path of the simulated" + " content [" + dir + "] is not a directory");
  }
  // Test PluginManager.getAuContentSize(), just because this is a
  // convenient place to do it. If the simulated AU params are changed, or
  // SimulatedContentGenerator is changed, this number may have to
  // change. NB - because the WARC files are compressed, their
  // size varies randomly by a small amount.
  long expected = 9394;
  long actual = AuUtil.getAuContentSize(sau, true);
  long error = expected - actual;
  log.debug("Expected " + expected + " actual " + actual);
  long absError = (error < 0 ? -error : error);
  assertTrue("size mismatch " + expected + " vs. " + actual, absError < 60);
  List sbc = ((MySimulatedArchivalUnit) sau).sbc;
  Bag b = new HashBag(sbc);
  Set uniq = new HashSet(b.uniqueSet());
  for (Iterator iter = uniq.iterator(); iter.hasNext();) {
    b.remove(iter.next(), 1);
  }
  // Permission pages get checked twice. Hard to avoid that, so allow it
  b.removeAll(sau.getPermissionUrls());
  // archives get checked twice - from checkThruFileTree & checkExplodedUrls
  b.remove(url2[url2.length - 1]);
  // This test is screwed up by the use of shouldBeCached() in
  // WarcExploder() to find the AU to store the URL in.
  //assertEmpty("shouldBeCached() called multiple times on same URLs.", b);
  // Test getUrlStems
  checkGetUrlStems();
  // Test crawl rules
  checkCrawlRules();
  // Test getPermissionPage
  //checkGetPermissionPages();
}
From source file:org.lockss.crawler.FuncWarcExploder2.java
public void runTest(boolean good) throws Exception {
  log.debug3("About to create content");
  createContent();
  // get the root of the simContent
  String simDir = sau.getSimRoot();
  log.debug3("About to crawl content");
  boolean res = crawlContent(good ? null : url[url.length - 1]);
  if (good) {
    assertTrue("Crawl failed", res);
    if (false)
      assertTrue("Crawl should succeed but got " + lastCrawlResult
          + (lastCrawlMessage == null ? "" : " with " + lastCrawlMessage),
          lastCrawlResult == Crawler.STATUS_SUCCESSFUL);
  } else {
    assertFalse("Crawl succeeded", res);
    if (false)
      assertTrue("Crawl should get STATUS_PLUGIN_ERROR but got " + lastCrawlResult
          + (lastCrawlMessage == null ? "" : " with " + lastCrawlMessage),
          lastCrawlResult == Crawler.STATUS_PLUGIN_ERROR);
    return;
  }
  // read all the files links from the root of the simcontent
  // check the link level of the file and see if it contains
  // in myCUS (check if the crawler crawl within the max. depth)
  CachedUrlSet myCUS = sau.getAuCachedUrlSet();
  File dir = new File(simDir);
  if (dir.isDirectory()) {
    checkExplodedUrls();
    checkUnExplodedUrls();
    log.debug("Check finished.");
  } else {
    log.error("Error: The root path of the simulated" + " content [" + dir + "] is not a directory");
  }
  // Test PluginManager.getAuContentSize(), just because this is a
  // convenient place to do it. If the simulated AU params are changed, or
  // SimulatedContentGenerator is changed, this number may have to
  // change. NB - because the WARC files are compressed, their
  // size varies randomly by a small amount.
  long expected = 35775;
  long actual = AuUtil.getAuContentSize(sau, true);
  long error = expected - actual;
  long absError = (error < 0 ? -error : error);
  assertTrue("size mismatch " + expected + " vs. " + actual, absError < 60);
  List sbc = ((MySimulatedArchivalUnit) sau).sbc;
  Bag b = new HashBag(sbc);
  Set uniq = new HashSet(b.uniqueSet());
  for (Iterator iter = uniq.iterator(); iter.hasNext();) {
    b.remove(iter.next(), 1);
  }
  // Permission pages get checked twice. Hard to avoid that, so allow it
  b.removeAll(sau.getPermissionUrls());
  // archives get checked twice - from checkThruFileTree & checkExplodedUrls
  b.remove(url2[url2.length - 1]);
  // This test is screwed up by the use of shouldBeCached() in
  // WarcExploder() to find the AU to store the URL in.
  //assertEmpty("shouldBeCached() called multiple times on same URLs.", b);
}
From source file:org.lockss.crawler.FuncZipExploder.java
public void runTest(boolean good) throws Exception {
  log.debug3("About to create content");
  createContent();
  // get the root of the simContent
  String simDir = sau.getSimRoot();
  log.debug3("About to crawl content");
  boolean res = crawlContent(good ? null : "002file.bin");
  if (good) {
    assertTrue("Crawl failed", res);
    if (false)
      assertTrue("Crawl should succeed but got " + lastCrawlResult
          + (lastCrawlMessage == null ? "" : " with " + lastCrawlMessage),
          lastCrawlResult == Crawler.STATUS_SUCCESSFUL);
  } else {
    assertFalse("Crawl succeeded", res);
    if (false)
      assertTrue("Crawl should get STATUS_PLUGIN_ERROR but got " + lastCrawlResult
          + (lastCrawlMessage == null ? "" : " with " + lastCrawlMessage),
          lastCrawlResult == Crawler.STATUS_PLUGIN_ERROR);
    return;
  }
  // read all the files links from the root of the simcontent
  // check the link level of the file and see if it contains
  // in myCUS (check if the crawler crawl within the max. depth)
  CachedUrlSet myCUS = sau.getAuCachedUrlSet();
  File dir = new File(simDir);
  if (dir.isDirectory()) {
    File f[] = dir.listFiles();
    log.debug("Checking simulated content.");
    checkThruFileTree(f, myCUS);
    log.debug("Checking simulated content done.");
    checkExplodedUrls();
    checkUnExplodedUrls();
    log.debug("Check finished.");
  } else {
    log.error("Error: The root path of the simulated" + " content [" + dir + "] is not a directory");
  }
  // Test PluginManager.getAuContentSize(), just because this is a
  // convenient place to do it. If the simulated AU params are changed, or
  // SimulatedContentGenerator is changed, this number may have to
  // change. NB - because the ZIP files are compressed, their
  // size varies randomly by a small amount.
  long expected = 2615;
  long actual = AuUtil.getAuContentSize(sau, true);
  long error = expected - actual;
  long absError = (error < 0 ? -error : error);
  assertTrue("size mismatch " + expected + " vs. " + actual, absError < 60);
  List sbc = ((MySimulatedArchivalUnit) sau).sbc;
  Bag b = new HashBag(sbc);
  Set uniq = new HashSet(b.uniqueSet());
  for (Iterator iter = uniq.iterator(); iter.hasNext();) {
    b.remove(iter.next(), 1);
  }
  // Permission pages get checked twice. Hard to avoid that, so allow it
  b.removeAll(sau.getPermissionUrls());
  // archives get checked twice - from checkThruFileTree & checkExplodedUrls
  b.remove("http://www.example.com/content.zip");
  // This test is screwed up by the use of shouldBeCached() in
  // ZipExploder() to find the AU to store the URL in.
  //assertEmpty("shouldBeCached() called multiple times on same URLs.", b);
}
From source file:org.lockss.crawler.FuncZipExploder2.java
public void testRunSelf() throws Exception {
  log.debug3("About to create content");
  createContent();
  // get the root of the simContent
  String simDir = sau.getSimRoot();
  log.debug3("About to crawl content");
  crawlContent();
  // read all the files links from the root of the simcontent
  // check the link level of the file and see if it contains
  // in myCUS (check if the crawler crawl within the max. depth)
  CachedUrlSet myCUS = sau.getAuCachedUrlSet();
  File dir = new File(simDir);
  if (dir.isDirectory()) {
    File f[] = dir.listFiles();
    log.debug("Checking simulated content.");
    checkThruFileTree(f, myCUS);
    log.debug("Checking simulated content done.");
    checkUnExplodedUrls();
    checkExplodedUrls();
    log.debug("Check finished.");
  } else {
    log.error("Error: The root path of the simulated" + " content [" + dir + "] is not a directory");
  }
  // Test PluginManager.getAuContentSize(), just because this is a
  // convenient place to do it. If the simulated AU params are changed, or
  // SimulatedContentGenerator is changed, this number may have to
  // change. NB - because the ZIP files are compressed, their
  // size varies randomly by a small amount.
  long expected = 285227;
  long actual = AuUtil.getAuContentSize(sau, true);
  long error = expected - actual;
  long absError = (error < 0 ? -error : error);
  assertTrue("size mismatch " + expected + " vs. " + actual, absError < 60);
  if (false) {
    List sbc = ((MySimulatedArchivalUnit) sau).sbc;
    Bag b = new HashBag(sbc);
    Set uniq = new HashSet(b.uniqueSet());
    for (Iterator iter = uniq.iterator(); iter.hasNext();) {
      b.remove(iter.next(), 1);
    }
    // Permission pages get checked twice. Hard to avoid that, so allow it
    b.removeAll(sau.getPermissionUrls());
    // archives get checked twice - from checkThruFileTree & checkExplodedUrls
    b.remove("http://www.example.com/content.zip");
    // This test is screwed up by the use of shouldBeCached() in
    // ZipExploder() to find the AU to store the URL in.
    //assertEmpty("shouldBeCached() called multiple times on same URLs.", b);
  }
  // Now check the DOIs
  checkDOIs();
}