List of usage examples for org.jsoup.helper.DataUtil.load
public static Document load(InputStream in, String charsetName, String baseUri) throws IOException
From source file:com.screenslicer.core.scrape.trainer.TrainerVisitorExtractOnce.java
@Override public void init() { final ArrayList<String> filenames = new ArrayList<String>(); final List<String> bump = Arrays.asList(new String[] {}); new File("./test/data-webpages/").listFiles(new FileFilter() { @Override// www . j av a 2s. c o m public boolean accept(File file) { if (!file.getAbsolutePath().endsWith("-success") && !file.getAbsolutePath().endsWith("-successnode") && !file.getAbsolutePath().endsWith("-result") && !file.getAbsolutePath().endsWith("-num") && !file.getAbsolutePath().endsWith("-next")) { try { if (bump.contains(file.getName())) { resultParents.add(0, FileUtils .readFileToString(new File(file.getAbsolutePath() + "-success"), "utf-8")); elements.add(0, DataUtil.load(file, "utf-8", "http://localhost").body()); filenames.add(0, file.getName()); } else { resultParents.add(FileUtils .readFileToString(new File(file.getAbsolutePath() + "-success", "utf-8"))); elements.add(DataUtil.load(file, "utf-8", "http://localhost").body()); filenames.add(file.getName()); } } catch (IOException e) { throw new RuntimeException(e); } } return false; } }); for (String filename : filenames) { System.out.println(filename); } names = filenames.toArray(new String[0]); }
From source file:com.screenslicer.core.scrape.trainer.TrainerVisitorProceed.java
@Override public void init() { final ArrayList<String> filenames = new ArrayList<String>(); final List<String> bump = Arrays.asList(new String[] { "buzzfeed" }); new File("./test/data-webpages/").listFiles(new FileFilter() { @Override//w ww. j a v a 2 s . co m public boolean accept(File file) { if (!file.getAbsolutePath().endsWith("-success") && !file.getAbsolutePath().endsWith("-successnode") && !file.getAbsolutePath().endsWith("-result") && !file.getAbsolutePath().endsWith("-num") && !file.getAbsolutePath().endsWith("-next")) { try { File fileNext = new File(file.getAbsolutePath() + "-next"); if (fileNext.exists()) { if (bump.contains(file.getName())) { nextButtons.add(0, FileUtils.readFileToString(fileNext, "utf-8")); elements.add(0, DataUtil.load(file, "utf-8", "http://localhost").body()); filenames.add(0, file.getName()); } else { nextButtons.add(FileUtils.readFileToString(fileNext, "utf-8")); elements.add(DataUtil.load(file, "utf-8", "http://localhost").body()); filenames.add(file.getName()); } } } catch (IOException e) { throw new RuntimeException(e); } } return false; } }); for (String filename : filenames) { System.out.println(filename); } names = filenames.toArray(new String[0]); }
From source file:com.screenslicer.core.scrape.trainer.TrainerVisitorExtract.java
@Override public void init() { final ArrayList<String> filenames = new ArrayList<String>(); final List<String> bump = Arrays.asList(new String[] {}); new File("./test/data-webpages/").listFiles(new FileFilter() { @Override// w w w. ja v a 2s . c o m public boolean accept(File file) { if (!file.getAbsolutePath().endsWith("-success") && !file.getAbsolutePath().endsWith("-successnode") && !file.getAbsolutePath().endsWith("-result") && !file.getAbsolutePath().endsWith("-num") && !file.getAbsolutePath().endsWith("-next")) { try { if (bump.contains(file.getName())) { resultParents.add(0, FileUtils .readFileToString(new File(file.getAbsolutePath() + "-success", "utf-8"))); elements.add(0, DataUtil.load(file, "utf-8", "http://localhost").body()); filenames.add(0, file.getName()); } else { resultParents.add(FileUtils .readFileToString(new File(file.getAbsolutePath() + "-success", "utf-8"))); elements.add(DataUtil.load(file, "utf-8", "http://localhost").body()); filenames.add(file.getName()); } } catch (IOException e) { throw new RuntimeException(e); } } return false; } }); for (String filename : filenames) { System.out.println(filename); } tmpItems = new Object[resultParents.size()]; }
From source file:com.screenslicer.core.scrape.trainer.TrainerVisitorAll.java
/**
 * Loads the training pages under {@code ./test/data-webpages/} along with the number stored
 * in each page's optional {@code -num} companion.
 *
 * <p>Files whose absolute path ends in {@code -result}, {@code -success},
 * {@code -successnode}, {@code -num} or {@code -next} are companions and are skipped. Pages
 * listed in {@code bump} are inserted at the front of {@code nums}, {@code elements} and the
 * name list; other pages are appended, but only when {@code BUMP_ONLY} is false. When a page
 * has no {@code -num} companion the value 10 is used. Each document body is passed through
 * {@code Util.markTestElement} before being stored. The collected names are printed to
 * standard output and stored in {@code names}.
 *
 * @throws RuntimeException wrapping the {@link IOException} raised when a page or its
 *     {@code -num} companion cannot be read
 * @throws NumberFormatException if a {@code -num} companion does not contain an integer
 */
@Override
public void init() {
  final ArrayList<String> filenames = new ArrayList<String>();
  final List<String> bump = Arrays.asList(new String[] { "facebook" });
  // The filter is only used to visit each directory entry: it always returns false, and the
  // array returned by listFiles() is ignored.
  new File("./test/data-webpages/").listFiles(new FileFilter() {
    @Override
    public boolean accept(File file) {
      final String path = file.getAbsolutePath();
      if (!path.endsWith("-result")
          && !path.endsWith("-success")
          && !path.endsWith("-successnode")
          && !path.endsWith("-num")
          && !path.endsWith("-next")) {
        try {
          final boolean bumped = bump.contains(file.getName());
          if (!bumped && BUMP_ONLY) {
            // Non-bumped pages are ignored entirely in bump-only mode.
            return false;
          }
          final File numFile = new File(path + "-num");
          final int num;
          if (numFile.exists()) {
            // Fix: trim before parsing so a trailing newline or stray whitespace in the
            // companion file no longer triggers a NumberFormatException.
            num = Integer.parseInt(FileUtils.readFileToString(numFile, "utf-8").trim());
          } else {
            num = 10;
          }
          if (bumped) {
            nums.add(0, num);
            elements.add(0,
                Util.markTestElement(DataUtil.load(file, "utf-8", "http://localhost").body()));
            filenames.add(0, file.getName());
          } else {
            nums.add(num);
            elements.add(
                Util.markTestElement(DataUtil.load(file, "utf-8", "http://localhost").body()));
            filenames.add(file.getName());
          }
        } catch (IOException e) {
          throw new RuntimeException(e);
        }
      }
      return false;
    }
  });
  for (String filename : filenames) {
    System.out.println(filename);
  }
  names = filenames.toArray(new String[0]);
}
From source file:edu.harvard.iq.safe.lockss.impl.LOCKSSPlatformStatusHtmlParser.java
/** * * @param is// www . ja v a 2 s. co m */ @Override public void getPlatformStatusData(InputStream is) { try { Document doc = DataUtil.load(is, "UTF-8", ""); Element body = doc.body(); // most of the target items are sandwitched by <b> tag // this can be used to reach each target item. String tmpCurrentTime = null; String tmpUpTime = null; String currentTime = null; Elements tags = body.getElementsByTag("b"); for (Element tag : tags) { // get the current-time string: for 1.52.3 or older daemons // this is the ony place to get it. String tagText = tag.text(); logger.log(Level.FINE, "working on tagText={0}", tagText); if (tagText.equals("Daemon Status")) { // find current time and up running currentTime = tag.parent().parent().text(); logger.log(Level.INFO, "currentTime text=[{0}]", currentTime); // "currentTime =Daemon Status lockss.statelib.lib.in.us (usdocspln group) 01:25:55 03/01/12, up 7d5h21m" tmstmpMatcher = currentTimeStampPattern.matcher(currentTime); if (tmstmpMatcher.find()) { logger.log(Level.INFO, "group 0={0}", tmstmpMatcher.group(0)); tmpCurrentTime = tmstmpMatcher.group(1); logger.log(Level.INFO, "Current Time:group 1={0}", tmpCurrentTime); tmpUpTime = tmstmpMatcher.group(2); logger.log(Level.INFO, "UpTime:group 2={0}", tmpUpTime); } } // get the remaining key-value sets if (fieldNameSet.contains(tagText)) { Element parent = tag.parent(); String fieldValue = parent.nextElementSibling().text(); logger.log(Level.FINE, "{0}={1}", new Object[] { tagText, fieldValue }); summaryInfoMap.put(tagText, fieldValue); } } // extract the daemon version and platform info that are located // at the bottom // these data are sandwitched by a <center> tag Elements ctags = body.getElementsByTag("center"); String version = null; String platform = null; for (Element ctag : ctags) { String cText = ctag.text(); logger.log(Level.FINE, "center tag Text={0}", cText); // cText is like this: // Daemon 1.53.3 built 28-Jan-12 01:06:36 on build7.lockss.org, Linux RPM 1 if 
(StringUtils.isNotBlank(cText) && ctag.child(0).nodeName().equals("font")) { String[] versionPlatform = cText.split(", "); if (versionPlatform.length == 2) { logger.log(Level.INFO, "daemon version={0};platform={1}", versionPlatform); version = DaemonStatusDataUtil.getDaemonVersion(versionPlatform[0]); platform = versionPlatform[1]; } else { // the above regex failed logger.log(Level.WARNING, "String-formatting differs; use pattern matching"); version = DaemonStatusDataUtil.getDaemonVersion(cText); int platformOffset = cText.lastIndexOf(", ") + 2; platform = cText.substring(platformOffset); logger.log(Level.INFO, "platform={0}", platform); } } } if (summaryInfoMap.containsKey("V3 Identity")) { String ipAddress = DaemonStatusDataUtil.getPeerIpAddress(summaryInfoMap.get("V3 Identity")); logger.log(Level.INFO, "ipAddress={0}", ipAddress); if (StringUtils.isNotBlank(ipAddress)) { boxInfoMap.put("host", ipAddress); if (!ipAddress.equals(summaryInfoMap.get("IP Address"))) { summaryInfoMap.put("IP Address", ipAddress); } } else { logger.log(Level.WARNING, "host token is blank or null: use IP Address instead"); logger.log(Level.INFO, "IP Address={0}", summaryInfoMap.get("IP Address")); boxInfoMap.put("host", summaryInfoMap.get("IP Address")); } } // for pre-1.53.3 versions boxInfoMap.put("time", tmpCurrentTime); if (!summaryInfoMap.containsKey("Current Time")) { summaryInfoMap.put("Current Time", tmpCurrentTime); } boxInfoMap.put("up", tmpUpTime); if (!summaryInfoMap.containsKey("Uptime")) { summaryInfoMap.put("Uptime", tmpUpTime); } boxInfoMap.put("version", version); if (!summaryInfoMap.containsKey("Daemon Version")) { summaryInfoMap.put("Daemon Version", version); } boxInfoMap.put("platform", platform); if (!summaryInfoMap.containsKey("Platform")) { summaryInfoMap.put("Platform", platform); } } catch (IOException ex) { logger.log(Level.SEVERE, "IO error", ex); } logger.log(Level.INFO, "boxInfoMap={0}", boxInfoMap); logger.log(Level.INFO, "summaryInfo={0}", 
summaryInfoMap); }