Example usage for org.jsoup.helper DataUtil load

List of usage examples for org.jsoup.helper DataUtil load

Introduction

This page lists example usages of org.jsoup.helper.DataUtil.load.

Prototype

public static Document load(InputStream in, String charsetName, String baseUri) throws IOException 

Source Link

Document

Parses a Document from an input stream.

Usage

From source file:com.screenslicer.core.scrape.trainer.TrainerVisitorExtractOnce.java

/**
 * Loads the training fixtures found under {@code ./test/data-webpages/}.
 *
 * <p>Every file whose absolute path does not end in one of the companion suffixes
 * ({@code -success}, {@code -successnode}, {@code -result}, {@code -num}, {@code -next})
 * is treated as a saved web page. For each such page the content of its {@code -success}
 * companion file is added to {@code resultParents}, the parsed page body is added to
 * {@code elements}, and the file name is recorded. Pages whose name appears in the local
 * {@code bump} list are inserted at the front of all three lists; the rest are appended.
 * Finally the collected names are printed and stored in {@code names}.
 *
 * <p>{@code resultParents}, {@code elements} and {@code names} are declared outside this
 * method.
 *
 * @throws RuntimeException wrapping the {@link IOException} raised when a page or its
 *         {@code -success} companion cannot be read
 */
@Override
public void init() {
    final ArrayList<String> filenames = new ArrayList<String>();
    final List<String> bump = Arrays.asList(new String[] {});
    // The filter is used only for its side effects: it always rejects the file, and the
    // array returned by listFiles is discarded.
    new File("./test/data-webpages/").listFiles(new FileFilter() {
        @Override
        public boolean accept(File file) {
            if (!file.getAbsolutePath().endsWith("-success") && !file.getAbsolutePath().endsWith("-successnode")
                    && !file.getAbsolutePath().endsWith("-result") && !file.getAbsolutePath().endsWith("-num")
                    && !file.getAbsolutePath().endsWith("-next")) {
                try {
                    // Build the companion file once so both branches read the same path with
                    // the same charset. The charset must be an argument of readFileToString,
                    // not of the File constructor (File(parent, child) would yield
                    // "<page>-success/utf-8").
                    File successFile = new File(file.getAbsolutePath() + "-success");
                    if (bump.contains(file.getName())) {
                        resultParents.add(0, FileUtils.readFileToString(successFile, "utf-8"));
                        elements.add(0, DataUtil.load(file, "utf-8", "http://localhost").body());
                        filenames.add(0, file.getName());
                    } else {
                        resultParents.add(FileUtils.readFileToString(successFile, "utf-8"));
                        elements.add(DataUtil.load(file, "utf-8", "http://localhost").body());
                        filenames.add(file.getName());
                    }
                } catch (IOException e) {
                    throw new RuntimeException(e);
                }
            }
            return false;
        }
    });
    for (String filename : filenames) {
        System.out.println(filename);
    }
    names = filenames.toArray(new String[0]);
}

From source file:com.screenslicer.core.scrape.trainer.TrainerVisitorProceed.java

/**
 * Collects the "next button" training fixtures from {@code ./test/data-webpages/}.
 *
 * <p>A file is considered a saved page when its absolute path does not end in
 * {@code -success}, {@code -successnode}, {@code -result}, {@code -num} or {@code -next}.
 * Only pages that have an existing {@code -next} companion file are loaded: the companion's
 * content goes to {@code nextButtons}, the parsed page body to {@code elements}, and the
 * page name to the list that ends up in {@code names}. Names contained in the local
 * {@code bump} list are inserted at index 0, all others are appended.
 *
 * @throws RuntimeException wrapping the {@link IOException} raised while reading a page or
 *         its {@code -next} companion
 */
@Override
public void init() {
    final List<String> bump = Arrays.asList(new String[] { "buzzfeed" });
    final ArrayList<String> filenames = new ArrayList<String>();

    // Side-effect-only filter: it never accepts a file, it just fills the lists.
    FileFilter collector = new FileFilter() {
        @Override
        public boolean accept(File file) {
            final String path = file.getAbsolutePath();
            boolean isCompanion = path.endsWith("-success") || path.endsWith("-successnode")
                    || path.endsWith("-result") || path.endsWith("-num") || path.endsWith("-next");
            if (isCompanion) {
                return false;
            }
            File fileNext = new File(path + "-next");
            if (!fileNext.exists()) {
                return false;
            }
            try {
                if (bump.contains(file.getName())) {
                    nextButtons.add(0, FileUtils.readFileToString(fileNext, "utf-8"));
                    elements.add(0, DataUtil.load(file, "utf-8", "http://localhost").body());
                    filenames.add(0, file.getName());
                } else {
                    nextButtons.add(FileUtils.readFileToString(fileNext, "utf-8"));
                    elements.add(DataUtil.load(file, "utf-8", "http://localhost").body());
                    filenames.add(file.getName());
                }
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
            return false;
        }
    };
    new File("./test/data-webpages/").listFiles(collector);

    for (int i = 0; i < filenames.size(); i++) {
        System.out.println(filenames.get(i));
    }
    names = filenames.toArray(new String[0]);
}

From source file:com.screenslicer.core.scrape.trainer.TrainerVisitorExtract.java

/**
 * Loads the extraction training fixtures found under {@code ./test/data-webpages/}.
 *
 * <p>Every file whose absolute path does not end in {@code -success}, {@code -successnode},
 * {@code -result}, {@code -num} or {@code -next} is treated as a saved web page. For each
 * page the content of its {@code -success} companion file is added to
 * {@code resultParents} and the parsed page body to {@code elements}. Pages named in the
 * local {@code bump} list are inserted at the front; the rest are appended. The collected
 * file names are printed, and {@code tmpItems} is allocated with one slot per entry of
 * {@code resultParents}.
 *
 * <p>{@code resultParents}, {@code elements} and {@code tmpItems} are declared outside this
 * method.
 *
 * @throws RuntimeException wrapping the {@link IOException} raised when a page or its
 *         {@code -success} companion cannot be read
 */
@Override
public void init() {
    final ArrayList<String> filenames = new ArrayList<String>();
    final List<String> bump = Arrays.asList(new String[] {});
    // The filter is used only for its side effects: it always rejects the file, and the
    // array returned by listFiles is discarded.
    new File("./test/data-webpages/").listFiles(new FileFilter() {
        @Override
        public boolean accept(File file) {
            if (!file.getAbsolutePath().endsWith("-success") && !file.getAbsolutePath().endsWith("-successnode")
                    && !file.getAbsolutePath().endsWith("-result") && !file.getAbsolutePath().endsWith("-num")
                    && !file.getAbsolutePath().endsWith("-next")) {
                try {
                    // The charset belongs to readFileToString, not to the File constructor:
                    // new File(path + "-success", "utf-8") would point at the non-existent
                    // "<page>-success/utf-8" and read it with the default charset.
                    File successFile = new File(file.getAbsolutePath() + "-success");
                    if (bump.contains(file.getName())) {
                        resultParents.add(0, FileUtils.readFileToString(successFile, "utf-8"));
                        elements.add(0, DataUtil.load(file, "utf-8", "http://localhost").body());
                        filenames.add(0, file.getName());
                    } else {
                        resultParents.add(FileUtils.readFileToString(successFile, "utf-8"));
                        elements.add(DataUtil.load(file, "utf-8", "http://localhost").body());
                        filenames.add(file.getName());
                    }
                } catch (IOException e) {
                    throw new RuntimeException(e);
                }
            }
            return false;
        }
    });
    for (String filename : filenames) {
        System.out.println(filename);
    }
    tmpItems = new Object[resultParents.size()];
}

From source file:com.screenslicer.core.scrape.trainer.TrainerVisitorAll.java

/**
 * Loads all page fixtures from {@code ./test/data-webpages/} together with their expected
 * result counts.
 *
 * <p>Files whose absolute path ends in {@code -result}, {@code -success},
 * {@code -successnode}, {@code -num} or {@code -next} are companions and are skipped. For a
 * remaining page, the integer stored in its {@code -num} companion (or 10 when that file
 * does not exist) is added to {@code nums}, the page body passed through
 * {@code Util.markTestElement} is added to {@code elements}, and the name is recorded for
 * {@code names}. Pages named in the local {@code bump} list are inserted at index 0; other
 * pages are appended, but only when {@code BUMP_ONLY} is false.
 *
 * @throws RuntimeException wrapping the {@link IOException} raised while reading a page or
 *         its {@code -num} companion
 */
@Override
public void init() {
    final List<String> bump = Arrays.asList(new String[] { "facebook" });
    final ArrayList<String> filenames = new ArrayList<String>();

    // Side-effect-only filter: it never accepts a file, it just fills the lists.
    FileFilter collector = new FileFilter() {
        @Override
        public boolean accept(File file) {
            final String path = file.getAbsolutePath();
            boolean isCompanion = path.endsWith("-result") || path.endsWith("-success")
                    || path.endsWith("-successnode") || path.endsWith("-num") || path.endsWith("-next");
            if (isCompanion) {
                return false;
            }
            final boolean bumped = bump.contains(file.getName());
            if (!bumped && BUMP_ONLY) {
                return false;
            }
            try {
                File numFile = new File(path + "-num");
                // Expected count for this page; 10 is the fallback when no -num file exists.
                int expected = numFile.exists()
                        ? Integer.parseInt(FileUtils.readFileToString(numFile, "utf-8"))
                        : 10;
                if (bumped) {
                    nums.add(0, expected);
                    elements.add(0,
                            Util.markTestElement(DataUtil.load(file, "utf-8", "http://localhost").body()));
                    filenames.add(0, file.getName());
                } else {
                    nums.add(expected);
                    elements.add(
                            Util.markTestElement(DataUtil.load(file, "utf-8", "http://localhost").body()));
                    filenames.add(file.getName());
                }
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
            return false;
        }
    };
    new File("./test/data-webpages/").listFiles(collector);

    for (int i = 0; i < filenames.size(); i++) {
        System.out.println(filenames.get(i));
    }
    names = filenames.toArray(new String[0]);
}

From source file:edu.harvard.iq.safe.lockss.impl.LOCKSSPlatformStatusHtmlParser.java

/**
 * Parses a platform-status HTML page and stores the extracted values in
 * {@code summaryInfoMap} and {@code boxInfoMap} (both declared outside this method).
 *
 * <p>The page is scanned in two passes:
 * <ol>
 *   <li>every {@code <b>} element: the "Daemon Status" entry yields the current time and
 *       uptime via {@code currentTimeStampPattern}; any tag whose text is in
 *       {@code fieldNameSet} is stored with the text of the element that follows its
 *       parent;</li>
 *   <li>every {@code <center>} element whose first child is a {@code <font>} element: its
 *       text yields the daemon version and the platform.</li>
 * </ol>
 * An {@link IOException} while reading the stream is logged at SEVERE level and not
 * rethrown; the maps are logged in either case.
 *
 * @param is stream with the HTML of the status page; read as UTF-8 with an empty base URI
 */
@Override
public void getPlatformStatusData(InputStream is) {

    try {

        Document doc = DataUtil.load(is, "UTF-8", "");
        Element body = doc.body();

        // most of the target items are sandwiched by a <b> tag;
        // this can be used to reach each target item.
        String tmpCurrentTime = null;
        String tmpUpTime = null;
        Elements tags = body.getElementsByTag("b");

        for (Element tag : tags) {

            // get the current-time string: for 1.52.3 or older daemons
            // this is the only place to get it.
            String tagText = tag.text();
            logger.log(Level.FINE, "working on tagText={0}", tagText);

            if (tagText.equals("Daemon Status")) {
                // find current time and up running
                String currentTime = tag.parent().parent().text();
                logger.log(Level.INFO, "currentTime text=[{0}]", currentTime);
                // "currentTime =Daemon Status lockss.statelib.lib.in.us (usdocspln group) 01:25:55 03/01/12, up 7d5h21m"
                tmstmpMatcher = currentTimeStampPattern.matcher(currentTime);

                if (tmstmpMatcher.find()) {
                    logger.log(Level.INFO, "group 0={0}", tmstmpMatcher.group(0));
                    tmpCurrentTime = tmstmpMatcher.group(1);
                    logger.log(Level.INFO, "Current Time:group 1={0}", tmpCurrentTime);
                    tmpUpTime = tmstmpMatcher.group(2);
                    logger.log(Level.INFO, "UpTime:group 2={0}", tmpUpTime);
                }
            }

            // get the remaining key-value sets
            if (fieldNameSet.contains(tagText)) {

                Element parent = tag.parent();
                // nextElementSibling() returns null when the parent is the last element
                // of its own parent; skip the field instead of failing with an NPE.
                Element valueElement = parent.nextElementSibling();
                if (valueElement == null) {
                    logger.log(Level.WARNING, "no value element found for field={0}", tagText);
                } else {
                    String fieldValue = valueElement.text();
                    logger.log(Level.FINE, "{0}={1}", new Object[] { tagText, fieldValue });
                    summaryInfoMap.put(tagText, fieldValue);
                }
            }
        }

        // extract the daemon version and platform info that are located
        // at the bottom;
        // these data are sandwiched by a <center> tag
        Elements ctags = body.getElementsByTag("center");
        String version = null;
        String platform = null;
        for (Element ctag : ctags) {
            String cText = ctag.text();
            logger.log(Level.FINE, "center tag Text={0}", cText);
            // cText is like this:
            // Daemon 1.53.3 built 28-Jan-12 01:06:36 on build7.lockss.org, Linux RPM 1
            // child(0) throws when the tag holds only text, so check for child elements first.
            if (StringUtils.isNotBlank(cText) && !ctag.children().isEmpty()
                    && ctag.child(0).nodeName().equals("font")) {
                String[] versionPlatform = cText.split(", ");
                if (versionPlatform.length == 2) {
                    logger.log(Level.INFO, "daemon version={0};platform={1}", versionPlatform);
                    version = DaemonStatusDataUtil.getDaemonVersion(versionPlatform[0]);
                    platform = versionPlatform[1];
                } else {
                    // the split did not yield exactly two tokens
                    logger.log(Level.WARNING, "String-formatting differs; use pattern matching");
                    version = DaemonStatusDataUtil.getDaemonVersion(cText);
                    // Only cut out the platform when the separator is really present;
                    // with lastIndexOf == -1 the old offset arithmetic produced
                    // cText.substring(1), i.e. garbage.
                    int separatorIndex = cText.lastIndexOf(", ");
                    if (separatorIndex >= 0) {
                        platform = cText.substring(separatorIndex + 2);
                        logger.log(Level.INFO, "platform={0}", platform);
                    } else {
                        logger.log(Level.WARNING, "platform separator not found in center tag text");
                    }

                }
            }
        }

        if (summaryInfoMap.containsKey("V3 Identity")) {
            String ipAddress = DaemonStatusDataUtil.getPeerIpAddress(summaryInfoMap.get("V3 Identity"));
            logger.log(Level.INFO, "ipAddress={0}", ipAddress);

            if (StringUtils.isNotBlank(ipAddress)) {
                boxInfoMap.put("host", ipAddress);
                if (!ipAddress.equals(summaryInfoMap.get("IP Address"))) {
                    summaryInfoMap.put("IP Address", ipAddress);
                }
            } else {
                logger.log(Level.WARNING, "host token is blank or null: use IP Address instead");
                logger.log(Level.INFO, "IP Address={0}", summaryInfoMap.get("IP Address"));
                boxInfoMap.put("host", summaryInfoMap.get("IP Address"));
            }
        }

        // for pre-1.53.3 versions: fall back to the values parsed from the
        // "Daemon Status" line when the page has no dedicated field
        boxInfoMap.put("time", tmpCurrentTime);
        if (!summaryInfoMap.containsKey("Current Time")) {
            summaryInfoMap.put("Current Time", tmpCurrentTime);
        }

        boxInfoMap.put("up", tmpUpTime);
        if (!summaryInfoMap.containsKey("Uptime")) {
            summaryInfoMap.put("Uptime", tmpUpTime);
        }

        boxInfoMap.put("version", version);
        if (!summaryInfoMap.containsKey("Daemon Version")) {
            summaryInfoMap.put("Daemon Version", version);
        }

        boxInfoMap.put("platform", platform);
        if (!summaryInfoMap.containsKey("Platform")) {
            summaryInfoMap.put("Platform", platform);
        }

    } catch (IOException ex) {
        logger.log(Level.SEVERE, "IO error", ex);
    }

    logger.log(Level.INFO, "boxInfoMap={0}", boxInfoMap);
    logger.log(Level.INFO, "summaryInfo={0}", summaryInfoMap);
}