Example usage for org.jsoup.nodes Document getElementsByTag

List of usage examples for org.jsoup.nodes Document getElementsByTag

Introduction

In this page you can find the example usage for org.jsoup.nodes Document getElementsByTag.

Prototype

public Elements getElementsByTag(String tagName) 

Source Link

Document

Finds elements, including and recursively under this element, with the specified tag name.

Usage

From source file:ru.dmitry.mamishev.URLParse.HtmlString.java

/**
 * Extracts billing details from the HTML held in {@code htmlString}.
 *
 * <p>The bill number is the text of the {@code <ul>} element at index 1. The string form of all
 * {@code <ul>} elements is then re-parsed as a body fragment and its {@code <b>} elements are
 * read: the texts of the first two are passed to {@code GazInfo} unchanged, and the text of the
 * third is split with {@code SPLIT} into a date (first token) and a payment (second token).
 *
 * @return a {@code GazInfo} built from the parsed values, or one built only from empty strings
 *         when fewer than three {@code <b>} elements are found
 */
public GazInfo getInfoBill() {
    Document doc = Jsoup.parse(this.htmlString);
    Elements ul = doc.getElementsByTag("ul");
    String numBill = ul.eq(1).text();

    Elements b = Jsoup.parseBodyFragment(ul.toString()).getElementsByTag("b");
    if (b.size() <= 2) {
        return new GazInfo("", "", "", "", "");
    }

    String[] ss = SPLIT.split(b.get(2).text());
    // Guard each index on its own: a text without a delimiter yields a single token, and the
    // previous "length > 0" check then read ss[1] out of bounds.
    String date = ss.length > 0 ? ss[0] : "";
    String pay = ss.length > 1 ? ss[1] : "";

    return new GazInfo(b.get(0).text(), b.get(1).text(), date, pay, numBill);
}

From source file:solarrecorder.SolarRecorder.java

/**
 * Fetches {@code http://envoy/production} and records its "Currently" and "Today" rows.
 *
 * <p>The table is taken to be the element immediately following the first {@code <h1>}. Every
 * row of its first {@code <tbody>} that has exactly two {@code <td>} cells and whose first cell
 * reads {@code Currently} or {@code Today} is added to {@code envoyData} as an
 * {@code EnvoyData(name, value)}.
 *
 * @throws IOException if the page cannot be fetched, or if it lacks the expected
 *         {@code <h1>} / following element / {@code <tbody>} structure
 */
private void getProdData() throws IOException {
    org.jsoup.nodes.Document doc = Jsoup.connect("http://envoy/production").get();

    // first() and nextElementSibling() yield null when nothing is found; resolve the chain
    // step by step so a changed page layout is reported clearly instead of as a bare NPE.
    Element h1 = doc.getElementsByTag("h1").first();
    Element table = (h1 == null) ? null : h1.nextElementSibling();
    Element tbody = (table == null) ? null : table.getElementsByTag("tbody").first();
    if (tbody == null) {
        throw new IOException(
                "Unexpected layout of http://envoy/production: no table body after the first <h1>");
    }

    for (Element tr : tbody.getElementsByTag("tr")) {
        Elements alltd = tr.getElementsByTag("td");
        if (alltd.size() != 2) {
            continue;
        }

        String name = alltd.first().text();
        String value = alltd.last().text();
        switch (name) {
        case "Currently":
        case "Today":
            envoyData.add(new EnvoyData(name, value));
            break;
        default:
            // Other rows are not recorded.
            break;
        }
    }
}

From source file:solarrecorder.SolarRecorder.java

/**
 * Fetches {@code http://envoy} and records the "Number of Microinverters Online" row.
 *
 * <p>For every {@code <h2>} whose text is {@code System Statistics}, the first {@code <table>}
 * under the heading's parent is scanned: each row of its first {@code <tbody>} whose first
 * {@code <td>} reads {@code Number of Microinverters Online} is added to {@code envoyData}
 * with the text of the row's last {@code <td>} as the value.
 *
 * @throws IOException if the page cannot be fetched, or if a "System Statistics" heading has
 *         no table with a {@code <tbody>} next to it
 */
private void getSysData() throws IOException {
    org.jsoup.nodes.Document doc = Jsoup.connect("http://envoy").get();

    for (Element h2 : doc.getElementsByTag("h2")) {
        if (!h2.text().equals("System Statistics")) {
            continue;
        }

        // first() yields null when nothing is found; report a changed layout explicitly
        // rather than failing with a bare NPE.
        Element table = h2.parent().getElementsByTag("table").first();
        Element tbody = (table == null) ? null : table.getElementsByTag("tbody").first();
        if (tbody == null) {
            throw new IOException(
                    "Unexpected layout of http://envoy: no table body under \"System Statistics\"");
        }

        for (Element tr : tbody.getElementsByTag("tr")) {
            Elements alltd = tr.getElementsByTag("td");
            if (alltd.isEmpty()) {
                // Rows without <td> cells have no name/value to read.
                continue;
            }

            String name = alltd.first().text();
            String value = alltd.last().text();
            if (name.equals("Number of Microinverters Online")) {
                envoyData.add(new EnvoyData(name, value));
            }
        }
    }
}

From source file:uk.co.certait.htmlexporter.writer.AbstractExporter.java

/**
 * Parses the given HTML and returns all of its {@code <table>} elements.
 *
 * @param html the HTML markup to parse
 * @return the elements with tag name {@code table} found in the parsed document
 */
protected Elements getTables(String html) {
    // FIXME parsing twice
    return Jsoup.parse(html).getElementsByTag("table");
}

From source file:uk.co.certait.htmlexporter.writer.AbstractExporter.java

/**
 * Builds a {@code StyleMap} from the {@code <style>} elements of the given HTML.
 *
 * @param html the HTML markup to parse
 * @return a {@code StyleMap} wrapping the result of {@code StyleParser.parseStyles} for the
 *         document's {@code <style>} elements
 */
protected StyleMap getStyleMapper(String html) {
    // FIXME parsing twice
    Elements styleElements = Jsoup.parse(html).getElementsByTag("style");

    return new StyleMap(new StyleParser().parseStyles(styleElements));
}

From source file:us.colloquy.sandbox.FileProcessor.java

/**
 * Prints titles and collects chapter file paths from the {@code .ncx} files found under
 * {@code <user.home>/Documents/Tolstoy/openDiaries}.
 *
 * <p>The directory is searched up to six levels deep. Each hit is parsed with jsoup; the text of
 * every child of each {@code docTitle} element is printed, and for every non-empty-labelled
 * child of each {@code navPoint} element the {@code src} of its {@code content} descendants is
 * added (fragment removed, prefixed with the file's directory) to a sorted set when the label
 * matches the label pattern. The set's size and entries are printed at the end. Exceptions are
 * printed via {@code printStackTrace()} and not rethrown.
 */
@Test
public void listAllUzipedFiles() {
    String rootDir = System.getProperty("user.home") + "/Documents/Tolstoy/openDiaries";
    Path pathToLetters = FileSystems.getDefault().getPath(rootDir);

    final int searchDepth = 6;
    List<Path> ncxFiles = new ArrayList<>();

    try (Stream<Path> found = Files.find(pathToLetters, searchDepth,
            (candidate, attrs) -> String.valueOf(candidate).endsWith(".ncx"))) {
        found.forEach(ncxFile -> ncxFiles.add(ncxFile));
    } catch (IOException e) {
        e.printStackTrace();
    }

    System.out.println("files: " + ncxFiles.size());

    Set<String> chapterFiles = new TreeSet<>();

    try {
        for (Path ncxFile : ncxFiles) {
            Path directory = ncxFile.getParent();

            System.out.println("---------------------------------------------");
            System.out.println(directory.toString());

            Document ncx = Jsoup.parse(ncxFile.toFile(), "UTF-8");

            for (Element docTitle : ncx.getElementsByTag("docTitle")) {
                for (Element titlePart : docTitle.children()) {
                    System.out.println("Title: " + titlePart.text());
                }
            }

            for (Element navPoint : ncx.getElementsByTag("navPoint")) {
                for (Element entry : navPoint.children()) {
                    String label = entry.text();
                    if (!StringUtils.isNotEmpty(label)) {
                        continue;
                    }

                    // NOTE(review): "?" on its own is not a valid regular expression (dangling
                    // quantifier); this literal and the "[?--?]" class below look like
                    // mis-encoded non-ASCII text - confirm against the original source.
                    if (label.matches("?")) {
                        System.out.println("------------------");
                    }

                    String src = entry.getElementsByTag("content").attr("src");

                    // The label is tested before the src, exactly as before.
                    boolean labelMatches = label.matches(".*\\d{1,3}.*[?--?]+.*");
                    if (labelMatches && StringUtils.isNotEmpty(src)) {
                        // Drop the "#fragment" part so only the file path is kept.
                        chapterFiles.add(directory.toString() + File.separator + src.replaceAll("#.*", ""));
                    }
                }
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    }

    System.out.println("Size: " + chapterFiles.size());

    chapterFiles.forEach(System.out::println);
}

From source file:us.colloquy.sandbox.FileProcessor.java

/**
 * Prints the {@code dc:title} text of every {@code .opf} file found under
 * {@code <user.home>/Documents/Tolstoy/unzipLetters}.
 *
 * <p>The directory is searched up to six levels deep. For each hit the containing directory is
 * printed, the file is parsed with jsoup, and the text of each {@code dc:title} element is
 * printed. The URI set is never added to in this method, so the trailing "Size:" line reports
 * zero and no URIs are listed. Exceptions are printed via {@code printStackTrace()} and not
 * rethrown.
 */
@Test
public void listAllUzipedFilesContent() {
    String rootDir = System.getProperty("user.home") + "/Documents/Tolstoy/unzipLetters";
    Path pathToLetters = FileSystems.getDefault().getPath(rootDir);

    final int searchDepth = 6;
    List<Path> opfFiles = new ArrayList<>();

    try (Stream<Path> found = Files.find(pathToLetters, searchDepth,
            (candidate, attrs) -> String.valueOf(candidate).endsWith(".opf"))) {
        found.forEach(opfFile -> opfFiles.add(opfFile));
    } catch (IOException e) {
        e.printStackTrace();
    }

    System.out.println("files: " + opfFiles.size());

    // Kept for the summary output below; nothing populates it in this method.
    Set<String> uriList = new TreeSet<>();

    try {
        for (Path opfFile : opfFiles) {
            Path directory = opfFile.getParent();

            System.out.println("---------------------------------------------");
            System.out.println(directory.toString());

            Document opf = Jsoup.parse(opfFile.toFile(), "UTF-8");

            for (Element dcTitle : opf.getElementsByTag("dc:title")) {
                System.out.println(dcTitle.text());
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    }

    System.out.println("Size: " + uriList.size());

    uriList.forEach(System.out::println);
}

From source file:us.colloquy.sandbox.FileProcessor.java

/**
 * Collects and prints {@code DocumentPointer}s for the diary volume stored under
 * {@code <user.home>/Documents/Tolstoy/90-volume-set/diaries/uzip/dnevnik_1881-1887_vol_49}.
 *
 * <p>Every {@code .ncx} file up to six levels deep is parsed with jsoup. The title starts as the
 * text of the last child of the {@code docTitle} elements. The {@code navPoint} elements are
 * then walked in order: a label matching the start pattern (first time only per file) switches
 * collecting on and becomes the title; the end check switches it off. While collecting is on,
 * each nav point adds a pointer built from its {@code content} {@code src} (fragment removed,
 * prefixed with the file's directory) and the current title. Exceptions are printed via
 * {@code printStackTrace()} and not rethrown.
 */
@Test
public void getURIForAllDiaries() {

    Set<DocumentPointer> uriList = new HashSet<>();

    String letterDirectory = System.getProperty("user.home")
            + "/Documents/Tolstoy/90-volume-set/diaries/uzip/dnevnik_1881-1887_vol_49";
    Path pathToLetters = FileSystems.getDefault().getPath(letterDirectory);

    final int searchDepth = 6;
    List<Path> ncxFiles = new ArrayList<>();

    try (Stream<Path> found = Files.find(pathToLetters, searchDepth,
            (candidate, attrs) -> String.valueOf(candidate).endsWith(".ncx"))) {
        found.forEach(ncxFile -> ncxFiles.add(ncxFile));
    } catch (IOException e) {
        e.printStackTrace();
    }

    System.out.println("files: " + ncxFiles.size());

    try {
        for (Path ncxFile : ncxFiles) {
            Path directory = ncxFile.getParent();
            Document ncx = Jsoup.parse(ncxFile.toFile(), "UTF-8");

            String title = "";
            for (Element docTitle : ncx.getElementsByTag("docTitle")) {
                for (Element titlePart : docTitle.children()) {
                    title = titlePart.text();
                }
            }

            boolean collecting = false;
            boolean startNotSeenYet = true;

            for (Element navPoint : ncx.getElementsByTag("navPoint")) {
                Element labelNode = navPoint.select("navLabel").first();
                Element contentNode = navPoint.select("content").first();

                // Missing nodes fall back to empty strings.
                String navLabel = (labelNode == null) ? ""
                        : labelNode.text().replaceAll("\\*", "").trim();
                String srs = (contentNode == null) ? "" : contentNode.attr("src");

                // NOTE(review): the label is used as the regular expression here, not as the
                // input; the "??" literals in this method look like mis-encoded non-ASCII
                // text - confirm both against the original source.
                if ("??".matches(navLabel)) {
                    collecting = false;
                }

                // Operand order is significant: the regex is evaluated before the flag.
                if (StringUtils.isNotEmpty(navLabel)
                        && navLabel.matches("??.*|?? ?.*") && startNotSeenYet) {
                    startNotSeenYet = false;
                    collecting = true;
                    title = navLabel;
                }

                if (!collecting) {
                    continue;
                }

                // Drop the "#fragment" part so only the file path is kept.
                String target = directory.toString() + File.separator + srs.replaceAll("#.*", "");
                uriList.add(new DocumentPointer(target, title));
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    }

    System.out.println("Size: " + uriList.size());

    for (DocumentPointer entry : uriList) {
        System.out.println(entry.getSourse() + "\t" + entry.getUri());
    }
}

From source file:us.colloquy.util.EpubExtractor.java

/**
 * Adds a {@code DocumentPointer} to {@code uriList} for each matching navigation entry of the
 * {@code .ncx} files found under {@code letterDirectory}.
 *
 * <p>The directory is searched up to six levels deep. Each hit is parsed with jsoup; the title
 * is the text of the last child of the {@code docTitle} elements. Children of {@code avantitul}
 * elements whose label matches the heading pattern are printed. For each non-empty-labelled
 * child of a {@code navPoint}, a pointer (file directory + {@code content} {@code src} without
 * fragment, plus the title) is added when the label matches the number-and-letters pattern, or
 * - if {@code useOnlyNumber} is set - the number-only pattern. Exceptions are printed via
 * {@code printStackTrace()} and not rethrown.
 *
 * @param uriList         the set that receives the pointers
 * @param letterDirectory the directory to search
 * @param useOnlyNumber   whether labels that only contain a 1-3 digit number are accepted too
 */
public static void getURIForAllLetters(Set<DocumentPointer> uriList, String letterDirectory,
        boolean useOnlyNumber) {

    Path pathToLetters = FileSystems.getDefault().getPath(letterDirectory);

    final int searchDepth = 6;
    List<Path> ncxFiles = new ArrayList<>();

    try (Stream<Path> found = Files.find(pathToLetters, searchDepth, (candidate, attrs) -> {
        return String.valueOf(candidate).endsWith(".ncx");
    })) {
        found.forEach(ncxFile -> ncxFiles.add(ncxFile));
    } catch (IOException e) {
        e.printStackTrace();
    }

    System.out.println("files: " + ncxFiles.size());

    try {
        for (Path ncxFile : ncxFiles) {
            Path directory = ncxFile.getParent();
            Document ncx = Jsoup.parse(ncxFile.toFile(), "UTF-8");

            String title = "";
            for (Element docTitle : ncx.getElementsByTag("docTitle")) {
                for (Element titlePart : docTitle.children()) {
                    title = titlePart.text();
                }
            }

            for (Element avantitul : ncx.getElementsByTag("avantitul")) {
                for (Element entry : avantitul.children()) {
                    String label = entry.text();
                    if (StringUtils.isNotEmpty(label) && label.matches(
                            "  ? ? .*")) {
                        System.out.println("------------------   " + label);
                    }
                }
            }

            for (Element navPoint : ncx.getElementsByTag("navPoint")) {
                for (Element entry : navPoint.children()) {
                    String label = entry.text();
                    if (!StringUtils.isNotEmpty(label)) {
                        continue;
                    }

                    // NOTE(review): "?" on its own is not a valid regular expression (dangling
                    // quantifier); the "?" literals in this method look like mis-encoded
                    // non-ASCII text - confirm against the original source.
                    if (label.matches("?")) {
                        System.out.println("------------------ " + "?" + " -------------------");
                    } else if (label.contains(" ?")) {
                        // Stop scanning the remaining children of this nav point.
                        break;
                    }

                    String src = entry.getElementsByTag("content").attr("src");

                    // Each label pattern is evaluated before the src check, as before.
                    boolean accepted = label.matches(".*\\d{1,3}.*[?--?A-Za-z]+.*")
                            && StringUtils.isNotEmpty(src);
                    if (!accepted) {
                        accepted = label.matches(".*\\d{1,3}.*") && StringUtils.isNotEmpty(src)
                                && useOnlyNumber;
                    }

                    if (accepted) {
                        // Drop the "#fragment" part so only the file path is kept.
                        String target = directory.toString() + File.separator + src.replaceAll("#.*", "");
                        uriList.add(new DocumentPointer(target, title));
                    }
                }
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}

From source file:webcralwerproject1.Webcrawler.java

/**
 * Writes the given document to {@code <DirectoryName>/<crawlcount>/<MaxPage>file.html} after
 * replacing the {@code src} of every {@code <img>} element with {@code "a"}.
 *
 * <p>The target directory is created if it does not exist yet. {@code imagecount} is
 * incremented once per {@code <img>} element. Any exception is printed and not rethrown.
 *
 * @param htmlDocument the document to rewrite and store; its {@code <img>} elements are modified
 * @return the absolute path of the target file. The path is computed before the write, so it is
 *         also returned when writing fails; {@code null} is returned only when an exception
 *         occurs before the path has been computed
 */
public String writeContent(Document htmlDocument) {
    String path = null;
    try {
        File file = new File(DirectoryName + "/" + crawlcount);
        if (!file.exists()) {
            if (file.mkdir()) {
                System.out.println("Repository Directory is created!");
            } else {
                System.out.println("Failed to create directory!");
            }
        }

        File f = new File(file.getAbsolutePath() + "/" + MaxPage + "file.html");
        path = f.getAbsolutePath();

        // Count the images and point each one at the placeholder source "a".
        for (Element el : htmlDocument.getElementsByTag("img")) {
            imagecount++;
            el.attr("src", "a");
        }

        FileUtils.writeStringToFile(f, htmlDocument.html(), "UTF-8");
    } catch (Exception e) {
        System.out.println("Inside writeContent Exception " + e);
    }
    System.out.println("Inside writeContent ");
    return path;
}