Example usage for org.jsoup.nodes.Document#getAllElements()

List of usage examples for org.jsoup.nodes Document getAllElements

Introduction

On this page you can find example usages of the getAllElements() method of org.jsoup.nodes.Document.

Prototype

public Elements getAllElements() 

Source Link

Document

Find all elements under this element (including self, and children of children).

Usage

From source file:Main.java

/**
 * Convenience overload that scrapes text from a whole document.
 *
 * Collects every element of {@code doc} via {@code getAllElements()} and
 * delegates to the {@code getScrapeText} overload that accepts that element
 * collection together with the query.
 *
 * @param doc   the parsed document to scrape
 * @param query passed through unchanged to the delegate overload
 * @return whatever the delegate overload returns for the document's elements
 */
public static String getScrapeText(Document doc, String query) {
    return getScrapeText(doc.getAllElements(), query);
}

From source file:coyote.dx.web.TestHtmlWorker.java

/**
 * Requests the local test page, waits for the response to complete and checks
 * that the parsed document is present and holds at least 40 elements.
 */
@Test
public void setHtmlGet() throws IOException {
    final String pageAddress = "http://localhost:" + port + "/data/test.html";
    Response reply = new Resource(pageAddress).request();
    assertNotNull(reply);

    // Busy-wait, yielding the CPU, until the response reports completion.
    boolean finished = reply.isComplete();
    while (!finished) {
        Thread.yield();
        finished = reply.isComplete();
    }

    Document page = reply.getDocument();
    assertNotNull(page);

    Elements everyElement = page.getAllElements();
    System.out.println(page.toString());
    System.out.println("Retrieved document contains " + everyElement.size() + " elements");
    assertTrue(everyElement.size() >= 40);
}

From source file:edu.usu.sdl.openstorefront.service.io.HelpImporter.java

/**
 * Renders a markdown stream to HTML and splits it into help sections, starting
 * a new section at every header element (h1-h6).
 *
 * <p>The header's inner HTML becomes the section title, the first
 * space-separated token of the title becomes the section number, and a title
 * containing {@code *} marks the section as an admin section. The outer HTML of
 * each non-header element that follows is appended to the current section's
 * content unless that exact HTML is already contained in it (the element list
 * includes nested descendants, whose markup is already part of their
 * ancestor's outer HTML).
 *
 * @param in stream of markdown text; decoded as UTF-8 and closed by this method
 * @return the list that was filled through {@code addHelpSection}, in document order
 */
public List<HelpSection> processHelp(InputStream in) {
    List<HelpSection> helpSections = new ArrayList<>();

    String data = "";
    // Decode explicitly as UTF-8 so the result does not depend on the platform default charset.
    try (BufferedReader bin = new BufferedReader(
            new InputStreamReader(in, java.nio.charset.StandardCharsets.UTF_8))) {
        data = bin.lines().collect(Collectors.joining("\n"));
    } catch (IOException ignored) {
        // Only the implicit close() can raise IOException here; the text has already been
        // read at that point, so processing continues with whatever was collected.
        // Read failures inside lines() surface as UncheckedIOException and still propagate.
    }

    PegDownProcessor pegDownProcessor = new PegDownProcessor(PROCESSING_TIMEOUT);
    String html = pegDownProcessor.markdownToHtml(data);
    Document doc = Jsoup.parse(html);
    Elements elements = doc.getAllElements();

    Set<String> headerTags = new HashSet<>();
    headerTags.add("h1");
    headerTags.add("h2");
    headerTags.add("h3");
    headerTags.add("h4");
    headerTags.add("h5");
    headerTags.add("h6");

    boolean capture = false;
    HelpSection helpSection = null;
    for (Element element : elements) {
        String tagName = element.tagName().toLowerCase();

        if (!headerTags.contains(tagName)) {
            // Body element: append to the open section, skipping markup already captured
            // as part of a previously appended ancestor.
            if (capture && helpSection != null) {
                String elementHtml = element.outerHtml();
                if (!helpSection.getContent().contains(elementHtml)) {
                    helpSection.setContent(helpSection.getContent() + elementHtml);
                }
            }
            continue;
        }

        // Header element: close the previous section (if any) and open a new one.
        String title = element.html();

        if (helpSection != null) {
            addHelpSection(helpSections, helpSection);
        }

        String[] titleSplit = title.split(" ");

        helpSection = new HelpSection();
        helpSection.setTitle(title);
        helpSection.setHeaderLevel(Convert.toInteger(tagName.replace("h", "")));
        helpSection.setSectionNumber(titleSplit[0]);
        helpSection.setContent("");
        helpSection.setAdminSection(title.contains("*"));

        capture = true;
    }

    // Add last section
    if (helpSection != null) {
        addHelpSection(helpSections, helpSection);
    }

    return helpSections;
}

From source file:model.ParseInfoFromSite.java

/**
 * Gathers the HTML comment nodes of a page. Only nodes that are direct
 * children of some element of the document are considered; the document
 * itself is not modified.
 *
 * @param doc the parsed page to scan
 * @return an unmodifiable list of the comment nodes found, in traversal order
 */
public List<Comment> findAllComments(Document doc) {

    logger.info("run");
    final List<Comment> found = new ArrayList<>();
    for (Element parent : doc.getAllElements()) {
        for (Node child : parent.childNodes()) {
            if (!child.nodeName().equals("#comment")) {
                continue;
            }
            found.add((Comment) child);
        }
    }
    return Collections.unmodifiableList(found);
}

From source file:com.kingfong.webcrawler.util.DOMContentUtils.java

/**
 * This method finds all anchors below the supplied DOM
 * <code>node</code>, and creates appropriate {@link Outlink}
 * records for each (relative to the supplied <code>base</code>
 * URL), and adds them to the <code>outlinks</code> {@link
 * ArrayList}./*from  w w  w.j  a v a  2 s. co  m*/
 *
 * <p>
 *
 * Links without inner structure (tags, text, etc) are discarded, as
 * are links which contain only single nested links and empty text
 * nodes (this is a common DOM-fixup artifact, at least with
 * nekohtml).
 */
public void getOutlinks(String html, URL url, HashSet<String> outlinks) {

    Document document = Jsoup.parse(html);
    Elements elements = document.getAllElements();
    for (Element currentNode : elements) {
        String nodeName = currentNode.tagName();
        // short nodeType = currentNode.;
        Elements children = currentNode.children();
        nodeName = nodeName.toLowerCase();
        LinkParams params = linkParams.get(nodeName);
        if (params != null) {
            // if (!shouldThrowAwayLink(currentNode, children, childLen,
            // params)) {

            // StringBuilder linkText = new StringBuilder();
            // getText(linkText, currentNode, true);

            Attributes attrs = currentNode.attributes();
            String target = null;
            boolean noFollow = false;
            boolean post = false;
            Iterator<Attribute> iterator = attrs.iterator();
            while (iterator.hasNext()) {
                Attribute attr = iterator.next();
                String attrName = attr.getKey();
                if (params.attrName.equalsIgnoreCase(attrName)) {
                    target = attr.getValue();
                } else if ("rel".equalsIgnoreCase(attrName) && "nofollow".equalsIgnoreCase(attr.getValue())) {
                    noFollow = true;
                } else if ("method".equalsIgnoreCase(attrName) && "post".equalsIgnoreCase(attr.getValue())) {
                    post = true;
                }
            }
            if (StringUtils.startsWith(target, "/")) {
                target = url.getProtocol() + "://" + url.getHost() + target;
            }
            if (target != null && URLFilter.filt(target)) {
                outlinks.add(target);
            }
            // }
            // this should not have any children, skip them
            if (params.childLen == 0)
                continue;
        }
    }
}

From source file:de.tudarmstadt.ukp.experiments.dip.wp1.documents.helpers.boilerplateremoval.impl.JusTextBoilerplateRemoval.java

/**
 * Strips unwanted parts from a jsoup document in place: everything matched by
 * the selectors {@code head}, {@code script}, {@code .hidden} and
 * {@code embedded}, followed by a comment-removal pass over the child nodes of
 * all remaining elements.
 *
 * @param jsoupDoc the document to clean; it is modified directly
 * @return the same {@code jsoupDoc} instance
 */
public Document cleanDom(Document jsoupDoc) {
    final String[] unwantedSelectors = { "head", "script", ".hidden", "embedded" };

    for (int i = 0; i < unwantedSelectors.length; i++) {
        Elements matches = jsoupDoc.select(unwantedSelectors[i]);
        for (Element match : matches) {
            match.remove();
        }
    }

    // comment removal visits every child node of every element (might be slow)
    Elements remaining = jsoupDoc.getAllElements();
    for (Element parent : remaining) {
        for (Node child : parent.childNodes()) {
            NodeHelper.removeComments(child);
        }
    }

    return jsoupDoc;
}

From source file:autoInsurance.BeiJPiccImpl.java

/**
 * Fills {@code carTypeMap} from an embedded car-kind {@code <select>} snippet
 * and builds {@code templateData} from the form fields of {@code doc}.
 *
 * Every key/value pair of every form in the document is copied into
 * {@code templateData}; afterwards four entries are set to fixed values.
 *
 * @param doc page whose forms supply the template values
 * @throws Exception declared by the signature
 */
void init(Document doc) throws Exception {
    // Option value -> option text lookup for the car kind codes.
    String carKindSelectHtml = "<select class=\"w_p80\" name=\"carKindCodeBak\" title=\"   \" id=\"carKindCodeBak\"><option value=\"A01\"></option><option value=\"B01\"></option><option value=\"B02\"></option><option value=\"B11\"></option><option value=\"B12\"></option><option value=\"B13\"></option><option value=\"B21\"></option><option value=\"B91\"></option><option value=\"C01\"></option><option value=\"C02\"></option><option value=\"C03\"></option><option value=\"C04\"></option><option value=\"C11\"></option><option value=\"C20\"></option><option value=\"C22\"></option><option value=\"C23\"></option><option value=\"C24\"></option><option value=\"C25\"></option><option value=\"C26\"></option><option value=\"C27\"></option><option value=\"C28\"></option><option value=\"C29\"></option><option value=\"C30\"></option><option value=\"C31\"></option><option value=\"C39\"></option><option value=\"C41\"></option><option value=\"C42\"></option><option value=\"C43\"></option><option value=\"C44\"></option><option value=\"C45\"></option><option value=\"C46\"></option><option value=\"C47\"></option><option value=\"C48\"></option><option value=\"C49\"></option><option value=\"C50\"></option><option value=\"C51\">X</option><option value=\"C52\">/</option><option value=\"C53\">/</option><option value=\"C54\"></option><option value=\"C55\"></option><option value=\"C56\"></option><option value=\"C57\"></option><option value=\"C58\"></option><option value=\"C61\"></option><option value=\"C69\"></option><option value=\"C90\"></option><option value=\"D01\"></option><option value=\"D02\"></option><option value=\"D03\"></option><option value=\"E01\"></option><option value=\"E11\"></option><option value=\"E12\">/</option><option value=\"Z99\"></option></select>";
    Document carKindDoc = Jsoup.parse(carKindSelectHtml);
    for (Element option : carKindDoc.select("#carKindCodeBak> option")) {
        String code = option.attr("value");
        carTypeMap.put(code, option.text());
    }

    // Seed the template with the current values of every form on the page.
    templateData = new HashMap<String, String>();
    for (FormElement form : doc.getAllElements().forms()) {
        for (KeyVal field : form.formData()) {
            templateData.put(field.key(), field.value());
        }
        System.out.println("------");
    }

    // Fixed overrides; the trailing notes are carried over from the original
    // (presumably the values found on the page -- TODO confirm).
    templateData.put("prpCmainCI.sumAmount", "122000");
    templateData.put("prpCitemKindCI.familyNo", "1"); // null
    templateData.put("prpCitemKindCI.amount", "122000"); // 0
    templateData.put("prpCitemKindCI.adjustRate", "0.9"); // 1
}

From source file:org.norvelle.addressdiscoverer.parse.structured.BackwardsFlattenedDocumentIterator.java

/**
 * Generate the iterator and position its pointer so it can be walked backward
 * using next()//from   w w w  .j  a  v  a  2 s . c o  m
 * 
 * @param soup
 * @param encoding
 * @param status
 * @throws java.io.UnsupportedEncodingException
 * @throws org.norvelle.addressdiscoverer.exceptions.EndNodeWalkingException
 */
public BackwardsFlattenedDocumentIterator(Document soup, String encoding,
        ExtractIndividualsStatusReporter status) throws UnsupportedEncodingException, EndNodeWalkingException {
    this.status = status;
    this.status.setTotalNumericSteps(soup.getAllElements().size());

    // First we generate the flattened list of elements
    this.walkNodeBackwards(soup, encoding);
    this.status.reportProgressText("Backwards document iterator created successfully");
    logger.log(Level.FINE, "Flattened document: \n{0}", StringUtils.join(this.elementsWithNames, "\n"));

    // Now, we set the cursor to the end so we can iterate backwards
    this.currPosition = this.elementsWithNames.size() - 1;
}

From source file:org.norvelle.addressdiscoverer.parse.unstructured.ForwardsFlattenedDocumentIterator.java

/**
 * Builds the flattened element list for the given document by walking it
 * forwards, positions the cursor, and attaches any intermediate values still
 * pending after the walk to the last name-containing element.
 *
 * @param soup     the parsed document to flatten
 * @param encoding passed on to the node walk
 * @param status   reporter that receives the total step count and progress text
 * @throws java.io.UnsupportedEncodingException
 * @throws org.norvelle.addressdiscoverer.exceptions.EndNodeWalkingException
 */
public ForwardsFlattenedDocumentIterator(Document soup, String encoding,
        ExtractIndividualsStatusReporter status) throws UnsupportedEncodingException, EndNodeWalkingException {
    this.status = status;
    this.status.setTotalNumericSteps(soup.getAllElements().size());

    // First we generate the flattened list of elements
    this.walkNodeForwards(soup, encoding);
    // The message previously said "Backwards", a leftover from the sibling iterator.
    this.status.reportProgressText("Forwards document iterator created successfully");
    logger.log(Level.FINE, "Flattened document: \n{0}", StringUtils.join(this.elementsWithNames, "\n"));

    // The cursor starts on the last entry of the flattened list.
    // NOTE(review): this mirrors the backwards iterator; confirm that a forwards
    // iterator is really meant to start at the end rather than at index 0.
    this.currPosition = this.elementsWithNames.size() - 1;

    // If we have any remaining Nodes to add as intermediates, add them to
    // the last name Node we found.
    if (!intermediateValuesList.isEmpty()) {
        this.intermediateElementMap.put(lastNameContainingElement, this.intermediateValuesList);
    }
}

From source file:org.tinymediamanager.scraper.ofdb.OfdbMetadataProvider.java

/**
 * Scrapes the metadata of a movie from ofdb.de.
 *
 * <p>The detail page URL is taken from the ofdb id in the options, from a
 * previous search result, or from a fresh search by IMDB id. From the detail
 * page the IMDB id, title, year, original title, genres and rating are read;
 * the plot and the cast are read from two further pages. Failures while
 * loading the plot or cast page are logged and do not abort the scrape.
 *
 * @param options scrape options; the type must be {@code MediaType.MOVIE}
 * @return the metadata collected from the pages
 * @throws UnsupportedMediaTypeException if the options are not for a movie
 * @throws Exception if no detail URL could be determined, the detail page has
 *         fewer than 10 elements, or loading/parsing the detail page fails
 */
@Override
public MediaMetadata getMetadata(MediaScrapeOptions options) throws Exception {
    LOGGER.debug("getMetadata() " + options.toString());

    if (options.getType() != MediaType.MOVIE) {
        throw new UnsupportedMediaTypeException(options.getType());
    }

    // we have 3 entry points here
    // a) getMetadata has been called with an ofdbId
    // b) getMetadata has been called with an imdbId
    // c) getMetadata has been called from a previous search

    String detailUrl = "";

    // case a) and c)
    if (StringUtils.isNotBlank(options.getId(getProviderInfo().getId())) || options.getResult() != null) {

        if (StringUtils.isNotBlank(options.getId(getProviderInfo().getId()))) {
            detailUrl = "http://www.ofdb.de/view.php?page=film&fid=" + options.getId(getProviderInfo().getId());
        } else {
            detailUrl = options.getResult().getUrl();
        }
    }

    // case b)
    if (options.getResult() == null && StringUtils.isNotBlank(options.getId(MediaMetadata.IMDB))) {
        MediaSearchOptions searchOptions = new MediaSearchOptions(MediaType.MOVIE);
        searchOptions.setImdbId(options.getId(MediaMetadata.IMDB));
        try {
            List<MediaSearchResult> results = search(searchOptions);
            if (results != null && !results.isEmpty()) {
                options.setResult(results.get(0));
                detailUrl = options.getResult().getUrl();
            }
        } catch (Exception e) {
            LOGGER.warn("failed IMDB search: " + e.getMessage());
        }
    }

    // we can only work further if we got a search result on ofdb.de
    if (StringUtils.isBlank(detailUrl)) {
        throw new Exception("We did not get any useful movie url");
    }

    MediaMetadata md = new MediaMetadata(providerInfo.getId());
    // generic Elements used all over
    Elements el = null;
    String ofdbId = StrgUtils.substr(detailUrl, "film\\/(\\d+),");
    if (StringUtils.isBlank(ofdbId)) {
        ofdbId = StrgUtils.substr(detailUrl, "fid=(\\d+)");
    }

    Url url;
    try {
        LOGGER.trace("get details page");
        url = new Url(detailUrl);
        Document doc = loadDocument(url);

        if (doc.getAllElements().size() < 10) {
            throw new Exception("meh - we did not receive a valid web page");
        }

        // parse details

        // IMDB ID "http://www.imdb.com/Title?1194173"
        el = doc.getElementsByAttributeValueContaining("href", "imdb.com");
        if (!el.isEmpty()) {
            md.setId(MediaMetadata.IMDB, "tt" + StrgUtils.substr(el.first().attr("href"), "\\?(\\d+)"));
        }

        // title / year
        // <meta property="og:title" content="Bourne Vermächtnis, Das (2012)" />
        el = doc.getElementsByAttributeValue("property", "og:title");
        if (!el.isEmpty()) {
            String[] ty = parseTitle(el.first().attr("content"));
            md.setTitle(StrgUtils.removeCommonSortableName(ty[0]));
            try {
                md.setYear(Integer.parseInt(ty[1]));
            } catch (Exception ignored) {
                // no parsable year in the title; the fallback below is tried next
            }
        }
        // another year position
        if (md.getYear() == 0) {
            // <a href="view.php?page=blaettern&Kat=Jahr&Text=2012">2012</a>
            el = doc.getElementsByAttributeValueContaining("href", "Kat=Jahr");
            try {
                md.setYear(Integer.parseInt(el.first().text()));
            } catch (Exception ignored) {
                // link missing or not a number; the year stays unset
            }
        }

        // original title (has to be searched with a regexp)
        // <tr valign="top">
        // <td nowrap=""><font class="Normal" face="Arial,Helvetica,sans-serif"
        // size="2">Originaltitel:</font></td>
        // <td>&nbsp;&nbsp;</td>
        // <td width="99%"><font class="Daten" face="Arial,Helvetica,sans-serif"
        // size="2"><b>Brave</b></font></td>
        // </tr>
        String originalTitle = StrgUtils.substr(doc.body().html(), "(?s)Originaltitel.*?<b>(.*?)</b>");
        if (!originalTitle.isEmpty()) {
            md.setOriginalTitle(StrgUtils.removeCommonSortableName(originalTitle));
        }

        // Genre: <a href="view.php?page=genre&Genre=Action">Action</a>
        el = doc.getElementsByAttributeValueContaining("href", "page=genre");
        for (Element g : el) {
            md.addGenre(getTmmGenre(g.text()));
        }

        // rating
        // <div itemtype="http://schema.org/AggregateRating" itemscope
        // itemprop="aggregateRating">Note: <span
        // itemprop="ratingValue">6.73</span><meta
        // itemprop="worstRating" content="1" />
        el = doc.getElementsByAttributeValue("itemprop", "ratingValue");
        if (!el.isEmpty()) {
            String r = el.text();
            if (!r.isEmpty()) {
                try {
                    md.setRating(Float.parseFloat(r));
                } catch (Exception e) {
                    LOGGER.debug("could not parse rating");
                }
            }
        }

        // get PlotLink; open url and parse
        // <a href="plot/22523,31360,Die-Bourne-Identitt"><b>[mehr]</b></a>
        LOGGER.trace("parse plot");
        el = doc.getElementsByAttributeValueMatching("href", "plot\\/\\d+,");
        if (!el.isEmpty()) {
            String plotUrl = BASE_URL + "/" + el.first().attr("href");
            try {
                url = new Url(plotUrl);
                Document plot = loadDocument(url);
                // first Blocksatz is plot
                Elements block = plot.getElementsByClass("Blocksatz");
                String p = block.first().text(); // remove all html stuff
                // remove "header": everything up to and including "Mal gelesen" plus the
                // character after it. Without the marker the text is kept whole instead of
                // losing its first characters.
                int headerPos = p.indexOf("Mal gelesen");
                if (headerPos >= 0) {
                    p = p.substring(Math.min(headerPos + 12, p.length()));
                }
                md.setPlot(p);
            } catch (Exception e) {
                LOGGER.error("failed to get plot page: " + e.getMessage());
            }
        }

        // http://www.ofdb.de/view.php?page=film_detail&fid=226745
        LOGGER.debug("parse actor detail");
        String movieDetail = BASE_URL + "/view.php?page=film_detail&fid=" + ofdbId;
        doc = null;
        try {
            url = new Url(movieDetail);
            doc = loadDocument(url);
        } catch (Exception e) {
            LOGGER.error("failed to get detail page: " + e.getMessage());
        }

        if (doc != null) {
            parseCast(doc.getElementsContainingOwnText("Regie"), MediaCastMember.CastType.DIRECTOR, md);
            parseCast(doc.getElementsContainingOwnText("Darsteller"), MediaCastMember.CastType.ACTOR, md);
            parseCast(doc.getElementsContainingOwnText("Stimme/Sprecher"), MediaCastMember.CastType.ACTOR, md);
            parseCast(doc.getElementsContainingOwnText("Synchronstimme (deutsch)"),
                    MediaCastMember.CastType.ACTOR, md);
            parseCast(doc.getElementsContainingOwnText("Drehbuchautor(in)"), MediaCastMember.CastType.WRITER,
                    md);
            parseCast(doc.getElementsContainingOwnText("Produzent(in)"), MediaCastMember.CastType.PRODUCER, md);
        }
    } catch (Exception e) {
        LOGGER.error("Error parsing " + detailUrl);
        throw e;
    }

    return md;
}

/**
 * Opens the given URL, parses the response as UTF-8 HTML and always closes
 * the stream, also when parsing fails.
 */
private static Document loadDocument(Url url) throws Exception {
    try (InputStream in = url.getInputStream()) {
        return Jsoup.parse(in, "UTF-8", "");
    }
}