Example usage for org.jsoup.nodes Element tag

List of usage examples for org.jsoup.nodes Element tag

Introduction

On this page you can find example usages of org.jsoup.nodes Element tag.

Prototype

Tag tag

To view the source code for org.jsoup.nodes Element tag, click the Source Link.

Usage

From source file:Main.java

/**
 * Tells whether whitespace has to be kept as-is for the given node.
 * <p>
 * Only the node itself and its direct parent are inspected; the rest of the
 * ancestor chain is deliberately not walked.
 *
 * @param node the node to inspect, may be {@code null}
 * @return {@code true} if the node is an element whose own tag, or whose
 *         parent's tag, preserves whitespace; {@code false} otherwise
 */
private static boolean preserveWhitespace(Node node) {
    // instanceof is false for null, so this guard also covers a missing node
    if (!(node instanceof Element)) {
        return false;
    }
    Element self = (Element) node;
    if (self.tag().preserveWhitespace()) {
        return true;
    }
    Element enclosing = self.parent();
    return enclosing != null && enclosing.tag().preserveWhitespace();
}

From source file:Main.java

/**
 * Appends a line feed to the accumulator when the element is a {@code br} tag.
 *
 * @param element the element whose tag name is checked
 * @param accum   the builder receiving the line feed
 */
private static void appendNewlineIfBr(Element element, StringBuilder accum) {
    String name = element.tag().getName();
    boolean isLineBreak = name.equals("br");
    if (isLineBreak) {
        accum.append("\n");
    }
}

From source file:edu.ucla.cs.scai.swim.qa.ontology.dbpedia.tipicality.DbpediaCsvDownload.java

/**
 * Recursively walks the children of {@code e} and downloads every file
 * referenced by an {@code <a>} element labelled "csv" (case-insensitive) that
 * sits inside a {@code <small>} element. Nested {@code <ul>/<li>} structures
 * are followed recursively.
 * <p>
 * Each file is stored under {@code DBpediaOntology.DBPEDIA_CSV_FOLDER}, named
 * after the last path segment of its link. A failed download is reported on
 * standard error and does not interrupt the traversal.
 *
 * @param e the element whose children are scanned
 * @throws MalformedURLException declared for callers; failures inside a
 *                               single download are caught and printed
 * @throws IOException           declared for callers; failures inside a
 *                               single download are caught and printed
 */
private static void download(Element e) throws MalformedURLException, IOException {
    for (Element c : e.children()) {
        String tagName = c.tag().getName();
        if (tagName.equals("small")) {
            for (Element c1 : c.children()) {
                if (c1.tag().getName().equals("a") && c1.text().equalsIgnoreCase("csv")) {
                    String href = c1.attr("href");
                    System.out.println("Downloading " + href);
                    // try-with-resources guarantees that both the remote stream and the
                    // local file are closed, even if the transfer fails half-way
                    try {
                        URL remoteFile = new URL(href);
                        String[] s = href.split("\\/");
                        String target = DBpediaOntology.DBPEDIA_CSV_FOLDER + s[s.length - 1];
                        try (ReadableByteChannel rbc = Channels.newChannel(remoteFile.openStream());
                                FileOutputStream fos = new FileOutputStream(target)) {
                            fos.getChannel().transferFrom(rbc, 0, Long.MAX_VALUE);
                        }
                    } catch (Exception ex) {
                        // best effort: report the problem and go on with the remaining links
                        ex.printStackTrace();
                    }
                }
            }
        } else if (tagName.equals("ul")) {
            for (Element c1 : c.children()) {
                if (c1.tagName().equals("li")) {
                    download(c1);
                }
            }
        }
    }
}

From source file:com.gistlabs.mechanize.document.html.JsoupDataUtil.java

/**
 * Collects, in document order (parent before children), every element of the
 * subtree rooted at {@code element} whose lower-cased tag name is in {@code tagSet}.
 *
 * @param results the list receiving the matching elements
 * @param element the root of the subtree to scan (included in the scan)
 * @param tagSet  the tag names to keep, compared against the lower-cased tag name
 */
private static void filterElementsByTag(List<Element> results, Element element, Set<String> tagSet) {
    String normalizedName = element.tag().getName().toLowerCase();
    if (tagSet.contains(normalizedName)) {
        results.add(element);
    }

    // depth-first descent into the child elements
    for (Element descendant : element.children()) {
        filterElementsByTag(results, descendant, tagSet);
    }
}

From source file:com.soulgalore.crawler.core.impl.AhrefPageURLParser.java

/**
 * Selects elements from the document and turns the value of one of their
 * attributes into crawler URLs.
 * <p>
 * Elements whose attribute is empty or starts with {@code MAIL_TO} are skipped.
 * The previous implementation had a dedicated iframe branch that was identical
 * to the default branch; both are now handled by the same statement.
 *
 * @param query        the selector passed to {@code doc.select}
 * @param attributeKey the name of the attribute holding the link
 * @param doc          the parsed document to search
 * @param url          passed as second argument to each created {@code CrawlerURL}
 * @return the set of URLs found, possibly empty, never {@code null}
 */
private Set<CrawlerURL> fetch(String query, String attributeKey, Document doc, String url) {

    final Set<CrawlerURL> urls = new HashSet<CrawlerURL>();

    for (Element src : doc.select(query)) {
        // read the attribute once instead of once per check
        final String link = src.attr(attributeKey);

        // nothing to fetch, or a mailto link that must not be fetched
        if (link.isEmpty() || link.startsWith(MAIL_TO)) {
            continue;
        }

        urls.add(new CrawlerURL(link, url));
    }

    return urls;

}

From source file:com.jejking.hh.nord.corpus.AllrisHtmlToRawDrucksache.java

/**
 * Collects the text that follows each element carrying the attribute
 * {@code name="generator"} in the given document.
 * <p>
 * For every such element, the text of its following element siblings is
 * concatenated until the next {@code meta} element (or the end of the sibling
 * list) is reached. Fragments that are empty once passed through
 * {@code removeNonBreakingSpacesAndTrim} are dropped.
 *
 * @param htmlDoc the parsed document
 * @return the non-empty text fragments, in document order
 */
private ImmutableList<String> druckSachenContents(Document htmlDoc) {
    /*
     * In this way we can identify the bits of "RTF" like text inserted into the overall HTML.
     * JSoup cleans up the broken HTML removing the xml declaration and inserted html roots
     * that ALLRIS manages to put in.
     */
    Elements contentMetaElements = htmlDoc.getElementsByAttributeValue("name", "generator");
    ImmutableList.Builder<String> listBuilder = ImmutableList.builder();

    /*
     * Iterate over our candidates. Sometimes there are several.
     */
    for (Element contentMetaElement : contentMetaElements) {
        StringBuilder contentAsTextBuilder = new StringBuilder();
        Element nextSibling = contentMetaElement.nextElementSibling();

        /*
         * In the cleaned up HTML DOM returned by JSoup the "RTF" content is
         * rendered as siblings of the meta node (JSoup having removed the html, head, body
         * elements which should never have been there in the first place).
         *
         * The tag NAME must be compared here: comparing the Tag object itself with a
         * String is always false, which made the loop run past the next meta element.
         */
        while (nextSibling != null && !"meta".equals(nextSibling.tagName())) {
            contentAsTextBuilder.append(nextSibling.text());
            nextSibling = nextSibling.nextElementSibling();
        }
        /*
         * Only carry over non-empty content.
         */
        String contentAsText = contentAsTextBuilder.toString();
        if (!removeNonBreakingSpacesAndTrim(contentAsText).isEmpty()) {
            listBuilder.add(contentAsText);
        }
    }

    return listBuilder.build();
}

From source file:edu.ucla.cs.scai.swim.qa.ontology.dbpedia.DBpediaOntologyOld.java

/**
 * Fills {@code category} from the children of {@code e} and recursively builds
 * its sub-categories.
 * <p>
 * An {@code <a>} child with a non-empty {@code href} provides the label and URI
 * of the category, which is then registered in {@code map} and printed on
 * standard output. Each {@code <li>} found inside a {@code <ul>} child yields a
 * new sub-category, processed recursively and then linked to {@code category}.
 *
 * @param e        the element describing the category
 * @param category the category to populate
 * @param map      the registry of categories, indexed by label
 */
private void traverseHierarchy(Element e, DBpediaCategory category, HashMap<String, DBpediaCategory> map) {
    for (Element child : e.children()) {
        String childTag = child.tag().getName();

        if (childTag.equals("a")) {
            String href = child.attr("href");
            if (href == null || href.length() == 0) {
                continue;
            }
            String label = child.text();
            String uri = CLASSES_BASE_URI + label;
            category.setLabel(label);
            category.setUri(uri);
            map.put(category.getLabel(), category);
            System.out.println(label + "\t" + uri);
            continue;
        }

        if (!childTag.equals("ul")) {
            continue;
        }

        for (Element item : child.children()) {
            if (!item.tagName().equals("li")) {
                continue;
            }
            DBpediaCategory subCategory = new DBpediaCategory();
            // the sub-tree is explored before the parent link is set up
            traverseHierarchy(item, subCategory, map);
            subCategory.parents = new HashSet<>();
            subCategory.parents.add(category);
            category.getSubClasses().add(subCategory);
        }
    }
}

From source file:de.geeksfactory.opacclient.apis.Open.java

/**
 * Better version of JSoup's implementation of this function ({@link
 * org.jsoup.nodes.FormElement#formData()}).
 *
 * @param form       The form to submit/* w  ww . jav a 2 s .c  o  m*/
 * @param submitName The name attribute of the button which is clicked to submit the form, or
 *                   null
 * @return A MultipartEntityBuilder containing the data of the form
 */
protected MultipartEntityBuilder formData(FormElement form, String submitName) {
    MultipartEntityBuilder data = MultipartEntityBuilder.create();
    data.setLaxMode();

    // iterate the form control elements and accumulate their values
    for (Element el : form.elements()) {
        if (!el.tag().isFormSubmittable()) {
            continue; // contents are form listable, superset of submitable
        }
        String name = el.attr("name");
        if (name.length() == 0)
            continue;
        String type = el.attr("type");

        if ("select".equals(el.tagName())) {
            Elements options = el.select("option[selected]");
            boolean set = false;
            for (Element option : options) {
                data.addTextBody(name, option.val());
                set = true;
            }
            if (!set) {
                Element option = el.select("option").first();
                if (option != null) {
                    data.addTextBody(name, option.val());
                }
            }
        } else if ("checkbox".equalsIgnoreCase(type) || "radio".equalsIgnoreCase(type)) {
            // only add checkbox or radio if they have the checked attribute
            if (el.hasAttr("checked")) {
                data.addTextBody(name, el.val().length() > 0 ? el.val() : "on");
            }
        } else if ("submit".equalsIgnoreCase(type) || "image".equalsIgnoreCase(type)) {
            if (submitName != null && el.attr("name").contains(submitName)) {
                data.addTextBody(name, el.val());
            }
        } else {
            data.addTextBody(name, el.val());
        }
    }
    return data;
}

From source file:tr.edu.gsu.nerwip.retrieval.reader.wikipedia.WikipediaReader.java

/**
 * Goes through the direct child nodes of an element expected to contain
 * text, and completes both builders with what is found.
 * <p>
 * Child elements are dispatched on their tag name to the matching
 * {@code process...} method; text nodes are appended directly.
 * 
 * @param textElement
 *       The element to be processed.
 * @param rawStr
 *       The StringBuilder to contain the raw text.
 * @param linkedStr
 *       The StringBuilder to contain the text with hyperlinks.
 */
private void processTextElement(Element textElement, StringBuilder rawStr, StringBuilder linkedStr) {
    for (Node child : textElement.childNodes()) {
        // plain text: appended as is, except for redundant leading spaces
        if (child instanceof TextNode) {
            String content = ((TextNode) child).text();
            int lastIdx = rawStr.length() - 1;
            boolean afterBlank = lastIdx >= 0
                    && (rawStr.charAt(lastIdx) == '\n' || rawStr.charAt(lastIdx) == ' ');
            // at the beginning of a new line, or right after a space: drop all leading spaces
            if (afterBlank) {
                int skip = 0;
                while (skip < content.length() && content.charAt(skip) == ' ') {
                    skip++;
                }
                content = content.substring(skip);
            }
            rawStr.append(content);
            linkedStr.append(content);
            continue;
        }

        // any other kind of non-element node is left out
        if (!(child instanceof Element)) {
            continue;
        }

        Element elt = (Element) child;
        String tagName = elt.tag().getName();

        // section headers and nested paragraphs are handled like paragraphs
        if (tagName.equals(XmlNames.ELT_H2) || tagName.equals(XmlNames.ELT_H3)
                || tagName.equals(XmlNames.ELT_H4) || tagName.equals(XmlNames.ELT_H5)
                || tagName.equals(XmlNames.ELT_H6) || tagName.equals(XmlNames.ELT_P)) {
            processParagraphElement(elt, rawStr, linkedStr);
        }

        // superscripts (external references or WP inline notes,
        // cf. http://en.wikipedia.org/wiki/Template%3ACitation_needed)
        // and small caps (placed before phonetic transcriptions of names)
        // are skipped: we don't need them, and they can mess up NER tools
        else if (tagName.equals(XmlNames.ELT_SUP) || tagName.equals(XmlNames.ELT_SMALL)) {
            // intentionally nothing
        }

        // spans: some types are ignored (phonetic transcription, WP buttons...)
        else if (tagName.equals(XmlNames.ELT_SPAN)) {
            processSpanElement(elt, rawStr, linkedStr);
        }

        // hyperlinks go to the linked string, provided they are not external
        else if (tagName.equals(XmlNames.ELT_A)) {
            processHyperlinkElement(elt, rawStr, linkedStr);
        }

        // unordered, ordered and description lists
        else if (tagName.equals(XmlNames.ELT_UL)) {
            processListElement(elt, rawStr, linkedStr, false);
        } else if (tagName.equals(XmlNames.ELT_OL)) {
            processListElement(elt, rawStr, linkedStr, true);
        } else if (tagName.equals(XmlNames.ELT_DL)) {
            processDescriptionListElement(elt, rawStr, linkedStr);
        }

        // a list item is itself a text element
        else if (tagName.equals(XmlNames.ELT_LI)) {
            processTextElement(elt, rawStr, linkedStr);
        }

        // divisions
        else if (tagName.equals(XmlNames.ELT_DIV)) {
            processDivisionElement(elt, rawStr, linkedStr);
        }

        // quotes
        else if (tagName.equals(XmlNames.ELT_BLOCKQUOTE)) {
            processQuoteElement(elt, rawStr, linkedStr);
        }

        // citations are handled like paragraphs
        else if (tagName.equals(XmlNames.ELT_CITE)) {
            processParagraphElement(elt, rawStr, linkedStr);
        }

        // anything else is reduced to its text
        else {
            String content = elt.text();
            rawStr.append(content);
            linkedStr.append(content);
        }
    }
}

From source file:tr.edu.gsu.nerwip.retrieval.reader.wikipedia.WikipediaReader.java

/**
 * Pulls a text from a Wikipedia URL without images, tags, etc.
 * <p>
 * The page is retrieved, its title and content elements are located by id,
 * and the direct children of the content element are converted into two
 * texts: a raw one and one including hyperlinks. Sections whose lower-cased
 * title is listed in {@code IGNORED_SECTIONS} are skipped entirely.
 * <p>
 * NOTE(review): the exceptions caught at the end of this method are only
 * printed. In that case the method returns whatever {@code result} holds at
 * that point: {@code null} if the failure happened before the article object
 * was created, a partially initialized article otherwise.
 * 
 * @param url
 *       Address of the targeted text.
 * @return
 *       An Article object representing the retrieved object.
 * 
 * @throws ReaderException
 *       Problem while retrieving the text.
 */
@Override
public Article read(URL url) throws ReaderException {
    Article result = null;
    String name = getName(url);

    try { // get the page
        String address = url.toString();
        logger.log("Retrieving page " + address);
        long startTime = System.currentTimeMillis();
        Document document = retrieveSourceCode(name, url);

        // get its title (first element whose id is ID_TITLE)
        // NOTE(review): get(0) presumably fails if no element matches; confirm the id is always present
        Element firstHeadingElt = document.getElementsByAttributeValue(XmlNames.ATT_ID, ID_TITLE).get(0);
        String title = firstHeadingElt.text();
        logger.log("Get title: " + title);

        // get raw and linked texts
        logger.log("Get raw and linked texts.");
        StringBuilder rawStr = new StringBuilder();
        StringBuilder linkedStr = new StringBuilder();
        Element bodyContentElt = document.getElementsByAttributeValue(XmlNames.ATT_ID, ID_CONTENT).get(0);
        // processing each element in the content part
        // ignoringSection: true while inside a level-2 section listed in IGNORED_SECTIONS
        boolean ignoringSection = false;
        // first: true as long as nothing has been kept yet; used below to drop
        // an initial disambiguation paragraph or an initial thumbnail division
        boolean first = true;
        for (Element element : bodyContentElt.children()) {
            String eltName = element.tag().getName();
            String eltClass = element.attr(XmlNames.ATT_CLASS);

            // section headers
            if (eltName.equals(XmlNames.ELT_H2)) {
                first = false;
                // get section name: the header is rendered into throw-away builders
                StringBuilder fakeRaw = new StringBuilder();
                StringBuilder fakeLinked = new StringBuilder();
                processParagraphElement(element, fakeRaw, fakeLinked);
                String str = fakeRaw.toString().trim().toLowerCase(Locale.ENGLISH);
                // check section name
                if (IGNORED_SECTIONS.contains(str))
                    ignoringSection = true;
                else {
                    // kept section: insert a separator, then the header itself
                    ignoringSection = false;
                    rawStr.append("\n-----");
                    linkedStr.append("\n-----");
                    processParagraphElement(element, rawStr, linkedStr);
                }
            }

            else if (!ignoringSection) { // lower sections
                if (eltName.equals(XmlNames.ELT_H3) || eltName.equals(XmlNames.ELT_H4)
                        || eltName.equals(XmlNames.ELT_H5) || eltName.equals(XmlNames.ELT_H6)) {
                    first = false;
                    processParagraphElement(element, rawStr, linkedStr);
                }

                // paragraph
                else if (eltName.equals(XmlNames.ELT_P)) {
                    String str = element.text();
                    // ignore possible initial disambiguation link
                    if (!first || !str.startsWith(PARAGRAPH_FORTHE)) {
                        first = false;
                        processParagraphElement(element, rawStr, linkedStr);
                    }
                }

                // list
                else if (eltName.equals(XmlNames.ELT_UL)) {
                    first = false;
                    processListElement(element, rawStr, linkedStr, false);
                } else if (eltName.equals(XmlNames.ELT_OL)) {
                    first = false;
                    processListElement(element, rawStr, linkedStr, true);
                } else if (eltName.equals(XmlNames.ELT_DL)) {
                    first = false;
                    processDescriptionListElement(element, rawStr, linkedStr);
                }

                // tables
                // (here and in the branches below, 'first' becomes the negation of the
                // boolean returned by the processing method, even if it was already false)
                else if (eltName.equals(XmlNames.ELT_TABLE)) {
                    first = !processTableElement(element, rawStr, linkedStr);
                }

                // divisions
                else if (eltName.equals(XmlNames.ELT_DIV)) { // ignore possible initial picture
                    if (!first || eltClass == null || !eltClass.contains(CLASS_THUMB))
                        first = !processDivisionElement(element, rawStr, linkedStr);
                }

                // we ignore certain types of span (phonetic transcription, WP buttons...)
                else if (eltName.equals(XmlNames.ELT_SPAN)) {
                    first = !processSpanElement(element, rawStr, linkedStr);
                }

                // hyperlinks must be included in the linked string, provided they are not external
                else if (eltName.equals(XmlNames.ELT_A)) {
                    first = !processHyperlinkElement(element, rawStr, linkedStr);
                }

                // quotes are just processed recursively
                else if (eltName.equals(XmlNames.ELT_BLOCKQUOTE)) {
                    first = !processQuoteElement(element, rawStr, linkedStr);
                }

                // other tags are ignored
            }
        }

        // create article object
        result = new Article(name);
        result.setTitle(title);
        result.setUrl(url);
        result.initDate();

        // clean text
        String rawText = rawStr.toString();
        rawText = cleanText(rawText);
        //         rawText = ArticleCleaning.replaceChars(rawText);
        result.setRawText(rawText);
        logger.log("Length of the raw text: " + rawText.length() + " chars.");
        String linkedText = linkedStr.toString();
        linkedText = cleanText(linkedText);
        //         linkedText = ArticleCleaning.replaceChars(linkedText);
        result.setLinkedText(linkedText);
        logger.log("Length of the linked text: " + linkedText.length() + " chars.");

        // get original html source code
        logger.log("Get original HTML source code.");
        String originalPage = document.toString();
        result.setOriginalPage(originalPage);
        logger.log("Length of the original page: " + originalPage.length() + " chars.");

        // get the categories of the article
        List<ArticleCategory> categories = getArticleCategories(result);
        result.setCategories(categories);

        long endTime = System.currentTimeMillis();
        logger.log("Total duration: " + (endTime - startTime) + " ms.");
    } catch (ClientProtocolException e) {
        // NOTE(review): failures are only printed, not propagated to the caller
        e.printStackTrace();
    } catch (ParseException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    } catch (org.json.simple.parser.ParseException e) {
        e.printStackTrace();
    }

    // may be null or incomplete if one of the exceptions above occurred
    return result;
}