Example usage for org.jsoup.nodes Document baseUri

List of usage examples for org.jsoup.nodes Document baseUri

Introduction

This page collects example usages of the org.jsoup.nodes.Document.baseUri() method.

Prototype

@Override
    public String baseUri() 

Source Link

Usage

From source file:app.data.parse.WebPageUtil.java

/**
 * Loads the page behind {@code url} and collects its resolved URL, title and a short description.
 *
 * <p>The (possibly rewritten) URL is used as the cache key: a cached entry is returned as-is,
 * and a freshly built entry is stored under the same key before returning.
 *
 * <p>Title lookup order: {@code og:title} meta tag, {@code twitter:title} meta tag, document title.
 * Description lookup order: {@code og:description}, {@code twitter:description},
 * {@code meta[name=description]}, then the text of the body's {@code <p>} elements.
 *
 * @param url          address of the page to inspect
 * @param urlInfoCache optional cache of already parsed pages; may be {@code null}
 * @return the cached or newly extracted page information
 * @throws IOException declared for failures while requesting the page
 */
public static WebPageInfo parse(String url, Cache<String, WebPageInfo> urlInfoCache) throws IOException {
    String original = url;

    // hit toutiao.io
    // fixme http://toutiao.io/shares/640539/url
    if (original.startsWith("https://toutiao.io/posts/")) {
        original = original.replace("/posts/", "/k/");
    }

    // check cache
    WebPageInfo info = urlInfoCache != null ? urlInfoCache.getIfPresent(original) : null;
    if (info != null) {
        return info;
    }
    info = new WebPageInfo();
    info.url = original;

    // attach url
    Document doc = requestUrl(info.url);
    info.url = doc.baseUri(); // or doc.location()

    // hit gold.xitu.io
    if (info.url.startsWith("http://gold.xitu.io/entry/")) {
        String shareLink = doc.select("div[class=ellipsis]").select("a[class=share-link]").attr("href");
        // attr() yields an empty string when no share link matched; following it would
        // request an empty URL, so the entry page itself is kept in that case.
        if (!StringUtils.isEmpty(shareLink)) {
            info.url = shareLink;

            // reconnect
            doc = requestUrl(info.url);
            info.url = doc.baseUri(); // or doc.location()
        }
    }

    info.url = smartUri(info.url);

    // get title (select() never returns null, so no null checks are needed on its result)
    info.title = doc.select("meta[property=og:title]").attr("content");
    if (StringUtils.isEmpty(info.title)) {
        info.title = doc.select("meta[property=twitter:title]").attr("content");
    }
    if (StringUtils.isEmpty(info.title)) {
        info.title = doc.title();
    }

    // get desc
    info.description = doc.select("meta[property=og:description]").attr("content");
    if (StringUtils.isEmpty(info.description)) {
        info.description = doc.select("meta[property=twitter:description]").attr("content");
    }
    if (StringUtils.isEmpty(info.description)) {
        info.description = doc.select("meta[name=description]").attr("content");
    }
    // NOTE(review): body() may be null for documents without a <body> (e.g. framesets) in
    // some jsoup versions - confirm; the guard keeps the paragraph fallback from throwing.
    if (StringUtils.isEmpty(info.description) && doc.body() != null) {
        // Take the first paragraph with at least 20 characters; if none is that long,
        // the text of the last paragraph visited is what remains.
        for (Element paragraph : doc.body().select("p")) {
            info.description = paragraph.text();
            if (info.description.length() >= 20) {
                break;
            }
        }
    }
    info.description = ellipsis(info.description, 140, "...");

    // cache info
    if (urlInfoCache != null) {
        urlInfoCache.put(original, info);
    }
    return info;
}

From source file:ac.simons.oembed.Oembed.java

/**
 * Processes every anchor element of the given document and tries to replace it
 * with embedded content.<br>
 * For each anchor the absolute URL of its {@code href} is passed to
 * {@code transformUrl}. If a response is returned and a handler is registered for
 * the response's source, that handler is invoked with the document, the anchor and
 * the response. Otherwise, if the response renders to a non-null string, the
 * rendered markup is inserted before the anchor and the anchor is removed.<br>
 * Anchors without a response, and anchors whose processing raises an
 * {@link OembedException}, are left as they were.<br>
 * If the document has no base URI and this instance provides one, it is set for
 * the duration of the call and reset to {@code null} afterwards.
 *
 * @param document The document that should be checked for links to transform
 * @return the same document instance, after transformation
 */
public Document transformDocument(final Document document) {
    final boolean changedBaseUri = document.baseUri() == null && this.getBaseUri() != null;
    if (changedBaseUri) {
        document.setBaseUri(this.getBaseUri());
    }

    for (Element a : document.getElementsByTag("a")) {
        final String href = a.absUrl("href");
        try {
            final OembedResponse oembedResponse = this.transformUrl(href);
            if (oembedResponse == null) {
                // There was no response or an exception happened
                continue;
            }

            if (this.getHandler().containsKey(oembedResponse.getSource())) {
                // There is a handler for this response
                this.getHandler().get(oembedResponse.getSource()).handle(document, a, oembedResponse);
                continue;
            }

            // Try to render the response itself and replace the current anchor
            final String markup = oembedResponse.render();
            if (markup != null) {
                a.before(markup);
                a.remove();
            }
        } catch (OembedException e) {
            logger.warn(String.format("Skipping '%s': %s", href, e.getMessage()));
        }
    }

    if (changedBaseUri) {
        document.setBaseUri(null);
    }
    return document;
}

From source file:de.ncoder.studipsync.studip.jsoup.JsoupStudipAdapter.java

/**
 * Stores the given document as the current one and notifies all navigation
 * listeners with the URL built from the document's base URI.
 *
 * @param document the document that was navigated to
 * @throws StudipException if the document's base URI is not a well-formed URL;
 *                         the exception carries the URI and the document as details
 */
private void setDocument(Document document) throws StudipException {
    this.document = document;
    try {
        URL location = new URL(document.baseUri());
        log.trace("NAV: " + location);
        for (NavigationListener navigationListener : listeners) {
            navigationListener.navigated(location);
        }
    } catch (MalformedURLException e) {
        String baseUri = document.baseUri();
        StudipException failure = new StudipException("Illegal URL " + baseUri, e);
        failure.put("studip.url", baseUri);
        failure.put("studip.document", document);
        throw failure;
    }
}

From source file:me.vertretungsplan.parser.UntisCommonParser.java

/**
 * Splits a monitor page that may contain several days into one document per day,
 * parses each with {@code parseMonitorDay} and adds the result to the schedule.
 *
 * <p>Three layouts are distinguished:
 * <ul>
 * <li>more than one {@code .mon_head} element: one day per head element;</li>
 * <li>otherwise more than one {@code .mon_title} element: one day per title element;</li>
 * <li>otherwise the whole document is parsed as a single day.</li>
 * </ul>
 *
 * <p>NOTE(review): several nodes are appended to the per-day document without
 * {@code clone()}; jsoup's {@code appendChild} presumably reparents them, i.e. removes
 * them from {@code doc} - confirm. The selectors are re-evaluated on every iteration, so
 * the statement order in the loops matters.
 *
 * @param v    schedule that receives the parsed days
 * @param doc  the monitor page
 * @param data passed through unchanged to {@code parseMonitorDay}
 */
void parseMultipleMonitorDays(SubstitutionSchedule v, Document doc, JSONObject data)
        throws JSONException, CredentialInvalidException {
    if (doc.select(".mon_head").size() > 1) {
        for (int j = 0; j < doc.select(".mon_head").size(); j++) {
            // Fresh empty document for this day, sharing the page's base URI
            Document doc2 = Document.createShell(doc.baseUri());
            doc2.body().appendChild(doc.select(".mon_head").get(j).clone());
            Element next = doc.select(".mon_head").get(j).nextElementSibling();
            if (next != null && next.tagName().equals("center")) {
                // The day's content is wrapped in a <center> element directly after the head.
                // NOTE(review): first() is dereferenced unchecked - presumably a .mon_title is always present here; confirm
                doc2.body().appendChild(next.select(".mon_title").first().clone());
                if (next.select("table:has(tr.list)").size() > 0) {
                    doc2.body().appendChild(next.select("table:has(tr.list)").first());
                }
                if (next.select("table.info").size() > 0) {
                    doc2.body().appendChild(next.select("table.info").first());
                }
            } else if (doc.select(".mon_title").size() - 1 >= j) {
                // No <center> wrapper: pair the j-th head with the j-th title and j-th list table of the page
                doc2.body().appendChild(doc.select(".mon_title").get(j).clone());
                doc2.body().appendChild(doc.select("table:has(tr.list)").get(j).clone());
            } else {
                // Nothing that can be matched to this head element
                continue;
            }
            SubstitutionScheduleDay day = parseMonitorDay(doc2, data);
            v.addDay(day);
        }
    } else if (doc.select(".mon_title").size() > 1) {
        for (int j = 0; j < doc.select(".mon_title").size(); j++) {
            Document doc2 = Document.createShell(doc.baseUri());
            doc2.body().appendChild(doc.select(".mon_title").get(j).clone());
            Element next = doc.select(".mon_title").get(j).nextElementSibling();
            // Append the title's following siblings one by one until there is none left
            // or a <center> element is reached.
            while (next != null && !next.tagName().equals("center")) {
                doc2.body().appendChild(next);
                // Re-query the sibling of the title: presumably the node just appended has
                // left doc, so this yields the element that followed it - confirm
                next = doc.select(".mon_title").get(j).nextElementSibling();
            }
            SubstitutionScheduleDay day = parseMonitorDay(doc2, data);
            v.addDay(day);
        }
    } else {
        SubstitutionScheduleDay day = parseMonitorDay(doc, data);
        v.addDay(day);
    }
}

From source file:de.geeksfactory.opacclient.apis.Zones.java

/**
 * Collects the lent items of the given page and of every following page.
 *
 * <p>Each page is parsed with {@code parseMediaList}; as long as
 * {@code findNextPageUrl} returns a URL, that page is fetched, given the base URI
 * of the page it was reached from, and processed the same way.
 *
 * @param lentDoc first page of the list
 * @param items   list the parsed items are appended to
 * @throws IOException declared for failures while fetching a following page
 */
private void loadMediaList(Document lentDoc, List<LentItem> items) throws IOException {
    Document page = lentDoc;
    while (true) {
        items.addAll(parseMediaList(page));
        String nextPageUrl = findNextPageUrl(page);
        if (nextPageUrl == null) {
            return;
        }
        Document following = Jsoup.parse(httpGet(nextPageUrl, getDefaultEncoding()));
        following.setBaseUri(page.baseUri());
        page = following;
    }
}

From source file:de.geeksfactory.opacclient.apis.Zones.java

/**
 * Collects the reserved items of the given page and of every following page.
 *
 * <p>Each page is parsed with {@code parseResList}; as long as
 * {@code findNextPageUrl} returns a URL, that page is fetched, given the base URI
 * of the page it was reached from, and processed the same way.
 *
 * @param lentDoc first page of the list
 * @param items   list the parsed items are appended to
 * @throws IOException declared for failures while fetching a following page
 */
private void loadResList(Document lentDoc, List<ReservedItem> items) throws IOException {
    Document page = lentDoc;
    while (true) {
        items.addAll(parseResList(page));
        String nextPageUrl = findNextPageUrl(page);
        if (nextPageUrl == null) {
            return;
        }
        Document following = Jsoup.parse(httpGet(nextPageUrl, getDefaultEncoding()));
        following.setBaseUri(page.baseUri());
        page = following;
    }
}

From source file:no.kantega.publishing.admin.content.htmlfilter.RemoveNestedSpanTagsFilter.java

/**
 * Builds a new shell document with the same base URI as the input and copies the
 * input body's content into it via {@code copySafeNodes}.
 *
 * @param document the document to filter
 * @return the newly created document; its body stays empty when the input has no body
 */
@Override
public Document runFilter(Document document) {
    final Document filtered = Document.createShell(document.baseUri());
    if (document.body() == null) {
        // frameset documents won't have a body, so there is nothing to copy
        return filtered;
    }
    copySafeNodes(document.body(), filtered.body());
    return filtered;
}

From source file:org.b3log.symphony.service.LinkForgeMgmtService.java

/**
 * Forges the specified URL.
 *
 * <p>Fetches the page, extracts links from it via {@code Links.getLinks}, then, inside one
 * repository transaction, adds or updates a link record for each extracted link and
 * rebuilds the tag-user-link relations for every cached tag that matches the link's
 * title words. Failures are logged and not propagated to the caller.
 *
 * @param url the specified URL
 * @param userId the specified user id
 */
public void forge(final String url, final String userId) {
    String html;
    String baseURL;
    try {
        final Document doc = Jsoup.connect(url).timeout(5000).userAgent(Symphonys.USER_AGENT_BOT).get();

        // NOTE(review): url is concatenated into markup unescaped - confirm it cannot contain quotes or tags
        doc.select("body").prepend("<a href=\"" + url + "\">" + url + "</a>"); // Add the specified URL itself

        html = doc.html();

        baseURL = doc.baseUri();
    } catch (final Exception e) {
        // Fetching or parsing failed: log and give up without touching the repositories
        LOGGER.log(Level.ERROR, "Parses link [" + url + "] failed", e);

        return;
    }

    final List<JSONObject> links = Links.getLinks(baseURL, html);
    final List<JSONObject> cachedTags = tagCache.getTags();

    final Transaction transaction = linkRepository.beginTransaction();
    try {
        for (final JSONObject lnk : links) {
            final String addr = lnk.optString(Link.LINK_ADDR);
            JSONObject link = linkRepository.getLink(addr);

            if (null == link) {
                // Unknown address: create a record with zeroed counters
                link = new JSONObject();
                link.put(Link.LINK_ADDR, lnk.optString(Link.LINK_ADDR));
                link.put(Link.LINK_BAD_CNT, 0);
                link.put(Link.LINK_BAIDU_REF_CNT, 0);
                link.put(Link.LINK_CLICK_CNT, 0);
                link.put(Link.LINK_GOOD_CNT, 0);
                link.put(Link.LINK_SCORE, 0);
                link.put(Link.LINK_SUBMIT_CNT, 0);
                link.put(Link.LINK_TITLE, lnk.optString(Link.LINK_TITLE));
                link.put(Link.LINK_TYPE, Link.LINK_TYPE_C_FORGE);

                LOGGER.info(link.optString(Link.LINK_ADDR) + "____" + link.optString(Link.LINK_TITLE));
                linkRepository.add(link);

                // Increment the stored link-count statistic by one
                final JSONObject linkCntOption = optionRepository.get(Option.ID_C_STATISTIC_LINK_COUNT);
                final int linkCnt = linkCntOption.optInt(Option.OPTION_VALUE);
                linkCntOption.put(Option.OPTION_VALUE, linkCnt + 1);
                optionRepository.update(Option.ID_C_STATISTIC_LINK_COUNT, linkCntOption);
            } else {
                // Known address: refresh reference count, title and score from the extracted link
                link.put(Link.LINK_BAIDU_REF_CNT, lnk.optInt(Link.LINK_BAIDU_REF_CNT));
                link.put(Link.LINK_TITLE, lnk.optString(Link.LINK_TITLE));
                link.put(Link.LINK_SCORE, lnk.optInt(Link.LINK_BAIDU_REF_CNT)); // XXX: Need a score algorithm

                linkRepository.update(link.optString(Keys.OBJECT_ID), link);
            }

            // NOTE(review): for a newly added link this assumes linkRepository.add() stored the id in the object - confirm
            final String linkId = link.optString(Keys.OBJECT_ID);
            final double linkScore = link.optDouble(Link.LINK_SCORE, 0D);
            // Words used for tag matching: title plus keywords, split on single spaces
            String title = link.optString(Link.LINK_TITLE) + " " + link.optString(Link.LINK_T_KEYWORDS);
            title = Pangu.spacingText(title);
            String[] titles = title.split(" ");
            titles = Strings.trimAll(titles);

            for (final JSONObject cachedTag : cachedTags) {
                final String tagId = cachedTag.optString(Keys.OBJECT_ID);

                final String tagTitle = cachedTag.optString(Tag.TAG_TITLE);
                // Skip tags that do not match; presumably a case-insensitive test of the tag title against the words - confirm
                if (!Strings.containsIgnoreCase(tagTitle, titles)) {
                    continue;
                }

                final JSONObject tag = tagRepository.get(tagId);

                // clean
                tagUserLinkRepository.removeByTagIdUserIdAndLinkId(tagId, userId, linkId);

                // re-add
                final JSONObject tagLinkRel = new JSONObject();
                tagLinkRel.put(Tag.TAG_T_ID, tagId);
                tagLinkRel.put(UserExt.USER_T_ID, userId);
                tagLinkRel.put(Link.LINK_T_ID, linkId);
                tagLinkRel.put(Link.LINK_SCORE, linkScore);
                tagUserLinkRepository.add(tagLinkRel);

                // refresh link score
                tagUserLinkRepository.updateTagLinkScore(tagId, linkId, linkScore);

                // re-calc tag link count
                final int tagLinkCnt = tagUserLinkRepository.countTagLink(tagId);
                tag.put(Tag.TAG_LINK_CNT, tagLinkCnt);
                tagRepository.update(tagId, tag);
            }
        }

        transaction.commit();

        LOGGER.info("Forged link [" + url + "]");
    } catch (final Exception e) {
        // Undo everything written so far if the transaction is still open
        if (transaction.isActive()) {
            transaction.rollback();
        }

        LOGGER.log(Level.ERROR, "Saves links failed", e);
    }
}

From source file:org.cellcore.code.engine.page.extractor.starcity.STCPageDataExtractor.java

/**
 * Returns the card name found in the element with id {@code custom_card_name_STC}.
 *
 * <p>Before the lookup, the part of the document's base URI after its last
 * {@code '='} (the whole URI when there is none) is handed to {@code jsonProc}
 * together with the document.
 *
 * @param doc the card page
 * @return the text of the name element, or {@code null} if the element is absent
 * @throws UnsupportedCardException if an {@code h3} element containing "Foil" is present
 */
@Override
protected String getName(Document doc) throws UnsupportedCardException {
    final boolean foil = !doc.select("h3").select(":contains(Foil)").isEmpty();
    if (foil) {
        throw new UnsupportedCardException("foil");
    }

    final String pageUri = doc.baseUri();
    final String cardCode = pageUri.substring(pageUri.lastIndexOf('=') + 1);
    jsonProc(cardCode, doc);

    return doc.getElementById("custom_card_name_STC") != null
            ? doc.getElementById("custom_card_name_STC").text()
            : null;
}

From source file:org.openmrs.module.radiology.report.template.DefaultMrrtReportTemplateFileParser.java

/**
 * Fills the template from the document's base URI and its {@code meta} elements.
 *
 * <p>The path is set to the document's base URI and the charset to the
 * {@code charset} attribute value obtained from the meta elements. Each meta
 * element whose {@code name} equals one of the {@code DCTERMS_*} constants has its
 * {@code content} copied into the matching template property; any other name is
 * logged at debug level.
 *
 * @param template the template to populate
 * @param doc      the parsed template file
 */
private final void initializeTemplate(MrrtReportTemplate template, Document doc) {
    final Elements metaTags = doc.getElementsByTag("meta");

    template.setPath(doc.baseUri());
    template.setCharset(metaTags.attr("charset"));

    for (Element metaTag : metaTags) {
        final String name = metaTag.attr("name");
        final String content = metaTag.attr("content");

        if (name.equals(DCTERMS_TITLE)) {
            template.setDcTermsTitle(content);
        } else if (name.equals(DCTERMS_DESCRIPTION)) {
            template.setDcTermsDescription(content);
        } else if (name.equals(DCTERMS_IDENTIFIER)) {
            template.setDcTermsIdentifier(content);
        } else if (name.equals(DCTERMS_TYPE)) {
            template.setDcTermsType(content);
        } else if (name.equals(DCTERMS_LANGUAGE)) {
            template.setDcTermsLanguage(content);
        } else if (name.equals(DCTERMS_PUBLISHER)) {
            template.setDcTermsPublisher(content);
        } else if (name.equals(DCTERMS_RIGHTS)) {
            template.setDcTermsRights(content);
        } else if (name.equals(DCTERMS_LICENSE)) {
            template.setDcTermsLicense(content);
        } else if (name.equals(DCTERMS_DATE)) {
            template.setDcTermsDate(content);
        } else if (name.equals(DCTERMS_CREATOR)) {
            template.setDcTermsCreator(content);
        } else {
            log.debug("Unhandled meta tag " + name);
        }
    }
}