Example usage for org.jsoup.nodes Document setBaseUri

List of usage examples for org.jsoup.nodes Document setBaseUri

Introduction

On this page you can find example usages of org.jsoup.nodes.Document.setBaseUri.

Prototype

public void setBaseUri(final String baseUri) 

Source Link

Document

Update the base URI of this node and all of its descendants.

Usage

From source file:me.vertretungsplan.parser.UntisInfoHeadlessParser.java

/**
 * Logs in, downloads the page at {@code url} and converts every day heading found in the
 * {@code #vertretung} container into a {@link SubstitutionScheduleDay} of the returned schedule.
 * <p>
 * If the downloaded page contains no day headings but has a {@code frame} whose {@code src}
 * contains {@code w00}, the first such frame is downloaded and parsed instead.
 *
 * @return the schedule built from {@code scheduleData}, with classes and teachers filled in
 */
@Override
public SubstitutionSchedule getSubstitutionSchedule()
        throws IOException, JSONException, CredentialInvalidException {
    new LoginHandler(scheduleData, credential, cookieProvider).handleLogin(executor, cookieStore);

    SubstitutionSchedule schedule = SubstitutionSchedule.fromData(scheduleData);

    final String dayHeadingSelector = "#vertretung > p > b, #vertretung > b";

    Document page = Jsoup.parse(httpGet(url, data.optString(PARAM_ENCODING, null)));
    // The base URI is required so that a frame's relative "src" can be resolved via absUrl() below.
    page.setBaseUri(url);
    Elements dayHeadings = page.select(dayHeadingSelector);

    Elements frames = page.select("frame[src*=w00]");
    if (dayHeadings.isEmpty() && !frames.isEmpty()) {
        // The actual schedule is embedded in a frame: load the first matching frame instead.
        String frameUrl = frames.get(0).absUrl("src");
        page = Jsoup.parse(httpGet(frameUrl, data.optString(PARAM_ENCODING, null)));
        dayHeadings = page.select(dayHeadingSelector);
    }

    for (Element heading : dayHeadings) {
        SubstitutionScheduleDay day = new SubstitutionScheduleDay();
        day.setLastChangeString("");

        String dateText = heading.text();
        day.setDateString(dateText);
        day.setDate(ParserUtils.parseDate(dateText));

        // The element handed to parseDay depends on whether the heading is wrapped in a <p>:
        // wrapped -> second sibling after that <p>; unwrapped -> sibling after the parent's first <p>.
        Element parent = heading.parent();
        Element dayContent = "p".equals(parent.tagName())
                ? parent.nextElementSibling().nextElementSibling()
                : parent.select("p").first().nextElementSibling();

        parseDay(day, dayContent, schedule, null);
    }

    schedule.setClasses(getAllClasses());
    schedule.setTeachers(getAllTeachers());
    return schedule;
}

From source file:de.geeksfactory.opacclient.apis.IOpac.java

/**
 * Reads the reservations table that follows the {@code a[name=RES]} anchor and appends one
 * {@link ReservedItem} per data row (every row except the first) to {@code media}.
 * <p>
 * Does nothing when the anchor is missing, when no table containing "Titel" follows it, or when
 * the table has fewer than two rows. As a side effect the document's base URI is set to the
 * {@code "baseurl"} value of {@code data} whenever the anchor is present.
 *
 * @param media list the parsed reservations are appended to
 * @param doc   parsed account page
 * @param data  configuration object; only {@code "baseurl"} is read here
 */
static void parseResList(List<ReservedItem> media, Document doc, JSONObject data) {
    if (doc.select("a[name=RES]").isEmpty()) {
        return;
    }

    Element table = doc.select("a[name=RES] ~ table:contains(Titel)").first();
    doc.setBaseUri(data.optString("baseurl"));
    if (table == null) {
        // Anchor exists but no matching table follows it: first() returns null for an empty
        // selection, so bail out instead of failing with a NullPointerException.
        return;
    }

    Elements rows = table.select("tr");
    int rowCount = rows.size();
    if (rowCount < 2) {
        // Only a header row (or nothing at all): no reservations to report.
        return;
    }

    DateTimeFormatter fmt = DateTimeFormat.forPattern("dd.MM.yyyy").withLocale(Locale.GERMAN);

    // Row 0 is skipped (treated as the header row).
    for (int i = 1; i < rowCount; i++) {
        Element row = rows.get(i);
        ReservedItem item = new ReservedItem();

        item.setTitle(row.child(0).text().trim().replace("\u00a0", ""));
        item.setAuthor(row.child(1).text().trim().replace("\u00a0", ""));

        // Column 4 holds either a date (dd.MM.yyyy) or a free-text status.
        String readyOrStatus = row.child(4).text().trim().replace("\u00a0", "");
        try {
            item.setReadyDate(fmt.parseLocalDate(readyOrStatus));
        } catch (IllegalArgumentException e) {
            // Not a parseable date: keep the raw text as the status instead.
            item.setStatus(readyOrStatus);
        }

        Elements links = row.select("a");
        if (!links.isEmpty()) {
            // The last link in the row is used as the cancel data.
            item.setCancelData(links.last().attr("href"));
        }

        media.add(item);
    }
}

From source file:com.bluedragon.search.index.crawl.handler.FileHandlerHTMLImpl.java

/**
 * Reads an HTML file, parses it with jsoup and wraps its text content and metadata in a
 * {@link DocumentWrap}.
 *
 * @param uriroot root URI used as the document's base URI and to derive the document URL;
 *                when {@code null} neither the base URI nor the URL is set
 * @param file    HTML file to read
 * @return the populated document wrapper
 * @throws CrawlException if the file cannot be found or any other error occurs while processing it
 */
public DocumentWrap crawl(String uriroot, File file) throws CrawlException {
    final DocumentWrap wrapped = new DocumentWrap();
    final boolean hasUriRoot = uriroot != null;

    try {
        // Load the raw HTML and parse it
        final Document html = Jsoup.parse(FileUtils.readFileToString(file));

        if (hasUriRoot) {
            html.setBaseUri(uriroot);
        }

        setAnchors(html, uriroot);

        // Populate the wrapper
        wrapped.setContent(html.text(), bStoreBody);
        wrapped.setSize((int) file.length());
        wrapped.setType("text/html");
        wrapped.setId(file.getCanonicalPath());

        if (hasUriRoot) {
            wrapped.setURL(getUrl(uriroot, file));
        }
    } catch (FileNotFoundException e) {
        throw new CrawlException("File not found: " + file, e);
    } catch (Exception e) {
        // Covers IOException as well as anything else thrown while processing the file.
        throw new CrawlException("File: " + file, e);
    }

    return wrapped;
}

From source file:de.geeksfactory.opacclient.apis.IOpac.java

/**
 * Reads the table of lent items that follows the {@code a[name=AUS]} anchor and appends one
 * {@link LentItem} per data row (every row except the first) to {@code media}.
 * <p>
 * Column positions come from the optional {@code "accounttable"} object in {@code data}; the
 * defaults are title=0, author=1, format=2, prolongcount=3, returndate=4, prolongurl=5. A negative
 * index disables the corresponding column. Does nothing when the anchor is missing, when no table
 * follows it, or when the table has fewer than two rows. As a side effect the document's base URI
 * is set to the {@code "baseurl"} value of {@code data} whenever the anchor is present.
 *
 * @param media list the parsed items are appended to
 * @param doc   parsed account page
 * @param data  configuration object ({@code "baseurl"}, {@code "accounttable"},
 *              {@code "maxprolongcount"} are read)
 * @throws NumberFormatException if the prolong-count cell does not contain an integer
 */
static void parseMediaList(List<LentItem> media, Document doc, JSONObject data) {
    if (doc.select("a[name=AUS]").isEmpty()) {
        return;
    }

    Element table = doc.select("a[name=AUS] ~ table, a[name=AUS] ~ form table").first();
    doc.setBaseUri(data.optString("baseurl"));
    if (table == null) {
        // Anchor exists but no table follows it: first() returns null for an empty selection,
        // so bail out instead of failing with a NullPointerException.
        return;
    }

    Elements copytrs = table.select("tr");
    int trs = copytrs.size();
    if (trs < 2) {
        // Only a header row (or nothing at all): no lent items to report.
        return;
    }

    DateTimeFormatter fmt = DateTimeFormat.forPattern("dd.MM.yyyy").withLocale(Locale.GERMAN);

    JSONObject copymap = new JSONObject();
    try {
        if (data.has("accounttable")) {
            copymap = data.getJSONObject("accounttable");
        }
    } catch (JSONException ignored) {
        // "accounttable" is present but not a JSON object: keep the default column layout.
    }

    // The column layout does not change between rows, so resolve it once.
    final int titleCol = copymap.optInt("title", 0);
    final int authorCol = copymap.optInt("author", 1);
    final int formatCol = copymap.optInt("format", 2);
    final int prolongCountCol = copymap.optInt("prolongcount", 3);
    final int returnDateCol = copymap.optInt("returndate", 4);
    final int prolongUrlCol = copymap.optInt("prolongurl", 5);
    final int maxProlongCount = data.optInt("maxprolongcount", -1);

    final Pattern datePattern = Pattern.compile("\\d{2}\\.\\d{2}\\.\\d{4}");
    final Pattern mediaNumberPattern = Pattern.compile("mednr=([^&]*)&");

    // Row 0 is skipped (treated as the header row).
    for (int i = 1; i < trs; i++) {
        Element tr = copytrs.get(i);
        LentItem item = new LentItem();

        if (titleCol >= 0) {
            item.setTitle(tr.child(titleCol).text().trim().replace("\u00a0", ""));
        }
        if (authorCol >= 0) {
            item.setAuthor(tr.child(authorCol).text().trim().replace("\u00a0", ""));
        }
        if (formatCol >= 0) {
            item.setFormat(tr.child(formatCol).text().trim().replace("\u00a0", ""));
        }

        int prolongCount = 0;
        if (prolongCountCol >= 0) {
            prolongCount = Integer
                    .parseInt(tr.child(prolongCountCol).text().trim().replace("\u00a0", ""));
            item.setStatus(String.valueOf(prolongCount) + "x verl.");
        }
        if (maxProlongCount != -1) {
            item.setRenewable(prolongCount < maxProlongCount);
        }

        if (returnDateCol >= 0) {
            String value = tr.child(returnDateCol).text().trim().replace("\u00a0", "");
            // The cell may contain extra text; only the first dd.MM.yyyy occurrence is used.
            Matcher dateMatcher = datePattern.matcher(value);
            if (dateMatcher.find()) {
                try {
                    item.setDeadline(fmt.parseLocalDate(dateMatcher.group()));
                } catch (IllegalArgumentException e1) {
                    // Looks like a date but is not a valid one: leave the deadline unset.
                    e1.printStackTrace();
                }
            }
        }

        if (prolongUrlCol >= 0 && tr.children().size() > prolongUrlCol) {
            Element cell = tr.child(prolongUrlCol);
            Elements checkboxes = cell.select("input[name=MedNrVerlAll]");
            if (!checkboxes.isEmpty()) {
                // new iOPAC Version 1.45 - checkboxes to prolong multiple items
                // internal convention: We add "NEW" to the media ID to show that we have
                // the new iOPAC version
                Element input = checkboxes.first();
                String value = input.val();
                item.setProlongData("NEW" + value);
                item.setId(value.split(";")[0]);
                if (input.hasAttr("disabled")) {
                    item.setRenewable(false);
                }
            } else {
                // previous versions - link for prolonging on every medium
                String link = cell.select("a").attr("href");
                item.setProlongData(link);
                // find media number with regex
                Matcher idMatcher = mediaNumberPattern.matcher(link);
                if (idMatcher.find()) {
                    item.setId(idMatcher.group(1));
                }
            }
        }

        media.add(item);
    }
}

From source file:me.vertretungsplan.parser.UntisMonitorParser.java

/**
 * Loads one page, collects the schedule documents found in it via {@code findSubDocs} and
 * optionally follows frames and meta-refresh redirects recursively.
 *
 * @param url            page to load; {@code VALUE_URL_LOGIN_RESPONSE} means "use the stored
 *                       login response instead of issuing a request"
 * @param encoding       encoding passed to {@code httpGet}
 * @param following      whether {@code meta[http-equiv=refresh]} redirects should be followed
 * @param docs           output list the found documents are collected in
 * @param startUrl       URL the redirect chain started at; a redirect back to it stops the chain
 * @param recursionDepth current depth; recursion stops at {@code MAX_RECURSION_DEPTH}
 * @throws IOException                if loading fails before any document was collected, or no
 *                                    schedule could be found on the first page
 * @throws CredentialInvalidException if the page text contains "registriert" instead of a schedule
 */
private void loadUrl(String url, String encoding, boolean following, List<Document> docs, String startUrl,
        int recursionDepth) throws IOException, CredentialInvalidException {
    String html;
    if (url.equals(VALUE_URL_LOGIN_RESPONSE)) {
        html = loginResponse;
    } else {
        try {
            html = httpGet(url, encoding).replace("&nbsp;", "");
        } catch (HttpResponseException e) {
            if (docs.isEmpty()) {
                throw e;
            }
            return; // ignore if first page was loaded and redirect didn't work
        }
    }
    Document doc = Jsoup.parse(html);
    // The base URI is required so that frame "src" attributes can be resolved via absUrl().
    doc.setBaseUri(url);

    if (doc.select(".mon_title").isEmpty()) {
        // We have a problem - there seems to be no substitution schedule. Maybe it is hiding
        // inside a frame?
        // The attribute selector must be closed with "]"; jsoup rejects unbalanced selectors.
        if (!doc.select("frameset frame[name]").isEmpty()) {
            for (Element frame : doc.select("frameset frame")) {
                if (frame.attr("src").matches(".*subst_\\d\\d\\d.html?")
                        && recursionDepth < MAX_RECURSION_DEPTH) {
                    // The frame becomes the start of its own redirect chain.
                    String frameUrl = frame.absUrl("src");
                    loadUrl(frameUrl, encoding, following, docs, frameUrl, recursionDepth + 1);
                }
            }
        } else if (doc.text().contains("registriert")) {
            throw new CredentialInvalidException();
        } else if (docs.isEmpty()) {
            // Nothing was loaded so far, so this is a hard failure. If earlier pages were
            // loaded already, a page without a schedule is silently ignored.
            throw new IOException(
                    "Could not find .mon-title, seems like there is no Untis " + "schedule here");
        }
    } else {
        findSubDocs(docs, html, doc);

        if (following && !doc.select("meta[http-equiv=refresh]").isEmpty()) {
            Element meta = doc.select("meta[http-equiv=refresh]").first();
            // NOTE(review): the whole content value (including the target file name) is
            // lower-cased, and a content value without "url=" is not handled specially.
            String attr = meta.attr("content").toLowerCase();
            // The redirect target is resolved relative to the directory of the current URL.
            String redirectUrl = url.substring(0, url.lastIndexOf("/") + 1)
                    + attr.substring(attr.indexOf("url=") + 4);
            if (!redirectUrl.equals(startUrl) && recursionDepth < MAX_RECURSION_DEPTH) {
                loadUrl(redirectUrl, encoding, true, docs, startUrl, recursionDepth + 1);
            }
        }
    }
}

From source file:ac.simons.oembed.Oembed.java

/**
 * Processes all anchor elements of the given document. For every anchor it tries to get an
 * oembed response for the anchor's absolute URL and then either hands the response to the handler
 * registered for its source or renders it into the document in place of the anchor.<br>
 * If there is no oembed result for a URL, or an {@link OembedException} occurs, the anchor is
 * left as it was.<br>
 * If the document has no base URI of its own (jsoup reports that as an empty string), the base
 * URI of this instance is applied while the anchors are processed, so that relative links can be
 * resolved, and the document's original base URI is restored afterwards.
 *
 * @param document The document that should be checked for links to transform
 * @return the same document instance, transformed in place
 */
public Document transformDocument(final Document document) {
    final String originalBaseUri = document.baseUri();
    // jsoup returns "" (not null) when no base URI is defined, so both cases must be checked.
    final boolean changedBaseUri = (originalBaseUri == null || originalBaseUri.isEmpty())
            && this.getBaseUri() != null;
    if (changedBaseUri) {
        document.setBaseUri(this.getBaseUri());
    }

    try {
        for (Element a : document.getElementsByTag("a")) {
            final String href = a.absUrl("href");
            try {
                final OembedResponse oembedResponse = this.transformUrl(href);
                if (oembedResponse == null) {
                    // There was no response or an exception happened
                    continue;
                }

                if (this.getHandler().containsKey(oembedResponse.getSource())) {
                    // There is a handler for this response
                    this.getHandler().get(oembedResponse.getSource()).handle(document, a, oembedResponse);
                } else {
                    // Try to render the response itself and replace the current anchor
                    final String renderedResponse = oembedResponse.render();
                    if (renderedResponse != null) {
                        a.before(renderedResponse);
                        a.remove();
                    }
                }
            } catch (OembedException e) {
                logger.warn(String.format("Skipping '%s': %s", href, e.getMessage()));
            }
        }
    } finally {
        if (changedBaseUri) {
            // setBaseUri rejects null, so restore the "no base URI" state as an empty string.
            document.setBaseUri(originalBaseUri == null ? "" : originalBaseUri);
        }
    }
    return document;
}

From source file:de.geeksfactory.opacclient.apis.Heidi.java

/**
 * Loads the search page, stores the value of the first {@code sess} query parameter found in
 * any of its links in {@code sessid} ({@code null} if no link carries one) and then delegates to
 * the superclass.
 */
@Override
public void start() throws IOException {
    Document searchPage = Jsoup.parse(httpGet(opac_url + "/search.cgi?art=f", ENCODING, false, cookieStore));
    // Needed so that relative link targets can be resolved with absUrl() below.
    searchPage.setBaseUri(opac_url);

    sessid = null;
    for (Element anchor : searchPage.select("a")) {
        sessid = getQueryParamsFirst(anchor.absUrl("href")).get("sess");
        if (sessid != null) {
            // The first link that carries a session id wins.
            break;
        }
    }

    super.start();
}

From source file:de.geeksfactory.opacclient.apis.Open.java

@Override
public SearchRequestResult searchGetPage(int page) throws IOException, OpacErrorException, JSONException {
    /*/*w w w .jav  a 2s . c o  m*/
    When there are many pages of results, there will only be links to the next 4 and
    previous 4 pages, so we will click links until it gets to the correct page.
     */

    if (searchResultDoc == null)
        throw new NotReachableException();

    Document doc = searchResultDoc;

    Elements pageLinks = doc.select("span[id$=DataPager1]").first().select("a[id*=LinkButtonPageN");
    int from = Integer.valueOf(pageLinks.first().text());
    int to = Integer.valueOf(pageLinks.last().text());
    Element linkToClick;
    boolean willBeCorrectPage;

    if (page < from) {
        linkToClick = pageLinks.first();
        willBeCorrectPage = false;
    } else if (page > to) {
        linkToClick = pageLinks.last();
        willBeCorrectPage = false;
    } else {
        linkToClick = pageLinks.get(page - from);
        willBeCorrectPage = true;
    }

    Pattern pattern = Pattern.compile("javascript:__doPostBack\\('([^,]*)','([^\\)]*)'\\)");
    Matcher matcher = pattern.matcher(linkToClick.attr("href"));
    if (!matcher.find())
        throw new OpacErrorException(StringProvider.INTERNAL_ERROR);

    FormElement form = (FormElement) doc.select("form").first();
    HttpEntity data = formData(form, null).addTextBody("__EVENTTARGET", matcher.group(1))
            .addTextBody("__EVENTARGUMENT", matcher.group(2)).build();

    ByteArrayOutputStream stream = new ByteArrayOutputStream();
    data.writeTo(stream);

    String postUrl = form.attr("abs:action");

    String html = httpPost(postUrl, data, "UTF-8");
    if (willBeCorrectPage) {
        // We clicked on the correct link
        Document doc2 = Jsoup.parse(html);
        doc2.setBaseUri(postUrl);
        return parse_search(doc2, page);
    } else {
        // There was no correct link, so try to find one again
        searchResultDoc = Jsoup.parse(html);
        searchResultDoc.setBaseUri(postUrl);
        return searchGetPage(page);
    }
}

From source file:de.geeksfactory.opacclient.apis.Zones22.java

/**
 * Opens the "MyZone" page and, unless the session is already logged in, submits the login form
 * with the account's credentials.
 *
 * @param acc account whose name and password are sent as {@code BRWR} and {@code PIN}
 * @return the "MyZone" page if it already shows the account summary, otherwise the page returned
 *         by the login request; {@code null} if the login request itself fails with an
 *         {@link IOException}
 * @throws NotReachableException if the page contains no login form
 * @throws OpacErrorException    if the login response does not contain "Kontostand"
 */
private Document login(Account acc) throws IOException, OpacErrorException {
    String startHtml = httpGet(
            opac_url + "/APS_ZONES?fn=MyZone&Style=Portal3&SubStyle=&Lang=GER&ResponseEncoding=utf-8",
            getDefaultEncoding());
    Document startPage = Jsoup.parse(startHtml);
    // Needed so that the form's relative "action" can be resolved with absUrl() below.
    startPage.setBaseUri(opac_url + "/APS_ZONES");

    if (!startPage.select(".AccountSummaryCounterLink").isEmpty()) {
        // Already logged in.
        return startPage;
    }
    if (startPage.select("#LoginForm").isEmpty()) {
        throw new NotReachableException();
    }

    // Copy every form input except the credential fields, then add the credentials ourselves.
    List<NameValuePair> formParams = new ArrayList<NameValuePair>();
    for (Element input : startPage.select("#LoginForm input")) {
        String fieldName = input.attr("name");
        if (fieldName.equals("BRWR") || fieldName.equals("PIN")) {
            continue;
        }
        formParams.add(new BasicNameValuePair(fieldName, input.attr("value")));
    }
    formParams.add(new BasicNameValuePair("BRWR", acc.getName()));
    formParams.add(new BasicNameValuePair("PIN", acc.getPassword()));

    String loginHtml;
    try {
        String actionUrl = startPage.select("#LoginForm").get(0).absUrl("action");
        loginHtml = httpPost(actionUrl, new UrlEncodedFormEntity(formParams), getDefaultEncoding());
    } catch (IOException e) {
        // Also covers UnsupportedEncodingException, which is an IOException.
        e.printStackTrace();
        return null;
    }

    if (!loginHtml.contains("Kontostand")) {
        throw new OpacErrorException(stringProvider.getString(StringProvider.LOGIN_FAILED));
    }

    Document accountPage = Jsoup.parse(loginHtml);
    // Remember the object id from links of the form "Obj_<digits>?..."; the last match wins.
    Pattern objectIdPattern = Pattern.compile("Obj_([0-9]+)\\?.*");
    for (Element link : accountPage.select("a")) {
        Matcher objectIdMatcher = objectIdPattern.matcher(link.attr("href"));
        if (objectIdMatcher.matches()) {
            accountobj = objectIdMatcher.group(1);
        }
    }

    return accountPage;
}

From source file:de.geeksfactory.opacclient.apis.Heidi.java

/**
 * Builds the list of search fields from the search page.
 * <p>
 * The result contains, in this order: one text field per option of the {@code kat1} select, a
 * dropdown of the non-empty {@code #teilk2} options ("Einrichtung"), a dropdown of the non-empty
 * {@code #zweig} options loaded from a second page ("Leihstelle", omitted if that page cannot be
 * loaded) and an invisible page field ("Seite").
 *
 * @return the search fields described above
 * @throws IOException if the search page itself cannot be loaded
 */
@Override
public List<SearchField> getSearchFields() throws IOException, OpacErrorException, JSONException {
    String html = httpGet(opac_url + "/search.cgi?art=f", ENCODING, false, cookieStore);
    Document doc = Jsoup.parse(html);
    doc.setBaseUri(opac_url);
    List<SearchField> fields = new ArrayList<>();

    // One free-text field per search category.
    for (Element option : doc.select("select[name=kat1] option")) {
        TextSearchField textField = new TextSearchField();
        textField.setDisplayName(option.text());
        textField.setId(option.attr("value"));
        textField.setHint("");
        fields.add(textField);
    }

    // Branch dropdown; options with an empty value are skipped.
    DropdownSearchField branchField = new DropdownSearchField();
    for (Element opt : doc.select("#teilk2 option")) {
        if (!opt.val().isEmpty()) {
            branchField.addDropdownValue(opt.val(), opt.text());
        }
    }
    branchField.setDisplayName("Einrichtung");
    branchField.setId("f[teil2]");
    branchField.setVisible(true);
    branchField.setMeaning(SearchField.Meaning.BRANCH);
    fields.add(branchField);

    try {
        // Home branch dropdown, loaded from a separate page; options with an empty value are skipped.
        DropdownSearchField homeBranchField = new DropdownSearchField();
        Document branchDoc = Jsoup
                .parse(httpGet(opac_url + "/zweigstelle.cgi?sess=" + sessid, ENCODING, false, cookieStore));
        for (Element opt : branchDoc.select("#zweig option")) {
            if (!opt.val().isEmpty()) {
                homeBranchField.addDropdownValue(opt.val(), opt.text());
            }
        }
        homeBranchField.setDisplayName("Leihstelle");
        homeBranchField.setId("_heidi_branch");
        homeBranchField.setVisible(true);
        homeBranchField.setMeaning(SearchField.Meaning.HOME_BRANCH);
        fields.add(homeBranchField);
    } catch (IOException e) {
        // Best effort: without the branch page the home branch field is simply left out.
        e.printStackTrace();
    }

    TextSearchField pagefield = new TextSearchField();
    pagefield.setId("_heidi_page");
    pagefield.setVisible(false);
    pagefield.setDisplayName("Seite");
    pagefield.setHint("");
    fields.add(pagefield);

    return fields;
}