Example usage for org.jsoup.nodes Document select

Introduction

This page collects example usages of the org.jsoup.nodes.Document.select method.

Prototype

public Elements select(String cssQuery)

Source Link

Document

Find elements that match the Selector CSS query, with this element as the starting context.

Usage

From source file:org.wallride.web.support.Posts.java

/**
 * Resolves the thumbnail URL for a post.
 *
 * <p>The post's cover is used when one is set; otherwise the {@code src} attribute of the
 * first {@code <img>} element found in the post body is returned.
 *
 * @param post the post to resolve a thumbnail for
 * @return the media URL prefix followed by the cover id, or the {@code src} of the first image
 *         in the body, or {@code null} when the post has no cover and no image (or no body)
 */
public String thumbnail(Post post) {
    if (post.getCover() != null) {
        return wallRideProperties.getMediaUrlPrefix() + post.getCover().getId();
    }

    String body = post.getBody();
    if (body == null) {
        // Nothing to parse: report "no thumbnail" the same way as a body without images.
        return null;
    }

    // first() yields null on an empty match, replacing the loop that returned on iteration one.
    Element firstImage = Jsoup.parse(body).select("img").first();
    return (firstImage != null) ? firstImage.attr("src") : null;
}

From source file:de.tudarmstadt.ukp.argumentation.data.roomfordebate.NYTimesArticleExtractor.java

/**
 * Extracts a single debate article from the HTML of its page.
 *
 * <p>The article itself is read from the {@code article.rfd} element; the debate title, URL
 * and description are read from the {@code div.nytint-discussion-overview} block of the page.
 *
 * @param html raw HTML of the article page
 * @return the populated article
 * @throws ParseException if the publication date matches neither
 *         {@code "MMM dd, yyyy, hh:mm aaa"} nor {@code "MMM dd, yyyy"}
 * @throws IOException if a required element (article, debate link, debate description or
 *         author image) cannot be found in the page
 */
public Article extractArticle(String html) throws ParseException, IOException {
    Article result = new Article();

    Document doc = Jsoup.parse(html, getBaseName());

    Element element = requireFirst(doc, "article.rfd");

    String dateText = element.select("p.pubdate").text().replaceAll("Updated[\\s]+", "");
    // time
    try {
        DateFormat df = new SimpleDateFormat("MMM dd, yyyy, hh:mm aaa", Locale.ENGLISH);
        result.setTimestamp(df.parse(dateText));
    } catch (ParseException e) {
        // Fall back to the date-only form, e.g. "June 24, 2015"; a second failure propagates.
        DateFormat df = new SimpleDateFormat("MMM dd, yyyy", Locale.ENGLISH);
        result.setTimestamp(df.parse(dateText));
    }

    // title
    result.setTitle(TextCleaningUtils.normalizeWithParagraphs(element.select("h1").text()));

    // text: one line per paragraph of the post
    StringBuilder sb = new StringBuilder();
    for (Element p : element.select("div.nytint-post > p")) {
        sb.append(p.text());
        sb.append("\n");
    }
    result.setText(TextCleaningUtils.normalizeWithParagraphs(sb.toString()));

    // debate title
    result.setDebateTitle(TextCleaningUtils
            .normalizeWithParagraphs(doc.select("div.nytint-discussion-overview > h2").text()));

    // debate url
    result.setDebateUrl(requireFirst(doc, "div.nytint-discussion-overview > h2 > a").attr("href"));

    // document url
    result.setUrl(doc.select("meta[name=communityAssetURL]").attr("content"));

    // debate description: only the leading text node of the overview paragraph is used
    Element overview = requireFirst(doc, "div.nytint-discussion-overview > p");
    if (overview.childNodeSize() == 0 || !(overview.childNode(0) instanceof TextNode)) {
        throw new IOException("Cannot find debate description text");
    }
    result.setDebateDescription(
            TextCleaningUtils.normalizeWithParagraphs(((TextNode) overview.childNode(0)).text()));

    // author
    result.setAuthor(requireFirst(element, "div.nytint-mugshots > img").attr("alt"));

    // topics
    for (Element a : element.select("p.nytint-tags > a")) {
        result.getTopics().add(a.attr("href"));
    }

    return result;
}

/**
 * Returns the first element matching {@code cssQuery} below {@code context}, failing with an
 * {@link IOException} instead of an unchecked exception when nothing matches.
 */
private Element requireFirst(Element context, String cssQuery) throws IOException {
    Element first = context.select(cssQuery).first();
    if (first == null) {
        throw new IOException("Cannot find " + cssQuery + " element");
    }
    return first;
}

From source file:com.liato.bankdroid.banking.banks.PayPal.java

/**
 * Builds the login request for PayPal.
 *
 * <p>Opens the PayPal landing page to obtain cookies and the login form, reads the form's
 * {@code action} attribute as the URL to post to, and assembles the form fields to submit.
 *
 * @return the login package holding the HTTP client, form fields, landing page response and
 *         post URL
 * @throws BankException if the login form or its {@code action} attribute cannot be found
 */
@Override
protected LoginPackage preLogin() throws BankException, ClientProtocolException, IOException {
    urlopen = new Urllib(context, CertificateReader.getCertificates(context, R.raw.cert_paypal));
    urlopen.setUserAgent(
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1468.0 Safari/537.36");

    // The landing page provides both the cookies and the form whose action we post to.
    response = urlopen.open("https://www.paypal.com/en");
    Element loginForm = Jsoup.parse(response).select("form[name=login_form]").first();

    if (loginForm == null || TextUtils.isEmpty(loginForm.attr("action"))) {
        throw new BankException(res.getText(R.string.unable_to_find).toString() + " post url.");
    }
    String postUrl = loginForm.attr("action").trim();

    List<NameValuePair> formFields = new ArrayList<NameValuePair>();

    // Credentials and basic form state.
    formFields.add(new BasicNameValuePair("login_email", username));
    formFields.add(new BasicNameValuePair("login_password", password));
    formFields.add(new BasicNameValuePair("target_page", "0"));
    formFields.add(new BasicNameValuePair("submit.x", "Log In"));
    formFields.add(new BasicNameValuePair("form_charset", "UTF-8"));

    // Fixed browser/environment values submitted together with the credentials.
    formFields.add(new BasicNameValuePair("browser_name", "undefined"));
    formFields.add(new BasicNameValuePair("browser_version", "undefined"));
    formFields.add(new BasicNameValuePair("operating_system", "Windows"));
    formFields.add(new BasicNameValuePair("bp_mid",
            "v=1;a1=na~a2=na~a3=na~a4=Mozilla~a5=Netscape~a6=5.0 (Windows; en-US)~a7=20100713~a8=na~a9=true~a10=Windows NT 6.1~a11=true~a12=Win32~a13=na~a14=Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.2.7) Gecko/20100713 Firefox/3.6.7 ( .NET CLR 3.5.30729; .NET4.0C)~a15=true~a16=en-US~a17=na~a18=www.paypal.com~a19=na~a20=na~a21=na~a22=na~a23=1280~a24=720~a25=24~a26=658~a27=na~a28=Sun Oct 31 2010 18:41:07 GMT 0100~a29=1~a30=def|qt1|qt2|qt3|qt4|qt5|qt6|swf|~a31=yes~a32=na~a33=na~a34=no~a35=no~a36=yes~a37=no~a38=online~a39=no~a40=Windows NT 6.1~a41=no~a42=no~"));
    formFields.add(new BasicNameValuePair("bp_ks1",
            "v=1;l=16;Di0:2663Di1:48Ui0:15Ui1:81Di2:176Di3:48Ui2:32Ui3:96Di4:384Ui4:48Di5:352Ui5:48Di6:128Ui6:80Di7:112Ui7:48Di8:113Ui8:79Di9:125Ui9:51Di10:98Ui10:72Di11:227Ui11:51Di12:80Ui12:80Di13:128Ui13:64Di14:48Ui14:80Di15:416Ui15:80"));
    formFields.add(new BasicNameValuePair("bp_ks2", ""));
    formFields.add(new BasicNameValuePair("bp_ks3", ""));
    formFields.add(new BasicNameValuePair("flow_name", "xpt/Marketing_CommandDriven/homepage/IndividualsHome"));
    formFields.add(new BasicNameValuePair("fso",
            "k2TDENTlxEJnhbuYDYFmKMyVq0kUZPsdK6j3V1gPUwuZvyAmzzpRs4Cmjet0z19AwlxXfW"));

    return new LoginPackage(urlopen, formFields, response, postUrl);
}

From source file:org.javiermoreno.torrentscratcher.Runner.java

/**
 * Downloads a movie detail page from elitetorrent.net and extracts its data.
 *
 * <p>The title is cleaned of parenthesised and bracketed annotations and of a dangling
 * trailing "aka" left behind once such an annotation has been removed.
 *
 * @param path path of the detail page, appended directly to the site's host name
 * @return the movie with title, URL, description, image and a single torrent stored under
 *         the key {@code "720p"}
 * @throws IOException if the page cannot be fetched
 */
public Movie getMovie(String path) throws IOException {
    String url = "http://www.elitetorrent.net{path}".replace("{path}", path);
    log.debug("Retrieving " + path + ".");
    Document doc = Jsoup.connect(url).get();

    Movie movie = new Movie();
    String title = doc.select("#box-ficha > h2").text();
    // strip parentheses: http://stackoverflow.com/questions/1138552/replace-string-in-parentheses-using-regex
    title = title.replaceAll("\\([^\\(]*\\)", "")
            // Brackets: exclude '[' (mirroring the parenthesis pattern) so that each bracket
            // group is removed on its own and the text between two groups is kept.
            .replaceAll("\\[[^\\[]*\\]", "")
            .trim()
            // Trim first so a trailing "aka" followed by whitespace is still matched; the word
            // boundary keeps titles that merely end in "...aka" intact.
            .replaceAll("\\baka$", "")
            .trim();
    movie.setTitle(title);
    movie.setUrl(url);
    movie.setDescription(doc.select("p.descrip").eq(1).text());
    movie.setType("movie");
    movie.setImage("http://www.elitetorrent.net/" + doc.select("img.imagen_ficha").attr("src"));

    Torrent torrent = new Torrent();
    torrent.setMagnet(doc.select("a[href^=magnet]").attr("href"));
    torrent.setFilesize(doc.select("dl.info-tecnica dd").eq(3).text());
    movie.getTorrents().put("720p", torrent);

    return movie;
}

From source file:com.anhao.spring.service.impl.PhotosServiceImpl.java

/**
 * Loads the detail page of a wallpaper and stores one photo/tag link per tag listed on it.
 *
 * <p>Tags are read from the {@code #tags li} elements; list items without a
 * {@code .tagname} child are skipped.
 *
 * @param wallpaperId wallpaper id, appended to the wallhaven wallpaper URL
 */
private void getWallpaperTags(String wallpaperId) {
    String wallpaperUrl = "http://alpha.wallhaven.cc/wallpaper/" + wallpaperId;
    Document docDetails = getWallpaperHtmlDocument(wallpaperUrl);
    Elements tags = docDetails.select("#tags li");
    if (tags.isEmpty()) {
        return;
    }

    // The photo id depends only on the wallpaper id, so look it up once rather than per tag.
    String photosId = jobPhotosDAO.findByWallpaperId(wallpaperId);

    for (Element tag : tags) {
        Element tagName = tag.select(".tagname").first();
        if (tagName == null) {
            // List item without a tag name: nothing to link.
            continue;
        }
        String tagText = tagName.text();

        // Id of the tag record looked up by its display name.
        String tagId = tagDAO.findByTagName(tagText);

        System.out.println("wallpaperId:" + wallpaperId + "====tag name " + tagText);

        PhotosTag photosTag = new PhotosTag();
        photosTag.setPhotoId(photosId);
        photosTag.setTagId(tagId);
        photostagDAO.add(photosTag);
    }
}

From source file:com.johan.vertretungsplan.parser.SVPlanParser.java

/**
 * Logs in, downloads every configured URL and parses the SVPlan pages into a substitution
 * plan with one entry per day.
 *
 * <p>For each page the date is taken from the {@code .svp-plandatum-heute} /
 * {@code .svp-plandatum-morgen} element or, failing that, from the page title. Each row of
 * {@code .svp-tabelle} becomes one substitution that is added to every class listed in the
 * row. Paragraphs following a "Mitteilungen" heading are collected as messages.
 *
 * @return the parsed plan; days appear in the order their dates were first encountered
 * @throws IOException if a downloaded page contains no {@code .svp-tabelle} element
 * @throws JSONException if the school configuration lacks the expected "urls" or "encoding" data
 */
public Vertretungsplan getVertretungsplan() throws IOException, JSONException {
    new LoginHandler(schule).handleLogin(executor, cookieStore, username, password);

    JSONArray urls = schule.getData().getJSONArray("urls");
    String encoding = schule.getData().getString("encoding");
    List<Document> docs = new ArrayList<Document>();

    for (int i = 0; i < urls.length(); i++) {
        JSONObject url = urls.getJSONObject(i);
        loadUrl(url.getString("url"), encoding, docs);
    }

    // Page title prefix preceding the date. The u-umlaut is written as a Unicode escape so
    // the literal survives any source-file encoding.
    final String titlePrefix = "Vertretungsplan f\u00fcr ";

    LinkedHashMap<String, VertretungsplanTag> tage = new LinkedHashMap<String, VertretungsplanTag>();
    for (Document doc : docs) {
        if (doc.select(".svp-tabelle").isEmpty()) {
            throw new IOException("keine SVPlan-Tabelle gefunden");
        }

        VertretungsplanTag tag = new VertretungsplanTag();

        // Date: dedicated element first, page title as fallback, placeholder otherwise.
        String date = "Unbekanntes Datum";
        Elements planDate = doc.select(".svp-plandatum-heute, .svp-plandatum-morgen");
        if (!planDate.isEmpty()) {
            date = planDate.text();
        } else if (doc.title().startsWith(titlePrefix)) {
            date = doc.title().substring(titlePrefix.length());
        }
        tag.setDatum(date);

        Elements uploadDate = doc.select(".svp-uploaddatum");
        if (!uploadDate.isEmpty()) {
            tag.setStand(uploadDate.text().replace("Aktualisierung: ", ""));
        }

        // Rows without their own lesson cell inherit the lesson of the last row that had one.
        String lastLesson = "";
        for (Element row : doc.select(".svp-tabelle tr")) {
            if (row.hasClass("svp-header")) {
                continue;
            }

            Vertretung vertretung = new Vertretung();
            List<String> affectedClasses = new ArrayList<String>();

            for (Element column : row.select("td")) {
                if (!hasData(column.text())) {
                    continue;
                }
                // The cell's CSS class name identifies which field it carries.
                String type = column.className();
                if (type.startsWith("svp-stunde")) {
                    vertretung.setLesson(column.text());
                    lastLesson = column.text();
                } else if (type.startsWith("svp-klasse")) {
                    affectedClasses = Arrays.asList(column.text().split(", "));
                } else if (type.startsWith("svp-esfehlt")) {
                    vertretung.setPreviousTeacher(column.text());
                } else if (type.startsWith("svp-esvertritt")) {
                    vertretung.setTeacher(column.text());
                } else if (type.startsWith("svp-fach")) {
                    vertretung.setSubject(column.text());
                } else if (type.startsWith("svp-bemerkung")) {
                    vertretung.setDesc(column.text());
                    vertretung.setType(recognizeType(column.text()));
                } else if (type.startsWith("svp-raum")) {
                    vertretung.setRoom(column.text());
                }

                if (vertretung.getLesson() == null) {
                    vertretung.setLesson(lastLesson);
                }
            }

            if (vertretung.getType() == null) {
                vertretung.setType("Vertretung");
            }

            // Register the substitution with every class named in the row.
            for (String klasse : affectedClasses) {
                KlassenVertretungsplan kv = tag.getKlassen().get(klasse);
                if (kv == null) {
                    kv = new KlassenVertretungsplan(klasse);
                }
                kv.add(vertretung);
                tag.getKlassen().put(klasse, kv);
            }
        }

        // Messages: consecutive <p> siblings after the heading, split on double line breaks.
        List<String> nachrichten = new ArrayList<String>();
        Element h2 = doc.select("h2:contains(Mitteilungen)").first();
        if (h2 != null) {
            Element sibling = h2.nextElementSibling();
            while (sibling != null && sibling.tagName().equals("p")) {
                for (String nachricht : TextNode.createFromEncoded(sibling.html(), null).getWholeText()
                        .split("<br />\\s*<br />")) {
                    if (hasData(nachricht)) {
                        nachrichten.add(nachricht);
                    }
                }
                sibling = sibling.nextElementSibling();
            }
        }
        tag.setNachrichten(nachrichten);

        tage.put(date, tag);
    }

    Vertretungsplan v = new Vertretungsplan();
    v.setTage(new ArrayList<VertretungsplanTag>(tage.values()));

    return v;
}

From source file:mg.jerytodik.business.service.impl.JeryTodikSourceServiceImpl.java

/**
 * Fetches the page at {@code url} and returns every {@code <link>} element that carries an
 * {@code href} attribute.
 *
 * @param url address of the page to download
 * @return the matching {@code link} elements (empty when there are none)
 * @throws IOException if the page cannot be fetched
 */
private Elements getCssLinks(final String url) throws IOException {
    return Jsoup.connect(url).get().select("link[href]");
}

From source file:org.ala.lucene.CreateWordPressIndex.java

/**
 * Index the WP pages by parsing with Jsoup and indexing into SOLR.
 *
 * <p>All existing WordPress documents are deleted from the index first. Each page URL is then
 * fetched and indexed unless one of its post categories is in the excluded set. Commits
 * happen after every 100 indexed documents and once more at the end.
 *
 * @return the number of pages added to the index
 * @throws Exception on any failure other than an {@link IOException} raised while processing
 *         an individual page (those are logged and the page is skipped)
 */
protected int indexPages() throws Exception {
    int documentCount = 0;
    // Initialise SOLR
    SolrServer solrServer = solrUtils.getSolrServer();
    logger.info("Deleting all WordPress documents in SOLR index...");
    solrServer.deleteByQuery("idxtype:" + IndexedTypes.WORDPRESS); // delete WP pages
    solrServer.commit();

    for (String pageUrl : this.pageUrls) {
        try {
            // Crawl and extract text from WP pages
            Document document = Jsoup.connect(pageUrl + CONTENT_ONLY_PARAM).get();
            String title = document.select("head > title").text();
            String id = document.select("head > meta[name=id]").attr("content");
            String bodyText = document.body().text();
            Elements postCategories = document.select("ul[class=post-categories]");
            List<String> categoriesOut = new ArrayList<String>();
            boolean excludePost = false;

            if (!postCategories.isEmpty()) {
                // Is a WP post (not page)
                Elements categoriesIn = postCategories.select("li > a"); // category links

                for (Element cat : categoriesIn) {
                    // Element.text() never returns null, so no null checks are needed here.
                    String thisCat = cat.text();

                    if (excludedCategories.contains(thisCat)) {
                        // Keep iterating so the log message below lists every category.
                        excludePost = true;
                    }
                    categoriesOut.add(thisCat.replaceAll(" ", "_"));
                }
            }

            if (excludePost) {
                logger.debug("Excluding post (id: " + id + ") with category: "
                        + StringUtils.join(categoriesOut, "|"));
                continue;
            }

            documentCount++;
            // Index with SOLR
            logger.debug(documentCount + ". Indexing WP page - id: " + id + " | title: " + title + " | text: "
                    + StringUtils.substring(bodyText, 0, 100) + "... ");
            SolrInputDocument doc = new SolrInputDocument();
            doc.addField("idxtype", IndexedTypes.WORDPRESS);
            doc.addField("guid", WP_BASE_URI + id); // use page_id based URI instead of permalink in case permalink is too long for id field
            doc.addField("id", "wp" + id); // probably not needed but safer to leave in
            doc.addField("name", title, 1.2f);
            doc.addField("content", bodyText);
            doc.addField("australian_s", "recorded"); // so they appear in default QF search
            doc.addField("categories", categoriesOut);
            // add to index
            solrServer.add(doc);

            if (documentCount % 100 == 0) {
                logger.info("Committing to SOLR (count = " + documentCount + ")...");
                solrServer.commit();
            }
        } catch (IOException ex) {
            // catch it so we don't stop indexing other pages
            logger.warn("Problem accessing/reading WP page: " + ex.getMessage(), ex);
        }
    }

    logger.info("Final Committing to SOLR...");
    solrServer.commit();
    // NOTE: index optimisation is deliberately not run here; the original author disabled
    // it because it raised errors in their environment.
    logger.info("Committed to SOLR. Final document count: " + documentCount);
    return documentCount;
}

From source file:me.vertretungsplan.parser.UntisSubstitutionParser.java

/**
 * Logs in, loads every configured index page together with the class pages it links to, and
 * merges their substitution tables into one schedule.
 *
 * <p>HTTP errors on individual pages are tolerated as long as at least one page on the same
 * level loaded: the last {@link HttpResponseException} is rethrown only when every class page
 * of an index page failed, or when every index page failed.
 *
 * @return the schedule filled with substitutions, website, classes and teachers
 * @throws IOException if all pages on one level failed with an HTTP error, or on other I/O errors
 */
@Override
public SubstitutionSchedule getSubstitutionSchedule()
        throws IOException, JSONException, CredentialInvalidException {
    new LoginHandler(scheduleData, credential, cookieProvider).handleLogin(executor, cookieStore);

    final String encoding = data.optString(PARAM_ENCODING, null);
    final SubstitutionSchedule schedule = SubstitutionSchedule.fromData(scheduleData);

    int loadedIndexPages = 0;
    HttpResponseException lastIndexFailure = null;
    for (String baseUrl : ParserUtils.handleUrlsWithDateFormat(urls)) {
        try {
            final Document indexDoc = Jsoup.parse(this.httpGet(baseUrl, encoding));
            final Elements classLinks = indexDoc.select("td a");
            final String lastChange = indexDoc.select("td[align=right]:not(:has(b))").text();

            int loadedClassPages = 0;
            HttpResponseException lastClassFailure = null;
            for (Element classLink : classLinks) {
                try {
                    // Class pages are addressed relative to the directory of the index page.
                    final String classUrl = baseUrl.substring(0, baseUrl.lastIndexOf("/")) + "/"
                            + classLink.attr("href");
                    final Document classDoc = Jsoup.parse(httpGet(classUrl, encoding));

                    parseSubstitutionTable(schedule, lastChange, classDoc);
                    loadedClassPages++;
                } catch (HttpResponseException e) {
                    lastClassFailure = e;
                }
            }

            // Every class page failed: treat the whole index page as failed.
            if (lastClassFailure != null && loadedClassPages == 0) {
                throw lastClassFailure;
            }
            loadedIndexPages++;
        } catch (HttpResponseException e) {
            lastIndexFailure = e;
        }
    }

    // Nothing could be loaded at all: surface the most recent HTTP error.
    if (lastIndexFailure != null && loadedIndexPages == 0) {
        throw lastIndexFailure;
    }

    schedule.setWebsite(data.has(PARAM_WEBSITE) ? data.getString(PARAM_WEBSITE) : urls.get(0));
    schedule.setClasses(getAllClasses());
    schedule.setTeachers(getAllTeachers());
    return schedule;
}

From source file:mobi.jenkinsci.ci.client.sso.GoogleSsoHandler.java

/**
 * Maps a failed Google sign-in response to an exception.
 *
 * <p>For an HTTP 200 response the body is searched for a {@code div} whose id matches
 * {@code error} or {@code errormsg}; when one is found its text becomes the exception
 * message. In every other case a generic failure message is used.
 *
 * @param response the HTTP response received from the sign-in request
 * @return the exception describing the failure (never thrown by this method)
 */
@Override
public IOException getException(final HttpResponse response) {
    final StatusLine httpStatusLine = response.getStatusLine();
    final int statusCode = httpStatusLine.getStatusCode();
    switch (statusCode) {
    case HttpURLConnection.HTTP_OK:
        try {
            final Document responseDoc = Jsoup.parse(response.getEntity().getContent(), "UTF-8", "");
            // The attribute selector must be closed with ']'; an unbalanced selector can be
            // rejected by the selector parser, which previously ended up in the catch below.
            final Element errorDiv = responseDoc.select("div[id~=(error|errormsg)]").first();
            if (errorDiv != null) {
                return new IOException(getDivText(errorDiv));
            }
        } catch (final Exception ignored) {
            // Best effort only: if the body cannot be read or parsed, fall back to the
            // generic message below.
        }
        // Break is not needed here: we want to fallback to the 'default' case
        // if no error div is found

    default:
        return new IOException("Google Authentication FAILED");
    }
}