Example usage for org.jsoup.select Elements select

List of usage examples for org.jsoup.select Elements select

Introduction

On this page you can find example usages of the org.jsoup.select.Elements.select method.

Prototype

public Elements select(String query) 

Source Link

Document

Find matching elements within this element list.

Usage

From source file:org.manalith.ircbot.plugin.linuxpkgfinder.GentooPortageProvider.java

/**
 * Searches gentoo-portage.com for a package and reports its first listed ebuild.
 *
 * <p>The first search hit must have a name of the form {@code category/name} whose
 * {@code name} part equals {@code arg}; its detail page is then fetched and the text of the
 * first {@code <b>} in the first ebuild entry is combined with the package description.
 *
 * @param arg package name to search for
 * @return {@code "<ebuild> : <description>"} on success; the bare marker {@code "[Gentoo]  "}
 *         when nothing matches or the page does not have the expected structure; or
 *         {@code ": <message>"} when fetching or parsing throws
 */
@Override
public String find(String arg) {
    final String notFound = "[Gentoo]  ";
    String result = "";
    String url = "http://gentoo-portage.com/Search?search=" + arg;

    Connection conn = Jsoup.connect(url);
    conn.timeout(120000); // 120 s: the site is very slow to answer

    try {
        Elements links = conn.get().select("#search_results").select("a");
        if (links.isEmpty()) {
            return notFound;
        }

        Elements divs = links.select("div");
        String pkgname = divs.text().split("\\s")[0];

        // Guard the index accesses: a hit without a "category/name" shape, or without a
        // second <div> for the description, is treated as "not found" instead of leaking
        // an ArrayIndexOutOfBounds/IndexOutOfBounds message to the caller.
        String[] nameParts = pkgname.split("\\/");
        if (nameParts.length < 2 || !nameParts[1].equals(arg) || divs.size() < 2) {
            return notFound;
        }

        String description = divs.get(1).text();

        String detailUrl = "http://gentoo-portage.com/" + pkgname;
        Connection detailConn = Jsoup.connect(detailUrl);
        detailConn.timeout(120000); // 120 s, same as the search request

        Elements ebuilds = detailConn.get().select("#ebuild_list>ul>li");
        if (ebuilds.isEmpty()) {
            return notFound;
        }

        Elements ebuildDivs = ebuilds.get(0).select("div");
        if (ebuildDivs.isEmpty()) {
            return notFound;
        }

        result = ebuildDivs.get(0).select("b").text();
        result += " : " + description;

    } catch (Exception e) {
        logger.error(e.getMessage(), e);
        result = ": " + e.getMessage();
    }

    return result;
}

From source file:accountgen.controller.Controller.java

/**
 * Fills in the person's name from the first {@code <h3>} found inside the elements with
 * class {@code address}.
 *
 * <p>The first space-separated token of the heading becomes the first name, the middle name
 * is set to the empty string, and everything after the first token becomes the last name.
 * If no {@code <h3>} is present the person is left unchanged.
 *
 * @param doc parsed page to read the name from
 * @param p   person whose name fields are set
 */
private void setName(Document doc, Person p) {
    Element name = doc.getElementsByClass("address").select("h3").first();
    if (name == null) {
        // Nothing to read a name from; avoid the NullPointerException on name.html().
        return;
    }

    String html = name.html();
    String[] tokens = html.split(" ");
    // split() yields an empty array for a heading made only of spaces.
    String first = tokens.length > 0 ? tokens[0] : "";

    p.setFirstname(StringEscapeUtils.unescapeHtml4(first).trim());
    p.setMiddlename("");
    // Take the remainder by position rather than html.split(first)[1]: split() treats the
    // first name as a regex (breaking on metacharacters), cuts the last name short when it
    // contains the first name again, and throws when there is no second token.
    p.setLastname(StringEscapeUtils.unescapeHtml4(html.substring(first.length())).trim());
}

From source file:gov.medicaid.screening.dao.impl.BusinessLienDAOBean.java

/**
 * Get value pair of label element./* ww w .j  a  v  a2s . c om*/
 *
 * @param elements group of elements
 * @param label label to look for
 * @return value
 */
private String getValuePairOfLabel(Elements elements, String label) {
    Element labelElement = elements.select("dt:containsOwn(" + label + ")").first();
    return labelElement != null && labelElement.nextElementSibling() != null
            ? labelElement.nextElementSibling().text()
            : "";
}

From source file:accountgen.controller.Controller.java

/**
 * Builds an {@code Address} from the first {@code .adr} element inside the elements with
 * class {@code address} and stores it on the person.
 *
 * <p>The element's inner HTML is cut at {@code <br />}. The street number is the last
 * space-separated token of the first segment and the state is the last token of the second
 * segment. The street name is whatever precedes the street number in the whole HTML, and
 * the postcode is the first token of whatever precedes the state in the second segment.
 *
 * @param doc parsed page to read the address from
 * @param p   person that receives the address
 */
private void setAddress(Document doc, Person p) {
    Element adr = doc.getElementsByClass("address").select(".adr").first();
    Address address = new Address();

    String html = adr.html();
    String[] segments = html.split("<br />");

    String streetSegment = segments[0];
    String[] streetTokens = streetSegment.split(" ");
    String streetnumber = StringEscapeUtils.unescapeHtml4(streetTokens[streetTokens.length - 1]).trim();

    String regionSegment = segments[1];
    String[] regionTokens = regionSegment.split(" ");
    String state = StringEscapeUtils.unescapeHtml4(regionTokens[regionTokens.length - 1]).trim();

    // Note: streetnumber and state are used as split() patterns, i.e. as regular expressions.
    String streetname = StringEscapeUtils.unescapeHtml4(html.split(streetnumber)[0]).trim();
    String beforeState = StringEscapeUtils.unescapeHtml4(regionSegment.split(state)[0]).trim();
    String postcode = beforeState.split(" ")[0];

    address.setStreetnumber(streetnumber);
    address.setStreetname(streetname);
    address.setState(state);
    address.setPostcode(postcode);
    address.setCountry(Consts.COUNTRY);
    p.setAdress(address);
}

From source file:com.dmrr.asistenciasx.Horarios.java

/**
 * Button handler: shows the schedule listing for the professor selected in
 * {@code jTableHorarios}.
 *
 * <p>Asks for the department head's NIP, posts it to the SIIAU login endpoint, reuses the two
 * returned session cookies to request the professor's course offer, strips {@code <style>}
 * elements and {@code size} attributes of {@code <font>} elements from the response body, and
 * displays the resulting HTML in a read-only editor pane. An {@code IOException} is logged
 * at SEVERE level.
 *
 * @param evt the action event (not used)
 */
private void jButton1ActionPerformed(java.awt.event.ActionEvent evt) {//GEN-FIRST:event_jButton1ActionPerformed
    try {
        int selectedRow = jTableHorarios.getSelectedRow();
        if (selectedRow == -1) {
            JOptionPane.showMessageDialog(this, "Seleccione un profesor primero", "Datos incompletos",
                    JOptionPane.WARNING_MESSAGE);
            return;
        }
        // Column 1 of the selected row holds the professor id as a String.
        String idCell = (String) jTableHorarios.getValueAt(jTableHorarios.getSelectedRow(), 1);
        int professorId = Integer.parseInt(idCell);

        // Prompt for the NIP; anything other than OK aborts.
        JPasswordField passwordField = new JPasswordField();
        int choice = JOptionPane.showConfirmDialog(null, passwordField,
                "Introduzca el NIP del jefe del departamento", JOptionPane.OK_CANCEL_OPTION,
                JOptionPane.PLAIN_MESSAGE);
        if (choice != JOptionPane.OK_OPTION) {
            return;
        }
        String nip = new String(passwordField.getPassword());

        // Log in; timeout(0) means no timeout.
        org.jsoup.Connection loginRequest = Jsoup
                .connect("http://siiauescolar.siiau.udg.mx/wus/gupprincipal.valida_inicio")
                .data("p_codigo_c", "2225255", "p_clave_c", nip)
                .method(org.jsoup.Connection.Method.POST)
                .timeout(0);
        org.jsoup.Connection.Response loginResponse = loginRequest.execute();

        // The parsed login page itself is not used, only the cookies are.
        loginResponse.parse();
        String sessionCookie = loginResponse.cookie(getFecha() + "SIIAUSESION");
        String udgCookie = loginResponse.cookie(getFecha() + "SIIAUUDG");

        // Query the course offer for the chosen professor with the session cookies attached.
        org.jsoup.Connection offerRequest = Jsoup
                .connect("http://siiauescolar.siiau.udg.mx/wse/sspsecc.consulta_oferta")
                .data("ciclop", "201510", "cup", "J", "deptop", "", "codprofp", String.valueOf(professorId),
                        "ordenp", "0", "mostrarp", "1000", "tipop", "T", "secp", "A", "regp", "T")
                .userAgent("Mozilla")
                .cookie(getFecha() + "SIIAUSESION", sessionCookie)
                .cookie(getFecha() + "SIIAUUDG", udgCookie)
                .timeout(0);
        Document offerPage = offerRequest.post();

        // Drop styling that the Swing HTML renderer should not see.
        Elements body = offerPage.select("body");
        body.select("style").remove();
        body.select("font").removeAttr("size");

        String bodyHtml = body.html();
        System.out.println(bodyHtml);

        // Render the cleaned HTML in a read-only pane inside a message dialog.
        HTMLEditorKit editorKit = new HTMLEditorKit();
        JEditorPane viewer = new JEditorPane();
        viewer.setEditable(false);
        viewer.setEditorKit(editorKit);
        viewer.setDocument(editorKit.createDefaultDocument());
        viewer.setText(bodyHtml);

        JOptionPane.showMessageDialog(null, viewer);

    } catch (IOException ex) {
        Logger.getLogger(Horarios.class.getName()).log(Level.SEVERE, null, ex);
    }

}

From source file:com.crawler.app.run.JellyfishCrawlerSiteCB.java

/**
 * This function is called when a page is fetched and ready to be processed
 * by your program./*w  w  w  . j  a  v  a 2s  .c  o  m*/
 */
@Override
public void visit(Page page) {
    String url = page.getWebURL().getURL();
    //logger.info("URL: ", url);
    String host = "127.0.0.1";
    String port = "3306";
    String dbName = "crawler";
    String dbUser = "root";
    String dbPwd = "";
    MysqlCrawler.createConn(host, port, dbName, dbUser, dbPwd);
    System.out.println("\n URL visit: " + url);

    String href = url.toLowerCase();
    if (href.startsWith("http://careerbuilder.vn/vi/tim-viec-lam") && href.endsWith(".html")) {
        if (page.getParseData() instanceof HtmlParseData) {
            HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
            String text = htmlParseData.getText();
            String html = htmlParseData.getHtml();
            String title = htmlParseData.getTitle();

            Document doc = Jsoup.parse(html, "UTF-8");
            //doc.outputSettings().escapeMode(EscapeMode.xhtml);
            Element body = doc.body();

            Elements listDetail = body.select("section div[class=MyJobLeft]");
            String jobUrl = url;
            String jobName = listDetail.select("h1").html();
            String companyName = listDetail.select("div[class=tit_company]").html();
            String jobLocation = listDetail.select(
                    "div[class=box2Detail] ul[class=DetailJobNew] p[class=fl_left] b[itemprop=jobLocation] a")
                    .html();
            String companyAddress = listDetail
                    .select("div[class=box1Detail] p[class=TitleDetailNew] label[itemprop=addressLocality]")
                    .html();
            String companyContact = listDetail
                    .select("div[class=box1Detail] p[class=TitleDetailNew] label strong").html();
            String companyPhone = listDetail
                    .select("div[class=col-lg-6 col-md-6 col-sm-12] p[id=company_contact]").html();
            String companyWebsite = listDetail
                    .select("div[class=col-lg-6 col-md-6 col-sm-12] p a[id=company_website]").html();

            if (listDetail.isEmpty() || jobName.isEmpty()) {
                listDetail = body.select("div[id=main_content] div[id=main_content_right]");
                jobName = listDetail.select("h1 p").html();
                companyName = listDetail.select("div[class=intro_company] div[class=title_into] p").html();
                jobLocation = listDetail
                        .select("div[class=intro_job] ul[class=left_380] p[itemprop=jobLocation] a").html();
                if (listDetail.isEmpty() || jobName.isEmpty()) {
                    listDetail = body.select("div[id=main_content] div[class=content_right]");
                    jobName = listDetail.select("h1").html();
                    companyName = listDetail
                            .select("div[class=intro_company] div[class=title_into] p[class=title_comp]")
                            .html();
                    Elements gCompanyWebList = listDetail
                            .select("div[class=intro_company] div[class=title_into] p");
                    if (!gCompanyWebList.isEmpty() && gCompanyWebList.size() > 1)
                        companyWebsite = gCompanyWebList.get(1).html();
                    jobLocation = listDetail
                            .select("div[class=intro_job] ul[class=left_380] p[itemprop=jobLocation] a").html();

                }
            }
            jobName = listDetail.select("h1 a").html();
            if (jobName.isEmpty())
                jobName = listDetail.select("h1 p").html();
            if (jobName.isEmpty())
                jobName = listDetail.select("h1").html();

            System.out.println("\n Title : " + jobName);
            try {
                Integer siteID = 3;
                //String companyWebsite = "";
                /*
                MysqlCrawler.getInstance().insertJFHRContents(
                      siteID
                      , jobUrl
                      , jobName
                      , jobLocation
                      , companyName
                      , companyAddress
                      , companyPhone
                      , companyContact
                      , companyWebsite);
                       */
                //System.exit(1);
            } catch (Exception ex) {
                //System.out.println("\n Fail I : " + i);
                System.out.println("\n Ex : " + ex);
            }
        }

    }

    /*
        Header[] responseHeaders = page.getFetchResponseHeaders();
        if (responseHeaders != null) {
          logger.debug("Response headers:");
          for (Header header : responseHeaders) {
            logger.debug("\t{}: {}", header.getName(), header.getValue());
          }
        }
    */
    logger.debug("=============");
}

From source file:org.ala.lucene.CreateWordPressIndex.java

/**
 * Rebuilds the WordPress part of the SOLR index: deletes all existing WordPress documents,
 * then fetches every URL in {@code pageUrls} with jsoup and adds one SOLR document per page.
 *
 * <p>Pages carrying a category listed in {@code excludedCategories} are skipped. Commits
 * happen every 100 indexed documents and once more at the end. An {@code IOException} while
 * handling a single page is logged and does not stop the run.
 *
 * @return the number of documents indexed
 * @throws Exception if a SOLR operation fails with anything other than a per-page
 *         {@code IOException}
 */
protected int indexPages() throws Exception {
    int documentCount = 0;

    // Start from a clean slate: remove every previously indexed WordPress document.
    SolrServer solrServer = solrUtils.getSolrServer();
    logger.info("Deleting all WordPress documents in SOLR index...");
    solrServer.deleteByQuery("idxtype:" + IndexedTypes.WORDPRESS);
    solrServer.commit();

    for (String pageUrl : this.pageUrls) {
        try {
            // Fetch the page and pull out the fields to be indexed.
            Document page = Jsoup.connect(pageUrl + CONTENT_ONLY_PARAM).get();
            String title = page.select("head > title").text();
            String id = page.select("head > meta[name=id]").attr("content");
            String bodyText = page.body().text();
            Elements postCategories = page.select("ul[class=post-categories]");

            List<String> categoriesOut = new ArrayList<String>();
            boolean excludePost = false;

            // A category list is only present on WP posts, not on WP pages.
            if (!postCategories.isEmpty()) {
                for (Element categoryLink : postCategories.select("li > a")) {
                    String category = categoryLink.text();
                    if (category == null) {
                        continue;
                    }
                    if (excludedCategories.contains(category)) {
                        excludePost = true;
                    }
                    // Keep collecting even after a hit so the log line shows every category.
                    categoriesOut.add(category.replaceAll(" ", "_"));
                }
            }

            if (excludePost) {
                logger.debug("Excluding post (id: " + id + ") with category: "
                        + StringUtils.join(categoriesOut, "|"));
                continue;
            }

            documentCount++;
            logger.debug(documentCount + ". Indexing WP page - id: " + id + " | title: " + title + " | text: "
                    + StringUtils.substring(bodyText, 0, 100) + "... ");

            SolrInputDocument solrDoc = new SolrInputDocument();
            solrDoc.addField("idxtype", IndexedTypes.WORDPRESS);
            // page_id based URI instead of the permalink, which may be too long for the id field
            solrDoc.addField("guid", WP_BASE_URI + id);
            solrDoc.addField("id", "wp" + id); // probably not needed but safer to leave in
            solrDoc.addField("name", title, 1.2f);
            solrDoc.addField("content", bodyText);
            solrDoc.addField("australian_s", "recorded"); // so they appear in default QF search
            solrDoc.addField("categories", categoriesOut);
            solrServer.add(solrDoc);

            // Periodic commit every 100 documents.
            if (documentCount % 100 == 0) {
                logger.info("Committing to SOLR (count = " + documentCount + ")...");
                solrServer.commit();
            }
        } catch (IOException ex) {
            // Keep going: one unreadable page must not abort indexing of the others.
            logger.warn("Problem accessing/reading WP page: " + ex.getMessage(), ex);
        }
    }

    logger.info("Final Committing to SOLR...");
    solrServer.commit();
    //logger.info("Optimising SOLR index...");
    //solrServer.optimize(); // throws errors on my machine??
    logger.info("Committed to SOLR. Final document count: " + documentCount);
    return documentCount;
}

From source file:de.ncoder.studipsync.studip.jsoup.JsoupStudipAdapter.java

/**
 * Reads the seminar list from the seminars page.
 *
 * <p>After making sure a session exists and navigating to {@code PAGE_SEMINARS}, every row of
 * the first table under {@code #content} that has more than four direct cells is inspected:
 * the first link in its fourth cell supplies the URL, and the first two {@code <font>}
 * elements inside that link supply the two text arguments passed to
 * {@code Seminar.getSeminar}. Rows lacking the link or a second {@code <font>} are ignored.
 *
 * @return the seminars found, in table order
 * @throws StudipException declared by the signature; nothing in this body throws it
 *         directly, so presumably it comes from the login or navigation step
 */
@Override
public List<Seminar> parseSeminars() throws StudipException {
    ensureLoggedIn();
    navigate(PAGE_SEMINARS);

    List<Seminar> seminars = new ArrayList<>();
    for (org.jsoup.nodes.Element row : document.select("#content>table:first-of-type>tbody>tr")) {
        if (row.select(">td").size() <= 4) {
            continue;
        }
        Elements link = row.select(">td:nth-of-type(4)>a:first-of-type");
        Elements fonts = link.select("font");
        if (link.size() < 1 || fonts.size() < 2) {
            continue;
        }
        String href = link.get(0).absUrl("href");
        String firstLabel = fonts.get(0).text().trim();
        String secondLabel = fonts.get(1).text().trim();
        seminars.add(Seminar.getSeminar(href, firstLabel, secondLabel));
    }

    log.debug("Parsed " + seminars.size() + " seminars.");
    log.trace(seminars.toString());
    return seminars;
}

From source file:org.confab.PhpBB3Parser.java

/**
 * Parses each post for a particular topic.
 *
 * <p>Posts are the {@code <table>} elements inside {@code div#posts} whose id matches
 * {@code post\d+}. For each one the id (with the {@code post} prefix removed), the text of
 * the {@code post_message_\d+} div and the text of the {@code .bigusername} element are
 * copied into a new {@code Post}.
 *
 * <p>Structural expectations are checked with {@code assert} only, so they are enforced
 * only when assertions are enabled.
 *
 * @param  html         Html containing the posts to be parsed
 * @param  parent       Thread handed to the constructor of every created Post
 * @return              List of Post objects
 */
public List<Post> parsePosts(Document html, ForumThread parent) {
    Utilities.debug("Starting parsePosts");
    List<Post> ret = new ArrayList<Post>();

    // Each post should have its own table
    Elements divPosts = html.select("div#posts");
    assert !divPosts.isEmpty();
    Elements postTables = divPosts.select("table[id~=(post\\d+)]");
    assert !postTables.isEmpty();

    for (Element postTable : postTables) {
        Post newPost = new Post(parent);

        // Get post id (id=post\d+)
        newPost.id = postTable.attr("id").replace("post", "").trim();
        assert newPost.id != null;

        // Get post message
        Elements messageDivs = postTable.select("div[id~=(post_message_\\d+)]");
        assert !messageDivs.isEmpty();
        newPost.message = messageDivs.first().text();
        assert newPost.message != null;
        Utilities.debug("new_post.message: " + newPost.message);

        // Get post author
        Elements authorElements = postTable.select(".bigusername");
        assert !authorElements.isEmpty();
        // Check the author object before it is dereferenced; placed after the assignment
        // this assertion could never fail.
        assert newPost.author != null;
        newPost.author.username = authorElements.first().text();
        Utilities.debug("new_post.author: " + newPost.author);

        ret.add(newPost);
    }

    Utilities.debug("Finished parsePosts");
    return ret;
}