Example usage for org.jsoup.select Elements html

List of usage examples for org.jsoup.select Elements html

Introduction

On this page you can find example usages of the html() method of org.jsoup.select.Elements.

Prototype

public String html() 

Source Link

Document

Get the combined inner HTML of all matched elements.

Usage

From source file:com.manisha.allmybooksarepacked.service.BookParser.java

/**
 * Reads the shipping weight from the elements selected by {@code PathMapping.WEIGHT}.
 *
 * <p>Any {@code <b>} and {@code <a>} elements found under the selection are removed
 * first (this modifies {@code doc}). Parentheses are then stripped from the remaining
 * inner HTML and the first whitespace-separated token is parsed as a number.
 *
 * @return the parsed weight, or {@code null} when nothing is left to parse or the
 *         first token is not a valid number
 */
private Double findShippingWeight() {
    Elements weight = doc.select(PathMapping.WEIGHT);
    weight.select("b").remove();
    weight.select("a").remove();

    // Trim before splitting: " ".split(" ") yields an empty array, and leading
    // whitespace would otherwise make the first token empty.
    String cleaned = weight.html().replace("(", "").replace(")", "").trim();
    if (cleaned.isEmpty()) {
        return null;
    }

    String firstToken = cleaned.split("\\s+")[0];
    try {
        return Double.valueOf(firstToken);
    } catch (NumberFormatException ex) {
        // The first token is not numeric (e.g. unexpected markup); treat the weight as unknown.
        return null;
    }
}

From source file:fusion.Fusion.java

/**
 * Asks the words.bighugelabs.com thesaurus API about {@code val1} and checks whether
 * the value of {@code val2} appears among the returned {@code <w>} entries.
 *
 * <p>When it does, each value is registered as a synonym of the other.
 *
 * @param val1 value whose thesaurus entries are fetched
 * @param val2 value looked for among those entries
 * @return {@code true} if {@code val2} was found and both values were linked
 * @throws IOException if the thesaurus request fails
 */
private static boolean isSynonym(Value val1, Value val2) throws IOException {
    String thesaurusUrl = "http://words.bighugelabs.com/api/2/92eae7f933f0f63404b3438ca46861e5/"
            + val1.getValue() + "/xml";

    // The combined inner HTML of all <w> elements, one entry per line
    String[] entries = Jsoup.connect(thesaurusUrl).get().select("w").html().split("\n");

    if (!Arrays.asList(entries).contains(val2.getValue())) {
        return false;
    }

    val1.addToSynonyms(val2);
    val2.addToSynonyms(val1);
    return true;
}

From source file:com.gote.downloader.kgs.KGSDownloader.java

/**
 * Check if a game is public, if yes, then the URL of that game will be sent back.
 * /* w ww. j av a  2  s.  com*/
 * @param pCell Element which represents the first KGS archives column
 * @return link of the SGF or null
 */
public String isPublicGame(Element pCell) {
    Elements a = pCell.getElementsByTag("a");

    if (a != null && a.size() > 0) {
        // Check if it is a visible game
        if (a.html().equals(KGSUtils.KGS_TAG_FR_YES)) {
            return a.attr("href");
        }
    }

    return null;
}

From source file:com.ewcms.plugin.crawler.generate.EwcmsContentCrawler.java

/**
 * ?page??/*w w  w  .j a  v a 2s . co m*/
 */
@Override
public void visit(Page page) {
    try {
        String url = page.getWebURL().getURL();

        page.setContentType("text/html; charset=" + gather.getEncoding());
        Document doc = Jsoup.connect(url).timeout(gather.getTimeOutWait().intValue() * 1000).get();

        String title = doc.title();
        if (gather.getTitleExternal() && gather.getTitleRegex() != null
                && gather.getTitleRegex().length() > 0) {
            Elements titleEles = doc.select(gather.getTitleRegex());
            if (!titleEles.isEmpty()) {
                String tempTitle = titleEles.text();
                if (tempTitle != null && tempTitle.length() > 0) {
                    title = tempTitle;
                }
            }
        }

        if (title != null && title.trim().length() > 0) {
            Elements elements = doc.select(matchRegex);
            if (filterRegex != null && filterRegex.trim().length() > 0) {
                elements = elements.not(filterRegex);
            }
            if (!elements.isEmpty()) {
                String subHtml = elements.html();
                Document blockDoc = Jsoup.parse(subHtml);
                String contentText = blockDoc.html();

                if (gather.getRemoveHref()) {
                    Document moveDoc = Jsoup.parse(contentText);
                    Elements moveEles = moveDoc.select("*").not("a");
                    contentText = moveEles.html();
                }
                if (gather.getRemoveHtmlTag())
                    contentText = doc.text();

                if (isLocal) {
                    contentText = doc.text();

                    Boolean isMatcher = true;
                    for (int i = 0; i < keys.length; i++) {
                        Boolean result = Pattern.compile(keys[i].trim()).matcher(contentText).find();
                        if (!result) {
                            isMatcher = false;
                            break;
                        }
                    }

                    if (isMatcher) {
                        Storage storage = new Storage();
                        storage.setGatherId(gather.getId());
                        storage.setGatherName(gather.getName());
                        storage.setTitle(title);
                        storage.setUrl(url);
                        try {
                            gatherService.addStorage(storage);
                        } catch (Exception e) {
                            logger.error("save storage error : {}", e.getLocalizedMessage());
                        } finally {
                            storage = null;
                        }
                    }
                } else {
                    Content content = new Content();
                    content.setDetail(contentText);
                    content.setPage(1);
                    List<Content> contents = new ArrayList<Content>();
                    contents.add(content);

                    Article article = new Article();
                    article.setTitle(title);
                    article.setContents(contents);

                    articleMainService.addArticleMainByCrawler(article, gather.getChannelId(),
                            CrawlerUtil.USER_NAME);
                }
            }
        }
    } catch (IOException e) {
        logger.warn(e.getLocalizedMessage());
    }
}

From source file:com.dmrr.asistenciasx.Horarios.java

/**
 * Shows the schedule of the professor selected in {@code jTableHorarios}.
 *
 * <p>Prompts for the department head's NIP, logs in to SIIAU with it, requests the
 * course offer filtered by the selected professor's id and displays the returned
 * HTML in a read-only editor pane.
 *
 * <p>Nothing is requested when no row is selected or the NIP dialog is cancelled.
 * When the login response carries no session cookies, an error dialog is shown
 * instead of passing null cookie values on to the second request.
 *
 * <p>NOTE(review): both requests use {@code timeout(0)} (no timeout) and run on the
 * calling thread, presumably the Swing event thread; confirm whether that is acceptable.
 *
 * @param evt the action event that triggered the handler
 */
private void jButton1ActionPerformed(java.awt.event.ActionEvent evt) {//GEN-FIRST:event_jButton1ActionPerformed
    try {
        int selectedRow = jTableHorarios.getSelectedRow();
        if (selectedRow == -1) {
            JOptionPane.showMessageDialog(this, "Seleccione un profesor primero", "Datos incompletos",
                    JOptionPane.WARNING_MESSAGE);
            return;
        }
        Integer idProfesor = Integer.parseInt((String) jTableHorarios.getValueAt(selectedRow, 1));

        JPasswordField pf = new JPasswordField();
        int okCxl = JOptionPane.showConfirmDialog(null, pf, "Introduzca el NIP del jefe del departamento",
                JOptionPane.OK_CANCEL_OPTION, JOptionPane.PLAIN_MESSAGE);
        if (okCxl != JOptionPane.OK_OPTION) {
            return;
        }
        String nip = new String(pf.getPassword());

        org.jsoup.Connection.Response respuesta = Jsoup
                .connect("http://siiauescolar.siiau.udg.mx/wus/gupprincipal.valida_inicio")
                .data("p_codigo_c", "2225255", "p_clave_c", nip).method(org.jsoup.Connection.Method.POST)
                .timeout(0).execute();

        // The session cookie names are prefixed with the value of getFecha()
        String sesionCookieName = getFecha() + "SIIAUSESION";
        String udgCookieName = getFecha() + "SIIAUUDG";
        String sessionId = respuesta.cookie(sesionCookieName);
        String sessionId2 = respuesta.cookie(udgCookieName);
        if (sessionId == null || sessionId2 == null) {
            // No session was established (e.g. wrong NIP); jsoup rejects null cookie values,
            // so stop here with a message instead of failing on the next request.
            JOptionPane.showMessageDialog(this, "No se pudo iniciar sesion. Verifique el NIP.",
                    "Error de autenticacion", JOptionPane.ERROR_MESSAGE);
            return;
        }

        Document listaHorarios = Jsoup.connect("http://siiauescolar.siiau.udg.mx/wse/sspsecc.consulta_oferta")
                .data("ciclop", "201510", "cup", "J", "deptop", "", "codprofp", "" + idProfesor, "ordenp", "0",
                        "mostrarp", "1000", "tipop", "T", "secp", "A", "regp", "T")
                .userAgent("Mozilla").cookie(sesionCookieName, sessionId)
                .cookie(udgCookieName, sessionId2).timeout(0).post();

        // Strip styling that the Swing HTML renderer is not meant to receive
        Elements tabla = listaHorarios.select("body");
        tabla.select("style").remove();
        Elements font = tabla.select("font");
        font.removeAttr("size");

        String html = tabla.html();
        System.out.println(html);

        JEditorPane jEditorPane = new JEditorPane();
        jEditorPane.setEditable(false);

        HTMLEditorKit kit = new HTMLEditorKit();
        jEditorPane.setEditorKit(kit);

        javax.swing.text.Document doc = kit.createDefaultDocument();
        jEditorPane.setDocument(doc);
        jEditorPane.setText(html);

        JOptionPane.showMessageDialog(null, jEditorPane);

    } catch (IOException ex) {
        Logger.getLogger(Horarios.class.getName()).log(Level.SEVERE, null, ex);
    }

}

From source file:org.jresponder.message.MessageRefImpl.java

/**
 * Render a message in the context of a particular subscriber
 * and subscription./*from   w ww. ja  v a 2s  . com*/
 */
@Override
public boolean populateMessage(MimeMessage aMimeMessage, SendConfig aSendConfig, Subscriber aSubscriber,
        Subscription aSubscription) {

    try {

        // prepare context
        Map<String, Object> myRenderContext = new HashMap<String, Object>();
        myRenderContext.put("subscriber", aSubscriber);
        myRenderContext.put("subscription", aSubscription);
        myRenderContext.put("config", aSendConfig);
        myRenderContext.put("message", this);

        // render the whole file
        String myRenderedFileContents = TextRenderUtil.getInstance().render(fileContents, myRenderContext);

        // now parse again with Jsoup
        Document myDocument = Jsoup.parse(myRenderedFileContents);

        String myHtmlBody = "";
        String myTextBody = "";

        // html body
        Elements myBodyElements = myDocument.select("#htmlbody");
        if (!myBodyElements.isEmpty()) {
            myHtmlBody = myBodyElements.html();
        }

        // text body
        Elements myJrTextBodyElements = myDocument.select("#textbody");
        if (!myJrTextBodyElements.isEmpty()) {
            myTextBody = TextUtil.getInstance().getWholeText(myJrTextBodyElements.first());
        }

        // now build the actual message
        MimeMessage myMimeMessage = aMimeMessage;
        // wrap it in a MimeMessageHelper - since some things are easier with that
        MimeMessageHelper myMimeMessageHelper = new MimeMessageHelper(myMimeMessage);

        // set headers

        // subject
        myMimeMessageHelper.setSubject(TextRenderUtil.getInstance()
                .render((String) propMap.get(MessageRefProp.JR_SUBJECT.toString()), myRenderContext));

        // TODO: implement DKIM, figure out subetha

        String mySenderEmailPattern = aSendConfig.getSenderEmailPattern();
        String mySenderEmail = TextRenderUtil.getInstance().render(mySenderEmailPattern, myRenderContext);
        myMimeMessage.setSender(new InternetAddress(mySenderEmail));

        myMimeMessageHelper.setTo(aSubscriber.getEmail());

        // from
        myMimeMessageHelper.setFrom(
                TextRenderUtil.getInstance()
                        .render((String) propMap.get(MessageRefProp.JR_FROM_EMAIL.toString()), myRenderContext),
                TextRenderUtil.getInstance()
                        .render((String) propMap.get(MessageRefProp.JR_FROM_NAME.toString()), myRenderContext));

        // see how to set body

        // if we have both text and html, then do multipart
        if (myTextBody.trim().length() > 0 && myHtmlBody.trim().length() > 0) {

            // create wrapper multipart/alternative part
            MimeMultipart ma = new MimeMultipart("alternative");
            myMimeMessage.setContent(ma);
            // create the plain text
            BodyPart plainText = new MimeBodyPart();
            plainText.setText(myTextBody);
            ma.addBodyPart(plainText);
            // create the html part
            BodyPart html = new MimeBodyPart();
            html.setContent(myHtmlBody, "text/html");
            ma.addBodyPart(html);
        }

        // if only HTML, then just use that
        else if (myHtmlBody.trim().length() > 0) {
            myMimeMessageHelper.setText(myHtmlBody, true);
        }

        // if only text, then just use that
        else if (myTextBody.trim().length() > 0) {
            myMimeMessageHelper.setText(myTextBody, false);
        }

        // if neither text nor HTML, then the message is being skipped,
        // so we just return null
        else {
            return false;
        }

        return true;

    } catch (MessagingException e) {
        throw new RuntimeException(e);
    } catch (UnsupportedEncodingException e) {
        throw new RuntimeException(e);
    }

}

From source file:org.brunocvcunha.taskerbox.impl.jobs.MonsterJobSeeker.java

/**
 * Evaluates one job posting and opens it when it passes every filter.
 *
 * <p>The posting is filtered by title, employer and location first. The job page is
 * then downloaded and checked for external-apply links (unless {@code externalApply}
 * is enabled), visa and experience wording, and a minimum score. Every rejected
 * posting is recorded through {@code addAlreadyPerformedAction} so it is not
 * processed again.
 *
 * @param jobTitle    title of the posting
 * @param jobEmployer employer named in the posting
 * @param location    location of the posting
 * @param jobUrl      URL of the posting; also used as its unique key
 * @return always {@code true}
 */
private boolean handleJob(String jobTitle, String jobEmployer, String location, String jobUrl)
        throws JSONException, ClientProtocolException, IOException, URISyntaxException {

    if (alreadyPerformedAction(jobUrl)) {
        return true;
    }

    String headline = jobUrl + " - " + location + " - " + jobTitle + " - " + jobEmployer;

    if (!considerTitle(jobTitle)) {
        logInfo(log, "-- Ignored [title] " + headline);
        addAlreadyPerformedAction(jobUrl);
        return true;
    }

    if (!considerEmployer(jobEmployer)) {
        logInfo(log, "-- Ignored [employer] " + headline);
        addAlreadyPerformedAction(jobUrl);
        return true;
    }

    if (!considerLocation(location)) {
        logInfo(log, "-- Ignored [location] " + headline);
        addAlreadyPerformedAction(jobUrl);
        return true;
    }

    // Pause before downloading the job page
    try {
        Thread.sleep(1000L);
    } catch (InterruptedException e) {
        // Restore the interrupt status so callers can see the interruption
        Thread.currentThread().interrupt();
    }

    HttpEntity jobEntity = TaskerboxHttpBox.getInstance().getEntityForURL(jobUrl);
    String jobResult = TaskerboxHttpBox.getInstance().readResponseFromEntity(jobEntity);
    Document jobDocument = Jsoup.parse(jobResult);

    Elements elDescription = jobDocument.select("div#jobBodyContent");

    // Test the cheap flag first so the document is serialised only when needed, and only once
    if (!this.externalApply) {
        String pageHtml = jobDocument.html();
        if (!pageHtml.contains("ApplyOnlineUrl: ''")
                && !pageHtml.contains("ApplyOnlineUrl: 'http://my.monster.com")) {
            logInfo(log, "-- Ignored [externalApply] " + headline);
            addAlreadyPerformedAction(jobUrl);
            return true;
        }
    }

    // The description HTML is used by several checks; render it once
    String description = elDescription.html();

    if (!considerVisaDescription(description)) {
        logInfo(log, "-- Ignored [visa] " + headline);
        addAlreadyPerformedAction(jobUrl);
        return true;
    }
    if (!considerExperienceDescription(description)) {
        logInfo(log, "-- Ignored [exp] " + headline);
        addAlreadyPerformedAction(jobUrl);
        return true;
    }

    ScorerResult result = LinkedInJobDBComparer.getScore(description);

    if (result.getScore() < this.requiredScore) {
        logInfo(log,
                "-- Ignored [scorer] " + result.getScore() + " - " + result.getMatches() + " - " + headline);
        addAlreadyPerformedAction(jobUrl);
        return true;
    }

    headline = headline + " - " + result.getMatches();

    logInfo(log, "Open --> " + headline);

    performUnique(jobUrl);

    // Pause after opening a posting
    try {
        Thread.sleep(5000L);
    } catch (InterruptedException e) {
        // Restore the interrupt status so callers can see the interruption
        Thread.currentThread().interrupt();
    }

    return true;

}

From source file:org.brunocvcunha.taskerbox.impl.jobs.DiceJobSeeker.java

/**
 * Evaluates one job posting and opens it when it passes every filter.
 *
 * <p>The posting is filtered by title, employer and location first. The job page is
 * then downloaded and its description ({@code div.job_description}, falling back to
 * {@code div#detailDescription}) is checked for visa and experience wording and for a
 * minimum score. Every rejected posting is recorded through
 * {@code addAlreadyPerformedAction} so it is not processed again.
 *
 * @param jobTitle    title of the posting
 * @param jobEmployer employer named in the posting
 * @param location    location of the posting
 * @param jobUrl      URL of the posting; also used as its unique key
 * @return always {@code true}
 */
private boolean handleJob(String jobTitle, String jobEmployer, String location, String jobUrl)
        throws JSONException, ClientProtocolException, IOException, URISyntaxException {

    if (alreadyPerformedAction(jobUrl)) {
        return true;
    }

    String headline = jobUrl + " - " + location + " - " + jobTitle + " - " + jobEmployer;

    System.out.println(headline);

    if (!considerTitle(jobTitle)) {
        logInfo(log, "-- Ignored [title] " + headline);
        addAlreadyPerformedAction(jobUrl);
        return true;
    }

    if (!considerEmployer(jobEmployer)) {
        logInfo(log, "-- Ignored [employer] " + headline);
        addAlreadyPerformedAction(jobUrl);
        return true;
    }

    if (!considerLocation(location)) {
        logInfo(log, "-- Ignored [location] " + headline);
        addAlreadyPerformedAction(jobUrl);
        return true;
    }

    HttpEntity jobEntity = TaskerboxHttpBox.getInstance().getEntityForURL(jobUrl);
    String jobResult = TaskerboxHttpBox.getInstance().readResponseFromEntity(jobEntity);
    Document jobDocument = Jsoup.parse(jobResult);

    Elements elDescription = jobDocument.select("div.job_description");
    if (elDescription.isEmpty()) {
        elDescription = jobDocument.select("div#detailDescription");
    }

    // NOTE: no external-apply filtering is performed here.

    // The description HTML is used by several checks; render it once
    String description = elDescription.html();

    if (!considerVisaDescription(description)) {
        logInfo(log, "-- Ignored [visa] " + headline);
        addAlreadyPerformedAction(jobUrl);
        return true;
    }
    if (!considerExperienceDescription(description)) {
        // Logged through logInfo like every other rejection in this method
        logInfo(log, "-- Ignored [exp] " + headline);
        addAlreadyPerformedAction(jobUrl);
        return true;
    }

    ScorerResult result = LinkedInJobDBComparer.getScore(description);

    if (result.getScore() < this.requiredScore) {
        logInfo(log,
                "-- Ignored [scorer] " + result.getScore() + " - " + result.getMatches() + " - " + headline);
        addAlreadyPerformedAction(jobUrl);
        return true;
    }

    headline = headline + " - " + result.getMatches();

    logInfo(log, "Open --> " + headline);

    performUnique(jobUrl);

    // Pause after opening a posting
    try {
        Thread.sleep(5000L);
    } catch (InterruptedException e) {
        // Restore the interrupt status so callers can see the interruption
        Thread.currentThread().interrupt();
    }

    return true;

}

From source file:org.confab.PhpBB3Parser.java

/**
 * Parses each topic for a particular forum.
 *
 * <p>A topic is marked sticky when its title link is one of the links found inside a
 * table cell containing the text "Sticky:".
 *
 * @param  forum        Document of html containing topics
 * @param  parent       Forum the threads belong to
 * @return              List of ForumThread objects; empty when the forum has no topics
 */
public List<ForumThread> parseForumThreads(Document forum, Forum parent) {
    Utilities.debug("parseForumThreads");

    List<ForumThread> ret = new ArrayList<ForumThread>();

    // Get topic table
    Elements thread_table_tds = forum.select("tbody[id*=threadbits_forum_] td");
    if (thread_table_tds.isEmpty()) {
        Utilities.debug("It seems " + parent.url + " has no topics.");
        return ret;
    }

    // Get any stickies
    Elements stickies = thread_table_tds.select("td:contains(Sticky:)  a[id*=thread_title_]");

    // Get all topics
    Elements els_a = thread_table_tds.select("a[id*=thread_title_]");
    assert !els_a.isEmpty();

    // Loop topics and grab info about each
    for (Element el_a : els_a) {
        ForumThread new_topic = new ForumThread(parent);

        // Get topic 
        new_topic.title = el_a.text();
        assert new_topic.title != null;
        Utilities.debug("new_topic.title: " + new_topic.title);

        // Check if sticky: match the link element itself. A substring search of the
        // title in the stickies' HTML misfires on titles that are contained in a sticky
        // title, and misses titles whose characters are escaped in HTML (e.g. '&').
        if (stickies.contains(el_a)) {
            new_topic.isSticky = true;
            Utilities.debug("new_topic.isSticky: " + new_topic.isSticky);
        }

        // Get URL
        new_topic.url = el_a.attr("href");
        assert new_topic.url != null;
        Utilities.debug("new_topic.url:" + new_topic.url);

        ret.add(new_topic);
    }

    Utilities.debug("end parseForumThreads");
    return ret;
}

From source file:org.confab.VBulletinParser.java

/**
 * Parses the forums listed on a board's index page.
 *
 * <p>Each row of the forum table yields one Forum: the first link in the second cell
 * gives its URL and title, any further links in that cell become sub-forums, and the
 * viewer count and description are read when present. Header rows, rows with fewer
 * than three cells and rows without a link are skipped.
 *
 * @param  root         Document of html containing the forum table
 * @param  parent       BulletinBoard the forums belong to
 * @return              List of Forum objects
 */
public List<Forum> parseForums(Document root, BulletinBoard parent) {
    Utilities.debug("parseForums");

    List<Forum> ret = new ArrayList<Forum>();

    // get table
    Elements forum_table = root.select("tbody[id*=collapseobj_forumbit_] tr");
    assert !forum_table.isEmpty();

    for (Element el_tr : forum_table) {
        Forum new_forum = new Forum(parent);

        // Get the table data for this row
        Elements el_tds = el_tr.select("td");
        assert !el_tds.isEmpty() : el_tr.html();

        // xbox360achievements has a lot of subforums and puts these in their own table
        // The <a>'s are picked up as children of the parent <td> so don't parse this sub-
        // tables row's seperatly
        if (!el_tds.select("td.thead").isEmpty() || el_tds.size() < 3) {
            //Utilities.debug("tr doesn't seem to have anything we want, skipping.");
            continue;
        }

        // Get the title URL
        Elements els_a = el_tds.get(1).select("a");
        assert !els_a.isEmpty() : el_tds.html();
        if (els_a.isEmpty()) {
            // With assertions disabled the check above does nothing; skip the row
            // rather than fail with a NullPointerException below.
            continue;
        }
        Element el_title = els_a.first();
        new_forum.url = el_title.attr("href");
        assert new_forum.url != null;
        Utilities.debug("new_forum.url : " + new_forum.url);

        // Get the title text
        new_forum.title = el_title.text();
        assert new_forum.title != null;
        Utilities.debug("new_forum.title : " + new_forum.title);

        // Check for any subforums in remaining a elements. Start at index 1 instead of
        // removing the first link, so the selection (and the DOM behind it) is left
        // untouched for the lookups further down.
        for (int i = 1; i < els_a.size(); i++) {
            Element el_a = els_a.get(i);
            Forum sub_forum = new Forum(parent);
            sub_forum.url = el_a.attr("href");
            assert sub_forum.url != null;
            sub_forum.title = el_a.text();
            assert sub_forum.title != null;
            new_forum.subForums.add(sub_forum);
            Utilities.debug("added subForum: " + sub_forum.title);
        }

        // Get num viewing the current forum
        Element el_viewing = el_tr.select(":matchesOwn((\\d+ Viewing))").first();
        if (el_viewing != null) {
            new_forum.numViewing = el_viewing.text();
        } else {
            new_forum.numViewing = "0";
        }
        Utilities.debug("new_forum.numViewing : " + new_forum.numViewing);

        // Get the description/message of this topic
        Element el_description = el_tds.get(1).select("div.smallfont").first();
        if (el_description != null) {
            new_forum.description = el_description.text();
        } else {
            new_forum.description = "";
        }
        Utilities.debug("new_forum.description : " + new_forum.description);

        Utilities.debug("new_forum.parent.url : " + new_forum.parent.url);

        ret.add(new_forum);
        Utilities.debug("-----");
    }
    Utilities.debug("end parseForums");
    return ret;
}