Example usage for org.jsoup.nodes Document select

Introduction

This page collects example usages of the select method of org.jsoup.nodes.Document.

Prototype

public Elements select(String cssQuery)

Source Link

Document

Find elements that match the Selector CSS query, with this element as the starting context.

Usage

From source file:net.liuxuan.Tools.signup.SignupV2ex.java

/**
 * Downloads the v2ex sign-in page and stores every hidden form field in {@code params}.
 *
 * <p>Each hidden input, the response status line and the current contents of
 * {@code cookieStore} are echoed to standard output for debugging.
 *
 * @throws IOException if the request fails or the response body cannot be read
 */
public void getLoginForm() throws IOException {
    final String separator = "--------------";

    CloseableHttpResponse signinResponse = httpclient.execute(new HttpGet("http://v2ex.com/signin"));
    try {
        HttpEntity body = signinResponse.getEntity();
        String html = EntityUtils.toString(body);
        System.out.println(separator);
        System.out.println(separator);

        // Hidden inputs are collected as name/value pairs in params
        // (presumably including the site's one-time token - TODO confirm).
        for (Element hidden : Jsoup.parse(html).select("input[type=hidden]")) {
            String fieldName = hidden.attr("name");
            String fieldValue = hidden.attr("value");
            params.add(new BasicNameValuePair(fieldName, fieldValue));
            System.out.println(hidden.toString());
            System.out.println(fieldName);
            System.out.println(fieldValue);
        }

        for (int n = 0; n < 4; n++) {
            System.out.println(separator);
        }
        System.out.println("Login form get: " + signinResponse.getStatusLine());
        EntityUtils.consume(body);

        System.out.println("Initial set of cookies:");
        List<Cookie> jar = cookieStore.getCookies();
        if (jar.isEmpty()) {
            System.out.println("None");
        } else {
            for (Cookie cookie : jar) {
                System.out.println("- " + cookie.toString());
            }
        }
    } finally {
        signinResponse.close();
    }
}

From source file:edu.usu.sdl.openstorefront.report.ExternalLinkValidationReport.java

/**
 * Collects the links to validate into {@code links} and then calls {@link #checkLinks()}.
 *
 * <p>Two sources are scanned for every active, approved component: anchors found in the
 * component description, and the links of the component's active resources.
 */
@Override
protected void gatherData() {
    // Active resources, grouped by the id of the component they belong to.
    ComponentResource resourceProbe = new ComponentResource();
    resourceProbe.setActiveStatus(ComponentResource.ACTIVE_STATUS);
    List<ComponentResource> activeResources = service.getPersistenceService()
            .queryByExample(ComponentResource.class, resourceProbe);

    Map<String, List<ComponentResource>> resourcesByComponent = new HashMap<>();
    for (ComponentResource activeResource : activeResources) {
        resourcesByComponent
                .computeIfAbsent(activeResource.getComponentId(), id -> new ArrayList<>())
                .add(activeResource);
    }

    // Active + approved components, keyed by id.
    Component componentProbe = new Component();
    componentProbe.setActiveStatus(Component.ACTIVE_STATUS);
    componentProbe.setApprovalState(ApprovalStatus.APPROVED);
    List<Component> approvedComponents = service.getPersistenceService().queryByExample(Component.class,
            componentProbe);

    Map<String, Component> componentsById = new HashMap<>();
    for (Component approved : approvedComponents) {
        componentsById.put(approved.getComponentId(), approved);
    }

    // Extract all links. The description-link counter keeps running across components.
    long descriptionLinkSeq = 1;
    for (Component component : componentsById.values()) {

        Elements descriptionAnchors = Jsoup.parseBodyFragment(component.getDescription()).select("a");
        for (Element anchor : descriptionAnchors) {
            String href = anchor.attr("href");
            LinkCheckModel descriptionLink = new LinkCheckModel();
            descriptionLink.setId(component.getComponentId() + "-" + (descriptionLinkSeq++));
            descriptionLink.setComponentName(component.getName());
            descriptionLink.setLink(href);
            descriptionLink.setNetworkOfLink(getNetworkOfLink(href));
            descriptionLink.setResourceType("Description Link");
            descriptionLink.setSecurityMarking(component.getSecurityMarkingType());
            links.add(descriptionLink);
        }

        List<ComponentResource> ownedResources = resourcesByComponent.get(component.getComponentId());
        if (ownedResources == null) {
            continue;
        }

        for (ComponentResource resource : ownedResources) {
            String target = resource.getLink();

            // Blank means it's an internal resource
            if (!StringUtils.isNotBlank(target)) {
                continue;
            }

            // The stored link may be an HTML anchor; use the href of its first <a>.
            if (target.toLowerCase().contains("<a")) {
                Element firstAnchor = Jsoup.parseBodyFragment(target).select("a").first();
                if (firstAnchor != null) {
                    target = firstAnchor.attr("href");
                }
            }

            LinkCheckModel resourceLink = new LinkCheckModel();
            resourceLink.setId(component.getComponentId() + "-" + resource.getResourceId());
            resourceLink.setComponentName(component.getName());
            resourceLink.setLink(target);
            // NOTE(review): the network is derived from the raw stored link, not the
            // extracted href - confirm this is intended.
            resourceLink.setNetworkOfLink(getNetworkOfLink(resource.getLink()));
            resourceLink.setResourceType(
                    TranslateUtil.translate(ResourceType.class, resource.getResourceType()));
            resourceLink.setSecurityMarking(resource.getSecurityMarkingType());
            links.add(resourceLink);
        }
    }
    checkLinks();
}

From source file:com.sinelead.car.club.NewsFragment.java

/**
 * Fetches the mobile front page through {@code HttpCache} and refreshes the banner list
 * from the anchors found inside the first {@code ul.slides} element.
 *
 * <p>If the request fails or the page has no {@code ul.slides} element, the current
 * banner list and adapter are left untouched.
 */
public void parseNewsUrl() {
    HttpCache httpCache = new HttpCache(context);
    httpCache.httpGet("http://m.xincheping.com/", new HttpCacheListener() {

        protected void onPreGet() {
            // do something like show progressBar before httpGet, runs on
            // the UI thread
        }

        protected void onPostGet(HttpResponse httpResponse, boolean isInCache) {
            // do something like show data after httpGet, runs on the UI
            // thread
            if (httpResponse == null) {
                // get data fail
                return;
            }

            // get data success
            String html = httpResponse.getResponseBody();
            Document doc = Jsoup.parse(html);
            Elements uls = doc.select("ul.slides"); // <ul> elements with class "slides"

            // first() returns null on an empty selection; bail out instead of crashing
            // when the page layout has changed.
            if (uls.isEmpty()) {
                return;
            }

            bannerList = uls.first().getElementsByTag("a");

            if (imagePagerAdapter != null) {
                imagePagerAdapter.setBannerList(bannerList);
                imagePagerAdapter.notifyDataSetChanged();
            }
        }
    });
}

From source file:crawler.AScraper.java

/**
 * Splits a fetched page into the elements whose text contains {@code ANCHOR_TEXT_PATTERN}.
 *
 * <p>The body is re-encoded to ISO-8859-1 bytes and decoded again as GBK before parsing.
 * Only anchors inside {@code div} elements whose id starts with "read" are traversed;
 * the traversal also visits the nodes nested inside those anchors.
 *
 * @param payload the HTTP response holding the page HTML
 * @return the matching elements, or {@code null} if one of the charsets is unsupported
 */
@Splitter(inputChannel = "channel1", outputChannel = "channel2")
public List<Element> scrape(ResponseEntity<String> payload) {
    final String rawBody = payload.getBody();

    final String decodedBody;
    try {
        decodedBody = new String(rawBody.getBytes("ISO-8859-1"), "GBK");
    } catch (UnsupportedEncodingException e) {
        LOG.error("Unsupported page encoding.");
        return null;
    }

    final Document page = Jsoup.parse(decodedBody);
    final Elements candidates = page.select("body").select("div[id^=read]").select("a");

    final List<Element> matches = new ArrayList<>();
    candidates.traverse(new NodeVisitor() {
        @Override
        public void head(Node node, int depth) {
            if (!(node instanceof Element)) {
                return;
            }
            Element visited = (Element) node;
            if (StringUtils.containsIgnoreCase(visited.text(), ANCHOR_TEXT_PATTERN)) {
                matches.add(visited);
            }
        }

        @Override
        public void tail(Node node, int depth) {
            // nothing to do when leaving a node
        }
    });
    return matches;
}

From source file:org.brunocvcunha.taskerbox.impl.jobs.LinkedInJobSeeker.java

/**
 * Obtains the shared HTTP client and logs it in to LinkedIn.
 *
 * <p>The front page is fetched first to read the {@code loginCsrfParam} and
 * {@code csrfToken} hidden inputs, which are then posted together with the stored
 * credentials to the login-submit endpoint.
 *
 * @param fetchCookie not used by this method
 * @throws IOException if either HTTP exchange fails
 */
public void bootstrapLinkedInHttpClient(boolean fetchCookie)
        throws ClientProtocolException, IllegalStateException, IOException, URISyntaxException {
    this.httpClient = TaskerboxHttpBox.getInstance().getHttpClient();

    HttpGet get = new HttpGet("https://www.linkedin.com/");
    HttpResponse getResponse = this.httpClient.execute(get);

    String getContent = EntityUtils.toString(getResponse.getEntity());

    Document getDoc = Jsoup.parse(getContent);

    String loginCsrfParam = getDoc.select("input[name=loginCsrfParam]").attr("value");
    String csrfToken = getDoc.select("input[name=csrfToken]").attr("value");

    logInfo(log, loginCsrfParam);

    HttpPost post = new HttpPost("https://www.linkedin.com/uas/login-submit");
    List<NameValuePair> pairs2 = new ArrayList<>();
    pairs2.add(new BasicNameValuePair("isJsEnabled", "true"));
    pairs2.add(new BasicNameValuePair("source_app", ""));
    pairs2.add(new BasicNameValuePair("session_key", this.userEmail));
    pairs2.add(new BasicNameValuePair("session_password", this.userPassword));
    pairs2.add(new BasicNameValuePair("session_redirect", ""));
    pairs2.add(new BasicNameValuePair("trk", ""));
    pairs2.add(new BasicNameValuePair("loginCsrfParam", loginCsrfParam));
    pairs2.add(new BasicNameValuePair("fromEmail", ""));
    pairs2.add(new BasicNameValuePair("csrfToken", csrfToken));
    pairs2.add(new BasicNameValuePair("sourceAlias", "0_7r5yezRXCiA_H0CRD8sf6DhOjTKUNps5xGTqeX8EEoi"));
    pairs2.add(new BasicNameValuePair("client_ts", "1413507675390"));
    pairs2.add(new BasicNameValuePair("client_r", "a@gmail.com:812661382:422199706:736472965"));
    pairs2.add(new BasicNameValuePair("client_output", "-1850142"));
    pairs2.add(new BasicNameValuePair("client_n", "812661382:422199706:736472965"));
    pairs2.add(new BasicNameValuePair("client_v", "1.0.1"));

    UrlEncodedFormEntity entity2 = new UrlEncodedFormEntity(pairs2);
    post.setEntity(entity2);

    // The login response body is not needed, but it must be consumed so the
    // underlying connection is released for the requests that follow.
    HttpResponse postResponse = this.httpClient.execute(post);
    EntityUtils.consume(postResponse.getEntity());

}

From source file:com.liato.bankdroid.banking.banks.coop.Coop.java

/**
 * Logs in to coop.se: reads the {@code pageGuid} hidden input from the start page and
 * posts it with the credentials as JSON to the authentication service.
 *
 * @return a login package already marked as logged in
 * @throws BankException if the start page has no {@code pageGuid} input or the
 *         authentication service does not answer with HTTP 200
 * @throws IOException if a request fails
 */
@Override
protected LoginPackage preLogin() throws BankException, ClientProtocolException, IOException {
    urlopen = new Urllib(context,
            CertificateReader.getCertificates(context, R.raw.cert_coop, R.raw.cert_coop2));
    urlopen.addHeader("Origin", "https://www.coop.se");
    urlopen.addHeader("Referer", "https://www.coop.se/Mina-sidor/Logga-in-puffsida/?li=True");
    response = urlopen.open("https://www.coop.se/");
    Document d = Jsoup.parse(response);

    // first() is null when the input is missing (e.g. after a site redesign); report that
    // as a bank error instead of failing with a NullPointerException.
    org.jsoup.nodes.Element pageGuidInput = d.select("input[name=pageGuid]").first();
    if (pageGuidInput == null) {
        throw new BankException("Coop start page did not contain the expected pageGuid input");
    }
    String pageGuid = pageGuidInput.val();

    WebAuthenticateRequest webAuthReq = new WebAuthenticateRequest(pageGuid, username, password);
    urlopen.addHeader("Content-Type", "application/json");
    HttpEntity e = new StringEntity(getObjectmapper().writeValueAsString(webAuthReq));

    HttpResponse httpResponse = urlopen
            .openAsHttpResponse("https://www.coop.se/Services/PlainService.svc/JsonExecute", e, true);
    if (httpResponse.getStatusLine().getStatusCode() != 200) {
        throw new BankException(res.getString(R.string.invalid_username_password));
    }

    LoginPackage lp = new LoginPackage(urlopen, null, response, "https://www.coop.se/Mina-sidor/Oversikt/");
    lp.setIsLoggedIn(true);
    return lp;
}

From source file:com.maxl.java.aips2xml.Aips2Xml.java

/**
 * Rewrites one product monograph from HTML into the project's XML-like markup.
 *
 * <p>Known {@code div} classes are renamed to custom tags, presentational attributes are
 * stripped, a fallback {@code owner} element and a {@code swissmedicno5} element are
 * inserted, and the inner HTML of {@code body} is post-processed with string replacements.
 * The resulting lines are concatenated without line separators, and a third or later
 * consecutive {@code <p></p>} line is dropped.
 *
 * @param med_title title to write into the {@code title} element
 * @param html_str  HTML source of the monograph
 * @param regnr_str text to write into the {@code swissmedicno5} element
 * @return the converted markup as a single string
 */
static String convertHtmlToXml(String med_title, String html_str, String regnr_str) {
    Document mDoc = Jsoup.parse(html_str);
    mDoc.outputSettings().escapeMode(EscapeMode.xhtml);
    mDoc.outputSettings().prettyPrint(true);
    mDoc.outputSettings().indentAmount(4);

    // <div id="monographie"> -> <fi>
    mDoc.select("div[id=monographie]").tagName("fi").removeAttr("id");
    // <div class="MonTitle"> -> <title>
    mDoc.select("div[class=MonTitle]").tagName("title").removeAttr("class").removeAttr("id");
    // Beautify the title to the best of my possibilities ... still not good enough!
    String title_str = mDoc.select("title").text().trim().replaceAll("<br />", "").replaceAll("(\\t|\\r?\\n)+",
            "");
    if (!title_str.equals(med_title)) {
        if (SHOW_ERRORS) {
            System.err.println(med_title + " differs from " + title_str);
        }
    }
    // Fallback solution: use title from the header AIPS.xml file - the titles look all pretty good!
    mDoc.select("title").first().text(med_title);
    // <div class="ownerCompany"> -> <owner>
    Element owner_elem = mDoc.select("div[class=ownerCompany]").first();
    if (owner_elem != null) {
        owner_elem.tagName("owner").removeAttr("class");
        String owner_str = mDoc.select("owner").text();
        mDoc.select("owner").first().text(owner_str);
    } else {
        // No owner in the source: insert an empty one with a language-specific placeholder
        mDoc.select("title").after("<owner></owner>");
        if (DB_LANGUAGE.equals("de")) {
            mDoc.select("owner").first().text("k.A.");
        } else if (DB_LANGUAGE.equals("fr")) {
            mDoc.select("owner").first().text("n.s.");
        }
    }

    // <div class="paragraph"> -> <paragraph>
    mDoc.select("div[class=paragraph]").tagName("paragraph").removeAttr("class").removeAttr("id");
    // <div class="absTitle"> -> <paragraphtitle>
    mDoc.select("div[class=absTitle]").tagName("paragraphtitle").removeAttr("class");
    // <div class="untertitle1"> -> <paragraphsubtitle>
    mDoc.select("div[class=untertitle1]").tagName("paragraphsubtitle").removeAttr("class");
    // <div class="untertitle"> -> <paragraphsubtitle>
    mDoc.select("div[class=untertitle]").tagName("paragraphsubtitle").removeAttr("class");
    // <div class="shortCharacteristic"> -> <characteristic>
    mDoc.select("div[class=shortCharacteristic]").tagName("characteristic").removeAttr("class");
    // <div class="image"> -> <image>
    mDoc.select("div[class=image]").tagName("image").removeAttr("class");

    // <p class="spacing1"> -> <p> / <p class="noSpacing"> -> <p>
    mDoc.select("p[class]").tagName("p").removeAttr("class");
    // <span style="font-style:italic"> -> <i>
    mDoc.select("span").tagName("i").removeAttr("style");
    // <i class="indention1"> -> <i> / <i class="indention2"> -> <i>
    mDoc.select("i[class=indention1]").tagName("i").removeAttr("class");
    mDoc.select("i[class=indention2]").tagName("i").removeAttr("class");
    // Flatten paragraph titles and subtitles to plain text (drops any nested markup)
    Elements elems = mDoc.select("paragraphtitle");
    for (Element e : elems) {
        if (!e.text().isEmpty()) {
            e.text(e.text());
        }
    }
    elems = mDoc.select("paragraphsubtitle");
    for (Element e : elems) {
        if (!e.text().isEmpty()) {
            e.text(e.text());
        }
    }

    // Here we take care of tables
    // <table class="s21"> -> <table>
    mDoc.select("table[class]").removeAttr("class");
    mDoc.select("table").removeAttr("cellspacing").removeAttr("cellpadding").removeAttr("border");
    mDoc.select("colgroup").remove();
    mDoc.select("td").removeAttr("class").removeAttr("colspan").removeAttr("rowspan");
    mDoc.select("tr").removeAttr("class");
    // Drop remaining classed divs that carry no text
    elems = mDoc.select("div[class]");
    for (Element e : elems) {
        if (e.text().isEmpty()) {
            e.remove();
        }
    }

    mDoc.select("tbody").unwrap();
    // Remove nested table (a nasty table-in-a-table)
    Elements nested_table = mDoc.select("table").select("tr").select("td").select("table");
    if (!nested_table.isEmpty()) {
        nested_table.select("table").unwrap();
    }

    // Here we take care of the images
    mDoc.select("img").removeAttr("style").removeAttr("align").removeAttr("border");

    // Subs and sups
    mDoc.select("sub[class]").tagName("sub").removeAttr("class");
    mDoc.select("sup[class]").tagName("sup").removeAttr("class");
    mDoc.select("td").select("sub").tagName("td-sub");
    mDoc.select("td").select("sup").tagName("td-sup");
    // Remove floating <td-sup> tags
    mDoc.select("p").select("td-sup").tagName("sup");
    mDoc.select("p").select("td-sub").tagName("sub");

    // Box
    mDoc.select("div[class=box]").tagName("box").removeAttr("class");

    // Insert swissmedicno5 after <owner> tag (closing tag was previously missing its '>')
    mDoc.select("owner").after("<swissmedicno5></swissmedicno5>");
    mDoc.select("swissmedicno5").first().text(regnr_str);

    // Remove html, head and body tags
    String xml_str = mDoc.select("body").first().html();

    xml_str = xml_str.replaceAll("<sup> </sup>", "");
    xml_str = xml_str.replaceAll("<sub> </sub>", "");
    xml_str = xml_str.replaceAll("<p> <i>", "<p><i>");
    xml_str = xml_str.replaceAll("</p> </td>", "</p></td>");
    xml_str = xml_str.replaceAll("<p> </p>", "<p></p>"); // MUST be improved, the space is not a real space!!
    // NOTE(review): the pattern literal below appears empty here - it presumably held a
    // non-printable glyph that was lost. An empty regex would insert "- " between every
    // character, so confirm the intended character against the upstream source.
    xml_str = xml_str.replaceAll("", "- ");
    xml_str = xml_str.replaceAll("<br />", "");
    xml_str = xml_str.replaceAll("(?m)^[ \t]*\r?\n", "");

    // Remove multiple instances of <p></p>: within a run of such lines only the first two
    // are kept. Lines are joined without separators.
    Scanner scanner = new Scanner(xml_str);
    StringBuilder new_xml_str = new StringBuilder(xml_str.length());
    int counter = 0;
    while (scanner.hasNextLine()) {
        String line = scanner.nextLine();
        if (line.trim().equals("<p></p>")) {
            counter++;
        } else {
            counter = 0;
        }
        if (counter < 3) {
            new_xml_str.append(line);
        }
    }
    scanner.close();

    return new_xml_str.toString();
}

From source file:me.rkfg.xmpp.bot.plugins.CoolStoryPlugin.java

/**
 * Downloads a story from the given website and returns it as plain text.
 *
 * <p>When {@code CONFIG_REROLL_LONG_STORIES} is set, a story exceeding
 * {@code CONFIG_MAX_STORY_LENGTH} or {@code CONFIG_MAX_STORY_LINES} is fetched again for as
 * long as the attempt counter is not above {@code CONFIG_MAX_ROLLS}; the last fetched story
 * is returned either way.
 *
 * @param website source of the URL and of the CSS query locating the story element
 * @return the story text, or {@code ERROR_COULD_NOT_PARSE} if the query matches nothing
 * @throws IOException if the page cannot be fetched
 */
private String fetchStory(Website website) throws IOException {
    int attempts = 0;

    while (true) {
        attempts++;

        final Document page = Jsoup.connect(website.getUrlString()).userAgent(DEFAULT_UA).get();
        page.outputSettings(new Document.OutputSettings().prettyPrint(false));
        logger.info("Fetched a story from {}", page.location());

        final Element storyRoot = page.select(website.getCssQuery()).first();
        if (storyRoot == null) {
            return ERROR_COULD_NOT_PARSE;
        }

        // Strip nested divs and replace every image with its source URL as text.
        storyRoot.select("div").remove();
        for (Element image : storyRoot.select("img")) {
            image.replaceWith(new TextNode(image.attr("src"), ""));
        }

        // Mark line breaks with a literal backslash-n so they survive HTML serialization,
        // then turn the markers into real newlines.
        storyRoot.select("br").after("\\n");
        storyRoot.select("p").before("\\n\\n");
        final String markedHtml = storyRoot.html().replaceAll("\\\\n", "\n");

        final Document.OutputSettings rawOutput = new Document.OutputSettings().prettyPrint(false);
        final String text = Jsoup.clean(markedHtml, "", Whitelist.none(), rawOutput).trim();
        final int textLength = text.length();
        final int textLines = countLines(text);

        final boolean tooLong = textLength > CONFIG_MAX_STORY_LENGTH || textLines > CONFIG_MAX_STORY_LINES;
        //noinspection ConstantConditions
        if (!(CONFIG_REROLL_LONG_STORIES && tooLong && attempts <= CONFIG_MAX_ROLLS)) {
            return text;
        }
    }
}

From source file:com.aurel.track.exchange.docx.exporter.PreprocessImage.java

/**
 * Removes the HTML5 figure tag and saves the figcaption in the &lt;img&gt; tag's "alt"
 * attribute for later use.
 *
 * <p>A figure that contains no img element is left unchanged.
 *
 * @param htmlContent the HTML fragment to preprocess
 * @return the parsed document in which each figure containing an img has been replaced by
 *         that img
 */
private Document removeFigureSaveFigcaption(String htmlContent) {
    Document doc = Jsoup.parseBodyFragment(htmlContent);
    //figure is a HTML5 tag not accepted by Tidy, so it should be replaced by the content <img>-tag, and the figcaption is saved in the "alt" attribute
    for (Element figureElement : doc.select("figure")) {
        Node imageNode = null;
        Element figcaptionNode = null;
        boolean captionStored = false;
        for (Element figureChild : figureElement.getAllElements()) {
            String childName = figureChild.nodeName();
            if ("img".equals(childName)) {
                imageNode = figureChild;
            } else if ("figcaption".equals(childName)) {
                figcaptionNode = figureChild;
                //set "figcaption" text as value for "alt" attribute
                if (imageNode != null) {
                    imageNode.attr("alt", figcaptionNode.text());
                    captionStored = true;
                }
            }
        }
        if (imageNode != null) {
            // The caption may precede the image inside the figure; in that case it has
            // not been stored yet, so store it now instead of losing it.
            if (figcaptionNode != null && !captionStored) {
                imageNode.attr("alt", figcaptionNode.text());
            }
            figureElement.replaceWith(imageNode);
        }
    }
    return doc;
}

From source file:com.github.binlee1990.spider.video.spider.PersonCrawler.java

/**
 * Handles one crawled page: extracts the video identification code and either inserts a
 * new video record or increments the occurrence counter of the existing one, then creates
 * the actress and category records for it.
 *
 * <p>Pages whose URL does not start with the expected prefix, non-HTML pages and pages
 * without an identification code are ignored.
 *
 * @param page the crawled page
 */
@Override
public void visit(Page page) {
    String url = page.getWebURL().getURL();

    if (!url.startsWith("http://www.javlibrary.com/cn/?v=jav")) {
        return;
    }

    if (!(page.getParseData() instanceof HtmlParseData)) {
        return;
    }

    HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
    Document doc = Jsoup.parse(htmlParseData.getHtml());

    // first() is null when the page has no id cell; skip such pages instead of crashing.
    org.jsoup.nodes.Element idCell = doc.select("div#video_id td.text").first();
    if (idCell == null) {
        logger.warn("no video id found on " + url);
        return;
    }

    String videoIdentificationCode = idCell.text();
    if (!StringUtils.isNotBlank(videoIdentificationCode)) {
        return;
    }

    Video queryVideo = new Video();
    queryVideo.setIdentificationCode(videoIdentificationCode);
    queryVideo.setUrl(url);
    Video video = videoMapper.queryByVideo(queryVideo);

    if (null == video) {
        video = generateVideo(url, doc, videoIdentificationCode);

        try {
            videoMapper.insertSelective(video);
        } catch (Exception e) {
            // Best effort: keep going, but make the failure visible.
            logger.warn("failed to insert video " + videoIdentificationCode, e);
        }

        logger.warn("==================handle " + video.getIdentificationCode() + "\n"
                + JSON.toJSONString(video));
    } else {
        Date now = new Date();
        // Re-read and retry until the update reports exactly one affected row
        // (presumably a compare-and-set on the previous count - TODO confirm).
        while (true) {
            Video v = videoMapper.queryByVideo(queryVideo);
            int number = v.getOccurNumber();
            int updateNumber = number + 1;
            int c = videoMapper.updateOccurNumberById(v.getId(), updateNumber, number, now);
            if (c == 1) {
                break;
            }
        }
    }

    int videoId = videoMapper.queryByVideo(video).getId();

    try {
        createVideoActress(doc, videoId);
    } catch (Exception e) {
        logger.warn("failed to create actresses for video " + videoId, e);
    }

    try {
        createVideoCategory(doc, videoId);
    } catch (Exception e) {
        logger.warn("failed to create categories for video " + videoId, e);
    }
}