List of usage examples for org.jsoup.nodes Document select
public Elements select(String cssQuery)
From source file:net.liuxuan.Tools.signup.SignupV2ex.java
public void getLoginForm() throws IOException { HttpGet httpget = new HttpGet("http://v2ex.com/signin"); CloseableHttpResponse response1 = httpclient.execute(httpget); try {/*from w w w.j ava2 s . co m*/ HttpEntity entity = response1.getEntity(); //?once String content = EntityUtils.toString(entity); // System.out.println(content); System.out.println("--------------"); System.out.println("--------------"); Document doc = Jsoup.parse(content); // Elements inputs = doc.select("input[type=text]"); Elements inputs = doc.select("input[type=hidden]"); for (int i = 0; i < inputs.size(); i++) { Element element = inputs.get(i); params.add(new BasicNameValuePair(element.attr("name"), element.attr("value"))); // params.put(element.attr("name"), element.attr("value")); System.out.println(element.toString()); System.out.println(element.attr("name")); System.out.println(element.attr("value")); } System.out.println("--------------"); System.out.println("--------------"); System.out.println("--------------"); System.out.println("--------------"); System.out.println("Login form get: " + response1.getStatusLine()); EntityUtils.consume(entity); System.out.println("Initial set of cookies:"); List<Cookie> cookies = cookieStore.getCookies(); if (cookies.isEmpty()) { System.out.println("None"); } else { for (int i = 0; i < cookies.size(); i++) { System.out.println("- " + cookies.get(i).toString()); } } } finally { response1.close(); } // HttpUriRequest login = RequestBuilder.post() // .setUri(new URI("http://v2ex.com/signin")) // .addParameter("u", "mosliu") // .addParameter("p", "mosesmoses") // .build(); // CloseableHttpResponse response2 = httpclient.execute(login); // try { // HttpEntity entity = response2.getEntity(); // // System.out.println("Login form get: " + response2.getStatusLine()); // // EntityUtils.consume(entity); // // System.out.println("Post logon cookies:"); // List<Cookie> cookies = cookieStore.getCookies(); // if (cookies.isEmpty()) { // System.out.println("None"); 
// } else { // for (int i = 0; i < cookies.size(); i++) { // System.out.println("- " + cookies.get(i).toString()); // } // } // // // // } finally { // response2.close(); // } // // // httpget = new HttpGet("http://v2ex.com/signin"); // response1 = httpclient.execute(httpget); // try { // HttpEntity entity = response1.getEntity(); // String content = EntityUtils.toString(entity); // System.out.println("-----------------content---------------------"); // System.out.println(content); // // EntityUtils.consume(entity); // } finally { // response1.close(); // } // // }
From source file:edu.usu.sdl.openstorefront.report.ExternalLinkValidationReport.java
@Override protected void gatherData() { ComponentResource componentResourceExample = new ComponentResource(); componentResourceExample.setActiveStatus(ComponentResource.ACTIVE_STATUS); List<ComponentResource> componentResources = service.getPersistenceService() .queryByExample(ComponentResource.class, componentResourceExample); Map<String, List<ComponentResource>> resourceMap = new HashMap<>(); componentResources.forEach(resource -> { if (resourceMap.containsKey(resource.getComponentId())) { resourceMap.get(resource.getComponentId()).add(resource); } else {//from w ww. j a va2 s . co m List<ComponentResource> resources = new ArrayList<>(); resources.add(resource); resourceMap.put(resource.getComponentId(), resources); } }); Component componentExample = new Component(); componentExample.setActiveStatus(Component.ACTIVE_STATUS); componentExample.setApprovalState(ApprovalStatus.APPROVED); List<Component> components = service.getPersistenceService().queryByExample(Component.class, componentExample); Map<String, Component> componentMap = new HashMap<>(); components.forEach(component -> { componentMap.put(component.getComponentId(), component); }); //exact all links long linkCountId = 1; for (Component component : componentMap.values()) { Document doc = Jsoup.parseBodyFragment(component.getDescription()); Elements elements = doc.select("a"); for (Element element : elements) { String link = element.attr("href"); LinkCheckModel linkCheckModel = new LinkCheckModel(); linkCheckModel.setId(component.getComponentId() + "-" + (linkCountId++)); linkCheckModel.setComponentName(component.getName()); linkCheckModel.setLink(link); linkCheckModel.setNetworkOfLink(getNetworkOfLink(link)); linkCheckModel.setResourceType("Description Link"); linkCheckModel.setSecurityMarking(component.getSecurityMarkingType()); links.add(linkCheckModel); } List<ComponentResource> resources = resourceMap.get(component.getComponentId()); if (resources != null) { for (ComponentResource resource : 
resources) { String link = resource.getLink(); //Blank means it's an internal resource if (StringUtils.isNotBlank(link)) { if (link.toLowerCase().contains("<a")) { doc = Jsoup.parseBodyFragment(link); elements = doc.select("a"); for (Element element : elements) { link = element.attr("href"); break; } } LinkCheckModel linkCheckModel = new LinkCheckModel(); linkCheckModel.setId(component.getComponentId() + "-" + resource.getResourceId()); linkCheckModel.setComponentName(component.getName()); linkCheckModel.setLink(link); linkCheckModel.setNetworkOfLink(getNetworkOfLink(resource.getLink())); linkCheckModel.setResourceType( TranslateUtil.translate(ResourceType.class, resource.getResourceType())); linkCheckModel.setSecurityMarking(resource.getSecurityMarkingType()); links.add(linkCheckModel); } } } } checkLinks(); }
From source file:com.sinelead.car.club.NewsFragment.java
public void parseNewsUrl() { HttpCache httpCache = new HttpCache(context); httpCache.httpGet("http://m.xincheping.com/", new HttpCacheListener() { protected void onPreGet() { // do something like show progressBar before httpGet, runs on // the UI thread }//from w ww .j a v a 2 s. com protected void onPostGet(HttpResponse httpResponse, boolean isInCache) { // do something like show data after httpGet, runs on the UI // thread if (httpResponse != null) { // get data success String html = httpResponse.getResponseBody(); Document doc = Jsoup.parse(html); Elements uls = doc.select("ul.slides"); // classul bannerList = uls.first().getElementsByTag("a"); if (imagePagerAdapter != null) { imagePagerAdapter.setBannerList(bannerList); imagePagerAdapter.notifyDataSetChanged(); } } else { // get data fail } } }); return; }
From source file:crawler.AScraper.java
@Splitter(inputChannel = "channel1", outputChannel = "channel2") public List<Element> scrape(ResponseEntity<String> payload) { String html = payload.getBody(); final Document htmlDoc; try {//from w ww . j av a 2s . c om htmlDoc = Jsoup.parse(new String(html.getBytes("ISO-8859-1"), "GBK")); } catch (UnsupportedEncodingException e) { LOG.error("Unsupported page encoding."); return null; } final Elements anchorNodes = htmlDoc.select("body").select("div[id^=read]").select("a"); final List<Element> anchorList = new ArrayList<>(); anchorNodes.traverse(new NodeVisitor() { @Override public void head(org.jsoup.nodes.Node node, int depth) { if (node instanceof org.jsoup.nodes.Element) { Element e = (Element) node; if (StringUtils.containsIgnoreCase(e.text(), ANCHOR_TEXT_PATTERN)) { anchorList.add(e); } } } @Override public void tail(Node node, int depth) { } }); return anchorList; }
From source file:org.brunocvcunha.taskerbox.impl.jobs.LinkedInJobSeeker.java
/**
 * Obtains the shared HTTP client and signs in to LinkedIn with {@code userEmail} and
 * {@code userPassword}.
 *
 * <p>The home page is fetched first to read the {@code loginCsrfParam} and
 * {@code csrfToken} hidden inputs, which are then posted back with the credentials to the
 * login-submit endpoint.
 *
 * @param fetchCookie not read by this method; kept so existing callers keep compiling
 * @throws ClientProtocolException on an HTTP protocol error
 * @throws IllegalStateException declared by the original signature
 * @throws IOException if a request fails or a response body cannot be read
 * @throws URISyntaxException declared by the original signature
 */
public void bootstrapLinkedInHttpClient(boolean fetchCookie)
        throws ClientProtocolException, IllegalStateException, IOException, URISyntaxException {
    this.httpClient = TaskerboxHttpBox.getInstance().getHttpClient();

    HttpGet get = new HttpGet("https://www.linkedin.com/");
    HttpResponse getResponse = this.httpClient.execute(get);
    // EntityUtils.toString reads the body to the end, which also releases the connection.
    String getContent = EntityUtils.toString(getResponse.getEntity());

    Document getDoc = Jsoup.parse(getContent);
    String loginCsrfParam = getDoc.select("input[name=loginCsrfParam]").attr("value");
    String csrfToken = getDoc.select("input[name=csrfToken]").attr("value");
    logInfo(log, loginCsrfParam);

    HttpPost post = new HttpPost("https://www.linkedin.com/uas/login-submit");
    List<NameValuePair> formFields = new ArrayList<>();
    formFields.add(new BasicNameValuePair("isJsEnabled", "true"));
    formFields.add(new BasicNameValuePair("source_app", ""));
    formFields.add(new BasicNameValuePair("session_key", this.userEmail));
    formFields.add(new BasicNameValuePair("session_password", this.userPassword));
    formFields.add(new BasicNameValuePair("session_redirect", ""));
    formFields.add(new BasicNameValuePair("trk", ""));
    formFields.add(new BasicNameValuePair("loginCsrfParam", loginCsrfParam));
    formFields.add(new BasicNameValuePair("fromEmail", ""));
    formFields.add(new BasicNameValuePair("csrfToken", csrfToken));
    formFields.add(new BasicNameValuePair("sourceAlias", "0_7r5yezRXCiA_H0CRD8sf6DhOjTKUNps5xGTqeX8EEoi"));
    formFields.add(new BasicNameValuePair("client_ts", "1413507675390"));
    formFields.add(new BasicNameValuePair("client_r", "a@gmail.com:812661382:422199706:736472965"));
    formFields.add(new BasicNameValuePair("client_output", "-1850142"));
    formFields.add(new BasicNameValuePair("client_n", "812661382:422199706:736472965"));
    formFields.add(new BasicNameValuePair("client_v", "1.0.1"));

    // Encode explicitly as UTF-8: the single-argument constructor falls back to
    // ISO-8859-1, which corrupts non-ASCII characters in the e-mail or password.
    post.setEntity(new UrlEncodedFormEntity(formFields, "UTF-8"));

    // The response body is not needed, but it must be consumed so the connection goes
    // back to the pool instead of leaking.
    HttpResponse postResponse = this.httpClient.execute(post);
    EntityUtils.consume(postResponse.getEntity());
}
From source file:com.liato.bankdroid.banking.banks.coop.Coop.java
@Override protected LoginPackage preLogin() throws BankException, ClientProtocolException, IOException { urlopen = new Urllib(context, CertificateReader.getCertificates(context, R.raw.cert_coop, R.raw.cert_coop2)); urlopen.addHeader("Origin", "https://www.coop.se"); urlopen.addHeader("Referer", "https://www.coop.se/Mina-sidor/Logga-in-puffsida/?li=True"); response = urlopen.open("https://www.coop.se/"); Document d = Jsoup.parse(response); String pageGuid = d.select("input[name=pageGuid]").first().val(); WebAuthenticateRequest webAuthReq = new WebAuthenticateRequest(pageGuid, username, password); urlopen.addHeader("Content-Type", "application/json"); HttpEntity e = new StringEntity(getObjectmapper().writeValueAsString(webAuthReq)); HttpResponse httpResponse = urlopen//w w w . j ava 2 s . c o m .openAsHttpResponse("https://www.coop.se/Services/PlainService.svc/JsonExecute", e, true); if (httpResponse.getStatusLine().getStatusCode() != 200) { throw new BankException(res.getString(R.string.invalid_username_password)); } LoginPackage lp = new LoginPackage(urlopen, null, response, "https://www.coop.se/Mina-sidor/Oversikt/"); lp.setIsLoggedIn(true); return lp; }
From source file:com.maxl.java.aips2xml.Aips2Xml.java
static String convertHtmlToXml(String med_title, String html_str, String regnr_str) { Document mDoc = Jsoup.parse(html_str); mDoc.outputSettings().escapeMode(EscapeMode.xhtml); mDoc.outputSettings().prettyPrint(true); mDoc.outputSettings().indentAmount(4); // <div id="monographie"> -> <fi> mDoc.select("div[id=monographie]").tagName("fi").removeAttr("id"); // <div class="MonTitle"> -> <title> mDoc.select("div[class=MonTitle]").tagName("title").removeAttr("class").removeAttr("id"); // Beautify the title to the best of my possibilities ... still not good enough! String title_str = mDoc.select("title").text().trim().replaceAll("<br />", "").replaceAll("(\\t|\\r?\\n)+", "");/*w w w . j a v a 2 s . c o m*/ if (!title_str.equals(med_title)) if (SHOW_ERRORS) System.err.println(med_title + " differs from " + title_str); // Fallback solution: use title from the header AIPS.xml file - the titles look all pretty good! mDoc.select("title").first().text(med_title); // <div class="ownerCompany"> -> <owner> Element owner_elem = mDoc.select("div[class=ownerCompany]").first(); if (owner_elem != null) { owner_elem.tagName("owner").removeAttr("class"); String owner_str = mDoc.select("owner").text(); mDoc.select("owner").first().text(owner_str); } else { mDoc.select("title").after("<owner></owner>"); if (DB_LANGUAGE.equals("de")) mDoc.select("owner").first().text("k.A."); else if (DB_LANGUAGE.equals("fr")) mDoc.select("owner").first().text("n.s."); } // <div class="paragraph"> -> <paragraph> mDoc.select("div[class=paragraph]").tagName("paragraph").removeAttr("class").removeAttr("id"); // <div class="absTitle"> -> <paragraphTitle> mDoc.select("div[class=absTitle]").tagName("paragraphtitle").removeAttr("class"); // <div class="untertitle1"> -> <paragraphSubTitle> mDoc.select("div[class=untertitle1]").tagName("paragraphsubtitle").removeAttr("class"); // <div class="untertitle"> -> <paragraphSubTitle> mDoc.select("div[class=untertitle]").tagName("paragraphsubtitle").removeAttr("class"); // 
<div class="shortCharacteristic"> -> <characteristic> mDoc.select("div[class=shortCharacteristic]").tagName("characteristic").removeAttr("class"); // <div class="image"> mDoc.select("div[class=image]").tagName("image").removeAttr("class"); // <p class="spacing1"> -> <p> / <p class="noSpacing"> -> <p> mDoc.select("p[class]").tagName("p").removeAttr("class"); // <span style="font-style:italic"> -> <i> mDoc.select("span").tagName("i").removeAttr("style"); // <i class="indention1"> -> <i> / <i class="indention2"> -> <b-i> mDoc.select("i[class=indention1]").tagName("i").removeAttr("class"); mDoc.select("i[class=indention2]").tagName("i").removeAttr("class"); // mDoc.select("p").select("i").tagName("i"); // mDoc.select("paragraphtitle").select("i").tagName("para-i"); // mDoc.select("paragraphsubtitle").select("i").tagName("parasub-i"); Elements elems = mDoc.select("paragraphtitle"); for (Element e : elems) { if (!e.text().isEmpty()) e.text(e.text()); } elems = mDoc.select("paragraphsubtitle"); for (Element e : elems) { if (!e.text().isEmpty()) e.text(e.text()); } // Here we take care of tables // <table class="s21"> -> <table> mDoc.select("table[class]").removeAttr("class"); mDoc.select("table").removeAttr("cellspacing").removeAttr("cellpadding").removeAttr("border"); mDoc.select("colgroup").remove(); mDoc.select("td").removeAttr("class").removeAttr("colspan").removeAttr("rowspan"); mDoc.select("tr").removeAttr("class"); elems = mDoc.select("div[class]"); for (Element e : elems) { if (e.text().isEmpty()) e.remove(); } mDoc.select("tbody").unwrap(); // Remove nested table (a nasty table-in-a-table Elements nested_table = mDoc.select("table").select("tr").select("td").select("table"); if (!nested_table.isEmpty()) { nested_table.select("table").unwrap(); } // Here we take care of the images mDoc.select("img").removeAttr("style").removeAttr("align").removeAttr("border"); // Subs and sups mDoc.select("sub[class]").tagName("sub").removeAttr("class"); 
mDoc.select("sup[class]").tagName("sup").removeAttr("class"); mDoc.select("td").select("sub").tagName("td-sub"); mDoc.select("td").select("sup").tagName("td-sup"); // Remove floating <td-sup> tags mDoc.select("p").select("td-sup").tagName("sup"); mDoc.select("p").select("td-sub").tagName("sub"); // Box mDoc.select("div[class=box]").tagName("box").removeAttr("class"); // Insert swissmedicno5 after <owner> tag mDoc.select("owner").after("<swissmedicno5></swissmedicno5"); mDoc.select("swissmedicno5").first().text(regnr_str); // Remove html, head and body tags String xml_str = mDoc.select("body").first().html(); //xml_str = xml_str.replaceAll("<tbody>", "").replaceAll("</tbody>", ""); xml_str = xml_str.replaceAll("<sup> </sup>", ""); xml_str = xml_str.replaceAll("<sub> </sub>", ""); xml_str = xml_str.replaceAll("<p> <i>", "<p><i>"); xml_str = xml_str.replaceAll("</p> </td>", "</p></td>"); xml_str = xml_str.replaceAll("<p> </p>", "<p></p>"); // MUST be improved, the space is not a real space!! xml_str = xml_str.replaceAll("", "- "); xml_str = xml_str.replaceAll("<br />", ""); xml_str = xml_str.replaceAll("(?m)^[ \t]*\r?\n", ""); // Remove multiple instances of <p></p> Scanner scanner = new Scanner(xml_str); String new_xml_str = ""; int counter = 0; while (scanner.hasNextLine()) { String line = scanner.nextLine(); if (line.trim().equals("<p></p>")) { counter++; } else counter = 0; if (counter < 3) new_xml_str += line; } scanner.close(); return new_xml_str; }
From source file:me.rkfg.xmpp.bot.plugins.CoolStoryPlugin.java
private String fetchStory(Website website) throws IOException { int roll = 0; String result;//from www.ja va2s . c om int resultLength; int resultLines; //noinspection ConstantConditions do { roll++; final Document doc = Jsoup.connect(website.getUrlString()).userAgent(DEFAULT_UA).get(); doc.outputSettings(new Document.OutputSettings().prettyPrint(false)); logger.info("Fetched a story from {}", doc.location()); final Element story = doc.select(website.getCssQuery()).first(); if (story == null) { return ERROR_COULD_NOT_PARSE; } story.select("div").remove(); story.select("img").forEach(img -> img.replaceWith(new TextNode(img.attr("src"), ""))); story.select("br").after("\\n"); story.select("p").before("\\n\\n"); final String storyHtml = story.html().replaceAll("\\\\n", "\n"); result = Jsoup.clean(storyHtml, "", Whitelist.none(), new Document.OutputSettings().prettyPrint(false)) .trim(); resultLength = result.length(); resultLines = countLines(result); } while (CONFIG_REROLL_LONG_STORIES && (resultLength > CONFIG_MAX_STORY_LENGTH || resultLines > CONFIG_MAX_STORY_LINES) && roll <= CONFIG_MAX_ROLLS); return result; }
From source file:com.aurel.track.exchange.docx.exporter.PreprocessImage.java
/** * Removes the HTML5 figure tag and saves the figcaption in the <img> tag's "alt" attribute for later use * @param htmlContent//w w w .j ava2 s . com * @return */ private Document removeFigureSaveFigcaption(String htmlContent) { Document doc = Jsoup.parseBodyFragment(htmlContent); //figure is a HTML5 tag not accepted by Tidy, so it should be replaced by the content <img>-tag, and the figcaption is saved in the "alt" attribute Elements figureElements = doc.select("figure"); Element figcaptionNode = null; if (figureElements != null) { for (Iterator<Element> iterator = figureElements.iterator(); iterator.hasNext();) { Element figureElement = iterator.next(); Elements figureChildren = figureElement.getAllElements(); Node imageNode = null; if (figureChildren != null) { for (Element figureChild : figureChildren) { if ("img".equals(figureChild.nodeName())) { imageNode = figureChild; } else { if ("figcaption".equals(figureChild.nodeName())) { figcaptionNode = figureChild; //set "figcaption" text as value for "alt" attribute if (imageNode != null) { imageNode.attr("alt", figcaptionNode.text()); } } } } } if (imageNode != null) { figureElement.replaceWith(imageNode); } } } return doc; }
From source file:com.github.binlee1990.spider.video.spider.PersonCrawler.java
@Override public void visit(Page page) { int docid = page.getWebURL().getDocid(); String url = page.getWebURL().getURL(); if (!url.startsWith("http://www.javlibrary.com/cn/?v=jav")) { return;/*from ww w . j ava2 s . c o m*/ } if (page.getParseData() instanceof HtmlParseData) { HtmlParseData htmlParseData = (HtmlParseData) page.getParseData(); String html = htmlParseData.getHtml(); Document doc = Jsoup.parse(html); String videoIdentificationCode = doc.select("div#video_id td.text").first().text().toString(); if (StringUtils.isNotBlank(videoIdentificationCode)) { Video queryVideo = new Video(); queryVideo.setIdentificationCode(videoIdentificationCode); queryVideo.setUrl(url); Video video = videoMapper.queryByVideo(queryVideo); if (null == video) { video = generateVideo(url, doc, videoIdentificationCode); try { videoMapper.insertSelective(video); } catch (Exception e) { } logger.warn("==================handle " + video.getIdentificationCode() + "\n" + JSON.toJSONString(video)); } else { Date now = new Date(); //logger.warn(url + " " + video.getIdentificationCode() + ""); while (true) { Video v = videoMapper.queryByVideo(queryVideo); int number = v.getOccurNumber(); int updateNumber = number + 1; int c = videoMapper.updateOccurNumberById(v.getId(), updateNumber, number, now); if (c == 1) { break; } } } int videoId = videoMapper.queryByVideo(video).getId(); try { createVideoActress(doc, videoId); } catch (Exception e) { } try { createVideoCategory(doc, videoId); } catch (Exception e) { } } } }