List of usage examples for org.jsoup.nodes.Element.html()
public String html()
From source file:com.kantenkugel.discordbot.jdocparser.JDocParser.java
private static List<DocBlock> getDocBlock(String jdocBase, Element elem, ClassDocumentation reference) { if (elem != null) { String baseLink = JDocUtil.getLink(jdocBase, reference); List<DocBlock> blocks = new ArrayList<>(10); String hashLink = null;// w ww .ja va 2 s.co m for (elem = elem.nextElementSibling(); elem != null; elem = elem.nextElementSibling()) { if (elem.tagName().equals("a")) { hashLink = '#' + elem.attr("name"); } else if (elem.tagName().equals("ul")) { Element tmp = elem.getElementsByTag("h4").first(); String title = JDocUtil.fixSpaces(tmp.text().trim()); String description = "", signature = ""; OrderedMap<String, List<String>> fields = new ListOrderedMap<>(); for (; tmp != null; tmp = tmp.nextElementSibling()) { if (tmp.tagName().equals("pre")) { //contains full signature signature = JDocUtil.fixSpaces(tmp.text().trim()); } else if (tmp.tagName().equals("div") && tmp.className().equals("block")) { //main block of content (description or deprecation) Element deprecationElem = tmp.getElementsByClass("deprecationComment").first(); if (deprecationElem != null) { //deprecation block fields.put("Deprecated:", Collections .singletonList(JDocUtil.formatText(deprecationElem.html(), baseLink))); } else { //description block description = JDocUtil.formatText(tmp.html(), baseLink); } } else if (tmp.tagName().equals("dl")) { //a field String fieldName = null; List<String> fieldValues = new ArrayList<>(); for (Element element : tmp.children()) { if (element.tagName().equals("dt")) { if (fieldName != null) { fields.put(fieldName, fieldValues); fieldValues = new ArrayList<>(); } fieldName = JDocUtil.fixSpaces(element.text().trim()); } else if (element.tagName().equals("dd")) { fieldValues.add(JDocUtil.formatText(element.html(), baseLink)); } } if (fieldName != null) { fields.put(fieldName, fieldValues); } } } blocks.add(new DocBlock(title, hashLink, signature, description, fields)); } } return blocks; } return null; }
From source file:io.seldon.importer.articles.ItemAttributesImporter.java
public static Map<String, String> getAttributes(String url, String existingCategory) { ItemProcessResult itemProcessResult = new ItemProcessResult(); itemProcessResult.client_item_id = url; itemProcessResult.extraction_status = "EXTRACTION_FAILED"; logger.info("Trying to get attributes for " + url); Map<String, String> attributes = null; String title = ""; String category = ""; String subCategory = ""; String img_url = ""; String description = ""; String tags = ""; String leadtext = ""; String link = ""; String publishDate = ""; String domain = ""; try {//www .j a va 2s.c o m long now = System.currentTimeMillis(); long timeSinceLastRequest = now - lastUrlFetchTime; if (timeSinceLastRequest < minFetchGapMsecs) { long timeToSleep = minFetchGapMsecs - timeSinceLastRequest; logger.info( "Sleeping " + timeToSleep + "msecs as time since last fetch is " + timeSinceLastRequest); Thread.sleep(timeToSleep); } Document articleDoc = Jsoup.connect(url).userAgent("SeldonBot/1.0").timeout(httpGetTimeout).get(); lastUrlFetchTime = System.currentTimeMillis(); //get IMAGE URL if (StringUtils.isNotBlank(imageCssSelector)) { Element imageElement = articleDoc.select(imageCssSelector).first(); if (imageElement != null && imageElement.attr("content") != null) { img_url = imageElement.attr("content"); } if (imageElement != null && StringUtils.isBlank(img_url)) { img_url = imageElement.attr("src"); } if (imageElement != null && StringUtils.isBlank(img_url)) { img_url = imageElement.attr("href"); } } if (StringUtils.isBlank(img_url) && StringUtils.isNotBlank(defImageUrl)) { logger.info("Setting image to default: " + defImageUrl); img_url = defImageUrl; } img_url = StringUtils.strip(img_url); //get TITLE if (StringUtils.isNotBlank(titleCssSelector)) { Element titleElement = articleDoc.select(titleCssSelector).first(); if ((titleElement != null) && (titleElement.attr("content") != null)) { title = titleElement.attr("content"); } // if still blank get from text instead if 
(StringUtils.isBlank(title) && (titleElement != null)) { title = titleElement.text(); } } //get LEAD TEXT if (StringUtils.isNotBlank(leadTextCssSelector)) { Element leadElement = articleDoc.select(leadTextCssSelector).first(); if (leadElement != null && leadElement.attr("content") != null) { leadtext = leadElement.attr("content"); } } //get publish date if (StringUtils.isNotBlank(publishDateCssSelector)) { //2013-01-21T10:40:55Z Element pubElement = articleDoc.select(publishDateCssSelector).first(); if (pubElement != null && pubElement.attr("content") != null) { String pubtext = pubElement.attr("content"); SimpleDateFormat dateFormatter = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); DateFormat df = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss", Locale.ENGLISH); Date result = null; try { result = df.parse(pubtext); } catch (ParseException e) { logger.info("Failed to parse date withUTC format " + pubtext); } //try a simpler format df = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.ENGLISH); try { result = df.parse(pubtext); } catch (ParseException e) { logger.info("Failed to parse date " + pubtext); } if (result != null) publishDate = dateFormatter.format(result); else logger.error("Failed to parse date " + pubtext); } } //get Link if (StringUtils.isNotBlank(linkCssSelector)) { Element linkElement = articleDoc.select(linkCssSelector).first(); if (linkElement != null && linkElement.attr("content") != null) { link = linkElement.attr("content"); } } //get CONTENT if (StringUtils.isNotBlank(textCssSelector)) { Element descriptionElement = articleDoc.select(textCssSelector).first(); if (descriptionElement != null) description = Jsoup.parse(descriptionElement.html()).text(); } //get TAGS Set<String> tagSet = AttributesImporterUtils.getTags(articleDoc, tagsCssSelector, title); if (tagSet.size() > 0) tags = CollectionTools.join(tagSet, ","); //get CATEGORY - client specific if (StringUtils.isNotBlank(categoryCssSelector)) { Element categoryElement = 
articleDoc.select(categoryCssSelector).first(); if (categoryElement != null && categoryElement.attr("content") != null) { category = categoryElement.attr("content"); if (StringUtils.isNotBlank(category)) category = category.toUpperCase(); } } else if (StringUtils.isNotBlank(categoryClassPrefix)) { String className = "io.seldon.importer.articles.category." + categoryClassPrefix + "CategoryExtractor"; Class<?> clazz = Class.forName(className); Constructor<?> ctor = clazz.getConstructor(); CategoryExtractor extractor = (CategoryExtractor) ctor.newInstance(); category = extractor.getCategory(url, articleDoc); } //get Sub CATEGORY - client specific if (StringUtils.isNotBlank(subCategoryCssSelector)) { Element subCategoryElement = articleDoc.select(subCategoryCssSelector).first(); if (subCategoryElement != null && subCategoryElement.attr("content") != null) { subCategory = subCategoryElement.attr("content"); if (StringUtils.isNotBlank(subCategory)) subCategory = category.toUpperCase(); } } else if (StringUtils.isNotBlank(subCategoryClassPrefix)) { String className = "io.seldon.importer.articles.category." 
+ subCategoryClassPrefix + "SubCategoryExtractor"; Class<?> clazz = Class.forName(className); Constructor<?> ctor = clazz.getConstructor(); CategoryExtractor extractor = (CategoryExtractor) ctor.newInstance(); subCategory = extractor.getCategory(url, articleDoc); } // Get domain if (domainIsNeeded) { domain = getDomain(url); } if ((StringUtils.isNotBlank(title) && (imageNotNeeded || StringUtils.isNotBlank(img_url)) && (categoryNotNeeded || StringUtils.isNotBlank(category)) && (!domainIsNeeded || StringUtils.isNotBlank(domain)))) { attributes = new HashMap<String, String>(); attributes.put(TITLE, title); if (StringUtils.isNotBlank(category)) attributes.put(CATEGORY, category); if (StringUtils.isNotBlank(subCategory)) attributes.put(SUBCATEGORY, subCategory); if (StringUtils.isNotBlank(link)) attributes.put(LINK, link); if (StringUtils.isNotBlank(leadtext)) attributes.put(LEAD_TEXT, leadtext); if (StringUtils.isNotBlank(img_url)) attributes.put(IMG_URL, img_url); if (StringUtils.isNotBlank(tags)) attributes.put(TAGS, tags); attributes.put(CONTENT_TYPE, VERIFIED_CONTENT_TYPE); if (StringUtils.isNotBlank(description)) attributes.put(DESCRIPTION, description); if (StringUtils.isNotBlank(publishDate)) attributes.put(PUBLISH_DATE, publishDate); if (StringUtils.isNotBlank(domain)) attributes.put(DOMAIN, domain); System.out.println("Item: " + url + "; Category: " + category + " SubCategory: " + subCategory); itemProcessResult.extraction_status = "EXTRACTION_SUCCEEDED"; } else { logger.warn("Failed to get needed attributes for article " + url); logger.warn("[title=" + title + ", img_url=" + img_url + ", category=" + category + ", domain=" + domain + "]"); } { // check for failures for the log result if (StringUtils.isBlank(title)) { itemProcessResult.attrib_failure_list = itemProcessResult.attrib_failure_list + ((StringUtils.isBlank(itemProcessResult.attrib_failure_list)) ? 
"" : ",") + "title"; } if (!imageNotNeeded && StringUtils.isBlank(img_url)) { itemProcessResult.attrib_failure_list = itemProcessResult.attrib_failure_list + ((StringUtils.isBlank(itemProcessResult.attrib_failure_list)) ? "" : ",") + "img_url"; } if (!categoryNotNeeded && StringUtils.isBlank(category)) { itemProcessResult.attrib_failure_list = itemProcessResult.attrib_failure_list + ((StringUtils.isBlank(itemProcessResult.attrib_failure_list)) ? "" : ",") + "category"; } } } catch (Exception e) { logger.error("Article: " + url + ". Attributes import FAILED", e); itemProcessResult.error = e.toString(); } AttributesImporterUtils.logResult(logger, itemProcessResult); return attributes; }
From source file:org.brnvrn.Main.java
/** * Parse a tr HTML element describing the tool * @param tool is to be updated/* w w w . j a v a 2s . c o m*/ * @param tr brings the data * @return true if successful */ private static boolean parseTrTool(Tool tool, Element tr) { boolean success = true; Element nameLink = tr.select("td:eq(0)").first(); if (nameLink == null) return false; tool.setName(nameLink.text()); tool.setUrl(nameLink.getElementsByTag("a").attr("href")); tool.setLicense(tr.select("td:eq(2)").first().text()); tool.setCompatibility(tr.select("td:eq(3)").first().text()); // More complicated: We will extract and remove known nodes, the rest will be description Element tdDescription = tr.select("td:eq(1)").first(); Elements smalls = tdDescription.getElementsByTag("small"); for (Element small : smalls) { Element author = small.getElementsContainingText("Author").first(); if (author != null) { String authorsString = author.text(); authorsString = authorsString.substring(authorsString.indexOf(":") + 1); tool.addAuthor(authorsString.split(",")); small.remove(); } Element sourceCode = small.getElementsContainingText("ource").last(); if (sourceCode != null) { tool.setUrl_src(sourceCode.attr("href")); small.remove(); } } tdDescription.getElementsByTag("br").remove(); tool.setDescription(Jsoup.clean(tdDescription.html(), Whitelist.relaxed())); // ownText will miss the contained links in the description tool.setDescriptionText(tdDescription.text()); bestEffortThemeLanguage(tool); return success; }
From source file:com.astamuse.asta4d.render.RenderUtil.java
public final static void applyMessages(Element target) { Context context = Context.getCurrentThreadContext(); List<Element> msgElems = target.select(ExtNodeConstants.MSG_NODE_TAG_SELECTOR); for (final Element msgElem : msgElems) { Attributes attributes = msgElem.attributes(); String key = attributes.get(ExtNodeConstants.MSG_NODE_ATTR_KEY); // List<String> externalizeParamKeys = getExternalizeParamKeys(attributes); Object defaultMsg = new Object() { @Override//from w w w .j a v a 2 s .co m public String toString() { return ExtNodeConstants.MSG_NODE_ATTRVALUE_HTML_PREFIX + msgElem.html(); } }; Locale locale = LocalizeUtil.getLocale(attributes.get(ExtNodeConstants.MSG_NODE_ATTR_LOCALE)); String currentTemplatePath = attributes.get(ExtNodeConstants.ATTR_TEMPLATE_PATH); if (StringUtils.isEmpty(currentTemplatePath)) { logger.warn("There is a msg tag which does not hold corresponding template file path:{}", msgElem.outerHtml()); } else { context.setData(TRACE_VAR_TEMPLATE_PATH, currentTemplatePath); } final Map<String, Object> paramMap = getMessageParams(attributes, locale, key); String text; switch (I18nMessageHelperTypeAssistant.configuredHelperType()) { case Mapped: text = I18nMessageHelperTypeAssistant.getConfiguredMappedHelper().getMessageWithDefault(locale, key, defaultMsg, paramMap); break; case Ordered: default: // convert map to array List<Object> numberedParamNameList = new ArrayList<>(); for (int index = 0; paramMap .containsKey(ExtNodeConstants.MSG_NODE_ATTR_PARAM_PREFIX + index); index++) { numberedParamNameList.add(paramMap.get(ExtNodeConstants.MSG_NODE_ATTR_PARAM_PREFIX + index)); } text = I18nMessageHelperTypeAssistant.getConfiguredOrderedHelper().getMessageWithDefault(locale, key, defaultMsg, numberedParamNameList.toArray()); } Node node; if (text.startsWith(ExtNodeConstants.MSG_NODE_ATTRVALUE_TEXT_PREFIX)) { node = ElementUtil.text(text.substring(ExtNodeConstants.MSG_NODE_ATTRVALUE_TEXT_PREFIX.length())); } else if 
(text.startsWith(ExtNodeConstants.MSG_NODE_ATTRVALUE_HTML_PREFIX)) { node = ElementUtil .parseAsSingle(text.substring(ExtNodeConstants.MSG_NODE_ATTRVALUE_HTML_PREFIX.length())); } else { node = ElementUtil.text(text); } msgElem.replaceWith(node); context.setData(TRACE_VAR_TEMPLATE_PATH, null); } }
From source file:org.brunocvcunha.taskerbox.impl.custom.hardmob.HardmobEmailAction.java
@Override public void spreadAction(final String url, String postTitle) { EmailAction email = getEmailAction(); StringBuffer sb = new StringBuffer(); sb.append(url);// www . ja v a 2 s . c om EmailValueVO emailVO = new EmailValueVO(); emailVO.setTitle("Hardmob - " + postTitle); try { Document doc = TaskerboxHttpBox.getInstance().getDocumentForURL(url); for (Element post : doc.select(".postcontent")) { sb.append("<br>"); sb.append(post.html()); } } catch (ClientProtocolException e) { e.printStackTrace(); } catch (IllegalStateException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } catch (URISyntaxException e) { e.printStackTrace(); } emailVO.setBody(sb.toString()); email.action(emailVO); }
From source file:org.brunocvcunha.taskerbox.impl.custom.slickdeals.SlickDealsEmailAction.java
@Override public void spreadAction(final String url, String postTitle) { EmailAction email = getEmailAction(); EmailValueVO emailVO = new EmailValueVO(); StringBuffer sb = new StringBuffer(); sb.append(url);//from ww w .j a v a2s .co m emailVO.setTitle("SlickDeals - " + postTitle); try { Document doc = TaskerboxHttpBox.getInstance().getDocumentForURL(url); for (Element post : doc.select(".post_message")) { sb.append("<br>"); sb.append(post.html()); } } catch (ClientProtocolException e) { e.printStackTrace(); } catch (IllegalStateException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } catch (URISyntaxException e) { e.printStackTrace(); } emailVO.setBody(sb.toString()); email.action(emailVO); }
From source file:com.msds.km.service.Impl.YunmaiAPIDrivingLicenseRecognitionServcieiImpl.java
/** * html???/* www. j a v a 2 s . co m*/ * @param html ??xml?java * @return */ protected DrivingLicense parseDrivingLicense(String html) { if (html.isEmpty()) { throw new RecognitionException("the html content is empty"); } Document document = Jsoup.parse(html); if (document == null) { throw new RecognitionException( "the document prased from html content is null, please check the website"); } Elements fieldsets = document.select("div[class=left result] fieldset"); if (fieldsets.size() != 1) { throw new RecognitionException( "the document should has result filedset, the content of the web page may be changed."); } Element regResult = fieldsets.first(); String result = regResult.html().trim(); // String removedStr = "<legend></legend>"; if (result.startsWith(removedStr)) { result = result.substring(removedStr.length()); } //??xml result = StringEscapeUtils.unescapeXml(result); // result = "<drivingLicense>" + result + "</drivingLicense>"; return (DrivingLicense) stream.fromXML(result); }
From source file:com.msds.km.service.Impl.DrivingLicenseRecognitionServcieiImpl.java
/** * html???// w ww . j a va 2 s .c o m * * @param html * ??xml?java * @return * @throws Exception */ private DrivingLicense parseDrivingLicense(String html) throws Exception { if (html.isEmpty()) { logger.info(""); return null; } Document document = Jsoup.parse(html); if (document == null) { logger.info("html"); return null; } Elements fieldsets = document.select("div[class=left result] fieldset"); if (fieldsets.size() != 1) { logger.info("?"); return null; } Element regResult = fieldsets.first(); String result = regResult.html().trim(); // String removedStr = "<legend></legend>"; if (result.startsWith(removedStr)) { result = result.substring(removedStr.length()); } // ??xml result = StringEscapeUtils.unescapeXml(result); // result = "<drivingLicense>" + result + "</drivingLicense>"; return XMLUtils.toObject(result, DrivingLicense.class); }
From source file:com.mycompany.grabberrasskazov.threads.ThreadForPageSave.java
public void indexStory(String pageUrl) { try {// w ww .j a v a 2s. c o m String oldId = pageUrl.replace(GlobalVars.mainSite, ""); if (!mainBean.storyExists(oldId)) { Stories r = new Stories(); Document doc = Jsoup.connect(pageUrl) .userAgent("Opera/9.80 (X11; Linux x86_64) " + "Presto/2.12.388 Version/12.16").get(); Elements nameBlockElements = doc.select("b:containsOwn(?)"); Element nameBlock = nameBlockElements.get(0); nameBlock = nameBlock.parent().parent(); nameBlockElements = nameBlock.select("td:eq(1)"); nameBlock = nameBlockElements.get(0); String storyName = nameBlock.text(); r.setStoryName(storyName); // Start of processing writer Elements writerBlockElements = doc.select("b:containsOwn(?:)"); Element writerBlock = writerBlockElements.get(0); writerBlock = writerBlock.parent().parent(); writerBlockElements = writerBlock.select("td:eq(1)"); writerBlock = writerBlockElements.get(0); String writersUrl = writerBlock.select("a:eq(0)").attr("href"); String writersName = writerBlock.select("a:eq(0)").text(); String writersContacts = writerBlock.select("a:eq(1)").attr("href"); StoryWriters storyWriter = new StoryWriters(); storyWriter.setOldId(writersUrl); storyWriter.setWriterEmail(writersContacts); storyWriter.setWriterName(writersName); storyWriter = mainBean.saveWriter(storyWriter); Set<StoriesToWritersRelations> storiesToWritersRelationses = new HashSet<StoriesToWritersRelations>(); StoriesToWritersRelations storiesToWritersRelations = new StoriesToWritersRelations(); storiesToWritersRelations.setStories(r); storiesToWritersRelations.setStoryWriters(storyWriter); r.setStoriesToWritersRelationses(storiesToWritersRelationses); // End of processing writer Set<StoriesToCategoriessRelations> catsRelationses = new HashSet<>(); Elements katsInfo = doc.select("a[href*=ras.shtml?kat]"); for (Element kat : katsInfo) { String katId = kat.attr("href"); StoryCategories cat = mainBean.getCat(katId); StoriesToCategoriessRelations catsRelations = new 
StoriesToCategoriessRelations(); catsRelations.setStoryCategories(cat); catsRelations.setStories(r); catsRelationses.add(catsRelations); } r.setStoriesToCategoriessRelationses(catsRelationses); Elements textBlocks = doc.select("p[align=justify]"); Element textBlock = textBlocks.get(0); String textStr = textBlock.html(); r.setStoryText(textStr.replace("\"", "'")); r.setOldId(oldId); mainBean.saveStory(r); } } catch (IOException ex) { ex.printStackTrace(); } }
From source file:virgil.meanback.HistoryInfo.java
/** * * @param url/*from ww w . j a va2 s.co m*/ * @return * @throws Exception */ @SuppressWarnings("") public Stock parse(String url) throws Exception { java.util.logging.Logger.getLogger("com.gargoylesoftware.htmlunit").setLevel(Level.OFF); Stock stock = new Stock(); List<DayInfo> list = new ArrayList<>(); /** * HtmlUnitweb? */ WebClient wc = new WebClient(BrowserVersion.CHROME); wc.getOptions().setUseInsecureSSL(true); wc.getOptions().setJavaScriptEnabled(true); // ?JStrue wc.getOptions().setCssEnabled(false); // ?css? wc.getOptions().setThrowExceptionOnScriptError(false); // js?? wc.getOptions().setTimeout(50000); // 10S0? wc.getOptions().setDoNotTrackEnabled(false); HtmlPage page = wc.getPage(url); HtmlElement documentElement = page.getDocumentElement(); Document doc = Jsoup.parse(documentElement.asXml()); String name = doc.select("#BIZ_IS_Name").text(); String code = doc.select(".BIZ_IS_price_id span").text(); code = code.substring(code.indexOf("(") + 2, code.length() - 1); Elements els = doc.select("#BIZ_hq_historySearch tbody tr"); stock.setCode(code); stock.setName(name); int count = 0; for (Element el : els) { if (!el.html().contains("sum")) { DayInfo dayInfo = new DayInfo(); String dateString = el.select("td.e1").text(); SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd"); Date date = format.parse(dateString); String open = el.select("td").eq(1).text(); String close = el.select("td").eq(2).text(); double cd = Double.parseDouble(close); String low = el.select("td").eq(5).text(); String high = el.select("td").eq(6).text(); String volume = el.select("td").eq(7).text(); dayInfo.setClose(close); dayInfo.setDateString(dateString); dayInfo.setHigh(high); dayInfo.setLow(low); dayInfo.setOpen(open); dayInfo.setVolume(volume); dayInfo.setDate(date); list.add(dayInfo); count++; if (list.size() > 79) { break; } } } stock.setList(list); return stock; }