List of usage examples for org.jsoup.nodes.Document.select
public Elements select(String cssQuery)
From source file:de.geeksfactory.opacclient.apis.Zones.java
/**
 * Looks up the URL of the next result page.
 *
 * @param doc the parsed page to search
 * @return the absolute {@code href} of the first element matching
 *         {@code .pageNavLink[title*=nchsten]}, or {@code null} when nothing matches
 */
static String findNextPageUrl(Document doc) {
    // first() yields null on an empty selection, which doubles as the "no next page" signal.
    Element nextLink = doc.select(".pageNavLink[title*=nchsten]").first();
    if (nextLink == null) {
        return null;
    }
    return nextLink.absUrl("href");
}
From source file:io.seldon.importer.articles.FileItemAttributesImporter.java
public static Map<String, String> getAttributes(String url, String existingCategory) { ItemProcessResult itemProcessResult = new ItemProcessResult(); itemProcessResult.client_item_id = url; itemProcessResult.extraction_status = "EXTRACTION_FAILED"; logger.info("Trying to get attributes for " + url); Map<String, String> attributes = null; String title = ""; String category = ""; String subCategory = ""; String img_url = ""; String description = ""; String tags = ""; String leadtext = ""; String link = ""; String publishDate = ""; String domain = ""; try {/*w w w .j a v a 2 s. c om*/ long now = System.currentTimeMillis(); long timeSinceLastRequest = now - lastUrlFetchTime; if (timeSinceLastRequest < minFetchGapMsecs) { long timeToSleep = minFetchGapMsecs - timeSinceLastRequest; logger.info( "Sleeping " + timeToSleep + "msecs as time since last fetch is " + timeSinceLastRequest); Thread.sleep(timeToSleep); } Document articleDoc = Jsoup.connect(url).userAgent("SeldonBot/1.0").timeout(httpGetTimeout).get(); lastUrlFetchTime = System.currentTimeMillis(); //get IMAGE URL if (StringUtils.isNotBlank(imageCssSelector)) { Element imageElement = articleDoc.select(imageCssSelector).first(); if (imageElement != null) { if (imageElement.attr("content") != null) { img_url = imageElement.attr("content"); } if (StringUtils.isBlank(img_url) && imageElement.attr("src") != null) { img_url = imageElement.attr("src"); } if (StringUtils.isBlank(img_url) && imageElement.attr("href") != null) { img_url = imageElement.attr("href"); } } } if (StringUtils.isBlank(img_url) && StringUtils.isNotBlank(defImageUrl)) { logger.info("Setting image to default: " + defImageUrl); img_url = defImageUrl; } img_url = StringUtils.strip(img_url); //get TITLE if (StringUtils.isNotBlank(titleCssSelector)) { Element titleElement = articleDoc.select(titleCssSelector).first(); if (titleElement != null && titleElement.attr("content") != null) { title = titleElement.attr("content"); } } //get Lead Text if 
(StringUtils.isNotBlank(leadTextCssSelector)) { Element leadElement = articleDoc.select(leadTextCssSelector).first(); if (leadElement != null && leadElement.attr("content") != null) { leadtext = leadElement.attr("content"); } } //get publish date if (StringUtils.isNotBlank(publishDateCssSelector)) { //2013-01-21T10:40:55Z Element pubElement = articleDoc.select(publishDateCssSelector).first(); if (pubElement != null && pubElement.attr("content") != null) { String pubtext = pubElement.attr("content"); SimpleDateFormat dateFormatter = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); DateFormat df = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss", Locale.ENGLISH); Date result = null; try { result = df.parse(pubtext); } catch (ParseException e) { logger.info("Failed to parse date withUTC format " + pubtext); } //try a simpler format df = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.ENGLISH); try { result = df.parse(pubtext); } catch (ParseException e) { logger.info("Failed to parse date " + pubtext); } if (result != null) publishDate = dateFormatter.format(result); else logger.error("Failed to parse date " + pubtext); } } //get Link if (StringUtils.isNotBlank(linkCssSelector)) { Element linkElement = articleDoc.select(linkCssSelector).first(); if (linkElement != null && linkElement.attr("content") != null) { link = linkElement.attr("content"); } } //get CONTENT if (StringUtils.isNotBlank(textCssSelector)) { Element descriptionElement = articleDoc.select(textCssSelector).first(); if (descriptionElement != null) description = Jsoup.parse(descriptionElement.html()).text(); } //get TAGS Set<String> tagSet = AttributesImporterUtils.getTags(articleDoc, tagsCssSelector, title); if (tagSet.size() > 0) tags = CollectionTools.join(tagSet, ","); //get CATEGORY - client specific if (StringUtils.isNotBlank(categoryCssSelector)) { Element categoryElement = articleDoc.select(categoryCssSelector).first(); if (categoryElement != null && categoryElement.attr("content") != null) { category 
= categoryElement.attr("content"); if (StringUtils.isNotBlank(category)) category = category.toUpperCase(); } } else if (StringUtils.isNotBlank(categoryClassPrefix)) { String className = "io.seldon.importer.articles.category." + categoryClassPrefix + "CategoryExtractor"; Class<?> clazz = Class.forName(className); Constructor<?> ctor = clazz.getConstructor(); CategoryExtractor extractor = (CategoryExtractor) ctor.newInstance(); category = extractor.getCategory(url, articleDoc); } //get Sub CATEGORY - client specific if (StringUtils.isNotBlank(subCategoryCssSelector)) { Element subCategoryElement = articleDoc.select(subCategoryCssSelector).first(); if (subCategoryElement != null && subCategoryElement.attr("content") != null) { subCategory = subCategoryElement.attr("content"); if (StringUtils.isNotBlank(subCategory)) subCategory = category.toUpperCase(); } } else if (StringUtils.isNotBlank(subCategoryClassPrefix)) { String className = "io.seldon.importer.articles.category." + subCategoryClassPrefix + "SubCategoryExtractor"; Class<?> clazz = Class.forName(className); Constructor<?> ctor = clazz.getConstructor(); CategoryExtractor extractor = (CategoryExtractor) ctor.newInstance(); subCategory = extractor.getCategory(url, articleDoc); } // Get domain if (domainIsNeeded) { domain = getDomain(url); } if (StringUtils.isNotBlank(title) && (imageNotNeeded || StringUtils.isNotBlank(img_url)) && (categoryNotNeeded || StringUtils.isNotBlank(category)) && (!domainIsNeeded || StringUtils.isNotBlank(domain))) { attributes = new HashMap<String, String>(); attributes.put(TITLE, title); if (StringUtils.isNotBlank(category)) attributes.put(CATEGORY, category); if (StringUtils.isNotBlank(subCategory)) attributes.put(SUBCATEGORY, subCategory); if (StringUtils.isNotBlank(link)) attributes.put(LINK, link); if (StringUtils.isNotBlank(leadtext)) attributes.put(LEAD_TEXT, leadtext); if (StringUtils.isNotBlank(img_url)) attributes.put(IMG_URL, img_url); if (StringUtils.isNotBlank(tags)) 
attributes.put(TAGS, tags); attributes.put(CONTENT_TYPE, VERIFIED_CONTENT_TYPE); if (StringUtils.isNotBlank(description)) attributes.put(DESCRIPTION, description); if (StringUtils.isNotBlank(publishDate)) attributes.put(PUBLISH_DATE, publishDate); if (StringUtils.isNotBlank(domain)) attributes.put(DOMAIN, domain); System.out.println("Item: " + url + "; Category: " + category); itemProcessResult.extraction_status = "EXTRACTION_SUCCEEDED"; } else { logger.warn("Failed to get title for article " + url); logger.warn("[title=" + title + ", img_url=" + img_url + ", category=" + category + ", domain=" + domain + "]"); } { // check for failures for the log result if (StringUtils.isBlank(title)) { itemProcessResult.attrib_failure_list = itemProcessResult.attrib_failure_list + ((StringUtils.isBlank(itemProcessResult.attrib_failure_list)) ? "" : ",") + "title"; } if (!imageNotNeeded && StringUtils.isBlank(img_url)) { itemProcessResult.attrib_failure_list = itemProcessResult.attrib_failure_list + ((StringUtils.isBlank(itemProcessResult.attrib_failure_list)) ? "" : ",") + "img_url"; } if (!categoryNotNeeded && StringUtils.isBlank(category)) { itemProcessResult.attrib_failure_list = itemProcessResult.attrib_failure_list + ((StringUtils.isBlank(itemProcessResult.attrib_failure_list)) ? "" : ",") + "category"; } } } catch (Exception e) { logger.warn("Article: " + url + ". Attributes import FAILED", e); itemProcessResult.error = e.toString(); } AttributesImporterUtils.logResult(logger, itemProcessResult); return attributes; }
From source file:net.slkdev.swagger.confluence.service.impl.XHtmlToConfluenceServiceImpl.java
private static String reformatXHtml(final String inputXhtml, final Map<String, ConfluenceLink> confluenceLinkMap) { final Document document = Jsoup.parse(inputXhtml, "utf-8", Parser.xmlParser()); document.outputSettings().prettyPrint(false); document.outputSettings().escapeMode(xhtml); document.outputSettings().charset("UTF-8"); final Elements linkElements = document.select("a"); for (final Element linkElement : linkElements) { final String originalHref = linkElement.attr("href"); final ConfluenceLink confluenceLink = confluenceLinkMap.get(originalHref); if (confluenceLink == null) { LOG.debug("NO LINK MAPPING FOUND TO COVERT LINK: {}", originalHref); continue; }//from www. ja va2 s . c o m final String confluenceLinkMarkup = confluenceLink.getConfluenceLinkMarkup(); LOG.debug("LINK CONVERSION: {} -> {}", originalHref, confluenceLinkMarkup); linkElement.before(confluenceLinkMarkup); linkElement.html(""); linkElement.unwrap(); } reformatXHtmlHeadings(document, "h2"); reformatXHtmlHeadings(document, "h3"); reformatXHtmlHeadings(document, "#toctitle"); final SwaggerConfluenceConfig swaggerConfluenceConfig = SWAGGER_CONFLUENCE_CONFIG.get(); if (swaggerConfluenceConfig.getPaginationMode() == PaginationMode.SINGLE_PAGE) { if (swaggerConfluenceConfig.isIncludeTableOfContentsOnSinglePage()) { reformatXHtmlBreakAfterElements(document, "#toc"); } reformatXHtmlBreakAfterElements(document, ".sect1"); } reformatXHtmlSpacing(document.select(".sect2")); reformatXHtmlSpacing(document.select(".sect3")); return document.html(); }
From source file:de.luhmer.owncloudnewsreader.reader.GoogleReaderApi.GoogleReaderMethods.java
public static ArrayList<String[]> getTagList(String _USERNAME, String _PASSWORD) { Log.d(GoogleReaderConstants.APP_NAME, "METHOD: getTagList()"); ArrayList<String[]> _TAGTITLE_ARRAYLIST = new ArrayList<String[]>(); String _TAG_LABEL = null;/*from www . j av a2 s.c o m*/ try { _TAG_LABEL = "user/" + AuthenticationManager.getGoogleUserID(_USERNAME, _PASSWORD) + "/label/"; } catch (IOException e) { e.printStackTrace(); } Document doc = null; try { doc = Jsoup.connect(GoogleReaderConstants._TAG_LIST_URL) .header("Authorization", GoogleReaderConstants._AUTHPARAMS + AuthenticationManager.getGoogleAuthKey(_USERNAME, _PASSWORD)) .userAgent(GoogleReaderConstants.APP_NAME).timeout(6000).get(); } catch (IOException e) { e.printStackTrace(); } Elements links = doc.select("string"); for (Element link : links) { //String tagAttrib = link.attr("name"); String tagText = link.text(); if (Func_Strings.FindWordInString(tagText, _TAG_LABEL)) { _TAGTITLE_ARRAYLIST.add(new String[] { tagText.substring(32), tagText }); } } //String[] _TAGTITLE_ARRAY = new String[_TAGTITLE_ARRAYLIST.size()]; //_TAGTITLE_ARRAYLIST.toArray(_TAGTITLE_ARRAY); //return _TAGTITLE_ARRAY; return _TAGTITLE_ARRAYLIST; }
From source file:de.geeksfactory.opacclient.apis.IOpac.java
/**
 * Parses the reservations table that follows the {@code a[name=RES]} anchor and appends
 * one {@link ReservedItem} per data row to {@code media}.
 *
 * <p>The first table row is skipped. Columns 0 and 1 supply title and author. Column 4
 * is parsed as a {@code dd.MM.yyyy} date; when that fails its text is stored as the
 * status instead. The {@code href} of the last link in a row, if any, becomes the
 * cancel data.
 *
 * @param media list that receives the parsed reservations
 * @param doc   the account page; its base URI is set from {@code data}'s "baseurl" entry
 * @param data  configuration object providing "baseurl"
 */
static void parseResList(List<ReservedItem> media, Document doc, JSONObject data) {
    // The sibling selector only matches when the RES anchor exists, so a null result
    // covers both "no anchor" and "anchor without a matching table".
    Element table = doc.select("a[name=RES] ~ table:contains(Titel)").first();
    if (table == null) {
        return;
    }
    Elements copytrs = table.select("tr");
    doc.setBaseUri(data.optString("baseurl"));

    DateTimeFormatter fmt = DateTimeFormat.forPattern("dd.MM.yyyy").withLocale(Locale.GERMAN);

    int trs = copytrs.size();
    if (trs < 2) {
        return;
    }

    // Remember the starting size so the sanity check below also holds for a pre-filled list.
    int sizeBefore = media.size();
    for (int i = 1; i < trs; i++) {
        Element tr = copytrs.get(i);
        ReservedItem item = new ReservedItem();

        item.setTitle(tr.child(0).text().trim().replace("\u00a0", ""));
        item.setAuthor(tr.child(1).text().trim().replace("\u00a0", ""));

        String readyOrStatus = tr.child(4).text().trim().replace("\u00a0", "");
        try {
            item.setReadyDate(fmt.parseLocalDate(readyOrStatus));
        } catch (IllegalArgumentException e) {
            // Not a date: keep the cell text as a free-form status.
            item.setStatus(readyOrStatus);
        }

        if (tr.select("a").size() > 0) {
            item.setCancelData(tr.select("a").last().attr("href"));
        }

        media.add(item);
    }
    assert (media.size() - sizeBefore == trs - 1);
}
From source file:com.astamuse.asta4d.render.RenderUtil.java
private final static void applySnippetResultToElement(Document doc, String snippetRefId, Element snippetElement, Element renderTarget, Renderer renderer) { apply(renderTarget, renderer);//from w w w . j a v a2s . c o m if (snippetElement.ownerDocument() == null) { // it means this snippet element is replaced by a // element completely String reSelector = SelectorUtil.attr(ExtNodeConstants.SNIPPET_NODE_TAG_SELECTOR, ExtNodeConstants.ATTR_SNIPPET_REF, snippetRefId); Elements elems = doc.select(reSelector); if (elems.size() > 0) { snippetElement = elems.get(0); } else { snippetElement = null; } } if (snippetElement != null) { snippetElement.attr(ExtNodeConstants.SNIPPET_NODE_ATTR_STATUS, ExtNodeConstants.SNIPPET_NODE_ATTR_STATUS_FINISHED); } }
From source file:de.geeksfactory.opacclient.apis.Zones.java
static List<ReservedItem> parseResList(Document doc) { List<ReservedItem> reservations = new ArrayList<>(); for (Element table : doc.select( ".MessageBrowseItemDetailsCell table, " + ".MessageBrowseItemDetailsCellStripe" + " table")) { ReservedItem item = new ReservedItem(); for (Element tr : table.select("tr")) { String desc = tr.select(".MessageBrowseFieldNameCell").text().trim(); String value = tr.select(".MessageBrowseFieldDataCell").text().trim(); if (desc.equals("Titel")) item.setTitle(value);//from w ww . ja va2s. c om if (desc.equals("Publikationsform")) item.setFormat(value); if (desc.equals("Liefern an")) item.setBranch(value); if (desc.equals("Status")) item.setStatus(value); } if ("Gelscht".equals(item.getStatus())) continue; reservations.add(item); } return reservations; }
From source file:org.cellcore.code.engine.page.extractor.mtgf.MTGFPageDataExtractor.java
/**
 * Reads the available stock from the page.
 *
 * @param doc the parsed product page
 * @return the text of the last {@code option} under {@code .card-buy} parsed as an
 *         integer, or {@code 0} when there is no such option
 * @throws NumberFormatException if the option text is not an integer
 */
@Override
protected int getStock(Document doc) {
    if (doc.select(".card-buy").select("option").isEmpty()) {
        return 0;
    }
    String lastOptionText = doc.select(".card-buy").select("option").last().text();
    return Integer.parseInt(lastOptionText);
}
From source file:org.cellcore.code.engine.page.extractor.mfrag.MFRAGPageDataExtractor.java
/**
 * Returns the alternative names found on the page.
 *
 * @param doc the parsed product page
 * @return a single-element array holding the combined text of the
 *         {@code .prod-det_s-titre} elements
 */
@Override
protected String[] getOtherNames(Document doc) {
    String[] otherNames = new String[1];
    otherNames[0] = doc.select(".prod-det_s-titre").text();
    return otherNames;
}
From source file:org.cellcore.code.engine.page.extractor.mfrag.MFRAGPageDataExtractor.java
/**
 * Reads the price from the page.
 *
 * @param doc the parsed product page
 * @return the combined text of the {@code .prod-det_prix} elements, passed through
 *         {@code cleanPriceString} and parsed as a float
 * @throws NumberFormatException if the cleaned text is not a valid float
 */
@Override
protected float getPrice(Document doc) {
    String cleanedPrice = this.cleanPriceString(doc.select(".prod-det_prix").text());
    return Float.parseFloat(cleanedPrice);
}