List of usage examples for the org.jsoup.nodes.Element method text()
public String text()
From source file:org.opens.rules.doc.utils.exportdomtocsv.ExportDomToCsv.java
/** * Before using it please set the FOLDER variable with the path where you * want to create your csv file./*from w ww. j av a 2 s .c o m*/ * * @param args * @throws IOException */ public static void main(String[] args) throws IOException { File ref = FileUtils.getFile(FOLDER); JsoupFunc jsf = new JsoupFunc(); Document doc = jsf.getDocument(); Elements thematiques = doc.select("div.thematique"); StringBuilder sb = new StringBuilder(); String testCode = ""; String testLabel = ""; String critere = ""; for (int i = 2; i < thematiques.size(); i++) { String themeIndex = String.valueOf(i - 1) + ""; String theme = (thematiques.get(i).child(0).text() + ""); Elements criteres = thematiques.get(i).select("h3"); for (int j = 1; j < criteres.size(); j++) { Element critereLevel = criteres.get(j); String critereH3String = critereLevel.toString(); String level = critereH3String.substring(critereH3String.indexOf("[") + 1, critereH3String.indexOf("]")) + ""; Elements tests = criteres.get(j).nextElementSibling().select("[id^=test-]"); try { critere = criteres.get(j).id().substring(5, 10) + ""; } catch (StringIndexOutOfBoundsException sioobe) { try { critere = criteres.get(j).id().substring(5, 9) + ""; } catch (StringIndexOutOfBoundsException sioobe2) { critere = criteres.get(j).id().substring(5, 8) + ""; } } String[] critereArray = criteres.get(j).text().split("] "); String critereLabel = critereArray[1].toString() + ""; for (Element el : tests) { Pattern digitPattern = Pattern.compile("\\d+\\.\\d+\\.\\d+\\s?\\:?\\s?"); Matcher matcher = digitPattern.matcher(el.text()); if (matcher.find()) { String testLabelReplace = el.html() .replace("index.php", "http://www.accessiweb.org/index.php").replace("\n", ""); testLabel = testLabelReplace.substring(matcher.end(), testLabelReplace.length()) + ""; } try { testCode = el.id().substring(5, 12) + ""; } catch (StringIndexOutOfBoundsException sioobe) { try { testCode = (el.id().substring(5, 11) + ""); } catch (StringIndexOutOfBoundsException 
sioobe3) { testCode = (el.id().substring(5, 10) + ""); } } sb.append(themeIndex + theme + critere + critereLabel + testCode + testLabel + level + "\n"); } } } FileUtils.writeStringToFile(ref, sb.toString()); }
From source file:org.opens.tanaguru.processing.ProcessRemarkServiceImpl.java
/**
 * Builds a source-code remark for the given element.
 *
 * <p>The remark carries the issue, the message code, the element's line
 * number, node name and snippet, plus one evidence element per entry of
 * {@code evidenceElementList}. The entry "text" (case-insensitive) maps to
 * the element's text; any other entry maps to the attribute of that name.
 *
 * @param processResult the issue stored on the remark
 * @param element the element the remark points at
 * @param messageCode the message code stored on the remark
 * @return the populated remark
 */
@Override
public SourceCodeRemark createSourceCodeRemark(TestSolution processResult, Element element,
        String messageCode) {
    SourceCodeRemark sourceCodeRemark = sourceCodeRemarkFactory.create();
    sourceCodeRemark.setIssue(processResult);
    sourceCodeRemark.setMessageCode(messageCode);
    sourceCodeRemark.setLineNumber(searchElementLineNumber(element));
    sourceCodeRemark.setTarget(element.nodeName());
    sourceCodeRemark.setSnippet(getSnippetFromElement(element));

    for (String evidenceName : evidenceElementList) {
        String evidenceValue = StringUtils.equalsIgnoreCase(evidenceName, "text")
                ? element.text()
                : element.attr(evidenceName);
        sourceCodeRemark.addElement(getEvidenceElement(evidenceName, evidenceValue));
    }
    return sourceCodeRemark;
}
From source file:org.opens.tanaguru.rules.accessiweb22.Aw22Rule06061.java
/**
 * Selects the elements matching {@code NOT_ANCHOR_LINK_CSS_LIKE_QUERY} and
 * records in {@code emptyLinksHandler} those that have blank text and no
 * element carrying a non-blank {@code ALT_ATTR} value.
 */
@Override
protected void select(SSPHandler sspHandler, ElementHandler<Element> elementHandler) {
    ElementSelector linkSelector = new SimpleElementSelector(NOT_ANCHOR_LINK_CSS_LIKE_QUERY);
    linkSelector.selectElements(sspHandler, elementHandler);

    for (Element link : elementHandler.get()) {
        if (!StringUtils.isBlank(link.text())) {
            continue;
        }
        // The regex only matches values holding at least one non-whitespace character.
        Elements withNonBlankAlt =
                link.getElementsByAttributeValueMatching(ALT_ATTR, "^(?=\\s*\\S).*$");
        if (withNonBlankAlt.isEmpty()) {
            emptyLinksHandler.add(link);
        }
    }
}
From source file:org.opens.tanaguru.rules.elementchecker.helper.RuleCheckHelper.java
/**
 * Appends one evidence element for {@code attr} to the given collection.
 *
 * <p>When the element text is requested, the evidence value is the element's
 * text. Otherwise it is built from the attribute, with the last argument of
 * {@code buildAttributeValue} telling whether the attribute is an external
 * resource.
 *
 * @param eeList the collection that receives the new evidence element
 * @param element the element the evidence is read from
 * @param attr the evidence name (attribute name or text marker)
 */
private static void addEvidenceElementToCollection(Collection<EvidenceElement> eeList,
        Element element, String attr) {
    EvidenceElement evidence = isElementTextRequested(attr)
            ? prs.getEvidenceElement(attr, element.text())
            : prs.getEvidenceElement(attr,
                    buildAttributeValue(element, attr, isAttributeExternalResource(attr)));
    eeList.add(evidence);
}
From source file:org.sakaiproject.nakamura.files.migrator.PageMigrator.java
boolean isEmpty(Element htmlElement) { // filter out TinyMCE instances htmlElement.select(".mceEditor").remove(); String htmlContent = htmlElement.text().trim(); String[] elementNames = new String[] { "img", "iframe", "frame", "input", "select", "option" }; boolean containsElement = false; for (String elementName : elementNames) { if (!htmlElement.select(elementName).isEmpty()) { containsElement = true;//from ww w.ja v a2 s. c om } } return !(htmlElement.hasText() || containsElement); }
From source file:org.sbs.goodcrawler.jobconf.ExtractConfig.java
/** * ????/*from w w w . j a v a2 s . c o m*/ * @param doc * @return * @throws ConfigurationException */ public ExtractConfig loadConfig(Document doc) throws ConfigurationException { Elements extractElement = doc.select("extract"); super.jobName = doc.select("job").attr("name"); super.indexName = doc.select("job").attr("indexName"); String temp = extractElement.select("threadNum").text(); if (StringUtils.isNotBlank(temp)) { this.threadNum = Integer.parseInt(temp); } Elements templateElement = extractElement.select("extract").select("template"); Iterator<Element> it = templateElement.iterator(); while (it.hasNext()) { Element template = it.next(); ExtractTemplate extractTemplate = new ExtractTemplate(); // ?Url???? Elements urlPatternElement = template.select("url"); List<Pattern> patterns = Lists.newArrayList(); for (Element urlElement : urlPatternElement) { patterns.add(Pattern.compile(urlElement.text())); } extractTemplate.setUrlPattern(patterns); extractTemplate.setName(template.attr("name")); // ??? Elements selectElement = template.select("elements").first().children(); for (Element element : selectElement) { if ("element".equals(element.tagName())) { AbstractElementCssSelector<?> selector = ElementCssSelectorFactory.create(element); extractTemplate.addCssSelector(selector); } else if ("if".equals(element.tagName())) { IFConditions ifConditions = IFConditions.create(element); extractTemplate.addConditions(ifConditions); } } this.templates.add(extractTemplate); } return this; }
From source file:org.sbs.goodcrawler.jobconf.FetchConfig.java
/** * ???// www .ja v a2 s. co m * @param confFile * @return */ public FetchConfig loadConfig(Document confDoc) throws ConfigurationException { try { Document doc = confDoc; super.jobName = doc.select("job").attr("name"); super.indexName = doc.select("job").attr("indexName"); Elements e = doc.select("fetch"); this.type = e.select("type").text(); this.agent = e.select("agent").text(); String temp = e.select("threadNum").text(); if (StringUtils.isNotBlank(temp)) { this.threadNum = Integer.parseInt(temp); } temp = e.select("delayBetweenRequests").text(); if (StringUtils.isNotBlank(temp)) { this.delayBetweenRequests = Integer.parseInt(temp); } temp = e.select("maxDepthOfCrawling").text(); if (StringUtils.isNotBlank(temp)) { this.maxDepthOfCrawling = Integer.parseInt(temp); } temp = e.select("fetchBinaryContent").text(); if (StringUtils.isNotBlank(temp)) { this.fetchBinaryContent = Boolean.parseBoolean(temp); } if (StringUtils.isNotBlank(e.select("maxOutgoingLinksToFollow").text())) { this.maxOutgoingLinksToFollow = Integer.parseInt(e.select("maxOutgoingLinksToFollow").text()); } temp = e.select("fileSuffix").text(); if (StringUtils.isNotBlank(temp)) { this.fileSuffix = temp; } temp = e.select("maxDownloadSizePerPage").text(); if (StringUtils.isNotBlank(temp)) { this.maxDownloadSizePerPage = Integer.parseInt(temp); } temp = e.select("https").text(); if (StringUtils.isNotBlank(temp)) { this.https = Boolean.parseBoolean(temp); } temp = e.select("onlyDomain").text(); if (StringUtils.isNotBlank(temp)) { this.onlyDomain = Boolean.parseBoolean(temp); } temp = e.select("socketTimeoutMilliseconds").text(); if (StringUtils.isNotBlank(temp)) { this.socketTimeoutMilliseconds = Integer.parseInt(temp); } temp = e.select("connectionTimeout").text(); if (StringUtils.isNotBlank(temp)) { this.connectionTimeout = Integer.parseInt(temp); } temp = e.select("maxTotalConnections").text(); if (StringUtils.isNotBlank(temp)) { this.maxTotalConnections = Integer.parseInt(temp); } temp = 
e.select("maxConnectionsPerHost").text(); if (StringUtils.isNotBlank(temp)) { this.maxConnectionsPerHost = Integer.parseInt(e.select("maxConnectionsPerHost").text()); } temp = e.select("maxConnectionsPerHost").text(); if (StringUtils.isNotBlank(temp)) { this.maxConnectionsPerHost = Integer.parseInt(temp); } if (StringUtils.isNotBlank(e.select("proxyHost").text())) { this.proxyHost = e.select("proxyHost").text(); } if (StringUtils.isNotBlank(e.select("proxyPort").text())) { this.proxyPort = Integer.parseInt(e.select("proxyPort").text()); } if (StringUtils.isNotBlank(e.select("proxyUsername").text())) { this.proxyUsername = e.select("proxyUsername").text(); } if (StringUtils.isNotBlank(e.select("proxyPassword").text())) { this.proxyPassword = e.select("proxyPassword").text(); } if (StringUtils.isNotBlank(e.select("proxyHost").text())) { this.proxyHost = e.select("proxyHost").text(); } // seed Elements seeds = doc.select("fetch seeds seed"); for (Element element : seeds) { WebURL url = new WebURL(); String seed = element.text(); this.seeds.add(seed); url.setURL(seed); url.setJobName(jobName); url.setDepth((short) 0); try { PendingManager.getPendingUlr(jobName).addElement(url); BloomfilterHelper.getInstance().add(url.getURL()); } catch (QueueException e1) { e1.printStackTrace(); } } /* * ??Url */ Elements fetchUrlFilters = doc.select("fetchUrlFilters filter"); for (Element element : fetchUrlFilters) { this.fetchUrlFilters.add(element.text()); } /* * ?????Url */ Elements extractUrlfilters = doc.select("extractUrlfilters filter"); for (Element element : extractUrlfilters) { this.extractUrlfilters.add(element.text()); } } catch (NumberFormatException e) { throw new ConfigurationException("?" + e.getMessage()); } return this; }
From source file:org.sbs.goodcrawler.plugin.extract.ExtractorDytt8.java
/**
 * Extracts movie information from a fetched page and stores it as an extracted page.
 *
 * <p>Steps performed by the code below:
 * <ul>
 *   <li>parses the page content with its declared charset and the base URL of the page;</li>
 *   <li>returns {@code null} when the page URL contains {@code "game/"};</li>
 *   <li>queues every anchor whose absolute {@code href} is non-blank and accepted by
 *       {@code filterUrls} on {@code pendingUrls}, logging any failure;</li>
 *   <li>returns {@code null} when no {@code #Zoom} element exists;</li>
 *   <li>fills the result map with {@code "movie"} (text of {@code h1}), {@code "type"}
 *       (second token of the {@code h2 a} text, or {@code "unknow"}), {@code "url"},
 *       and for the {@code "content"} selector {@code "img"}, {@code "jq"},
 *       {@code "download"} plus whatever {@code getInfoName} adds;</li>
 *   <li>converts a non-blank {@code "nd"} entry to an {@code Integer};</li>
 *   <li>hands the extracted page to {@code pendingStore.addExtracedPage}.</li>
 * </ul>
 *
 * <p>NOTE(review): {@code selects} is assigned {@code null} (the call to
 * {@code conf.getSelects()} is commented out) and is then dereferenced by
 * {@code selects.entrySet()} - confirm how this is meant to be initialised.
 *
 * <p>NOTE(review): several string literals are empty ({@code replace("", "")},
 * {@code indexOf("")}, {@code startsWith("")}, {@code split("")}, {@code contains("")})
 * or a lone {@code "?"}; these look like non-ASCII text lost from the source. As written,
 * {@code startsWith("")} is always true and {@code split("?")} is not a valid regular
 * expression - restore the original literals before relying on this method.
 *
 * <p>NOTE(review): the results of the {@code s.trim()} calls are discarded.
 *
 * @param page the fetched page; may be {@code null}
 * @return the extracted page, or {@code null} when the page is {@code null}, is skipped,
 *         lacks the expected elements, or its charset is unsupported
 */
@Override public ExtractedPage<?, ?> onExtract(Page page) { if (null != page) { try {//from w ww. j a va2s.c om Document doc = Jsoup.parse(new String(page.getContentData(), page.getContentCharset()), urlUtils.getBaseUrl(page.getWebURL().getURL())); if (null != page.getWebURL().getURL() && page.getWebURL().getURL().contains("game/")) return null; // ???Url?Url Elements links = doc.getElementsByTag("a"); if (!links.isEmpty()) { for (Element link : links) { String linkHref = link.absUrl("href"); if (StringUtils.isNotBlank(linkHref) && filterUrls(linkHref)) { try { WebURL url = new WebURL(); url.setURL(linkHref); url.setJobName(conf.jobName); pendingUrls.addUrl(url); } catch (QueueException e) { log.error(e.getMessage()); } catch (Exception e) { log.error(e.getMessage()); } } } } // ?? // Map<String, String> selects = conf.getSelects(); Map<String, String> selects = null; ExtractedPage<String, Object> epage = pendingStore.new ExtractedPage<String, Object>(); epage.setUrl(page.getWebURL()); HashMap<String, Object> result = new HashMap<>(); Elements text = doc.select("#Zoom"); if (null == text || text.size() == 0) { return null; } String name = doc.select("h1").text(); name = name.replace("", "").replace("<<", "").replace("", "").replace(">>", ""); result.put("movie", name); // result.put("_id", name); String ts[] = doc.select("h2 a").text().split(" "); if (ts.length >= 2) { result.put("type", ts[1].trim()); } else { result.put("type", "unknow"); } result.put("url", page.getWebURL().getURL()); for (Entry<String, String> entry : selects.entrySet()) { Elements elements = doc.select(entry.getValue()); if (elements.isEmpty()) return null; else { if ("content".equals(entry.getKey())) { for (Element element : elements) { // Elements imgs = element.select("img[src]"); StringBuilder sb = new StringBuilder(); for (Element img : imgs) { sb.append(img.attr("src")).append(";"); } result.put("img", sb.toString()); // ? 
Elements movieInfos = element.select("p"); for (Element info : movieInfos) { String infotext = info.text(); try { String infotext_ = info.html(); int start, end = 0; start = infotext_.indexOf(""); if (start > 0) { end = infotext_.lastIndexOf(""); if (end > 0 && start < end) { result.put("jq", infotext_.substring(start, end)); } else { end = infotext_.lastIndexOf("."); if (end > 0 && start < end) { result.put("jq", infotext_.substring(start, end)); } } } infotext_ = null; } catch (Exception e) { e.printStackTrace(); } if (infotext.startsWith("")) { String ss[] = infotext.split(""); for (String s : ss) { s.trim(); result = getInfoName(s, result); } } else if (infotext.startsWith("?")) { String ss[] = infotext.split("?"); for (String s : ss) { s.trim(); result = getInfoName(s, result); } } else if (infotext.contains("")) { infotext = info.html(); String[] ss = infotext.split("<br />"); for (String s : ss) { s.trim(); result = getInfoName(s, result); } } else if (infotext.contains(":")) { infotext = info.html(); String[] ss = infotext.split("<br />"); for (String s : ss) { s.trim(); result = getInfoName(s, result); } } } // if(result.size()<5){ // result.put("content", value) // } // ? Elements elements2 = elements.select("td"); sb.setLength(0); for (Element download : elements2) { sb.append(download.text()).append(";"); } result.put("download", sb.toString()); } } } // result.put(entry.getKey(), elements.html()); } if (StringUtils.isNotBlank((String) result.get("nd"))) { result.put("nd", Integer.parseInt((String) result.get("nd"))); } epage.setMessages(result); try { pendingStore.addExtracedPage(epage); } catch (QueueException e) { log.error(e.getMessage()); } return epage; } catch (UnsupportedEncodingException e) { log.error(e.getMessage()); e.printStackTrace(); } } return null; }
From source file:org.sbs.goodcrawler.plugin.extract.ExtractYouku.java
/** * ??//from w ww . j av a 2 s. co m * @param url * @return */ public HashMap<String, Object> getInformation(Page page) { HashMap<String, Object> map = Maps.newHashMap(); String url = page.getWebURL().getURL(); try { ExtractedPage<String, Object> epage = pendingStore.new ExtractedPage<String, Object>(); epage.setUrl(page.getWebURL()); Document doc = Jsoup.parse(new String(page.getContentData(), page.getContentCharset()), urlUtils.getBaseUrl(page.getWebURL().getURL())); // ???Url?Url Elements links = doc.getElementsByTag("a"); if (!links.isEmpty()) { for (Element link : links) { String linkHref = link.absUrl("href"); if (filterUrls(linkHref)) { WebURL weburl = new WebURL(); weburl.setURL(linkHref); weburl.setJobName(conf.jobName); Submitor.submitUrl(weburl); } } } if (url.contains("/show_page/")) { String title = doc.select(".title .name").text(); if (StringUtils.isBlank(title)) return null; map.put("title", title); String category = doc.select(".title .type a").text(); if (StringUtils.isBlank(category)) return null; map.put("category", category); String _year = CharMatcher.DIGIT.retainFrom(doc.select(".title .pub").text()); if (StringUtils.isNotBlank(_year)) { int year = Integer.parseInt(_year); map.put("year", year); } String score = CharMatcher.DIGIT.retainFrom(doc.select(".ratingstar .num").text()); map.put("score", score); String alias = doc.select(".alias").text(); if (alias.contains(":")) { map.put("translation", alias.split(":")[1]); } String img = doc.select(".thumb img").attr("src"); if (StringUtils.isBlank(img)) return null; map.put("thumbnail", img); String area = doc.select(".row2 .area a").text(); if (StringUtils.isBlank(area)) return null; map.put("area", area); String[] type = doc.select(".row2 .type a").text().split(" "); if (null == type || type.length == 0) return null; map.put("type", Sets.newHashSet(type)); String director = doc.select(".row2 .director a").text(); map.put("director", director); String _duration = 
CharMatcher.DIGIT.retainFrom(doc.select(".row2 .duration").text()); if (StringUtils.isNotBlank(_duration)) { int duration = Integer.parseInt(_duration); map.put("duration", duration); } String _hot = CharMatcher.anyOf(",").removeFrom(doc.select(".row2 .vr .num").text()); _hot = CharMatcher.DIGIT.retainFrom(_hot); if (StringUtils.isNotBlank(_hot)) { int hot = Integer.parseInt(_hot); map.put("hot", hot); } String sumary = doc.select(".detail .long").text(); map.put("summary", sumary); // Elements elements = doc.select(".baseaction a"); HashMap<String, String> playList = Maps.newHashMap(); for (Element element : elements) { String n = element.text(); String urlString = element.attr("href"); if (StringUtils.isBlank(urlString)) return null; Document d2 = Jsoup.parse(new URL(urlString), 10000); if (null != d2) { String x = d2.select("#link2").attr("value"); if (StringUtils.isBlank(x)) return null; playList.put(n, x); } } map.put("online", playList); } else if (url.contains("/v_show/")) { Document d3 = Jsoup.parse(new String(page.getContentData(), page.getContentCharset()), urlUtils.getBaseUrl(page.getWebURL().getURL())); // ???Url?Url Elements links2 = d3.getElementsByTag("a"); if (!links2.isEmpty()) { for (Element link : links2) { String linkHref = link.absUrl("href"); if (filterUrls(linkHref)) { WebURL weburl = new WebURL(); weburl.setURL(linkHref); weburl.setJobName(conf.jobName); try { pendingUrls.addUrl(weburl); } catch (QueueException e) { log.error(e.getMessage()); } } } } String p = d3.select("h1.title a").attr("href"); if (StringUtils.isBlank(p)) return null; return getInformation(p); } } catch (MalformedURLException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } if (map != null && map.size() > 4) { if (null == map.get("year")) { map.put("year", 1800); } } return map; }
From source file:org.sbs.goodcrawler.plugin.extract.ExtractYouku.java
private HashMap<String, Object> getInformation(String p) { HashMap<String, Object> map = Maps.newHashMap(); try {/*w ww . ja v a 2 s. co m*/ if (p.contains("/show_page/")) { Document doc = Jsoup.parse(new URL(p), 15000); // ???Url?Url Elements links = doc.getElementsByTag("a"); if (!links.isEmpty()) { for (Element link : links) { String linkHref = link.absUrl("href"); if (filterUrls(linkHref)) { WebURL weburl = new WebURL(); weburl.setURL(linkHref); weburl.setJobName(conf.jobName); try { pendingUrls.addUrl(weburl); } catch (QueueException e) { log.error(e.getMessage()); } } } } String title = doc.select(".title .name").text(); if (StringUtils.isBlank(title)) return null; map.put("title", title); String category = doc.select(".title .type a").text(); if (StringUtils.isBlank(category)) return null; map.put("category", category); String _year = CharMatcher.DIGIT.retainFrom(doc.select(".title .pub").text()); if (StringUtils.isNotBlank(_year)) { int year = Integer.parseInt(_year); map.put("year", year); } String score = CharMatcher.DIGIT.retainFrom(doc.select(".ratingstar .num").text()); map.put("score", score); String alias = doc.select(".alias").text(); if (alias.contains(":")) { map.put("translation", alias.split(":")[1]); } String img = doc.select(".thumb img").attr("src"); if (StringUtils.isBlank(img)) return null; map.put("thumbnail", Lists.newArrayList(img)); String area = doc.select(".row2 .area a").text(); if (StringUtils.isBlank(area)) return null; map.put("area", area); String[] type = doc.select(".row2 .type a").text().split(" "); if (null == type || type.length == 0) return null; map.put("type", Lists.newArrayList(type)); String director = doc.select(".row2 .director a").text(); map.put("director", director); String _duration = CharMatcher.DIGIT.retainFrom(doc.select(".row2 .duration").text()); if (StringUtils.isNotBlank(_duration)) { int duration = Integer.parseInt(_duration); map.put("duration", duration); } String _hot = 
CharMatcher.anyOf(",").removeFrom(doc.select(".row2 .vr .num").text()); if (StringUtils.isNotBlank(_hot)) { int hot = Integer.parseInt(_hot); map.put("hot", hot); } String sumary = doc.select(".detail .long").text(); map.put("summary", sumary); // Elements elements = doc.select(".baseaction a"); HashMap<String, String> playList = Maps.newHashMap(); for (Element element : elements) { String n = element.text(); String urlString = element.attr("href"); if (StringUtils.isBlank(urlString)) return null; Document d2 = Jsoup.parse(new URL(urlString), 10000); if (null != d2) { String x = d2.select("#link2").attr("value"); if (StringUtils.isBlank(x)) return null; playList.put(n, x); } } map.put("online", playList); } else return null; } catch (Exception e) { return map; } return map; }