List of usage examples for org.jsoup.nodes.Element id()
public String id()
From source file:org.keycloak.testsuite.util.saml.LoginBuilder.java
public static HttpUriRequest handleLoginPage(UserRepresentation user, String loginPage) { String username = user.getUsername(); String password = getPasswordOf(user); org.jsoup.nodes.Document theLoginPage = Jsoup.parse(loginPage); List<NameValuePair> parameters = new LinkedList<>(); for (Element form : theLoginPage.getElementsByTag("form")) { String method = form.attr("method"); String action = form.attr("action"); boolean isPost = method != null && "post".equalsIgnoreCase(method); for (Element input : form.getElementsByTag("input")) { if (Objects.equals(input.id(), "username")) { parameters.add(new BasicNameValuePair(input.attr("name"), username)); } else if (Objects.equals(input.id(), "password")) { parameters.add(new BasicNameValuePair(input.attr("name"), password)); } else { parameters.add(new BasicNameValuePair(input.attr("name"), input.val())); }//from w ww. j ava2 s. co m } if (isPost) { HttpPost res = new HttpPost(action); UrlEncodedFormEntity formEntity; try { formEntity = new UrlEncodedFormEntity(parameters, "UTF-8"); } catch (UnsupportedEncodingException e) { throw new RuntimeException(e); } res.setEntity(formEntity); return res; } else { UriBuilder b = UriBuilder.fromPath(action); for (NameValuePair parameter : parameters) { b.queryParam(parameter.getName(), parameter.getValue()); } return new HttpGet(b.build()); } } throw new IllegalArgumentException("Invalid login form: " + loginPage); }
From source file:org.keycloak.testsuite.util.saml.RequiredConsentBuilder.java
/**
 * Prepares the GET/POST request that answers a consent page by submitting the first
 * {@code <form>} found in it.
 *
 * <p>The inputs with id {@code "kc-login"} and {@code "kc-cancel"} are treated as the two
 * decision buttons: the former is submitted only when {@code approveConsent} is set, the
 * latter only when it is not. Every other input is submitted with its current value.
 *
 * @param consentPage HTML source of the consent page
 * @param currentURI  URI against which the form action is resolved for a POST request
 * @return a POST request with a UTF-8 URL-encoded entity when the form method is
 *         {@code post} (case-insensitive), otherwise a GET request with the parameters as
 *         query parameters
 * @throws IllegalArgumentException if the page contains no form
 */
public HttpUriRequest handleConsentPage(String consentPage, URI currentURI) {
    final List<Element> forms = Jsoup.parse(consentPage).getElementsByTag("form");
    if (forms.isEmpty()) {
        throw new IllegalArgumentException("Invalid consent page: " + consentPage);
    }

    // Only the first form on the page is ever submitted.
    final Element consentForm = forms.get(0);
    final String target = consentForm.attr("action");

    final List<NameValuePair> fields = new LinkedList<>();
    for (Element field : consentForm.getElementsByTag("input")) {
        final String fieldId = field.id();
        if ("kc-login".equals(fieldId)) {
            if (approveConsent) {
                fields.add(new BasicNameValuePair(field.attr("name"), field.attr("value")));
            }
            continue;
        }
        if ("kc-cancel".equals(fieldId)) {
            if (!approveConsent) {
                fields.add(new BasicNameValuePair(field.attr("name"), field.attr("value")));
            }
            continue;
        }
        fields.add(new BasicNameValuePair(field.attr("name"), field.val()));
    }

    if (!"post".equalsIgnoreCase(consentForm.attr("method"))) {
        // NOTE(review): unlike the POST branch, the action is not resolved against currentURI
        // here - confirm whether that asymmetry is intended.
        final UriBuilder query = UriBuilder.fromPath(target);
        for (NameValuePair field : fields) {
            query.queryParam(field.getName(), field.getValue());
        }
        return new HttpGet(query.build());
    }

    final HttpPost post = new HttpPost(currentURI.resolve(target));
    try {
        post.setEntity(new UrlEncodedFormEntity(fields, "UTF-8"));
    } catch (UnsupportedEncodingException e) {
        throw new RuntimeException(e);
    }
    return post;
}
From source file:com.romeikat.datamessie.core.processing.service.cleaning.extract.TagExctractor.java
private String extractContent(final RawContent rawContent, final Document document, final String tagSelector) { if (tagSelector == null || tagSelector.isEmpty()) { return null; }//from ww w .jav a 2 s .co m // Parse tag selector String tagName = null; String idName = null; List<String> classNames = null; final String warningMessage = "Could not apply tag selecting rule on document " + document.getId() + " (" + document.getUrl() + ") due to malformed tag selector " + tagSelector + " of source " + document.getSourceId(); try { final String[] parts = tagSelector.split("#"); tagName = parts[0]; if (tagName.isEmpty()) { tagName = null; } if (parts.length >= 2) { idName = parts[1]; if (idName.isEmpty()) { idName = null; } } if (parts.length >= 3) { classNames = Arrays.asList(parts[2].split(" ")); } if (tagName == null || idName == null && classNames == null) { LOG.warn(warningMessage); return null; } } catch (final Exception e) { LOG.warn(warningMessage, e); return null; } // With tag selector, search for appropriate element final org.jsoup.nodes.Document jsoupDocument = Jsoup.parse(rawContent.getContent()); final List<Element> matchingElements = new ArrayList<Element>(); final Elements elementsWithTagName = jsoupDocument.getElementsByTag(tagName); for (final Element elementWithTagName : elementsWithTagName) { final boolean idNameMatches = idName == null || elementWithTagName.id().equals(idName); final boolean classNamesMatch = classNames == null || elementWithTagName.classNames().containsAll(classNames); if (idNameMatches && classNamesMatch) { matchingElements.add(elementWithTagName); } } // Unique match found if (matchingElements.size() == 1) { final Element matchingElement = matchingElements.get(0); return matchingElement.html(); } // No unique match found return null; }
From source file:com.astamuse.asta4d.web.form.field.impl.AbstractRadioAndCheckboxRenderer.java
/**
 * Creates a renderer that, unless data is already stored for {@code editTargetSelector},
 * collects the (id, value) pairs of the edit targets, derives an option list from them and
 * stores it as an {@link OptionValueMap} via {@code PrepareRenderingDataUtil}.
 *
 * @param editTargetSelector    selector of the input elements to inspect
 * @param displayTargetSelector selector passed through when the option map is stored
 * @return the renderer; it has nothing registered when stored data already exists
 * @throws IllegalArgumentException (raised from the registered ":root" renderable) if an
 *         input has no id and {@code allowNonIdItems()} returns false
 */
protected Renderer retrieveAndCreateValueMap(final String editTargetSelector, final String displayTargetSelector) {
    Renderer render = Renderer.create();
    if (PrepareRenderingDataUtil.retrieveStoredDataFromContextBySelector(editTargetSelector) == null) {
        final List<Pair<String, String>> inputList = new LinkedList<>();
        final List<OptionValuePair> optionList = new LinkedList<>();
        // step 1: remember id and value of every element matched by the edit selector
        render.add(editTargetSelector, new ElementSetter() {
            @Override
            public void set(Element elem) {
                inputList.add(Pair.of(elem.id(), elem.attr("value")));
            }
        });
        // step 2: registered after step 1, so it works on the collected inputList
        render.add(":root", new Renderable() {
            @Override
            public Renderer render() {
                Renderer render = Renderer.create();
                for (Pair<String, String> input : inputList) {
                    String id = input.getLeft();
                    final String value = input.getRight();
                    if (StringUtils.isEmpty(id)) {
                        if (allowNonIdItems()) {
                            // no id, hence no label lookup: the value doubles as display text
                            optionList.add(new OptionValuePair(value, value));
                        } else {
                            String msg = "The target item[%s] must have id specified.";
                            throw new IllegalArgumentException(String.format(msg, editTargetSelector));
                        }
                    } else {
                        // the text of the label found under the element whose "for" refers to this id
                        // becomes the display text of the option
                        render.add(SelectorUtil.attr("for", id), Renderer.create("label", new ElementSetter() {
                            @Override
                            public void set(Element elem) {
                                optionList.add(new OptionValuePair(value, elem.text()));
                            }
                        }));
                        // NOTE(review): the store step is only registered from this branch (once per
                        // input with an id); if every input lacks an id nothing is stored - confirm
                        // that this is intended.
                        render.add(":root", new Renderable() {
                            @Override
                            public Renderer render() {
                                PrepareRenderingDataUtil.storeDataToContextBySelector(editTargetSelector, displayTargetSelector,
                                        new OptionValueMap(optionList));
                                return Renderer.create();
                            }
                        });
                    }
                } // end for loop
                return render;
            }
        });
    }
    return render;
}
From source file:com.astamuse.asta4d.web.form.field.impl.AbstractRadioAndCheckboxRenderer.java
/**
 * Creates a renderer that prepares the display DOM for the given values.
 *
 * <p>The edit targets are split by whether their {@code value} attribute is contained in
 * {@code valueList}. If none matches, {@link #addDefaultAlternativeDom} is used. Otherwise,
 * for every matched input, the delayed-hidden flag attribute is set to {@code Clear} on its
 * label (looked up by the label-ref attribute, with {@code label[for=id]} as fallback) and on
 * any duplicator container that label refers to.
 *
 * @param editTargetSelector selector of the input elements
 * @param valueList          values that are to be displayed
 * @return the prepared renderer
 */
protected Renderer addAlternativeDom(final String editTargetSelector, final List<String> valueList) {
    Renderer renderer = Renderer.create();
    // renderer.addDebugger("entry root");
    // renderer.addDebugger("entry root:edit target:", editTargetSelector);
    final List<String> matchedIdList = new LinkedList<>();
    // collected below but not read anywhere in this method
    final List<String> unMatchedIdList = new LinkedList<>();
    renderer.add(editTargetSelector, new ElementSetter() {
        @Override
        public void set(Element elem) {
            if (valueList.contains((elem.attr("value")))) {
                matchedIdList.add(elem.id());
            } else {
                unMatchedIdList.add(elem.id());
            }
        }
    });
    // registered after the collecting setter, so the id lists are filled when this one runs
    renderer.add(":root", new Renderable() {
        @Override
        public Renderer render() {
            Renderer renderer = Renderer.create().disableMissingSelectorWarning();
            // renderer.addDebugger("before hide unmatch");
            // renderer.addDebugger("before add match");
            if (matchedIdList.isEmpty()) {
                renderer.add(addDefaultAlternativeDom(editTargetSelector, valueList));
            } else {
                // do nothing for remaining the existing label element
                // but we still have to revive the possibly existing duplicate container
                for (final String inputId : matchedIdList) {
                    final List<String> matchedDuplicatorRefList = new LinkedList<>();
                    final String labelRefSelector = SelectorUtil.attr(RadioPrepareRenderer.LABEL_REF_ATTR, inputId);
                    final String labelDefaultSelector = SelectorUtil.attr(SelectorUtil.tag("label"), "for", inputId);
                    // collect the duplicator reference from the element carrying the label-ref attribute
                    renderer.add(labelRefSelector, new ElementSetter() {
                        @Override
                        public void set(Element elem) {
                            String ref = elem.attr(RadioPrepareRenderer.DUPLICATOR_REF_ATTR);
                            if (StringUtils.isNotEmpty(ref)) {
                                matchedDuplicatorRefList.add(ref);
                            }
                        }
                    });
                    // alternative renderer for labelRefSelector: read the reference from label[for=inputId]
                    renderer.add(new ElementNotFoundHandler(labelRefSelector) {
                        @Override
                        public Renderer alternativeRenderer() {
                            return Renderer.create(labelDefaultSelector, new ElementSetter() {
                                @Override
                                public void set(Element elem) {
                                    String ref = elem.attr(RadioPrepareRenderer.DUPLICATOR_REF_ATTR);
                                    if (StringUtils.isNotEmpty(ref)) {
                                        matchedDuplicatorRefList.add(ref);
                                    }
                                }// end set
                            });
                        }// end alternativeRenderer
                    });// end ElementNotFoundHandler
                    // set the delayed-hidden flag attribute to Clear on the duplicator containers and labels
                    renderer.add(":root", new Renderable() {
                        @Override
                        public Renderer render() {
                            Renderer renderer = Renderer.create().disableMissingSelectorWarning();
                            for (String ref : matchedDuplicatorRefList) {
                                renderer.add(SelectorUtil.attr(RadioPrepareRenderer.DUPLICATOR_REF_ID_ATTR, ref),
                                        ToBeHiddenLaterFlagAttr, Clear);
                            }
                            renderer.add(labelRefSelector, ToBeHiddenLaterFlagAttr, Clear);
                            renderer.add(labelDefaultSelector, ToBeHiddenLaterFlagAttr, Clear);
                            return renderer.enableMissingSelectorWarning();
                        }
                    });
                }
            }
            return renderer.enableMissingSelectorWarning();
        }
    });
    return renderer;
}
From source file:com.astamuse.asta4d.web.form.field.impl.AbstractRadioAndCheckboxRenderer.java
protected Renderer setDelayedHiddenFlag(final String targetSelector) { // hide the input element final List<String> duplicatorRefList = new LinkedList<>(); final List<String> idList = new LinkedList<>(); Renderer renderer = Renderer.create(targetSelector, new ElementSetter() { @Override//ww w . ja va2s . co m public void set(Element elem) { String duplicatorRef = elem.attr(RadioPrepareRenderer.DUPLICATOR_REF_ATTR); if (StringUtils.isNotEmpty(duplicatorRef)) { duplicatorRefList.add(duplicatorRef); } idList.add(elem.id()); } }); return renderer.add(":root", new Renderable() { @Override public Renderer render() { Renderer render = Renderer.create().disableMissingSelectorWarning(); for (String ref : duplicatorRefList) { render.add(SelectorUtil.attr(RadioPrepareRenderer.DUPLICATOR_REF_ID_ATTR, ref), ToBeHiddenLaterFlagAttr, ""); } for (String id : idList) { render.add(SelectorUtil.attr(RadioPrepareRenderer.LABEL_REF_ATTR, id), ToBeHiddenLaterFlagAttr, ""); } for (String id : idList) { render.add(SelectorUtil.attr("label", "for", id), ToBeHiddenLaterFlagAttr, ""); } render.add(targetSelector, ToBeHiddenLaterFlagAttr, ""); // render.addDebugger("after set hidden flag"); return render.enableMissingSelectorWarning(); } }); }
From source file:com.astamuse.asta4d.web.form.field.impl.AbstractRadioAndCheckboxRenderer.java
protected Renderer addDefaultAlternativeDom(final String editTargetSelector, final List<String> valueList) { final List<String> duplicatorRefList = new LinkedList<>(); final List<String> idList = new LinkedList<>(); ClosureVarRef<Boolean> editTargetExists = new ClosureVarRef<Boolean>(false); Renderer renderer = Renderer.create(editTargetSelector, new ElementSetter() { @Override//from ww w . j a va 2s. c o m public void set(Element elem) { String duplicatorRef = elem.attr(RadioPrepareRenderer.DUPLICATOR_REF_ATTR); if (StringUtils.isNotEmpty(duplicatorRef)) { duplicatorRefList.add(duplicatorRef); } idList.add(elem.id()); editTargetExists.set(true); } }); /* renderer.add(":root", () -> { return Renderer.create().addDebugger("current root for addDefaultAlternativeDom"); }); */ renderer.add(":root", new Renderable() { @Override public Renderer render() { // skip create display alternative DOM if edit target does not exist. if (editTargetExists.get()) { // it is OK } else { return Renderer.create(); } String attachTargetSelector; if (duplicatorRefList.size() > 0) { attachTargetSelector = SelectorUtil.attr(RadioPrepareRenderer.DUPLICATOR_REF_ID_ATTR, duplicatorRefList.get(duplicatorRefList.size() - 1)); } else if (idList.size() == 0) { String msg = "The target item[%s] must have id specified."; throw new IllegalArgumentException(String.format(msg, editTargetSelector)); } else { attachTargetSelector = SelectorUtil.id(idList.get(idList.size() - 1)); } return new Renderer(attachTargetSelector, new ElementTransformer(null) { @Override public Element invoke(Element elem) { GroupNode group = new GroupNode(); Element editClone = elem.clone(); group.appendChild(editClone); for (String v : valueList) { String nonNullString = retrieveDisplayStringFromStoredOptionValueMap(editTargetSelector, v); group.appendChild(createAlternativeDisplayElement(nonNullString)); } return group; }// invoke });// new renderer }// render() });// renderable return renderer; }
From source file:de.geeksfactory.opacclient.apis.Heidi.java
/**
 * Requests a reservation of the given item.
 *
 * <p>Steps, each depending on what the returned page contains:
 * <ol>
 * <li>open {@code bestellung.cgi} for the item;</li>
 * <li>if a password field is present, log in with the account and abort with an error result
 * if the login box reports a message;</li>
 * <li>if a pickup location ({@code ort}) has to be chosen, either submit {@code selection} or,
 * when none was given, return the available options with status
 * {@code SELECTION_NEEDED};</li>
 * <li>otherwise evaluate the error / message boxes of the final page.</li>
 * </ol>
 *
 * @param item       item to reserve; only its id is used
 * @param account    account used when a login is requested
 * @param useraction not used by this implementation
 * @param selection  previously chosen {@code ort} value, or {@code null} if none was chosen yet
 * @return the result of the reservation attempt
 * @throws IOException declared for the HTTP helper calls
 */
@Override
public ReservationResult reservation(DetailledItem item, Account account, int useraction, String selection)
        throws IOException {
    String html = httpGet(opac_url + "/bestellung.cgi?ks=" + item.getId() + "&sess=" + sessid, ENCODING, false,
            cookieStore);
    Document doc = Jsoup.parse(html);

    if (doc.select("input[name=pw]").size() > 0) {
        // Login required before the reservation page is shown
        List<NameValuePair> nameValuePairs = new ArrayList<>();
        nameValuePairs.add(new BasicNameValuePair("id", account.getName()));
        nameValuePairs.add(new BasicNameValuePair("pw", account.getPassword()));
        nameValuePairs.add(new BasicNameValuePair("sess", sessid));
        nameValuePairs.add(new BasicNameValuePair("log", "login"));
        nameValuePairs.add(new BasicNameValuePair("weiter", "bestellung.cgi?ks=" + item.getId()));
        html = httpPost(opac_url + "/login.cgi", new UrlEncodedFormEntity(nameValuePairs), ENCODING);
        doc = Jsoup.parse(html);
        if (doc.select(".loginbox .meld").size() > 0) {
            return new ReservationResult(MultiStepResult.Status.ERROR, doc.select(".loginbox .meld").text());
        }
    }

    if (doc.select("input[name=ort]").size() > 0) {
        if (selection != null) {
            List<NameValuePair> nameValuePairs = new ArrayList<>();
            nameValuePairs.add(new BasicNameValuePair("ks", item.getId()));
            nameValuePairs.add(new BasicNameValuePair("ort", selection));
            nameValuePairs.add(new BasicNameValuePair("sess", sessid));
            nameValuePairs.add(new BasicNameValuePair("funktion", "Vormerkung"));
            html = httpPost(opac_url + "/bestellung.cgi", new UrlEncodedFormEntity(nameValuePairs), ENCODING);
            doc = Jsoup.parse(html);
        } else {
            List<Map<String, String>> options = new ArrayList<>();
            for (Element input : doc.select("input[name=ort]")) {
                String key = input.attr("value");
                String inputId = input.id();
                // An input without id cannot be referenced by a label, and first() yields null
                // when no label matches; fall back to the raw value instead of failing with a
                // NullPointerException.
                Element label = inputId.isEmpty() ? null : doc.select("label[for=" + inputId + "]").first();
                Map<String, String> selopt = new HashMap<>();
                selopt.put("key", key);
                selopt.put("value", label != null ? label.text() : key);
                options.add(selopt);
            }
            ReservationResult res = new ReservationResult(MultiStepResult.Status.SELECTION_NEEDED);
            res.setSelection(options);
            return res;
        }
    }

    if (doc.select(".fehler").size() > 0) {
        String text = doc.select(".fehler").text();
        return new ReservationResult(MultiStepResult.Status.ERROR, text);
    }

    String text = doc.select(".meld2").text();
    if (text.contains("Das Medium wurde")) {
        return new ReservationResult(MultiStepResult.Status.OK, text);
    } else {
        return new ReservationResult(MultiStepResult.Status.ERROR, text);
    }
}
From source file:com.astamuse.asta4d.web.form.field.impl.AbstractRadioAndCheckboxPrepareRenderer.java
/**
 * Builds the pre-rendering pipeline for a radio/checkbox field.
 *
 * <p>Depending on the configuration visible in this method:
 * <ul>
 * <li>{@code optionMap} set, no {@code duplicateSelector}: the single input is wrapped in a
 * group node, its label(s) are moved into that wrapper, and the wrapper is then rendered once
 * per option;</li>
 * <li>{@code optionMap} and {@code duplicateSelector} set: only the input id is recorded and
 * the element matched by {@code duplicateSelector} is rendered once per option;</li>
 * <li>no {@code optionMap} (static options): inputs, labels and duplicator containers are
 * only tagged with reference attributes.</li>
 * </ul>
 * Finally {@code optionMap} is stored via {@code PrepareRenderingDataUtil}.
 *
 * @param editSelector    selector of the input element(s)
 * @param displaySelector selector passed to the super implementation and to the data store
 * @return the renderer, with the missing-selector warning enabled again
 * @throws IllegalArgumentException if both {@code duplicateSelector} and
 *         {@code labelWrapperIndicatorAttr} are set
 */
@Override
public Renderer preRender(final String editSelector, final String displaySelector) {

    if (duplicateSelector != null && labelWrapperIndicatorAttr != null) {
        String msg = "duplicateSelector (%s) and labelWrapperIndicatorAttr (%s) cannot be specified at same time.";
        throw new IllegalArgumentException(String.format(msg, duplicateSelector, labelWrapperIndicatorAttr));
    }

    Renderer renderer = super.preRender(editSelector, displaySelector);

    renderer.disableMissingSelectorWarning();

    // create wrapper for input element
    final WrapperIdHolder wrapperIdHolder = new WrapperIdHolder();

    if (duplicateSelector == null && optionMap != null) {

        // replace the (unique) input by a group node wrapping a clone of it
        renderer.add(new Renderer(editSelector, new ElementTransformer(null) {
            @Override
            public Element invoke(Element elem) {

                // a wrapper id that is already set means a previous element matched the selector too
                if (wrapperIdHolder.wrapperId != null) {
                    throw new RuntimeException("The target of selector[" + editSelector
                            + "] must be unique but over than 1 target was found."
                            + "Perhaps you have specified an option value map on a group of elements "
                            + "which is intented to be treated as predefined static options by html directly.");
                }

                String id = elem.id();
                if (StringUtils.isEmpty(id)) {
                    String msg = "A %s input element must have id value being configured:%s";
                    throw new RuntimeException(String.format(msg, getTypeString(), elem.outerHtml()));
                }

                GroupNode wrapper = new GroupNode();

                // cheating the rendering engine for not skipping the rendering on group node
                wrapper.attr(ExtNodeConstants.GROUP_NODE_ATTR_TYPE, ExtNodeConstants.GROUP_NODE_ATTR_TYPE_USERDEFINE);

                // put the input element under the wrapper node
                wrapper.appendChild(elem.clone());

                String wrapperId = IdGenerator.createId();
                wrapper.attr("id", wrapperId);

                wrapperIdHolder.inputId = id;
                wrapperIdHolder.wrapperId = wrapperId;

                // record the selector for against label
                if (labelWrapperIndicatorAttr == null) {
                    wrapperIdHolder.labelSelector = SelectorUtil.attr("label", "for", wrapperIdHolder.inputId);
                } else {
                    wrapperIdHolder.labelSelector = SelectorUtil.attr(labelWrapperIndicatorAttr, wrapperIdHolder.inputId);
                }

                return wrapper;
            }
        }));

        renderer.add(":root", new Renderable() {
            @Override
            public Renderer render() {

                if (wrapperIdHolder.wrapperId == null) {
                    // for display mode?
                    return Renderer.create();
                }

                // remove the label element and cache it in wrapperIdHolder, we will relocate it later (since we
                // have to duplicate the input and label pair by given option value map, we have to make sure that
                // the input and label elements are in same parent node which can be duplicated)
                Renderer renderer = Renderer.create().disableMissingSelectorWarning();
                renderer.add(new Renderer(wrapperIdHolder.labelSelector, new ElementTransformer(null) {
                    @Override
                    public Element invoke(Element elem) {
                        wrapperIdHolder.relocatingLabels.add(elem.clone());
                        // an empty group node takes the place of the removed label
                        return new GroupNode();
                    }
                }));

                return renderer.enableMissingSelectorWarning();
            }
        });

        renderer.add(":root", new Renderable() {
            @Override
            public Renderer render() {

                if (wrapperIdHolder.wrapperId == null) {
                    // for display mode?
                    return Renderer.create();
                }

                String selector = SelectorUtil.id(wrapperIdHolder.wrapperId);

                // relocate the label element to the wrapper node
                return Renderer.create(selector, new ElementSetter() {
                    @Override
                    public void set(Element elem) {
                        if (wrapperIdHolder.relocatingLabels.isEmpty()) {// no existing label found
                            // create an empty label pointing at the input
                            Element label = new Element(Tag.valueOf("label"), "");
                            label.attr("for", wrapperIdHolder.inputId);
                            elem.appendChild(label);
                        } else {
                            for (Element label : wrapperIdHolder.relocatingLabels) {
                                elem.appendChild(label);
                            }
                        }
                    }
                });

            }
        });

    } else {
        if (duplicateSelector != null && optionMap != null) {
            // if duplicateSelector is specified, we just only need to store the input element id
            renderer.add(editSelector, new ElementSetter() {
                @Override
                public void set(Element elem) {
                    if (wrapperIdHolder.inputId != null) {
                        String msg = "The target of selector[%s] (inside duplicator:%s) must be unique but over than 1 target was found.";
                        throw new RuntimeException(String.format(msg, editSelector, duplicateSelector));
                    }
                    String id = elem.id();
                    if (StringUtils.isEmpty(id)) {
                        String msg = "A %s input element (inside duplicator:%s) must have id value being configured:%s";
                        throw new RuntimeException(
                                String.format(msg, getTypeString(), duplicateSelector, elem.outerHtml()));
                    }
                    wrapperIdHolder.inputId = id;

                    // record the selector for against label
                    // labelWrapperIndicatorAttr is necessarily null here: duplicateSelector is set and the check
                    // at the entry of this method rejects both being set, so only label[for=id] is used.
                    wrapperIdHolder.labelSelector = SelectorUtil.attr("label", "for", wrapperIdHolder.inputId);
                }
            });
        }
    }

    // here we finished restructuring the input element and its related label element and then we begin to
    // manufacture all the input/label pairs for the option list
    renderer.add(":root", new Renderable() {
        @Override
        public Renderer render() {
            if (optionMap == null) {
                // for static options
                Renderer renderer = Renderer.create();
                final List<String> inputIdList = new LinkedList<>();
                renderer.add(editSelector, new ElementSetter() {
                    @Override
                    public void set(Element elem) {
                        inputIdList.add(elem.id());
                    }
                });
                // tag the label (or label wrapper) of each input with the id of the input it belongs to
                renderer.add(":root", new Renderable() {
                    @Override
                    public Renderer render() {
                        Renderer render = Renderer.create().disableMissingSelectorWarning();
                        for (String id : inputIdList) {
                            // NOTE(review): labelWrapperIndicatorAttr may be null here; presumably the
                            // resulting selector simply matches nothing - confirm.
                            render.add(SelectorUtil.attr(labelWrapperIndicatorAttr, id), LABEL_REF_ATTR, id);
                            render.add(SelectorUtil.attr("label", "for", id), LABEL_REF_ATTR, id);
                        }
                        return render.enableMissingSelectorWarning();
                    }
                });

                if (duplicateSelector != null) {
                    // give each duplicator container a generated ref id and let its input and label refer to it
                    renderer.add(duplicateSelector, new Renderable() {
                        @Override
                        public Renderer render() {
                            String duplicatorRef = IdGenerator.createId();
                            Renderer render = Renderer.create(":root", DUPLICATOR_REF_ID_ATTR, duplicatorRef);
                            render.add("input", DUPLICATOR_REF_ATTR, duplicatorRef);
                            String labelSelector;
                            if (labelWrapperIndicatorAttr == null) {
                                labelSelector = SelectorUtil.tag("label");
                            } else {
                                labelSelector = SelectorUtil.attr(labelWrapperIndicatorAttr);
                            }
                            render.add(labelSelector, DUPLICATOR_REF_ATTR, duplicatorRef);
                            return render;
                        }
                    });
                }
                return renderer;
            } else {
                if (wrapperIdHolder.wrapperId == null && duplicateSelector == null) {
                    // for display mode?
                    return Renderer.create();
                }
                if (wrapperIdHolder.inputId == null) {
                    // target input element not found
                    return Renderer.create();
                }
                // render once per option: either the generated wrapper or the user supplied duplicator
                String selector = duplicateSelector == null ? SelectorUtil.id(wrapperIdHolder.wrapperId) : duplicateSelector;
                return Renderer.create(selector, optionMap.getOptionList(), row -> {

                    Renderer renderer = Renderer.create().disableMissingSelectorWarning();

                    String inputSelector = SelectorUtil.id("input", wrapperIdHolder.inputId);
                    renderer.add(inputSelector, "value", row.getValue());

                    // we have to generate a new uuid for the input element to make sure its id is unique even we
                    // duplicated it.
                    String newInputId = inputIdByValue ? row.getValue() : IdGenerator.createId();

                    // make the generated id more understandable by prefixing with original id
                    newInputId = wrapperIdHolder.inputId + "-" + newInputId;

                    // stays null when no duplicateSelector is configured
                    String duplicatorRef = null;

                    if (duplicateSelector != null) {
                        duplicatorRef = IdGenerator.createId();
                    }

                    renderer.add(":root", DUPLICATOR_REF_ID_ATTR, duplicatorRef);

                    renderer.add(inputSelector, DUPLICATOR_REF_ATTR, duplicatorRef);
                    renderer.add(inputSelector, "id", newInputId);

                    // may be a wrapper container of label
                    renderer.add(wrapperIdHolder.labelSelector, LABEL_REF_ATTR, newInputId);
                    if (labelWrapperIndicatorAttr != null) {
                        renderer.add(wrapperIdHolder.labelSelector, labelWrapperIndicatorAttr, newInputId);
                    }
                    renderer.add(wrapperIdHolder.labelSelector, DUPLICATOR_REF_ATTR, duplicatorRef);

                    renderer.add("label", "for", newInputId);
                    renderer.add("label", row.getDisplayText());

                    return renderer.enableMissingSelectorWarning();
                });
            }
        }
    });

    // since we cheated the rendering engine, we should set the type of group node created to faked for fast
    // clean up
    renderer.add(":root", new Renderable() {
        @Override
        public Renderer render() {
            if (wrapperIdHolder.wrapperId == null) {
                // for display mode?
                return Renderer.create();
            }
            String selector = SelectorUtil.id(wrapperIdHolder.wrapperId);
            return Renderer.create(selector, new ElementSetter() {
                @Override
                public void set(Element elem) {
                    elem.attr(ExtNodeConstants.GROUP_NODE_ATTR_TYPE, ExtNodeConstants.GROUP_NODE_ATTR_TYPE_FAKE);
                }
            });
        }
    });

    PrepareRenderingDataUtil.storeDataToContextBySelector(editSelector, displaySelector, optionMap);

    return renderer.enableMissingSelectorWarning();
}
From source file:com.jimplush.goose.ContentExtractor.java
/** * remove any divs that looks like non-content, clusters of links, or paras with no gusto * * @param node// w w w . j av a2s . co m * @return */ private Element cleanupNode(Element node) { if (logger.isDebugEnabled()) { logger.debug("Starting cleanup Node"); } node = addSiblings(node); Elements nodes = node.children(); for (Element e : nodes) { if (e.tagName().equals("p")) { continue; } if (logger.isDebugEnabled()) { logger.debug("CLEANUP NODE: " + e.id() + " class: " + e.attr("class")); } boolean highLinkDensity = isHighLinkDensity(e); if (highLinkDensity) { if (logger.isDebugEnabled()) { logger.debug("REMOVING NODE FOR LINK DENSITY: " + e.id() + " class: " + e.attr("class")); } e.remove(); continue; } // now check for word density // grab all the paragraphs in the children and remove ones that are too small to matter Elements subParagraphs = e.getElementsByTag("p"); for (Element p : subParagraphs) { if (p.text().length() < 25) { p.remove(); } } // now that we've removed shorty paragraphs let's make sure to exclude any first paragraphs that don't have paras as // their next siblings to avoid getting img bylines // first let's remove any element that now doesn't have any p tags at all Elements subParagraphs2 = e.getElementsByTag("p"); if (subParagraphs2.size() == 0 && !e.tagName().equals("td")) { if (logger.isDebugEnabled()) { logger.debug("Removing node because it doesn't have any paragraphs"); } e.remove(); continue; } //if this node has a decent enough gravityScore we should keep it as well, might be content int topNodeScore = getScore(node); int currentNodeScore = getScore(e); float thresholdScore = (float) (topNodeScore * .08); if (logger.isDebugEnabled()) { logger.debug("topNodeScore: " + topNodeScore + " currentNodeScore: " + currentNodeScore + " threshold: " + thresholdScore); } if (currentNodeScore < thresholdScore) { if (!e.tagName().equals("td")) { if (logger.isDebugEnabled()) { logger.debug("Removing node due to low threshold score"); } e.remove(); } 
else { if (logger.isDebugEnabled()) { logger.debug("Not removing TD node"); } } continue; } } return node; }