List of usage examples for com.google.gwt.dom.client AnchorElement getHref
public String getHref()
From source file:ch.unifr.pai.twice.widgets.mpproxy.client.ProxyBody.java
License:Apache License
/** * Client side logic to rewrite a given URL * /* w w w . java2 s. com*/ * @param element * @param servletPath * @param proxyPath */ public static void rewriteUrl(com.google.gwt.dom.client.Element element, String servletPath, String proxyPath) { NodeList<Node> nodes = element.getChildNodes(); for (int i = 0; i < nodes.getLength(); i++) { Node n = nodes.getItem(i); if (com.google.gwt.dom.client.Element.is(n)) { com.google.gwt.dom.client.Element e = com.google.gwt.dom.client.Element.as(n); if (e != null && e.getTagName() != null && e.getTagName().equalsIgnoreCase("a")) { AnchorElement anchor = AnchorElement.as(e); if (anchor.getHref() != null && !anchor.getHref().isEmpty()) anchor.removeAttribute("onmousedown"); } for (String att : attributesToManipulate) { String value = e.getAttribute(att); if (value != null && !value.startsWith(servletPath) && value.matches("((http)|/).*")) { String transformed = Rewriter.translateCleanUrl(value, servletPath, proxyPath); if (!transformed.equals(value)) e.setAttribute(att, transformed); } } rewriteUrl(e, servletPath, proxyPath); } } }
From source file:com.dom_distiller.client.ContentExtractor.java
License:Open Source License
private static void makeAllLinksAbsolute(Node rootNode) { Element root = Element.as(rootNode); // AnchorElement.getHref() and ImageElement.getSrc() both return the // absolute URI, so simply set them as the respective attributes. NodeList<Element> allLinks = root.getElementsByTagName("A"); for (int i = 0; i < allLinks.getLength(); i++) { AnchorElement link = AnchorElement.as(allLinks.getItem(i)); if (!link.getHref().isEmpty()) { link.setHref(link.getHref()); }//from w w w. j av a 2 s.c o m } NodeList<Element> videoTags = root.getElementsByTagName("VIDEO"); for (int i = 0; i < videoTags.getLength(); i++) { VideoElement video = (VideoElement) videoTags.getItem(i); if (!video.getPoster().isEmpty()) { video.setPoster(video.getPoster()); } } makeAllSrcAttributesAbsolute(root); }
From source file:com.dom_distiller.client.PagingLinksFinder.java
License:Open Source License
private static String findPagingLink(Element root, String original_domain, PageLink pageLink) { // findPagingLink() is static, so clear mLinkDebugInfo before processing the links. if (LogUtil.isLoggable(LogUtil.DEBUG_LEVEL_PAGING_INFO)) { mLinkDebugInfo.clear();/* ww w .jav a2 s . c o m*/ } String baseUrl = findBaseUrl(original_domain); // Remove trailing '/' from window location href, because it'll be used to compare with // other href's whose trailing '/' are also removed. String wndLocationHref = StringUtil.findAndReplace(Window.Location.getHref(), "\\/$", ""); NodeList<Element> allLinks = root.getElementsByTagName("A"); Map<String, PagingLinkObj> possiblePages = new HashMap<String, PagingLinkObj>(); // Loop through all links, looking for hints that they may be next- or previous- page links. // Things like having "page" in their textContent, className or id, or being a child of a // node with a page-y className or id. // Also possible: levenshtein distance? longest common subsequence? // After we do that, assign each page a score. for (int i = 0; i < allLinks.getLength(); i++) { AnchorElement link = AnchorElement.as(allLinks.getItem(i)); int width = link.getOffsetWidth(); int height = link.getOffsetHeight(); if (width == 0 || height == 0) { appendDbgStrForLink(link, "ignored: sz=" + width + "x" + height); continue; } if (!DomUtil.isVisible(link)) { appendDbgStrForLink(link, "ignored: invisible"); continue; } // Remove url anchor and then trailing '/' from link's href. // Note that AnchorElement.getHref() returns the absolute URI, so there's no need to // worry about relative links. String linkHref = StringUtil.findAndReplace(StringUtil.findAndReplace(link.getHref(), "#.*$", ""), "\\/$", ""); // Ignore page link that is empty, not http/https, or same as current window location. // If the page link is same as the base URL: // - next page link: ignore it, since we would already have seen it. 
// - previous page link: don't ignore it, since some sites will simply have the same // base URL for the first page. if (linkHref.isEmpty() || !StringUtil.match(linkHref, "^https?://") || linkHref.equalsIgnoreCase(wndLocationHref) || (pageLink == PageLink.NEXT && linkHref.equalsIgnoreCase(baseUrl))) { appendDbgStrForLink(link, "ignored: empty or same as current or base url" + baseUrl); continue; } // If it's on a different domain, skip it. String[] urlSlashes = StringUtil.split(linkHref, "\\/+"); if (urlSlashes.length < 3 || // Expect at least the protocol, domain, and path. !getLocationHost(original_domain).equalsIgnoreCase(urlSlashes[1])) { appendDbgStrForLink(link, "ignored: different domain"); continue; } // Use javascript innerText (instead of javascript textContent) to only get visible // text. String linkText = DomUtil.getInnerText(link); // If the linkText looks like it's not the next or previous page, skip it. if (StringUtil.match(linkText, EXTRANEOUS_REGEX) || linkText.length() > 25) { appendDbgStrForLink(link, "ignored: one of extra"); continue; } // For next page link, if the initial part of the URL is identical to the base URL, but // the rest of it doesn't contain any digits, it's certainly not a next page link. // However, this doesn't apply to previous page link, because most sites will just have // the base URL for the first page. // TODO(kuan): baseUrl (returned by findBaseUrl()) is NOT the prefix of the current // window location, even though it appears to be so the way it's used here. // TODO(kuan): do we need to apply this heuristic to previous page links if current page // number is not 2? if (pageLink == PageLink.NEXT) { String linkHrefRemaining = StringUtil.findAndReplace(linkHref, baseUrl, ""); if (!StringUtil.match(linkHrefRemaining, "\\d")) { appendDbgStrForLink(link, "ignored: no number beyond base url " + baseUrl); continue; } } PagingLinkObj linkObj = null; if (!possiblePages.containsKey(linkHref)) { // Have not encountered this href. 
linkObj = new PagingLinkObj(i, 0, linkText, linkHref); possiblePages.put(linkHref, linkObj); } else { // Have already encountered this href, append its text to existing entry's. linkObj = possiblePages.get(linkHref); linkObj.mLinkText += " | " + linkText; } // If the base URL isn't part of this URL, penalize this link. It could still be the // link, but the odds are lower. // Example: http://www.actionscript.org/resources/articles/745/1/JavaScript-and-VBScript-Injection-in-ActionScript-3/Page1.html. // TODO(kuan): again, baseUrl (returned by findBaseUrl()) is NOT the prefix of the // current window location, even though it appears to be so the way it's used here. if (linkHref.indexOf(baseUrl) != 0) { linkObj.mScore -= 25; appendDbgStrForLink(link, "score=" + linkObj.mScore + ": not part of base url " + baseUrl); } // Concatenate the link text with class name and id, and determine the score based on // existence of various paging-related words. String linkData = linkText + " " + link.getClassName() + " " + link.getId(); appendDbgStrForLink(link, "txt+class+id=" + linkData); if (StringUtil.match(linkData, pageLink == PageLink.NEXT ? NEXT_LINK_REGEX : PREV_LINK_REGEX)) { linkObj.mScore += 50; appendDbgStrForLink(link, "score=" + linkObj.mScore + ": has " + (pageLink == PageLink.NEXT ? "next" : "prev" + " regex")); } if (StringUtil.match(linkData, "pag(e|ing|inat)")) { linkObj.mScore += 25; appendDbgStrForLink(link, "score=" + linkObj.mScore + ": has pag* word"); } if (StringUtil.match(linkData, "(first|last)")) { // -65 is enough to negate any bonuses gotten from a > or in the text. // If we already matched on "next", last is probably fine. // If we didn't, then it's bad. Penalize. // Same for "prev". 
if ((pageLink == PageLink.NEXT && !StringUtil.match(linkObj.mLinkText, NEXT_LINK_REGEX)) || (pageLink == PageLink.PREV && !StringUtil.match(linkObj.mLinkText, PREV_LINK_REGEX))) { linkObj.mScore -= 65; appendDbgStrForLink(link, "score=" + linkObj.mScore + ": has first|last but no " + (pageLink == PageLink.NEXT ? "next" : "prev") + " regex"); } } if (StringUtil.match(linkData, NEGATIVE_REGEX) || StringUtil.match(linkData, EXTRANEOUS_REGEX)) { linkObj.mScore -= 50; appendDbgStrForLink(link, "score=" + linkObj.mScore + ": has neg or extra regex"); } if (StringUtil.match(linkData, pageLink == PageLink.NEXT ? PREV_LINK_REGEX : NEXT_LINK_REGEX)) { linkObj.mScore -= 200; appendDbgStrForLink(link, "score=" + linkObj.mScore + ": has opp of " + (pageLink == PageLink.NEXT ? "next" : "prev") + " regex"); } // Check if a parent element contains page or paging or paginate. boolean positiveMatch = false, negativeMatch = false; Element parent = link.getParentElement(); while (parent != null && (positiveMatch == false || negativeMatch == false)) { String parentClassAndId = parent.getClassName() + " " + parent.getId(); if (!positiveMatch && StringUtil.match(parentClassAndId, "pag(e|ing|inat)")) { linkObj.mScore += 25; positiveMatch = true; appendDbgStrForLink(link, "score=" + linkObj.mScore + ": posParent - " + parentClassAndId); } // TODO(kuan): to get 1st page for prev page link, this can't be applied; however, // the non-application might be the cause of recursive prev page being returned, // i.e. for page 1, it may incorrectly return page 3 for prev page link. if (!negativeMatch && StringUtil.match(parentClassAndId, NEGATIVE_REGEX)) { // If this is just something like "footer", give it a negative. // If it's something like "body-and-footer", leave it be. 
if (!StringUtil.match(parentClassAndId, POSITIVE_REGEX)) { linkObj.mScore -= 25; negativeMatch = true; appendDbgStrForLink(link, "score=" + linkObj.mScore + ": negParent - " + parentClassAndId); } } parent = parent.getParentElement(); } // If the URL looks like it has paging in it, add to the score. // Things like /page/2/, /pagenum/2, ?p=3, ?page=11, ?pagination=34. if (StringUtil.match(linkHref, "p(a|g|ag)?(e|ing|ination)?(=|\\/)[0-9]{1,2}") || StringUtil.match(linkHref, "(page|paging)")) { linkObj.mScore += 25; appendDbgStrForLink(link, "score=" + linkObj.mScore + ": has paging info"); } // If the URL contains negative values, give a slight decrease. if (StringUtil.match(linkHref, EXTRANEOUS_REGEX)) { linkObj.mScore -= 15; appendDbgStrForLink(link, "score=" + linkObj.mScore + ": has extra regex"); } // If the link text can be parsed as a number, give it a minor bonus, with a slight bias // towards lower numbered pages. This is so that pages that might not have 'next' in // their text can still get scored, and sorted properly by score. // TODO(kuan): it might be wrong to assume that it knows about other pages in the // document and that it starts on the first page. int linkTextAsNumber = 0; try { linkTextAsNumber = Integer.parseInt(linkText, 10); } catch (NumberFormatException e) { } if (linkTextAsNumber > 0) { // Punish 1 since we're either already there, or it's probably before what we // want anyway. if (linkTextAsNumber == 1) { linkObj.mScore -= 10; } else { linkObj.mScore += Math.max(0, 10 - linkTextAsNumber); } appendDbgStrForLink(link, "score=" + linkObj.mScore + ": linktxt is a num"); } } // for all links // Loop through all of the possible pages from above and find the top candidate for the next // page URL. Require at least a score of 50, which is a relatively high confidence that // this page is the next link. 
PagingLinkObj topPage = null; if (!possiblePages.isEmpty()) { Collection<PagingLinkObj> possiblePageObjs = possiblePages.values(); Iterator<PagingLinkObj> iter = possiblePageObjs.iterator(); while (iter.hasNext()) { PagingLinkObj pageObj = iter.next(); if (pageObj.mScore >= 50 && (topPage == null || topPage.mScore < pageObj.mScore)) { topPage = pageObj; } } } String pagingHref = null; if (topPage != null) { pagingHref = StringUtil.findAndReplace(topPage.mLinkHref, "\\/$", ""); appendDbgStrForLink(allLinks.getItem(topPage.mLinkIndex), "found: score=" + topPage.mScore + ", txt=[" + topPage.mLinkText + "], " + pagingHref); } if (LogUtil.isLoggable(LogUtil.DEBUG_LEVEL_PAGING_INFO)) { logDbgInfoToConsole(pageLink, pagingHref, allLinks); } return pagingHref; }
From source file:com.dom_distiller.client.PagingLinksFinder.java
License:Open Source License
private static void logDbgInfoToConsole(PageLink pageLink, String pagingHref, NodeList<Element> allLinks) { // This logs the following to the console: // - number of links processed // - the next or previous page link found // - for each link: its href, text, concatenated debug string. // Location of logging output is different when running in different modes: // - "ant test.dev": test output file. // - chrome browser distiller viewer: chrome logfile. // (TODO)kuan): investigate how to get logging when running "ant test.prod" - currently, // nothing appears. In the meantime, throwing an exception with a log message at suspicious // codepoints can produce a call stack and help debugging, albeit tediously. LogUtil.logToConsole("numLinks=" + allLinks.getLength() + ", found " + (pageLink == PageLink.NEXT ? "next: " : "prev: ") + (pagingHref != null ? pagingHref : "null")); for (int i = 0; i < allLinks.getLength(); i++) { AnchorElement link = AnchorElement.as(allLinks.getItem(i)); // Use javascript innerText (instead of javascript textContent) to get only visible // text./*from ww w . j a va2 s . c om*/ String text = DomUtil.getInnerText(link); // Trim unnecessary whitespaces from text. String[] words = StringUtil.split(text, "\\s+"); text = ""; for (int w = 0; w < words.length; w++) { text += words[w]; if (w < words.length - 1) text += " "; } LogUtil.logToConsole( i + ")" + link.getHref() + ", txt=[" + text + "], dbg=[" + mLinkDebugInfo.get(link) + "]"); } }
From source file:com.vaadin.client.WidgetUtil.java
License:Apache License
/** * Resolve a relative URL to an absolute URL based on the current document's * location./*ww w. ja va 2s . c om*/ * * @param url * a string with the relative URL to resolve * @return the corresponding absolute URL as a string */ public static String getAbsoluteUrl(String url) { if (BrowserInfo.get().isIE8()) { // The hard way - must use innerHTML and attach to DOM in IE8 DivElement divElement = Document.get().createDivElement(); divElement.getStyle().setDisplay(Display.NONE); RootPanel.getBodyElement().appendChild(divElement); divElement.setInnerHTML("<a href='" + escapeAttribute(url) + "' ></a>"); AnchorElement a = divElement.getChild(0).cast(); String href = a.getHref(); RootPanel.getBodyElement().removeChild(divElement); return href; } else { AnchorElement a = Document.get().createAnchorElement(); a.setHref(url); return a.getHref(); } }
From source file:org.chromium.distiller.DomUtil.java
License:Open Source License
/** * Makes all anchors and video posters absolute. This calls "makeAllSrcAttributesAbsolute". * @param rootNode The root Node to look through. *//*from w ww .ja v a2s . c om*/ public static void makeAllLinksAbsolute(Node rootNode) { Element root = Element.as(rootNode); // AnchorElement.getHref() and ImageElement.getSrc() both return the // absolute URI, so simply set them as the respective attributes. if ("A".equals(root.getTagName())) { AnchorElement link = AnchorElement.as(root); if (!link.getHref().isEmpty()) { link.setHref(link.getHref()); } } NodeList<Element> allLinks = root.getElementsByTagName("A"); for (int i = 0; i < allLinks.getLength(); i++) { AnchorElement link = AnchorElement.as(allLinks.getItem(i)); if (!link.getHref().isEmpty()) { link.setHref(link.getHref()); } } if (root.getTagName().equals("VIDEO")) { VideoElement video = (VideoElement) root; if (!video.getPoster().isEmpty()) { video.setPoster(video.getPoster()); } } NodeList<Element> videoTags = root.getElementsByTagName("VIDEO"); for (int i = 0; i < videoTags.getLength(); i++) { VideoElement video = (VideoElement) videoTags.getItem(i); if (!video.getPoster().isEmpty()) { video.setPoster(video.getPoster()); } } makeAllSrcAttributesAbsolute(root); makeSrcSetAbsolute(root); }
From source file:org.chromium.distiller.extractors.embeds.TwitterExtractor.java
License:Open Source License
/** * Handle a Twitter embed that has not yet been rendered. * @param e The root element of the embed (should be a "blockquote"). * @return EmbeddedElement object representing the embed or null. *//* www . jav a2 s . co m*/ private WebEmbed handleNotRendered(Element e) { // Make sure the characteristic class name for Twitter exists. if (!e.getClassName().contains("twitter-tweet")) { return null; } // Get the last anchor element in this section; it should contain the tweet id. NodeList<Element> anchors = e.getElementsByTagName("a"); if (anchors.getLength() == 0) { return null; } AnchorElement tweetAnchor = AnchorElement.as(anchors.getItem(anchors.getLength() - 1)); if (!DomUtil.hasRootDomain(tweetAnchor.getHref(), "twitter.com")) { return null; } // Get specific attributes about the Twitter embed. String path = tweetAnchor.getPropertyString("pathname"); String id = getTweetIdFromPath(path); if (id == null) { return null; } return new WebEmbed(e, "twitter", id, null); }
From source file:org.chromium.distiller.PageParameterParser.java
License:Open Source License
private static String resolveLinkHref(AnchorElement link, AnchorElement baseAnchor) { // Anchors without "href" attribute are not considered potential pagination links. String linkHref = link.getAttribute("href"); if (linkHref.isEmpty()) return ""; baseAnchor.setAttribute("href", linkHref); return baseAnchor.getHref(); }
From source file:org.chromium.distiller.PagingLinksFinder.java
License:Open Source License
/**
 * Resolves {@code linkHref} by assigning it as the "href" attribute of {@code baseAnchor}
 * and reading back {@code baseAnchor.getHref()}.
 *
 * @param linkHref the href value to resolve
 * @param baseAnchor the anchor used for resolution; its "href" attribute is overwritten
 * @return the value reported by {@code baseAnchor.getHref()} after the assignment
 */
public static String resolveLinkHref(String linkHref, AnchorElement baseAnchor) {
    baseAnchor.setAttribute("href", linkHref);
    String resolvedHref = baseAnchor.getHref();
    return resolvedHref;
}
From source file:org.chromium.distiller.PagingLinksFinder.java
License:Open Source License
private static void logDbgInfoToConsole(PageLink pageLink, String pagingHref, NodeList<Element> allLinks) { // This logs the following to the console: // - number of links processed // - the next or previous page link found // - for each link: its href, text, concatenated debug string. // Location of logging output is different when running in different modes: // - "ant test.dev": test output file. // - chrome browser distiller viewer: chrome logfile. // (TODO)kuan): investigate how to get logging when running "ant test.prod" - currently, // nothing appears. In the meantime, throwing an exception with a log message at suspicious // codepoints can produce a call stack and help debugging, albeit tediously. LogUtil.logToConsole("numLinks=" + allLinks.getLength() + ", found " + (pageLink == PageLink.NEXT ? "next: " : "prev: ") + (pagingHref != null ? pagingHref : "null")); for (int i = 0; i < allLinks.getLength(); i++) { AnchorElement link = AnchorElement.as(allLinks.getItem(i)); // Use javascript innerText (instead of javascript textContent) to get only visible // text./*w w w . j av a 2 s. c om*/ String text = DomUtil.getInnerText(link); // Trim unnecessary white spaces from text. String[] words = StringUtil.split(text, "\\s+"); text = ""; for (int w = 0; w < words.length; w++) { text += words[w]; if (w < words.length - 1) text += " "; } LogUtil.logToConsole( i + ")" + link.getHref() + ", txt=[" + text + "], dbg=[" + mLinkDebugInfo.get(link) + "]"); } }