Usage examples for com.google.common.net.InternetDomainName#isUnderPublicSuffix()
public boolean isUnderPublicSuffix()
From source file:org.sindice.core.analytics.commons.util.URIUtil.java
/** * Return the second-level domain name. Returns null if the domain is not valid. * This method normalises domain names by removing the leading www sub-domain, * if present./*from w w w . j a v a 2s .c o m*/ * @param domain * @return */ public static String getSndDomain(String domain) { if (domain == null) { return null; } // Remove www subdomain if it exists if (domain.startsWith("www.")) { domain = domain.substring(4); } if (InternetDomainName.isValid(domain)) { // the domain is valid according to the RFC3490 final InternetDomainName idn = InternetDomainName.from(domain); if (idn.hasPublicSuffix()) { // the domain has a public suffix if (idn.isUnderPublicSuffix()) { return idn.topPrivateDomain().name(); } else if (idn.hasParent()) { final List<String> parts = idn.parts(); return parts.get(parts.size() - 2).concat(".").concat(parts.get(parts.size() - 1)); } } } return null; }
From source file:uk.bl.wa.extract.LinkExtractor.java
/** * Attempt to parse out the private domain. Fall back on host if things go * awry.// w w w .java 2 s . c o m * * @param host * @return */ public static String extractPrivateSuffixFromHost(String host) { if (host == null) return null; // Parse out the public suffix: InternetDomainName domainName; try { domainName = InternetDomainName.from(host); } catch (Exception e) { return host; } InternetDomainName suffix = null; // It appears the IDN class does not know about the various UK // second-level domains. // If it's a UK host, override the result by assuming three levels: if (host.endsWith(".uk")) { ImmutableList<String> parts = domainName.parts(); if (parts.size() >= 3) { suffix = InternetDomainName.from(parts.get(parts.size() - 3) + "." + parts.get(parts.size() - 2) + "." + parts.get(parts.size() - 1)); } } else { if (domainName.isTopPrivateDomain() || domainName.isUnderPublicSuffix()) { suffix = domainName.topPrivateDomain(); } else { suffix = domainName; } } // If it all failed for some reason, fall back on the host value: if (suffix == null) suffix = domainName; return suffix.toString(); }
From source file:google.registry.flows.host.HostFlowUtils.java
/** Checks that a host name is valid. */ static InternetDomainName validateHostName(String name) throws EppException { checkArgumentNotNull(name, "Must specify host name to validate"); if (name.length() > 253) { throw new HostNameTooLongException(); }/*from ww w .ja v a 2s . c om*/ String hostNameLowerCase = Ascii.toLowerCase(name); if (!name.equals(hostNameLowerCase)) { throw new HostNameNotLowerCaseException(hostNameLowerCase); } try { String hostNamePunyCoded = Idn.toASCII(name); if (!name.equals(hostNamePunyCoded)) { throw new HostNameNotPunyCodedException(hostNamePunyCoded); } InternetDomainName hostName = InternetDomainName.from(name); if (!name.equals(hostName.toString())) { throw new HostNameNotNormalizedException(hostName.toString()); } // Checks whether a hostname is deep enough. Technically a host can be just one under a // public suffix (e.g. example.com) but we require by policy that it has to be at least one // part beyond that (e.g. ns1.example.com). The public suffix list includes all current // ccTlds, so this check requires 4+ parts if it's a ccTld that doesn't delegate second // level domains, such as .co.uk. But the list does not include new tlds, so in that case // we just ensure 3+ parts. In the particular case where our own tld has a '.' in it, we know // that there need to be 4 parts as well. if (hostName.isUnderPublicSuffix()) { if (hostName.parent().isUnderPublicSuffix()) { return hostName; } } else { // We need to know how many parts the hostname has beyond the public suffix, but we don't // know what the public suffix is. If the host is in bailiwick and we are hosting a // multipart "tld" like .co.uk the publix suffix might be 2 parts. Otherwise it's an // unrecognized tld that's not on the public suffix list, so assume the tld alone is the // public suffix. Optional<InternetDomainName> tldParsed = findTldForName(hostName); int suffixSize = tldParsed.isPresent() ? 
tldParsed.get().parts().size() : 1; if (hostName.parts().size() >= suffixSize + 2) { return hostName; } } throw new HostNameTooShallowException(); } catch (IllegalArgumentException e) { throw new InvalidHostNameException(); } }
From source file:focusedCrawler.util.LinkRelevance.java
public String getTopLevelDomainName() { InternetDomainName domain = this.getDomainName(); try {/*from w w w . j a va 2 s . com*/ if (domain.isUnderPublicSuffix()) { return domain.topPrivateDomain().toString(); } else { // if the domain is a public suffix, just use it as top level domain return domain.toString(); } } catch (Exception e) { throw new IllegalStateException("Invalid top private domain name=[" + domain + "] in URL=[" + url + "]", e); } }
From source file:com.addthis.hydra.data.filter.bundle.BundleFilterURL.java
@Override public boolean filter(Bundle bundle) { String pv = ValueUtil.asNativeString(field.getValue(bundle)); if (!asFile) { if (pv == null || pv.length() < 7) { return invalidExit; }//from ww w.j a v a 2 s . co m String lpv = pv.trim().toLowerCase(); if (!(lpv.startsWith("http"))) { if (fixProto) { if (clean && lpv.indexOf("%2f") >= 0) { pv = LessBytes.urldecode(pv); } pv = "http://".concat(pv); } else { return invalidExit; } } if (clean && (lpv.startsWith("http%") || lpv.startsWith("https%"))) { pv = LessBytes.urldecode(pv); } } // up to two 'decoding' passes on the url to try and find a valid one for (int i = 0; i < 2; i++) { if (pv == null) { return invalidExit; } try { URL urec = asFile ? new URL("file://".concat(pv)) : new URL(pv); String urlhost = urec.getHost(); String returnhost = null; if (resolveIP) { synchronized (iphost) { returnhost = iphost.get(urlhost).toLowerCase(); if (returnhost == null) { returnhost = resolveDottedIP(urlhost); iphost.put(urlhost, returnhost); if (iphost.size() > maxhostcache) { iphost.removeEldest(); } } } } else { returnhost = urlhost.toLowerCase(); } // store cleaned up (url decoded) version back to packet if (clean) { if (urec != null && urec.getPath().isEmpty()) { // if the path element is null, append the slash pv = pv.concat("/"); } field.setValue(bundle, ValueFactory.create(pv)); } if (setHost != null) { if (toBaseDomain) { returnhost = NetUtil.getBaseDomain(returnhost); } else if (toTopPrivateDomain) { if (returnhost != null && InternetDomainName.isValid(returnhost)) { InternetDomainName domain = InternetDomainName.from(returnhost); if (domain.hasPublicSuffix() && domain.isUnderPublicSuffix()) { InternetDomainName topPrivateDomain = domain.topPrivateDomain(); returnhost = topPrivateDomain.toString(); } } } setHost.setValue(bundle, ValueFactory.create(returnhost)); } if (setPath != null) { setPath.setValue(bundle, ValueFactory.create(urec.getPath())); } if (setParams != null) { setParams.setValue(bundle, 
ValueFactory.create(urec.getQuery())); } if (setAnchor != null) { setAnchor.setValue(bundle, ValueFactory.create(urec.getRef())); } if (setHostNormal != null) { Matcher m = hostNormalPattern.matcher(returnhost); if (m.find()) { returnhost = m.group(1); } setHostNormal.setValue(bundle, ValueFactory.create(returnhost)); } if (setTopPrivateDomain != null) { String topDomain = returnhost; if (InternetDomainName.isValid(returnhost)) { InternetDomainName domainName = InternetDomainName.from(returnhost); if (domainName.isTopPrivateDomain() || domainName.isUnderPublicSuffix()) { topDomain = DOT_JOINER.join(domainName.topPrivateDomain().parts()); } } setTopPrivateDomain.setValue(bundle, ValueFactory.create(topDomain)); } } catch (MalformedURLException e) { if (pv.indexOf("%3") > 0 && pv.indexOf("%2") > 0) { pv = LessBytes.urldecode(pv); } else { if (debugMalformed) { System.err.println("malformed(" + i + ") " + pv); } return invalidExit; } } } return true; }
From source file:org.apache.commons.httpclient.cookie.CookieSpecBase.java
/**
 * Returns an array of {@link Cookie}s that should be submitted with a
 * request with the given attributes.
 *
 * <p>Cookies are looked up for the host's domain name and then for each
 * successive parent domain, for as long as the current domain is under a
 * public suffix.
 *
 * <p>If the SortedMap comes from an HttpState and is not itself
 * thread-safe, it may be necessary to synchronize on the HttpState
 * instance to protect against concurrent modification.
 *
 * @param host the host to which the request is being submitted
 * @param port the port to which the request is being submitted (currently
 *        ignored)
 * @param path the path to which the request is being submitted
 * @param secure <tt>true</tt> if the request is using a secure protocol
 * @param cookies SortedMap of <tt>Cookie</tt>s to be matched
 * @return an array of <tt>Cookie</tt>s matching the criteria, or
 *         {@code null} if {@code cookies} is {@code null}
 */
@Override
public Cookie[] match(String host, int port, String path, boolean secure,
        final SortedMap<String, Cookie> cookies) {
    LOG.trace("enter CookieSpecBase.match(" + "String, int, String, boolean, SortedMap)");
    if (cookies == null) {
        return null;
    }
    List<Cookie> matching = new LinkedList<Cookie>();

    InternetDomainName domain;
    try {
        domain = InternetDomainName.fromLenient(host);
    } catch (IllegalArgumentException e) {
        // Host is not a parseable domain name: match on the raw host only.
        domain = null;
    }

    String candidate = (domain != null) ? domain.name() : host;
    while (candidate != null) {
        Iterator<Cookie> iter = cookies.subMap(candidate, candidate + Cookie.DOMAIN_OVERBOUNDS)
                .values().iterator();
        try {
            while (iter.hasNext()) {
                Cookie cookie = iter.next();
                if (match(host, port, path, secure, cookie)) {
                    addInPathOrder(matching, cookie);
                }
            }
        } finally {
            // Close in finally so the iterator is released even if matching
            // throws part-way through the scan.
            StoredIterator.close(iter);
        }

        // Walk up to the parent domain while still below a public suffix.
        if (domain != null && domain.isUnderPublicSuffix()) {
            domain = domain.parent();
            candidate = domain.name();
        } else {
            candidate = null;
        }
    }
    return matching.toArray(new Cookie[matching.size()]);
}