List of usage examples for com.google.common.net InternetDomainName isTopPrivateDomain
public boolean isTopPrivateDomain()
From source file:uk.bl.wa.extract.LinkExtractor.java
/** * Attempt to parse out the private domain. Fall back on host if things go * awry.//w w w . j a va 2s .c o m * * @param host * @return */ public static String extractPrivateSuffixFromHost(String host) { if (host == null) return null; // Parse out the public suffix: InternetDomainName domainName; try { domainName = InternetDomainName.from(host); } catch (Exception e) { return host; } InternetDomainName suffix = null; // It appears the IDN class does not know about the various UK // second-level domains. // If it's a UK host, override the result by assuming three levels: if (host.endsWith(".uk")) { ImmutableList<String> parts = domainName.parts(); if (parts.size() >= 3) { suffix = InternetDomainName.from(parts.get(parts.size() - 3) + "." + parts.get(parts.size() - 2) + "." + parts.get(parts.size() - 1)); } } else { if (domainName.isTopPrivateDomain() || domainName.isUnderPublicSuffix()) { suffix = domainName.topPrivateDomain(); } else { suffix = domainName; } } // If it all failed for some reason, fall back on the host value: if (suffix == null) suffix = domainName; return suffix.toString(); }
From source file:com.addthis.hydra.data.filter.bundle.BundleFilterURL.java
@Override public boolean filter(Bundle bundle) { String pv = ValueUtil.asNativeString(field.getValue(bundle)); if (!asFile) { if (pv == null || pv.length() < 7) { return invalidExit; }/*from w w w .ja va2s.co m*/ String lpv = pv.trim().toLowerCase(); if (!(lpv.startsWith("http"))) { if (fixProto) { if (clean && lpv.indexOf("%2f") >= 0) { pv = LessBytes.urldecode(pv); } pv = "http://".concat(pv); } else { return invalidExit; } } if (clean && (lpv.startsWith("http%") || lpv.startsWith("https%"))) { pv = LessBytes.urldecode(pv); } } // up to two 'decoding' passes on the url to try and find a valid one for (int i = 0; i < 2; i++) { if (pv == null) { return invalidExit; } try { URL urec = asFile ? new URL("file://".concat(pv)) : new URL(pv); String urlhost = urec.getHost(); String returnhost = null; if (resolveIP) { synchronized (iphost) { returnhost = iphost.get(urlhost).toLowerCase(); if (returnhost == null) { returnhost = resolveDottedIP(urlhost); iphost.put(urlhost, returnhost); if (iphost.size() > maxhostcache) { iphost.removeEldest(); } } } } else { returnhost = urlhost.toLowerCase(); } // store cleaned up (url decoded) version back to packet if (clean) { if (urec != null && urec.getPath().isEmpty()) { // if the path element is null, append the slash pv = pv.concat("/"); } field.setValue(bundle, ValueFactory.create(pv)); } if (setHost != null) { if (toBaseDomain) { returnhost = NetUtil.getBaseDomain(returnhost); } else if (toTopPrivateDomain) { if (returnhost != null && InternetDomainName.isValid(returnhost)) { InternetDomainName domain = InternetDomainName.from(returnhost); if (domain.hasPublicSuffix() && domain.isUnderPublicSuffix()) { InternetDomainName topPrivateDomain = domain.topPrivateDomain(); returnhost = topPrivateDomain.toString(); } } } setHost.setValue(bundle, ValueFactory.create(returnhost)); } if (setPath != null) { setPath.setValue(bundle, ValueFactory.create(urec.getPath())); } if (setParams != null) { setParams.setValue(bundle, ValueFactory.create(urec.getQuery())); } if (setAnchor != null) { setAnchor.setValue(bundle, ValueFactory.create(urec.getRef())); } if (setHostNormal != null) { Matcher m = hostNormalPattern.matcher(returnhost); if (m.find()) { returnhost = m.group(1); } setHostNormal.setValue(bundle, ValueFactory.create(returnhost)); } if (setTopPrivateDomain != null) { String topDomain = returnhost; if (InternetDomainName.isValid(returnhost)) { InternetDomainName domainName = InternetDomainName.from(returnhost); if (domainName.isTopPrivateDomain() || domainName.isUnderPublicSuffix()) { topDomain = DOT_JOINER.join(domainName.topPrivateDomain().parts()); } } setTopPrivateDomain.setValue(bundle, ValueFactory.create(topDomain)); } } catch (MalformedURLException e) { if (pv.indexOf("%3") > 0 && pv.indexOf("%2") > 0) { pv = LessBytes.urldecode(pv); } else { if (debugMalformed) { System.err.println("malformed(" + i + ") " + pv); } return invalidExit; } } } return true; }