Usage examples for com.google.common.net.InternetDomainName.topPrivateDomain()
public InternetDomainName topPrivateDomain()
From source file:edu.uga.cs.fluxbuster.utils.DomainNameUtils.java
/** * Extracts the effective second level domain name. * * @param domainname the full domain name * @return the second level domain name or null on error *///from w w w .j av a 2 s . com public static String extractEffective2LD(String domainname) { String retval = null; try { InternetDomainName idn = InternetDomainName.from(domainname); InternetDomainName sld = idn.topPrivateDomain(); retval = sld.name(); } catch (Exception e) { if (log.isDebugEnabled()) { log.debug("Unable to extract 2LD.", e); } } return retval; }
From source file:com.evidon.areweprivateyet.AnalysisUtils.java
public static String getGuavaDomain(String url) throws Exception { if (url.indexOf("#") > 0) { url = url.substring(0, url.indexOf("#")); }//from w w w. jav a 2 s.c om if (url.indexOf("?") > 0) { url = url.substring(0, url.indexOf("?")); } if (url.indexOf(";") > 0) { url = url.substring(0, url.indexOf(";")); } if (url.indexOf("|") > 0) { url = url.substring(0, url.indexOf("|")); } if (url.indexOf("_") > 0) { url = url.replaceAll("_", ""); } if (url.indexOf("%") > 0) { url = url.replaceAll("%", ""); } // strip port if (url.indexOf(":8080") > 0) { url = url.replaceAll(":8080", ""); } String host = new URI(url).getHost(); try { InternetDomainName domainName = InternetDomainName.from(host); return domainName.topPrivateDomain().name(); } catch (java.lang.IllegalStateException e) { return AnalysisUtils.getBaseDomain(url); } catch (java.lang.IllegalArgumentException e) { if (url.startsWith("https://")) { url = url.substring(7); } if (url.startsWith("http://")) { url = url.substring(7); } if (url.indexOf("/") > 0) { url = url.substring(0, url.indexOf("/")); } if (url.matches("^(?:[0-9]{1,3}\\.){3}[0-9]{1,3}$")) { return url; } else { throw new Exception(); } } }
From source file:org.sindice.core.analytics.commons.util.URIUtil.java
/** * Return the second-level domain name. Returns null if the domain is not valid. * This method normalises domain names by removing the leading www sub-domain, * if present./*ww w. j a va 2 s . co m*/ * @param domain * @return */ public static String getSndDomain(String domain) { if (domain == null) { return null; } // Remove www subdomain if it exists if (domain.startsWith("www.")) { domain = domain.substring(4); } if (InternetDomainName.isValid(domain)) { // the domain is valid according to the RFC3490 final InternetDomainName idn = InternetDomainName.from(domain); if (idn.hasPublicSuffix()) { // the domain has a public suffix if (idn.isUnderPublicSuffix()) { return idn.topPrivateDomain().name(); } else if (idn.hasParent()) { final List<String> parts = idn.parts(); return parts.get(parts.size() - 2).concat(".").concat(parts.get(parts.size() - 1)); } } } return null; }
From source file:uk.bl.wa.extract.LinkExtractor.java
/** * Attempt to parse out the private domain. Fall back on host if things go * awry.//from ww w . j a va2 s . c o m * * @param host * @return */ public static String extractPrivateSuffixFromHost(String host) { if (host == null) return null; // Parse out the public suffix: InternetDomainName domainName; try { domainName = InternetDomainName.from(host); } catch (Exception e) { return host; } InternetDomainName suffix = null; // It appears the IDN class does not know about the various UK // second-level domains. // If it's a UK host, override the result by assuming three levels: if (host.endsWith(".uk")) { ImmutableList<String> parts = domainName.parts(); if (parts.size() >= 3) { suffix = InternetDomainName.from(parts.get(parts.size() - 3) + "." + parts.get(parts.size() - 2) + "." + parts.get(parts.size() - 1)); } } else { if (domainName.isTopPrivateDomain() || domainName.isUnderPublicSuffix()) { suffix = domainName.topPrivateDomain(); } else { suffix = domainName; } } // If it all failed for some reason, fall back on the host value: if (suffix == null) suffix = domainName; return suffix.toString(); }
From source file:org.archive.porky.ExtractTopPrivateDomainFromHostNameUDF.java
/**
 * Maps the host name held in the first field of the tuple to its top private domain.
 *
 * @param input the tuple whose first field is read as a host name
 * @return the top private domain, {@code "other"} if it cannot be determined, or null when
 *         the tuple is null or empty
 * @throws IOException declared by the signature; not thrown by this body directly
 */
public String exec(Tuple input) throws IOException {
    if (input == null || input.size() == 0) {
        return null;
    }

    try {
        final String hostName = (String) input.get(0);
        return InternetDomainName.from(hostName).topPrivateDomain().name();
    } catch (Exception e) {
        // Any failure (bad cast, invalid name, no public suffix) is bucketed together.
        return "other";
    }
}
From source file:com.jaeksoft.searchlib.analysis.filter.domain.DomainTldTokenFilter.java
@Override public final boolean incrementToken() throws IOException { if (!input.incrementToken()) return false; try {//from www . j av a 2 s.c om URL url = LinkUtils.newEncodedURL(termAtt.toString()); InternetDomainName domainName = InternetDomainName.from(url.getHost()); termAtt.setEmpty(); termAtt.append(domainName.topPrivateDomain().toString()); } catch (MalformedURLException e) { if (silent) return false; throw e; } catch (IllegalArgumentException e) { if (silent) return false; throw e; } catch (URISyntaxException e) { if (silent) return false; throw new IOException(e); } return true; }
From source file:focusedCrawler.util.LinkRelevance.java
public String getTopLevelDomainName() { InternetDomainName domain = this.getDomainName(); try {/*from ww w .j a v a 2 s. c o m*/ if (domain.isUnderPublicSuffix()) { return domain.topPrivateDomain().toString(); } else { // if the domain is a public suffix, just use it as top level domain return domain.toString(); } } catch (Exception e) { throw new IllegalStateException("Invalid top private domain name=[" + domain + "] in URL=[" + url + "]", e); } }
From source file:org.mayocat.multitenancy.DefaultTenantResolver.java
private String extractSlugFromHost(String host) { String rootDomain;//from w w w . j a va 2 s. co m String siteName = siteSettings.getWebDomainName().or(siteSettings.getDomainName()); if (Strings.emptyToNull(siteName) == null) { InternetDomainName domainName = InternetDomainName.from(host); if (domainName.hasPublicSuffix()) { // Domain is under a valid TLD, extract the TLD + first child rootDomain = domainName.topPrivateDomain().name(); } else if (host.indexOf(".") > 0 && host.indexOf(".") < host.length()) { // Otherwise, best guess : strip everything before the first dot. rootDomain = host.substring(host.indexOf(".") + 1); } else { rootDomain = host; } } else { rootDomain = StringUtils.substringBefore(siteSettings.getDomainName(), ":"); } if (host.indexOf("." + rootDomain) > 0) { return host.substring(0, host.indexOf("." + rootDomain)); } else { return host; } }
From source file:com.addthis.hydra.data.filter.bundle.BundleFilterURL.java
@Override public boolean filter(Bundle bundle) { String pv = ValueUtil.asNativeString(field.getValue(bundle)); if (!asFile) { if (pv == null || pv.length() < 7) { return invalidExit; }// w ww . ja v a2 s . c o m String lpv = pv.trim().toLowerCase(); if (!(lpv.startsWith("http"))) { if (fixProto) { if (clean && lpv.indexOf("%2f") >= 0) { pv = LessBytes.urldecode(pv); } pv = "http://".concat(pv); } else { return invalidExit; } } if (clean && (lpv.startsWith("http%") || lpv.startsWith("https%"))) { pv = LessBytes.urldecode(pv); } } // up to two 'decoding' passes on the url to try and find a valid one for (int i = 0; i < 2; i++) { if (pv == null) { return invalidExit; } try { URL urec = asFile ? new URL("file://".concat(pv)) : new URL(pv); String urlhost = urec.getHost(); String returnhost = null; if (resolveIP) { synchronized (iphost) { returnhost = iphost.get(urlhost).toLowerCase(); if (returnhost == null) { returnhost = resolveDottedIP(urlhost); iphost.put(urlhost, returnhost); if (iphost.size() > maxhostcache) { iphost.removeEldest(); } } } } else { returnhost = urlhost.toLowerCase(); } // store cleaned up (url decoded) version back to packet if (clean) { if (urec != null && urec.getPath().isEmpty()) { // if the path element is null, append the slash pv = pv.concat("/"); } field.setValue(bundle, ValueFactory.create(pv)); } if (setHost != null) { if (toBaseDomain) { returnhost = NetUtil.getBaseDomain(returnhost); } else if (toTopPrivateDomain) { if (returnhost != null && InternetDomainName.isValid(returnhost)) { InternetDomainName domain = InternetDomainName.from(returnhost); if (domain.hasPublicSuffix() && domain.isUnderPublicSuffix()) { InternetDomainName topPrivateDomain = domain.topPrivateDomain(); returnhost = topPrivateDomain.toString(); } } } setHost.setValue(bundle, ValueFactory.create(returnhost)); } if (setPath != null) { setPath.setValue(bundle, ValueFactory.create(urec.getPath())); } if (setParams != null) { setParams.setValue(bundle, 
ValueFactory.create(urec.getQuery())); } if (setAnchor != null) { setAnchor.setValue(bundle, ValueFactory.create(urec.getRef())); } if (setHostNormal != null) { Matcher m = hostNormalPattern.matcher(returnhost); if (m.find()) { returnhost = m.group(1); } setHostNormal.setValue(bundle, ValueFactory.create(returnhost)); } if (setTopPrivateDomain != null) { String topDomain = returnhost; if (InternetDomainName.isValid(returnhost)) { InternetDomainName domainName = InternetDomainName.from(returnhost); if (domainName.isTopPrivateDomain() || domainName.isUnderPublicSuffix()) { topDomain = DOT_JOINER.join(domainName.topPrivateDomain().parts()); } } setTopPrivateDomain.setValue(bundle, ValueFactory.create(topDomain)); } } catch (MalformedURLException e) { if (pv.indexOf("%3") > 0 && pv.indexOf("%2") > 0) { pv = LessBytes.urldecode(pv); } else { if (debugMalformed) { System.err.println("malformed(" + i + ") " + pv); } return invalidExit; } } } return true; }