Usage examples for com.google.common.net.InternetDomainName.isValid
public static boolean isValid(String name)
From source file:org.elasticsearch.plugin.readonlyrest.acl.blocks.rules.impl.XForwardedForSyncRule.java
public XForwardedForSyncRule(Settings s) throws RuleNotConfiguredException { super();//from w ww . ja v a 2 s . c o m String[] a = s.getAsArray(getKey()); if (a != null && a.length > 0) { allowedAddresses = Lists.newArrayList(); for (int i = 0; i < a.length; i++) { if (!Strings.isNullOrEmpty(a[i])) { try { IPMask.getIPMask(a[i]); } catch (Exception e) { if (!InternetDomainName.isValid(a[i])) { throw new RuleConfigurationError("invalid address", e); } } allowedAddresses.add(a[i].trim()); } } } else { throw new RuleNotConfiguredException(); } }
From source file:org.elasticsearch.plugin.readonlyrest.acl.blocks.rules.impl.HostsSyncRule.java
public HostsSyncRule(Settings s) throws RuleNotConfiguredException { super();//from ww w . j av a 2s . co m acceptXForwardedForHeader = s.getAsBoolean("accept_x-forwarded-for_header", false); String[] a = s.getAsArray("hosts"); if (a != null && a.length > 0) { allowedAddresses = Lists.newArrayList(); for (int i = 0; i < a.length; i++) { if (!Strings.isNullOrEmpty(a[i])) { try { IPMask.getIPMask(a[i]); } catch (Exception e) { if (!InternetDomainName.isValid(a[i])) { throw new RuleConfigurationError("invalid address", e); } } allowedAddresses.add(a[i].trim()); } } } else { throw new RuleNotConfiguredException(); } }
From source file:edu.uci.ics.crawler4j.url.WebURL.java
public void setURL(String url) { this.url = url; int domainStartIdx = url.indexOf("//") + 2; int domainEndIdx = url.indexOf('/', domainStartIdx); domainEndIdx = (domainEndIdx > domainStartIdx) ? domainEndIdx : url.length(); String domain = url.substring(domainStartIdx, domainEndIdx); registeredDomain = domain;//from w ww . j a v a2 s . c o m subDomain = ""; if (tldList != null && !(domain.isEmpty()) && InternetDomainName.isValid(domain)) { String candidate = null; String rd = null; String sd = null; String[] parts = domain.split("\\."); for (int i = parts.length - 1; i >= 0; i--) { if (rd == null) { if (candidate == null) { candidate = parts[i]; } else { candidate = parts[i] + "." + candidate; } if (tldList.isRegisteredDomain(candidate)) { rd = candidate; } } else { if (sd == null) { sd = parts[i]; } else { sd = parts[i] + "." + sd; } } } if (rd != null) { registeredDomain = rd; } if (sd != null) { subDomain = sd; } } path = url.substring(domainEndIdx); int pathEndIdx = path.indexOf('?'); if (pathEndIdx >= 0) { path = path.substring(0, pathEndIdx); } }
From source file:org.archive.crawler.prefetch.HostQuotaEnforcer.java
/**
 * Decides whether the URI's host is covered by the configured {@code host}.
 *
 * <p>When sub-domain matching is enabled and both names are valid domain names, the
 * URI's host matches if it or any of its parent domains equals {@code host}.
 * Otherwise the two cached host objects are compared by reference.
 */
@Override
protected boolean shouldProcess(CrawlURI curi) {
    String uriHostname = serverCache.getHostFor(curi.getUURI()).getHostName();

    boolean compareAsDomains = getApplyToSubdomains()
            && InternetDomainName.isValid(host)
            && InternetDomainName.isValid(uriHostname);
    if (!compareAsDomains) {
        return serverCache.getHostFor(curi.getUURI()) == serverCache.getHostFor(host);
    }

    InternetDomainName target = InternetDomainName.from(host);
    InternetDomainName current = InternetDomainName.from(uriHostname);
    // Climb from the URI's host towards the root until we hit the target or run out of parents.
    while (!current.equals(target)) {
        if (!current.hasParent()) {
            return false;
        }
        current = current.parent();
    }
    return true;
}
From source file:org.archive.modules.fetcher.BdbCookieStore.java
/**
 * Creates a {@link LimitedCookieStoreFacade} over the cookies relevant to {@code host}.
 *
 * <p>If {@code host} is a valid domain name, the facade's
 * {@link LimitedCookieStoreFacade#getCookies()} sees the cookie subsets of the host
 * itself and of each of its parent domains. Otherwise only the subset for
 * {@code host} itself is included.
 */
public CookieStore cookieStoreFor(String host) {
    CompositeCollection cookieCollection = new CompositeCollection();

    if (!InternetDomainName.isValid(host)) {
        Collection<Cookie> hostOnly = hostSubset(host);
        cookieCollection.addComposited(hostOnly);
    } else {
        // Start at the host and step up one parent at a time until there is none left.
        for (InternetDomainName level = InternetDomainName.from(host);
                level != null;
                level = level.hasParent() ? level.parent() : null) {
            Collection<Cookie> levelCookies = hostSubset(level.toString());
            cookieCollection.addComposited(levelCookies);
        }
    }

    @SuppressWarnings("unchecked")
    List<Cookie> cookieList = new RestrictedCollectionWrappedList<Cookie>(cookieCollection);
    return new LimitedCookieStoreFacade(cookieList);
}
From source file:com.qwazr.crawler.web.manager.WebCrawlThread.java
private boolean matchesInitialDomain(URI uri) { String host = uri.getHost();//from w w w . j a v a 2s .c o m if (StringUtils.isEmpty(host)) return false; if (!InternetDomainName.isValid(host)) return false; return internetDomainName.equals(InternetDomainName.from(host)); }
From source file:com.addthis.hydra.data.filter.bundle.BundleFilterURL.java
/**
 * Parses the URL held in {@code field} and writes its components to the configured
 * output fields.
 *
 * <p>Unless {@code asFile} is set, values that are null or shorter than 7 characters
 * yield {@code invalidExit}. So do values that do not start with "http", unless
 * {@code fixProto} is set, in which case "http://" is prepended. With {@code clean}
 * set, URL-encoded input is decoded and the cleaned value is written back to
 * {@code field}, with a trailing "/" appended when the path is empty. Depending on
 * which targets are configured, the host, path, query, anchor, normalised host and
 * top private domain are stored on the bundle.
 *
 * @param bundle the bundle to read the URL from and write the parts to
 * @return {@code invalidExit} if the value is rejected or cannot be parsed as a URL,
 *         otherwise {@code true}
 */
@Override
public boolean filter(Bundle bundle) {
    String pv = ValueUtil.asNativeString(field.getValue(bundle));
    if (!asFile) {
        if (pv == null || pv.length() < 7) {
            return invalidExit;
        }
        String lpv = pv.trim().toLowerCase();
        if (!(lpv.startsWith("http"))) {
            if (fixProto) {
                if (clean && lpv.indexOf("%2f") >= 0) {
                    pv = LessBytes.urldecode(pv);
                }
                pv = "http://".concat(pv);
            } else {
                return invalidExit;
            }
        }
        if (clean && (lpv.startsWith("http%") || lpv.startsWith("https%"))) {
            pv = LessBytes.urldecode(pv);
        }
    }
    // up to two 'decoding' passes on the url to try and find a valid one
    // NOTE(review): a successful first pass does not leave the loop; the body runs a
    // second time on the (possibly slash-terminated) value. Left as-is on purpose.
    for (int i = 0; i < 2; i++) {
        if (pv == null) {
            return invalidExit;
        }
        try {
            URL urec = asFile ? new URL("file://".concat(pv)) : new URL(pv);
            String urlhost = urec.getHost();
            String returnhost = null;
            if (resolveIP) {
                synchronized (iphost) {
                    // Check for a cache miss BEFORE dereferencing; the old code called
                    // toLowerCase() on the lookup result and threw NPE on every miss.
                    String cachedHost = iphost.get(urlhost);
                    if (cachedHost != null) {
                        returnhost = cachedHost.toLowerCase();
                    } else {
                        returnhost = resolveDottedIP(urlhost);
                        iphost.put(urlhost, returnhost);
                        if (iphost.size() > maxhostcache) {
                            iphost.removeEldest();
                        }
                    }
                }
            } else {
                returnhost = urlhost.toLowerCase();
            }
            // store cleaned up (url decoded) version back to packet
            if (clean) {
                if (urec.getPath().isEmpty()) {
                    // if the path element is empty, append the slash
                    pv = pv.concat("/");
                }
                field.setValue(bundle, ValueFactory.create(pv));
            }
            if (setHost != null) {
                if (toBaseDomain) {
                    returnhost = NetUtil.getBaseDomain(returnhost);
                } else if (toTopPrivateDomain) {
                    if (returnhost != null && InternetDomainName.isValid(returnhost)) {
                        InternetDomainName domain = InternetDomainName.from(returnhost);
                        if (domain.hasPublicSuffix() && domain.isUnderPublicSuffix()) {
                            InternetDomainName topPrivateDomain = domain.topPrivateDomain();
                            returnhost = topPrivateDomain.toString();
                        }
                    }
                }
                setHost.setValue(bundle, ValueFactory.create(returnhost));
            }
            if (setPath != null) {
                setPath.setValue(bundle, ValueFactory.create(urec.getPath()));
            }
            if (setParams != null) {
                setParams.setValue(bundle, ValueFactory.create(urec.getQuery()));
            }
            if (setAnchor != null) {
                setAnchor.setValue(bundle, ValueFactory.create(urec.getRef()));
            }
            if (setHostNormal != null) {
                // Same null guard as the toTopPrivateDomain branch above.
                if (returnhost != null) {
                    Matcher m = hostNormalPattern.matcher(returnhost);
                    if (m.find()) {
                        returnhost = m.group(1);
                    }
                }
                setHostNormal.setValue(bundle, ValueFactory.create(returnhost));
            }
            if (setTopPrivateDomain != null) {
                String topDomain = returnhost;
                if (returnhost != null && InternetDomainName.isValid(returnhost)) {
                    InternetDomainName domainName = InternetDomainName.from(returnhost);
                    if (domainName.isTopPrivateDomain() || domainName.isUnderPublicSuffix()) {
                        topDomain = DOT_JOINER.join(domainName.topPrivateDomain().parts());
                    }
                }
                setTopPrivateDomain.setValue(bundle, ValueFactory.create(topDomain));
            }
        } catch (MalformedURLException e) {
            // Retry once with a decoded value if it looks URL-encoded; otherwise give up.
            if (pv.indexOf("%3") > 0 && pv.indexOf("%2") > 0) {
                pv = LessBytes.urldecode(pv);
            } else {
                if (debugMalformed) {
                    System.err.println("malformed(" + i + ") " + pv);
                }
                return invalidExit;
            }
        }
    }
    return true;
}
From source file:google.registry.xml.XmlTestUtils.java
/** * Deeply explore the object and normalize values so that things we consider equal compare so. * The return value consists of two parts: the updated key and the value. The value is * straightforward enough: it is the rendering of the subtree to be attached at the current point. * The key is more complicated, because of namespaces. When an XML element specifies namespaces * using xmlns attributes, those namespaces apply to the element as well as all of its * descendants. That means that, when prefixing the element name with the full namespace path, * as required to do proper comparison, the element name depends on its children. When looping * through a JSONObject map, we can't just recursively generate the value and store it using the * key. We may have to update the key as well, to get the namespaces correct. A returned key of * null indicates that we should use the existing key. A non-null key indicates that we should * replace the existing key.// ww w . j a va 2 s .c o m * * @param elementName the name under which the current subtree was found, or null if the current * subtree's name is nonexistent or irrelevant * @param obj the current subtree * @param path the (non-namespaced) element path used for ignoredPaths purposes * @param ignoredPaths the set of paths whose values should be set to IGNORED * @param nsMap the inherited namespace identifier-to-URI map * @return the key under which the rendered subtree should be stored (or null), and the rendered * subtree */ private static Map.Entry<String, Object> normalize(@Nullable String elementName, Object obj, @Nullable String path, Set<String> ignoredPaths, Map<String, String> nsMap) throws Exception { if (obj instanceof JSONObject) { JSONObject jsonObject = (JSONObject) obj; Map<String, Object> map = new HashMap<>(); String[] names = JSONObject.getNames(jsonObject); if (names != null) { // Separate all elements and keys into namespace specifications, which we must process // first, and everything else. 
ImmutableList.Builder<String> namespacesBuilder = new ImmutableList.Builder<>(); ImmutableList.Builder<String> othersBuilder = new ImmutableList.Builder<>(); for (String key : names) { (key.startsWith("xmlns") ? namespacesBuilder : othersBuilder).add(key); } // First, handle all namespace specifications, updating our ns-to-URI map. Use a HashMap // rather than an ImmutableMap.Builder so that we can override existing map entries. HashMap<String, String> newNsMap = new HashMap<>(); newNsMap.putAll(nsMap); for (String key : namespacesBuilder.build()) { // Parse the attribute name, of the form xmlns:nsid, and extract the namespace identifier. // If there's no colon, we are setting the default namespace. List<String> components = Splitter.on(':').splitToList(key); String ns = (components.size() >= 2) ? components.get(1) : ""; newNsMap.put(ns, jsonObject.get(key).toString()); } nsMap = ImmutableMap.copyOf(newNsMap); // Now, handle the non-namespace items, recursively transforming the map and mapping all // namespaces to the full URI for proper comparison. for (String key : othersBuilder.build()) { String simpleKey = Iterables.getLast(Splitter.on(':').split(key)); String newPath = (path == null) ? simpleKey : (path + "." + simpleKey); String mappedKey; Object value; if (ignoredPaths.contains(newPath)) { mappedKey = null; // Set ignored fields to a value that will compare equal. value = "IGNORED"; } else { Map.Entry<String, Object> simpleEntry = normalize(key, jsonObject.get(key), newPath, ignoredPaths, nsMap); mappedKey = simpleEntry.getKey(); value = simpleEntry.getValue(); } if (mappedKey == null) { // Note that this does not follow the XML rules exactly. I read somewhere that attribute // names, unlike element names, never use the default namespace. But after // JSONification, we cannot distinguish between attributes and child elements, so we // apply the default namespace to everything. Hopefully that will not cause a problem. mappedKey = key.equals("content") ? 
key : mapName(key, nsMap, true); } map.put(mappedKey, value); } } // Map the namespace of the element name of the map we are normalizing. elementName = mapName(elementName, nsMap, true); // If a node has both text content and attributes, the text content will end up under a key // called "content". If that's the only thing left (which will only happen if there was an // "xmlns:*" key that we removed), treat the node as just text and recurse. if (map.size() == 1 && map.containsKey("content")) { return new AbstractMap.SimpleEntry<>(elementName, normalize(null, jsonObject.get("content"), path, ignoredPaths, nsMap).getValue()); } // The conversion to JSON converts <a/> into "" and the semantically equivalent <a></a> into // an empty map, so normalize that here. return new AbstractMap.SimpleEntry<>(elementName, map.isEmpty() ? "" : map); } if (obj instanceof JSONArray) { // Another problem resulting from JSONification: If the array contains elements whose names // are the same before URI expansion, but different after URI expansion, because they use // xmlns attribute that define the namespaces differently, we will screw up. Again, hopefully // that doesn't happen much. The reverse is also true: If the array contains names that are // different before URI expansion, but the same after, we may have a problem, because the // elements will wind up in different JSONArrays as a result of JSONification. We wave our // hands and just assume that the URI expansion of the first element holds for all others. 
Set<Object> set = new HashSet<>(); String mappedKey = null; for (int i = 0; i < ((JSONArray) obj).length(); ++i) { Map.Entry<String, Object> simpleEntry = normalize(null, ((JSONArray) obj).get(i), path, ignoredPaths, nsMap); if (i == 0) { mappedKey = simpleEntry.getKey(); } set.add(simpleEntry.getValue()); } return new AbstractMap.SimpleEntry<String, Object>(mappedKey, set); } if (obj instanceof Number) { return new AbstractMap.SimpleEntry<String, Object>(null, obj.toString()); } if (obj instanceof Boolean) { return new AbstractMap.SimpleEntry<String, Object>(null, ((Boolean) obj) ? "1" : "0"); } if (obj instanceof String) { // Turn stringified booleans into integers. Both are acceptable as xml boolean values, but // we use "true" and "false" whereas the samples use "1" and "0". if (obj.equals("true")) { return new AbstractMap.SimpleEntry<String, Object>(null, "1"); } if (obj.equals("false")) { return new AbstractMap.SimpleEntry<String, Object>(null, "0"); } String string = obj.toString(); // We use a slightly different datetime format (both legal) than the samples, so normalize // both into Datetime objects. try { return new AbstractMap.SimpleEntry<String, Object>(null, ISODateTimeFormat.dateTime().parseDateTime(string).toDateTime(UTC)); } catch (IllegalArgumentException e) { // It wasn't a DateTime. } try { return new AbstractMap.SimpleEntry<String, Object>(null, ISODateTimeFormat.dateTimeNoMillis().parseDateTime(string).toDateTime(UTC)); } catch (IllegalArgumentException e) { // It wasn't a DateTime. } try { if (!InternetDomainName.isValid(string)) { // It's not a domain name, but it is an InetAddress. Ergo, it's an ip address. return new AbstractMap.SimpleEntry<String, Object>(null, InetAddresses.forString(string)); } } catch (IllegalArgumentException e) { // Not an ip address. } return new AbstractMap.SimpleEntry<String, Object>(null, string); } return new AbstractMap.SimpleEntry<>(null, checkNotNull(obj)); }
From source file:org.archive.modules.fetcher.FetchWhois.java
/** * Adds outlinks to whois:{domain} and whois:{ipAddress} *///from w w w . ja v a 2s . co m protected void addWhoisLinks(CrawlURI curi) throws InterruptedException { CrawlHost ch = serverCache.getHostFor(curi.getUURI()); if (ch == null) { return; } if (ch.getIP() != null) { // do a whois lookup on the ip address addWhoisLink(curi, ch.getIP().getHostAddress()); } if (InternetDomainName.isValid(ch.getHostName())) { // do a whois lookup on the domain try { String topmostAssigned = InternetDomainName.from(ch.getHostName()).topPrivateDomain().toString(); addWhoisLink(curi, topmostAssigned); } catch (IllegalStateException e) { // java.lang.IllegalStateException: Not under a public suffix: mod.uk logger.warning( "problem resolving topmost assigned domain, will try whois lookup on the plain hostname " + ch.getHostName() + " - " + e); addWhoisLink(curi, ch.getHostName()); } } }
From source file:com.spend.spendService.WorkerSearchQueue.java
private void insertSeedLink(String url, String searchEngineName, String text, int resultOrder, int pageContentId) { try {// w w w .j a v a 2s . co m URL u = new URL(url); String host = u.getHost(); if (InternetDomainName.isValid(host) || com.google.common.net.HostSpecifier.isValid(host)) { PreparedStatement pstmt = con.prepareStatement( "INSERT INTO seedurlraw (url, searchEngine,resultOrder,pageContentId) VALUES (?,?,?,?);"); pstmt.setString(1, url); pstmt.setString(2, searchEngineName); pstmt.setString(3, String.valueOf(resultOrder)); pstmt.setInt(4, pageContentId); pstmt.executeUpdate(); pstmt.close(); DateFormat dateFormat = new SimpleDateFormat("dd/MM/yyyy HH:mm:ss"); Object[] row = { searchEngineName, text, url, dateFormat.format(new Date()) }; } } catch (Exception ex) { String a = ""; String b = ""; } }