List of usage examples for the org.apache.commons.httpclient.URIException constructor
public URIException(String reason)
From source file:com.limegroup.gnutella.licenses.LicenseFactory.java
/** Gets a CC license URI from the given license string. */ private static URI getCCLicenseURI(String license) { // find where the URL should begin. int verifyAt = license.indexOf(CCConstants.URL_INDICATOR); if (verifyAt == -1) return null; int urlStart = verifyAt + CCConstants.URL_INDICATOR.length(); if (urlStart >= license.length()) return null; String url = license.substring(urlStart).trim(); URI uri = null;//from w w w . j a v a 2 s . com try { uri = new URI(url.toCharArray()); // Make sure the scheme is HTTP. String scheme = uri.getScheme(); if (scheme == null || !scheme.equalsIgnoreCase("http")) throw new URIException("Invalid scheme: " + scheme); // Make sure the scheme has some authority. String authority = uri.getAuthority(); if (authority == null || authority.equals("") || authority.indexOf(' ') != -1) throw new URIException("Invalid authority: " + authority); } catch (URIException e) { uri = null; LOG.error("Unable to create URI", e); } return uri; }
From source file:de.kapsi.net.daap.DaapRequest.java
/** * Sets and parses the URI. Note: if URIException is * thrown then is this Request in an inconsistent state! * * @param uri/*from www.j ava 2s . co m*/ * @throws URIException */ private void setURI(URI uri) throws URIException { this.uri = uri; if (uri != null) { String path = uri.getPath(); this.queryMap = DaapUtil.parseQuery(uri.getQuery()); if (path.equals("/server-info")) { requestType = SERVER_INFO; } else if (path.equals("/content-codes")) { requestType = CONTENT_CODES; } else if (path.equals("/login")) { requestType = LOGIN; } else if (path.equals("/logout")) { requestType = LOGOUT; } else if (path.equals("/update")) { requestType = UPDATE; } else if (path.equals("/resolve")) { requestType = RESOLVE; } if (queryMap.containsKey("session-id")) { sessionId = Integer.parseInt((String) queryMap.get("session-id")); } if (sessionId != DaapUtil.NULL) { if (queryMap.containsKey("revision-number")) { revisionNumber = Integer.parseInt((String) queryMap.get("revision-number")); } if (queryMap.containsKey("delta")) { delta = Integer.parseInt((String) queryMap.get("delta")); } if (queryMap.containsKey("meta")) { metaString = (String) queryMap.get("meta"); } isUpdateType = (delta != DaapUtil.NULL) && (delta < revisionNumber); // "/databases/id/items" 3 tokens // "/databases/id/containers" 3 tokens // "/databases/id/items/id.format" 4 tokens // "/databases/id/containers/id/items" 5 tokens if (path.equals("/databases")) { requestType = DATABASES; } else if (path.startsWith("/databases")) { StringTokenizer tok = new StringTokenizer(path, "/"); int count = tok.countTokens(); if (count >= 3) { String token = tok.nextToken(); if (token.equals("databases") == false) { throw new URIException("Unknown token in path: " + path + " [" + token + "]@1"); } databaseId = Integer.parseInt((String) tok.nextToken()); token = tok.nextToken(); if (token.equals("items")) { requestType = DATABASE_SONGS; } else if (token.equals("containers")) { requestType = DATABASE_PLAYLISTS; } else { throw 
new URIException("Unknown token in path: " + path + " [" + token + "]@2"); } if (count == 3) { // do nothing... } else if (count == 4) { token = (String) tok.nextToken(); StringTokenizer fileTokenizer = new StringTokenizer(token, "."); if (fileTokenizer.countTokens() == 2) { itemId = Integer.parseInt(fileTokenizer.nextToken()); requestType = SONG; } else { throw new URIException("Unknown token in path: " + path + " [" + token + "]@3"); } } else if (count == 5) { containerId = Integer.parseInt((String) tok.nextToken()); token = (String) tok.nextToken(); if (token.equals("items")) { requestType = PLAYLIST_SONGS; } else { throw new URIException("Unknown token in path: " + path + " [" + token + "@4"); } } else { throw new URIException("Unknown token in path: " + path + " [" + token + "]@5"); } } else { throw new URIException("Unknown token in path: " + path); } } } } else { queryMap = null; metaString = null; isUpdateType = false; requestType = DaapUtil.NULL; databaseId = DaapUtil.NULL; containerId = DaapUtil.NULL; itemId = DaapUtil.NULL; sessionId = DaapUtil.NULL; revisionNumber = DaapUtil.NULL; delta = DaapUtil.NULL; } }
From source file:com.cyberway.issue.net.LaxURI.java
/** * IA OVERRIDDEN IN LaxURI TO INCLUDE FIX FOR * http://issues.apache.org/jira/browse/HTTPCLIENT-588 * AND// w w w.j a v a 2 s . co m * http://webteam.archive.org/jira/browse/HER-1268 * * In order to avoid any possilbity of conflict with non-ASCII characters, * Parse a URI reference as a <code>String</code> with the character * encoding of the local system or the document. * <p> * The following line is the regular expression for breaking-down a URI * reference into its components. * <p><blockquote><pre> * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))? * 12 3 4 5 6 7 8 9 * </pre></blockquote><p> * For example, matching the above expression to * http://jakarta.apache.org/ietf/uri/#Related * results in the following subexpression matches: * <p><blockquote><pre> * $1 = http: * scheme = $2 = http * $3 = //jakarta.apache.org * authority = $4 = jakarta.apache.org * path = $5 = /ietf/uri/ * $6 = <undefined> * query = $7 = <undefined> * $8 = #Related * fragment = $9 = Related * </pre></blockquote><p> * * @param original the original character sequence * @param escaped <code>true</code> if <code>original</code> is escaped * @throws URIException If an error occurs. */ protected void parseUriReference(String original, boolean escaped) throws URIException { // validate and contruct the URI character sequence if (original == null) { throw new URIException("URI-Reference required"); } /* @ * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))? */ String tmp = original.trim(); /* * The length of the string sequence of characters. * It may not be equal to the length of the byte array. */ int length = tmp.length(); /* * Remove the delimiters like angle brackets around an URI. 
*/ if (length > 0) { char[] firstDelimiter = { tmp.charAt(0) }; if (validate(firstDelimiter, delims)) { if (length >= 2) { char[] lastDelimiter = { tmp.charAt(length - 1) }; if (validate(lastDelimiter, delims)) { tmp = tmp.substring(1, length - 1); length = length - 2; } } } } /* * The starting index */ int from = 0; /* * The test flag whether the URI is started from the path component. */ boolean isStartedFromPath = false; int atColon = tmp.indexOf(':'); int atSlash = tmp.indexOf('/'); if ((atColon <= 0 && !tmp.startsWith("//")) || (atSlash >= 0 && atSlash < atColon)) { isStartedFromPath = true; } /* * <p><blockquote><pre> * @@@@@@@@ * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))? * </pre></blockquote><p> */ int at = indexFirstOf(tmp, isStartedFromPath ? "/?#" : ":/?#", from); if (at == -1) { at = 0; } /* * Parse the scheme. * <p><blockquote><pre> * scheme = $2 = http * @ * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))? * </pre></blockquote><p> */ if (at > 0 && at < length && tmp.charAt(at) == ':') { char[] target = tmp.substring(0, at).toLowerCase().toCharArray(); if (validate(target, scheme)) { _scheme = target; from = ++at; } else { // IA CHANGE: // do nothing; allow interpretation as URI with // later colon in other syntactical component } } /* * Parse the authority component. * <p><blockquote><pre> * authority = $4 = jakarta.apache.org * @@ * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))? * </pre></blockquote><p> */ // Reset flags _is_net_path = _is_abs_path = _is_rel_path = _is_hier_part = false; if (0 <= at && at < length && tmp.charAt(at) == '/') { // Set flag _is_hier_part = true; if (at + 2 < length && tmp.charAt(at + 1) == '/' && !isStartedFromPath) { // the temporary index to start the search from int next = indexFirstOf(tmp, "/?#", at + 2); if (next == -1) { next = (tmp.substring(at + 2).length() == 0) ? 
at + 2 : tmp.length(); } parseAuthority(tmp.substring(at + 2, next), escaped); from = at = next; // Set flag _is_net_path = true; } if (from == at) { // Set flag _is_abs_path = true; } } /* * Parse the path component. * <p><blockquote><pre> * path = $5 = /ietf/uri/ * @@@@@@ * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))? * </pre></blockquote><p> */ if (from < length) { // rel_path = rel_segment [ abs_path ] int next = indexFirstOf(tmp, "?#", from); if (next == -1) { next = tmp.length(); } if (!_is_abs_path) { if (!escaped && prevalidate(tmp.substring(from, next), disallowed_rel_path) || escaped && validate(tmp.substring(from, next).toCharArray(), rel_path)) { // Set flag _is_rel_path = true; } else if (!escaped && prevalidate(tmp.substring(from, next), disallowed_opaque_part) || escaped && validate(tmp.substring(from, next).toCharArray(), opaque_part)) { // Set flag _is_opaque_part = true; } else { // the path component may be empty _path = null; } } String s = tmp.substring(from, next); if (escaped) { setRawPath(s.toCharArray()); } else { setPath(s); } at = next; } // set the charset to do escape encoding String charset = getProtocolCharset(); /* * Parse the query component. * <p><blockquote><pre> * query = $7 = <undefined> * @@@@@@@@@ * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))? * </pre></blockquote><p> */ if (0 <= at && at + 1 < length && tmp.charAt(at) == '?') { int next = tmp.indexOf('#', at + 1); if (next == -1) { next = tmp.length(); } if (escaped) { _query = tmp.substring(at + 1, next).toCharArray(); if (!validate(_query, query)) { throw new URIException("Invalid query"); } } else { _query = encode(tmp.substring(at + 1, next), allowed_query, charset); } at = next; } /* * Parse the fragment component. * <p><blockquote><pre> * fragment = $9 = Related * @@@@@@@@ * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))? 
* </pre></blockquote><p> */ if (0 <= at && at + 1 <= length && tmp.charAt(at) == '#') { if (at + 1 == length) { // empty fragment _fragment = "".toCharArray(); } else { _fragment = (escaped) ? tmp.substring(at + 1).toCharArray() : encode(tmp.substring(at + 1), allowed_fragment, charset); } } // set this URI. setURI(); }
From source file:com.hipu.bdb.util.LaxURI.java
/** * IA OVERRIDDEN IN LaxURI TO INCLUDE FIX FOR * http://issues.apache.org/jira/browse/HTTPCLIENT-588 * AND/*from w w w. ja v a 2 s . c o m*/ * http://webteam.archive.org/jira/browse/HER-1268 * * In order to avoid any possilbity of conflict with non-ASCII characters, * Parse a URI reference as a <code>String</code> with the character * encoding of the local system or the document. * <p> * The following line is the regular expression for breaking-down a URI * reference into its components. * <p><blockquote><pre> * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))? * 12 3 4 5 6 7 8 9 * </pre></blockquote><p> * For example, matching the above expression to * http://jakarta.apache.org/ietf/uri/#Related * results in the following subexpression matches: * <p><blockquote><pre> * $1 = http: * scheme = $2 = http * $3 = //jakarta.apache.org * authority = $4 = jakarta.apache.org * path = $5 = /ietf/uri/ * $6 = <undefined> * query = $7 = <undefined> * $8 = #Related * fragment = $9 = Related * </pre></blockquote><p> * * @param original the original character sequence * @param escaped <code>true</code> if <code>original</code> is escaped * @throws URIException If an error occurs. */ protected void parseUriReference(String original, boolean escaped) throws URIException { // validate and contruct the URI character sequence if (original == null) { throw new URIException("URI-Reference required"); } /* @ * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))? */ String tmp = original.trim(); /* * The length of the string sequence of characters. * It may not be equal to the length of the byte array. */ int length = tmp.length(); /* * Remove the delimiters like angle brackets around an URI. 
*/ if (length > 0) { char[] firstDelimiter = { tmp.charAt(0) }; if (validate(firstDelimiter, delims)) { if (length >= 2) { char[] lastDelimiter = { tmp.charAt(length - 1) }; if (validate(lastDelimiter, delims)) { tmp = tmp.substring(1, length - 1); length = length - 2; } } } } /* * The starting index */ int from = 0; /* * The test flag whether the URI is started from the path component. */ boolean isStartedFromPath = false; int atColon = tmp.indexOf(':'); int atSlash = tmp.indexOf('/'); if ((atColon <= 0 && !tmp.startsWith("//")) || (atSlash >= 0 && atSlash < atColon)) { isStartedFromPath = true; } /* * <p><blockquote><pre> * @@@@@@@@ * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))? * </pre></blockquote><p> */ int at = indexFirstOf(tmp, isStartedFromPath ? "/?#" : ":/?#", from); if (at == -1) { at = 0; } /* * Parse the scheme. * <p><blockquote><pre> * scheme = $2 = http * @ * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))? * </pre></blockquote><p> */ if (at > 0 && at < length && tmp.charAt(at) == ':') { char[] target = tmp.substring(0, at).toLowerCase().toCharArray(); if (validate(target, scheme)) { _scheme = target; from = ++at; } else { // IA CHANGE: // do nothing; allow interpretation as URI with // later colon in other syntactical component } } /* * Parse the authority component. * <p><blockquote><pre> * authority = $4 = jakarta.apache.org * @@ * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))? * </pre></blockquote><p> */ // Reset flags _is_net_path = _is_abs_path = _is_rel_path = _is_hier_part = false; if (0 <= at && at < length && tmp.charAt(at) == '/') { // Set flag _is_hier_part = true; if (at + 2 < length && tmp.charAt(at + 1) == '/' && !isStartedFromPath) { // the temporary index to start the search from int next = indexFirstOf(tmp, "/?#", at + 2); if (next == -1) { next = (tmp.substring(at + 2).length() == 0) ? 
at + 2 : tmp.length(); } parseAuthority(tmp.substring(at + 2, next), escaped); from = at = next; // Set flag _is_net_path = true; } if (from == at) { // Set flag _is_abs_path = true; } } /* * Parse the path component. * <p><blockquote><pre> * path = $5 = /ietf/uri/ * @@@@@@ * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))? * </pre></blockquote><p> */ if (from < length) { // rel_path = rel_segment [ abs_path ] int next = indexFirstOf(tmp, "?#", from); if (next == -1) { next = tmp.length(); } if (!_is_abs_path) { if (!escaped && prevalidate(tmp.substring(from, next), disallowed_rel_path) || escaped && validate(tmp.substring(from, next).toCharArray(), rel_path)) { // Set flag _is_rel_path = true; } else if (!escaped && prevalidate(tmp.substring(from, next), disallowed_opaque_part) || escaped && validate(tmp.substring(from, next).toCharArray(), opaque_part)) { // Set flag _is_opaque_part = true; } else { // the path component may be empty _path = null; } } String s = tmp.substring(from, next); if (escaped) { setRawPath(s.toCharArray()); } else { setPath(s); } at = next; } // set the charset to do escape encoding String charset = getProtocolCharset(); /* * Parse the query component. * <p><blockquote><pre> * query = $7 = <undefined> * @@@@@@@@@ * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))? * </pre></blockquote><p> */ if (0 <= at && at + 1 < length && tmp.charAt(at) == '?') { int next = tmp.indexOf('#', at + 1); if (next == -1) { next = tmp.length(); } if (escaped) { _query = tmp.substring(at + 1, next).toCharArray(); if (!validate(_query, query)) { throw new URIException("Invalid query"); } } else { _query = encode(tmp.substring(at + 1, next), allowed_query, charset); } at = next; } /* * Parse the fragment component. * <p><blockquote><pre> * fragment = $9 = Related * @@@@@@@@ * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))? 
* </pre></blockquote><p> */ if (0 <= at && at + 1 <= length && tmp.charAt(at) == '#') { if (at + 1 == length) { // empty fragment _fragment = "".toCharArray(); } else { _fragment = (escaped) ? tmp.substring(at + 1).toCharArray() : encode(tmp.substring(at + 1), allowed_fragment, charset); } } // set this URI. setURI(); }
From source file:com.cyberway.issue.net.UURIFactory.java
/**
 * Sanity-checks a freshly built UURI.
 *
 * At minimum the length of the raw (escaped) URI is checked: a string can
 * be shorter than MAX_URL_LENGTH before escaping yet longer afterwards,
 * and letting such an oversized URI through caused trouble further down
 * the processing chain.
 *
 * @param uuri the created uuri to check
 * @return the passed <code>uuri</code>, so the check can be inlined
 * @throws URIException if the raw URI is longer than UURI.MAX_URL_LENGTH
 */
protected UURI validityCheck(UURI uuri) throws URIException {
    final int escapedLength = uuri.getRawURI().length;
    if (escapedLength <= UURI.MAX_URL_LENGTH) {
        return uuri;
    }
    throw new URIException("Created (escaped) uuri > " + UURI.MAX_URL_LENGTH + ": " + uuri.toString());
}
From source file:com.cyberway.issue.net.UURIFactory.java
/** * Do heritrix fix-up on passed uri string. * * Does heritrix escaping; usually escaping done to make our behavior align * with IEs. This method codifies our experience pulling URIs from the * wilds. Its does all the escaping we want; its output can always be * assumed to be 'escaped' (though perhaps to a laxer standard than the * vanilla HttpClient URI class or official specs might suggest). * * @param uri URI as string.//from w ww . j a va2s .c o m * @param base May be null. * @param e True if the uri is already escaped. * @return A fixed up URI string. * @throws URIException */ private String fixup(String uri, final URI base, final String charset) throws URIException { if (uri == null) { throw new NullPointerException(); } else if (uri.length() == 0 && base == null) { throw new URIException("URI length is zero (and not relative)."); } if (uri.length() > UURI.MAX_URL_LENGTH) { // We check length here and again later after all convertions. throw new URIException("URI length > " + UURI.MAX_URL_LENGTH + ": " + uri); } // Replace nbsp with normal spaces (so that they get stripped if at // ends, or encoded if in middle) if (uri.indexOf(NBSP) >= 0) { uri = TextUtils.replaceAll(NBSP, uri, SPACE); } // Get rid of any trailing spaces or new-lines. uri = uri.trim(); // IE actually converts backslashes to slashes rather than to %5C. // Since URIs that have backslashes usually work only with IE, we will // convert backslashes to slashes as well. // TODO: Maybe we can first convert backslashes by specs and than by IE // so that we fetch both versions. if (uri.indexOf(BACKSLASH) >= 0) { uri = TextUtils.replaceAll(BACKSLASH_PATTERN, uri, SLASH); } // Remove stray TAB/CR/LF uri = TextUtils.replaceAll(STRAY_SPACING, uri, EMPTY_STRING); // Test for the case of more than two slashes after the http(s) scheme. // Replace with two slashes as mozilla does if found. // See [ 788219 ] URI Syntax Errors stop page parsing. 
Matcher matcher = HTTP_SCHEME_SLASHES.matcher(uri); if (matcher.matches()) { uri = matcher.group(1) + matcher.group(2); } // now, minimally escape any whitespace uri = escapeWhitespace(uri); // For further processing, get uri elements. See the RFC2396REGEX // comment above for explaination of group indices used in the below. matcher = RFC2396REGEX.matcher(uri); if (!matcher.matches()) { throw new URIException("Failed parse of " + uri); } String uriScheme = checkUriElementAndLowerCase(matcher.group(2)); String uriSchemeSpecificPart = checkUriElement(matcher.group(3)); String uriAuthority = checkUriElement(matcher.group(5)); String uriPath = checkUriElement(matcher.group(6)); String uriQuery = checkUriElement(matcher.group(8)); // UNUSED String uriFragment = checkUriElement(matcher.group(10)); // If a scheme, is it a supported scheme? if (uriScheme != null && uriScheme.length() > 0 && this.schemes != null) { if (!(Arrays.binarySearch(schemes, uriScheme) >= 0)) { // unsupported; see if silently ignored if ((Arrays.binarySearch(ignoredSchemes, uriScheme) >= 0)) { throw new URIException(IGNORED_SCHEME, "Ignored scheme: " + uriScheme); } else { throw new URIException("Unsupported scheme: " + uriScheme); } } } // Test if relative URI. If so, need a base to resolve against. if (uriScheme == null || uriScheme.length() <= 0) { if (base == null) { throw new URIException("Relative URI but no base: " + uri); } } else { checkHttpSchemeSpecificPartSlashPrefix(base, uriScheme, uriSchemeSpecificPart); } // fixup authority portion: lowercase/IDN-punycode any domain; // remove stray trailing spaces uriAuthority = fixupAuthority(uriAuthority); // Do some checks if absolute path. if (uriSchemeSpecificPart != null && uriSchemeSpecificPart.startsWith(SLASH)) { if (uriPath != null) { // Eliminate '..' if its first thing in the path. IE does this. 
uriPath = TextUtils.replaceFirst(SLASHDOTDOTSLASH, uriPath, SLASH); } // Ensure root URLs end with '/': browsers always send "/" // on the request-line, so we should consider "http://host" // to be "http://host/". if (uriPath == null || EMPTY_STRING.equals(uriPath)) { uriPath = SLASH; } } if (uriAuthority != null) { if (uriScheme != null && uriScheme.length() > 0 && uriScheme.equals(HTTP)) { uriAuthority = checkPort(uriAuthority); uriAuthority = stripTail(uriAuthority, HTTP_PORT); } else if (uriScheme != null && uriScheme.length() > 0 && uriScheme.equals(HTTPS)) { uriAuthority = checkPort(uriAuthority); uriAuthority = stripTail(uriAuthority, HTTPS_PORT); } // Strip any prefix dot or tail dots from the authority. uriAuthority = stripTail(uriAuthority, DOT); uriAuthority = stripPrefix(uriAuthority, DOT); } else { // no authority; may be relative. consider stripping scheme // to work-around org.apache.commons.httpclient.URI bug // ( http://issues.apache.org/jira/browse/HTTPCLIENT-587 ) if (uriScheme != null && base != null && uriScheme.equals(base.getScheme())) { // uriScheme redundant and will only confound httpclient.URI uriScheme = null; } } // Ensure minimal escaping. Use of 'lax' URI and URLCodec // means minimal escaping isn't necessarily complete/consistent. // There is a chance such lax encoding will throw exceptions // later at inconvenient times. // // One reason for these bad escapings -- though not the only -- // is that the page is using an encoding other than the ASCII or the // UTF-8 that is our default URI encoding. In this case the parent // class is burping on the passed URL encoding. If the page encoding // was passed into this factory, the encoding seems to be parsed // correctly (See the testEscapedEncoding unit test). // // This fixup may cause us to miss content. There is the charset case // noted above. 
TODO: Look out for cases where we fail other than for // the above given reason which will be fixed when we address // '[ 913687 ] Make extractors interrogate for charset'. uriPath = ensureMinimalEscaping(uriPath, charset); uriQuery = ensureMinimalEscaping(uriQuery, charset, LaxURLCodec.QUERY_SAFE); // Preallocate. The '1's and '2's in below are space for ':', // '//', etc. URI characters. MutableString s = new MutableString(((uriScheme != null) ? uriScheme.length() : 0) + 1 // ';' + ((uriAuthority != null) ? uriAuthority.length() : 0) + 2 // '//' + ((uriPath != null) ? uriPath.length() : 0) + 1 // '?' + ((uriQuery != null) ? uriQuery.length() : 0)); appendNonNull(s, uriScheme, ":", true); appendNonNull(s, uriAuthority, "//", false); appendNonNull(s, uriPath, "", false); appendNonNull(s, uriQuery, "?", false); return s.toString(); }
From source file:dk.netarkivet.wayback.batch.copycode.NetarchiveSuiteUURIFactory.java
/** * Do heritrix fix-up on passed uri string. * * Does heritrix escaping; usually escaping done to make our behavior align * with IEs. This method codifies our experience pulling URIs from the * wilds. Its does all the escaping we want; its output can always be * assumed to be 'escaped' (though perhaps to a laxer standard than the * vanilla HttpClient URI class or official specs might suggest). * * @param uri URI as string./*from w w w.j av a 2 s . c om*/ * @param base May be null. * @param e True if the uri is already escaped. * @return A fixed up URI string. * @throws URIException */ private String fixup(String uri, final URI base, final String charset) throws URIException { if (uri == null) { throw new NullPointerException(); } else if (uri.length() == 0 && base == null) { throw new URIException("URI length is zero (and not relative)."); } if (uri.length() > UURI.MAX_URL_LENGTH) { // We check length here and again later after all convertions. throw new URIException("URI length > " + UURI.MAX_URL_LENGTH + ": " + uri); } // Replace nbsp with normal spaces (so that they get stripped if at // ends, or encoded if in middle) if (uri.indexOf(NBSP) >= 0) { uri = TextUtils.replaceAll(NBSP, uri, SPACE); } // Get rid of any trailing spaces or new-lines. uri = uri.trim(); // IE actually converts backslashes to slashes rather than to %5C. // Since URIs that have backslashes usually work only with IE, we will // convert backslashes to slashes as well. // TODO Maybe we can first convert backslashes by specs and than by IE // so that we fetch both versions. if (uri.indexOf(BACKSLASH) >= 0) { uri = TextUtils.replaceAll(BACKSLASH_PATTERN, uri, SLASH); } // Remove stray TAB/CR/LF uri = TextUtils.replaceAll(STRAY_SPACING, uri, EMPTY_STRING); // Test for the case of more than two slashes after the http(s) scheme. // Replace with two slashes as mozilla does if found. // See [ 788219 ] URI Syntax Errors stop page parsing. 
Matcher matcher = HTTP_SCHEME_SLASHES.matcher(uri); if (matcher.matches()) { uri = matcher.group(1) + matcher.group(2); } // now, minimally escape any whitespace uri = escapeWhitespace(uri); // For further processing, get uri elements. See the RFC2396REGEX // comment above for explaination of group indices used in the below. matcher = RFC2396REGEX.matcher(uri); if (!matcher.matches()) { throw new URIException("Failed parse of " + uri); } String uriScheme = checkUriElementAndLowerCase(matcher.group(2)); String uriSchemeSpecificPart = checkUriElement(matcher.group(3)); String uriAuthority = checkUriElement(matcher.group(5)); String uriPath = checkUriElement(matcher.group(6)); String uriQuery = checkUriElement(matcher.group(8)); // UNUSED String uriFragment = checkUriElement(matcher.group(10)); // If a scheme, is it a supported scheme? if (uriScheme != null && uriScheme.length() > 0 && this.schemes != null) { if (!(Arrays.binarySearch(schemes, uriScheme) >= 0)) { // unsupported; see if silently ignored if ((Arrays.binarySearch(ignoredSchemes, uriScheme) >= 0)) { throw new URIException(IGNORED_SCHEME, "Ignored scheme: " + uriScheme); } else { throw new URIException("Unsupported scheme: " + uriScheme); } } } // Test if relative URI. If so, need a base to resolve against. if (uriScheme == null || uriScheme.length() <= 0) { if (base == null) { throw new URIException("Relative URI but no base: " + uri); } } else { checkHttpSchemeSpecificPartSlashPrefix(base, uriScheme, uriSchemeSpecificPart); } // fixup authority portion: lowercase/IDN-punycode any domain; // remove stray trailing spaces uriAuthority = fixupAuthority(uriAuthority); // Do some checks if absolute path. if (uriSchemeSpecificPart != null && uriSchemeSpecificPart.startsWith(SLASH)) { if (uriPath != null) { // Eliminate '..' if its first thing in the path. IE does this. 
uriPath = TextUtils.replaceFirst(SLASHDOTDOTSLASH, uriPath, SLASH); } // Ensure root URLs end with '/': browsers always send "/" // on the request-line, so we should consider "http://host" // to be "http://host/". if (uriPath == null || EMPTY_STRING.equals(uriPath)) { uriPath = SLASH; } } if (uriAuthority != null) { if (uriScheme != null && uriScheme.length() > 0 && uriScheme.equals(HTTP)) { uriAuthority = checkPort(uriAuthority); uriAuthority = stripTail(uriAuthority, HTTP_PORT); } else if (uriScheme != null && uriScheme.length() > 0 && uriScheme.equals(HTTPS)) { uriAuthority = checkPort(uriAuthority); uriAuthority = stripTail(uriAuthority, HTTPS_PORT); } // Strip any prefix dot or tail dots from the authority. uriAuthority = stripTail(uriAuthority, DOT); uriAuthority = stripPrefix(uriAuthority, DOT); } else { // no authority; may be relative. consider stripping scheme // to work-around org.apache.commons.httpclient.URI bug // ( http://issues.apache.org/jira/browse/HTTPCLIENT-587 ) if (uriScheme != null && base != null && uriScheme.equals(base.getScheme())) { // uriScheme redundant and will only confound httpclient.URI uriScheme = null; } } // Ensure minimal escaping. Use of 'lax' URI and URLCodec // means minimal escaping isn't necessarily complete/consistent. // There is a chance such lax encoding will throw exceptions // later at inconvenient times. // // One reason for these bad escapings -- though not the only -- // is that the page is using an encoding other than the ASCII or the // UTF-8 that is our default URI encoding. In this case the parent // class is burping on the passed URL encoding. If the page encoding // was passed into this factory, the encoding seems to be parsed // correctly (See the testEscapedEncoding unit test). // // This fixup may cause us to miss content. There is the charset case // noted above. 
TODO Look out for cases where we fail other than for // the above given reason which will be fixed when we address // '[ 913687 ] Make extractors interrogate for charset'. uriPath = ensureMinimalEscaping(uriPath, charset); uriQuery = ensureMinimalEscaping(uriQuery, charset, LaxURLCodec.QUERY_SAFE); // Preallocate. The '1's and '2's in below are space for ':', // '//', etc. URI characters. MutableString s = new MutableString(((uriScheme != null) ? uriScheme.length() : 0) + 1 // ';' + ((uriAuthority != null) ? uriAuthority.length() : 0) + 2 // '//' + ((uriPath != null) ? uriPath.length() : 0) + 1 // '?' + ((uriQuery != null) ? uriQuery.length() : 0)); appendNonNull(s, uriScheme, ":", true); appendNonNull(s, uriAuthority, "//", false); appendNonNull(s, uriPath, "", false); appendNonNull(s, uriQuery, "?", false); return s.toString(); }
From source file:davmail.exchange.ExchangeSession.java
protected String getAbsoluteUri(HttpMethod method, String path) throws URIException { URI uri = method.getURI();/*w ww . j a va 2s .co m*/ if (path != null) { // reset query string uri.setQuery(null); if (path.startsWith("/")) { // path is absolute, replace method path uri.setPath(path); } else if (path.startsWith("http://") || path.startsWith("https://")) { return path; } else { // relative path, build new path String currentPath = method.getPath(); int end = currentPath.lastIndexOf('/'); if (end >= 0) { uri.setPath(currentPath.substring(0, end + 1) + path); } else { throw new URIException(uri.getURI()); } } } return uri.getURI(); }
From source file:com.cyberway.issue.net.UURIFactory.java
/** * If http(s) scheme, check scheme specific part begins '//'. * @throws URIException /*from w ww .ja v a 2s .co m*/ * @see http://www.faqs.org/rfcs/rfc1738.html Section 3.1. Common Internet * Scheme Syntax */ protected void checkHttpSchemeSpecificPartSlashPrefix(final URI base, final String scheme, final String schemeSpecificPart) throws URIException { if (scheme == null || scheme.length() <= 0) { return; } if (!scheme.equals("http") && !scheme.equals("https")) { return; } if (schemeSpecificPart == null || !schemeSpecificPart.startsWith("//")) { // only acceptable if schemes match if (base == null || !scheme.equals(base.getScheme())) { throw new URIException("relative URI with scheme only allowed for " + "scheme matching base"); } return; } if (schemeSpecificPart.length() <= 2) { throw new URIException("http scheme specific part is " + "too short: " + schemeSpecificPart); } }
From source file:com.cyberway.issue.net.UURIFactory.java
/** * Fixup the domain label part of the authority. * /* www . j av a 2s. c o m*/ * We're more lax than the spec. in that we allow underscores. * * @param label Domain label to fix. * @return Return fixed domain label. * @throws URIException */ private String fixupDomainlabel(String label) throws URIException { // apply IDN-punycoding, as necessary try { // TODO: optimize: only apply when necessary, or // keep cache of recent encodings label = IDNA.toASCII(label); } catch (IDNAException e) { if (TextUtils.matches(ACCEPTABLE_ASCII_DOMAIN, label)) { // domain name has ACE prefix, leading/trailing dash, or // underscore -- but is still a name we wish to tolerate; // simply continue } else { // problematic domain: neither ASCII acceptable characters // nor IDN-punycodable, so throw exception // TODO: change to HeritrixURIException so distinguishable // from URIExceptions in library code URIException ue = new URIException(e + " " + label); ue.initCause(e); throw ue; } } label = label.toLowerCase(); return label; }