// Http Connection
import java.io.BufferedInputStream; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.io.OutputStreamWriter; import java.net.HttpURLConnection; import java.net.MalformedURLException; import java.net.URL; import java.net.URLEncoder; import java.nio.ByteBuffer; import java.nio.charset.Charset; import java.util.ArrayList; import java.util.Collection; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.zip.GZIPInputStream; /** * Implementation of {@link Connection}. * * @see org.jsoup.Jsoup#connect(String) */ public class HttpConnection implements Connection { public static Connection connect(String url) { Connection con = new HttpConnection(); con.url(url); return con; } public static Connection connect(URL url) { Connection con = new HttpConnection(); con.url(url); return con; } private Connection.Request req; private Connection.Response res; private HttpConnection() { req = new Request(); res = new Response(); } public Connection url(URL url) { req.url(url); return this; } public Connection url(String url) { Validate.notEmpty(url, "Must supply a valid URL"); try { req.url(new URL(url)); } catch (MalformedURLException e) { throw new IllegalArgumentException("Malformed URL: " + url, e); } return this; } public Connection userAgent(String userAgent) { Validate.notNull(userAgent, "User agent must not be null"); req.header("User-Agent", userAgent); return this; } public Connection timeout(int millis) { req.timeout(millis); return this; } public Connection followRedirects(boolean followRedirects) { req.followRedirects(followRedirects); return this; } public Connection referrer(String referrer) { Validate.notNull(referrer, "Referrer must not be null"); req.header("Referer", referrer); return this; } public Connection method(Method method) { req.method(method); return this; } public 
Connection data(String key, String value) { req.data(KeyVal.create(key, value)); return this; } public Connection data(Map<String, String> data) { Validate.notNull(data, "Data map must not be null"); for (Map.Entry<String, String> entry : data.entrySet()) { req.data(KeyVal.create(entry.getKey(), entry.getValue())); } return this; } public Connection data(String... keyvals) { Validate.notNull(keyvals, "Data key value pairs must not be null"); Validate.isTrue(keyvals.length % 2 == 0, "Must supply an even number of key value pairs"); for (int i = 0; i < keyvals.length; i += 2) { String key = keyvals[i]; String value = keyvals[i + 1]; Validate.notEmpty(key, "Data key must not be empty"); Validate.notNull(value, "Data value must not be null"); req.data(KeyVal.create(key, value)); } return this; } public Connection header(String name, String value) { req.header(name, value); return this; } public Connection cookie(String name, String value) { req.cookie(name, value); return this; } public Connection.Response execute() throws IOException { res = Response.execute(req); return res; } public Connection.Request request() { return req; } public Connection request(Connection.Request request) { req = request; return this; } public Connection.Response response() { return res; } public Connection response(Connection.Response response) { res = response; return this; } @SuppressWarnings({ "unchecked" }) private static abstract class Base<T extends Connection.Base> implements Connection.Base<T> { URL url; Method method; Map<String, String> headers; Map<String, String> cookies; private Base() { headers = new LinkedHashMap<String, String>(); cookies = new LinkedHashMap<String, String>(); } public URL url() { return url; } public T url(URL url) { Validate.notNull(url, "URL must not be null"); this.url = url; return (T) this; } public Method method() { return method; } public T method(Method method) { Validate.notNull(method, "Method must not be null"); this.method = method; return (T) 
this; } public String header(String name) { Validate.notNull(name, "Header name must not be null"); return getHeaderCaseInsensitive(name); } public T header(String name, String value) { Validate.notEmpty(name, "Header name must not be empty"); Validate.notNull(value, "Header value must not be null"); removeHeader(name); // ensures we don't get an "accept-encoding" and // a "Accept-Encoding" headers.put(name, value); return (T) this; } public boolean hasHeader(String name) { Validate.notEmpty(name, "Header name must not be empty"); return getHeaderCaseInsensitive(name) != null; } public T removeHeader(String name) { Validate.notEmpty(name, "Header name must not be empty"); Map.Entry<String, String> entry = scanHeaders(name); // remove is // case // insensitive // too if (entry != null) headers.remove(entry.getKey()); // ensures correct case return (T) this; } public Map<String, String> headers() { return headers; } private String getHeaderCaseInsensitive(String name) { Validate.notNull(name, "Header name must not be null"); // quick evals for common case of title case, lower case, then scan // for mixed String value = headers.get(name); if (value == null) value = headers.get(name.toLowerCase()); if (value == null) { Map.Entry<String, String> entry = scanHeaders(name); if (entry != null) value = entry.getValue(); } return value; } private Map.Entry<String, String> scanHeaders(String name) { String lc = name.toLowerCase(); for (Map.Entry<String, String> entry : headers.entrySet()) { if (entry.getKey().toLowerCase().equals(lc)) return entry; } return null; } public String cookie(String name) { Validate.notNull(name, "Cookie name must not be null"); return cookies.get(name); } public T cookie(String name, String value) { Validate.notEmpty(name, "Cookie name must not be empty"); Validate.notNull(value, "Cookie value must not be null"); cookies.put(name, value); return (T) this; } public boolean hasCookie(String name) { Validate.notEmpty("Cookie name must not be empty"); 
return cookies.containsKey(name); } public T removeCookie(String name) { Validate.notEmpty("Cookie name must not be empty"); cookies.remove(name); return (T) this; } public Map<String, String> cookies() { return cookies; } } public static class Request extends Base<Connection.Request> implements Connection.Request { private int timeoutMilliseconds; private boolean followRedirects; private Collection<Connection.KeyVal> data; private Request() { timeoutMilliseconds = 3000; followRedirects = true; data = new ArrayList<Connection.KeyVal>(); method = Connection.Method.GET; headers.put("Accept-Encoding", "gzip"); } public int timeout() { return timeoutMilliseconds; } public Request timeout(int millis) { Validate.isTrue(millis >= 0, "Timeout milliseconds must be 0 (infinite) or greater"); timeoutMilliseconds = millis; return this; } public boolean followRedirects() { return followRedirects; } public Connection.Request followRedirects(boolean followRedirects) { this.followRedirects = followRedirects; return this; } public Request data(Connection.KeyVal keyval) { Validate.notNull(keyval, "Key val must not be null"); data.add(keyval); return this; } public Collection<Connection.KeyVal> data() { return data; } } public static class Response extends Base<Connection.Response> implements Connection.Response { private static final int MAX_REDIRECTS = 20; private int statusCode; private String statusMessage; private ByteBuffer byteData; private String charset; private String contentType; private boolean executed = false; private int numRedirects = 0; Response() { super(); } private Response(Response previousResponse) throws IOException { super(); if (previousResponse != null) { numRedirects = previousResponse.numRedirects + 1; if (numRedirects >= MAX_REDIRECTS) throw new IOException( String.format( "Too many redirects occurred trying to load URL %s", previousResponse.url())); } } static Response execute(Connection.Request req) throws IOException { return execute(req, null); } 
static Response execute(Connection.Request req, Response previousResponse) throws IOException { Validate.notNull(req, "Request must not be null"); String protocol = req.url().getProtocol(); Validate.isTrue( protocol.equals("http") || protocol.equals("https"), "Only http & https protocols supported"); // set up the request for execution if (req.method() == Connection.Method.GET && req.data().size() > 0) serialiseRequestUrl(req); // appends query string HttpURLConnection conn = createConnection(req); conn.connect(); if (req.method() == Connection.Method.POST) writePost(req.data(), conn.getOutputStream()); int status = conn.getResponseCode(); boolean needsRedirect = false; if (status != HttpURLConnection.HTTP_OK) { if (status == HttpURLConnection.HTTP_MOVED_TEMP || status == HttpURLConnection.HTTP_MOVED_PERM || status == HttpURLConnection.HTTP_SEE_OTHER) needsRedirect = true; else throw new IOException(status + " error loading URL " + req.url().toString()); } Response res = new Response(previousResponse); res.setupFromConnection(conn, previousResponse); if (needsRedirect && req.followRedirects()) { req.url(new URL(req.url(), res.header("Location"))); for (Map.Entry<String, String> cookie : res.cookies.entrySet()) { // add // response // cookies // to // request // (for // e.g. // login // posts) req.cookie(cookie.getKey(), cookie.getValue()); } return execute(req, res); } InputStream inStream = null; try { inStream = res.hasHeader("Content-Encoding") && res.header("Content-Encoding").equalsIgnoreCase( "gzip") ? 
new BufferedInputStream( new GZIPInputStream(conn.getInputStream())) : new BufferedInputStream(conn.getInputStream()); res.byteData = DataUtil.readToByteBuffer(inStream); res.charset = DataUtil .getCharsetFromContentType(res.contentType); // may be // null, // readInputStream // deals // with // it } finally { if (inStream != null) inStream.close(); } res.executed = true; return res; } public int statusCode() { return statusCode; } public String statusMessage() { return statusMessage; } public String charset() { return charset; } public String contentType() { return contentType; } public String body() { Validate.isTrue( executed, "Request must be executed (with .execute(), .get(), or .post() before getting response body"); // charset gets set from header on execute, and from meta-equiv on // parse. parse may not have happened yet String body; if (charset == null) body = Charset.forName(DataUtil.defaultCharset) .decode(byteData).toString(); else body = Charset.forName(charset).decode(byteData).toString(); byteData.rewind(); return body; } public byte[] bodyAsBytes() { Validate.isTrue( executed, "Request must be executed (with .execute(), .get(), or .post() before getting response body"); return byteData.array(); } // set up connection defaults, and details from request private static HttpURLConnection createConnection(Connection.Request req) throws IOException { HttpURLConnection conn = (HttpURLConnection) req.url() .openConnection(); conn.setRequestMethod(req.method().name()); conn.setInstanceFollowRedirects(false); // don't rely on native // redirection support conn.setConnectTimeout(req.timeout()); conn.setReadTimeout(req.timeout()); if (req.method() == Method.POST) conn.setDoOutput(true); if (req.cookies().size() > 0) conn.addRequestProperty("Cookie", getRequestCookieString(req)); for (Map.Entry<String, String> header : req.headers().entrySet()) { conn.addRequestProperty(header.getKey(), header.getValue()); } return conn; } // set up url, method, header, cookies 
private void setupFromConnection(HttpURLConnection conn, Connection.Response previousResponse) throws IOException { method = Connection.Method.valueOf(conn.getRequestMethod()); url = conn.getURL(); statusCode = conn.getResponseCode(); statusMessage = conn.getResponseMessage(); contentType = conn.getContentType(); // headers into map Map<String, List<String>> resHeaders = conn.getHeaderFields(); for (Map.Entry<String, List<String>> entry : resHeaders.entrySet()) { String name = entry.getKey(); if (name == null) continue; // http/1.1 line List<String> values = entry.getValue(); if (name.equalsIgnoreCase("Set-Cookie")) { for (String value : values) { TokenQueue cd = new TokenQueue(value); String cookieName = cd.chompTo("=").trim(); String cookieVal = cd.consumeTo(";").trim(); // ignores path, date, domain, secure et al. req'd? cookie(cookieName, cookieVal); } } else { // only take the first instance of each header if (!values.isEmpty()) header(name, values.get(0)); } } // if from a redirect, map previous response cookies into this // response if (previousResponse != null) { for (Map.Entry<String, String> prevCookie : previousResponse .cookies().entrySet()) { if (!hasCookie(prevCookie.getKey())) cookie(prevCookie.getKey(), prevCookie.getValue()); } } } private static void writePost(Collection<Connection.KeyVal> data, OutputStream outputStream) throws IOException { OutputStreamWriter w = new OutputStreamWriter(outputStream, DataUtil.defaultCharset); boolean first = true; for (Connection.KeyVal keyVal : data) { if (!first) w.append('&'); else first = false; w.write(URLEncoder.encode(keyVal.key(), DataUtil.defaultCharset)); w.write('='); w.write(URLEncoder.encode(keyVal.value(), DataUtil.defaultCharset)); } w.close(); } private static String getRequestCookieString(Connection.Request req) { StringBuilder sb = new StringBuilder(); boolean first = true; for (Map.Entry<String, String> cookie : req.cookies().entrySet()) { if (!first) sb.append("; "); else first = false; 
sb.append(cookie.getKey()).append('=') .append(cookie.getValue()); // todo: spec says only ascii, no escaping / encoding defined. // validate on set? or escape somehow here? } return sb.toString(); } // for get url reqs, serialise the data map into the url private static void serialiseRequestUrl(Connection.Request req) throws IOException { URL in = req.url(); StringBuilder url = new StringBuilder(); boolean first = true; // reconstitute the query, ready for appends url.append(in.getProtocol()).append("://") .append(in.getAuthority()) // includes host, port .append(in.getPath()).append("?"); if (in.getQuery() != null) { url.append(in.getQuery()); first = false; } for (Connection.KeyVal keyVal : req.data()) { if (!first) url.append('&'); else first = false; url.append( URLEncoder.encode(keyVal.key(), DataUtil.defaultCharset)) .append('=') .append(URLEncoder.encode(keyVal.value(), DataUtil.defaultCharset)); } req.url(new URL(url.toString())); req.data().clear(); // moved into url as get params } } public static class KeyVal implements Connection.KeyVal { private String key; private String value; public static KeyVal create(String key, String value) { Validate.notEmpty(key, "Data key must not be empty"); Validate.notNull(value, "Data value must not be null"); return new KeyVal(key, value); } private KeyVal(String key, String value) { this.key = key; this.value = value; } public KeyVal key(String key) { Validate.notEmpty(key, "Data key must not be empty"); this.key = key; return this; } public String key() { return key; } public KeyVal value(String value) { Validate.notNull(value, "Data value must not be null"); this.value = value; return this; } public String value() { return value; } @Override public String toString() { return key + "=" + value; } } } /** * A Connection provides a convenient interface to fetch content from the web, * and parse them into Documents. * <p> * To get a new Connection, use {@link org.jsoup.Jsoup#connect(String)}. 
* Connections contain {@link Connection.Request} and * {@link Connection.Response} objects. The request objects are reusable as * prototype requests. * <p> * Request configuration can be made using either the shortcut methods in * Connection (e.g. {@link #userAgent(String)}), or by methods in the * Connection.Request object directly. All request configuration must be made * before the request is executed. * <p> * The Connection interface is <b>currently in beta</b> and subject to change. * Comments, suggestions, and bug reports are welcome. */ interface Connection { /** * GET and POST http methods. */ public enum Method { GET, POST } /** * Set the request URL to fetch. The protocol must be HTTP or HTTPS. * * @param url * URL to connect to * @return this Connection, for chaining */ public Connection url(URL url); /** * Set the request URL to fetch. The protocol must be HTTP or HTTPS. * * @param url * URL to connect to * @return this Connection, for chaining */ public Connection url(String url); /** * Set the request user-agent header. * * @param userAgent * user-agent to use * @return this Connection, for chaining */ public Connection userAgent(String userAgent); /** * Set the request timeouts (connect and read). If a timeout occurs, an * IOException will be thrown. The default timeout is 3 seconds (3000 * millis). A timeout of zero is treated as an infinite timeout. * * @param millis * number of milliseconds (thousandths of a second) before timing * out connects or reads. * @return this Connection, for chaining */ public Connection timeout(int millis); /** * Set the request referrer (aka "referer") header. * * @param referrer * referrer to use * @return this Connection, for chaining */ public Connection referrer(String referrer); /** * Configures the connection to (not) follow server redirects. By default * this is <b>true</b>. * * @param followRedirects * true if server redirects should be followed. 
* @return this Connection, for chaining */ public Connection followRedirects(boolean followRedirects); /** * Set the request method to use, GET or POST. Default is GET. * * @param method * HTTP request method * @return this Connection, for chaining */ public Connection method(Method method); /** * Add a request data parameter. Request parameters are sent in the request * query string for GETs, and in the request body for POSTs. A request may * have multiple values of the same name. * * @param key * data key * @param value * data value * @return this Connection, for chaining */ public Connection data(String key, String value); /** * Adds all of the supplied data to the request data parameters * * @param data * map of data parameters * @return this Connection, for chaining */ public Connection data(Map<String, String> data); /** * Add a number of request data parameters. Multiple parameters may be set * at once, e.g.: * <code>.data("name", "jsoup", "language", "Java", "language", "English");</code> * creates a query string like: * <code>?name=jsoup&language=Java&language=English</code> * * @param keyvals * a set of key value pairs. * @return this Connection, for chaining */ public Connection data(String... keyvals); /** * Set a request header. * * @param name * header name * @param value * header value * @return this Connection, for chaining * @see org.jsoup.Connection.Request#headers() */ public Connection header(String name, String value); /** * Set a cookie to be sent in the request * * @param name * name of cookie * @param value * value of cookie * @return this Connection, for chaining */ public Connection cookie(String name, String value); /** * Execute the request. 
* * @return a response object * @throws IOException * on error */ public Response execute() throws IOException; /** * Get the request object associatated with this connection * * @return request */ public Request request(); /** * Set the connection's request * * @param request * new request object * @return this Connection, for chaining */ public Connection request(Request request); /** * Get the response, once the request has been executed * * @return response */ public Response response(); /** * Set the conenction's response * * @param response * new response * @return this Connection, for chaining */ public Connection response(Response response); /** * Common methods for Requests and Responses * * @param <T> * Type of Base, either Request or Response */ interface Base<T extends Base> { /** * Get the URL * * @return URL */ public URL url(); /** * Set the URL * * @param url * new URL * @return this, for chaining */ public T url(URL url); /** * Get the request method * * @return method */ public Method method(); /** * Set the request method * * @param method * new method * @return this, for chaining */ public T method(Method method); /** * Get the value of a header. This is a simplified header model, where a * header may only have one value. * <p> * Header names are case insensitive. * * @param name * name of header (case insensitive) * @return value of header, or null if not set. * @see #hasHeader(String) * @see #cookie(String) */ public String header(String name); /** * Set a header. This method will overwrite any existing header with the * same case insensitive name. 
* * @param name * Name of header * @param value * Value of header * @return this, for chaining */ public T header(String name, String value); /** * Check if a header is present * * @param name * name of header (case insensitive) * @return if the header is present in this request/response */ public boolean hasHeader(String name); /** * Remove a header by name * * @param name * name of header to remove (case insensitive) * @return this, for chianing */ public T removeHeader(String name); /** * Retrieve all of the request/response headers as a map * * @return headers */ public Map<String, String> headers(); /** * Get a cookie value by name from this request/response. * <p> * Response objects have a simplified cookie model. Each cookie set in * the response is added to the response object's cookie key=value map. * The cookie's path, domain, and expiry date are ignored. * * @param name * name of cookie to retrieve. * @return value of cookie, or null if not set */ public String cookie(String name); /** * Set a cookie in this request/response. * * @param name * name of cookie * @param value * value of cookie * @return this, for chianing */ public T cookie(String name, String value); /** * Check if a cookie is present * * @param name * name of cookie * @return if the cookie is present in this request/response */ public boolean hasCookie(String name); /** * Remove a cookie by name * * @param name * name of cookie to remove * @return this, for chianing */ public T removeCookie(String name); /** * Retrieve all of the request/response cookies as a map * * @return cookies */ public Map<String, String> cookies(); } /** * Represents a HTTP request. */ public interface Request extends Base<Request> { /** * Get the request timeout, in milliseconds. * * @return the timeout in milliseconds. */ public int timeout(); /** * Update the request timeout. 
* * @param millis * timeout, in milliseconds * @return this Request, for chaining */ public Request timeout(int millis); /** * Get the current followRedirects configuration. * * @return true if followRedirects is enabled. */ public boolean followRedirects(); /** * Configures the request to (not) follow server redirects. By default * this is <b>true</b>. * * @param followRedirects * true if server redirects should be followed. * @return this Connection, for chaining */ public Request followRedirects(boolean followRedirects); /** * Add a data parameter to the request * * @param keyval * data to add. * @return this Request, for chaining */ public Request data(KeyVal keyval); /** * Get all of the request's data parameters * * @return collection of keyvals */ public Collection<KeyVal> data(); } /** * Represents a HTTP response. */ public interface Response extends Base<Response> { /** * Get the status code of the response. * * @return status code */ public int statusCode(); /** * Get the status message of the response. * * @return status message */ public String statusMessage(); /** * Get the character set name of the response. * * @return character set name */ public String charset(); /** * Get the response content type (e.g. "text/html"); * * @return the response content type */ public String contentType(); /** * Get the body of the response as a plain string. * * @return body */ public String body(); /** * Get the body of the response as an array of bytes. * * @return body bytes */ public byte[] bodyAsBytes(); } /** * A Key Value tuple. 
*/ public interface KeyVal { /** * Update the key of a keyval * * @param key * new key * @return this KeyVal, for chaining */ public KeyVal key(String key); /** * Get the key of a keyval * * @return the key */ public String key(); /** * Update the value of a keyval * * @param value * the new value * @return this KeyVal, for chaining */ public KeyVal value(String value); /** * Get the value of a keyval * * @return the value */ public String value(); } } final class Validate { private Validate() { } /** * Validates that the obect is not null * * @param obj * object to test */ public static void notNull(Object obj) { if (obj == null) throw new IllegalArgumentException("Object must not be null"); } /** * Validates that the object is not null * * @param obj * object to test * @param msg * message to output if validation fails */ public static void notNull(Object obj, String msg) { if (obj == null) throw new IllegalArgumentException(msg); } /** * Validates that the value is true * * @param val * object to test */ public static void isTrue(boolean val) { if (!val) throw new IllegalArgumentException("Must be true"); } /** * Validates that the value is true * * @param val * object to test * @param msg * message to output if validation fails */ public static void isTrue(boolean val, String msg) { if (!val) throw new IllegalArgumentException(msg); } /** * Validates that the array contains no null elements * * @param objects * the array to test */ public static void noNullElements(Object[] objects) { noNullElements(objects, "Array must not contain any null objects"); } /** * Validates that the array contains no null elements * * @param objects * the array to test * @param msg * message to output if validation fails */ public static void noNullElements(Object[] objects, String msg) { for (Object obj : objects) if (obj == null) throw new IllegalArgumentException(msg); } /** * Validates that the string is not empty * * @param string * the string to test */ public static void 
notEmpty(String string) { if (string == null || string.length() == 0) throw new IllegalArgumentException("String must not be empty"); } /** * Validates that the string is not empty * * @param string * the string to test * @param msg * message to output if validation fails */ public static void notEmpty(String string, String msg) { if (string == null || string.length() == 0) throw new IllegalArgumentException(msg); } } /** * Internal static utilities for handling data. * */ class DataUtil { private static final Pattern charsetPattern = Pattern .compile("(?i)\\bcharset=\\s*\"?([^\\s;\"]*)"); static final String defaultCharset = "UTF-8"; // used if not found in header // or meta charset private static final int bufferSize = 0x20000; // ~130K. private DataUtil() { } /** * Loads a file to a Document. * * @param in * file to load * @param charsetName * character set of input * @param baseUri * base URI of document, to resolve relative links against * @return Document * @throws IOException * on IO error */ static ByteBuffer readToByteBuffer(InputStream inStream) throws IOException { byte[] buffer = new byte[bufferSize]; ByteArrayOutputStream outStream = new ByteArrayOutputStream(bufferSize); int read; while (true) { read = inStream.read(buffer); if (read == -1) break; outStream.write(buffer, 0, read); } ByteBuffer byteData = ByteBuffer.wrap(outStream.toByteArray()); return byteData; } /** * Parse out a charset from a content type header. * * @param contentType * e.g. "text/html; charset=EUC-JP" * @return "EUC-JP", or null if not found. Charset is trimmed and * uppercased. */ static String getCharsetFromContentType(String contentType) { if (contentType == null) return null; Matcher m = charsetPattern.matcher(contentType); if (m.find()) { return m.group(1).trim().toUpperCase(); } return null; } } /** * A character queue with parsing helpers. 
* * @author Jonathan Hedley */ class TokenQueue { private String queue; private int pos = 0; private static final char ESC = '\\'; // escape char for chomp balanced. /** Create a new TokenQueue. @param data string of data to back queue. */ public TokenQueue(String data) { Validate.notNull(data); queue = data; } /** * Is the queue empty? * @return true if no data left in queue. */ public boolean isEmpty() { return remainingLength() == 0; } private int remainingLength() { return queue.length() - pos; } /** * Retrieves but does not remove the first character from the queue. * @return First character, or 0 if empty. */ public char peek() { return isEmpty() ? 0 : queue.charAt(pos); } /** Add a character to the start of the queue (will be the next character retrieved). @param c character to add */ public void addFirst(Character c) { addFirst(c.toString()); } /** Add a string to the start of the queue. @param seq string to add. */ public void addFirst(String seq) { // not very performant, but an edge case queue = seq + queue.substring(pos); pos = 0; } /** * Tests if the next characters on the queue match the sequence. Case insensitive. * @param seq String to check queue for. * @return true if the next characters match. */ public boolean matches(String seq) { return queue.regionMatches(true, pos, seq, 0, seq.length()); } /** * Case sensitive match test. * @param seq * @return */ public boolean matchesCS(String seq) { return queue.startsWith(seq, pos); } /** Tests if the next characters match any of the sequences. Case insensitive. @param seq @return */ public boolean matchesAny(String... seq) { for (String s : seq) { if (matches(s)) return true; } return false; } public boolean matchesAny(char... 
seq) { if (isEmpty()) return false; for (char c: seq) { if (queue.charAt(pos) == c) return true; } return false; } public boolean matchesStartTag() { // micro opt for matching "<x" return (remainingLength() >= 2 && queue.charAt(pos) == '<' && Character.isLetter(queue.charAt(pos+1))); } /** * Tests if the queue matches the sequence (as with match), and if they do, removes the matched string from the * queue. * @param seq String to search for, and if found, remove from queue. * @return true if found and removed, false if not found. */ public boolean matchChomp(String seq) { if (matches(seq)) { pos += seq.length(); return true; } else { return false; } } /** Tests if queue starts with a whitespace character. @return if starts with whitespace */ public boolean matchesWhitespace() { return !isEmpty() && Character.isWhitespace(queue.charAt(pos)); } /** Test if the queue matches a word character (letter or digit). @return if matches a word character */ public boolean matchesWord() { return !isEmpty() && Character.isLetterOrDigit(queue.charAt(pos)); } /** * Drops the next character off the queue. */ public void advance() { if (!isEmpty()) pos++; } /** * Consume one character off queue. * @return first character on queue. */ public char consume() { return queue.charAt(pos++); } /** * Consumes the supplied sequence of the queue. If the queue does not start with the supplied sequence, will * throw an illegal state exception -- but you should be running match() against that condition. <p> Case insensitive. * @param seq sequence to remove from head of queue. */ public void consume(String seq) { if (!matches(seq)) throw new IllegalStateException("Queue did not match expected sequence"); int len = seq.length(); if (len > remainingLength()) throw new IllegalStateException("Queue not long enough to consume sequence"); pos += len; } /** * Pulls a string off the queue, up to but exclusive of the match sequence, or to the queue running out. 
* @param seq String to end on (and not include in return, but leave on queue). <b>Case sensitive.</b> * @return The matched data consumed from queue. */ public String consumeTo(String seq) { int offset = queue.indexOf(seq, pos); if (offset != -1) { String consumed = queue.substring(pos, offset); pos += consumed.length(); return consumed; } else { return remainder(); } } public String consumeToIgnoreCase(String seq) { int start = pos; String first = seq.substring(0, 1); boolean canScan = first.toLowerCase().equals(first.toUpperCase()); // if first is not cased, use index of while (!isEmpty()) { if (matches(seq)) break; if (canScan) { int skip = queue.indexOf(first, pos) - pos; if (skip == 0) // this char is the skip char, but not match, so force advance of pos pos++; else if (skip < 0) // no chance of finding, grab to end pos = queue.length(); else pos += skip; } else pos++; } String data = queue.substring(start, pos); return data; } /** Consumes to the first sequence provided, or to the end of the queue. Leaves the terminator on the queue. @param seq any number of terminators to consume to. <b>Case insensitive.</b> @return consumed string */ // todo: method name. not good that consumeTo cares for case, and consume to any doesn't. And the only use for this // is is a case sensitive time... public String consumeToAny(String... seq) { int start = pos; while (!isEmpty() && !matchesAny(seq)) { pos++; } String data = queue.substring(start, pos); return data; } /** * Pulls a string off the queue (like consumeTo), and then pulls off the matched string (but does not return it). * <p> * If the queue runs out of characters before finding the seq, will return as much as it can (and queue will go * isEmpty() == true). * @param seq String to match up to, and not include in return, and to pull off queue. <b>Case sensitive.</b> * @return Data matched from queue. 
*/ public String chompTo(String seq) { String data = consumeTo(seq); matchChomp(seq); return data; } public String chompToIgnoreCase(String seq) { String data = consumeToIgnoreCase(seq); // case insensitive scan matchChomp(seq); return data; } /** * Pulls a balanced string off the queue. E.g. if queue is "(one (two) three) four", (,) will return "one (two) three", * and leave " four" on the queue. Unbalanced openers and closers can be escaped (with \). Those escapes will be left * in the returned string, which is suitable for regexes (where we need to preserve the escape), but unsuitable for * contains text strings; use unescape for that. * @param open opener * @param close closer * @return data matched from the queue */ public String chompBalanced(char open, char close) { StringBuilder accum = new StringBuilder(); int depth = 0; char last = 0; do { if (isEmpty()) break; Character c = consume(); if (last == 0 || last != ESC) { if (c.equals(open)) depth++; else if (c.equals(close)) depth--; } if (depth > 0 && last != 0) accum.append(c); // don't include the outer match pair in the return last = c; } while (depth > 0); return accum.toString(); } /** * Unescaped a \ escaped string. * @param in backslash escaped string * @return unescaped string */ public static String unescape(String in) { StringBuilder out = new StringBuilder(); char last = 0; for (char c : in.toCharArray()) { if (c == ESC) { if (last != 0 && last == ESC) out.append(c); } else out.append(c); last = c; } return out.toString(); } /** * Pulls the next run of whitespace characters of the queue. */ public boolean consumeWhitespace() { boolean seen = false; while (matchesWhitespace()) { pos++; seen = true; } return seen; } /** * Retrieves the next run of word type (letter or digit) off the queue. * @return String of word characters from queue, or empty string if none. 
*/
    public String consumeWord() {
        int start = pos;
        while (matchesWord())
            pos++;
        return queue.substring(start, pos);
    }

    /**
     * Consume a tag name off the queue (word or :, _, -)
     *
     * @return tag name
     */
    public String consumeTagName() {
        int start = pos;
        while (!isEmpty() && (matchesWord() || matchesAny(':', '_', '-')))
            pos++;

        return queue.substring(start, pos);
    }

    /**
     * Consume a CSS element selector (tag name, but | instead of : for namespaces, to not conflict with :pseudo selects).
     *
     * @return tag name
     */
    public String consumeElementSelector() {
        int start = pos;
        while (!isEmpty() && (matchesWord() || matchesAny('|', '_', '-')))
            pos++;

        return queue.substring(start, pos);
    }

    /**
     Consume a CSS identifier (ID or class) off the queue (letter, digit, -, _)
     http://www.w3.org/TR/CSS2/syndata.html#value-def-identifier
     @return identifier
     */
    public String consumeCssIdentifier() {
        int start = pos;
        while (!isEmpty() && (matchesWord() || matchesAny('-', '_')))
            pos++;

        return queue.substring(start, pos);
    }

    /**
     Consume an attribute key off the queue (letter, digit, -, _, :)
     @return attribute key
     */
    public String consumeAttributeKey() {
        int start = pos;
        while (!isEmpty() && (matchesWord() || matchesAny('-', '_', ':')))
            pos++;

        return queue.substring(start, pos);
    }

    /**
     Consume and return whatever is left on the queue. Afterwards the position is at the end of the queue.
     @return remainder of queue.
     */
    public String remainder() {
        // take the tail in one step rather than appending it character by character
        String rest = queue.substring(pos);
        pos = queue.length();
        return rest;
    }

    /**
     Returns the unconsumed part of the queue, without consuming it.
     @return the queue contents from the current position onwards.
     */
    @Override
    public String toString() {
        return queue.substring(pos);
    }
}