List of usage examples for org.apache.commons.httpclient.util EncodingUtil getString
public static String getString(final byte[] data, int offset, int length, String charset)
From source file:is.landsbokasafn.deduplicator.indexer.WarcFileIterator.java
protected static CrawlDataItem processResponse(WARCRecord record, ArchiveRecordHeader header) throws IOException { CrawlDataItem cdi = new CrawlDataItem(); cdi.setURL(header.getUrl());//from w w w . j a v a 2s . c o m cdi.setContentDigest((String) header.getHeaderValue(WARCConstants.HEADER_KEY_PAYLOAD_DIGEST)); cdi.setRevisit(false); cdi.setTimestamp(header.getDate()); cdi.setWarcRecordId((String) header.getHeaderValue(WARCConstants.HEADER_KEY_ID)); // Process the HTTP header, if any byte[] statusBytes = HttpParser.readRawLine(record); int eolCharCount = getEolCharsCount(statusBytes); if (eolCharCount > 0) { String statusLine = EncodingUtil.getString(statusBytes, 0, statusBytes.length - eolCharCount, WARCConstants.DEFAULT_ENCODING); if ((statusLine != null) && StatusLine.startsWithHTTP(statusLine)) { StatusLine status = new StatusLine(statusLine); cdi.setStatusCode(status.getStatusCode()); Header[] headers = HttpParser.parseHeaders(record, WARCConstants.DEFAULT_ENCODING); for (Header h : headers) { if (h.getName().equalsIgnoreCase("Content-Type")) { cdi.setMimeType(h.getValue()); } else if (h.getName().equalsIgnoreCase("ETag")) { cdi.setEtag(h.getValue()); } } } } return cdi; }
From source file:com.cyberway.issue.io.arc.ARCRecord.java
/** * Read http header if present. Technique borrowed from HttpClient HttpParse * class./*from w ww .j av a2 s . c o m*/ * * @return ByteArrayInputStream with the http header in it or null if no * http header. * @throws IOException */ private InputStream readHttpHeader() throws IOException { // If judged a record that doesn't have an http header, return // immediately. if (!getHeader().getUrl().startsWith("http") || getHeader().getLength() <= MIN_HTTP_HEADER_LENGTH) { return null; } byte[] statusBytes = HttpParser.readRawLine(getIn()); int eolCharCount = getEolCharsCount(statusBytes); if (eolCharCount <= 0) { throw new IOException("Failed to read http status where one was expected: " + ((statusBytes == null) ? "" : new String(statusBytes))); } String statusLine = EncodingUtil.getString(statusBytes, 0, statusBytes.length - eolCharCount, ARCConstants.DEFAULT_ENCODING); if ((statusLine == null) || !StatusLine.startsWithHTTP(statusLine)) { if (statusLine.startsWith("DELETED")) { // Some old ARCs have deleted records like following: // http://vireo.gatech.edu:80/ebt-bin/nph-dweb/dynaweb/SGI_Developer/SGITCL_PG/@Generic__BookTocView/11108%3Btd%3D2 130.207.168.42 19991010131803 text/html 29202 // DELETED_TIME=20000425001133_DELETER=Kurt_REASON=alexalist // (follows ~29K spaces) // For now, throw a RecoverableIOException so if iterating over // records, we keep going. TODO: Later make a legitimate // ARCRecord from the deleted record rather than throw // exception. throw new DeletedARCRecordIOException(statusLine); } else { throw new IOException("Failed parse of http status line."); } } this.httpStatus = new StatusLine(statusLine); // Save off all bytes read. Keep them as bytes rather than // convert to strings so we don't have to worry about encodings // though this should never be a problem doing http headers since // its all supposed to be ascii. ByteArrayOutputStream baos = new ByteArrayOutputStream(statusBytes.length + 4 * 1024); baos.write(statusBytes); // Now read rest of the header lines looking for the separation // between header and body. for (byte[] lineBytes = null; true;) { lineBytes = HttpParser.readRawLine(getIn()); eolCharCount = getEolCharsCount(lineBytes); if (eolCharCount <= 0) { throw new IOException( "Failed reading http headers: " + ((lineBytes != null) ? new String(lineBytes) : null)); } // Save the bytes read. baos.write(lineBytes); if ((lineBytes.length - eolCharCount) <= 0) { // We've finished reading the http header. break; } } byte[] headerBytes = baos.toByteArray(); // Save off where body starts. this.getMetaData().setContentBegin(headerBytes.length); ByteArrayInputStream bais = new ByteArrayInputStream(headerBytes); if (!bais.markSupported()) { throw new IOException("ByteArrayInputStream does not support mark"); } bais.mark(headerBytes.length); // Read the status line. Don't let it into the parseHeaders function. // It doesn't know what to do with it. bais.read(statusBytes, 0, statusBytes.length); this.httpHeaders = HttpParser.parseHeaders(bais, ARCConstants.DEFAULT_ENCODING); this.getMetaData().setStatusCode(Integer.toString(getStatusCode())); bais.reset(); return bais; }
From source file:dk.netarkivet.wayback.batch.copycode.NetarchiveSuiteWARCRecordToSearchResultAdapter.java
private CaptureSearchResult adaptWARCHTTPResponse(CaptureSearchResult result, WARCRecord rec) throws IOException { ArchiveRecordHeader header = rec.getHeader(); // need to parse the documents HTTP message and headers here: WARCReader // does not implement this... yet.. byte[] statusBytes = HttpParser.readRawLine(rec); int eolCharCount = getEolCharsCount(statusBytes); if (eolCharCount <= 0) { throw new RecoverableIOException("Failed to read http status where one " + " was expected: " + ((statusBytes == null) ? "(null)" : new String(statusBytes))); }/*w w w .j a v a 2s . c o m*/ String statusLine = EncodingUtil.getString(statusBytes, 0, statusBytes.length - eolCharCount, ARCConstants.DEFAULT_ENCODING); if ((statusLine == null) || !StatusLine.startsWithHTTP(statusLine)) { throw new RecoverableIOException("Failed parse of http status line."); } StatusLine status = new StatusLine(statusLine); result.setHttpCode(String.valueOf(status.getStatusCode())); Header[] headers = HttpParser.parseHeaders(rec, ARCConstants.DEFAULT_ENCODING); annotater.annotateHTTPContent(result, rec, headers, header.getMimetype()); return result; }
From source file:org.archive.io.arc.ARCRecord.java
/** * Read http header if present. Technique borrowed from HttpClient HttpParse * class. set errors when found./*from w w w . j av a2 s .com*/ * * @return ByteArrayInputStream with the http header in it or null if no * http header. * @throws IOException */ private InputStream readHttpHeader() throws IOException { // this can be helpful when simply iterating over records, // looking for problems. Logger logger = Logger.getLogger(this.getClass().getName()); ArchiveRecordHeader h = this.getHeader(); // If judged a record that doesn't have an http header, return // immediately. String url = getHeader().getUrl(); if (!url.startsWith("http") || getHeader().getLength() <= MIN_HTTP_HEADER_LENGTH) { return null; } String statusLine; byte[] statusBytes; int eolCharCount = 0; int errOffset = 0; // Read status line, skipping any errant http headers found before it // This allows a larger number of 'corrupt' arcs -- where headers were accidentally // inserted before the status line to be readable while (true) { statusBytes = LaxHttpParser.readRawLine(getIn()); eolCharCount = getEolCharsCount(statusBytes); if (eolCharCount <= 0) { throw new RecoverableIOException("Failed to read http status where one was expected: " + ((statusBytes == null) ? "" : new String(statusBytes))); } statusLine = EncodingUtil.getString(statusBytes, 0, statusBytes.length - eolCharCount, ARCConstants.DEFAULT_ENCODING); // If a null or DELETED break immediately if ((statusLine == null) || statusLine.startsWith("DELETED")) { break; } // If it's actually the status line, break, otherwise continue skipping any // previous header values if (!statusLine.contains(":") && StatusLine.startsWithHTTP(statusLine)) { break; } // Add bytes read to error "offset" to add to position errOffset += statusBytes.length; } if (errOffset > 0) { this.incrementPosition(errOffset); } if ((statusLine == null) || !StatusLine.startsWithHTTP(statusLine)) { if (statusLine.startsWith("DELETED")) { // Some old ARCs have deleted records like following: // http://vireo.gatech.edu:80/ebt-bin/nph-dweb/dynaweb/SGI_Developer/SGITCL_PG/@Generic__BookTocView/11108%3Btd%3D2 130.207.168.42 19991010131803 text/html 29202 // DELETED_TIME=20000425001133_DELETER=Kurt_REASON=alexalist // (follows ~29K spaces) // For now, throw a RecoverableIOException so if iterating over // records, we keep going. TODO: Later make a legitimate // ARCRecord from the deleted record rather than throw // exception. throw new DeletedARCRecordIOException(statusLine); } else { this.errors.add(ArcRecordErrors.HTTP_STATUS_LINE_INVALID); } } try { this.httpStatus = new StatusLine(statusLine); } catch (IOException e) { logger.warning(e.getMessage() + " at offset: " + h.getOffset()); this.errors.add(ArcRecordErrors.HTTP_STATUS_LINE_EXCEPTION); } // Save off all bytes read. Keep them as bytes rather than // convert to strings so we don't have to worry about encodings // though this should never be a problem doing http headers since // its all supposed to be ascii. ByteArrayOutputStream baos = new ByteArrayOutputStream(statusBytes.length + 4 * 1024); baos.write(statusBytes); // Now read rest of the header lines looking for the separation // between header and body. for (byte[] lineBytes = null; true;) { lineBytes = LaxHttpParser.readRawLine(getIn()); eolCharCount = getEolCharsCount(lineBytes); if (eolCharCount <= 0) { if (getIn().available() == 0) { httpHeaderBytesRead += statusBytes.length; logger.warning("HTTP header truncated at offset: " + h.getOffset()); this.errors.add(ArcRecordErrors.HTTP_HEADER_TRUNCATED); this.setEor(true); break; } else { throw new IOException( "Failed reading http headers: " + ((lineBytes != null) ? new String(lineBytes) : null)); } } else { httpHeaderBytesRead += lineBytes.length; } // Save the bytes read. baos.write(lineBytes); if ((lineBytes.length - eolCharCount) <= 0) { // We've finished reading the http header. break; } } byte[] headerBytes = baos.toByteArray(); // Save off where body starts. this.getMetaData().setContentBegin(headerBytes.length); ByteArrayInputStream bais = new ByteArrayInputStream(headerBytes); if (!bais.markSupported()) { throw new IOException("ByteArrayInputStream does not support mark"); } bais.mark(headerBytes.length); // Read the status line. Don't let it into the parseHeaders function. // It doesn't know what to do with it. bais.read(statusBytes, 0, statusBytes.length); this.httpHeaders = LaxHttpParser.parseHeaders(bais, ARCConstants.DEFAULT_ENCODING); this.getMetaData().setStatusCode(Integer.toString(getStatusCode())); bais.reset(); return bais; }
From source file:org.archive.io.HeaderedArchiveRecord.java
/** * Read header if present. Technique borrowed from HttpClient HttpParse * class. Using http parser code for now. Later move to more generic header * parsing code if there proves a need.// ww w. java 2 s . c om * * @return ByteArrayInputStream with the http header in it or null if no * http header. * @throws IOException */ private InputStream readContentHeaders() throws IOException { // If judged a record that doesn't have an http header, return // immediately. if (!hasContentHeaders()) { return null; } byte[] statusBytes = LaxHttpParser.readRawLine(getIn()); int eolCharCount = getEolCharsCount(statusBytes); if (eolCharCount <= 0) { throw new IOException( "Failed to read raw lie where one " + " was expected: " + new String(statusBytes)); } String statusLine = EncodingUtil.getString(statusBytes, 0, statusBytes.length - eolCharCount, ARCConstants.DEFAULT_ENCODING); if (statusLine == null) { throw new NullPointerException("Expected status line is null"); } // TODO: Tighten up this test. boolean isHttpResponse = StatusLine.startsWithHTTP(statusLine); boolean isHttpRequest = false; if (!isHttpResponse) { isHttpRequest = statusLine.toUpperCase().startsWith("GET") || !statusLine.toUpperCase().startsWith("POST"); } if (!isHttpResponse && !isHttpRequest) { throw new UnexpectedStartLineIOException("Failed parse of " + "status line: " + statusLine); } this.statusCode = isHttpResponse ? (new StatusLine(statusLine)).getStatusCode() : -1; // Save off all bytes read. Keep them as bytes rather than // convert to strings so we don't have to worry about encodings // though this should never be a problem doing http headers since // its all supposed to be ascii. ByteArrayOutputStream baos = new ByteArrayOutputStream(statusBytes.length + 4 * 1024); baos.write(statusBytes); // Now read rest of the header lines looking for the separation // between header and body. for (byte[] lineBytes = null; true;) { lineBytes = LaxHttpParser.readRawLine(getIn()); eolCharCount = getEolCharsCount(lineBytes); if (eolCharCount <= 0) { throw new IOException( "Failed reading headers: " + ((lineBytes != null) ? new String(lineBytes) : null)); } // Save the bytes read. baos.write(lineBytes); if ((lineBytes.length - eolCharCount) <= 0) { // We've finished reading the http header. break; } } byte[] headerBytes = baos.toByteArray(); // Save off where content body, post content headers, starts. this.contentHeadersLength = headerBytes.length; ByteArrayInputStream bais = new ByteArrayInputStream(headerBytes); if (!bais.markSupported()) { throw new IOException("ByteArrayInputStream does not support mark"); } bais.mark(headerBytes.length); // Read the status line. Don't let it into the parseHeaders function. // It doesn't know what to do with it. bais.read(statusBytes, 0, statusBytes.length); this.contentHeaders = LaxHttpParser.parseHeaders(bais, ARCConstants.DEFAULT_ENCODING); bais.reset(); return bais; }
From source file:org.archive.util.LaxHttpParser.java
/** * Read up to <tt>"\n"</tt> from an (unchunked) input stream. * If the stream ends before the line terminator is found, * the last part of the string will still be returned. * If no input data available, <code>null</code> is returned. * * @param inputStream the stream to read from * @param charset charset of HTTP protocol elements * * @throws IOException if an I/O problem occurs * @return a line from the stream/*w w w . j a va 2s . c om*/ * * @since 3.0 */ public static String readLine(InputStream inputStream, String charset) throws IOException { LOG.trace("enter LaxHttpParser.readLine(InputStream, String)"); byte[] rawdata = readRawLine(inputStream); if (rawdata == null) { return null; } // strip CR and LF from the end int len = rawdata.length; int offset = 0; if (len > 0) { if (rawdata[len - 1] == '\n') { offset++; if (len > 1) { if (rawdata[len - 2] == '\r') { offset++; } } } } return EncodingUtil.getString(rawdata, 0, len - offset, charset); }
From source file:org.archive.wayback.resourcestore.indexer.WARCRecordToSearchResultAdapter.java
private CaptureSearchResult adaptWARCHTTPResponse(CaptureSearchResult result, WARCRecord rec) throws IOException { ArchiveRecordHeader header = rec.getHeader(); // need to parse the documents HTTP message and headers here: WARCReader // does not implement this... yet.. byte[] statusBytes = LaxHttpParser.readRawLine(rec); int eolCharCount = getEolCharsCount(statusBytes); if (eolCharCount <= 0) { throw new RecoverableIOException("Failed to read http status where one " + " was expected: " + ((statusBytes == null) ? "(null)" : new String(statusBytes))); }/* w ww . j a va 2 s . c om*/ String statusLine = EncodingUtil.getString(statusBytes, 0, statusBytes.length - eolCharCount, ARCConstants.DEFAULT_ENCODING); if ((statusLine == null) || !StatusLine.startsWithHTTP(statusLine)) { throw new RecoverableIOException("Failed parse of http status line."); } StatusLine status = new StatusLine(statusLine); result.setHttpCode(String.valueOf(status.getStatusCode())); Header[] headers = LaxHttpParser.parseHeaders(rec, ARCConstants.DEFAULT_ENCODING); annotater.annotateHTTPContent(result, rec, headers, header.getMimetype()); return result; }
From source file:org.archive.wayback.resourcestore.resourcefile.WarcResource.java
public void parseHeaders() throws IOException { if (parsedHeaders) { return;// w w w . jav a 2s . c om } // If warc or arc record is 0 length, don't do any more parsing! // Hopefully caller code will check this before proceeding as well if (getRecordLength() <= 0) { parsedHeaders = true; return; } // WARCRecord should have getRecordType() method returning WARCRecordType. String rectypeStr = (String) rec.getHeader().getHeaderValue("WARC-Type"); WARCRecordType rectype; try { rectype = WARCRecordType.valueOf(rectypeStr); } catch (IllegalArgumentException ex) { throw new RecoverableIOException("unrecognized WARC-Type \"" + rectypeStr + "\""); } if (rectype == WARCRecordType.response || rectype == WARCRecordType.revisit) { byte[] statusBytes = LaxHttpParser.readRawLine(rec); int eolCharCount = getEolCharsCount(statusBytes); if (eolCharCount <= 0) { throw new RecoverableIOException( "Failed to read http status where one " + " was expected: " + new String(statusBytes)); } String statusLineStr = EncodingUtil.getString(statusBytes, 0, statusBytes.length - eolCharCount, ARCConstants.DEFAULT_ENCODING); if ((statusLineStr == null) || !StatusLine.startsWithHTTP(statusLineStr)) { throw new RecoverableIOException("Failed parse of http status line."); } StatusLine statusLine = new StatusLine(statusLineStr); this.status = statusLine.getStatusCode(); Header[] tmpHeaders = LaxHttpParser.parseHeaders(rec, ARCConstants.DEFAULT_ENCODING); headers = new Hashtable<String, String>(); this.setInputStream(rec); for (Header header : tmpHeaders) { headers.put(header.getName(), header.getValue()); if (header.getName().toUpperCase().contains(HttpHeaderOperation.HTTP_TRANSFER_ENC_HEADER)) { if (header.getValue().toUpperCase() .contains(HttpHeaderOperation.HTTP_CHUNKED_ENCODING_HEADER)) { setChunkedEncoding(); } } } } else if (rectype == WARCRecordType.metadata || rectype == WARCRecordType.resource) { status = 200; headers = new HashMap<String, String>(); String ct = (String) rec.getHeader().getHeaderValue("Content-Type"); if (ct != null) { headers.put("Content-Type", ct); } // necessary? String date = rec.getHeader().getDate(); if (date != null) { try { Date d = org.apache.commons.lang.time.DateUtils.parseDate(date, new String[] { "yyyy-MM-dd'T'HH:mm:ss'Z'" }); String httpDate = DateUtils.getRFC1123Date(d); headers.put("Date", httpDate); } catch (ParseException ex) { // } } setInputStream(rec); } parsedHeaders = true; }
From source file:org.folg.werelatedata.editor.PageEditor.java
private String getResponse(HttpMethodBase m) throws IOException { InputStream s = m.getResponseBodyAsStream(); int bytesRead = -1; int totalBytes = 0; int bytesToRead = BUF_SIZE; byte[] buf = new byte[BUF_SIZE]; while (true) { bytesRead = s.read(buf, totalBytes, bytesToRead); if (bytesRead < 0) { break; }//from w ww. ja v a 2s . com totalBytes += bytesRead; bytesToRead -= bytesRead; if (bytesToRead == 0) { // buffer full, so allocate more if (buf.length * 2 > MAX_BUF_SIZE) { throw new IOException("Response too long: " + m.getURI().toString()); } byte[] temp = buf; buf = new byte[temp.length * 2]; System.arraycopy(temp, 0, buf, 0, temp.length); bytesToRead = temp.length; } } if (totalBytes > 0) { return EncodingUtil.getString(buf, 0, totalBytes, m.getResponseCharSet()); } else { return null; } }
From source file:org.webcurator.domain.model.core.ArcHarvestFileDTO.java
private void indexWARCResponse(ArchiveRecord rec, Map<String, HarvestResourceDTO> results) throws IOException { WARCRecord record = (WARCRecord) rec; ArchiveRecordHeader header = record.getHeader(); // If the URL length is too long for the database, skip adding the URL // to the index. This ensures that the harvest completes successfully. if (header.getUrl().length() > MAX_URL_LENGTH) { return;/*from w w w . j a va2s .com*/ } try { ArcHarvestResourceDTO res = new ArcHarvestResourceDTO(); res.setArcFileName(this.getName()); res.setName(header.getUrl()); res.setResourceOffset(header.getOffset()); res.setCompressed(this.isCompressed()); // need to parse the documents HTTP message and headers here: WARCReader // does not implement this... byte[] statusBytes = HttpParser.readRawLine(record); int eolCharCount = getEolCharsCount(statusBytes); if (eolCharCount <= 0) { throw new RecoverableIOException( "Failed to read http status where one " + " was expected: " + new String(statusBytes)); } String statusLine = EncodingUtil.getString(statusBytes, 0, statusBytes.length - eolCharCount, WARCConstants.DEFAULT_ENCODING); if ((statusLine == null) || !StatusLine.startsWithHTTP(statusLine)) { throw new RecoverableIOException("Failed parse of http status line."); } StatusLine status = new StatusLine(statusLine); res.setStatusCode(status.getStatusCode()); // Calculate the length. long length = header.getLength() - header.getContentBegin(); res.setLength(length); results.put(res.getName(), res); } finally { rec.close(); } }