List of usage examples for org.apache.commons.httpclient HttpParser readRawLine
public static byte[] readRawLine(InputStream paramInputStream) throws IOException
From source file:is.landsbokasafn.deduplicator.indexer.WarcFileIterator.java
protected static CrawlDataItem processResponse(WARCRecord record, ArchiveRecordHeader header) throws IOException { CrawlDataItem cdi = new CrawlDataItem(); cdi.setURL(header.getUrl());//from w w w. ja va 2 s. com cdi.setContentDigest((String) header.getHeaderValue(WARCConstants.HEADER_KEY_PAYLOAD_DIGEST)); cdi.setRevisit(false); cdi.setTimestamp(header.getDate()); cdi.setWarcRecordId((String) header.getHeaderValue(WARCConstants.HEADER_KEY_ID)); // Process the HTTP header, if any byte[] statusBytes = HttpParser.readRawLine(record); int eolCharCount = getEolCharsCount(statusBytes); if (eolCharCount > 0) { String statusLine = EncodingUtil.getString(statusBytes, 0, statusBytes.length - eolCharCount, WARCConstants.DEFAULT_ENCODING); if ((statusLine != null) && StatusLine.startsWithHTTP(statusLine)) { StatusLine status = new StatusLine(statusLine); cdi.setStatusCode(status.getStatusCode()); Header[] headers = HttpParser.parseHeaders(record, WARCConstants.DEFAULT_ENCODING); for (Header h : headers) { if (h.getName().equalsIgnoreCase("Content-Type")) { cdi.setMimeType(h.getValue()); } else if (h.getName().equalsIgnoreCase("ETag")) { cdi.setEtag(h.getValue()); } } } } return cdi; }
From source file:com.cyberway.issue.io.arc.ARCRecord.java
/** * Read http header if present. Technique borrowed from HttpClient HttpParse * class./*from w w w .j a v a 2 s. co m*/ * * @return ByteArrayInputStream with the http header in it or null if no * http header. * @throws IOException */ private InputStream readHttpHeader() throws IOException { // If judged a record that doesn't have an http header, return // immediately. if (!getHeader().getUrl().startsWith("http") || getHeader().getLength() <= MIN_HTTP_HEADER_LENGTH) { return null; } byte[] statusBytes = HttpParser.readRawLine(getIn()); int eolCharCount = getEolCharsCount(statusBytes); if (eolCharCount <= 0) { throw new IOException("Failed to read http status where one was expected: " + ((statusBytes == null) ? "" : new String(statusBytes))); } String statusLine = EncodingUtil.getString(statusBytes, 0, statusBytes.length - eolCharCount, ARCConstants.DEFAULT_ENCODING); if ((statusLine == null) || !StatusLine.startsWithHTTP(statusLine)) { if (statusLine.startsWith("DELETED")) { // Some old ARCs have deleted records like following: // http://vireo.gatech.edu:80/ebt-bin/nph-dweb/dynaweb/SGI_Developer/SGITCL_PG/@Generic__BookTocView/11108%3Btd%3D2 130.207.168.42 19991010131803 text/html 29202 // DELETED_TIME=20000425001133_DELETER=Kurt_REASON=alexalist // (follows ~29K spaces) // For now, throw a RecoverableIOException so if iterating over // records, we keep going. TODO: Later make a legitimate // ARCRecord from the deleted record rather than throw // exception. throw new DeletedARCRecordIOException(statusLine); } else { throw new IOException("Failed parse of http status line."); } } this.httpStatus = new StatusLine(statusLine); // Save off all bytes read. Keep them as bytes rather than // convert to strings so we don't have to worry about encodings // though this should never be a problem doing http headers since // its all supposed to be ascii. ByteArrayOutputStream baos = new ByteArrayOutputStream(statusBytes.length + 4 * 1024); baos.write(statusBytes); // Now read rest of the header lines looking for the separation // between header and body. for (byte[] lineBytes = null; true;) { lineBytes = HttpParser.readRawLine(getIn()); eolCharCount = getEolCharsCount(lineBytes); if (eolCharCount <= 0) { throw new IOException( "Failed reading http headers: " + ((lineBytes != null) ? new String(lineBytes) : null)); } // Save the bytes read. baos.write(lineBytes); if ((lineBytes.length - eolCharCount) <= 0) { // We've finished reading the http header. break; } } byte[] headerBytes = baos.toByteArray(); // Save off where body starts. this.getMetaData().setContentBegin(headerBytes.length); ByteArrayInputStream bais = new ByteArrayInputStream(headerBytes); if (!bais.markSupported()) { throw new IOException("ByteArrayInputStream does not support mark"); } bais.mark(headerBytes.length); // Read the status line. Don't let it into the parseHeaders function. // It doesn't know what to do with it. bais.read(statusBytes, 0, statusBytes.length); this.httpHeaders = HttpParser.parseHeaders(bais, ARCConstants.DEFAULT_ENCODING); this.getMetaData().setStatusCode(Integer.toString(getStatusCode())); bais.reset(); return bais; }
From source file:dk.netarkivet.wayback.batch.copycode.NetarchiveSuiteWARCRecordToSearchResultAdapter.java
private CaptureSearchResult adaptWARCHTTPResponse(CaptureSearchResult result, WARCRecord rec) throws IOException { ArchiveRecordHeader header = rec.getHeader(); // need to parse the documents HTTP message and headers here: WARCReader // does not implement this... yet.. byte[] statusBytes = HttpParser.readRawLine(rec); int eolCharCount = getEolCharsCount(statusBytes); if (eolCharCount <= 0) { throw new RecoverableIOException("Failed to read http status where one " + " was expected: " + ((statusBytes == null) ? "(null)" : new String(statusBytes))); }/* w w w. j a va 2s .com*/ String statusLine = EncodingUtil.getString(statusBytes, 0, statusBytes.length - eolCharCount, ARCConstants.DEFAULT_ENCODING); if ((statusLine == null) || !StatusLine.startsWithHTTP(statusLine)) { throw new RecoverableIOException("Failed parse of http status line."); } StatusLine status = new StatusLine(statusLine); result.setHttpCode(String.valueOf(status.getStatusCode())); Header[] headers = HttpParser.parseHeaders(rec, ARCConstants.DEFAULT_ENCODING); annotater.annotateHTTPContent(result, rec, headers, header.getMimetype()); return result; }
From source file:org.webcurator.domain.model.core.ArcHarvestFileDTO.java
private void indexWARCResponse(ArchiveRecord rec, Map<String, HarvestResourceDTO> results) throws IOException { WARCRecord record = (WARCRecord) rec; ArchiveRecordHeader header = record.getHeader(); // If the URL length is too long for the database, skip adding the URL // to the index. This ensures that the harvest completes successfully. if (header.getUrl().length() > MAX_URL_LENGTH) { return;/* w w w . j a v a2 s . c o m*/ } try { ArcHarvestResourceDTO res = new ArcHarvestResourceDTO(); res.setArcFileName(this.getName()); res.setName(header.getUrl()); res.setResourceOffset(header.getOffset()); res.setCompressed(this.isCompressed()); // need to parse the documents HTTP message and headers here: WARCReader // does not implement this... byte[] statusBytes = HttpParser.readRawLine(record); int eolCharCount = getEolCharsCount(statusBytes); if (eolCharCount <= 0) { throw new RecoverableIOException( "Failed to read http status where one " + " was expected: " + new String(statusBytes)); } String statusLine = EncodingUtil.getString(statusBytes, 0, statusBytes.length - eolCharCount, WARCConstants.DEFAULT_ENCODING); if ((statusLine == null) || !StatusLine.startsWithHTTP(statusLine)) { throw new RecoverableIOException("Failed parse of http status line."); } StatusLine status = new StatusLine(statusLine); res.setStatusCode(status.getStatusCode()); // Calculate the length. long length = header.getLength() - header.getContentBegin(); res.setLength(length); results.put(res.getName(), res); } finally { rec.close(); } }