Example usage for org.apache.commons.httpclient HttpParser readRawLine

List of usage examples for org.apache.commons.httpclient HttpParser readRawLine

Introduction

On this page you can find example usages of org.apache.commons.httpclient HttpParser readRawLine.

Prototype

public static byte[] readRawLine(InputStream paramInputStream) throws IOException 

Source Link

Usage

From source file:is.landsbokasafn.deduplicator.indexer.WarcFileIterator.java

/**
 * Builds a {@link CrawlDataItem} from a WARC response record.
 *
 * <p>The URL, payload digest, timestamp and record id are copied from the
 * WARC header. The first line of the record body is then read; if it ends in
 * a line terminator and looks like an HTTP status line, the status code and
 * the {@code Content-Type} / {@code ETag} header values are recorded as well.
 * Otherwise the item is returned with only the WARC-header fields populated.
 *
 * @param record the record whose payload is read (its stream is advanced)
 * @param header the WARC header belonging to {@code record}
 * @return the populated crawl data item, never {@code null}
 * @throws IOException if reading from the record fails
 */
protected static CrawlDataItem processResponse(WARCRecord record, ArchiveRecordHeader header)
        throws IOException {
    CrawlDataItem item = new CrawlDataItem();
    item.setURL(header.getUrl());
    item.setContentDigest((String) header.getHeaderValue(WARCConstants.HEADER_KEY_PAYLOAD_DIGEST));
    item.setRevisit(false);
    item.setTimestamp(header.getDate());
    item.setWarcRecordId((String) header.getHeaderValue(WARCConstants.HEADER_KEY_ID));

    // The HTTP part is optional: bail out quietly as soon as it turns out
    // the payload does not start with a terminated HTTP status line.
    byte[] rawStatus = HttpParser.readRawLine(record);
    int eolLength = getEolCharsCount(rawStatus);
    if (eolLength <= 0) {
        return item;
    }

    String statusText = EncodingUtil.getString(rawStatus, 0, rawStatus.length - eolLength,
            WARCConstants.DEFAULT_ENCODING);
    if (statusText == null || !StatusLine.startsWithHTTP(statusText)) {
        return item;
    }

    item.setStatusCode(new StatusLine(statusText).getStatusCode());

    // Only two headers are of interest; everything else is ignored.
    for (Header httpHeader : HttpParser.parseHeaders(record, WARCConstants.DEFAULT_ENCODING)) {
        String name = httpHeader.getName();
        if (name.equalsIgnoreCase("Content-Type")) {
            item.setMimeType(httpHeader.getValue());
        } else if (name.equalsIgnoreCase("ETag")) {
            item.setEtag(httpHeader.getValue());
        }
    }
    return item;
}

From source file:com.cyberway.issue.io.arc.ARCRecord.java

/**
* Read http header if present. Technique borrowed from HttpClient HttpParse
* class./*from   w w  w .j a  v  a 2  s. co m*/
* 
* @return ByteArrayInputStream with the http header in it or null if no
*         http header.
* @throws IOException
*/
private InputStream readHttpHeader() throws IOException {
    // If judged a record that doesn't have an http header, return
    // immediately.
    if (!getHeader().getUrl().startsWith("http") || getHeader().getLength() <= MIN_HTTP_HEADER_LENGTH) {
        return null;
    }
    byte[] statusBytes = HttpParser.readRawLine(getIn());
    int eolCharCount = getEolCharsCount(statusBytes);
    if (eolCharCount <= 0) {
        throw new IOException("Failed to read http status where one was expected: "
                + ((statusBytes == null) ? "" : new String(statusBytes)));
    }
    String statusLine = EncodingUtil.getString(statusBytes, 0, statusBytes.length - eolCharCount,
            ARCConstants.DEFAULT_ENCODING);
    if ((statusLine == null) || !StatusLine.startsWithHTTP(statusLine)) {
        if (statusLine.startsWith("DELETED")) {
            // Some old ARCs have deleted records like following:
            // http://vireo.gatech.edu:80/ebt-bin/nph-dweb/dynaweb/SGI_Developer/SGITCL_PG/@Generic__BookTocView/11108%3Btd%3D2 130.207.168.42 19991010131803 text/html 29202
            // DELETED_TIME=20000425001133_DELETER=Kurt_REASON=alexalist
            // (follows ~29K spaces)
            // For now, throw a RecoverableIOException so if iterating over
            // records, we keep going.  TODO: Later make a legitimate
            // ARCRecord from the deleted record rather than throw
            // exception.
            throw new DeletedARCRecordIOException(statusLine);
        } else {
            throw new IOException("Failed parse of http status line.");
        }
    }
    this.httpStatus = new StatusLine(statusLine);

    // Save off all bytes read.  Keep them as bytes rather than
    // convert to strings so we don't have to worry about encodings
    // though this should never be a problem doing http headers since
    // its all supposed to be ascii.
    ByteArrayOutputStream baos = new ByteArrayOutputStream(statusBytes.length + 4 * 1024);
    baos.write(statusBytes);

    // Now read rest of the header lines looking for the separation
    // between header and body.
    for (byte[] lineBytes = null; true;) {
        lineBytes = HttpParser.readRawLine(getIn());
        eolCharCount = getEolCharsCount(lineBytes);
        if (eolCharCount <= 0) {
            throw new IOException(
                    "Failed reading http headers: " + ((lineBytes != null) ? new String(lineBytes) : null));
        }
        // Save the bytes read.
        baos.write(lineBytes);
        if ((lineBytes.length - eolCharCount) <= 0) {
            // We've finished reading the http header.
            break;
        }
    }

    byte[] headerBytes = baos.toByteArray();
    // Save off where body starts.
    this.getMetaData().setContentBegin(headerBytes.length);
    ByteArrayInputStream bais = new ByteArrayInputStream(headerBytes);
    if (!bais.markSupported()) {
        throw new IOException("ByteArrayInputStream does not support mark");
    }
    bais.mark(headerBytes.length);
    // Read the status line.  Don't let it into the parseHeaders function.
    // It doesn't know what to do with it.
    bais.read(statusBytes, 0, statusBytes.length);
    this.httpHeaders = HttpParser.parseHeaders(bais, ARCConstants.DEFAULT_ENCODING);
    this.getMetaData().setStatusCode(Integer.toString(getStatusCode()));
    bais.reset();
    return bais;
}

From source file:dk.netarkivet.wayback.batch.copycode.NetarchiveSuiteWARCRecordToSearchResultAdapter.java

/**
 * Fills in the HTTP-level details of a capture from a WARC response record.
 *
 * <p>The HTTP status line and headers are parsed here from the record
 * payload. The status code is stored on {@code result} and the parsed
 * headers are handed to the annotater together with the WARC header's
 * mimetype.
 *
 * @param result the search result to enrich
 * @param rec    the WARC record whose payload starts with an HTTP response
 * @return the same {@code result} instance
 * @throws IOException a {@link RecoverableIOException} if the status line is
 *         unterminated/missing or does not look like HTTP; other I/O errors
 *         from reading the record
 */
private CaptureSearchResult adaptWARCHTTPResponse(CaptureSearchResult result, WARCRecord rec)
        throws IOException {
    ArchiveRecordHeader header = rec.getHeader();

    // First line of the payload must be a terminated HTTP status line.
    byte[] rawStatus = HttpParser.readRawLine(rec);
    int eolLength = getEolCharsCount(rawStatus);
    if (eolLength <= 0) {
        String seen = (rawStatus == null) ? "(null)" : new String(rawStatus);
        throw new RecoverableIOException("Failed to read http status where one " + " was expected: " + seen);
    }

    String statusText = EncodingUtil.getString(rawStatus, 0, rawStatus.length - eolLength,
            ARCConstants.DEFAULT_ENCODING);
    boolean looksLikeHttp = (statusText != null) && StatusLine.startsWithHTTP(statusText);
    if (!looksLikeHttp) {
        throw new RecoverableIOException("Failed parse of http status line.");
    }

    int httpCode = new StatusLine(statusText).getStatusCode();
    result.setHttpCode(String.valueOf(httpCode));

    // Remaining header lines follow the status line in the same stream.
    Header[] httpHeaders = HttpParser.parseHeaders(rec, ARCConstants.DEFAULT_ENCODING);
    annotater.annotateHTTPContent(result, rec, httpHeaders, header.getMimetype());

    return result;
}

From source file:org.webcurator.domain.model.core.ArcHarvestFileDTO.java

/**
 * Indexes a WARC response record into {@code results}, keyed by its URL.
 *
 * <p>Records whose URL is longer than {@code MAX_URL_LENGTH} are skipped.
 * For all others the HTTP status line is parsed from the record payload and
 * a resource entry with name, offset, compression flag, status code and
 * length is added to the map. The record is closed once processing was
 * attempted, whether or not it succeeded.
 *
 * @param rec     the record to index; cast to {@link WARCRecord}
 * @param results map receiving the new entry under the record's URL
 * @throws IOException a {@link RecoverableIOException} if the status line is
 *         missing, unterminated or not HTTP; other I/O errors from reading
 *         or closing the record
 */
private void indexWARCResponse(ArchiveRecord rec, Map<String, HarvestResourceDTO> results) throws IOException {

    WARCRecord record = (WARCRecord) rec;
    ArchiveRecordHeader header = record.getHeader();

    // If the URL length is too long for the database, skip adding the URL
    // to the index. This ensures that the harvest completes successfully.
    // NOTE(review): this early return does not close rec - confirm the
    // caller takes care of that.
    if (header.getUrl().length() > MAX_URL_LENGTH) {
        return;
    }

    try {
        ArcHarvestResourceDTO res = new ArcHarvestResourceDTO();
        res.setArcFileName(this.getName());
        res.setName(header.getUrl());
        res.setResourceOffset(header.getOffset());
        res.setCompressed(this.isCompressed());

        // need to parse the documents HTTP message and headers here: WARCReader
        // does not implement this...

        byte[] statusBytes = HttpParser.readRawLine(record);
        int eolCharCount = getEolCharsCount(statusBytes);
        if (eolCharCount <= 0) {
            // readRawLine yields null at end of stream; guard so the caller
            // gets the recoverable exception rather than a NullPointerException.
            throw new RecoverableIOException("Failed to read http status where one " + " was expected: "
                    + ((statusBytes == null) ? "(null)" : new String(statusBytes)));
        }
        String statusLine = EncodingUtil.getString(statusBytes, 0, statusBytes.length - eolCharCount,
                WARCConstants.DEFAULT_ENCODING);
        if ((statusLine == null) || !StatusLine.startsWithHTTP(statusLine)) {
            throw new RecoverableIOException("Failed parse of http status line.");
        }
        StatusLine status = new StatusLine(statusLine);

        res.setStatusCode(status.getStatusCode());

        // Calculate the length.
        long length = header.getLength() - header.getContentBegin();
        res.setLength(length);

        results.put(res.getName(), res);
    } finally {
        rec.close();
    }
}