Example usage for org.apache.commons.httpclient StatusLine startsWithHTTP

Introduction

On this page you can find example usages of org.apache.commons.httpclient.StatusLine.startsWithHTTP.

Prototype

public static boolean startsWithHTTP(String paramString) 
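
Before the source-file examples below, here is a minimal sketch of the guard-then-parse pattern they all follow. It is not taken from the sources listed under Usage; it assumes commons-httpclient 3.x, and the status line string is hypothetical.

import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.StatusLine;

public class StartsWithHTTPExample {
    public static void main(String[] args) throws HttpException {
        // Hypothetical raw status line, e.g. as read from an ARC/WARC record.
        String raw = "HTTP/1.1 200 OK";
        // Guard with startsWithHTTP before constructing a StatusLine, since the
        // StatusLine(String) constructor throws HttpException on malformed input.
        if (StatusLine.startsWithHTTP(raw)) {
            StatusLine status = new StatusLine(raw);
            System.out.println(status.getHttpVersion());  // HTTP/1.1
            System.out.println(status.getStatusCode());   // 200
            System.out.println(status.getReasonPhrase()); // OK
        }
    }
}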

Usage

From source file:is.landsbokasafn.deduplicator.indexer.WarcFileIterator.java

protected static CrawlDataItem processResponse(WARCRecord record, ArchiveRecordHeader header)
        throws IOException {
    CrawlDataItem cdi = new CrawlDataItem();
    cdi.setURL(header.getUrl());
    cdi.setContentDigest((String) header.getHeaderValue(WARCConstants.HEADER_KEY_PAYLOAD_DIGEST));
    cdi.setRevisit(false);
    cdi.setTimestamp(header.getDate());
    cdi.setWarcRecordId((String) header.getHeaderValue(WARCConstants.HEADER_KEY_ID));

    // Process the HTTP header, if any
    byte[] statusBytes = HttpParser.readRawLine(record);
    int eolCharCount = getEolCharsCount(statusBytes);
    if (eolCharCount > 0) {
        String statusLine = EncodingUtil.getString(statusBytes, 0, statusBytes.length - eolCharCount,
                WARCConstants.DEFAULT_ENCODING);
        if ((statusLine != null) && StatusLine.startsWithHTTP(statusLine)) {
            StatusLine status = new StatusLine(statusLine);
            cdi.setStatusCode(status.getStatusCode());
            Header[] headers = HttpParser.parseHeaders(record, WARCConstants.DEFAULT_ENCODING);
            for (Header h : headers) {
                if (h.getName().equalsIgnoreCase("Content-Type")) {
                    cdi.setMimeType(h.getValue());
                } else if (h.getName().equalsIgnoreCase("ETag")) {
                    cdi.setEtag(h.getValue());
                }
            }
        }
    }

    return cdi;
}

From source file:com.cyberway.issue.io.arc.ARCRecord.java

/**
* Read http header if present. Technique borrowed from HttpClient HttpParse
* class.
* 
* @return ByteArrayInputStream with the http header in it or null if no
*         http header.
* @throws IOException
*/
private InputStream readHttpHeader() throws IOException {
    // If judged a record that doesn't have an http header, return
    // immediately.
    if (!getHeader().getUrl().startsWith("http") || getHeader().getLength() <= MIN_HTTP_HEADER_LENGTH) {
        return null;
    }
    byte[] statusBytes = HttpParser.readRawLine(getIn());
    int eolCharCount = getEolCharsCount(statusBytes);
    if (eolCharCount <= 0) {
        throw new IOException("Failed to read http status where one was expected: "
                + ((statusBytes == null) ? "" : new String(statusBytes)));
    }
    String statusLine = EncodingUtil.getString(statusBytes, 0, statusBytes.length - eolCharCount,
            ARCConstants.DEFAULT_ENCODING);
    if ((statusLine == null) || !StatusLine.startsWithHTTP(statusLine)) {
        if (statusLine.startsWith("DELETED")) {
            // Some old ARCs have deleted records like following:
            // http://vireo.gatech.edu:80/ebt-bin/nph-dweb/dynaweb/SGI_Developer/SGITCL_PG/@Generic__BookTocView/11108%3Btd%3D2 130.207.168.42 19991010131803 text/html 29202
            // DELETED_TIME=20000425001133_DELETER=Kurt_REASON=alexalist
            // (follows ~29K spaces)
            // For now, throw a RecoverableIOException so if iterating over
            // records, we keep going.  TODO: Later make a legitimate
            // ARCRecord from the deleted record rather than throw
            // exception.
            throw new DeletedARCRecordIOException(statusLine);
        } else {
            throw new IOException("Failed parse of http status line.");
        }
    }
    this.httpStatus = new StatusLine(statusLine);

    // Save off all bytes read.  Keep them as bytes rather than
    // convert to strings so we don't have to worry about encodings
    // though this should never be a problem doing http headers since
    // its all supposed to be ascii.
    ByteArrayOutputStream baos = new ByteArrayOutputStream(statusBytes.length + 4 * 1024);
    baos.write(statusBytes);

    // Now read rest of the header lines looking for the separation
    // between header and body.
    for (byte[] lineBytes = null; true;) {
        lineBytes = HttpParser.readRawLine(getIn());
        eolCharCount = getEolCharsCount(lineBytes);
        if (eolCharCount <= 0) {
            throw new IOException(
                    "Failed reading http headers: " + ((lineBytes != null) ? new String(lineBytes) : null));
        }
        // Save the bytes read.
        baos.write(lineBytes);
        if ((lineBytes.length - eolCharCount) <= 0) {
            // We've finished reading the http header.
            break;
        }
    }

    byte[] headerBytes = baos.toByteArray();
    // Save off where body starts.
    this.getMetaData().setContentBegin(headerBytes.length);
    ByteArrayInputStream bais = new ByteArrayInputStream(headerBytes);
    if (!bais.markSupported()) {
        throw new IOException("ByteArrayInputStream does not support mark");
    }
    bais.mark(headerBytes.length);
    // Read the status line.  Don't let it into the parseHeaders function.
    // It doesn't know what to do with it.
    bais.read(statusBytes, 0, statusBytes.length);
    this.httpHeaders = HttpParser.parseHeaders(bais, ARCConstants.DEFAULT_ENCODING);
    this.getMetaData().setStatusCode(Integer.toString(getStatusCode()));
    bais.reset();
    return bais;
}

From source file:dk.netarkivet.wayback.batch.copycode.NetarchiveSuiteWARCRecordToSearchResultAdapter.java

private CaptureSearchResult adaptWARCHTTPResponse(CaptureSearchResult result, WARCRecord rec)
        throws IOException {

    ArchiveRecordHeader header = rec.getHeader();
    // need to parse the documents HTTP message and headers here: WARCReader
    // does not implement this... yet..

    byte[] statusBytes = HttpParser.readRawLine(rec);
    int eolCharCount = getEolCharsCount(statusBytes);
    if (eolCharCount <= 0) {
        throw new RecoverableIOException("Failed to read http status where one " + " was expected: "
                + ((statusBytes == null) ? "(null)" : new String(statusBytes)));
    }
    String statusLine = EncodingUtil.getString(statusBytes, 0, statusBytes.length - eolCharCount,
            ARCConstants.DEFAULT_ENCODING);
    if ((statusLine == null) || !StatusLine.startsWithHTTP(statusLine)) {
        throw new RecoverableIOException("Failed parse of http status line.");
    }
    StatusLine status = new StatusLine(statusLine);
    result.setHttpCode(String.valueOf(status.getStatusCode()));

    Header[] headers = HttpParser.parseHeaders(rec, ARCConstants.DEFAULT_ENCODING);

    annotater.annotateHTTPContent(result, rec, headers, header.getMimetype());

    return result;
}

From source file:org.archive.io.arc.ARCRecord.java

/**
 * Read http header if present. Technique borrowed from HttpClient HttpParse
 * class. Sets errors when found.
 * 
 * @return ByteArrayInputStream with the http header in it or null if no
 *         http header.
 * @throws IOException
 */
private InputStream readHttpHeader() throws IOException {

    // this can be helpful when simply iterating over records, 
    // looking for problems.
    Logger logger = Logger.getLogger(this.getClass().getName());
    ArchiveRecordHeader h = this.getHeader();

    // If judged a record that doesn't have an http header, return
    // immediately.
    String url = getHeader().getUrl();
    if (!url.startsWith("http") || getHeader().getLength() <= MIN_HTTP_HEADER_LENGTH) {
        return null;
    }

    String statusLine;
    byte[] statusBytes;
    int eolCharCount = 0;
    int errOffset = 0;

    // Read status line, skipping any errant http headers found before it
    // This allows a larger number of 'corrupt' arcs -- where headers were accidentally
    // inserted before the status line to be readable
    while (true) {
        statusBytes = LaxHttpParser.readRawLine(getIn());
        eolCharCount = getEolCharsCount(statusBytes);
        if (eolCharCount <= 0) {
            throw new RecoverableIOException("Failed to read http status where one was expected: "
                    + ((statusBytes == null) ? "" : new String(statusBytes)));
        }

        statusLine = EncodingUtil.getString(statusBytes, 0, statusBytes.length - eolCharCount,
                ARCConstants.DEFAULT_ENCODING);

        // If a null or DELETED break immediately
        if ((statusLine == null) || statusLine.startsWith("DELETED")) {
            break;
        }

        // If it's actually the status line, break, otherwise continue skipping any
        // previous header values
        if (!statusLine.contains(":") && StatusLine.startsWithHTTP(statusLine)) {
            break;
        }

        // Add bytes read to error "offset" to add to position
        errOffset += statusBytes.length;
    }

    if (errOffset > 0) {
        this.incrementPosition(errOffset);
    }

    if ((statusLine == null) || !StatusLine.startsWithHTTP(statusLine)) {
        if (statusLine.startsWith("DELETED")) {
            // Some old ARCs have deleted records like following:
            // http://vireo.gatech.edu:80/ebt-bin/nph-dweb/dynaweb/SGI_Developer/SGITCL_PG/@Generic__BookTocView/11108%3Btd%3D2 130.207.168.42 19991010131803 text/html 29202
            // DELETED_TIME=20000425001133_DELETER=Kurt_REASON=alexalist
            // (follows ~29K spaces)
            // For now, throw a RecoverableIOException so if iterating over
            // records, we keep going.  TODO: Later make a legitimate
            // ARCRecord from the deleted record rather than throw
            // exception.
            throw new DeletedARCRecordIOException(statusLine);
        } else {
            this.errors.add(ArcRecordErrors.HTTP_STATUS_LINE_INVALID);
        }
    }

    try {
        this.httpStatus = new StatusLine(statusLine);
    } catch (IOException e) {
        logger.warning(e.getMessage() + " at offset: " + h.getOffset());
        this.errors.add(ArcRecordErrors.HTTP_STATUS_LINE_EXCEPTION);
    }

    // Save off all bytes read.  Keep them as bytes rather than
    // convert to strings so we don't have to worry about encodings
    // though this should never be a problem doing http headers since
    // its all supposed to be ascii.
    ByteArrayOutputStream baos = new ByteArrayOutputStream(statusBytes.length + 4 * 1024);
    baos.write(statusBytes);

    // Now read rest of the header lines looking for the separation
    // between header and body.
    for (byte[] lineBytes = null; true;) {
        lineBytes = LaxHttpParser.readRawLine(getIn());
        eolCharCount = getEolCharsCount(lineBytes);
        if (eolCharCount <= 0) {
            if (getIn().available() == 0) {
                httpHeaderBytesRead += statusBytes.length;
                logger.warning("HTTP header truncated at offset: " + h.getOffset());
                this.errors.add(ArcRecordErrors.HTTP_HEADER_TRUNCATED);
                this.setEor(true);
                break;
            } else {
                throw new IOException(
                        "Failed reading http headers: " + ((lineBytes != null) ? new String(lineBytes) : null));
            }
        } else {
            httpHeaderBytesRead += lineBytes.length;
        }
        // Save the bytes read.
        baos.write(lineBytes);
        if ((lineBytes.length - eolCharCount) <= 0) {
            // We've finished reading the http header.
            break;
        }
    }

    byte[] headerBytes = baos.toByteArray();
    // Save off where body starts.
    this.getMetaData().setContentBegin(headerBytes.length);
    ByteArrayInputStream bais = new ByteArrayInputStream(headerBytes);
    if (!bais.markSupported()) {
        throw new IOException("ByteArrayInputStream does not support mark");
    }
    bais.mark(headerBytes.length);
    // Read the status line.  Don't let it into the parseHeaders function.
    // It doesn't know what to do with it.
    bais.read(statusBytes, 0, statusBytes.length);
    this.httpHeaders = LaxHttpParser.parseHeaders(bais, ARCConstants.DEFAULT_ENCODING);
    this.getMetaData().setStatusCode(Integer.toString(getStatusCode()));
    bais.reset();
    return bais;
}

From source file:org.archive.io.HeaderedArchiveRecord.java

/**
 * Read header if present. Technique borrowed from HttpClient HttpParse
 * class. Using http parser code for now. Later move to more generic header
 * parsing code if there proves a need.
 * 
 * @return ByteArrayInputStream with the http header in it or null if no
 *         http header.
 * @throws IOException
 */
private InputStream readContentHeaders() throws IOException {
    // If judged a record that doesn't have an http header, return
    // immediately.
    if (!hasContentHeaders()) {
        return null;
    }
    byte[] statusBytes = LaxHttpParser.readRawLine(getIn());
    int eolCharCount = getEolCharsCount(statusBytes);
    if (eolCharCount <= 0) {
        throw new IOException(
                "Failed to read raw lie where one " + " was expected: " + new String(statusBytes));
    }
    String statusLine = EncodingUtil.getString(statusBytes, 0, statusBytes.length - eolCharCount,
            ARCConstants.DEFAULT_ENCODING);
    if (statusLine == null) {
        throw new NullPointerException("Expected status line is null");
    }
    // TODO: Tighten up this test.
    boolean isHttpResponse = StatusLine.startsWithHTTP(statusLine);
    boolean isHttpRequest = false;
    if (!isHttpResponse) {
        isHttpRequest = statusLine.toUpperCase().startsWith("GET")
                || statusLine.toUpperCase().startsWith("POST");
    }
    if (!isHttpResponse && !isHttpRequest) {
        throw new UnexpectedStartLineIOException("Failed parse of " + "status line: " + statusLine);
    }
    this.statusCode = isHttpResponse ? (new StatusLine(statusLine)).getStatusCode() : -1;

    // Save off all bytes read.  Keep them as bytes rather than
    // convert to strings so we don't have to worry about encodings
    // though this should never be a problem doing http headers since
    // its all supposed to be ascii.
    ByteArrayOutputStream baos = new ByteArrayOutputStream(statusBytes.length + 4 * 1024);
    baos.write(statusBytes);

    // Now read rest of the header lines looking for the separation
    // between header and body.
    for (byte[] lineBytes = null; true;) {
        lineBytes = LaxHttpParser.readRawLine(getIn());
        eolCharCount = getEolCharsCount(lineBytes);
        if (eolCharCount <= 0) {
            throw new IOException(
                    "Failed reading headers: " + ((lineBytes != null) ? new String(lineBytes) : null));
        }
        // Save the bytes read.
        baos.write(lineBytes);
        if ((lineBytes.length - eolCharCount) <= 0) {
            // We've finished reading the http header.
            break;
        }
    }

    byte[] headerBytes = baos.toByteArray();
    // Save off where content body, post content headers, starts.
    this.contentHeadersLength = headerBytes.length;
    ByteArrayInputStream bais = new ByteArrayInputStream(headerBytes);
    if (!bais.markSupported()) {
        throw new IOException("ByteArrayInputStream does not support mark");
    }
    bais.mark(headerBytes.length);
    // Read the status line.  Don't let it into the parseHeaders function.
    // It doesn't know what to do with it.
    bais.read(statusBytes, 0, statusBytes.length);
    this.contentHeaders = LaxHttpParser.parseHeaders(bais, ARCConstants.DEFAULT_ENCODING);
    bais.reset();
    return bais;
}

From source file:org.archive.wayback.resourcestore.indexer.WARCRecordToSearchResultAdapter.java

private CaptureSearchResult adaptWARCHTTPResponse(CaptureSearchResult result, WARCRecord rec)
        throws IOException {

    ArchiveRecordHeader header = rec.getHeader();
    // need to parse the documents HTTP message and headers here: WARCReader
    // does not implement this... yet..

    byte[] statusBytes = LaxHttpParser.readRawLine(rec);
    int eolCharCount = getEolCharsCount(statusBytes);
    if (eolCharCount <= 0) {
        throw new RecoverableIOException("Failed to read http status where one " + " was expected: "
                + ((statusBytes == null) ? "(null)" : new String(statusBytes)));
    }
    String statusLine = EncodingUtil.getString(statusBytes, 0, statusBytes.length - eolCharCount,
            ARCConstants.DEFAULT_ENCODING);
    if ((statusLine == null) || !StatusLine.startsWithHTTP(statusLine)) {
        throw new RecoverableIOException("Failed parse of http status line.");
    }
    StatusLine status = new StatusLine(statusLine);
    result.setHttpCode(String.valueOf(status.getStatusCode()));

    Header[] headers = LaxHttpParser.parseHeaders(rec, ARCConstants.DEFAULT_ENCODING);

    annotater.annotateHTTPContent(result, rec, headers, header.getMimetype());

    return result;
}

From source file:org.archive.wayback.resourcestore.resourcefile.WarcResource.java

public void parseHeaders() throws IOException {
    if (parsedHeaders) {
        return;
    }

    // If warc or arc record is 0 length, don't do any more parsing!
    // Hopefully caller code will check this before proceeding as well
    if (getRecordLength() <= 0) {
        parsedHeaders = true;
        return;
    }

    // WARCRecord should have getRecordType() method returning WARCRecordType.
    String rectypeStr = (String) rec.getHeader().getHeaderValue("WARC-Type");
    WARCRecordType rectype;
    try {
        rectype = WARCRecordType.valueOf(rectypeStr);
    } catch (IllegalArgumentException ex) {
        throw new RecoverableIOException("unrecognized WARC-Type \"" + rectypeStr + "\"");
    }

    if (rectype == WARCRecordType.response || rectype == WARCRecordType.revisit) {
        byte[] statusBytes = LaxHttpParser.readRawLine(rec);
        int eolCharCount = getEolCharsCount(statusBytes);
        if (eolCharCount <= 0) {
            throw new RecoverableIOException(
                    "Failed to read http status where one " + " was expected: " + new String(statusBytes));
        }
        String statusLineStr = EncodingUtil.getString(statusBytes, 0, statusBytes.length - eolCharCount,
                ARCConstants.DEFAULT_ENCODING);
        if ((statusLineStr == null) || !StatusLine.startsWithHTTP(statusLineStr)) {
            throw new RecoverableIOException("Failed parse of http status line.");
        }
        StatusLine statusLine = new StatusLine(statusLineStr);

        this.status = statusLine.getStatusCode();

        Header[] tmpHeaders = LaxHttpParser.parseHeaders(rec, ARCConstants.DEFAULT_ENCODING);
        headers = new Hashtable<String, String>();
        this.setInputStream(rec);
        for (Header header : tmpHeaders) {
            headers.put(header.getName(), header.getValue());
            if (header.getName().toUpperCase().contains(HttpHeaderOperation.HTTP_TRANSFER_ENC_HEADER)) {
                if (header.getValue().toUpperCase()
                        .contains(HttpHeaderOperation.HTTP_CHUNKED_ENCODING_HEADER)) {
                    setChunkedEncoding();
                }
            }
        }
    } else if (rectype == WARCRecordType.metadata || rectype == WARCRecordType.resource) {
        status = 200;
        headers = new HashMap<String, String>();
        String ct = (String) rec.getHeader().getHeaderValue("Content-Type");
        if (ct != null) {
            headers.put("Content-Type", ct);
        }
        // necessary?
        String date = rec.getHeader().getDate();
        if (date != null) {
            try {
                Date d = org.apache.commons.lang.time.DateUtils.parseDate(date,
                        new String[] { "yyyy-MM-dd'T'HH:mm:ss'Z'" });
                String httpDate = DateUtils.getRFC1123Date(d);
                headers.put("Date", httpDate);
            } catch (ParseException ex) {
                //
            }
        }
        setInputStream(rec);
    }
    parsedHeaders = true;
}

From source file:org.webcurator.domain.model.core.ArcHarvestFileDTO.java

private void indexWARCResponse(ArchiveRecord rec, Map<String, HarvestResourceDTO> results) throws IOException {

    WARCRecord record = (WARCRecord) rec;
    ArchiveRecordHeader header = record.getHeader();

    // If the URL length is too long for the database, skip adding the URL
    // to the index. This ensures that the harvest completes successfully. 
    if (header.getUrl().length() > MAX_URL_LENGTH) {
        return;
    }

    try {
        ArcHarvestResourceDTO res = new ArcHarvestResourceDTO();
        res.setArcFileName(this.getName());
        res.setName(header.getUrl());
        res.setResourceOffset(header.getOffset());
        res.setCompressed(this.isCompressed());

        // need to parse the documents HTTP message and headers here: WARCReader
        // does not implement this...

        byte[] statusBytes = HttpParser.readRawLine(record);
        int eolCharCount = getEolCharsCount(statusBytes);
        if (eolCharCount <= 0) {
            throw new RecoverableIOException(
                    "Failed to read http status where one " + " was expected: " + new String(statusBytes));
        }
        String statusLine = EncodingUtil.getString(statusBytes, 0, statusBytes.length - eolCharCount,
                WARCConstants.DEFAULT_ENCODING);
        if ((statusLine == null) || !StatusLine.startsWithHTTP(statusLine)) {
            throw new RecoverableIOException("Failed parse of http status line.");
        }
        StatusLine status = new StatusLine(statusLine);

        res.setStatusCode(status.getStatusCode());

        // Calculate the length.
        long length = header.getLength() - header.getContentBegin();
        res.setLength(length);

        results.put(res.getName(), res);
    } finally {
        rec.close();
    }
}