Example usage for org.apache.commons.httpclient.util.EncodingUtil.getString

Introduction

This page collects example usages of org.apache.commons.httpclient.util.EncodingUtil.getString, taken from several open-source projects.

Prototype

public static String getString(final byte[] data, int offset, int length, String charset)

Source Link

Document

Converts the byte array of HTTP content characters to a string.

Usage

From source file:is.landsbokasafn.deduplicator.indexer.WarcFileIterator.java

/**
 * Builds a {@link CrawlDataItem} from a WARC response record.
 *
 * The URL, payload digest, timestamp and record id are copied from the
 * record header and the item is marked as not being a revisit. If the first
 * line of the record body is an HTTP status line, the status code is stored
 * as well, together with the values of the Content-Type and ETag HTTP
 * headers when present.
 *
 * @param record the WARC record, positioned at the start of its content
 * @param header the header of that record
 * @return the populated item; HTTP fields are left unset when the record
 *         does not start with an HTTP status line
 * @throws IOException if reading from the record fails
 */
protected static CrawlDataItem processResponse(WARCRecord record, ArchiveRecordHeader header)
        throws IOException {
    final CrawlDataItem item = new CrawlDataItem();
    item.setURL(header.getUrl());
    item.setContentDigest((String) header.getHeaderValue(WARCConstants.HEADER_KEY_PAYLOAD_DIGEST));
    item.setRevisit(false);
    item.setTimestamp(header.getDate());
    item.setWarcRecordId((String) header.getHeaderValue(WARCConstants.HEADER_KEY_ID));

    // Without a terminated first line there is no HTTP header to process.
    final byte[] rawStatus = HttpParser.readRawLine(record);
    final int eolLength = getEolCharsCount(rawStatus);
    if (eolLength <= 0) {
        return item;
    }

    final String statusText = EncodingUtil.getString(rawStatus, 0, rawStatus.length - eolLength,
            WARCConstants.DEFAULT_ENCODING);
    if (statusText == null || !StatusLine.startsWithHTTP(statusText)) {
        return item;
    }

    final StatusLine parsedStatus = new StatusLine(statusText);
    item.setStatusCode(parsedStatus.getStatusCode());

    for (Header httpHeader : HttpParser.parseHeaders(record, WARCConstants.DEFAULT_ENCODING)) {
        final String headerName = httpHeader.getName();
        if (headerName.equalsIgnoreCase("Content-Type")) {
            item.setMimeType(httpHeader.getValue());
        } else if (headerName.equalsIgnoreCase("ETag")) {
            item.setEtag(httpHeader.getValue());
        }
    }
    return item;
}

From source file:com.cyberway.issue.io.arc.ARCRecord.java

/**
* Read http header if present. Technique borrowed from HttpClient HttpParse
* class./*from w ww  .j  av a2 s  .  c o m*/
* 
* @return ByteArrayInputStream with the http header in it or null if no
*         http header.
* @throws IOException
*/
private InputStream readHttpHeader() throws IOException {
    // If judged a record that doesn't have an http header, return
    // immediately.
    if (!getHeader().getUrl().startsWith("http") || getHeader().getLength() <= MIN_HTTP_HEADER_LENGTH) {
        return null;
    }
    byte[] statusBytes = HttpParser.readRawLine(getIn());
    int eolCharCount = getEolCharsCount(statusBytes);
    if (eolCharCount <= 0) {
        throw new IOException("Failed to read http status where one was expected: "
                + ((statusBytes == null) ? "" : new String(statusBytes)));
    }
    String statusLine = EncodingUtil.getString(statusBytes, 0, statusBytes.length - eolCharCount,
            ARCConstants.DEFAULT_ENCODING);
    if ((statusLine == null) || !StatusLine.startsWithHTTP(statusLine)) {
        if (statusLine.startsWith("DELETED")) {
            // Some old ARCs have deleted records like following:
            // http://vireo.gatech.edu:80/ebt-bin/nph-dweb/dynaweb/SGI_Developer/SGITCL_PG/@Generic__BookTocView/11108%3Btd%3D2 130.207.168.42 19991010131803 text/html 29202
            // DELETED_TIME=20000425001133_DELETER=Kurt_REASON=alexalist
            // (follows ~29K spaces)
            // For now, throw a RecoverableIOException so if iterating over
            // records, we keep going.  TODO: Later make a legitimate
            // ARCRecord from the deleted record rather than throw
            // exception.
            throw new DeletedARCRecordIOException(statusLine);
        } else {
            throw new IOException("Failed parse of http status line.");
        }
    }
    this.httpStatus = new StatusLine(statusLine);

    // Save off all bytes read.  Keep them as bytes rather than
    // convert to strings so we don't have to worry about encodings
    // though this should never be a problem doing http headers since
    // its all supposed to be ascii.
    ByteArrayOutputStream baos = new ByteArrayOutputStream(statusBytes.length + 4 * 1024);
    baos.write(statusBytes);

    // Now read rest of the header lines looking for the separation
    // between header and body.
    for (byte[] lineBytes = null; true;) {
        lineBytes = HttpParser.readRawLine(getIn());
        eolCharCount = getEolCharsCount(lineBytes);
        if (eolCharCount <= 0) {
            throw new IOException(
                    "Failed reading http headers: " + ((lineBytes != null) ? new String(lineBytes) : null));
        }
        // Save the bytes read.
        baos.write(lineBytes);
        if ((lineBytes.length - eolCharCount) <= 0) {
            // We've finished reading the http header.
            break;
        }
    }

    byte[] headerBytes = baos.toByteArray();
    // Save off where body starts.
    this.getMetaData().setContentBegin(headerBytes.length);
    ByteArrayInputStream bais = new ByteArrayInputStream(headerBytes);
    if (!bais.markSupported()) {
        throw new IOException("ByteArrayInputStream does not support mark");
    }
    bais.mark(headerBytes.length);
    // Read the status line.  Don't let it into the parseHeaders function.
    // It doesn't know what to do with it.
    bais.read(statusBytes, 0, statusBytes.length);
    this.httpHeaders = HttpParser.parseHeaders(bais, ARCConstants.DEFAULT_ENCODING);
    this.getMetaData().setStatusCode(Integer.toString(getStatusCode()));
    bais.reset();
    return bais;
}

From source file:dk.netarkivet.wayback.batch.copycode.NetarchiveSuiteWARCRecordToSearchResultAdapter.java

/**
 * Fills in the HTTP-level details of a capture from a WARC record that
 * holds an HTTP response.
 *
 * The first line of the record content is parsed as an HTTP status line and
 * its status code is stored on the result; the HTTP headers that follow are
 * parsed and handed, with the record and the mimetype from the record
 * header, to the annotater.
 *
 * @param result the search result to update
 * @param rec the WARC record, positioned at the start of its content
 * @return the same {@code result} instance
 * @throws IOException if the status line cannot be read or is not an HTTP
 *         status line (reported as RecoverableIOException), or if reading
 *         the record fails
 */
private CaptureSearchResult adaptWARCHTTPResponse(CaptureSearchResult result, WARCRecord rec)
        throws IOException {

    final ArchiveRecordHeader warcHeader = rec.getHeader();

    // WARCReader does not parse the HTTP message itself, so the status line
    // and the headers are parsed here.
    final byte[] rawStatus = HttpParser.readRawLine(rec);
    final int eolLength = getEolCharsCount(rawStatus);
    if (eolLength <= 0) {
        final String seen = (rawStatus == null) ? "(null)" : new String(rawStatus);
        throw new RecoverableIOException("Failed to read http status where one " + " was expected: " + seen);
    }

    final String statusText = EncodingUtil.getString(rawStatus, 0, rawStatus.length - eolLength,
            ARCConstants.DEFAULT_ENCODING);
    final boolean looksLikeHttp = (statusText != null) && StatusLine.startsWithHTTP(statusText);
    if (!looksLikeHttp) {
        throw new RecoverableIOException("Failed parse of http status line.");
    }

    final StatusLine parsedStatus = new StatusLine(statusText);
    result.setHttpCode(String.valueOf(parsedStatus.getStatusCode()));

    final Header[] httpHeaders = HttpParser.parseHeaders(rec, ARCConstants.DEFAULT_ENCODING);
    annotater.annotateHTTPContent(result, rec, httpHeaders, warcHeader.getMimetype());

    return result;
}

From source file:org.archive.io.arc.ARCRecord.java

/**
 * Read http header if present. Technique borrowed from HttpClient HttpParse
 * class. set errors when found./*from  w  w w  . j av a2 s .com*/
 * 
 * @return ByteArrayInputStream with the http header in it or null if no
 *         http header.
 * @throws IOException
 */
private InputStream readHttpHeader() throws IOException {

    // this can be helpful when simply iterating over records, 
    // looking for problems.
    Logger logger = Logger.getLogger(this.getClass().getName());
    ArchiveRecordHeader h = this.getHeader();

    // If judged a record that doesn't have an http header, return
    // immediately.
    String url = getHeader().getUrl();
    if (!url.startsWith("http") || getHeader().getLength() <= MIN_HTTP_HEADER_LENGTH) {
        return null;
    }

    String statusLine;
    byte[] statusBytes;
    int eolCharCount = 0;
    int errOffset = 0;

    // Read status line, skipping any errant http headers found before it
    // This allows a larger number of 'corrupt' arcs -- where headers were accidentally
    // inserted before the status line to be readable
    while (true) {
        statusBytes = LaxHttpParser.readRawLine(getIn());
        eolCharCount = getEolCharsCount(statusBytes);
        if (eolCharCount <= 0) {
            throw new RecoverableIOException("Failed to read http status where one was expected: "
                    + ((statusBytes == null) ? "" : new String(statusBytes)));
        }

        statusLine = EncodingUtil.getString(statusBytes, 0, statusBytes.length - eolCharCount,
                ARCConstants.DEFAULT_ENCODING);

        // If a null or DELETED break immediately
        if ((statusLine == null) || statusLine.startsWith("DELETED")) {
            break;
        }

        // If it's actually the status line, break, otherwise continue skipping any
        // previous header values
        if (!statusLine.contains(":") && StatusLine.startsWithHTTP(statusLine)) {
            break;
        }

        // Add bytes read to error "offset" to add to position
        errOffset += statusBytes.length;
    }

    if (errOffset > 0) {
        this.incrementPosition(errOffset);
    }

    if ((statusLine == null) || !StatusLine.startsWithHTTP(statusLine)) {
        if (statusLine.startsWith("DELETED")) {
            // Some old ARCs have deleted records like following:
            // http://vireo.gatech.edu:80/ebt-bin/nph-dweb/dynaweb/SGI_Developer/SGITCL_PG/@Generic__BookTocView/11108%3Btd%3D2 130.207.168.42 19991010131803 text/html 29202
            // DELETED_TIME=20000425001133_DELETER=Kurt_REASON=alexalist
            // (follows ~29K spaces)
            // For now, throw a RecoverableIOException so if iterating over
            // records, we keep going.  TODO: Later make a legitimate
            // ARCRecord from the deleted record rather than throw
            // exception.
            throw new DeletedARCRecordIOException(statusLine);
        } else {
            this.errors.add(ArcRecordErrors.HTTP_STATUS_LINE_INVALID);
        }
    }

    try {
        this.httpStatus = new StatusLine(statusLine);
    } catch (IOException e) {
        logger.warning(e.getMessage() + " at offset: " + h.getOffset());
        this.errors.add(ArcRecordErrors.HTTP_STATUS_LINE_EXCEPTION);
    }

    // Save off all bytes read.  Keep them as bytes rather than
    // convert to strings so we don't have to worry about encodings
    // though this should never be a problem doing http headers since
    // its all supposed to be ascii.
    ByteArrayOutputStream baos = new ByteArrayOutputStream(statusBytes.length + 4 * 1024);
    baos.write(statusBytes);

    // Now read rest of the header lines looking for the separation
    // between header and body.
    for (byte[] lineBytes = null; true;) {
        lineBytes = LaxHttpParser.readRawLine(getIn());
        eolCharCount = getEolCharsCount(lineBytes);
        if (eolCharCount <= 0) {
            if (getIn().available() == 0) {
                httpHeaderBytesRead += statusBytes.length;
                logger.warning("HTTP header truncated at offset: " + h.getOffset());
                this.errors.add(ArcRecordErrors.HTTP_HEADER_TRUNCATED);
                this.setEor(true);
                break;
            } else {
                throw new IOException(
                        "Failed reading http headers: " + ((lineBytes != null) ? new String(lineBytes) : null));
            }
        } else {
            httpHeaderBytesRead += lineBytes.length;
        }
        // Save the bytes read.
        baos.write(lineBytes);
        if ((lineBytes.length - eolCharCount) <= 0) {
            // We've finished reading the http header.
            break;
        }
    }

    byte[] headerBytes = baos.toByteArray();
    // Save off where body starts.
    this.getMetaData().setContentBegin(headerBytes.length);
    ByteArrayInputStream bais = new ByteArrayInputStream(headerBytes);
    if (!bais.markSupported()) {
        throw new IOException("ByteArrayInputStream does not support mark");
    }
    bais.mark(headerBytes.length);
    // Read the status line.  Don't let it into the parseHeaders function.
    // It doesn't know what to do with it.
    bais.read(statusBytes, 0, statusBytes.length);
    this.httpHeaders = LaxHttpParser.parseHeaders(bais, ARCConstants.DEFAULT_ENCODING);
    this.getMetaData().setStatusCode(Integer.toString(getStatusCode()));
    bais.reset();
    return bais;
}

From source file:org.archive.io.HeaderedArchiveRecord.java

/**
 * Read header if present. Technique borrowed from HttpClient HttpParse
 * class. Using http parser code for now. Later move to more generic header
 * parsing code if there proves a need.
 *
 * The first line must be either an http response status line or a request
 * line starting with GET or POST (case-insensitive). For a response the
 * status code is stored in {@code statusCode}; for a request it is set to
 * -1. The total length of the header block is stored in
 * {@code contentHeadersLength} and the parsed headers in
 * {@code contentHeaders}.
 *
 * @return ByteArrayInputStream with the http header in it or null if no
 *         http header.
 * @throws IOException if the first line or a header line cannot be read, or
 *         if the first line is neither a response status line nor a GET/POST
 *         request line (UnexpectedStartLineIOException).
 */
private InputStream readContentHeaders() throws IOException {
    // If judged a record that doesn't have an http header, return
    // immediately.
    if (!hasContentHeaders()) {
        return null;
    }
    byte[] statusBytes = LaxHttpParser.readRawLine(getIn());
    int eolCharCount = getEolCharsCount(statusBytes);
    if (eolCharCount <= 0) {
        // statusBytes may be null here, so don't hand it to new String()
        // unchecked.
        throw new IOException("Failed to read raw line where one was expected: "
                + ((statusBytes == null) ? "(null)" : new String(statusBytes)));
    }
    String statusLine = EncodingUtil.getString(statusBytes, 0, statusBytes.length - eolCharCount,
            ARCConstants.DEFAULT_ENCODING);
    if (statusLine == null) {
        throw new NullPointerException("Expected status line is null");
    }
    // TODO: Tighten up this test.
    boolean isHttpResponse = StatusLine.startsWithHTTP(statusLine);
    boolean isHttpRequest = false;
    if (!isHttpResponse) {
        // Accept GET and POST request lines. (The POST test used to be
        // negated, which rejected POST and accepted any other text.)
        String upperCased = statusLine.toUpperCase();
        isHttpRequest = upperCased.startsWith("GET") || upperCased.startsWith("POST");
    }
    if (!isHttpResponse && !isHttpRequest) {
        throw new UnexpectedStartLineIOException("Failed parse of " + "status line: " + statusLine);
    }
    this.statusCode = isHttpResponse ? (new StatusLine(statusLine)).getStatusCode() : -1;

    // Save off all bytes read.  Keep them as bytes rather than
    // convert to strings so we don't have to worry about encodings
    // though this should never be a problem doing http headers since
    // its all supposed to be ascii.
    ByteArrayOutputStream baos = new ByteArrayOutputStream(statusBytes.length + 4 * 1024);
    baos.write(statusBytes);

    // Now read rest of the header lines looking for the separation
    // between header and body.
    for (byte[] lineBytes = null; true;) {
        lineBytes = LaxHttpParser.readRawLine(getIn());
        eolCharCount = getEolCharsCount(lineBytes);
        if (eolCharCount <= 0) {
            throw new IOException(
                    "Failed reading headers: " + ((lineBytes != null) ? new String(lineBytes) : null));
        }
        // Save the bytes read.
        baos.write(lineBytes);
        if ((lineBytes.length - eolCharCount) <= 0) {
            // We've finished reading the http header.
            break;
        }
    }

    byte[] headerBytes = baos.toByteArray();
    // Save off where content body, post content headers, starts.
    this.contentHeadersLength = headerBytes.length;
    ByteArrayInputStream bais = new ByteArrayInputStream(headerBytes);
    if (!bais.markSupported()) {
        throw new IOException("ByteArrayInputStream does not support mark");
    }
    // Mark the start so the stream can be rewound before it is returned.
    bais.mark(headerBytes.length);
    // Read the status line.  Don't let it into the parseHeaders function.
    // It doesn't know what to do with it.
    bais.read(statusBytes, 0, statusBytes.length);
    this.contentHeaders = LaxHttpParser.parseHeaders(bais, ARCConstants.DEFAULT_ENCODING);
    bais.reset();
    return bais;
}

From source file:org.archive.util.LaxHttpParser.java

/**
 * Read up to <tt>"\n"</tt> from an (unchunked) input stream.
 * If the stream ends before the line terminator is found,
 * the last part of the string will still be returned.
 * If no input data available, <code>null</code> is returned.
 *
 * @param inputStream the stream to read from
 * @param charset charset of HTTP protocol elements
 *
 * @throws IOException if an I/O problem occurs
 * @return a line from the stream/*w w  w  . j  a va  2s  . c om*/
 * 
 * @since 3.0
 */
public static String readLine(InputStream inputStream, String charset) throws IOException {
    LOG.trace("enter LaxHttpParser.readLine(InputStream, String)");
    byte[] rawdata = readRawLine(inputStream);
    if (rawdata == null) {
        return null;
    }
    // strip CR and LF from the end
    int len = rawdata.length;
    int offset = 0;
    if (len > 0) {
        if (rawdata[len - 1] == '\n') {
            offset++;
            if (len > 1) {
                if (rawdata[len - 2] == '\r') {
                    offset++;
                }
            }
        }
    }
    return EncodingUtil.getString(rawdata, 0, len - offset, charset);
}

From source file:org.archive.wayback.resourcestore.indexer.WARCRecordToSearchResultAdapter.java

/**
 * Fills in the HTTP-level details of a capture from a WARC record that
 * holds an HTTP response.
 *
 * The first line of the record content is parsed as an HTTP status line and
 * its status code is stored on the result; the HTTP headers that follow are
 * parsed with LaxHttpParser and handed, with the record and the mimetype
 * from the record header, to the annotater.
 *
 * @param result the search result to update
 * @param rec the WARC record, positioned at the start of its content
 * @return the same {@code result} instance
 * @throws IOException if the status line cannot be read or is not an HTTP
 *         status line (reported as RecoverableIOException), or if reading
 *         the record fails
 */
private CaptureSearchResult adaptWARCHTTPResponse(CaptureSearchResult result, WARCRecord rec)
        throws IOException {

    final ArchiveRecordHeader recordHeader = rec.getHeader();

    // WARCReader does not parse the HTTP message itself, so the status line
    // and the headers are parsed here.
    final byte[] firstLine = LaxHttpParser.readRawLine(rec);
    final int terminatorLength = getEolCharsCount(firstLine);
    if (terminatorLength <= 0) {
        final String received = (firstLine == null) ? "(null)" : new String(firstLine);
        throw new RecoverableIOException(
                "Failed to read http status where one " + " was expected: " + received);
    }

    final String statusText = EncodingUtil.getString(firstLine, 0, firstLine.length - terminatorLength,
            ARCConstants.DEFAULT_ENCODING);
    if (statusText == null) {
        throw new RecoverableIOException("Failed parse of http status line.");
    }
    if (!StatusLine.startsWithHTTP(statusText)) {
        throw new RecoverableIOException("Failed parse of http status line.");
    }

    final int httpCode = new StatusLine(statusText).getStatusCode();
    result.setHttpCode(String.valueOf(httpCode));

    final Header[] httpHeaders = LaxHttpParser.parseHeaders(rec, ARCConstants.DEFAULT_ENCODING);
    annotater.annotateHTTPContent(result, rec, httpHeaders, recordHeader.getMimetype());

    return result;
}

From source file:org.archive.wayback.resourcestore.resourcefile.WarcResource.java

/**
 * Parses the headers of the wrapped record once; later calls return
 * immediately.
 *
 * For response and revisit records the content is expected to start with an
 * HTTP status line followed by HTTP headers: the status code and headers
 * are stored and chunked transfer encoding is flagged when announced. For
 * metadata and resource records the status is set to 200 and Content-Type
 * and Date headers are synthesized from the WARC header where available.
 * Other record types get no status or headers.
 *
 * @throws IOException if the WARC-Type header is missing or unrecognized,
 *         or the HTTP status line cannot be read or parsed (reported as
 *         RecoverableIOException), or if reading the record fails
 */
public void parseHeaders() throws IOException {
    if (parsedHeaders) {
        return;
    }

    // If warc or arc record is 0 length, don't do any more parsing!
    // Hopefully caller code will check this before proceeding as well
    if (getRecordLength() <= 0) {
        parsedHeaders = true;
        return;
    }

    // WARCRecord should have getRecordType() method returning WARCRecordType.
    String rectypeStr = (String) rec.getHeader().getHeaderValue("WARC-Type");
    if (rectypeStr == null) {
        // Enum valueOf(null) throws NullPointerException, which the catch
        // below would not turn into a recoverable error.
        throw new RecoverableIOException("missing WARC-Type");
    }
    WARCRecordType rectype;
    try {
        rectype = WARCRecordType.valueOf(rectypeStr);
    } catch (IllegalArgumentException ex) {
        throw new RecoverableIOException("unrecognized WARC-Type \"" + rectypeStr + "\"");
    }

    if (rectype == WARCRecordType.response || rectype == WARCRecordType.revisit) {
        byte[] statusBytes = LaxHttpParser.readRawLine(rec);
        int eolCharCount = getEolCharsCount(statusBytes);
        if (eolCharCount <= 0) {
            // statusBytes may be null here, so don't hand it to
            // new String() unchecked.
            throw new RecoverableIOException("Failed to read http status where one " + " was expected: "
                    + ((statusBytes == null) ? "(null)" : new String(statusBytes)));
        }
        String statusLineStr = EncodingUtil.getString(statusBytes, 0, statusBytes.length - eolCharCount,
                ARCConstants.DEFAULT_ENCODING);
        if ((statusLineStr == null) || !StatusLine.startsWithHTTP(statusLineStr)) {
            throw new RecoverableIOException("Failed parse of http status line.");
        }
        StatusLine statusLine = new StatusLine(statusLineStr);

        this.status = statusLine.getStatusCode();

        Header[] tmpHeaders = LaxHttpParser.parseHeaders(rec, ARCConstants.DEFAULT_ENCODING);
        headers = new Hashtable<String, String>();
        this.setInputStream(rec);
        for (Header header : tmpHeaders) {
            headers.put(header.getName(), header.getValue());
            if (header.getName().toUpperCase().contains(HttpHeaderOperation.HTTP_TRANSFER_ENC_HEADER)) {
                if (header.getValue().toUpperCase()
                        .contains(HttpHeaderOperation.HTTP_CHUNKED_ENCODING_HEADER)) {
                    setChunkedEncoding();
                }
            }
        }
    } else if (rectype == WARCRecordType.metadata || rectype == WARCRecordType.resource) {
        status = 200;
        headers = new HashMap<String, String>();
        String ct = (String) rec.getHeader().getHeaderValue("Content-Type");
        if (ct != null) {
            headers.put("Content-Type", ct);
        }
        // necessary?
        String date = rec.getHeader().getDate();
        if (date != null) {
            try {
                Date d = org.apache.commons.lang.time.DateUtils.parseDate(date,
                        new String[] { "yyyy-MM-dd'T'HH:mm:ss'Z'" });
                String httpDate = DateUtils.getRFC1123Date(d);
                headers.put("Date", httpDate);
            } catch (ParseException ignored) {
                // Best effort: leave the Date header out when the WARC date
                // does not match the expected pattern.
            }
        }
        setInputStream(rec);
    }
    parsedHeaders = true;
}

From source file:org.folg.werelatedata.editor.PageEditor.java

/**
 * Reads the whole response body of an executed method into a string,
 * decoded with the charset reported by the method.
 *
 * The body is read into a buffer that starts at BUF_SIZE bytes and doubles
 * whenever it fills up; growing beyond MAX_BUF_SIZE is refused.
 *
 * @param m the method whose response body is read
 * @return the response body, or null if there is no body stream or the body
 *         is empty
 * @throws IOException if reading fails or the buffer would have to grow
 *         beyond MAX_BUF_SIZE
 */
private String getResponse(HttpMethodBase m) throws IOException {
    InputStream s = m.getResponseBodyAsStream();
    if (s == null) {
        // No body stream is available; report it like an empty body
        // instead of failing on the first read.
        return null;
    }
    int bytesRead = -1;
    int totalBytes = 0;
    int bytesToRead = BUF_SIZE;
    byte[] buf = new byte[BUF_SIZE];
    while (true) {
        // Always read into the free tail of the buffer.
        bytesRead = s.read(buf, totalBytes, bytesToRead);
        if (bytesRead < 0) {
            break;
        }
        totalBytes += bytesRead;
        bytesToRead -= bytesRead;
        if (bytesToRead == 0) { // buffer full, so allocate more
            if (buf.length * 2 > MAX_BUF_SIZE) {
                throw new IOException("Response too long: " + m.getURI().toString());
            }
            byte[] temp = buf;
            buf = new byte[temp.length * 2];
            System.arraycopy(temp, 0, buf, 0, temp.length);
            // The new free space equals the old buffer size.
            bytesToRead = temp.length;
        }
    }
    if (totalBytes > 0) {
        return EncodingUtil.getString(buf, 0, totalBytes, m.getResponseCharSet());
    } else {
        return null;
    }
}

From source file:org.webcurator.domain.model.core.ArcHarvestFileDTO.java

/**
 * Indexes one WARC response record: parses its HTTP status line and adds an
 * {@link ArcHarvestResourceDTO} describing it to {@code results}, keyed by
 * the record URL.
 *
 * Records whose URL is longer than MAX_URL_LENGTH are skipped without being
 * indexed. Otherwise the record is closed when indexing finishes, whether
 * or not it succeeded.
 *
 * @param rec the record to index; cast to WARCRecord
 * @param results the map that receives the indexed resource
 * @throws IOException if the HTTP status line cannot be read or parsed
 *         (reported as RecoverableIOException), or if reading the record
 *         fails
 */
private void indexWARCResponse(ArchiveRecord rec, Map<String, HarvestResourceDTO> results) throws IOException {

    WARCRecord record = (WARCRecord) rec;
    ArchiveRecordHeader header = record.getHeader();

    // If the URL length is too long for the database, skip adding the URL
    // to the index. This ensures that the harvest completes successfully. 
    if (header.getUrl().length() > MAX_URL_LENGTH) {
        return;
    }

    try {
        ArcHarvestResourceDTO res = new ArcHarvestResourceDTO();
        res.setArcFileName(this.getName());
        res.setName(header.getUrl());
        res.setResourceOffset(header.getOffset());
        res.setCompressed(this.isCompressed());

        // need to parse the documents HTTP message and headers here: WARCReader
        // does not implement this...

        byte[] statusBytes = HttpParser.readRawLine(record);
        int eolCharCount = getEolCharsCount(statusBytes);
        if (eolCharCount <= 0) {
            // statusBytes may be null here, so don't hand it to
            // new String() unchecked.
            throw new RecoverableIOException("Failed to read http status where one " + " was expected: "
                    + ((statusBytes == null) ? "(null)" : new String(statusBytes)));
        }
        String statusLine = EncodingUtil.getString(statusBytes, 0, statusBytes.length - eolCharCount,
                WARCConstants.DEFAULT_ENCODING);
        if ((statusLine == null) || !StatusLine.startsWithHTTP(statusLine)) {
            throw new RecoverableIOException("Failed parse of http status line.");
        }
        StatusLine status = new StatusLine(statusLine);

        res.setStatusCode(status.getStatusCode());

        // Calculate the length.
        long length = header.getLength() - header.getContentBegin();
        res.setLength(length);

        results.put(res.getName(), res);
    } finally {
        rec.close();
    }
}