Example usage for org.apache.commons.httpclient HttpParser readLine

Introduction

On this page you can find example usages of the readLine method of org.apache.commons.httpclient.HttpParser.

Prototype

public static String readLine(InputStream paramInputStream, String paramString) throws IOException

Source Link

Usage

From source file:uk.bl.wa.indexer.WARCIndexer.java

/**
 * Reads the first line of the record as an HTTP status line and, when it can
 * be parsed, hands the headers that follow it to {@code processHeaders}.
 *
 * @param record    the archive record; read as a stream positioned at the status line
 * @param header    the record header, used only to identify the record in log messages
 * @param targetUrl passed through to {@code processHeaders}
 * @param solr      passed through to {@code processHeaders}
 * @return the second space-separated token of the status line (trimmed), or
 *         {@code null} when the line is absent, does not start with
 *         {@code "HTTP"}, or has no second token
 * @throws IOException if reading from the record fails
 */
private String processWARCHeaders(ArchiveRecord record, ArchiveRecordHeader header, String targetUrl,
        SolrRecord solr) throws IOException {
    // Not every record carries HTTP headers, so validate the first line before using it.
    final String line = HttpParser.readLine(record, "UTF-8");
    if (line == null || !line.startsWith("HTTP")) {
        log.warn("Invalid status line: " + header.getHeaderValue(WARCConstants.HEADER_KEY_FILENAME) + "@"
                + header.getHeaderValue(WARCConstants.ABSOLUTE_OFFSET_KEY));
        return null;
    }

    final String[] tokens = line.split(" ");
    if (tokens.length <= 1) {
        log.warn("Could not parse status line: " + line);
        return null;
    }

    final String statusCode = tokens[1].trim();
    try {
        this.processHeaders(solr, statusCode, HttpParser.parseHeaders(record, "UTF-8"), targetUrl);
    } catch (ProtocolException p) {
        // Malformed headers are logged; the status code is still returned to the caller.
        log.error("ProtocolException [" + statusCode + "]: "
                + header.getHeaderValue(WARCConstants.HEADER_KEY_FILENAME) + "@"
                + header.getHeaderValue(WARCConstants.ABSOLUTE_OFFSET_KEY), p);
    }

    // The status line and headers have been consumed from the record above,
    // so no separate step is needed to skip ahead to the payload.
    return statusCode;
}

From source file:uk.bl.wa.tika.parser.warc.WebARCExtractor.java

/**
 * Walks every record of the (W)ARC container in {@code stream} and offers
 * each one to the embedded-document extractor.
 *
 * <p>For WARC input the leading status line and header block of each record
 * are read off the record stream before the record is handed to the
 * extractor.
 *
 * @param stream the container stream; it is passed to the reader factory and
 *               is not closed by this method
 * @throws IOException   if reading the container or a record fails
 * @throws SAXException  if emitting XHTML events fails
 * @throws TikaException if the embedded extractor fails
 */
public void parse(InputStream stream) throws IOException, SAXException, TikaException {
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();

    System.out.println("GO: " + metadata.get(Metadata.RESOURCE_NAME_KEY));
    // Open the reader for the specific container format. The generic
    // ArchiveReaderFactory was tried previously and did not work because it
    // assumed compressed input.
    ArchiveReader ar = null;
    if (isWARC) {
        ar = WARCReaderFactory.get("dummy-name.warc", stream, true);
    } else {
        ar = ARCReaderFactory.get("dummy-name.arc", stream, true);
    }

    // Go through the records:
    if (ar != null) {

        // Also get out the archive format version:
        metadata.set("version", ar.getVersion());

        Iterator<ArchiveRecord> it = ar.iterator();

        while (it.hasNext()) {
            ArchiveRecord entry = it.next();
            // An ArchiveRecord is itself an InputStream; no cast is needed, and
            // casting to WARCRecord would fail for records read from an ARC file.
            InputStream is = entry;
            if (this.isWARC) {
                // Consume the first line and header block so the stream handed to
                // the extractor starts after them. The values are not used here.
                // readLine returns null when the record has no content at all.
                String statusLine = HttpParser.readLine(is, "UTF-8");
                if (statusLine != null) {
                    HttpParser.parseHeaders(is, "UTF-8");
                }
            }
            // Build a resource name of the form "<record type>:<url>".
            // NOTE(review): HEADER_KEY_TYPE is a WARC header key; for ARC records
            // the lookup presumably yields null - confirm the desired ARC naming.
            String name = entry.getHeader().getUrl();
            name = entry.getHeader().getHeaderValue(WARCRecord.HEADER_KEY_TYPE) + ":" + name;
            // Now parse it...
            // Setup
            Metadata entrydata = new Metadata();
            entrydata.set(Metadata.RESOURCE_NAME_KEY, name);
            // Use the delegate parser to parse the embedded record
            if (extractor.shouldParseEmbedded(entrydata)) {
                extractor.parseEmbedded(is, xhtml, entrydata, true);
            }
        }

    }
    xhtml.endDocument();
}