List of usage examples for org.apache.commons.httpclient HttpParser readLine
public static String readLine(InputStream paramInputStream, String paramString) throws IOException
From source file:uk.bl.wa.indexer.WARCIndexer.java
private String processWARCHeaders(ArchiveRecord record, ArchiveRecordHeader header, String targetUrl, SolrRecord solr) throws IOException { String statusCode = null;/*from ww w. ja v a 2 s . c om*/ // There are not always headers! The code should check first. String statusLine = HttpParser.readLine(record, "UTF-8"); if (statusLine != null && statusLine.startsWith("HTTP")) { String firstLine[] = statusLine.split(" "); if (firstLine.length > 1) { statusCode = firstLine[1].trim(); try { this.processHeaders(solr, statusCode, HttpParser.parseHeaders(record, "UTF-8"), targetUrl); } catch (ProtocolException p) { log.error("ProtocolException [" + statusCode + "]: " + header.getHeaderValue(WARCConstants.HEADER_KEY_FILENAME) + "@" + header.getHeaderValue(WARCConstants.ABSOLUTE_OFFSET_KEY), p); } } else { log.warn("Could not parse status line: " + statusLine); } } else { log.warn("Invalid status line: " + header.getHeaderValue(WARCConstants.HEADER_KEY_FILENAME) + "@" + header.getHeaderValue(WARCConstants.ABSOLUTE_OFFSET_KEY)); } // No need for this, as the headers have already been read from the // InputStream (above): // WARCRecordUtils.getPayload(record); ] return statusCode; }
From source file:uk.bl.wa.tika.parser.warc.WebARCExtractor.java
public void parse(InputStream stream) throws IOException, SAXException, TikaException { XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); xhtml.startDocument();//from ww w . j ava 2 s. c o m System.out.println("GO: " + metadata.get(Metadata.RESOURCE_NAME_KEY)); // Open the ARCReader: // This did not work as assumes compressed: // ArchiveReaderFactory.get("name.arc", stream, true); ArchiveReader ar = null; if (isWARC) { ar = WARCReaderFactory.get("dummy-name.warc", stream, true); } else { ar = ARCReaderFactory.get("dummy-name.arc", stream, true); } // Go through the records: if (ar != null) { // Also get out the archive format version: metadata.set("version", ar.getVersion()); Iterator<ArchiveRecord> it = ar.iterator(); while (it.hasNext()) { ArchiveRecord entry = it.next(); InputStream is = (WARCRecord) entry; if (this.isWARC) { String firstLine[] = HttpParser.readLine(is, "UTF-8").split(" "); String statusCode = firstLine[1].trim(); Header[] headers = HttpParser.parseHeaders(is, "UTF-8"); } // As this is ARC (as opposed to WARC), the URL should be directly usable String name = entry.getHeader().getUrl(); name = entry.getHeader().getHeaderValue(WARCRecord.HEADER_KEY_TYPE) + ":" + name; // Now parse it... // Setup Metadata entrydata = new Metadata(); entrydata.set(Metadata.RESOURCE_NAME_KEY, name); // Use the delegate parser to parse the compressed document if (extractor.shouldParseEmbedded(entrydata)) { extractor.parseEmbedded(is, xhtml, entrydata, true); } } } xhtml.endDocument(); }