List of usage examples for org.apache.http.impl EnglishReasonPhraseCatalog INSTANCE
EnglishReasonPhraseCatalog INSTANCE
To view the source code for org.apache.http.impl.EnglishReasonPhraseCatalog.INSTANCE, click the Source Link.
From source file:org.apache.nifi.processors.aws.wag.InvokeAWSGatewayApi.java
@Override public void onTrigger(ProcessContext context, ProcessSession session) throws ProcessException { ComponentLog logger = getLogger();/*from ww w .j a v a2 s .c o m*/ FlowFile requestFlowFile = session.get(); // Checking to see if the property to put the body of the response in an attribute was set boolean putToAttribute = context.getProperty(PROP_PUT_OUTPUT_IN_ATTRIBUTE).isSet(); if (requestFlowFile == null) { String request = context.getProperty(PROP_METHOD).evaluateAttributeExpressions().getValue() .toUpperCase(); if ("POST".equals(request) || "PUT".equals(request) || "PATCH".equals(request)) { return; } else if (putToAttribute) { requestFlowFile = session.create(); } } // Every request/response cycle has a unique transaction id which will be stored as a flowfile attribute. final UUID txId = UUID.randomUUID(); FlowFile responseFlowFile = null; try { final int maxAttributeSize = context.getProperty(PROP_PUT_ATTRIBUTE_MAX_LENGTH).asInteger(); final String resourceName = context.getProperty(PROP_RESOURCE_NAME).getValue(); final GenericApiGatewayClient client = getClient(); final GenericApiGatewayRequest request = configureRequest(context, session, resourceName, requestFlowFile); logRequest(logger, client.getEndpoint(), request); final long startNanos = System.nanoTime(); GenericApiGatewayResponse response = null; GenericApiGatewayException exception = null; try { response = client.execute(request); logResponse(logger, response); } catch (GenericApiGatewayException gag) { // ERROR response codes may come back as exceptions, 404 for example exception = gag; } final int statusCode; if (exception != null) { statusCode = exception.getStatusCode(); } else { statusCode = response.getHttpResponse().getStatusCode(); } if (statusCode == 0) { throw new IllegalStateException("Status code unknown, connection hasn't been attempted."); } final String endpoint = context.getProperty(PROP_AWS_GATEWAY_API_ENDPOINT).getValue(); boolean outputRegardless = 
context.getProperty(PROP_OUTPUT_RESPONSE_REGARDLESS).asBoolean(); boolean outputBodyToResponseContent = (isSuccess(statusCode) && !putToAttribute || outputRegardless); boolean outputBodyToRequestAttribute = (!isSuccess(statusCode) || putToAttribute) && requestFlowFile != null; boolean bodyExists = response != null && response.getBody() != null; final String statusExplanation; if (exception != null) { statusExplanation = EnglishReasonPhraseCatalog.INSTANCE.getReason(statusCode, null); } else { statusExplanation = response.getHttpResponse().getStatusText(); } // Create a map of the status attributes that are always written to the request and response FlowFiles final Map<String, String> statusAttributes = new HashMap<>(); statusAttributes.put(STATUS_CODE, String.valueOf(statusCode)); statusAttributes.put(STATUS_MESSAGE, statusExplanation); statusAttributes.put(ENDPOINT_ATTR, client.getEndpointPrefix()); statusAttributes.put(RESOURCE_NAME_ATTR, resourceName); statusAttributes.put(TRANSACTION_ID, txId.toString()); if (outputBodyToResponseContent) { /* * If successful and putting to response flowfile, store the response body as the flowfile payload * we include additional flowfile attributes including the response headers and the status codes. 
*/ // clone the flowfile to capture the response if (requestFlowFile != null) { responseFlowFile = session.create(requestFlowFile); // write attributes to request flowfile requestFlowFile = session.putAllAttributes(requestFlowFile, statusAttributes); // If the property to add the response headers to the request flowfile is true then add them if (context.getProperty(PROP_ADD_HEADERS_TO_REQUEST).asBoolean()) { // write the response headers as attributes // this will overwrite any existing flowfile attributes requestFlowFile = session.putAllAttributes(requestFlowFile, convertAttributesFromHeaders(response)); } } else { responseFlowFile = session.create(); } // write attributes to response flowfile responseFlowFile = session.putAllAttributes(responseFlowFile, statusAttributes); // write the response headers as attributes // this will overwrite any existing flowfile attributes if (response != null) { responseFlowFile = session.putAllAttributes(responseFlowFile, convertAttributesFromHeaders(response)); } else { responseFlowFile = session.putAllAttributes(responseFlowFile, exception.getHttpHeaders()); } // transfer the message body to the payload // can potentially be null in edge cases if (bodyExists) { final String contentType = response.getHttpResponse().getHeaders().get("Content-Type"); if (!(contentType == null) && !contentType.trim().isEmpty()) { responseFlowFile = session.putAttribute(responseFlowFile, CoreAttributes.MIME_TYPE.key(), contentType.trim()); } responseFlowFile = session.importFrom(new ByteArrayInputStream(response.getBody().getBytes()), responseFlowFile); // emit provenance event final long millis = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - startNanos); if (requestFlowFile != null) { session.getProvenanceReporter().fetch(responseFlowFile, endpoint, millis); } else { session.getProvenanceReporter().receive(responseFlowFile, endpoint, millis); } } else if (exception != null) { final String contentType = "application/json"; responseFlowFile = 
session.putAttribute(responseFlowFile, CoreAttributes.MIME_TYPE.key(), contentType.trim()); responseFlowFile = session.importFrom(new ByteArrayInputStream(exception.getRawResponse()), responseFlowFile); // emit provenance event final long millis = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - startNanos); if (requestFlowFile != null) { session.getProvenanceReporter().fetch(responseFlowFile, endpoint, millis); } else { session.getProvenanceReporter().receive(responseFlowFile, endpoint, millis); } } } // if not successful and request flowfile is not null, store the response body into a flowfile attribute if (outputBodyToRequestAttribute) { String attributeKey = context.getProperty(PROP_PUT_OUTPUT_IN_ATTRIBUTE) .evaluateAttributeExpressions(requestFlowFile).getValue(); if (attributeKey == null) { attributeKey = RESPONSE_BODY; } byte[] outputBuffer; int size = 0; outputBuffer = new byte[maxAttributeSize]; if (bodyExists) { size = StreamUtils.fillBuffer(new ByteArrayInputStream(response.getBody().getBytes()), outputBuffer, false); } else if (exception != null && exception.getRawResponse() != null && exception.getRawResponse().length > 0) { size = StreamUtils.fillBuffer(new ByteArrayInputStream(exception.getRawResponse()), outputBuffer, false); } if (size > 0) { String bodyString = new String(outputBuffer, 0, size, "UTF-8"); requestFlowFile = session.putAttribute(requestFlowFile, attributeKey, bodyString); } requestFlowFile = session.putAllAttributes(requestFlowFile, statusAttributes); final long millis = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - startNanos); session.getProvenanceReporter().modifyAttributes(requestFlowFile, "The " + attributeKey + " has been added. The value of which is the body of a http call to " + endpoint + resourceName + ". 
It took " + millis + "millis,"); } route(requestFlowFile, responseFlowFile, session, context, statusCode, getRelationships()); } catch (Exception e) { // penalize or yield if (requestFlowFile != null) { logger.error("Routing to {} due to exception: {}", new Object[] { REL_FAILURE.getName(), e }, e); requestFlowFile = session.penalize(requestFlowFile); requestFlowFile = session.putAttribute(requestFlowFile, EXCEPTION_CLASS, e.getClass().getName()); requestFlowFile = session.putAttribute(requestFlowFile, EXCEPTION_MESSAGE, e.getMessage()); // transfer original to failure session.transfer(requestFlowFile, getRelationshipForName(REL_FAILURE_NAME, getRelationships())); } else { logger.error("Yielding processor due to exception encountered as a source processor: {}", e); context.yield(); } // cleanup response flowfile, if applicable try { if (responseFlowFile != null) { session.remove(responseFlowFile); } } catch (final Exception e1) { logger.error("Could not cleanup response flowfile due to exception: {}", new Object[] { e1 }, e1); } } }
From source file:org.deviceconnect.message.event.AbstractEventManager.java
/**
 * Creates a copy of the given HTTP response whose body is fully read into memory.
 *
 * <p>The copy carries the same protocol version, status code and headers as the original. Its
 * reason phrase is looked up from the English reason phrase catalog for the status code. If the
 * original has an entity, its content is read as text (UTF-8 unless the entity declares another
 * charset) and re-wrapped as a UTF-8 string entity; if it has no entity, the copy has none either.
 *
 * @param response the response to copy; its entity content is consumed by this call
 * @return a new response holding the copied status line, headers and body
 * @throws IOException if reading the original entity fails
 */
protected HttpResponse copyResponse(final HttpResponse response) throws IOException {
    int code = response.getStatusLine().getStatusCode();
    HttpResponse retRes = new BasicHttpResponse(response.getProtocolVersion(), code,
            EnglishReasonPhraseCatalog.INSTANCE.getReason(code, null));
    retRes.setHeaders(response.getAllHeaders());

    // Responses without a body (e.g. 204 or 304) have no entity; EntityUtils.toString rejects
    // a null entity, so only copy the body when one is present.
    if (response.getEntity() != null) {
        // Re-encode with the same charset used for decoding; the single-argument StringEntity
        // constructor would fall back to its own default charset and mangle non-Latin text.
        retRes.setEntity(new StringEntity(EntityUtils.toString(response.getEntity(), "UTF-8"), "UTF-8"));
    }
    return retRes;
}
From source file:com.crawler.app.crawler.WebCrawler.java
private void processPage(WebURL curURL) { PageFetchResult fetchResult = null;/*from ww w . j a va2 s.co m*/ try { if (curURL == null) { throw new Exception("Failed processing a NULL url !?"); } fetchResult = pageFetcher.fetchPage(curURL); int statusCode = fetchResult.getStatusCode(); handlePageStatusCode(curURL, statusCode, EnglishReasonPhraseCatalog.INSTANCE.getReason(statusCode, Locale.ENGLISH)); // Finds the status reason for all known statuses Page page = new Page(curURL); page.setFetchResponseHeaders(fetchResult.getResponseHeaders()); page.setStatusCode(statusCode); if (statusCode != HttpStatus.SC_OK) { // Not 200 if (statusCode == HttpStatus.SC_MOVED_PERMANENTLY || statusCode == HttpStatus.SC_MOVED_TEMPORARILY || statusCode == HttpStatus.SC_MULTIPLE_CHOICES || statusCode == HttpStatus.SC_SEE_OTHER || statusCode == HttpStatus.SC_TEMPORARY_REDIRECT || statusCode == 308) { // is 3xx todo follow https://issues.apache.org/jira/browse/HTTPCORE-389 page.setRedirect(true); if (myController.getConfig().isFollowRedirects()) { String movedToUrl = fetchResult.getMovedToUrl(); if (movedToUrl == null) { throw new RedirectException(Level.WARN, "Unexpected error, URL: " + curURL + " is redirected to NOTHING"); } page.setRedirectedToUrl(movedToUrl); int newDocId = docIdServer.getDocId(movedToUrl); if (newDocId > 0) { throw new RedirectException(Level.DEBUG, "Redirect page: " + curURL + " is already seen"); } WebURL webURL = new WebURL(); webURL.setURL(movedToUrl); webURL.setParentDocid(curURL.getParentDocid()); webURL.setParentUrl(curURL.getParentUrl()); webURL.setDepth(curURL.getDepth()); webURL.setDocid(-1); webURL.setAnchor(curURL.getAnchor()); if (shouldVisit(page, webURL)) { if (robotstxtServer.allows(webURL)) { webURL.setDocid(docIdServer.getNewDocID(movedToUrl)); frontier.schedule(webURL); } else { logger.debug("Not visiting: {} as per the server's \"robots.txt\" policy", webURL.getURL()); } } else { logger.debug("Not visiting: {} as per your \"shouldVisit\" policy", 
webURL.getURL()); } } } else { // All other http codes other than 3xx & 200 String description = EnglishReasonPhraseCatalog.INSTANCE.getReason(fetchResult.getStatusCode(), Locale.ENGLISH); // Finds the status reason for all known statuses String contentType = fetchResult.getEntity() == null ? "" : fetchResult.getEntity().getContentType().getValue(); onUnexpectedStatusCode(curURL.getURL(), fetchResult.getStatusCode(), contentType, description); } } else { // if status code is 200 if (!curURL.getURL().equals(fetchResult.getFetchedUrl())) { if (docIdServer.isSeenBefore(fetchResult.getFetchedUrl())) { throw new RedirectException(Level.DEBUG, "Redirect page: " + curURL + " has already been seen"); } curURL.setURL(fetchResult.getFetchedUrl()); curURL.setDocid(docIdServer.getNewDocID(fetchResult.getFetchedUrl())); } if (!fetchResult.fetchContent(page)) { throw new ContentFetchException(); } parser.parse(page, curURL.getURL()); ParseData parseData = page.getParseData(); List<WebURL> toSchedule = new ArrayList<>(); int maxCrawlDepth = myController.getConfig().getMaxDepthOfCrawling(); for (WebURL webURL : parseData.getOutgoingUrls()) { webURL.setParentDocid(curURL.getDocid()); webURL.setParentUrl(curURL.getURL()); int newdocid = docIdServer.getDocId(webURL.getURL()); if (newdocid > 0) { // This is not the first time that this Url is visited. So, we set the depth to a negative number. 
webURL.setDepth((short) -1); webURL.setDocid(newdocid); } else { webURL.setDocid(-1); webURL.setDepth((short) (curURL.getDepth() + 1)); if (maxCrawlDepth == -1 || curURL.getDepth() < maxCrawlDepth) { if (shouldVisit(page, webURL)) { if (robotstxtServer.allows(webURL)) { webURL.setDocid(docIdServer.getNewDocID(webURL.getURL())); toSchedule.add(webURL); } else { logger.debug("Not visiting: {} as per the server's \"robots.txt\" policy", webURL.getURL()); } } else { logger.debug("Not visiting: {} as per your \"shouldVisit\" policy", webURL.getURL()); } } } } frontier.scheduleAll(toSchedule); visit(page); } } catch (PageBiggerThanMaxSizeException e) { onPageBiggerThanMaxSize(curURL.getURL(), e.getPageSize()); } catch (ParseException pe) { onParseError(curURL); } catch (ContentFetchException cfe) { onContentFetchError(curURL); } catch (RedirectException re) { logger.log(re.level, re.getMessage()); } catch (NotAllowedContentException nace) { logger.debug("Skipping: {} as it contains binary content which you configured not to crawl", curURL.getURL()); } catch (Exception e) { String urlStr = (curURL == null ? "NULL" : curURL.getURL()); logger.error("{}, while processing: {}", e.getMessage(), urlStr); logger.debug("Stacktrace", e); } finally { if (fetchResult != null) { fetchResult.discardContentIfNotConsumed(); } } }
From source file:com.vnlab.badlink.crawler.WebCrawler.java
private void processPage(WebURL curURL) { PageFetchResult fetchResult = null;//ww w .jav a 2s . co m try { if (curURL == null) { throw new Exception("Failed processing a NULL url !?"); } fetchResult = pageFetcher.fetchPage(curURL); int statusCode = fetchResult.getStatusCode(); handlePageStatusCode(curURL, statusCode, EnglishReasonPhraseCatalog.INSTANCE.getReason(statusCode, Locale.ENGLISH)); // Finds the status reason for all known statuses Page page = new Page(curURL); page.setFetchResponseHeaders(fetchResult.getResponseHeaders()); page.setStatusCode(statusCode); if (statusCode != HttpStatus.SC_OK) { // Not 200 if (statusCode == HttpStatus.SC_MOVED_PERMANENTLY || statusCode == HttpStatus.SC_MOVED_TEMPORARILY || statusCode == HttpStatus.SC_MULTIPLE_CHOICES || statusCode == HttpStatus.SC_SEE_OTHER || statusCode == HttpStatus.SC_TEMPORARY_REDIRECT || statusCode == 308) { // is 3xx todo follow https://issues.apache.org/jira/browse/HTTPCORE-389 page.setRedirect(true); if (myController.getConfig().isFollowRedirects()) { String movedToUrl = fetchResult.getMovedToUrl(); if (movedToUrl == null) { throw new RedirectException(Level.WARN, "Unexpected error, URL: " + curURL + " is redirected to NOTHING"); } page.setRedirectedToUrl(movedToUrl); int newDocId = docIdServer.getDocId(movedToUrl); if (newDocId > 0) { throw new RedirectException(Level.DEBUG, "Redirect page: " + curURL + " is already seen"); } WebURL webURL = new WebURL(); webURL.setURL(movedToUrl); webURL.setParentDocid(curURL.getParentDocid()); webURL.setParentUrl(curURL.getParentUrl()); webURL.setDepth(curURL.getDepth()); webURL.setDocid(-1); webURL.setAnchor(curURL.getAnchor()); if (shouldVisit(page, webURL)) { if (robotstxtServer.allows(webURL)) { webURL.setDocid(docIdServer.getNewDocID(movedToUrl)); frontier.schedule(webURL); } else { logger.debug("Not visiting: {} as per the server's \"robots.txt\" policy", webURL.getURL()); } } else { logger.debug("Not visiting: {} as per your \"shouldVisit\" policy", 
webURL.getURL()); } } } else { // All other http codes other than 3xx & 200 String description = EnglishReasonPhraseCatalog.INSTANCE.getReason(fetchResult.getStatusCode(), Locale.ENGLISH); // Finds the status reason for all known statuses String contentType = fetchResult.getEntity() == null ? "" : fetchResult.getEntity().getContentType().getValue(); onUnexpectedStatusCode(curURL.getURL(), fetchResult.getStatusCode(), contentType, description); } } else { // if status code is 200 if (!curURL.getURL().equals(fetchResult.getFetchedUrl())) { if (docIdServer.isSeenBefore(fetchResult.getFetchedUrl())) { throw new RedirectException(Level.DEBUG, "Redirect page: " + curURL + " has already been seen"); } curURL.setURL(fetchResult.getFetchedUrl()); curURL.setDocid(docIdServer.getNewDocID(fetchResult.getFetchedUrl())); } if (!fetchResult.fetchContent(page)) { throw new ContentFetchException(); } parser.parse(page, curURL.getURL()); ParseData parseData = page.getParseData(); List<WebURL> toSchedule = new ArrayList<>(); int maxCrawlDepth = myController.getConfig().getMaxDepthOfCrawling(); for (WebURL webURL : parseData.getOutgoingUrls()) { webURL.setParentDocid(curURL.getDocid()); webURL.setParentUrl(curURL.getURL()); int newdocid = docIdServer.getDocId(webURL.getURL()); if (newdocid > 0) { // This is not the first time that this Url is visited. So, we set the depth to a negative number. 
webURL.setDepth((short) -1); webURL.setDocid(newdocid); } else { webURL.setDocid(-1); webURL.setDepth((short) (curURL.getDepth() + 1)); if ((maxCrawlDepth == -1) || (curURL.getDepth() < maxCrawlDepth)) { if (shouldVisit(page, webURL)) { if (robotstxtServer.allows(webURL)) { webURL.setDocid(docIdServer.getNewDocID(webURL.getURL())); toSchedule.add(webURL); } else { logger.debug("Not visiting: {} as per the server's \"robots.txt\" policy", webURL.getURL()); } } else { logger.debug("Not visiting: {} as per your \"shouldVisit\" policy", webURL.getURL()); } } } } frontier.scheduleAll(toSchedule); visit(page); } } catch (PageBiggerThanMaxSizeException e) { onPageBiggerThanMaxSize(curURL.getURL(), e.getPageSize()); } catch (ParseException pe) { onParseError(curURL); } catch (ContentFetchException cfe) { onContentFetchError(curURL); } catch (RedirectException re) { logger.log(re.level, re.getMessage()); } catch (NotAllowedContentException nace) { logger.debug("Skipping: {} as it contains binary content which you configured not to crawl", curURL.getURL()); } catch (Exception e) { String urlStr = (curURL == null ? "NULL" : curURL.getURL()); logger.error("{}, while processing: {}", e.getMessage(), urlStr); logger.debug("Stacktrace", e); } finally { if (fetchResult != null) { fetchResult.discardContentIfNotConsumed(); } } }
From source file:crawler.java.edu.uci.ics.crawler4j.crawler.WebCrawler.java
private void processPage(WebURL curURL) { PageFetchResult fetchResult = null;//from w w w .j a va2s .co m try { if (curURL == null) { throw new Exception("Failed processing a NULL url !?"); } fetchResult = pageFetcher.fetchPage(curURL); int statusCode = fetchResult.getStatusCode(); handlePageStatusCode(curURL, statusCode, EnglishReasonPhraseCatalog.INSTANCE.getReason(statusCode, Locale.ENGLISH)); // Finds the status reason for all known statuses Page page = new Page(curURL); page.setFetchResponseHeaders(fetchResult.getResponseHeaders()); page.setStatusCode(statusCode); if (statusCode < 200 || statusCode > 299) { // Not 2XX: 2XX status codes indicate success if (statusCode == HttpStatus.SC_MOVED_PERMANENTLY || statusCode == HttpStatus.SC_MOVED_TEMPORARILY || statusCode == HttpStatus.SC_MULTIPLE_CHOICES || statusCode == HttpStatus.SC_SEE_OTHER || statusCode == HttpStatus.SC_TEMPORARY_REDIRECT || statusCode == 308) { // is 3xx todo follow https://issues.apache.org/jira/browse/HTTPCORE-389 page.setRedirect(true); if (myController.getConfig().isFollowRedirects()) { String movedToUrl = fetchResult.getMovedToUrl(); if (movedToUrl == null) { throw new RedirectException( "Unexpected error, URL: " + curURL + " is redirected to NOTHING"); } page.setRedirectedToUrl(movedToUrl); int newDocId = docIdServer.getDocId(movedToUrl); if (newDocId > 0) { throw new RedirectException("Redirect page: " + curURL + " is already seen"); } WebURL webURL = new WebURL(); webURL.setURL(movedToUrl); webURL.setParentDocid(curURL.getParentDocid()); webURL.setParentUrl(curURL.getParentUrl()); webURL.setDepth(curURL.getDepth()); webURL.setDocid(-1); webURL.setAnchor(curURL.getAnchor()); if (shouldVisit(page, webURL)) { if (robotstxtServer.allows(webURL)) { webURL.setDocid(docIdServer.getNewDocID(movedToUrl)); frontier.schedule(webURL); } else { // logger.debug("Not visiting: {} as per the server's \"robots.txt\" policy", webURL.getURL()); } } else { // logger.debug("Not visiting: {} as per your 
\"shouldVisit\" policy", webURL.getURL()); } } } else { // All other http codes other than 3xx & 200 String description = EnglishReasonPhraseCatalog.INSTANCE.getReason(fetchResult.getStatusCode(), Locale.ENGLISH); // Finds the status reason for all known statuses String contentType = fetchResult.getEntity() == null ? "" : fetchResult.getEntity().getContentType().getValue(); onUnexpectedStatusCode(curURL.getURL(), fetchResult.getStatusCode(), contentType, description); } } else { // if status code is 200 if (!curURL.getURL().equals(fetchResult.getFetchedUrl())) { if (docIdServer.isSeenBefore(fetchResult.getFetchedUrl())) { throw new RedirectException("Redirect page: " + curURL + " has already been seen"); } curURL.setURL(fetchResult.getFetchedUrl()); curURL.setDocid(docIdServer.getNewDocID(fetchResult.getFetchedUrl())); } if (!fetchResult.fetchContent(page)) { throw new ContentFetchException(); } parser.parse(page, curURL.getURL()); ParseData parseData = page.getParseData(); List<WebURL> toSchedule = new ArrayList<>(); int maxCrawlDepth = myController.getConfig().getMaxDepthOfCrawling(); for (WebURL webURL : parseData.getOutgoingUrls()) { webURL.setParentDocid(curURL.getDocid()); webURL.setParentUrl(curURL.getURL()); int newdocid = docIdServer.getDocId(webURL.getURL()); if (newdocid > 0) { // This is not the first time that this Url is visited. So, we set the depth to a negative number. 
webURL.setDepth((short) -1); webURL.setDocid(newdocid); } else { webURL.setDocid(-1); webURL.setDepth((short) (curURL.getDepth() + 1)); if ((maxCrawlDepth == -1) || (curURL.getDepth() < maxCrawlDepth)) { if (shouldVisit(page, webURL)) { if (robotstxtServer.allows(webURL)) { webURL.setDocid(docIdServer.getNewDocID(webURL.getURL())); toSchedule.add(webURL); } else { // logger.debug("Not visiting: {} as per the server's \"robots.txt\" policy", webURL.getURL()); } } else { // logger.debug("Not visiting: {} as per your \"shouldVisit\" policy", webURL.getURL()); } } } } frontier.scheduleAll(toSchedule); visit(page); } } catch (PageBiggerThanMaxSizeException e) { onPageBiggerThanMaxSize(curURL.getURL(), e.getPageSize()); } catch (ParseException pe) { onParseError(curURL); } catch (ContentFetchException cfe) { onContentFetchError(curURL); } catch (RedirectException re) { // logger.log(re.level, re.getMessage()); } catch (NotAllowedContentException nace) { // logger.debug("Skipping: {} as it contains binary content which you configured not to crawl", curURL.getURL()); } catch (Exception e) { String urlStr = (curURL == null ? "NULL" : curURL.getURL()); // logger.error("{}, while processing: {}", e.getMessage(), urlStr); // logger.debug("Stacktrace", e); } finally { if (fetchResult != null) { fetchResult.discardContentIfNotConsumed(); } } }
From source file:io.github.cidisk.indexcrawler.WebCrawler.java
private void processPage(WebURL curURL) { PageFetchResult fetchResult = null;// w w w . j a v a 2s .c o m try { if (curURL == null) { throw new Exception("Failed processing a NULL url !?"); } // add for checking depth for optimized fish search algorithm if (curURL.getDepth() == 0) { throw new DeadFishException(curURL.getDepth()); } fetchResult = pageFetcher.fetchPage(curURL); int statusCode = fetchResult.getStatusCode(); handlePageStatusCode(curURL, statusCode, EnglishReasonPhraseCatalog.INSTANCE.getReason(statusCode, Locale.ENGLISH)); // Finds the status reason for all // known statuses Page page = new Page(curURL); page.setFetchResponseHeaders(fetchResult.getResponseHeaders()); page.setStatusCode(statusCode); if (statusCode != HttpStatus.SC_OK) { // Not 200 if (statusCode == HttpStatus.SC_MOVED_PERMANENTLY || statusCode == HttpStatus.SC_MOVED_TEMPORARILY || statusCode == HttpStatus.SC_MULTIPLE_CHOICES || statusCode == HttpStatus.SC_SEE_OTHER || statusCode == HttpStatus.SC_TEMPORARY_REDIRECT || statusCode == 308) { // is 3xx todo follow // https://issues.apache.org/jira/browse/HTTPCORE-389 page.setRedirect(true); if (myController.getConfig().isFollowRedirects()) { String movedToUrl = fetchResult.getMovedToUrl(); if (movedToUrl == null) { throw new RedirectException(Level.WARN, "Unexpected error, URL: " + curURL + " is redirected to NOTHING"); } page.setRedirectedToUrl(movedToUrl); int newDocId = docIdServer.getDocId(movedToUrl); if (newDocId > 0) { throw new RedirectException(Level.DEBUG, "Redirect page: " + curURL + " is already seen"); } WebURL webURL = new WebURL(); webURL.setURL(movedToUrl); webURL.setParentDocid(curURL.getParentDocid()); webURL.setParentUrl(curURL.getParentUrl()); webURL.setDepth(curURL.getDepth()); webURL.setDocid(-1); webURL.setAnchor(curURL.getAnchor()); // ////////// webURL.setPriority(Util.int2ByteArray(255)[3]); if (shouldVisit(page, webURL)) { // if (robotstxtServer.allows(webURL)) { logger.debug("Visiting: {}", webURL.getURL()); 
webURL.setDocid(docIdServer.getNewDocID(movedToUrl)); frontier.schedule(webURL); // } else { // logger.debug("Not visiting: {} as per the server's \"robots.txt\" policy", // webURL.getURL()); // } } else { logger.debug("Not visiting: {} as per your \"shouldVisit\" policy", webURL.getURL()); } } } else { // All other http codes other than 3xx & 200 String description = EnglishReasonPhraseCatalog.INSTANCE.getReason(fetchResult.getStatusCode(), Locale.ENGLISH); // Finds the status reason // for all known // statuses String contentType = fetchResult.getEntity() == null ? "" : fetchResult.getEntity().getContentType().getValue(); onUnexpectedStatusCode(curURL.getURL(), fetchResult.getStatusCode(), contentType, description); } } else { // if status code is 200 if (!curURL.getURL().equals(fetchResult.getFetchedUrl())) { if (docIdServer.isSeenBefore(fetchResult.getFetchedUrl())) { throw new RedirectException(Level.DEBUG, "Redirect page: " + curURL + " has already been seen"); } curURL.setURL(fetchResult.getFetchedUrl()); curURL.setDocid(docIdServer.getNewDocID(fetchResult.getFetchedUrl())); } if (!fetchResult.fetchContent(page)) { throw new ContentFetchException(); } if (!parser.parse(page, curURL.getURL(), curURL.getDepth())) { throw new ParseException(); } ParseData parseData = page.getParseData(); List<WebURL> toSchedule = new ArrayList<>(); // int crawlDepth = // myController.getConfig().getDepthOfCrawling(); Set<WebURL> infourls = parseData.getInfoUrls(); // calculate the rate of new info urls double newDocRate = 0.0; if (null != infourls) { int cnt = 0; for (WebURL webURL : infourls) { int newinfodocid = infoDocIdServer.getDocId(webURL.getURL()); // This is not the first time that this Url is visited. // So, we set the depth to a negative number. 
webURL.setDepth(curURL.getDepth()); if (!infoDocIdServer.isSeenBefore(webURL.getURL())) { cnt++; newinfodocid = infoDocIdServer.getNewDocID(webURL.getURL()); } webURL.setDocid(newinfodocid); } newDocRate = ((double) cnt) / infourls.size(); } for (WebURL webURL : parseData.getOutgoingIndexUrls()) { webURL.setParentDocid(curURL.getDocid()); webURL.setParentUrl(curURL.getURL()); webURL.setPriority(Util.int2ByteArray((int) (255 * newDocRate))[3]); int newdocid = docIdServer.getDocId(webURL.getURL()); // if (newdocid > 0) { if (newdocid > 0) { // This is not the first time that this Url is visited. // So, we set the depth to a negative number. webURL.setDepth((short) -1); // webURL.setDepth( (short) (curURL.getDepth()-1)); webURL.setDocid(newdocid); } else { webURL.setDocid(-1); // webURL.setDepth((short) (curURL.getDepth() + 1)); if (newDocRate == 0) { webURL.setDepth((short) (curURL.getDepth() - 1)); } else { webURL.setDepth((short) curURL.getDepth()); } // if (maxCrawlDepth == -1 || curURL.getDepth() < // maxCrawlDepth) { if (shouldVisit(page, webURL)) { // if (robotstxtServer.allows(webURL)) { logger.debug("Visiting: {}", webURL.getURL()); webURL.setDocid(docIdServer.getNewDocID(webURL.getURL())); toSchedule.add(webURL); // } else { // logger.debug("Not visiting: {} as per the server's \"robots.txt\" policy", // webURL.getURL()); // } } else { logger.debug("Not visiting: {} as per your \"shouldVisit\" policy", webURL.getURL()); } // } } } frontier.scheduleAll(toSchedule); visit(page); } } catch (PageBiggerThanMaxSizeException e) { onPageBiggerThanMaxSize(curURL.getURL(), e.getPageSize()); } catch (ParseException pe) { onParseError(curURL); } catch (ContentFetchException cfe) { onContentFetchError(curURL); } catch (RedirectException re) { logger.log(re.level, re.getMessage()); } catch (NotAllowedContentException nace) { logger.debug("Skipping: {} as it contains binary content which you configured not to crawl", curURL.getURL()); } catch (Exception e) { String urlStr 
= (curURL == null ? "NULL" : curURL.getURL()); logger.error("{}, while processing: {}", e.getMessage(), urlStr); logger.debug("Stacktrace", e); } finally { if (fetchResult != null) { fetchResult.discardContentIfNotConsumed(); } } }
From source file:org.eclipse.mylyn.commons.repositories.http.core.HttpUtil.java
/**
 * Looks up the reason phrase for an HTTP status code in the English reason phrase catalog.
 *
 * @param statusCode the HTTP status code to describe
 * @return whatever the catalog returns for {@code statusCode} when queried with the JVM's
 *         default locale
 */
public static String getStatusText(int statusCode) {
    final Locale defaultLocale = Locale.getDefault();
    final String reasonPhrase = EnglishReasonPhraseCatalog.INSTANCE.getReason(statusCode, defaultLocale);
    return reasonPhrase;
}
From source file:xyz.invik.ccrawler4j.crawler.WebCrawler.java
private void processPage(WebURL curURL) { PageFetchResult fetchResult = null;/*from ww w. jav a2 s. c om*/ try { if (curURL == null) { return; } fetchResult = pageFetcher.fetchPage(curURL); int statusCode = fetchResult.getStatusCode(); handlePageStatusCode(curURL, statusCode, EnglishReasonPhraseCatalog.INSTANCE.getReason(statusCode, Locale.ENGLISH)); // Finds the status reason for all known statuses Page page = new Page(curURL); page.setFetchResponseHeaders(fetchResult.getResponseHeaders()); page.setStatusCode(statusCode); if (statusCode < 200 || statusCode > 299) { // Not 2XX: 2XX status codes indicate success if (statusCode == HttpStatus.SC_MOVED_PERMANENTLY || statusCode == HttpStatus.SC_MOVED_TEMPORARILY || statusCode == HttpStatus.SC_MULTIPLE_CHOICES || statusCode == HttpStatus.SC_SEE_OTHER || statusCode == HttpStatus.SC_TEMPORARY_REDIRECT || statusCode == 308) { // is 3xx todo // follow https://issues.apache.org/jira/browse/HTTPCORE-389 page.setRedirect(true); String movedToUrl = fetchResult.getMovedToUrl(); if (movedToUrl == null) { logger.warn("Unexpected error, URL: {} is redirected to NOTHING", curURL); return; } page.setRedirectedToUrl(movedToUrl); onRedirectedStatusCode(page); if (myController.getConfig().isFollowRedirects()) { int newDocId = docIdServer.getDocId(movedToUrl); if (newDocId > 0) { logger.debug("Redirect page: {} is already seen", curURL); return; } WebURL webURL = new WebURL(); webURL.setURL(movedToUrl); webURL.setParentDocid(curURL.getParentDocid()); webURL.setParentUrl(curURL.getParentUrl()); webURL.setDepth(curURL.getDepth()); webURL.setDocid(-1); webURL.setAnchor(curURL.getAnchor()); webURL.setLabel(curURL.getLabel()); // Update the number of consecutive redirections followed to reach webURL. 
webURL.setRedirectionDepth(curURL.getRedirectionDepth() + 1); if (shouldVisit(page, webURL)) { if (!shouldFollowLinksIn(webURL) || robotstxtServer.allows(webURL)) { webURL.setDocid(docIdServer.getNewDocID(movedToUrl)); frontier.schedule(webURL); } else { logger.debug("Not visiting: {} as per the server's \"robots.txt\" policy", webURL.getURL()); } } else { logger.debug("Not visiting: {} as per your \"shouldVisit\" policy", webURL.getURL()); } } } else { // All other http codes other than 3xx & 200 String description = EnglishReasonPhraseCatalog.INSTANCE.getReason(fetchResult.getStatusCode(), Locale.ENGLISH); // Finds // the status reason for all known statuses String contentType = fetchResult.getEntity() == null ? "" : fetchResult.getEntity().getContentType() == null ? "" : fetchResult.getEntity().getContentType().getValue(); onUnexpectedStatusCode(curURL.getURL(), fetchResult.getStatusCode(), contentType, description); } } else { // if status code is 200 if (!curURL.getURL().equals(fetchResult.getFetchedUrl())) { if (docIdServer.isSeenBefore(fetchResult.getFetchedUrl())) { logger.debug("Redirect page: {} has already been seen", curURL); return; } curURL.setURL(fetchResult.getFetchedUrl()); curURL.setDocid(docIdServer.getNewDocID(fetchResult.getFetchedUrl())); } if (!fetchResult.fetchContent(page, myController.getConfig().getMaxDownloadSize())) { throw new ContentFetchException(); } if (page.isTruncated()) { logger.warn( "Warning: unknown page size exceeded max-download-size, truncated to: " + "({}), at URL: {}", myController.getConfig().getMaxDownloadSize(), curURL.getURL()); } parser.parse(page, curURL.getURL()); if (shouldFollowLinksIn(page.getWebURL())) { ParseData parseData = page.getParseData(); List<WebURL> toSchedule = new ArrayList<>(); int maxCrawlDepth = myController.getConfig().getMaxDepthOfCrawling(); for (WebURL webURL : parseData.getOutgoingUrls()) { webURL.setParentDocid(curURL.getDocid()); webURL.setParentUrl(curURL.getURL()); int newdocid = 
docIdServer.getDocId(webURL.getURL()); if (newdocid > 0) { // This is not the first time that this Url is visited. So, we set the // depth to a negative number. webURL.setDepth((short) -1); webURL.setDocid(newdocid); } else { webURL.setDocid(-1); webURL.setDepth((short) (curURL.getDepth() + 1)); if ((maxCrawlDepth == -1) || (curURL.getDepth() < maxCrawlDepth)) { if (shouldVisit(page, webURL)) { if (robotstxtServer.allows(webURL)) { webURL.setDocid(docIdServer.getNewDocID(webURL.getURL())); toSchedule.add(webURL); } else { logger.debug( "Not visiting: {} as per the server's \"robots.txt\" " + "policy", webURL.getURL()); } } else { logger.debug("Not visiting: {} as per your \"shouldVisit\" policy", webURL.getURL()); } } } } frontier.scheduleAll(toSchedule); } else { logger.debug( "Not looking for links in page {}, " + "as per your \"shouldFollowLinksInPage\" policy", page.getWebURL().getURL()); } boolean noIndex = myController.getConfig().isRespectNoIndex() && page.getContentType() != null && page.getContentType().contains("html") && ((HtmlParseData) page.getParseData()).getMetaTagValue("robots").contains("noindex"); if (!noIndex) { visit(page); } } } catch (PageBiggerThanMaxSizeException e) { onPageBiggerThanMaxSize(curURL.getURL(), e.getPageSize()); } catch (ParseException pe) { onParseError(curURL); } catch (ContentFetchException cfe) { onContentFetchError(curURL); } catch (NotAllowedContentException nace) { logger.debug("Skipping: {} as it contains binary content which you configured not to crawl", curURL.getURL()); } catch (Exception e) { onUnhandledException(curURL, e); } finally { if (fetchResult != null) { fetchResult.discardContentIfNotConsumed(); } } }
From source file:org.eclipse.californium.proxy.HttpTranslator.java
/** * Sets the parameters of the incoming http response from a CoAP response. * The status code is mapped through the properties file and is set through * the StatusLine. The options are translated to the corresponding headers * and the max-age (in the header cache-control) is set to the default value * (60 seconds) if not already present. If the request method was not HEAD * and the coap response has a payload, the entity and the content-type are * set in the http response.//w ww . j ava 2 s.c o m * * @param coapResponse * the coap response * @param httpResponse * * * * @param httpRequest * HttpRequest * @throws TranslationException * the translation exception */ public static void getHttpResponse(HttpRequest httpRequest, Response coapResponse, HttpResponse httpResponse) throws TranslationException { if (httpRequest == null) { throw new IllegalArgumentException("httpRequest == null"); } if (coapResponse == null) { throw new IllegalArgumentException("coapResponse == null"); } if (httpResponse == null) { throw new IllegalArgumentException("httpResponse == null"); } // get/set the response code ResponseCode coapCode = coapResponse.getCode(); String httpCodeString = HTTP_TRANSLATION_PROPERTIES.getProperty(KEY_COAP_CODE + coapCode.value); if (httpCodeString == null || httpCodeString.isEmpty()) { LOGGER.warning("httpCodeString == null"); throw new TranslationException("httpCodeString == null"); } int httpCode = 0; try { httpCode = Integer.parseInt(httpCodeString.trim()); } catch (NumberFormatException e) { LOGGER.warning("Cannot convert the coap code in http status code" + e); throw new TranslationException("Cannot convert the coap code in http status code", e); } // create the http response and set the status line String reason = EnglishReasonPhraseCatalog.INSTANCE.getReason(httpCode, Locale.ENGLISH); StatusLine statusLine = new BasicStatusLine(HttpVersion.HTTP_1_1, httpCode, reason); httpResponse.setStatusLine(statusLine); String uriString = 
httpRequest.getRequestLine().getUri(); int index_query = uriString.indexOf("//"); String query = uriString.substring(index_query + 2); int index_host = query.indexOf("/"); String host = query.substring(0, index_host); // set the headers Header[] headers = getHttpHeaders(coapResponse.getOptions().asSortedList(), host); httpResponse.setHeaders(headers); // set max-age if not already set if (!httpResponse.containsHeader("cache-control")) { httpResponse.setHeader("cache-control", "max-age=" + Long.toString(OptionNumberRegistry.Defaults.MAX_AGE)); } // get the http entity if the request was not HEAD if (!httpRequest.getRequestLine().getMethod().equalsIgnoreCase("head")) { if ((httpRequest.getRequestLine().getMethod().equalsIgnoreCase("put")) && (coapCode.value == 131)) { String linkPut = getLinkPut(coapResponse); Header link = new BasicHeader("Link", linkPut); httpResponse.addHeader(link); } // if the content-type is not set in the coap response and if the // response contains an error, then the content-type should set to // text-plain if (coapResponse.getOptions().getContentFormat() == MediaTypeRegistry.UNDEFINED && (ResponseCode.isClientError(coapCode) || ResponseCode.isServerError(coapCode))) { LOGGER.info("Set contenttype to TEXT_PLAIN"); coapResponse.getOptions().setContentFormat(MediaTypeRegistry.TEXT_PLAIN); } HttpEntity httpEntity = getHttpEntity(coapResponse); if (httpEntity != null) { httpResponse.setEntity(httpEntity); // get the content-type from the entity and set the header ContentType contentType = ContentType.get(httpEntity); httpResponse.setHeader("content-type", contentType.toString()); } } }