Example usage for org.apache.http.impl EnglishReasonPhraseCatalog INSTANCE

List of usage examples for org.apache.http.impl EnglishReasonPhraseCatalog INSTANCE

Introduction

On this page you can find example usages of org.apache.http.impl EnglishReasonPhraseCatalog INSTANCE.

Prototype

EnglishReasonPhraseCatalog INSTANCE

To view the source code for org.apache.http.impl EnglishReasonPhraseCatalog INSTANCE, click Source Link.

Usage

From source file:org.apache.nifi.processors.aws.wag.InvokeAWSGatewayApi.java

/**
 * Performs one request/response cycle against the gateway client returned by {@code getClient()}
 * and routes the resulting FlowFiles.
 *
 * <p>Depending on the status code and on the {@code PROP_PUT_OUTPUT_IN_ATTRIBUTE} and
 * {@code PROP_OUTPUT_RESPONSE_REGARDLESS} properties, the response body is written as the content
 * of a newly created response FlowFile and/or into an attribute of the request FlowFile. The status
 * code, status message, endpoint prefix, resource name and a per-invocation transaction id are
 * written as attributes to every FlowFile that is updated.
 *
 * <p>Any exception raised during processing penalizes the request FlowFile and transfers it to the
 * failure relationship; when there is no request FlowFile the processor yields instead. A response
 * FlowFile created before the failure is removed from the session.
 *
 * @param context supplies the processor property values
 * @param session used to obtain, create, update and transfer FlowFiles
 * @throws ProcessException declared by the overridden method
 */
@Override
public void onTrigger(ProcessContext context, ProcessSession session) throws ProcessException {
    ComponentLog logger = getLogger();
    FlowFile requestFlowFile = session.get();

    // Checking to see if the property to put the body of the response in an attribute was set
    boolean putToAttribute = context.getProperty(PROP_PUT_OUTPUT_IN_ATTRIBUTE).isSet();
    if (requestFlowFile == null) {
        String request = context.getProperty(PROP_METHOD).evaluateAttributeExpressions().getValue()
                .toUpperCase();
        // Methods that carry a request body cannot run without an incoming flowfile.
        if ("POST".equals(request) || "PUT".equals(request) || "PATCH".equals(request)) {
            return;
        } else if (putToAttribute) {
            // The body is to be stored in an attribute, so a flowfile is needed to hold it.
            requestFlowFile = session.create();
        }
    }

    // Every request/response cycle has a unique transaction id which will be stored as a flowfile attribute.
    final UUID txId = UUID.randomUUID();
    FlowFile responseFlowFile = null;

    try {
        final int maxAttributeSize = context.getProperty(PROP_PUT_ATTRIBUTE_MAX_LENGTH).asInteger();

        final String resourceName = context.getProperty(PROP_RESOURCE_NAME).getValue();

        final GenericApiGatewayClient client = getClient();

        final GenericApiGatewayRequest request = configureRequest(context, session, resourceName,
                requestFlowFile);

        logRequest(logger, client.getEndpoint(), request);
        final long startNanos = System.nanoTime();
        GenericApiGatewayResponse response = null;
        GenericApiGatewayException exception = null;
        try {
            response = client.execute(request);
            logResponse(logger, response);
        } catch (GenericApiGatewayException gag) {
            // ERROR response codes may come back as exceptions, 404 for example
            exception = gag;
        }

        // From here on exactly one of response / exception is expected to be non-null.
        final int statusCode;
        if (exception != null) {
            statusCode = exception.getStatusCode();
        } else {
            statusCode = response.getHttpResponse().getStatusCode();
        }

        if (statusCode == 0) {
            throw new IllegalStateException("Status code unknown, connection hasn't been attempted.");
        }
        final String endpoint = context.getProperty(PROP_AWS_GATEWAY_API_ENDPOINT).getValue();
        boolean outputRegardless = context.getProperty(PROP_OUTPUT_RESPONSE_REGARDLESS).asBoolean();

        // && binds tighter than ||: (success and not put-to-attribute) or output-regardless
        boolean outputBodyToResponseContent = (isSuccess(statusCode) && !putToAttribute || outputRegardless);
        boolean outputBodyToRequestAttribute = (!isSuccess(statusCode) || putToAttribute)
                && requestFlowFile != null;
        boolean bodyExists = response != null && response.getBody() != null;

        final String statusExplanation;
        if (exception != null) {
            // No response object is available, so derive the reason phrase from the status code.
            statusExplanation = EnglishReasonPhraseCatalog.INSTANCE.getReason(statusCode, null);
        } else {
            statusExplanation = response.getHttpResponse().getStatusText();
        }

        // Create a map of the status attributes that are always written to the request and response FlowFiles
        final Map<String, String> statusAttributes = new HashMap<>();
        statusAttributes.put(STATUS_CODE, String.valueOf(statusCode));
        statusAttributes.put(STATUS_MESSAGE, statusExplanation);
        statusAttributes.put(ENDPOINT_ATTR, client.getEndpointPrefix());
        statusAttributes.put(RESOURCE_NAME_ATTR, resourceName);
        statusAttributes.put(TRANSACTION_ID, txId.toString());

        if (outputBodyToResponseContent) {
            /*
             * If successful and putting to response flowfile, store the response body as the flowfile payload
             * we include additional flowfile attributes including the response headers and the status codes.
             */

            // clone the flowfile to capture the response
            if (requestFlowFile != null) {
                responseFlowFile = session.create(requestFlowFile);
                // write attributes to request flowfile
                requestFlowFile = session.putAllAttributes(requestFlowFile, statusAttributes);
                // If the property to add the response headers to the request flowfile is true then add them
                if (context.getProperty(PROP_ADD_HEADERS_TO_REQUEST).asBoolean()) {
                    // write the response headers as attributes
                    // this will overwrite any existing flowfile attributes
                    // The response is null when the gateway reported the status through an exception
                    // (reachable here when output-regardless is enabled), so apply the same null guard
                    // that is used for the response flowfile below and take the headers from the exception.
                    if (response != null) {
                        requestFlowFile = session.putAllAttributes(requestFlowFile,
                                convertAttributesFromHeaders(response));
                    } else {
                        requestFlowFile = session.putAllAttributes(requestFlowFile,
                                exception.getHttpHeaders());
                    }
                }
            } else {
                responseFlowFile = session.create();
            }

            // write attributes to response flowfile
            responseFlowFile = session.putAllAttributes(responseFlowFile, statusAttributes);

            // write the response headers as attributes
            // this will overwrite any existing flowfile attributes
            if (response != null) {
                responseFlowFile = session.putAllAttributes(responseFlowFile,
                        convertAttributesFromHeaders(response));
            } else {
                responseFlowFile = session.putAllAttributes(responseFlowFile, exception.getHttpHeaders());
            }
            // transfer the message body to the payload
            // can potentially be null in edge cases
            if (bodyExists) {
                final String contentType = response.getHttpResponse().getHeaders().get("Content-Type");
                if (!(contentType == null) && !contentType.trim().isEmpty()) {
                    responseFlowFile = session.putAttribute(responseFlowFile, CoreAttributes.MIME_TYPE.key(),
                            contentType.trim());
                }

                // NOTE(review): getBytes() encodes with the platform default charset; confirm whether the
                // payload should instead be written with an explicit charset.
                responseFlowFile = session.importFrom(new ByteArrayInputStream(response.getBody().getBytes()),
                        responseFlowFile);

                // emit provenance event
                final long millis = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - startNanos);
                if (requestFlowFile != null) {
                    session.getProvenanceReporter().fetch(responseFlowFile, endpoint, millis);
                } else {
                    session.getProvenanceReporter().receive(responseFlowFile, endpoint, millis);
                }
            } else if (exception != null) {
                // The raw error payload of the exception is stored and labelled as JSON.
                final String contentType = "application/json";
                responseFlowFile = session.putAttribute(responseFlowFile, CoreAttributes.MIME_TYPE.key(),
                        contentType.trim());

                responseFlowFile = session.importFrom(new ByteArrayInputStream(exception.getRawResponse()),
                        responseFlowFile);

                // emit provenance event
                final long millis = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - startNanos);
                if (requestFlowFile != null) {
                    session.getProvenanceReporter().fetch(responseFlowFile, endpoint, millis);
                } else {
                    session.getProvenanceReporter().receive(responseFlowFile, endpoint, millis);
                }
            }
        }
        // if not successful and request flowfile is not null, store the response body into a flowfile attribute
        if (outputBodyToRequestAttribute) {
            String attributeKey = context.getProperty(PROP_PUT_OUTPUT_IN_ATTRIBUTE)
                    .evaluateAttributeExpressions(requestFlowFile).getValue();
            if (attributeKey == null) {
                attributeKey = RESPONSE_BODY;
            }
            byte[] outputBuffer;
            int size = 0;
            // At most maxAttributeSize bytes of the body are copied into the attribute.
            outputBuffer = new byte[maxAttributeSize];
            if (bodyExists) {
                // Encode explicitly as UTF-8 so the bytes match the UTF-8 decoding applied below;
                // the platform default charset would garble the attribute on non-UTF-8 systems.
                size = StreamUtils.fillBuffer(
                        new ByteArrayInputStream(response.getBody().getBytes("UTF-8")), outputBuffer, false);
            } else if (exception != null && exception.getRawResponse() != null
                    && exception.getRawResponse().length > 0) {
                size = StreamUtils.fillBuffer(new ByteArrayInputStream(exception.getRawResponse()),
                        outputBuffer, false);
            }

            if (size > 0) {
                String bodyString = new String(outputBuffer, 0, size, "UTF-8");
                requestFlowFile = session.putAttribute(requestFlowFile, attributeKey, bodyString);
            }

            requestFlowFile = session.putAllAttributes(requestFlowFile, statusAttributes);

            final long millis = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - startNanos);
            session.getProvenanceReporter().modifyAttributes(requestFlowFile,
                    "The " + attributeKey + " has been added. The value of which is the body of a http call to "
                            + endpoint + resourceName + ". It took " + millis + "millis,");
        }

        route(requestFlowFile, responseFlowFile, session, context, statusCode, getRelationships());
    } catch (Exception e) {
        // penalize or yield
        if (requestFlowFile != null) {
            logger.error("Routing to {} due to exception: {}", new Object[] { REL_FAILURE.getName(), e }, e);
            requestFlowFile = session.penalize(requestFlowFile);
            requestFlowFile = session.putAttribute(requestFlowFile, EXCEPTION_CLASS, e.getClass().getName());
            requestFlowFile = session.putAttribute(requestFlowFile, EXCEPTION_MESSAGE, e.getMessage());
            // transfer original to failure
            session.transfer(requestFlowFile, getRelationshipForName(REL_FAILURE_NAME, getRelationships()));
        } else {
            logger.error("Yielding processor due to exception encountered as a source processor: {}", e);
            context.yield();
        }

        // cleanup response flowfile, if applicable
        try {
            if (responseFlowFile != null) {
                session.remove(responseFlowFile);
            }
        } catch (final Exception e1) {
            logger.error("Could not cleanup response flowfile due to exception: {}", new Object[] { e1 }, e1);
        }
    }
}

From source file:org.deviceconnect.message.event.AbstractEventManager.java

/**
 * Creates a copy of the given HTTP response.
 *
 * <p>The copy has the same protocol version, status code and headers as the original; its reason
 * phrase is looked up from {@code EnglishReasonPhraseCatalog} for the status code. When the original
 * carries an entity, its content is read fully into a string (UTF-8 is passed as the default
 * charset) and attached to the copy as a UTF-8 encoded {@code StringEntity}. A response without an
 * entity is copied without one.
 *
 * @param response the response to copy
 * @return a new response holding the copied status, headers and body
 * @throws IOException if the entity content cannot be read or encoded
 */
protected HttpResponse copyResponse(final HttpResponse response) throws IOException {
    int code = response.getStatusLine().getStatusCode();
    HttpResponse retRes = new BasicHttpResponse(response.getProtocolVersion(), code,
            EnglishReasonPhraseCatalog.INSTANCE.getReason(code, null));
    retRes.setHeaders(response.getAllHeaders());
    // Responses such as 204/304 have no entity; there is nothing to read in that case.
    if (response.getEntity() != null) {
        // Re-encode with an explicit UTF-8 charset: the single-argument StringEntity constructor
        // falls back to the library default charset, which cannot represent non-Latin-1 characters.
        retRes.setEntity(new StringEntity(EntityUtils.toString(response.getEntity(), "UTF-8"), "UTF-8"));
    }
    return retRes;
}

From source file:com.crawler.app.crawler.WebCrawler.java

/**
 * Fetches the given URL and processes the result according to its HTTP status code.
 *
 * <p>For the redirect codes checked below, the redirect target is scheduled when the configuration
 * follows redirects and the target passes {@code shouldVisit} and the robots.txt check. Any other
 * non-200 code is reported through {@code onUnexpectedStatusCode}. For 200 the content is fetched and
 * parsed, unseen outgoing URLs within the depth limit are scheduled, and {@code visit} is called.
 * Exceptions are handled inside the method; fetched content that was not consumed is always discarded.
 *
 * @param curURL the URL to process; {@code null} is reported as an error
 */
private void processPage(WebURL curURL) {
    PageFetchResult fetchResult = null;
    try {
        if (curURL == null) {
            throw new Exception("Failed processing a NULL url !?");
        }

        fetchResult = pageFetcher.fetchPage(curURL);
        int statusCode = fetchResult.getStatusCode();
        // Finds the status reason for all known statuses; looked up once and reused below
        final String reasonPhrase = EnglishReasonPhraseCatalog.INSTANCE.getReason(statusCode, Locale.ENGLISH);
        handlePageStatusCode(curURL, statusCode, reasonPhrase);

        Page page = new Page(curURL);
        page.setFetchResponseHeaders(fetchResult.getResponseHeaders());
        page.setStatusCode(statusCode);
        if (statusCode != HttpStatus.SC_OK) { // Not 200
            if (statusCode == HttpStatus.SC_MOVED_PERMANENTLY || statusCode == HttpStatus.SC_MOVED_TEMPORARILY
                    || statusCode == HttpStatus.SC_MULTIPLE_CHOICES || statusCode == HttpStatus.SC_SEE_OTHER
                    || statusCode == HttpStatus.SC_TEMPORARY_REDIRECT || statusCode == 308) { // is 3xx  todo follow https://issues.apache.org/jira/browse/HTTPCORE-389

                page.setRedirect(true);
                if (myController.getConfig().isFollowRedirects()) {
                    String movedToUrl = fetchResult.getMovedToUrl();
                    if (movedToUrl == null) {
                        throw new RedirectException(Level.WARN,
                                "Unexpected error, URL: " + curURL + " is redirected to NOTHING");
                    }
                    page.setRedirectedToUrl(movedToUrl);

                    int newDocId = docIdServer.getDocId(movedToUrl);
                    if (newDocId > 0) {
                        throw new RedirectException(Level.DEBUG,
                                "Redirect page: " + curURL + " is already seen");
                    }

                    // The redirect target inherits parent, depth and anchor from the redirecting URL.
                    WebURL webURL = new WebURL();
                    webURL.setURL(movedToUrl);
                    webURL.setParentDocid(curURL.getParentDocid());
                    webURL.setParentUrl(curURL.getParentUrl());
                    webURL.setDepth(curURL.getDepth());
                    webURL.setDocid(-1);
                    webURL.setAnchor(curURL.getAnchor());
                    if (shouldVisit(page, webURL)) {
                        if (robotstxtServer.allows(webURL)) {
                            webURL.setDocid(docIdServer.getNewDocID(movedToUrl));
                            frontier.schedule(webURL);
                        } else {
                            logger.debug("Not visiting: {} as per the server's \"robots.txt\" policy",
                                    webURL.getURL());
                        }
                    } else {
                        logger.debug("Not visiting: {} as per your \"shouldVisit\" policy", webURL.getURL());
                    }
                }
            } else { // All other http codes other than 3xx & 200
                // The entity may be absent and an entity may carry no Content-Type header;
                // report an empty content type in both cases instead of failing with an NPE.
                String contentType = "";
                if (fetchResult.getEntity() != null && fetchResult.getEntity().getContentType() != null) {
                    contentType = fetchResult.getEntity().getContentType().getValue();
                }
                onUnexpectedStatusCode(curURL.getURL(), statusCode, contentType, reasonPhrase);
            }

        } else { // if status code is 200
            // The fetched URL differs from the requested one: continue under the fetched URL
            // unless that URL has already been seen.
            if (!curURL.getURL().equals(fetchResult.getFetchedUrl())) {
                if (docIdServer.isSeenBefore(fetchResult.getFetchedUrl())) {
                    throw new RedirectException(Level.DEBUG,
                            "Redirect page: " + curURL + " has already been seen");
                }
                curURL.setURL(fetchResult.getFetchedUrl());
                curURL.setDocid(docIdServer.getNewDocID(fetchResult.getFetchedUrl()));
            }

            if (!fetchResult.fetchContent(page)) {
                throw new ContentFetchException();
            }

            parser.parse(page, curURL.getURL());

            ParseData parseData = page.getParseData();
            List<WebURL> toSchedule = new ArrayList<>();
            int maxCrawlDepth = myController.getConfig().getMaxDepthOfCrawling();
            for (WebURL webURL : parseData.getOutgoingUrls()) {
                webURL.setParentDocid(curURL.getDocid());
                webURL.setParentUrl(curURL.getURL());
                int newdocid = docIdServer.getDocId(webURL.getURL());
                if (newdocid > 0) {
                    // This is not the first time that this Url is visited. So, we set the depth to a negative number.
                    webURL.setDepth((short) -1);
                    webURL.setDocid(newdocid);
                } else {
                    webURL.setDocid(-1);
                    webURL.setDepth((short) (curURL.getDepth() + 1));
                    // A max depth of -1 disables the depth check.
                    if (maxCrawlDepth == -1 || curURL.getDepth() < maxCrawlDepth) {
                        if (shouldVisit(page, webURL)) {
                            if (robotstxtServer.allows(webURL)) {
                                webURL.setDocid(docIdServer.getNewDocID(webURL.getURL()));
                                toSchedule.add(webURL);
                            } else {
                                logger.debug("Not visiting: {} as per the server's \"robots.txt\" policy",
                                        webURL.getURL());
                            }
                        } else {
                            logger.debug("Not visiting: {} as per your \"shouldVisit\" policy",
                                    webURL.getURL());
                        }
                    }
                }
            }
            frontier.scheduleAll(toSchedule);

            visit(page);
        }
    } catch (PageBiggerThanMaxSizeException e) {
        onPageBiggerThanMaxSize(curURL.getURL(), e.getPageSize());
    } catch (ParseException pe) {
        onParseError(curURL);
    } catch (ContentFetchException cfe) {
        onContentFetchError(curURL);
    } catch (RedirectException re) {
        logger.log(re.level, re.getMessage());
    } catch (NotAllowedContentException nace) {
        logger.debug("Skipping: {} as it contains binary content which you configured not to crawl",
                curURL.getURL());
    } catch (Exception e) {
        // curURL can be null here: the null check above reports it by throwing.
        String urlStr = (curURL == null ? "NULL" : curURL.getURL());
        logger.error("{}, while processing: {}", e.getMessage(), urlStr);
        logger.debug("Stacktrace", e);
    } finally {
        if (fetchResult != null) {
            fetchResult.discardContentIfNotConsumed();
        }
    }
}

From source file:com.vnlab.badlink.crawler.WebCrawler.java

/**
 * Fetches one URL and dispatches on the HTTP status code of the result.
 *
 * <p>Redirect codes (as enumerated in the condition below) mark the page as a redirect and, when
 * redirects are followed, schedule the target if it is unseen and permitted by {@code shouldVisit}
 * and robots.txt. Other non-200 codes are passed to {@code onUnexpectedStatusCode}. A 200 result is
 * fetched, parsed, its outgoing links are scheduled within the configured depth, and the page is
 * handed to {@code visit}. All exceptions are handled here and unconsumed content is discarded in
 * the {@code finally} clause.
 *
 * @param curURL the URL to process; a {@code null} value is logged as an error
 */
private void processPage(WebURL curURL) {
    PageFetchResult fetchResult = null;
    try {
        if (curURL == null) {
            throw new Exception("Failed processing a NULL url !?");
        }

        fetchResult = pageFetcher.fetchPage(curURL);
        int statusCode = fetchResult.getStatusCode();
        // Status reason for all known statuses, resolved once for both callbacks that need it
        final String statusDescription = EnglishReasonPhraseCatalog.INSTANCE.getReason(statusCode,
                Locale.ENGLISH);
        handlePageStatusCode(curURL, statusCode, statusDescription);

        Page page = new Page(curURL);
        page.setFetchResponseHeaders(fetchResult.getResponseHeaders());
        page.setStatusCode(statusCode);
        if (statusCode != HttpStatus.SC_OK) { // Not 200
            if (statusCode == HttpStatus.SC_MOVED_PERMANENTLY || statusCode == HttpStatus.SC_MOVED_TEMPORARILY
                    || statusCode == HttpStatus.SC_MULTIPLE_CHOICES || statusCode == HttpStatus.SC_SEE_OTHER
                    || statusCode == HttpStatus.SC_TEMPORARY_REDIRECT || statusCode == 308) { // is 3xx  todo follow https://issues.apache.org/jira/browse/HTTPCORE-389

                page.setRedirect(true);
                if (myController.getConfig().isFollowRedirects()) {
                    String movedToUrl = fetchResult.getMovedToUrl();
                    if (movedToUrl == null) {
                        throw new RedirectException(Level.WARN,
                                "Unexpected error, URL: " + curURL + " is redirected to NOTHING");
                    }
                    page.setRedirectedToUrl(movedToUrl);

                    int newDocId = docIdServer.getDocId(movedToUrl);
                    if (newDocId > 0) {
                        throw new RedirectException(Level.DEBUG,
                                "Redirect page: " + curURL + " is already seen");
                    }

                    // Build the URL for the redirect target, carrying over parent, depth and anchor.
                    WebURL webURL = new WebURL();
                    webURL.setURL(movedToUrl);
                    webURL.setParentDocid(curURL.getParentDocid());
                    webURL.setParentUrl(curURL.getParentUrl());
                    webURL.setDepth(curURL.getDepth());
                    webURL.setDocid(-1);
                    webURL.setAnchor(curURL.getAnchor());
                    if (shouldVisit(page, webURL)) {
                        if (robotstxtServer.allows(webURL)) {
                            webURL.setDocid(docIdServer.getNewDocID(movedToUrl));
                            frontier.schedule(webURL);
                        } else {
                            logger.debug("Not visiting: {} as per the server's \"robots.txt\" policy",
                                    webURL.getURL());
                        }
                    } else {
                        logger.debug("Not visiting: {} as per your \"shouldVisit\" policy", webURL.getURL());
                    }
                }
            } else { // All other http codes other than 3xx & 200
                // Guard both the entity and its Content-Type header: either may be missing, and
                // dereferencing a missing header would abort the callback with an NPE.
                String contentType = "";
                if (fetchResult.getEntity() != null && fetchResult.getEntity().getContentType() != null) {
                    contentType = fetchResult.getEntity().getContentType().getValue();
                }
                onUnexpectedStatusCode(curURL.getURL(), statusCode, contentType, statusDescription);
            }

        } else { // if status code is 200
            // When the fetched URL is not the requested one, switch to it unless it was seen before.
            if (!curURL.getURL().equals(fetchResult.getFetchedUrl())) {
                if (docIdServer.isSeenBefore(fetchResult.getFetchedUrl())) {
                    throw new RedirectException(Level.DEBUG,
                            "Redirect page: " + curURL + " has already been seen");
                }
                curURL.setURL(fetchResult.getFetchedUrl());
                curURL.setDocid(docIdServer.getNewDocID(fetchResult.getFetchedUrl()));
            }

            if (!fetchResult.fetchContent(page)) {
                throw new ContentFetchException();
            }

            parser.parse(page, curURL.getURL());

            ParseData parseData = page.getParseData();
            List<WebURL> toSchedule = new ArrayList<>();
            int maxCrawlDepth = myController.getConfig().getMaxDepthOfCrawling();
            for (WebURL webURL : parseData.getOutgoingUrls()) {
                webURL.setParentDocid(curURL.getDocid());
                webURL.setParentUrl(curURL.getURL());
                int newdocid = docIdServer.getDocId(webURL.getURL());
                if (newdocid > 0) {
                    // This is not the first time that this Url is visited. So, we set the depth to a negative number.
                    webURL.setDepth((short) -1);
                    webURL.setDocid(newdocid);
                } else {
                    webURL.setDocid(-1);
                    webURL.setDepth((short) (curURL.getDepth() + 1));
                    // maxCrawlDepth of -1 means the depth is not limited.
                    if ((maxCrawlDepth == -1) || (curURL.getDepth() < maxCrawlDepth)) {
                        if (shouldVisit(page, webURL)) {
                            if (robotstxtServer.allows(webURL)) {
                                webURL.setDocid(docIdServer.getNewDocID(webURL.getURL()));
                                toSchedule.add(webURL);
                            } else {
                                logger.debug("Not visiting: {} as per the server's \"robots.txt\" policy",
                                        webURL.getURL());
                            }
                        } else {
                            logger.debug("Not visiting: {} as per your \"shouldVisit\" policy",
                                    webURL.getURL());
                        }
                    }
                }
            }
            frontier.scheduleAll(toSchedule);

            visit(page);
        }
    } catch (PageBiggerThanMaxSizeException e) {
        onPageBiggerThanMaxSize(curURL.getURL(), e.getPageSize());
    } catch (ParseException pe) {
        onParseError(curURL);
    } catch (ContentFetchException cfe) {
        onContentFetchError(curURL);
    } catch (RedirectException re) {
        logger.log(re.level, re.getMessage());
    } catch (NotAllowedContentException nace) {
        logger.debug("Skipping: {} as it contains binary content which you configured not to crawl",
                curURL.getURL());
    } catch (Exception e) {
        // The null-URL check at the top lands here, so curURL must be null-checked before use.
        String urlStr = (curURL == null ? "NULL" : curURL.getURL());
        logger.error("{}, while processing: {}", e.getMessage(), urlStr);
        logger.debug("Stacktrace", e);
    } finally {
        if (fetchResult != null) {
            fetchResult.discardContentIfNotConsumed();
        }
    }
}

From source file:crawler.java.edu.uci.ics.crawler4j.crawler.WebCrawler.java

/**
 * Fetches the given URL and processes the result according to its HTTP status code.
 *
 * <p>Status codes outside 200-299 are either treated as redirects (the codes enumerated below; the
 * target is scheduled when redirects are followed and the target is unseen and allowed) or reported
 * through {@code onUnexpectedStatusCode}. For 2xx codes the content is fetched and parsed, unseen
 * outgoing URLs within the depth limit are scheduled, and {@code visit} is called. This variant does
 * no logging: redirect, not-allowed-content and unexpected exceptions are dropped silently.
 * Unconsumed content is always discarded.
 *
 * @param curURL the URL to process; {@code null} makes the method return without doing anything
 */
private void processPage(WebURL curURL) {
    PageFetchResult fetchResult = null;
    try {
        if (curURL == null) {
            throw new Exception("Failed processing a NULL url !?");
        }

        fetchResult = pageFetcher.fetchPage(curURL);
        int statusCode = fetchResult.getStatusCode();
        // Finds the status reason for all known statuses; resolved once and reused below
        final String reasonPhrase = EnglishReasonPhraseCatalog.INSTANCE.getReason(statusCode, Locale.ENGLISH);
        handlePageStatusCode(curURL, statusCode, reasonPhrase);

        Page page = new Page(curURL);
        page.setFetchResponseHeaders(fetchResult.getResponseHeaders());
        page.setStatusCode(statusCode);
        if (statusCode < 200 || statusCode > 299) { // Not 2XX: 2XX status codes indicate success
            if (statusCode == HttpStatus.SC_MOVED_PERMANENTLY || statusCode == HttpStatus.SC_MOVED_TEMPORARILY
                    || statusCode == HttpStatus.SC_MULTIPLE_CHOICES || statusCode == HttpStatus.SC_SEE_OTHER
                    || statusCode == HttpStatus.SC_TEMPORARY_REDIRECT || statusCode == 308) { // is 3xx  todo follow https://issues.apache.org/jira/browse/HTTPCORE-389

                page.setRedirect(true);
                if (myController.getConfig().isFollowRedirects()) {
                    String movedToUrl = fetchResult.getMovedToUrl();
                    if (movedToUrl == null) {
                        throw new RedirectException(
                                "Unexpected error, URL: " + curURL + " is redirected to NOTHING");
                    }
                    page.setRedirectedToUrl(movedToUrl);

                    int newDocId = docIdServer.getDocId(movedToUrl);
                    if (newDocId > 0) {
                        throw new RedirectException("Redirect page: " + curURL + " is already seen");
                    }

                    // The redirect target inherits parent, depth and anchor from the redirecting URL.
                    WebURL webURL = new WebURL();
                    webURL.setURL(movedToUrl);
                    webURL.setParentDocid(curURL.getParentDocid());
                    webURL.setParentUrl(curURL.getParentUrl());
                    webURL.setDepth(curURL.getDepth());
                    webURL.setDocid(-1);
                    webURL.setAnchor(curURL.getAnchor());
                    // Targets rejected by shouldVisit or by robots.txt are skipped without logging.
                    if (shouldVisit(page, webURL) && robotstxtServer.allows(webURL)) {
                        webURL.setDocid(docIdServer.getNewDocID(movedToUrl));
                        frontier.schedule(webURL);
                    }
                }
            } else { // All other http codes other than 3xx & 2xx
                // The entity may be absent and an entity may carry no Content-Type header;
                // report an empty content type in both cases instead of failing with an NPE
                // that the catch-all below would silently discard.
                String contentType = "";
                if (fetchResult.getEntity() != null && fetchResult.getEntity().getContentType() != null) {
                    contentType = fetchResult.getEntity().getContentType().getValue();
                }
                onUnexpectedStatusCode(curURL.getURL(), statusCode, contentType, reasonPhrase);
            }

        } else { // status code is 2xx
            // The fetched URL differs from the requested one: continue under the fetched URL
            // unless that URL has already been seen.
            if (!curURL.getURL().equals(fetchResult.getFetchedUrl())) {
                if (docIdServer.isSeenBefore(fetchResult.getFetchedUrl())) {
                    throw new RedirectException("Redirect page: " + curURL + " has already been seen");
                }
                curURL.setURL(fetchResult.getFetchedUrl());
                curURL.setDocid(docIdServer.getNewDocID(fetchResult.getFetchedUrl()));
            }

            if (!fetchResult.fetchContent(page)) {
                throw new ContentFetchException();
            }

            parser.parse(page, curURL.getURL());

            ParseData parseData = page.getParseData();
            List<WebURL> toSchedule = new ArrayList<>();
            int maxCrawlDepth = myController.getConfig().getMaxDepthOfCrawling();
            for (WebURL webURL : parseData.getOutgoingUrls()) {
                webURL.setParentDocid(curURL.getDocid());
                webURL.setParentUrl(curURL.getURL());
                int newdocid = docIdServer.getDocId(webURL.getURL());
                if (newdocid > 0) {
                    // This is not the first time that this Url is visited. So, we set the depth to a negative number.
                    webURL.setDepth((short) -1);
                    webURL.setDocid(newdocid);
                } else {
                    webURL.setDocid(-1);
                    webURL.setDepth((short) (curURL.getDepth() + 1));
                    // A max depth of -1 disables the depth check; rejected URLs are skipped without logging.
                    if (((maxCrawlDepth == -1) || (curURL.getDepth() < maxCrawlDepth))
                            && shouldVisit(page, webURL) && robotstxtServer.allows(webURL)) {
                        webURL.setDocid(docIdServer.getNewDocID(webURL.getURL()));
                        toSchedule.add(webURL);
                    }
                }
            }
            frontier.scheduleAll(toSchedule);

            visit(page);
        }
    } catch (PageBiggerThanMaxSizeException e) {
        onPageBiggerThanMaxSize(curURL.getURL(), e.getPageSize());
    } catch (ParseException pe) {
        onParseError(curURL);
    } catch (ContentFetchException cfe) {
        onContentFetchError(curURL);
    } catch (RedirectException ignored) {
        // Redirects to nothing or to an already-seen URL end processing of this page; not logged here.
    } catch (NotAllowedContentException ignored) {
        // The page is skipped; not logged here.
    } catch (Exception ignored) {
        // NOTE(review): every other failure (including the null-URL check above) is dropped without
        // a trace because logging is disabled in this variant - consider reporting it.
    } finally {
        if (fetchResult != null) {
            fetchResult.discardContentIfNotConsumed();
        }
    }
}

From source file:io.github.cidisk.indexcrawler.WebCrawler.java

/**
 * Fetches {@code curURL}, reports its HTTP status and then, depending on that
 * status, either schedules the redirect target, reports an unexpected status,
 * or parses the page, schedules its outgoing index URLs and calls
 * {@code visit(page)}.
 *
 * <p>Exceptions raised inside the {@code try} block are handled here: size,
 * parse and content-fetch failures go to the matching {@code on...} callback,
 * everything else is logged. The fetch result's content is discarded in the
 * {@code finally} block if it was not consumed.
 *
 * <p>NOTE: neither the robots.txt check nor a maximum crawl depth is applied
 * in this method; only {@code shouldVisit} filters candidate URLs.
 *
 * @param curURL the URL to process; {@code null} is logged as an error
 */
private void processPage(WebURL curURL) {
    PageFetchResult fetchResult = null;
    try {
        if (curURL == null) {
            throw new Exception("Failed processing a NULL url !?");
        }
        // Depth check added for the optimized fish search algorithm:
        // a URL whose depth is 0 is not fetched.
        if (curURL.getDepth() == 0) {
            throw new DeadFishException(curURL.getDepth());
        }

        fetchResult = pageFetcher.fetchPage(curURL);
        int statusCode = fetchResult.getStatusCode();
        // Report the status together with its English reason phrase.
        handlePageStatusCode(curURL, statusCode,
                EnglishReasonPhraseCatalog.INSTANCE.getReason(statusCode, Locale.ENGLISH));

        Page page = new Page(curURL);
        page.setFetchResponseHeaders(fetchResult.getResponseHeaders());
        page.setStatusCode(statusCode);
        if (statusCode != HttpStatus.SC_OK) { // Not 200
            if (statusCode == HttpStatus.SC_MOVED_PERMANENTLY || statusCode == HttpStatus.SC_MOVED_TEMPORARILY
                    || statusCode == HttpStatus.SC_MULTIPLE_CHOICES || statusCode == HttpStatus.SC_SEE_OTHER
                    || statusCode == HttpStatus.SC_TEMPORARY_REDIRECT || statusCode == 308) {
                // One of the handled 3xx codes (308 is written as a literal);
                // todo follow https://issues.apache.org/jira/browse/HTTPCORE-389

                page.setRedirect(true);
                if (myController.getConfig().isFollowRedirects()) {
                    String movedToUrl = fetchResult.getMovedToUrl();
                    if (movedToUrl == null) {
                        throw new RedirectException(Level.WARN,
                                "Unexpected error, URL: " + curURL + " is redirected to NOTHING");
                    }
                    page.setRedirectedToUrl(movedToUrl);

                    int newDocId = docIdServer.getDocId(movedToUrl);
                    if (newDocId > 0) {
                        throw new RedirectException(Level.DEBUG,
                                "Redirect page: " + curURL + " is already seen");
                    }

                    // The redirect target inherits parent, depth and anchor from the
                    // redirecting URL and gets the priority byte derived from 255.
                    WebURL webURL = new WebURL();
                    webURL.setURL(movedToUrl);
                    webURL.setParentDocid(curURL.getParentDocid());
                    webURL.setParentUrl(curURL.getParentUrl());
                    webURL.setDepth(curURL.getDepth());
                    webURL.setDocid(-1);
                    webURL.setAnchor(curURL.getAnchor());
                    webURL.setPriority(Util.int2ByteArray(255)[3]);
                    if (shouldVisit(page, webURL)) {
                        logger.debug("Visiting: {}", webURL.getURL());
                        webURL.setDocid(docIdServer.getNewDocID(movedToUrl));
                        frontier.schedule(webURL);
                    } else {
                        logger.debug("Not visiting: {} as per your \"shouldVisit\" policy", webURL.getURL());
                    }
                }
            } else { // Neither 200 nor one of the redirect codes handled above
                String description = EnglishReasonPhraseCatalog.INSTANCE.getReason(fetchResult.getStatusCode(),
                        Locale.ENGLISH);
                // Both the entity and its Content-Type header may be absent;
                // fall back to an empty content type instead of failing with an NPE.
                String contentType = fetchResult.getEntity() == null ? ""
                        : fetchResult.getEntity().getContentType() == null ? ""
                                : fetchResult.getEntity().getContentType().getValue();
                onUnexpectedStatusCode(curURL.getURL(), fetchResult.getStatusCode(), contentType, description);
            }

        } else { // status code is 200
            if (!curURL.getURL().equals(fetchResult.getFetchedUrl())) {
                // The fetched URL differs from the requested one: continue under the
                // fetched URL unless it has been seen before.
                if (docIdServer.isSeenBefore(fetchResult.getFetchedUrl())) {
                    throw new RedirectException(Level.DEBUG,
                            "Redirect page: " + curURL + " has already been seen");
                }
                curURL.setURL(fetchResult.getFetchedUrl());
                curURL.setDocid(docIdServer.getNewDocID(fetchResult.getFetchedUrl()));
            }

            if (!fetchResult.fetchContent(page)) {
                throw new ContentFetchException();
            }

            if (!parser.parse(page, curURL.getURL(), curURL.getDepth())) {
                throw new ParseException();
            }

            ParseData parseData = page.getParseData();
            List<WebURL> toSchedule = new ArrayList<>();
            Set<WebURL> infourls = parseData.getInfoUrls();
            // Fraction of this page's info URLs that were not seen before. It stays
            // 0.0 when the page has no info URLs at all; without the emptiness check
            // the division below would be 0.0 / 0 = NaN, which compares unequal to 0
            // and would skip the depth decrement further down.
            double newDocRate = 0.0;
            if (null != infourls && !infourls.isEmpty()) {
                int cnt = 0;
                for (WebURL webURL : infourls) {
                    int newinfodocid = infoDocIdServer.getDocId(webURL.getURL());

                    // Info URLs keep the depth of the page they were found on.
                    webURL.setDepth(curURL.getDepth());
                    if (!infoDocIdServer.isSeenBefore(webURL.getURL())) {
                        cnt++;
                        newinfodocid = infoDocIdServer.getNewDocID(webURL.getURL());
                    }
                    webURL.setDocid(newinfodocid);
                }
                newDocRate = ((double) cnt) / infourls.size();
            }

            for (WebURL webURL : parseData.getOutgoingIndexUrls()) {
                webURL.setParentDocid(curURL.getDocid());
                webURL.setParentUrl(curURL.getURL());
                // Priority byte scales with the rate of new info URLs on this page.
                webURL.setPriority(Util.int2ByteArray((int) (255 * newDocRate))[3]);
                int newdocid = docIdServer.getDocId(webURL.getURL());
                if (newdocid > 0) {
                    // This is not the first time that this Url is visited.
                    // So, we set the depth to a negative number.
                    webURL.setDepth((short) -1);
                    webURL.setDocid(newdocid);
                } else {
                    webURL.setDocid(-1);
                    // A page that yielded no new info URLs passes on a depth reduced
                    // by one; otherwise the child keeps the current depth.
                    if (newDocRate == 0) {
                        webURL.setDepth((short) (curURL.getDepth() - 1));
                    } else {
                        webURL.setDepth((short) curURL.getDepth());
                    }
                    if (shouldVisit(page, webURL)) {
                        logger.debug("Visiting: {}", webURL.getURL());
                        webURL.setDocid(docIdServer.getNewDocID(webURL.getURL()));
                        toSchedule.add(webURL);
                    } else {
                        logger.debug("Not visiting: {} as per your \"shouldVisit\" policy", webURL.getURL());
                    }
                }
            }
            frontier.scheduleAll(toSchedule);

            visit(page);
        }
    } catch (PageBiggerThanMaxSizeException e) {
        onPageBiggerThanMaxSize(curURL.getURL(), e.getPageSize());
    } catch (ParseException pe) {
        onParseError(curURL);
    } catch (ContentFetchException cfe) {
        onContentFetchError(curURL);
    } catch (RedirectException re) {
        // Redirect problems carry their own log level.
        logger.log(re.level, re.getMessage());
    } catch (NotAllowedContentException nace) {
        logger.debug("Skipping: {} as it contains binary content which you configured not to crawl",
                curURL.getURL());
    } catch (Exception e) {
        // curURL may be null here: the null check above reports through this handler.
        String urlStr = (curURL == null ? "NULL" : curURL.getURL());
        logger.error("{}, while processing: {}", e.getMessage(), urlStr);
        logger.debug("Stacktrace", e);
    } finally {
        if (fetchResult != null) {
            fetchResult.discardContentIfNotConsumed();
        }
    }
}

From source file:org.eclipse.mylyn.commons.repositories.http.core.HttpUtil.java

/**
 * Returns the reason phrase that {@code EnglishReasonPhraseCatalog} reports
 * for the given HTTP status code.
 *
 * @param statusCode the HTTP status code to look up
 * @return whatever the catalog returns for {@code statusCode}
 */
public static String getStatusText(int statusCode) {
    // The JVM default locale is handed to the catalog lookup.
    final Locale locale = Locale.getDefault();
    final String reasonPhrase = EnglishReasonPhraseCatalog.INSTANCE.getReason(statusCode, locale);
    return reasonPhrase;
}

From source file:xyz.invik.ccrawler4j.crawler.WebCrawler.java

/**
 * Fetches {@code curURL}, reports its HTTP status and then handles the response
 * according to that status:
 * <ul>
 * <li>one of the handled redirect codes: records the redirect on the page, calls
 * {@code onRedirectedStatusCode} and, if redirects are followed, schedules the
 * target URL;</li>
 * <li>any other non-2xx code: calls {@code onUnexpectedStatusCode};</li>
 * <li>a 2xx code: fetches and parses the content, schedules the outgoing URLs
 * when {@code shouldFollowLinksIn} allows it, and calls {@code visit(page)}
 * unless the page is excluded by a "noindex" robots meta tag.</li>
 * </ul>
 * Exceptions raised inside the {@code try} block are routed to the matching
 * {@code on...} callback or logged; the fetch result's content is discarded in
 * the {@code finally} block if it was not consumed.
 *
 * @param curURL the URL to process; the method returns immediately if it is {@code null}
 */
private void processPage(WebURL curURL) {
    PageFetchResult fetchResult = null;
    try {
        if (curURL == null) {
            return;
        }

        fetchResult = pageFetcher.fetchPage(curURL);
        int statusCode = fetchResult.getStatusCode();
        handlePageStatusCode(curURL, statusCode,
                EnglishReasonPhraseCatalog.INSTANCE.getReason(statusCode, Locale.ENGLISH));
        // Finds the status reason for all known statuses

        Page page = new Page(curURL);
        page.setFetchResponseHeaders(fetchResult.getResponseHeaders());
        page.setStatusCode(statusCode);
        if (statusCode < 200 || statusCode > 299) { // Not 2XX: 2XX status codes indicate success
            if (statusCode == HttpStatus.SC_MOVED_PERMANENTLY || statusCode == HttpStatus.SC_MOVED_TEMPORARILY
                    || statusCode == HttpStatus.SC_MULTIPLE_CHOICES || statusCode == HttpStatus.SC_SEE_OTHER
                    || statusCode == HttpStatus.SC_TEMPORARY_REDIRECT || statusCode == 308) { // is 3xx  todo
                // follow https://issues.apache.org/jira/browse/HTTPCORE-389

                page.setRedirect(true);

                String movedToUrl = fetchResult.getMovedToUrl();
                if (movedToUrl == null) {
                    logger.warn("Unexpected error, URL: {} is redirected to NOTHING", curURL);
                    return;
                }
                page.setRedirectedToUrl(movedToUrl);
                // The redirect callback fires whether or not redirects are followed.
                onRedirectedStatusCode(page);

                if (myController.getConfig().isFollowRedirects()) {
                    int newDocId = docIdServer.getDocId(movedToUrl);
                    if (newDocId > 0) {
                        logger.debug("Redirect page: {} is already seen", curURL);
                        return;
                    }

                    // The redirect target inherits parent, depth, anchor and label
                    // from the redirecting URL.
                    WebURL webURL = new WebURL();
                    webURL.setURL(movedToUrl);
                    webURL.setParentDocid(curURL.getParentDocid());
                    webURL.setParentUrl(curURL.getParentUrl());
                    webURL.setDepth(curURL.getDepth());
                    webURL.setDocid(-1);
                    webURL.setAnchor(curURL.getAnchor());
                    webURL.setLabel(curURL.getLabel());

                    // Update the number of consecutive redirections followed to reach webURL.
                    webURL.setRedirectionDepth(curURL.getRedirectionDepth() + 1);

                    if (shouldVisit(page, webURL)) {
                        // robots.txt is only consulted when links in webURL would be followed.
                        if (!shouldFollowLinksIn(webURL) || robotstxtServer.allows(webURL)) {
                            webURL.setDocid(docIdServer.getNewDocID(movedToUrl));
                            frontier.schedule(webURL);
                        } else {
                            logger.debug("Not visiting: {} as per the server's \"robots.txt\" policy",
                                    webURL.getURL());
                        }
                    } else {
                        logger.debug("Not visiting: {} as per your \"shouldVisit\" policy", webURL.getURL());
                    }
                }
            } else { // Any other non-2xx code that is not one of the redirect codes above
                String description = EnglishReasonPhraseCatalog.INSTANCE.getReason(fetchResult.getStatusCode(),
                        Locale.ENGLISH); // Finds
                // the status reason for all known statuses
                // An absent entity or absent Content-Type header yields an empty content type.
                String contentType = fetchResult.getEntity() == null ? ""
                        : fetchResult.getEntity().getContentType() == null ? ""
                                : fetchResult.getEntity().getContentType().getValue();
                onUnexpectedStatusCode(curURL.getURL(), fetchResult.getStatusCode(), contentType, description);
            }

        } else { // status code is 2xx
            if (!curURL.getURL().equals(fetchResult.getFetchedUrl())) {
                // The fetched URL differs from the requested one: continue under the
                // fetched URL unless it has been seen before.
                if (docIdServer.isSeenBefore(fetchResult.getFetchedUrl())) {
                    logger.debug("Redirect page: {} has already been seen", curURL);
                    return;
                }
                curURL.setURL(fetchResult.getFetchedUrl());
                curURL.setDocid(docIdServer.getNewDocID(fetchResult.getFetchedUrl()));
            }

            if (!fetchResult.fetchContent(page, myController.getConfig().getMaxDownloadSize())) {
                throw new ContentFetchException();
            }

            if (page.isTruncated()) {
                logger.warn(
                        "Warning: unknown page size exceeded max-download-size, truncated to: "
                                + "({}), at URL: {}",
                        myController.getConfig().getMaxDownloadSize(), curURL.getURL());
            }

            parser.parse(page, curURL.getURL());

            if (shouldFollowLinksIn(page.getWebURL())) {
                ParseData parseData = page.getParseData();
                List<WebURL> toSchedule = new ArrayList<>();
                int maxCrawlDepth = myController.getConfig().getMaxDepthOfCrawling();
                for (WebURL webURL : parseData.getOutgoingUrls()) {
                    webURL.setParentDocid(curURL.getDocid());
                    webURL.setParentUrl(curURL.getURL());
                    int newdocid = docIdServer.getDocId(webURL.getURL());
                    if (newdocid > 0) {
                        // This is not the first time that this Url is visited. So, we set the
                        // depth to a negative number.
                        webURL.setDepth((short) -1);
                        webURL.setDocid(newdocid);
                    } else {
                        webURL.setDocid(-1);
                        webURL.setDepth((short) (curURL.getDepth() + 1));
                        // A max crawl depth of -1 disables the depth limit.
                        if ((maxCrawlDepth == -1) || (curURL.getDepth() < maxCrawlDepth)) {
                            if (shouldVisit(page, webURL)) {
                                if (robotstxtServer.allows(webURL)) {
                                    webURL.setDocid(docIdServer.getNewDocID(webURL.getURL()));
                                    toSchedule.add(webURL);
                                } else {
                                    logger.debug(
                                            "Not visiting: {} as per the server's \"robots.txt\" " + "policy",
                                            webURL.getURL());
                                }
                            } else {
                                logger.debug("Not visiting: {} as per your \"shouldVisit\" policy",
                                        webURL.getURL());
                            }
                        }
                    }
                }
                frontier.scheduleAll(toSchedule);
            } else {
                logger.debug(
                        "Not looking for links in page {}, " + "as per your \"shouldFollowLinksInPage\" policy",
                        page.getWebURL().getURL());
            }

            // NOTE(review): the cast assumes that a content type containing "html" always
            // comes with HtmlParseData, and getMetaTagValue("robots") is assumed to be
            // non-null -- confirm. A violation would be handled by the generic catch
            // below, and visit(page) would then be skipped.
            boolean noIndex = myController.getConfig().isRespectNoIndex() && page.getContentType() != null
                    && page.getContentType().contains("html")
                    && ((HtmlParseData) page.getParseData()).getMetaTagValue("robots").contains("noindex");

            if (!noIndex) {
                visit(page);
            }
        }
    } catch (PageBiggerThanMaxSizeException e) {
        onPageBiggerThanMaxSize(curURL.getURL(), e.getPageSize());
    } catch (ParseException pe) {
        onParseError(curURL);
    } catch (ContentFetchException cfe) {
        onContentFetchError(curURL);
    } catch (NotAllowedContentException nace) {
        logger.debug("Skipping: {} as it contains binary content which you configured not to crawl",
                curURL.getURL());
    } catch (Exception e) {
        onUnhandledException(curURL, e);
    } finally {
        // Discard the response content if it was not consumed.
        if (fetchResult != null) {
            fetchResult.discardContentIfNotConsumed();
        }
    }
}

From source file:org.eclipse.californium.proxy.HttpTranslator.java

/**
 * Sets the parameters of the outgoing http response from a CoAP response.
 * The status code is mapped through the properties file and is set through
 * the StatusLine. The options are translated to the corresponding headers
 * and the max-age (in the header cache-control) is set to the default value
 * ({@code OptionNumberRegistry.Defaults.MAX_AGE}) if not already present. If
 * the request method was not HEAD and an entity can be built from the coap
 * response, the entity and, when known, the content-type are set in the
 * http response.
 *
 * @param httpRequest
 *            the http request being answered; its request line supplies
 *            the method and the URI from which the host is extracted
 * @param coapResponse
 *            the coap response
 * @param httpResponse
 *            the http response to fill in
 * @throws IllegalArgumentException
 *             if any of the arguments is {@code null}
 * @throws TranslationException
 *             if the coap code has no http mapping in the translation
 *             properties or the mapped value is not an integer
 */
public static void getHttpResponse(HttpRequest httpRequest, Response coapResponse, HttpResponse httpResponse)
        throws TranslationException {
    if (httpRequest == null) {
        throw new IllegalArgumentException("httpRequest == null");
    }
    if (coapResponse == null) {
        throw new IllegalArgumentException("coapResponse == null");
    }
    if (httpResponse == null) {
        throw new IllegalArgumentException("httpResponse == null");
    }

    // get/set the response code
    ResponseCode coapCode = coapResponse.getCode();
    String httpCodeString = HTTP_TRANSLATION_PROPERTIES.getProperty(KEY_COAP_CODE + coapCode.value);

    if (httpCodeString == null || httpCodeString.isEmpty()) {
        LOGGER.warning("httpCodeString == null");
        throw new TranslationException("httpCodeString == null");
    }

    int httpCode = 0;
    try {
        httpCode = Integer.parseInt(httpCodeString.trim());
    } catch (NumberFormatException e) {
        LOGGER.warning("Cannot convert the coap code in http status code" + e);
        throw new TranslationException("Cannot convert the coap code in http status code", e);
    }

    // create the http response and set the status line
    String reason = EnglishReasonPhraseCatalog.INSTANCE.getReason(httpCode, Locale.ENGLISH);
    StatusLine statusLine = new BasicStatusLine(HttpVersion.HTTP_1_1, httpCode, reason);
    httpResponse.setStatusLine(statusLine);

    // Extract the host: the text between the first "//" and the next "/".
    // When the URI has no "/" after the authority (e.g. "http://host:8080"),
    // the whole remainder is the host; previously this case failed with a
    // StringIndexOutOfBoundsException.
    // NOTE: when "//" is absent, indexOf returns -1 and the scan starts at
    // index 1, exactly as before.
    String uriString = httpRequest.getRequestLine().getUri();
    int authorityStart = uriString.indexOf("//");
    String afterScheme = uriString.substring(authorityStart + 2);
    int pathStart = afterScheme.indexOf("/");
    String host = pathStart < 0 ? afterScheme : afterScheme.substring(0, pathStart);

    // set the headers (this replaces any headers already on the response)
    Header[] headers = getHttpHeaders(coapResponse.getOptions().asSortedList(), host);
    httpResponse.setHeaders(headers);

    // set max-age if not already set
    if (!httpResponse.containsHeader("cache-control")) {
        httpResponse.setHeader("cache-control",
                "max-age=" + Long.toString(OptionNumberRegistry.Defaults.MAX_AGE));
    }

    // get the http entity if the request was not HEAD
    if (!httpRequest.getRequestLine().getMethod().equalsIgnoreCase("head")) {
        // NOTE(review): 131 presumably encodes coap code 4.03 ((4 << 5) | 3) -- confirm.
        if ((httpRequest.getRequestLine().getMethod().equalsIgnoreCase("put")) && (coapCode.value == 131)) {
            String linkPut = getLinkPut(coapResponse);
            Header link = new BasicHeader("Link", linkPut);
            httpResponse.addHeader(link);
        }
        // if the content-type is not set in the coap response and if the
        // response contains an error, then the content-type should set to
        // text-plain
        if (coapResponse.getOptions().getContentFormat() == MediaTypeRegistry.UNDEFINED
                && (ResponseCode.isClientError(coapCode) || ResponseCode.isServerError(coapCode))) {
            LOGGER.info("Set contenttype to TEXT_PLAIN");
            coapResponse.getOptions().setContentFormat(MediaTypeRegistry.TEXT_PLAIN);
        }

        HttpEntity httpEntity = getHttpEntity(coapResponse);
        if (httpEntity != null) {
            httpResponse.setEntity(httpEntity);

            // get the content-type from the entity and set the header;
            // ContentType.get yields null when the entity declares no
            // content type, in which case no header is written
            ContentType contentType = ContentType.get(httpEntity);
            if (contentType != null) {
                httpResponse.setHeader("content-type", contentType.toString());
            }
        }
    }
}