edu.uci.ics.crawler4j.parser.Parser.java Source code

Introduction

Here is the source code for edu.uci.ics.crawler4j.parser.Parser.java
Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package edu.uci.ics.crawler4j.parser;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;

import org.apache.log4j.Logger;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.tika.metadata.DublinCore;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.html.HtmlParser;

import edu.uci.ics.crawler4j.crawler.Configurable;
import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.url.URLCanonicalizer;
import edu.uci.ics.crawler4j.url.WebURL;
import edu.uci.ics.crawler4j.util.Util;

/**
 * Turns the raw content of a fetched {@link Page} into parse data and attaches
 * it to the page. Depending on the page's content type the result is PDF text,
 * the shared binary marker, plain text, or HTML (text, title, markup and
 * outgoing links).
 *
 * NOTE(review): a single {@code HtmlParser} and {@code PDFTextStripper} are
 * reused across calls; confirm that each crawler thread owns its own
 * {@code Parser} instance.
 *
 * @author Yasser Ganjisaffar <lastname at gmail dot com>
 */
public class Parser extends Configurable {

    protected static final Logger logger = Logger.getLogger(Parser.class.getName());

    private final HtmlParser htmlParser;
    private final ParseContext parseContext;

    private final PDFTextStripper pdfTextStripper;

    /**
     * Creates a parser bound to the given crawl configuration.
     *
     * @param config the crawl configuration consulted while parsing
     * @throws RuntimeException if the PDF text stripper cannot be created
     */
    public Parser(CrawlConfig config) {
        super(config);
        htmlParser = new HtmlParser();
        parseContext = new ParseContext();
        try {
            pdfTextStripper = new PDFTextStripper();
        } catch (IOException e) {
            logger.error("Error creating PDF Component. This should not happen in production.", e);
            throw new RuntimeException(e);
        }
    }

    /**
     * Parses the content of {@code page} and stores the result via
     * {@link Page#setParseData}.
     *
     * @param page       the fetched page whose content should be parsed
     * @param contextURL the URL against which relative links are resolved,
     *                   unless the HTML content handler reports a base URL
     * @return {@code true} if parse data was attached to the page;
     *         {@code false} if binary content is excluded by configuration,
     *         if PDF or plain-text processing failed, or if the HTML could not
     *         be decoded with the page's charset. A failure of the HTML parser
     *         itself is only logged and does not cause {@code false}.
     */
    public boolean parse(Page page, String contextURL) {
        String contentType = page.getContentType();

        /*
         * Checking for PDF content before checking for Binary content because
         * the latter looks for the "application" substring and would match
         * "application/pdf".
         */
        if (Util.hasPDFContent(contentType) && config.isIncludePDF()) {
            try {
                treatPDFContentType(page);
                return true;
            } catch (IOException e) {
                logParseError(page, e);
                return false;
            }
        }

        if (Util.hasBinaryContent(contentType)) {
            if (!config.isIncludeBinaryContentInCrawling()) {
                return false;
            }
            page.setParseData(BinaryParseData.getInstance());
            return true;
        }

        if (Util.hasPlainTextContent(contentType)) {
            return parsePlainText(page);
        }

        return parseHtml(page, contextURL);
    }

    /** Decodes the page content as plain text and attaches it; returns false on any failure. */
    private boolean parsePlainText(Page page) {
        try {
            TextParseData parseData = new TextParseData();
            parseData.setTextContent(decodeContent(page));
            page.setParseData(parseData);
            return true;
        } catch (Exception e) {
            logParseError(page, e);
            return false;
        }
    }

    /** Runs the HTML parser over the page and attaches text, title, markup and outgoing links. */
    private boolean parseHtml(Page page, String contextURL) {
        Metadata metadata = new Metadata();
        HtmlContentHandler contentHandler = new HtmlContentHandler();
        try (InputStream inputStream = new ByteArrayInputStream(page.getContentData())) {
            htmlParser.parse(inputStream, contentHandler, metadata, parseContext);
        } catch (Exception e) {
            // Best effort: whatever the handler collected before the failure is still used.
            logParseError(page, e);
        }

        if (page.getContentCharset() == null) {
            page.setContentCharset(metadata.get("Content-Encoding"));
        }

        HtmlParseData parseData = new HtmlParseData();
        parseData.setText(contentHandler.getBodyText().trim());
        parseData.setTitle(metadata.get(DublinCore.TITLE));

        // A base URL reported by the content handler takes precedence over the caller's context.
        String baseURL = contentHandler.getBaseUrl();
        String resolutionBase = (baseURL != null) ? baseURL : contextURL;
        parseData.setOutgoingUrls(collectOutgoingUrls(contentHandler, resolutionBase));

        try {
            parseData.setHtml(decodeContent(page));
        } catch (UnsupportedEncodingException e) {
            logParseError(page, e);
            return false;
        }

        page.setParseData(parseData);
        return true;
    }

    /** Canonicalizes the followable links found by the handler, capped at the configured maximum. */
    private List<WebURL> collectOutgoingUrls(HtmlContentHandler contentHandler, String resolutionBase) {
        int maxLinks = config.getMaxOutgoingLinksToFollow();
        List<WebURL> outgoingUrls = new ArrayList<>();
        for (ExtractedUrlAnchorPair urlAnchorPair : contentHandler.getOutgoingUrls()) {
            // Check the cap before adding so that at most maxLinks links are collected.
            if (outgoingUrls.size() >= maxLinks) {
                break;
            }
            String href = urlAnchorPair.getHref();
            if (href == null) {
                continue;
            }
            href = href.trim();
            if (href.isEmpty() || !isFollowableHref(href)) {
                continue;
            }
            String url = URLCanonicalizer.getCanonicalURL(href, resolutionBase);
            if (url == null) {
                continue;
            }
            WebURL webURL = new WebURL();
            webURL.setURL(url);
            webURL.setAnchor(urlAnchorPair.getAnchor());
            outgoingUrls.add(webURL);
        }
        return outgoingUrls;
    }

    /**
     * Returns false for hrefs containing "javascript:", "mailto:" or "@".
     * The comparison is case-insensitive and uses Locale.ROOT so the result does
     * not depend on the default locale (e.g. the Turkish dotless i).
     */
    private static boolean isFollowableHref(String href) {
        String lowered = href.toLowerCase(Locale.ROOT);
        return !lowered.contains("javascript:")
                && !lowered.contains("mailto:")
                && !lowered.contains("@");
    }

    /** Decodes the page bytes with the page's charset, or the platform default when none is set. */
    private static String decodeContent(Page page) throws UnsupportedEncodingException {
        String charset = page.getContentCharset();
        if (charset == null) {
            return new String(page.getContentData());
        }
        return new String(page.getContentData(), charset);
    }

    /** Logs a parse failure for the page's URL, including the stack trace. */
    private static void logParseError(Page page, Exception e) {
        logger.error(e.getMessage() + ", while parsing: " + page.getWebURL().getURL(), e);
    }

    /**
     * Extracts the text of a PDF page and attaches it as {@link PDFParseData}.
     *
     * @param page the page whose content is a PDF document
     * @throws IOException if the document cannot be loaded, read or closed
     */
    private void treatPDFContentType(Page page) throws IOException {
        PDDocument doc = PDDocument.load(new ByteArrayInputStream(page.getContentData()));
        try {
            page.setParseData(new PDFParseData(pdfTextStripper.getText(doc)));
        } finally {
            // Close the document even when text extraction fails.
            doc.close();
        }
    }

}