org.nuxeo.webpage.archiver.WebpageToBlob.java Source code

Java tutorial

Introduction

Here is the source code for org.nuxeo.webpage.archiver.WebpageToBlob.java

Source

/*
 * (C) Copyright 2016 Nuxeo SA (http://nuxeo.com/) and others.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * Contributors:
 *     Thibaud Arguillere
 */
package org.nuxeo.webpage.archiver;

import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Properties;

import org.apache.commons.exec.CommandLine;
import org.apache.commons.exec.DefaultExecutor;
import org.apache.commons.exec.ExecuteWatchdog;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.nuxeo.ecm.core.api.Blob;
import org.nuxeo.ecm.core.api.Blobs;
import org.nuxeo.ecm.core.api.NuxeoException;
import org.nuxeo.ecm.platform.commandline.executor.api.CmdParameters;
import org.nuxeo.ecm.platform.commandline.executor.api.CmdParameters.ParameterValue;
import org.nuxeo.ecm.platform.commandline.executor.api.CommandAvailability;
import org.nuxeo.ecm.platform.commandline.executor.api.CommandLineExecutorService;
import org.nuxeo.ecm.platform.commandline.executor.api.CommandNotAvailable;
import org.nuxeo.ecm.platform.commandline.executor.service.CommandLineDescriptor;
import org.nuxeo.ecm.platform.commandline.executor.service.CommandLineExecutorComponent;
import org.nuxeo.runtime.api.Framework;

/**
 * Use the <code>wkhtmltopdf</code>
 * (http://wkhtmltopdf.org) command line (must be installed) to convert a distant webpage to pdf.
 * <p>
 * <b>Important</b>: Some webpages can be complicated, can contain errors, etc. To avoid the commandline to block and
 * freeze, it is used with options forcing it to ignore errors (see OSGI-INF/commandLines.xml), such as
 * <code>--load-media-error-handling ignore</code> and <code>--load-error-handling ignore</code>. Also, launching the
 * command in quiet mode is a requirement (<code>-q</code> option)
 * <p>
 * Also, the commandline can create a valid and complete pdf but still return an error code. This class does not rely on
 * the exitValue returned by wkhtmltopdf. Instead, it checks the resulting pdf.
 * <p>
 * <b>Using Apache Commons Exec</b>
 * <p>
 * As of "today" (Nuxeo 8.1), the Nuxeo CommandLineService does not allow setting a timeout if wkhtmltopdf fails and
 * freezes (which happened during some private tests with big, big HTML pages containing a lot of CSS and some errors).
 * So we use Apache Commons Exec instead.
 * <p>
 * Notice #1: We still use the CommandLineExecutorService to declare and handle XML contributions, because it is very
 * convenient and contains everything we need. No need to re-create a service doing basically the same just to add a
 * timeout.
 * <p>
 * Notice #2: And it is the reason why we created the {@link CommandLineParameters} class: So we handle security by
 * forcing some parameters to be quoted.
 * <p>
 * <b>Pages Requiring Authentication</b>
 * <p>
 * wkhtmltopdf allows handling such pages by using 2 steps:
 * <ul>
 * <li>First call with login informations. This creates a "cookie jar" file</li>
 * <li>Use this cookie jar in further authenticated calls</li>
 * </ul>
 * To handle this, which requires user and password to be exchanged, you must declare as many commandLine XML
 * contributions as you need, so that user/pwd are saved server-side, and pass the command line name to the
 * {@link #login(String)} method. This way the credentials are never exchanged with a browser, for example. See
 * {@link #login(String)} for more information.
 * <p>
 * <b>NOTICE</b>
 * <ul>
 * <li>In all cases, the command line <i>must</i> use "#{url}" and "#{targetFilePath}" (the latter is handled by the
 * plug-in)</li>
 * <li>We strongly recommend to use the <code>-q</code> option as a minimum</li>
 * </ul>
 * 
 * @since 7.10HF05
 */
public class WebpageToBlob {

    private static final Log log = LogFactory.getLog(WebpageToBlob.class);

    /** Name of the command line contribution used when the caller passes no command line. */
    public static final String COMMANDLINE_DEFAULT_wkhtmltopdf = "wkhtmlToPdf";

    /** Name of the default "authenticated" command line contribution. */
    public static final String COMMANDLINE_DEFAULT_wkhtmltopdf_AUTHENTICATED = "wkhtmlToPdf-authenticated";

    // 30s timeout by default
    public static final int TIMEOUT_DEFAULT = 30000;

    // Exit value reported on Linux when wkhtmltopdf received a SIGTERM (128 + 15)
    private static final int EXIT_VALUE_SIGTERM = 143;

    /** Maximum duration of the command line, in milliseconds, before the watchdog stops it. */
    protected int timeout = TIMEOUT_DEFAULT;

    /**
     * Creates a converter using the default timeout ({@link #TIMEOUT_DEFAULT}).
     */
    public WebpageToBlob() {
        this(0);
    }

    /**
     * Creates a converter using the given timeout.
     * 
     * @param inTimeout the timeout in milliseconds; a value lower than 1 means {@link #TIMEOUT_DEFAULT}
     */
    public WebpageToBlob(int inTimeout) {

        setTimeout(inTimeout);
    }

    /**
     * Checks the availability of the default command line
     * 
     * @return the availability reported by the {@link CommandLineExecutorService} for
     *         {@link #COMMANDLINE_DEFAULT_wkhtmltopdf}
     * @since 7.10
     */
    public static boolean isAvailable() {

        CommandLineExecutorService cles = Framework.getService(CommandLineExecutorService.class);
        CommandAvailability ca = cles.getCommandAvailability(WebpageToBlob.COMMANDLINE_DEFAULT_wkhtmltopdf);
        return ca.isAvailable();
    }

    /**
     * Converts the distant URL to PDF, returns the blob of the PDF.
     * <p>
     * If <code>inCommandLine</code> is empty, the default command is called.
     * 
     * @param inCommandLine the name of the command line contribution to use (can be blank)
     * @param inUrl the url to convert
     * @param inFileName the fileName of the final PDF. If blank, a name is built from the host of the url when possible
     * @return a Blob holding the pdf
     * @throws IOException
     * @throws CommandNotAvailable
     * @throws NuxeoException if no valid PDF could be generated
     * @since 7.10
     */
    public Blob toPdf(String inCommandLine, String inUrl, String inFileName)
            throws IOException, CommandNotAvailable, NuxeoException {

        return run(inCommandLine, inUrl, inFileName, null);

    }

    /**
     * Converts the distant URL to PDF, returns the blob of the PDF.
     * <p>
     * To be used for pages requiring authentication (and {@link #login(String)} must have been previously called and
     * have returned the cookieJar blob).
     * <p>
     * If <code>inCommandLine</code> is empty, the default command is called.
     * 
     * @param inCommandLine the name of the command line contribution to use (can be blank)
     * @param inUrl the url to convert
     * @param inFileName the fileName of the final PDF. If blank, a name is built from the host of the url when possible
     * @param inCookieJar the cookie jar returned by {@link #login(String)} (can be <code>null</code>)
     * @return a Blob holding the pdf
     * @throws IOException
     * @throws CommandNotAvailable
     * @throws NuxeoException if no valid PDF could be generated
     * @since 7.10
     */
    public Blob toPdf(String inCommandLine, String inUrl, String inFileName, Blob inCookieJar)
            throws IOException, CommandNotAvailable, NuxeoException {

        return run(inCommandLine, inUrl, inFileName, inCookieJar);

    }

    /**
     * Logs in to a distant website and returns a Blob, a file (named "cookie jar" by wkhtmltopdf), to be used when
     * accessing an authenticated page. You will have previously contributed the <code>inCommandLine</code> with all
     * the credentials and info.
     * <p>
     * <b>NOTE</b>: It is not easy to access authenticated pages, since wkhtmltopdf basically runs a faceless browser.
     * You must read wkhtmltopdf documentation, and also (mainly, actually) this blog:
     * http://test-mate.blogspot.com/2014/07/wkhtmltopdf-generate-pdf-of-password.html
     * <p>
     * Example: To access a page at http://my.url.com, you first have a login page. You must then:
     * <ul>
     * <li>Get the info to be sent in --post values by wkhtmltopdf. Which means, the <i>variables</i> sent in the form
     * in the POST request.</li>
     * <li>Add each of them to your commandline XML</li>
     * <li>In the command line, you must:
     * <ul>
     * <li>Hard code the URL and all the <code>--post</code> information</li>
     * <li>Use the <code>#{cookieJar}</code> parameter (<i>must be set</i>, filled by the plug-in, do not change this
     * name)</li>
     * <li>Use the <code>#{targetFilePath}</code> parameter (<i>must be set</i>, filled by the plug-in, do not change
     * this name)</li>
     * </ul>
     * </li>
     * </ul>
     * So, say the form variables to send are "user_name", "user_pwd" and the submit button is "Submit", with a value of
     * "doLogin". You must have the following command:
     * <p>
     * 
     * <pre>
     * &lt;command name="wkhtmlToPdf-login-EXAMPLE" enabled="true"&gt;
     *   &lt;commandLine&gt;wkhtmltopdf&lt;/commandLine&gt;
     *   &lt;parameterString&gt;-q --cookie-jar #{cookieJar} --post user_name johndoe --post user_pwd 123456 --post Submit doLogin "http://my.site.com/login" "#{targetFilePath}"&lt;/parameterString&gt;
     * &lt;/command&gt;
     * </pre>
     * 
     * @param inCommandLine the name of the command line contribution performing the login (must not be blank)
     * @return the cookie jar (as a Blob) to be used with the next authenticated calls.
     * @throws IOException
     * @throws NuxeoException if <code>inCommandLine</code> is blank, or if no valid PDF could be generated
     * @throws CommandNotAvailable
     * @since 7.10
     */
    public Blob login(String inCommandLine) throws IOException, NuxeoException, CommandNotAvailable {

        if (StringUtils.isBlank(inCommandLine)) {
            throw new NuxeoException(
                    "When calling login(), a valid commandline must be passed, the default one does not handle authentification");
        }

        Blob cookieJar = Blobs.createBlobWithExtension(".jar");

        // The PDF of the login page is of no interest, only the cookie jar filled by wkhtmltopdf is.
        run(inCommandLine, cookieJar);

        return cookieJar;
    }

    /*
     * This one is used only in tests, because as of today (2016-03), you can't dynamically setup login credentials from
     * unit test, when using the CommandLineService and its XML extensions. Well. You can, but it is a lot, a lot of
     * work. We secure this by checking we are running a test.
     *
     * The properties read from inTestProps are: loginUrl, loginVar, loginValue, pwdVar, pwdValue, submitVar and
     * submitValue. Each one replaces the "#{name}" expression of the same name in the command line.
     */
    public Blob login(String inCommandLine, Properties inTestProps)
            throws IOException, NuxeoException, CommandNotAvailable {

        if (!Framework.isTestModeSet()) {
            throw new NuxeoException(
                    "A call to login(String inCommandLine, Properties inTestProps) can me made only in test mode.");
        }

        if (StringUtils.isBlank(inCommandLine)) {
            throw new NuxeoException(
                    "When calling login(), a valid commandline must be passed, the default one does not handle authentification");
        }

        CmdParameters params = new CmdParameters();

        String loginUrl = inTestProps.getProperty("loginUrl");
        params.addNamedParameter(CommandLineParameters.URL, loginUrl);

        Blob cookieJar = Blobs.createBlobWithExtension(".jar");
        params.addNamedParameter(CommandLineParameters.COOKIE_JAR, cookieJar.getFile().getAbsolutePath());

        params.addNamedParameter("loginVar", inTestProps.getProperty("loginVar"));
        params.addNamedParameter("loginValue", inTestProps.getProperty("loginValue"));
        params.addNamedParameter("pwdVar", inTestProps.getProperty("pwdVar"));
        params.addNamedParameter("pwdValue", inTestProps.getProperty("pwdValue"));
        params.addNamedParameter("submitVar", inTestProps.getProperty("submitVar"));
        params.addNamedParameter("submitValue", inTestProps.getProperty("submitValue"));

        // The PDF of the login page is of no interest, only the cookie jar filled by wkhtmltopdf is.
        buildCommandLineAndRun(inCommandLine, params, null, true);

        return cookieJar;
    }

    /*
     * (wrapper for Blob run(String inCommandLine, String inUrl, String inFileName, Blob inCookieJar))
     */
    protected Blob run(String inCommandLine, Blob inCookieJar)
            throws NuxeoException, IOException, CommandNotAvailable {

        return run(inCommandLine, null, null, inCookieJar);

    }

    /*
     * Falls back to the default command line when inCommandLine is blank, stores the cookie jar path and the url (when
     * provided) as named parameters, then delegates to buildCommandLineAndRun() with inUseAllParams = false.
     */
    protected Blob run(String inCommandLine, String inUrl, String inFileName, Blob inCookieJar)
            throws IOException, CommandNotAvailable, NuxeoException {

        if (StringUtils.isBlank(inCommandLine)) {
            inCommandLine = COMMANDLINE_DEFAULT_wkhtmltopdf;
        }

        CmdParameters params = new CmdParameters();
        if (inCookieJar != null) {
            params.addNamedParameter(CommandLineParameters.COOKIE_JAR, inCookieJar.getFile().getAbsolutePath());
        }
        if (StringUtils.isNotBlank(inUrl)) {
            params.addNamedParameter(CommandLineParameters.URL, inUrl);
        }

        return buildCommandLineAndRun(inCommandLine, params, inFileName, false);

    }

    /*
     * Resolves the parameters of the command line, runs it with Apache Commons Exec (with a watchdog enforcing the
     * timeout) and returns the generated PDF as a Blob.
     *
     * The exit value of the command is not trusted: the result is accepted if, and only if, the generated file looks
     * like a valid PDF (see pdfLooksValid()). Otherwise a NuxeoException is thrown.
     *
     * If inFileName is blank, the file name is built from the host of the url parameter when it can be parsed. If it
     * cannot, the name set when the blob was created is kept.
     *
     * inUseAllParams is for tests only (a NuxeoException is thrown if it is true outside of test mode): every parameter
     * not handled by CommandLineParameters replaces, unchecked, the "#{name}" expression of the same name.
     */
    protected Blob buildCommandLineAndRun(String inCommandLine, CmdParameters inParams, String inFileName,
            boolean inUseAllParams) throws IOException, CommandNotAvailable, NuxeoException {

        // Create a temp. File handled by Nuxeo
        Blob resultPdf = Blobs.createBlobWithExtension(".pdf");

        // Build the full, resolved command line
        String resolvedParameterString = CommandLineParameters.buildParameterString(inCommandLine,
                inParams.getParameter(CommandLineParameters.COOKIE_JAR),
                inParams.getParameter(CommandLineParameters.URL), resultPdf.getFile().getAbsolutePath());

        // Mainly during test, we may have unchecked parameters (safe because everything is hard-coded server side)
        if (inUseAllParams) {
            if (!Framework.isTestModeSet()) {
                throw new NuxeoException("A call to buildCommandLineAndRun(..., true) is for test only.");
            }
            Map<String, ParameterValue> allParams = inParams.getParameters();
            for (Entry<String, ParameterValue> entry : allParams.entrySet()) {
                String key = entry.getKey();
                String value = entry.getValue().getValue();
                if (!CommandLineParameters.isHandledParameter(key)) {
                    resolvedParameterString = StringUtils.replace(resolvedParameterString, "#{" + key + "}", value);
                }
            }
        }

        // Get the exact command line and build the line
        CommandLineDescriptor desc = CommandLineExecutorComponent.getCommandDescriptor(inCommandLine);
        String line = desc.getCommand() + " " + resolvedParameterString;

        // Run the thing
        CommandLine cmdLine = CommandLine.parse(line);
        DefaultExecutor executor = new DefaultExecutor();
        ExecuteWatchdog watchdog = new ExecuteWatchdog(timeout);
        // We don't want a check on exit values, because a PDF can still be created with errors
        // (can't get a font, ...)
        executor.setExitValues(null);
        executor.setWatchdog(watchdog);
        int exitValue = 0;
        IOException exception = null;
        try {
            exitValue = executor.execute(cmdLine);
        } catch (IOException e) {
            exception = e;
        }

        // Even if we had no error caught, we must check if the pdf is valid.
        // Exit value may be 1, or non zero while the pdf was created. But maybe
        // a font could not be correctly rendered, etc. Let's check if we have
        // something in the pdf and it looks valid
        if (!pdfLooksValid(resultPdf.getFile())) {
            // NOTE(review): the resolved command line is part of the message. For a login command line it can
            // contain the credentials hard-coded in the XML contribution - consider masking them.
            String msg = "Failed to execute the command line [" + cmdLine.toString()
                    + " ]. No valid PDF generated. exitValue: " + exitValue;

            // 143 is what Linux reports after a SIGTERM. Asking the watchdog also covers the cases where the exit
            // value is different or was never received (an exception was thrown).
            if (exitValue == EXIT_VALUE_SIGTERM || watchdog.killedProcess()) {
                msg += " (time out reached. The timeout was " + timeout + "ms)";
            }
            if (exception == null) {
                throw new NuxeoException(msg);
            }
            throw new NuxeoException(msg, exception);
        }

        if (exception != null && log.isDebugEnabled()) {
            // Deliberately not fatal: the PDF is valid, which is the only thing we rely on.
            log.debug("The command line failed but a valid PDF was generated", exception);
        }

        resultPdf.setMimeType("application/pdf");
        String url = inParams.getParameter(CommandLineParameters.URL);
        // Url parameter can be blank (hard coded url in the command line XML for example)
        if (StringUtils.isBlank(inFileName) && StringUtils.isNotBlank(url)) {
            try {
                String host = new URL(url).getHost();
                // No host (file: url for example) => we don't want a file named just ".pdf"
                if (StringUtils.isNotBlank(host)) {
                    inFileName = StringUtils.replace(host, ".", "-") + ".pdf";
                }
            } catch (MalformedURLException e) {
                // wkhtmltopdf accepts urls java.net.URL rejects (no protocol for example). The PDF is valid, so
                // this must not fail the conversion: we keep the default name set by nuxeo.
                if (log.isDebugEnabled()) {
                    log.debug("Cannot build a file name from the url <" + url + ">, using the default name", e);
                }
            }
        }
        if (StringUtils.isNotBlank(inFileName)) {
            resultPdf.setFilename(inFileName);
        }

        return resultPdf;
    }

    /*
     * This call is a bit expensive (in term of CPU). But as we can't rely on the exitReturn value from wkhtmltopdf, nor
     * on just the size of the file, we must check the PDF looks ok. Using PDFBox here.
     *
     * The file is considered valid when it exists, is not empty, can be loaded by PDFBox and has at least one page.
     */
    protected boolean pdfLooksValid(File inPdf) {

        if (inPdf == null || !inPdf.exists() || inPdf.length() == 0) {
            return false;
        }

        boolean valid = false;
        PDDocument pdfDoc = null;
        try {
            pdfDoc = PDDocument.load(inPdf);
            valid = pdfDoc.getNumberOfPages() > 0;
        } catch (IOException e) {
            // Not a PDF PDFBox can read => not valid. This is an expected outcome, not an error.
            if (log.isDebugEnabled()) {
                log.debug("Cannot load <" + inPdf.getAbsolutePath() + "> as a PDF", e);
            }
        } finally {
            if (pdfDoc != null) {
                try {
                    pdfDoc.close();
                } catch (IOException e) {
                    // Ignore: failing to close the document does not change the result of the check
                }
            }
        }

        return valid;
    }

    /**
     * Sets the timeout applied to the command line.
     * 
     * @param newValue the timeout in milliseconds; a value lower than 1 resets it to {@link #TIMEOUT_DEFAULT}
     */
    public void setTimeout(int newValue) {
        timeout = newValue < 1 ? TIMEOUT_DEFAULT : newValue;
    }

}