it.vige.albopretorio.ocr.OCRExtractAction.java Source code

Introduction

Here is the source code for it.vige.albopretorio.ocr.OCRExtractAction.java
Source

/******************************************************************************
 * Vige, Home of Professional Open Source Copyright 2010, Vige, and           *
 * individual contributors by the @authors tag. See the copyright.txt in the  *
 * distribution for a full listing of individual contributors.                *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may    *
 * not use this file except in compliance with the License. You may obtain    *
 * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0        *
 * Unless required by applicable law or agreed to in writing, software        *
 * distributed under the License is distributed on an "AS IS" BASIS,          *
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.   *
 * See the License for the specific language governing permissions and        *
 * limitations under the License.                                             *
 ******************************************************************************/
package it.vige.albopretorio.ocr;

import static it.vige.albopretorio.ocr.model.OCRdModel.*;
import static it.vige.albopretorio.ocr.model.OCRdModel.PROP_PROCESSED_DATE;
import static org.alfresco.model.ContentModel.PROP_CONTENT;
import static org.alfresco.repo.content.MimetypeMap.MIMETYPE_PDF;
import static org.alfresco.service.cmr.dictionary.DataTypeDefinition.BOOLEAN;
import static org.alfresco.service.cmr.version.VersionType.MINOR;
import static org.apache.commons.logging.LogFactory.getLog;
import static org.apache.pdfbox.pdmodel.PDDocument.loadNonSeq;

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.Serializable;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.alfresco.repo.action.ParameterDefinitionImpl;
import org.alfresco.repo.action.executer.ActionExecuterAbstractBase;
import org.alfresco.repo.version.VersionModel;
import org.alfresco.service.cmr.action.Action;
import org.alfresco.service.cmr.action.ParameterDefinition;
import org.alfresco.service.cmr.repository.ContentData;
import org.alfresco.service.cmr.repository.ContentReader;
import org.alfresco.service.cmr.repository.ContentService;
import org.alfresco.service.cmr.repository.ContentWriter;
import org.alfresco.service.cmr.repository.FileContentReader;
import org.alfresco.service.cmr.repository.NodeRef;
import org.alfresco.service.cmr.repository.NodeService;
import org.alfresco.service.cmr.version.Version;
import org.alfresco.service.cmr.version.VersionService;
import org.alfresco.service.namespace.QName;
import org.apache.commons.logging.Log;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;

public class OCRExtractAction extends ActionExecuterAbstractBase {

    private static final Log logger = getLog(OCRExtractAction.class);

    private NodeService nodeService;
    private ContentService contentService;
    private VersionService versionService;

    private OCRTransformWorker ocrTransformWorker;

    // Continue current operation in case of OCR error
    private static final String PARAM_CONTINUE_ON_ERROR = "continue-on-error";

    public void init() {
        super.init();
    }

    @Override
    protected void addParameterDefinitions(List<ParameterDefinition> paramList) {

        paramList.add(new ParameterDefinitionImpl(PARAM_CONTINUE_ON_ERROR, BOOLEAN, false,
                getParamDisplayLabel(PARAM_CONTINUE_ON_ERROR)));
    }

    @Override
    protected void executeImpl(Action action, NodeRef actionedUponNodeRef) {

        if (!nodeService.hasAspect(actionedUponNodeRef, ASPECT_OCRD)) {

            ContentData contentData = (ContentData) nodeService.getProperty(actionedUponNodeRef, PROP_CONTENT);

            // Exclude folders and other nodes without content
            if (contentData != null) {

                Boolean continueOnError = (Boolean) action.getParameterValue(PARAM_CONTINUE_ON_ERROR);
                if (continueOnError == null)
                    continueOnError = true;

                try {
                    // Current transaction
                    executeImplInternal(actionedUponNodeRef, contentData);
                } catch (Throwable throwableCurrentTransaction) {
                    if (continueOnError) {
                        logger.warn(actionedUponNodeRef + ": " + throwableCurrentTransaction.getMessage());
                    } else {
                        throw throwableCurrentTransaction;
                    }
                }

            }

        }

    }

    private void executeImplInternal(NodeRef actionedUponNodeRef, ContentData contentData) {

        String originalMimeType = contentData.getMimetype();
        if (originalMimeType.equals(MIMETYPE_PDF)) {

            ContentReader reader = contentService.getReader(actionedUponNodeRef, PROP_CONTENT);
            if (isBinary(reader)) {
                ContentWriter writer = contentService.getTempWriter();
                writer.setMimetype(MIMETYPE_PDF);

                try {
                    ocrTransformWorker.transform(reader, writer, null);
                } catch (Exception e) {
                    throw new RuntimeException(e);
                }

                // Set initial version if it's a new one
                versionService.ensureVersioningEnabled(actionedUponNodeRef, null);
                if (!versionService.isVersioned(actionedUponNodeRef)) {
                    Map<String, Serializable> versionProperties = new HashMap<String, Serializable>();
                    versionProperties.put(Version.PROP_DESCRIPTION, "OCRd");
                    versionProperties.put(VersionModel.PROP_VERSION_TYPE, MINOR);
                    versionService.createVersion(actionedUponNodeRef, versionProperties);
                }

                ContentWriter writeOriginalContent = null;
                // Update original PDF file
                writeOriginalContent = contentService.getWriter(actionedUponNodeRef, PROP_CONTENT, true);
                writeOriginalContent.putContent(writer.getReader());

                versionService.ensureVersioningEnabled(actionedUponNodeRef, null);
                Map<String, Serializable> versionProperties = new HashMap<String, Serializable>();
                versionProperties.put(Version.PROP_DESCRIPTION, "OCRd");
                versionProperties.put(VersionModel.PROP_VERSION_TYPE, MINOR);
                versionService.createVersion(actionedUponNodeRef, versionProperties);

            }
            nodeService.setType(actionedUponNodeRef, MODEL_OCRD);
            // Set OCRd aspect to avoid future re-OCR process
            Map<QName, Serializable> aspectProperties = new HashMap<QName, Serializable>();
            aspectProperties.put(PROP_PROCESSED_DATE, new Date());
            nodeService.addAspect(actionedUponNodeRef, ASPECT_OCRD, aspectProperties);
        }
    }

    public void setNodeService(NodeService nodeService) {
        this.nodeService = nodeService;
    }

    public void setContentService(ContentService contentService) {
        this.contentService = contentService;
    }

    public void setVersionService(VersionService versionService) {
        this.versionService = versionService;
    }

    public void setOcrTransformWorker(OCRTransformWorker ocrTransformWorker) {
        this.ocrTransformWorker = ocrTransformWorker;
    }

    private boolean isBinary(ContentReader reader) {
        String filePath = ((FileContentReader) reader).getFile().getAbsolutePath();
        String content = "";
        try (FileInputStream in = new FileInputStream(filePath)) {
            content = getText(in);
        } catch (IOException e) {
            logger.error(e);
        }

        return content.isEmpty();
    }

    public static String getText(InputStream in) throws IOException {
        PDDocument document = loadNonSeq(in, null);
        String content = new PDFTextStripper().getText(document).trim();
        document.close();
        return content;
    }

}