org.pennyledger.docstore.parser.impl.PDFImageExtractor.java Source code

Introduction

Here is the source code for org.pennyledger.docstore.parser.impl.PDFImageExtractor.java
Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.pennyledger.docstore.parser.impl;

import java.awt.geom.Point2D;
import java.awt.image.BufferedImage;
import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.HashSet;
import java.util.Set;

import org.apache.pdfbox.contentstream.PDFGraphicsStreamEngine;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.cos.COSStream;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
import org.apache.pdfbox.pdmodel.graphics.image.PDImage;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.pdfbox.pdmodel.graphics.state.PDGraphicsState;
import org.apache.pdfbox.util.Matrix;
import org.pennyledger.docstore.IDocumentContents;
import org.pennyledger.docstore.parser.IImageParser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Extracts the images from a PDF file.
 *
 * @author Ben Litchfield
 */
final class PDFImageExtractor {

    private static final Logger logger = LoggerFactory.getLogger(PDFImageExtractor.class);

    private final IImageParser imageParser;
    private final int dpi;
    private final Set<COSStream> seen = new HashSet<COSStream>();

    PDFImageExtractor(IImageParser imageParser, int dpi) {
        this.imageParser = imageParser;
        this.dpi = dpi;
    }

    /**
     * Entry point for the application.
     *
     * @param args
     *          The command-line arguments.
     * @throws IOException
     *           if there is an error reading the file or extracting the images.
     */
    //  public static void main(String[] args) throws IOException {
    //    // suppress the Dock icon on OS X
    //    System.setProperty("apple.awt.UIElement", "true");
    //
    //    IImageParser imageParser = new TesseractImageOCR();
    //    PDFImageExtractor extractor = new PDFImageExtractor(imageParser, 150);
    //    extractor.run(args);
    //  }

    //  private void run(String[] args) throws IOException {
    //    //String pdfFile = args[0];
    //    String pdfFile = "c:/PennyLedger/1d51-9fe1b211e8039458b2ac4dbbfbf1.pdf";
    //    if (pdfFile.length() <= 4) {
    //      throw new IllegalArgumentException("Invalid file name: not PDF");
    //    }
    //    String password = "";
    //    Path pdfPath = Paths.get(pdfFile);
    //    PDDocument document = PDDocument.load(pdfPath.toFile(), password);
    //    IDocumentContents docContents = new DocumentContents();
    //    extract(document, id, pdfPath, docContents);
    //  }

    IDocumentContents extract(PDDocument document, String id, IDocumentContents docContents) throws IOException {
        AccessPermission ap = document.getCurrentAccessPermission();
        if (!ap.canExtractContent()) {
            throw new IOException("You do not have permission to extract images");
        }

        for (int i = 0; i < document.getNumberOfPages(); i++) {
            PDPage page = document.getPage(i);
            ImageGraphicsEngine extractor = new ImageGraphicsEngine(page, i, id);
            extractor.run();
            IDocumentContents pageContents = extractor.getPageContents();
            docContents = docContents.merge(pageContents);
        }
        return docContents;
    }

    private class ImageGraphicsEngine extends PDFGraphicsStreamEngine {
        private final int pageIndex;
        private final String id;
        private IDocumentContents pageContents;
        private int imageIndex = 0;

        protected ImageGraphicsEngine(PDPage page, int pageIndex, String id) throws IOException {
            super(page);
            this.pageContents = new DocumentContents();
            this.pageIndex = pageIndex;
            this.id = id;
        }

        public void run() throws IOException {
            processPage(getPage());
        }

        @Override
        public void drawImage(PDImage pdImage) throws IOException {
            if (pdImage instanceof PDImageXObject) {
                PDImageXObject xobject = (PDImageXObject) pdImage;
                if (seen.contains(xobject.getCOSStream())) {
                    // skip duplicate image
                    return;
                }
                seen.add(xobject.getCOSStream());
            }

            logger.info("Extracting image {} from page {} of: {}", imageIndex, pageIndex, id);
            int imageWidth = pdImage.getWidth();
            int imageHeight = pdImage.getHeight();
            if (imageWidth > 1 && imageHeight > 1) {
                // Only OCR significant images
                PDGraphicsState gs = getGraphicsState();
                Matrix gm = gs.getCurrentTransformationMatrix();

                BufferedImage image = pdImage.getImage();
                Path ocrImagePath = OCRPaths.getOCRImagePath(id, pageIndex, imageIndex);
                ImageIO.writeImage(image, ocrImagePath);
                IDocumentContents imageContents = imageParser.parse(id, ocrImagePath);
                float pageWidth = gm.getScaleX() + gm.getTranslateX();

                // Scale the image down to page width (at 72dpi), and then scale to the dpi we want.
                double scale = (pageWidth / imageWidth) * (dpi / 72.0);
                imageContents.scaleSegments(scale);
                pageContents = pageContents.merge(imageContents);
                //////////Files.delete(ocrImagePath);
            }
            imageIndex++;
        }

        public IDocumentContents getPageContents() {
            return pageContents;
        }

        @Override
        public void appendRectangle(Point2D p0, Point2D p1, Point2D p2, Point2D p3) throws IOException {
        }

        @Override
        public void clip(int windingRule) throws IOException {
        }

        @Override
        public void moveTo(float x, float y) throws IOException {
        }

        @Override
        public void lineTo(float x, float y) throws IOException {
        }

        @Override
        public void curveTo(float x1, float y1, float x2, float y2, float x3, float y3) throws IOException {
        }

        @Override
        public Point2D getCurrentPoint() throws IOException {
            return new Point2D.Float(0, 0);
        }

        @Override
        public void closePath() throws IOException {
        }

        @Override
        public void endPath() throws IOException {
        }

        @Override
        public void strokePath() throws IOException {
        }

        @Override
        public void fillPath(int windingRule) throws IOException {
        }

        @Override
        public void fillAndStrokePath(int windingRule) throws IOException {
        }

        @Override
        public void shadingFill(COSName shadingName) throws IOException {
        }
    }

}