// Java tutorial
package edu.ist.psu.sagnik.research.pdfbox2playground.javatest; /** * Created by schoudhury on 6/24/16. */ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.awt.geom.Point2D; import java.awt.image.BufferedImage; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.util.Arrays; import java.util.HashSet; import java.util.List; import java.util.Set; import org.apache.pdfbox.cos.COSName; import org.apache.pdfbox.io.IOUtils; import org.apache.pdfbox.cos.COSStream; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.encryption.AccessPermission; import org.apache.pdfbox.pdmodel.graphics.image.PDImage; import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject; import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceGray; import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB; import org.apache.pdfbox.tools.imageio.ImageIOUtil; import org.apache.pdfbox.contentstream.PDFGraphicsStreamEngine; /** * Extracts the images from a PDF file. 
* * <p>usage: java org.apache.pdfbox.tools.ExtractImages <pdffile> <password> [imageprefix] * * @author Ben Litchfield */ public class ExtractImages { private static final String PASSWORD = "-password"; private static final String PREFIX = "-prefix"; private static final String DIRECTJPEG = "-directJPEG"; private static final List<String> JPEG = Arrays.asList(COSName.DCT_DECODE.getName(), COSName.DCT_DECODE_ABBREVIATION.getName()); private boolean directJPEG; private String prefix; private final Set<COSStream> seen = new HashSet<COSStream>(); private int imageCounter = 1; private ExtractImages() { } /** * Entry point for the application. * * @param args The command-line arguments. * @throws Exception If there is an error decrypting the document. */ public static void main(String[] args) throws Exception { // suppress the Dock icon on OS X System.setProperty("apple.awt.UIElement", "true"); ExtractImages extractor = new ExtractImages(); extractor.run(args); } private void run(String[] args) throws Exception { if (args.length < 1 || args.length > 4) { usage(); } else { String pdfFile = null; String password = ""; for (int i = 0; i < args.length; i++) { if (args[i].equals(PASSWORD)) { i++; if (i >= args.length) { usage(); } password = args[i]; } else if (args[i].equals(PREFIX)) { i++; if (i >= args.length) { usage(); } prefix = args[i]; } else if (args[i].equals(DIRECTJPEG)) { directJPEG = true; } else { if (pdfFile == null) { pdfFile = args[i]; } } } if (pdfFile == null) { usage(); } else { if (prefix == null && pdfFile.length() > 4) { prefix = pdfFile.substring(0, pdfFile.length() - 4); } extract(pdfFile, password); } } } /** * Print the usage requirements and exit. 
*/ private static void usage() { System.err.println("Usage: java org.apache.pdfbox.tools.ExtractImages [OPTIONS] <PDF file>\n" + " -password <password>Password to decrypt document\n" + " -prefix <image-prefix> Image prefix(default to pdf name)\n" + " -directJPEG Forces the direct extraction of JPEG images " + "regardless of colorspace\n" + " <PDF file> The PDF document to use\n"); System.exit(1); } private void extract(String pdfFile, String password) throws IOException { PDDocument document = null; try { document = PDDocument.load(new File(pdfFile), password); AccessPermission ap = document.getCurrentAccessPermission(); if (!ap.canExtractContent()) { throw new IOException("You do not have permission to extract images"); } for (int i = 0; i < document.getNumberOfPages(); i++) // todo: ITERATOR would be much better { PDPage page = document.getPage(i); ImageGraphicsEngine extractor = new ImageGraphicsEngine(page); extractor.run(); } } finally { if (document != null) { document.close(); } } } private class ImageGraphicsEngine extends PDFGraphicsStreamEngine { protected ImageGraphicsEngine(PDPage page) throws IOException { super(page); } public void run() throws IOException { processPage(getPage()); } @Override public void drawImage(PDImage pdImage) throws IOException { if (pdImage instanceof PDImageXObject) { PDImageXObject xobject = (PDImageXObject) pdImage; if (seen.contains(xobject.getCOSStream())) { // skip duplicate image return; } seen.add(xobject.getCOSStream()); } // save image String name = prefix + "-" + imageCounter; imageCounter++; System.out.println("Writing image: " + name); write2file(pdImage, name, directJPEG); } @Override public void appendRectangle(Point2D p0, Point2D p1, Point2D p2, Point2D p3) throws IOException { } @Override public void clip(int windingRule) throws IOException { } @Override public void moveTo(float x, float y) throws IOException { } @Override public void lineTo(float x, float y) throws IOException { } @Override public void 
curveTo(float x1, float y1, float x2, float y2, float x3, float y3) throws IOException { } @Override public Point2D getCurrentPoint() throws IOException { return new Point2D.Float(0, 0); } @Override public void closePath() throws IOException { } @Override public void endPath() throws IOException { } @Override public void strokePath() throws IOException { } @Override public void fillPath(int windingRule) throws IOException { } @Override public void fillAndStrokePath(int windingRule) throws IOException { } @Override public void shadingFill(COSName shadingName) throws IOException { } } /** * Writes the image to a file with the filename + an appropriate suffix, like "Image.jpg". * The suffix is automatically set by the * @param filename the filename * @throws IOException When somethings wrong with the corresponding file. */ private void write2file(PDImage pdImage, String filename, boolean directJPEG) throws IOException { String suffix = pdImage.getSuffix(); if (suffix == null) { suffix = "png"; } FileOutputStream out = null; try { out = new FileOutputStream(filename + "." + suffix); BufferedImage image = pdImage.getImage(); if (image != null) { if ("jpg".equals(suffix)) { String colorSpaceName = pdImage.getColorSpace().getName(); if (directJPEG || PDDeviceGray.INSTANCE.getName().equals(colorSpaceName) || PDDeviceRGB.INSTANCE.getName().equals(colorSpaceName)) { // RGB or Gray colorspace: get and write the unmodifiedJPEG stream //InputStream data = pdImage.getColor.getPartiallyFilteredStream(JPEG); //IOUtils.copy(data, out); //IOUtils.closeQuietly(data); BufferedImage b = pdImage.getImage(); ImageIOUtil.writeImage(b, "jpg", out); } else { // for CMYK and other "unusual" colorspaces, the JPEG will be converted ImageIOUtil.writeImage(image, suffix, out); } } else { ImageIOUtil.writeImage(image, suffix, out); } } out.flush(); } finally { if (out != null) { out.close(); } } } }