Example usage for org.apache.pdfbox.pdmodel PDDocument close

List of usage examples for org.apache.pdfbox.pdmodel PDDocument close

Introduction

On this page you can find example usages of the close() method of org.apache.pdfbox.pdmodel.PDDocument.

Prototype

@Override
public void close() throws IOException 

Source Link

Document

This will close the underlying COSDocument object.

Usage

From source file:org.apache.tika.parser.pdf.PDFParser.java

License:Apache License

/**
 * Loads the PDF from {@code stream}, records its encryption state, content
 * type and document metadata, runs the configured access check and, when a
 * handler is supplied, emits the document content to it.
 *
 * <p>The caller's stream is wrapped in a {@code CloseShieldInputStream}
 * before it is handed to PDFBox. The loaded document, the temporary
 * resources and the PDFBox font cache are released in the {@code finally}
 * section; each cleanup step runs even if an earlier one throws.
 *
 * @param stream   the PDF content
 * @param handler  receives the extracted content; when {@code null} only
 *                 metadata is extracted
 * @param metadata populated with {@code pdf:encrypted}, the content type and
 *                 whatever {@code extractMetadata} adds; also passed to
 *                 {@code getPassword}
 * @param context  source of an optional {@code PDFParserConfig} override and
 *                 passed to {@code getPassword}
 * @throws IOException   on read failures other than a bad password
 * @throws SAXException  declared for the content-handler calls made by the helpers
 * @throws TikaException on Tika-level failures; a bad password is reported as
 *                       an {@code EncryptedDocumentException}
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {

    PDDocument pdfDocument = null;
    TemporaryResources tmp = new TemporaryResources();
    //config from context, or default if not set via context
    PDFParserConfig localConfig = context.get(PDFParserConfig.class, defaultConfig);
    String password = "";
    try {
        // PDFBox can process entirely in memory, or can use a temp file
        //  for unpacked / processed resources
        // Decide which to do based on if we're reading from a file or not already
        TikaInputStream tstream = TikaInputStream.cast(stream);
        password = getPassword(metadata, context);
        if (tstream != null && tstream.hasFile()) {
            // File based, take that as a cue to use a temporary file
            RandomAccess scratchFile = new RandomAccessFile(tmp.createTemporaryFile(), "rw");
            if (localConfig.getUseNonSequentialParser()) {
                pdfDocument = PDDocument.loadNonSeq(new CloseShieldInputStream(stream), scratchFile, password);
            } else {
                pdfDocument = PDDocument.load(new CloseShieldInputStream(stream), scratchFile, true);
            }
        } else {
            // Go for the normal, stream based in-memory parsing
            if (localConfig.getUseNonSequentialParser()) {
                pdfDocument = PDDocument.loadNonSeq(new CloseShieldInputStream(stream),
                        new RandomAccessBuffer(), password);
            } else {
                pdfDocument = PDDocument.load(new CloseShieldInputStream(stream), true);
            }
        }
        metadata.set("pdf:encrypted", Boolean.toString(pdfDocument.isEncrypted()));

        //if using the classic parser and the doc is encrypted, we must manually decrypt
        if (!localConfig.getUseNonSequentialParser() && pdfDocument.isEncrypted()) {
            pdfDocument.decrypt(password);
        }

        metadata.set(Metadata.CONTENT_TYPE, "application/pdf");
        extractMetadata(pdfDocument, metadata);

        AccessChecker checker = localConfig.getAccessChecker();
        checker.check(metadata);
        if (handler != null) {
            if (shouldHandleXFAOnly(pdfDocument, localConfig)) {
                handleXFAOnly(pdfDocument, handler, metadata);
            } else {
                PDF2XHTML.process(pdfDocument, handler, context, metadata, localConfig);
            }
        }

    } catch (CryptographyException e) {
        //seq parser throws CryptographyException for bad password
        throw new EncryptedDocumentException(e);
    } catch (IOException e) {
        //nonseq parser throws IOException for bad password
        //At the Tika level, we want the same exception to be thrown
        if (e.getMessage() != null && e.getMessage().contains("Error (CryptographyException)")) {
            metadata.set("pdf:encrypted", Boolean.toString(true));
            throw new EncryptedDocumentException(e);
        }
        //rethrow any other IOExceptions
        throw e;
    } finally {
        // Nested so that a failure while closing the document cannot skip
        // temp-file disposal or the font-cache reset.
        try {
            if (pdfDocument != null) {
                pdfDocument.close();
            }
        } finally {
            try {
                tmp.dispose();
            } finally {
                //TODO: once we migrate to PDFBox 2.0, remove this (PDFBOX-2200)
                PDFont.clearResources();
            }
        }
    }
}

From source file:org.apache.tika.parser.pdf.PDFPureJavaParser.java

License:Apache License

/**
 * Loads the PDF from {@code stream}, records encryption state, content type
 * and document metadata, runs the configured access check and, when a handler
 * is supplied, emits the content according to the configured OCR strategy.
 *
 * <p>OCR itself is not performed here: for the OCR strategies only the
 * {@code X-Parsed-By} metadata entry is added.
 *
 * @param stream   the PDF content
 * @param handler  receives the extracted content; when {@code null} only
 *                 metadata is extracted
 * @param metadata populated with the extracted values; also passed to
 *                 {@code getPassword}
 * @param context  source of an optional {@code PDFPureJavaParserConfig}
 *                 override and passed to {@code getPassword}
 * @throws IOException   on read failures
 * @throws SAXException  declared for the content-handler calls made by the helpers
 * @throws TikaException on Tika-level failures, including a wrong password
 *                       ({@code EncryptedDocumentException}) and a PDF timeout
 *                       ({@code TikaPdfTimeoutException})
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {

    final PDFPureJavaParserConfig config = context.get(PDFPureJavaParserConfig.class, defaultConfig);

    PDDocument pdf = null;
    try {
        // PDFBox can work fully in memory or read straight from a file;
        // pick the file route when the incoming stream is already file backed.
        //TODO: make this configurable via MemoryUsageSetting
        final TikaInputStream tikaStream = TikaInputStream.cast(stream);
        final String password = getPassword(metadata, context);
        final boolean fileBacked = tikaStream != null && tikaStream.hasFile();
        pdf = fileBacked
                ? PDDocument.load(tikaStream.getPath().toFile(), password)
                : PDDocument.load(new CloseShieldInputStream(stream), password);

        metadata.set(PDF.IS_ENCRYPTED, Boolean.toString(pdf.isEncrypted()));
        metadata.set(Metadata.CONTENT_TYPE, MEDIA_TYPE.toString());
        extractMetadata(pdf, metadata, context);
        config.getAccessChecker().check(metadata);

        if (handler == null) {
            // Metadata-only parse; the finally block still closes the document.
            return;
        }

        if (shouldHandleXFAOnly(pdf, config)) {
            handleXFAOnly(pdf, handler, metadata, context);
            return;
        }

        if (config.getOcrStrategy().equals(PDFPureJavaParserConfig.OCR_STRATEGY.OCR_ONLY)) {
            // OCR is not supported by this parser: only tag the metadata.
            metadata.add("X-Parsed-By", "org.apache.tika.parser.ocr.TesseractOCRParser");
            return;
        }

        if (config.getOcrStrategy().equals(PDFPureJavaParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION)) {
            metadata.add("X-Parsed-By", "org.apache.tika.parser.ocr.TesseractOCRParser");
        }
        PDF2XHTMLPureJava.process(pdf, handler, context, metadata, config);
    } catch (InvalidPasswordException e) {
        metadata.set(PDF.IS_ENCRYPTED, "true");
        throw new EncryptedDocumentException(e);
    } catch (final PdfTimeoutException e) {
        throw new TikaPdfTimeoutException("PdfTimeoutException", e);
    } finally {
        if (pdf != null) {
            pdf.close();
        }
    }
}

From source file:org.codelibs.robot.extractor.impl.PdfExtractor.java

License:Apache License

/**
 * Extracts the text and metadata of a PDF read from {@code in}.
 *
 * <p>The work is serialized on {@code pdfBoxLockObj}. Text stripping runs on
 * a daemon thread that is joined for at most {@code timeout}; if it has not
 * finished by then it is interrupted repeatedly (up to 100 times, 50 ms
 * apart) and the extraction fails.
 *
 * <p>If the calling thread is interrupted while waiting, its interrupt flag
 * is restored before the failure is reported, so callers further up the
 * stack can still observe the interruption.
 *
 * @param in     the PDF content; must not be {@code null}
 * @param params optional parameters; consulted for a password when the
 *               document is encrypted, may be {@code null}
 * @return the extracted text together with the metadata added by
 *         {@code extractMetadata}
 * @throws RobotSystemException if {@code in} is {@code null}
 * @throws ExtractException     wrapping any failure during extraction,
 *                              including the timeout
 */
@Override
public ExtractData getText(final InputStream in, final Map<String, String> params) {
    if (in == null) {
        throw new RobotSystemException("The inputstream is null.");
    }
    synchronized (pdfBoxLockObj) {
        PDDocument document = null;
        try {
            document = PDDocument.load(in, null, force);
            if (document.isEncrypted() && params != null) {
                String password = params.get(ExtractData.PDF_PASSWORD);
                if (password == null) {
                    password = getPassword(params.get(ExtractData.URL),
                            params.get(TikaMetadataKeys.RESOURCE_NAME_KEY));
                }
                if (password != null) {
                    final StandardDecryptionMaterial sdm = new StandardDecryptionMaterial(password);
                    document.openProtection(sdm);
                    final AccessPermission ap = document.getCurrentAccessPermission();

                    if (!ap.canExtractContent()) {
                        throw new IOException("You do not have permission to extract text.");
                    }
                }
            }

            final ByteArrayOutputStream baos = new ByteArrayOutputStream();
            final Writer output = new OutputStreamWriter(baos, encoding);
            final PDFTextStripper stripper = new PDFTextStripper(encoding);
            stripper.setForceParsing(force);
            final AtomicBoolean done = new AtomicBoolean(false);
            final PDDocument doc = document;
            final Set<Exception> exceptionSet = new HashSet<>();
            final Thread task = new Thread(() -> {
                try {
                    stripper.writeText(doc, output);
                } catch (final Exception e) {
                    exceptionSet.add(e);
                } finally {
                    // Set last so that a reader seeing done == true also sees exceptionSet
                    done.set(true);
                }
            });
            task.setDaemon(true);
            task.start();
            task.join(timeout);
            if (!done.get()) {
                // Timed out: keep interrupting the worker until it stops or we give up
                for (int i = 0; i < 100 && !done.get(); i++) {
                    task.interrupt();
                    Thread.sleep(50);
                }
                throw new ExtractException("PDFBox process cannot finish in " + timeout + " sec.");
            } else if (!exceptionSet.isEmpty()) {
                throw exceptionSet.iterator().next();
            }
            output.flush();
            final ExtractData extractData = new ExtractData(baos.toString(encoding));
            extractMetadata(document, extractData);
            return extractData;
        } catch (final InterruptedException e) {
            // join()/sleep() clear the interrupt flag when they throw; restore it
            // so the interruption is not lost behind the ExtractException.
            Thread.currentThread().interrupt();
            throw new ExtractException(e);
        } catch (final Exception e) {
            throw new ExtractException(e);
        } finally {
            if (document != null) {
                try {
                    document.close();
                } catch (final IOException e) {
                    // NOP: closing is best effort, the extraction result (or failure) wins
                }
            }
        }
    }
}

From source file:org.crossref.pdfmark.Main.java

License:Open Source License

/**
 * Parses the PDF read from {@code in}, copies every entry produced by
 * {@code XmpUtils.toInfo(xmp)} into the document information dictionary as
 * custom metadata, and saves the result to {@code outputFile}.
 *
 * <p>The parsed document is always closed, including when updating or
 * saving it fails.
 *
 * @param in         the source PDF
 * @param outputFile path the modified PDF is written to
 * @param xmp        XMP data converted to key/value pairs by {@code XmpUtils.toInfo}
 * @throws IOException         if parsing, saving or closing fails
 * @throws COSVisitorException if the document cannot be written
 */
public static void writeInfoDictionary(FileInputStream in, String outputFile, byte[] xmp)
        throws IOException, COSVisitorException {

    PDFParser parser = new PDFParser(in);
    parser.parse();

    PDDocument document = parser.getPDDocument();
    try {
        PDDocumentInformation info = document.getDocumentInformation();

        for (Entry<String, String> entry : XmpUtils.toInfo(xmp).entrySet()) {
            info.setCustomMetadataValue(entry.getKey(), entry.getValue());
        }

        document.setDocumentInformation(info);
        document.save(outputFile);
    } finally {
        // Release the document even when the metadata update or save throws
        document.close();
    }
}

From source file:org.data2semantics.annotate.D2S_SampleAnnotation.java

License:Apache License

/**
 * Creates a single-page document showing various annotations (a text
 * highlight, a link, a circle, a square and a line) and saves it as
 * {@code testAnnotation.pdf}. The document is closed whether or not
 * building or saving it succeeds.
 *
 * @param args
 *            The command line arguments (not read by this method).
 *
 * @throws Exception
 *             If there is an error building or saving the document.
 */
public static void main(String[] args) throws Exception {

    PDDocument document = new PDDocument();

    try {
        PDPage page = new PDPage();
        document.addPage(page);
        List annotations = page.getAnnotations();

        // Setup some basic reusable objects/constants
        // Annotations themselves can only be used once!

        // Coordinates below are expressed in points; 72 points = 1 inch
        float inch = 72;
        PDGamma colourRed = new PDGamma();
        colourRed.setR(1);
        PDGamma colourBlue = new PDGamma();
        colourBlue.setB(1);
        // No component set: used below as the line colour ("black")
        PDGamma colourBlack = new PDGamma();

        PDBorderStyleDictionary borderThick = new PDBorderStyleDictionary();
        borderThick.setWidth(inch / 12); // 12th inch
        PDBorderStyleDictionary borderThin = new PDBorderStyleDictionary();
        borderThin.setWidth(inch / 72); // 1 point
        PDBorderStyleDictionary borderULine = new PDBorderStyleDictionary();
        borderULine.setStyle(PDBorderStyleDictionary.STYLE_UNDERLINE);
        borderULine.setWidth(inch / 72); // 1 point

        // Page width and height, taken from the media box's upper-right corner
        float pw = page.getMediaBox().getUpperRightX();
        float ph = page.getMediaBox().getUpperRightY();

        // First add some text: two lines that the annotations below refer to

        PDFont font = PDType1Font.HELVETICA_BOLD;

        PDPageContentStream contentStream = new PDPageContentStream(document, page);
        contentStream.beginText();
        contentStream.setFont(font, 18);
        // Start 1" in from the left; 1" plus the 18pt font size below the top
        contentStream.moveTextPositionByAmount(inch, ph - inch - 18);
        contentStream.drawString("PDFBox");
        // Second line half an inch further down
        contentStream.moveTextPositionByAmount(0, -(inch / 2));
        contentStream.drawString("Click Here");
        contentStream.endText();

        contentStream.close();

        // Now add the markup annotation, a highlight to PDFBox text
        PDAnnotationTextMarkup txtMark = new PDAnnotationTextMarkup(PDAnnotationTextMarkup.SUB_TYPE_HIGHLIGHT);
        txtMark.setColour(colourBlue);
        // Constant opacity of 0.2. The original note read "make the highlight
        // 20% transparent". NOTE(review): an opacity of 0.2 would normally mean
        // mostly transparent; confirm the intended wording.
        txtMark.setConstantOpacity((float) 0.2);

        // Set the rectangle containing the markup

        // Font metrics are scaled by 1/1000 and the 18pt font size
        float textWidth = (font.getStringWidth("PDFBox") / 1000) * 18;
        PDRectangle position = new PDRectangle();
        position.setLowerLeftX(inch);
        position.setLowerLeftY(ph - inch - 18);
        position.setUpperRightX(72 + textWidth);
        position.setUpperRightY(ph - inch);
        txtMark.setRectangle(position);

        // work out the points forming the four corners of the annotations
        // set out in anti clockwise form (Completely wraps the text)
        // OK, the below doesn't match that description.
        // It's what acrobat 7 does and displays properly!
        float[] quads = new float[8];

        quads[0] = position.getLowerLeftX(); // x1
        quads[1] = position.getUpperRightY() - 2; // y1
        quads[2] = position.getUpperRightX(); // x2
        quads[3] = quads[1]; // y2
        quads[4] = quads[0]; // x3
        quads[5] = position.getLowerLeftY() - 2; // y3
        quads[6] = quads[2]; // x4
        quads[7] = quads[5]; // y4

        txtMark.setQuadPoints(quads);
        txtMark.setContents("Highlighted since it's important");

        annotations.add(txtMark);

        // Now add the link annotation, so the clickme works
        PDAnnotationLink txtLink = new PDAnnotationLink();
        txtLink.setBorderStyle(borderULine);

        // Set the rectangle containing the link

        textWidth = (font.getStringWidth("Click Here") / 1000) * 18;
        position = new PDRectangle();
        position.setLowerLeftX(inch);
        // Bottom edge 20 points below the top edge set further down
        // (the original note says "down a couple of points")
        position.setLowerLeftY(ph - (float) (1.5 * inch) - 20);
        position.setUpperRightX(72 + textWidth);
        position.setUpperRightY(ph - (float) (1.5 * inch));
        txtLink.setRectangle(position);

        // add an action
        PDActionURI action = new PDActionURI();
        action.setURI("http://www.pdfbox.org");
        txtLink.setAction(action);

        annotations.add(txtLink);

        // Now draw a few more annotations

        PDAnnotationSquareCircle aCircle = new PDAnnotationSquareCircle(
                PDAnnotationSquareCircle.SUB_TYPE_CIRCLE);
        aCircle.setContents("Circle Annotation");
        aCircle.setInteriorColour(colourRed); // Fill in circle in red
        aCircle.setColour(colourBlue); // The border itself will be blue
        aCircle.setBorderStyle(borderThin);

        // Place the annotation on the page, we'll make this 1" round
        // 3" down, 1" in on the page

        position = new PDRectangle();
        position.setLowerLeftX(inch);
        // 1" height, starting 3" down
        position.setLowerLeftY(ph - (3 * inch) - inch);
        position.setUpperRightX(2 * inch); // 1" in, 1" width
        position.setUpperRightY(ph - (3 * inch)); // 3" down
        aCircle.setRectangle(position);

        // add to the annotations on the page
        annotations.add(aCircle);

        // Now a square annotation

        PDAnnotationSquareCircle aSquare = new PDAnnotationSquareCircle(
                PDAnnotationSquareCircle.SUB_TYPE_SQUARE);
        aSquare.setContents("Square Annotation");
        aSquare.setColour(colourRed); // Outline in red, not setting a fill
        aSquare.setBorderStyle(borderThick);

        // Place the annotation on the page, we'll make this 1" (72points)
        // square
        // 3.5" down, 1" in from the right on the page

        // Reuse the variable, but note it's a new object!
        position = new PDRectangle();
        // 1" in from right, 1" wide
        position.setLowerLeftX(pw - (2 * inch));
        // 1" height, starting 3.5" down
        position.setLowerLeftY(ph - (float) (3.5 * inch) - inch);
        position.setUpperRightX(pw - inch); // 1" in from right
        position.setUpperRightY(ph - (float) (3.5 * inch)); // 3.5" down
        aSquare.setRectangle(position);

        // add to the annotations on the page
        annotations.add(aSquare);

        // Now we want to draw a line between the two, one end with an open
        // arrow

        PDAnnotationLine aLine = new PDAnnotationLine();

        aLine.setEndPointEndingStyle(PDAnnotationLine.LE_OPEN_ARROW);
        aLine.setContents("Circle->Square");
        aLine.setCaption(true); // Make the contents a caption on the line

        // Set the rectangle containing the line

        // Reuse the variable, but note it's a new object!
        position = new PDRectangle();
        position.setLowerLeftX(2 * inch); // 1" in + width of circle
        // Same bottom edge as the square: 3.5" down plus 1" height
        position.setLowerLeftY(ph - (float) (3.5 * inch) - inch);
        // 1" in from right, and width of square
        position.setUpperRightX(pw - inch - inch);
        position.setUpperRightY(ph - (3 * inch)); // 3" down (top of circle)
        aLine.setRectangle(position);

        // Now set the line position itself
        float[] linepos = new float[4];
        linepos[0] = 2 * inch; // x1 = rhs of circle
        linepos[1] = ph - (float) (3.5 * inch); // y1 halfway down circle
        linepos[2] = pw - (2 * inch); // x2 = lhs of square
        linepos[3] = ph - (4 * inch); // y2 halfway down square
        aLine.setLine(linepos);

        aLine.setBorderStyle(borderThick);
        aLine.setColour(colourBlack);

        // add to the annotations on the page
        annotations.add(aLine);

        // Finally all done

        document.save("testAnnotation.pdf");
    } finally {
        document.close();
    }
}

From source file:org.deidentifier.arx.certificate.ARXCertificate.java

License:Apache License

/**
 * Renders the document into the given output stream
 * /*from   w  w  w .  j  a va2 s.  c  o  m*/
 * @param stream
 * @throws IOException 
 */
public void save(OutputStream stream) throws IOException {

    // Render
    Document document = new Document(style.gethMargin(), style.gethMargin(), style.getvMargin(),
            style.getvMargin());
    for (Element element : this.elements) {
        element.render(document, 0, this.style);
    }

    // Save to temp file
    File tmp = File.createTempFile("arx", "certificate");
    document.save(tmp);

    // Load and watermark
    PDDocument pdDocument = PDDocument.load(tmp);
    Watermark watermark = new Watermark(pdDocument);
    watermark.mark(pdDocument);

    // Save
    pdDocument.save(stream);
    pdDocument.close();
    tmp.delete();
}

From source file:org.dspace.app.mediafilter.PDFBoxThumbnail.java

License:BSD License

/**
 * Renders the first page (index 0) of the PDF read from {@code source} and
 * hands the image to {@code JPEGFilter.getThumb} to produce the result.
 *
 * <p>The PDF document is closed as soon as rendering finishes, including
 * when rendering fails.
 *
 * @param currentItem
 *            passed through to {@code JPEGFilter.getThumb}
 * @param source
 *            source input stream
 * @param verbose
 *            passed through to {@code JPEGFilter.getThumb}
 *
 * @return InputStream the resulting input stream
 * @throws Exception
 *             if loading, rendering or thumbnail generation fails
 */
@Override
public InputStream getDestinationStream(Item currentItem, InputStream source, boolean verbose)
        throws Exception {
    BufferedImage buf;
    PDDocument doc = PDDocument.load(source);
    try {
        PDFRenderer renderer = new PDFRenderer(doc);
        buf = renderer.renderImage(0);
    } finally {
        // Release the document even when rendering throws
        doc.close();
    }

    JPEGFilter jpegFilter = new JPEGFilter();
    return jpegFilter.getThumb(currentItem, buf, verbose);
}

From source file:org.dspace.app.mediafilter.PDFFilter.java

License:BSD License

/**
 * @param source/*from   w  w w  .ja v a 2s .c o m*/
 *            source input stream
 *
 * @return InputStream the resulting input stream
 */
public InputStream getDestinationStream(InputStream source) throws Exception {
    try {
        boolean useTemporaryFile = ConfigurationManager.getBooleanProperty("pdffilter.largepdfs", false);

        // get input stream from bitstream
        // pass to filter, get string back
        PDFTextStripper pts = new PDFTextStripper();
        PDDocument pdfDoc = null;
        Writer writer = null;
        File tempTextFile = null;
        ByteArrayOutputStream byteStream = null;

        if (useTemporaryFile) {
            tempTextFile = File.createTempFile("dspacepdfextract" + source.hashCode(), ".txt");
            tempTextFile.deleteOnExit();
            writer = new OutputStreamWriter(new FileOutputStream(tempTextFile));
        } else {
            byteStream = new ByteArrayOutputStream();
            writer = new OutputStreamWriter(byteStream);
        }

        try {
            pdfDoc = PDDocument.load(source);
            pts.writeText(pdfDoc, writer);
        } finally {
            try {
                if (pdfDoc != null) {
                    pdfDoc.close();
                }
            } catch (Exception e) {
                log.error("Error closing PDF file: " + e.getMessage(), e);
            }

            try {
                writer.close();
            } catch (Exception e) {
                log.error("Error closing temporary extract file: " + e.getMessage(), e);
            }
        }

        if (useTemporaryFile) {
            return new FileInputStream(tempTextFile);
        } else {
            byte[] bytes = byteStream.toByteArray();
            return new ByteArrayInputStream(bytes);
        }
    } catch (OutOfMemoryError oome) {
        log.error("Error parsing PDF document " + oome.getMessage(), oome);
        if (!ConfigurationManager.getBooleanProperty("pdffilter.skiponmemoryexception", false)) {
            throw oome;
        }
    }

    return null;
}

From source file:org.dspace.disseminate.CitationDocument.java

License:BSD License

/**
 * Creates a cited document from the given bitstream. The parent object of
 * the bitstream must be the {@code Item} it belongs to.
 * <p>
 * The process for adding a cover page is as follows:
 * </p>
 * <ol>
 *  <li>Load the source PDF from the bitstream and create an empty
 *     document to put the cover page into.</li>
 *  <li>Create the cover page and add content to it.</li>
 *  <li>Concatenate the cover page and the source document.</li>
 * </ol>
 * <p>
 * Both PDF documents are closed before returning, including when a step
 * fails.
 * </p>
 *
 * @param bitstream The source bitstream being cited. This must be a PDF.
 * @return The temporary File that is the finished, cited document.
 * @throws IOException if the bitstream cannot be read or the result cannot be written
 * @throws SQLException declared for the bitstream accessors
 * @throws AuthorizeException declared for the bitstream accessors
 * @throws COSVisitorException if the cited document cannot be saved
 */
public File makeCitedDocument(Bitstream bitstream)
        throws IOException, SQLException, AuthorizeException, COSVisitorException {
    PDDocument document = new PDDocument();
    try {
        Item item = (Item) bitstream.getParentObject();
        // load() is static: call it on the class instead of allocating a
        // placeholder PDDocument that would never be closed.
        PDDocument sourceDocument = PDDocument.load(bitstream.retrieve());
        try {
            PDPage coverPage = new PDPage(PDPage.PAGE_SIZE_LETTER);
            generateCoverPage(document, coverPage, item);
            addCoverPageToDocument(document, sourceDocument, coverPage);

            String citedPath = tempDir.getAbsolutePath() + "/bitstream.cover.pdf";
            document.save(citedPath);
            return new File(citedPath);
        } finally {
            sourceDocument.close();
        }
    } finally {
        // Separate finally so that a failing sourceDocument.close() cannot skip this
        document.close();
    }
}

From source file:org.dspace.disseminate.CitationDocumentServiceImpl.java

License:BSD License

/**
 * Creates a cited document: a cover page generated for the bitstream's
 * parent item, followed by the pages of the bitstream's PDF. The result is
 * saved as {@code bitstream.cover.pdf} under {@code tempDir}.
 *
 * <p>Both PDF documents are closed before returning, including when a step
 * fails.
 *
 * @param context   passed to the bitstream service and to cover-page generation
 * @param bitstream the source bitstream being cited
 * @return the file holding the finished, cited document
 * @throws IOException        if the bitstream cannot be read or the result cannot be written
 * @throws SQLException       declared for the bitstream service calls
 * @throws AuthorizeException declared for the bitstream service calls
 */
@Override
public File makeCitedDocument(Context context, Bitstream bitstream)
        throws IOException, SQLException, AuthorizeException {
    PDDocument document = new PDDocument();
    try {
        Item item = (Item) bitstreamService.getParentObject(context, bitstream);
        // load() is static: call it on the class instead of allocating a
        // placeholder PDDocument that would never be closed.
        PDDocument sourceDocument = PDDocument.load(bitstreamService.retrieve(context, bitstream));
        try {
            PDPage coverPage = new PDPage(PDRectangle.LETTER); // TODO: needs to be configurable
            generateCoverPage(context, document, coverPage, item);
            addCoverPageToDocument(document, sourceDocument, coverPage);

            String citedPath = tempDir.getAbsolutePath() + "/bitstream.cover.pdf";
            document.save(citedPath);
            return new File(citedPath);
        } finally {
            sourceDocument.close();
        }
    } finally {
        // Separate finally so that a failing sourceDocument.close() cannot skip this
        document.close();
    }
}