List of usage examples for org.apache.pdfbox.pdmodel.PDDocument#close()
@Override public void close() throws IOException
From source file:org.apache.tika.parser.pdf.PDFParser.java
License:Apache License
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { PDDocument pdfDocument = null; TemporaryResources tmp = new TemporaryResources(); //config from context, or default if not set via context PDFParserConfig localConfig = context.get(PDFParserConfig.class, defaultConfig); String password = ""; try {/*w w w . j av a 2 s . com*/ // PDFBox can process entirely in memory, or can use a temp file // for unpacked / processed resources // Decide which to do based on if we're reading from a file or not already TikaInputStream tstream = TikaInputStream.cast(stream); password = getPassword(metadata, context); if (tstream != null && tstream.hasFile()) { // File based, take that as a cue to use a temporary file RandomAccess scratchFile = new RandomAccessFile(tmp.createTemporaryFile(), "rw"); if (localConfig.getUseNonSequentialParser() == true) { pdfDocument = PDDocument.loadNonSeq(new CloseShieldInputStream(stream), scratchFile, password); } else { pdfDocument = PDDocument.load(new CloseShieldInputStream(stream), scratchFile, true); } } else { // Go for the normal, stream based in-memory parsing if (localConfig.getUseNonSequentialParser() == true) { pdfDocument = PDDocument.loadNonSeq(new CloseShieldInputStream(stream), new RandomAccessBuffer(), password); } else { pdfDocument = PDDocument.load(new CloseShieldInputStream(stream), true); } } metadata.set("pdf:encrypted", Boolean.toString(pdfDocument.isEncrypted())); //if using the classic parser and the doc is encrypted, we must manually decrypt if (!localConfig.getUseNonSequentialParser() && pdfDocument.isEncrypted()) { pdfDocument.decrypt(password); } metadata.set(Metadata.CONTENT_TYPE, "application/pdf"); extractMetadata(pdfDocument, metadata); AccessChecker checker = localConfig.getAccessChecker(); checker.check(metadata); if (handler != null) { if (shouldHandleXFAOnly(pdfDocument, localConfig)) { handleXFAOnly(pdfDocument, 
handler, metadata); } else { PDF2XHTML.process(pdfDocument, handler, context, metadata, localConfig); } } } catch (CryptographyException e) { //seq parser throws CryptographyException for bad password throw new EncryptedDocumentException(e); } catch (IOException e) { //nonseq parser throws IOException for bad password //At the Tika level, we want the same exception to be thrown if (e.getMessage() != null && e.getMessage().contains("Error (CryptographyException)")) { metadata.set("pdf:encrypted", Boolean.toString(true)); throw new EncryptedDocumentException(e); } //rethrow any other IOExceptions throw e; } finally { if (pdfDocument != null) { pdfDocument.close(); } tmp.dispose(); //TODO: once we migrate to PDFBox 2.0, remove this (PDFBOX-2200) PDFont.clearResources(); } }
From source file:org.apache.tika.parser.pdf.PDFPureJavaParser.java
License:Apache License
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { PDFPureJavaParserConfig localConfig = context.get(PDFPureJavaParserConfig.class, defaultConfig); PDDocument pdfDocument = null; String password = ""; try {//from w ww . j a v a 2 s .c o m // PDFBox can process entirely in memory, or can use a temp file // for unpacked / processed resources // Decide which to do based on if we're reading from a file or not already //TODO: make this configurable via MemoryUsageSetting TikaInputStream tstream = TikaInputStream.cast(stream); password = getPassword(metadata, context); if (tstream != null && tstream.hasFile()) { // File based -- send file directly to PDFBox pdfDocument = PDDocument.load(tstream.getPath().toFile(), password); } else { pdfDocument = PDDocument.load(new CloseShieldInputStream(stream), password); } metadata.set(PDF.IS_ENCRYPTED, Boolean.toString(pdfDocument.isEncrypted())); metadata.set(Metadata.CONTENT_TYPE, MEDIA_TYPE.toString()); extractMetadata(pdfDocument, metadata, context); AccessChecker checker = localConfig.getAccessChecker(); checker.check(metadata); if (handler != null) { if (shouldHandleXFAOnly(pdfDocument, localConfig)) { handleXFAOnly(pdfDocument, handler, metadata, context); } else if (localConfig.getOcrStrategy().equals(PDFPureJavaParserConfig.OCR_STRATEGY.OCR_ONLY)) { metadata.add("X-Parsed-By", "org.apache.tika.parser.ocr.TesseractOCRParser"); // No-ops. Do not support OCR parser. 
} else { if (localConfig.getOcrStrategy() .equals(PDFPureJavaParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION)) { metadata.add("X-Parsed-By", "org.apache.tika.parser.ocr.TesseractOCRParser"); } PDF2XHTMLPureJava.process(pdfDocument, handler, context, metadata, localConfig); } } } catch (InvalidPasswordException e) { metadata.set(PDF.IS_ENCRYPTED, "true"); throw new EncryptedDocumentException(e); } catch (final PdfTimeoutException e) { throw new TikaPdfTimeoutException("PdfTimeoutException", e); } finally { if (pdfDocument != null) { pdfDocument.close(); } } }
From source file:org.codelibs.robot.extractor.impl.PdfExtractor.java
License:Apache License
@Override public ExtractData getText(final InputStream in, final Map<String, String> params) { if (in == null) { throw new RobotSystemException("The inputstream is null."); }/* w w w . java2 s .c o m*/ synchronized (pdfBoxLockObj) { PDDocument document = null; try { document = PDDocument.load(in, null, force); if (document.isEncrypted() && params != null) { String password = params.get(ExtractData.PDF_PASSWORD); if (password == null) { password = getPassword(params.get(ExtractData.URL), params.get(TikaMetadataKeys.RESOURCE_NAME_KEY)); } if (password != null) { final StandardDecryptionMaterial sdm = new StandardDecryptionMaterial(password); document.openProtection(sdm); final AccessPermission ap = document.getCurrentAccessPermission(); if (!ap.canExtractContent()) { throw new IOException("You do not have permission to extract text."); } } } final ByteArrayOutputStream baos = new ByteArrayOutputStream(); final Writer output = new OutputStreamWriter(baos, encoding); final PDFTextStripper stripper = new PDFTextStripper(encoding); stripper.setForceParsing(force); final AtomicBoolean done = new AtomicBoolean(false); final PDDocument doc = document; final Set<Exception> exceptionSet = new HashSet<>(); final Thread task = new Thread(() -> { try { stripper.writeText(doc, output); } catch (final Exception e) { exceptionSet.add(e); } finally { done.set(true); } }); task.setDaemon(true); task.start(); task.join(timeout); if (!done.get()) { for (int i = 0; i < 100 && !done.get(); i++) { task.interrupt(); Thread.sleep(50); } throw new ExtractException("PDFBox process cannot finish in " + timeout + " sec."); } else if (!exceptionSet.isEmpty()) { throw exceptionSet.iterator().next(); } output.flush(); final ExtractData extractData = new ExtractData(baos.toString(encoding)); extractMetadata(document, extractData); return extractData; } catch (final Exception e) { throw new ExtractException(e); } finally { if (document != null) { try { document.close(); } catch (final IOException e) 
{ // NOP } } } } }
From source file:org.crossref.pdfmark.Main.java
License:Open Source License
/**
 * Parses the PDF read from {@code in}, copies the entries produced by
 * {@code XmpUtils.toInfo(xmp)} into its document information dictionary as custom
 * metadata values, and saves the result to {@code outputFile}.
 *
 * <p>The parsed document is closed even when updating or saving it fails.
 *
 * @param in         the PDF to read
 * @param outputFile path the updated PDF is saved to
 * @param xmp        XMP data converted to key/value pairs by {@code XmpUtils.toInfo}
 * @throws IOException         if parsing, saving or closing the document fails
 * @throws COSVisitorException declared by the PDFBox save call
 */
public static void writeInfoDictionary(FileInputStream in, String outputFile, byte[] xmp)
        throws IOException, COSVisitorException {
    PDFParser parser = new PDFParser(in);
    parser.parse();
    PDDocument document = parser.getPDDocument();
    try {
        PDDocumentInformation info = document.getDocumentInformation();
        for (Entry<String, String> entry : XmpUtils.toInfo(xmp).entrySet()) {
            info.setCustomMetadataValue(entry.getKey(), entry.getValue());
        }
        document.setDocumentInformation(info);
        document.save(outputFile);
    } finally {
        // Release the document on the failure path as well.
        document.close();
    }
}
From source file:org.data2semantics.annotate.D2S_SampleAnnotation.java
License:Apache License
/**
 * This will create a document showing various annotations: a highlight, a link,
 * a circle, a square and a line. The result is saved as "testAnnotation.pdf".
 *
 * @param args
 *            The command line arguments (not read by this method).
 *
 * @throws Exception
 *             If there is an error building or saving the document.
 */
public static void main(String[] args) throws Exception {
    PDDocument document = new PDDocument();
    try {
        PDPage page = new PDPage();
        document.addPage(page);
        List annotations = page.getAnnotations();

        // Setup some basic reusable objects/constants
        // Annotations themselves can only be used once!
        float inch = 72;
        PDGamma colourRed = new PDGamma();
        colourRed.setR(1);
        PDGamma colourBlue = new PDGamma();
        colourBlue.setB(1);
        PDGamma colourBlack = new PDGamma();

        PDBorderStyleDictionary borderThick = new PDBorderStyleDictionary();
        borderThick.setWidth(inch / 12); // 12th inch
        PDBorderStyleDictionary borderThin = new PDBorderStyleDictionary();
        borderThin.setWidth(inch / 72); // 1 point
        PDBorderStyleDictionary borderULine = new PDBorderStyleDictionary();
        borderULine.setStyle(PDBorderStyleDictionary.STYLE_UNDERLINE);
        borderULine.setWidth(inch / 72); // 1 point

        // Page width and height, taken from the media box's upper-right corner
        float pw = page.getMediaBox().getUpperRightX();
        float ph = page.getMediaBox().getUpperRightY();

        // First add some text, two lines; we'll add some annotations to this later
        PDFont font = PDType1Font.HELVETICA_BOLD;
        PDPageContentStream contentStream = new PDPageContentStream(document, page);
        contentStream.beginText();
        contentStream.setFont(font, 18);
        contentStream.moveTextPositionByAmount(inch, ph - inch - 18);
        contentStream.drawString("PDFBox");
        contentStream.moveTextPositionByAmount(0, -(inch / 2));
        contentStream.drawString("Click Here");
        contentStream.endText();
        contentStream.close();

        // Now add the markup annotation, a highlight to PDFBox text
        PDAnnotationTextMarkup txtMark = new PDAnnotationTextMarkup(PDAnnotationTextMarkup.SUB_TYPE_HIGHLIGHT);
        txtMark.setColour(colourBlue);
        txtMark.setConstantOpacity((float) 0.2); // Set the highlight's constant opacity to 0.2

        // Set the rectangle containing the markup
        float textWidth = (font.getStringWidth("PDFBox") / 1000) * 18;
        PDRectangle position = new PDRectangle();
        position.setLowerLeftX(inch);
        position.setLowerLeftY(ph - inch - 18);
        position.setUpperRightX(72 + textWidth);
        position.setUpperRightY(ph - inch);
        txtMark.setRectangle(position);

        // work out the points forming the four corners of the annotations
        // set out in anti clockwise form (Completely wraps the text)
        // OK, the below doesn't match that description.
        // It's what acrobat 7 does and displays properly!
        float[] quads = new float[8];
        quads[0] = position.getLowerLeftX(); // x1
        quads[1] = position.getUpperRightY() - 2; // y1
        quads[2] = position.getUpperRightX(); // x2
        quads[3] = quads[1]; // y2
        quads[4] = quads[0]; // x3
        quads[5] = position.getLowerLeftY() - 2; // y3
        quads[6] = quads[2]; // x4
        quads[7] = quads[5]; // y4
        txtMark.setQuadPoints(quads);
        txtMark.setContents("Highlighted since it's important");
        annotations.add(txtMark);

        // Now add the link annotation, so the "Click Here" text works
        PDAnnotationLink txtLink = new PDAnnotationLink();
        txtLink.setBorderStyle(borderULine);

        // Set the rectangle containing the link
        textWidth = (font.getStringWidth("Click Here") / 1000) * 18;
        position = new PDRectangle();
        position.setLowerLeftX(inch);
        position.setLowerLeftY(ph - (float) (1.5 * inch) - 20); // down a couple of points
        position.setUpperRightX(72 + textWidth);
        position.setUpperRightY(ph - (float) (1.5 * inch));
        txtLink.setRectangle(position);

        // add an action
        PDActionURI action = new PDActionURI();
        action.setURI("http://www.pdfbox.org");
        txtLink.setAction(action);
        annotations.add(txtLink);

        // Now draw a few more annotations
        PDAnnotationSquareCircle aCircle = new PDAnnotationSquareCircle(
                PDAnnotationSquareCircle.SUB_TYPE_CIRCLE);
        aCircle.setContents("Circle Annotation");
        aCircle.setInteriorColour(colourRed); // Fill in circle in red
        aCircle.setColour(colourBlue); // The border itself will be blue
        aCircle.setBorderStyle(borderThin);

        // Place the annotation on the page, we'll make this 1" round
        // 3" down, 1" in on the page
        position = new PDRectangle();
        position.setLowerLeftX(inch);
        position.setLowerLeftY(ph - (3 * inch) - inch); // 1" height, 3" down
        position.setUpperRightX(2 * inch); // 1" in, 1" width
        position.setUpperRightY(ph - (3 * inch)); // 3" down
        aCircle.setRectangle(position);

        // add to the annotations on the page
        annotations.add(aCircle);

        // Now a square annotation
        PDAnnotationSquareCircle aSquare = new PDAnnotationSquareCircle(
                PDAnnotationSquareCircle.SUB_TYPE_SQUARE);
        aSquare.setContents("Square Annotation");
        aSquare.setColour(colourRed); // Outline in red, not setting a fill
        aSquare.setBorderStyle(borderThick);

        // Place the annotation on the page, we'll make this 1" (72 points) square
        // 3.5" down, 1" in from the right on the page
        position = new PDRectangle(); // Reuse the variable, but note it's a new object!
        position.setLowerLeftX(pw - (2 * inch)); // 1" in from right, 1" wide
        position.setLowerLeftY(ph - (float) (3.5 * inch) - inch); // 1" height, 3.5" down
        position.setUpperRightX(pw - inch); // 1" in from right
        position.setUpperRightY(ph - (float) (3.5 * inch)); // 3.5" down
        aSquare.setRectangle(position);

        // add to the annotations on the page
        annotations.add(aSquare);

        // Now we want to draw a line between the two, one end with an open arrow
        PDAnnotationLine aLine = new PDAnnotationLine();
        aLine.setEndPointEndingStyle(PDAnnotationLine.LE_OPEN_ARROW);
        aLine.setContents("Circle->Square");
        aLine.setCaption(true); // Make the contents a caption on the line

        // Set the rectangle containing the line
        position = new PDRectangle(); // Reuse the variable, but note it's a new object!
        position.setLowerLeftX(2 * inch); // 1" in + width of circle
        position.setLowerLeftY(ph - (float) (3.5 * inch) - inch); // 1" height, 3.5" down
        position.setUpperRightX(pw - inch - inch); // 1" in from right, and width of square
        position.setUpperRightY(ph - (3 * inch)); // 3" down (top of circle)
        aLine.setRectangle(position);

        // Now set the line position itself
        float[] linepos = new float[4];
        linepos[0] = 2 * inch; // x1 = rhs of circle
        linepos[1] = ph - (float) (3.5 * inch); // y1 halfway down circle
        linepos[2] = pw - (2 * inch); // x2 = lhs of square
        linepos[3] = ph - (4 * inch); // y2 halfway down square
        aLine.setLine(linepos);
        aLine.setBorderStyle(borderThick);
        aLine.setColour(colourBlack);

        // add to the annotations on the page
        annotations.add(aLine);

        // Finally all done
        document.save("testAnnotation.pdf");
    } finally {
        // Always release the document, whether or not building/saving succeeded
        document.close();
    }
}
From source file:org.deidentifier.arx.certificate.ARXCertificate.java
License:Apache License
/** * Renders the document into the given output stream * /*from w w w . j a va2 s. c o m*/ * @param stream * @throws IOException */ public void save(OutputStream stream) throws IOException { // Render Document document = new Document(style.gethMargin(), style.gethMargin(), style.getvMargin(), style.getvMargin()); for (Element element : this.elements) { element.render(document, 0, this.style); } // Save to temp file File tmp = File.createTempFile("arx", "certificate"); document.save(tmp); // Load and watermark PDDocument pdDocument = PDDocument.load(tmp); Watermark watermark = new Watermark(pdDocument); watermark.mark(pdDocument); // Save pdDocument.save(stream); pdDocument.close(); tmp.delete(); }
From source file:org.dspace.app.mediafilter.PDFBoxThumbnail.java
License:BSD License
/** * @param source//from w ww . j a v a 2 s . c o m * source input stream * * @return InputStream the resulting input stream */ @Override public InputStream getDestinationStream(Item currentItem, InputStream source, boolean verbose) throws Exception { PDDocument doc = PDDocument.load(source); PDFRenderer renderer = new PDFRenderer(doc); BufferedImage buf = renderer.renderImage(0); // ImageIO.write(buf, "PNG", new File("custom-render.png")); doc.close(); JPEGFilter jpegFilter = new JPEGFilter(); return jpegFilter.getThumb(currentItem, buf, verbose); }
From source file:org.dspace.app.mediafilter.PDFFilter.java
License:BSD License
/** * @param source/*from w w w .ja v a 2s .c o m*/ * source input stream * * @return InputStream the resulting input stream */ public InputStream getDestinationStream(InputStream source) throws Exception { try { boolean useTemporaryFile = ConfigurationManager.getBooleanProperty("pdffilter.largepdfs", false); // get input stream from bitstream // pass to filter, get string back PDFTextStripper pts = new PDFTextStripper(); PDDocument pdfDoc = null; Writer writer = null; File tempTextFile = null; ByteArrayOutputStream byteStream = null; if (useTemporaryFile) { tempTextFile = File.createTempFile("dspacepdfextract" + source.hashCode(), ".txt"); tempTextFile.deleteOnExit(); writer = new OutputStreamWriter(new FileOutputStream(tempTextFile)); } else { byteStream = new ByteArrayOutputStream(); writer = new OutputStreamWriter(byteStream); } try { pdfDoc = PDDocument.load(source); pts.writeText(pdfDoc, writer); } finally { try { if (pdfDoc != null) { pdfDoc.close(); } } catch (Exception e) { log.error("Error closing PDF file: " + e.getMessage(), e); } try { writer.close(); } catch (Exception e) { log.error("Error closing temporary extract file: " + e.getMessage(), e); } } if (useTemporaryFile) { return new FileInputStream(tempTextFile); } else { byte[] bytes = byteStream.toByteArray(); return new ByteArrayInputStream(bytes); } } catch (OutOfMemoryError oome) { log.error("Error parsing PDF document " + oome.getMessage(), oome); if (!ConfigurationManager.getBooleanProperty("pdffilter.skiponmemoryexception", false)) { throw oome; } } return null; }
From source file:org.dspace.disseminate.CitationDocument.java
License:BSD License
/**
 * Creates a cited document from the given bitstream. The item is taken from the
 * bitstream's parent object.
 * <p>
 * The process for adding a cover page is as follows:
 * </p>
 * <ol>
 *  <li> Load the source file and create a document to put our cover page into.</li>
 *  <li> Create the cover page and add content to it.</li>
 *  <li> Concatenate the cover page and the source document.</li>
 * </ol>
 * <p>
 * Both PDF documents are closed before returning, also when a step fails.
 * </p>
 *
 * @param bitstream The source bitstream being cited. This must be a PDF.
 * @return The temporary File that is the finished, cited document.
 * @throws IOException if loading, saving or closing a document fails
 *         (this includes {@link java.io.FileNotFoundException})
 * @throws SQLException declared for the bitstream access calls
 * @throws org.dspace.authorize.AuthorizeException declared for the bitstream access calls
 * @throws COSVisitorException declared by the PDFBox save call
 */
public File makeCitedDocument(Bitstream bitstream)
        throws IOException, SQLException, AuthorizeException, COSVisitorException {
    PDDocument document = new PDDocument();
    // Assigned by load() below; no placeholder instance is created that would leak.
    PDDocument sourceDocument = null;
    try {
        Item item = (Item) bitstream.getParentObject();
        sourceDocument = PDDocument.load(bitstream.retrieve());
        PDPage coverPage = new PDPage(PDPage.PAGE_SIZE_LETTER);
        generateCoverPage(document, coverPage, item);
        addCoverPageToDocument(document, sourceDocument, coverPage);

        document.save(tempDir.getAbsolutePath() + "/bitstream.cover.pdf");
        return new File(tempDir.getAbsolutePath() + "/bitstream.cover.pdf");
    } finally {
        // Close both documents even if closing the first one throws.
        try {
            if (sourceDocument != null) {
                sourceDocument.close();
            }
        } finally {
            document.close();
        }
    }
}
From source file:org.dspace.disseminate.CitationDocumentServiceImpl.java
License:BSD License
@Override public File makeCitedDocument(Context context, Bitstream bitstream) throws IOException, SQLException, AuthorizeException { PDDocument document = new PDDocument(); PDDocument sourceDocument = new PDDocument(); try {/*www .java2 s.com*/ Item item = (Item) bitstreamService.getParentObject(context, bitstream); sourceDocument = sourceDocument.load(bitstreamService.retrieve(context, bitstream)); PDPage coverPage = new PDPage(PDRectangle.LETTER); // TODO: needs to be configurable generateCoverPage(context, document, coverPage, item); addCoverPageToDocument(document, sourceDocument, coverPage); document.save(tempDir.getAbsolutePath() + "/bitstream.cover.pdf"); return new File(tempDir.getAbsolutePath() + "/bitstream.cover.pdf"); } finally { sourceDocument.close(); document.close(); } }