List of usage examples for the org.apache.pdfbox.io.RandomAccessFile constructor RandomAccessFile(File, String)
public RandomAccessFile(File file, String mode) throws FileNotFoundException
From source file:org.apache.tika.parser.pdf.PDFParser.java
License:Apache License
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { PDDocument pdfDocument = null;/*from ww w. j a v a 2 s.co m*/ TemporaryResources tmp = new TemporaryResources(); //config from context, or default if not set via context PDFParserConfig localConfig = context.get(PDFParserConfig.class, defaultConfig); String password = ""; try { // PDFBox can process entirely in memory, or can use a temp file // for unpacked / processed resources // Decide which to do based on if we're reading from a file or not already TikaInputStream tstream = TikaInputStream.cast(stream); password = getPassword(metadata, context); if (tstream != null && tstream.hasFile()) { // File based, take that as a cue to use a temporary file RandomAccess scratchFile = new RandomAccessFile(tmp.createTemporaryFile(), "rw"); if (localConfig.getUseNonSequentialParser() == true) { pdfDocument = PDDocument.loadNonSeq(new CloseShieldInputStream(stream), scratchFile, password); } else { pdfDocument = PDDocument.load(new CloseShieldInputStream(stream), scratchFile, true); } } else { // Go for the normal, stream based in-memory parsing if (localConfig.getUseNonSequentialParser() == true) { pdfDocument = PDDocument.loadNonSeq(new CloseShieldInputStream(stream), new RandomAccessBuffer(), password); } else { pdfDocument = PDDocument.load(new CloseShieldInputStream(stream), true); } } metadata.set("pdf:encrypted", Boolean.toString(pdfDocument.isEncrypted())); //if using the classic parser and the doc is encrypted, we must manually decrypt if (!localConfig.getUseNonSequentialParser() && pdfDocument.isEncrypted()) { pdfDocument.decrypt(password); } metadata.set(Metadata.CONTENT_TYPE, "application/pdf"); extractMetadata(pdfDocument, metadata); AccessChecker checker = localConfig.getAccessChecker(); checker.check(metadata); if (handler != null) { if (shouldHandleXFAOnly(pdfDocument, localConfig)) { handleXFAOnly(pdfDocument, 
handler, metadata); } else { PDF2XHTML.process(pdfDocument, handler, context, metadata, localConfig); } } } catch (CryptographyException e) { //seq parser throws CryptographyException for bad password throw new EncryptedDocumentException(e); } catch (IOException e) { //nonseq parser throws IOException for bad password //At the Tika level, we want the same exception to be thrown if (e.getMessage() != null && e.getMessage().contains("Error (CryptographyException)")) { metadata.set("pdf:encrypted", Boolean.toString(true)); throw new EncryptedDocumentException(e); } //rethrow any other IOExceptions throw e; } finally { if (pdfDocument != null) { pdfDocument.close(); } tmp.dispose(); //TODO: once we migrate to PDFBox 2.0, remove this (PDFBOX-2200) PDFont.clearResources(); } }
From source file:org.wangwei.pdf.AddImageToPDF.java
License:Apache License
/** * Add an image to an existing PDF document. * * @param inputFile The input PDF to add the image to. * @param image The filename of the image to put in the PDF. * @param outputFile The file to write to the pdf to. * @throws IOException If there is an error writing the data. * @throws COSVisitorException If there is an error writing the PDF. *//* w w w . j av a2s.co m*/ public void createPDFFromImage(String inputFile, String image, String outputFile) throws IOException, COSVisitorException { // the document PDDocument doc = null; try { doc = PDDocument.load(inputFile); // we will add the image to the first page. PDPage page = (PDPage) doc.getDocumentCatalog().getAllPages().get(0); PDXObjectImage ximage = null; if (image.toLowerCase().endsWith(".jpg")) { ximage = new PDJpeg(doc, new FileInputStream(image)); } else if (image.toLowerCase().endsWith(".tif") || image.toLowerCase().endsWith(".tiff")) { ximage = new PDCcitt(doc, new RandomAccessFile(new File(image), "r")); } else { BufferedImage awtImage = ImageIO.read(new File(image)); ximage = new PDPixelMap(doc, awtImage); } PDPageContentStream contentStream = new PDPageContentStream(doc, page, true, true); // contentStream.drawImage(ximage, 20, 20 ); // better method inspired by http://stackoverflow.com/a/22318681/535646 float scale = 1f; // reduce this value if the image is too large contentStream.drawXObject(ximage, 20, 20, ximage.getWidth() * scale, ximage.getHeight() * scale); contentStream.close(); doc.save(outputFile); } finally { if (doc != null) { doc.close(); } } }
From source file:pdf.to.info.PDF.java
/** * Creating a PDDocument object/*ww w . j ava 2s.com*/ * * @param filePath * @return * @throws java.io.IOException */ private PDDocument ReadPDDoc(String filePath) throws IOException { File file = new File(filePath); PDFParser parser = new PDFParser(new RandomAccessFile(file, "r")); // update for PDFBox V 2.0 parser.parse(); COSDocument cosDoc = parser.getDocument(); PDFTextStripper pdfStripper = new PDFTextStripper(); PDDocument pdDoc = new PDDocument(cosDoc); pdDoc.getNumberOfPages(); pdfStripper.setStartPage(1); pdfStripper.setEndPage(1); // for reading all pages of pdf file // pdfStripper.setEndPage(pdDoc.getNumberOfPages()); return pdDoc; }
From source file:uk.bl.dpt.qa.flint.wrappers.PDFBoxWrapper.java
License:Apache License
/** * Check if a PDF file has DRM or not/* ww w .j ava2s.c om*/ * @param pFile file to check * @return whether the file is had DRM or not */ public boolean hasDRM(File pFile) { boolean ret = false; File tmp = null; try { System.setProperty("org.apache.pdfbox.baseParser.pushBackSize", "1024768"); // NOTE: we use loadNonSeq here as it is the latest parser // load() and parser.parse() have hung on test files tmp = File.createTempFile("flint-", ".tmp"); tmp.deleteOnExit(); RandomAccess scratchFile = new RandomAccessFile(tmp, "rw"); PDDocument doc = PDDocument.loadNonSeq(new FileInputStream(pFile), scratchFile); ret = doc.isEncrypted(); doc.close(); } catch (IOException e) { // This may occur when a suitable security handler cannot be found if (e.getMessage().contains("BadSecurityHandlerException")) { // if this happens then there must be some sort of DRM here ret = true; } } catch (Exception e) { e.printStackTrace(); // See comments in https://issues.apache.org/jira/browse/PDFBOX-1757 // PDFBox state that these files have errors and their parser is correct // The only way to find out that the parser doesn't like it is to catch // a general Exception. // If we reach this point then we have no idea of whether the file contains // DRM or not. Return false and hope it is detected elsewhere. ret = false; } finally { if (tmp != null) tmp.delete(); } return ret; }
From source file:uk.bl.dpt.qa.flint.wrappers.PDFBoxWrapper.java
License:Apache License
/** * Check for encryption with Apache PDFBox * -> query the encryption dictionary (might allow more granular checks of protection) * @param pPDF pdf file to check//from w w w .j a v a 2s . com * @return whether or not the file has DRM */ public boolean hasDRMGranular(File pPDF) { boolean ret = false; File tmp = null; try { System.setProperty("org.apache.pdfbox.baseParser.pushBackSize", "1024768"); // NOTE: we use loadNonSeq here as it is the latest parser // load() and parser.parse() have hung on test files tmp = File.createTempFile("flint-", ".tmp"); tmp.deleteOnExit(); RandomAccess scratchFile = new RandomAccessFile(tmp, "rw"); PDDocument doc = PDDocument.loadNonSeq(new FileInputStream(pPDF), scratchFile); PDEncryptionDictionary dict = doc.getEncryptionDictionary(); if (dict != null) { //print encryption dictionary // for(COSName key:dict.keySet()) { // System.out.print(key.getName()); // String value = dict.getString(key); // if(value!=null){ // System.out.println(": "+value); // } else { // System.out.println(": "+dict.getLong(key)); // } // } //this feaure in pdfbox is currently broken, see: https://issues.apache.org/jira/browse/PDFBOX-1651 //AccessPermission perms = parser.getPDDocument().getCurrentAccessPermission(); //this is a work around; creating a new object from the data AccessPermission perms = new AccessPermission(dict.getPermissions());//.getInt("P")); boolean debug = true; if (debug) { System.out.println("canAssembleDocument() : " + perms.canAssembleDocument()); System.out.println("canExtractContent() : " + perms.canExtractContent()); System.out.println("canExtractForAccessibility() : " + perms.canExtractForAccessibility()); System.out.println("canFillInForm() : " + perms.canFillInForm()); System.out.println("canModify() : " + perms.canModify()); System.out.println("canModifyAnnotations() : " + perms.canModifyAnnotations()); System.out.println("canPrint() : " + perms.canPrint()); System.out.println("canPrintDegraded() : " + 
perms.canPrintDegraded()); System.out.println("isOwnerPermission() : " + perms.isOwnerPermission()); System.out.println("isReadOnly() : " + perms.isReadOnly()); } } doc.close(); } catch (Exception e) { LOGGER.warn("Exception while doing granular DRM checks leads to invalidity: {}", e); } finally { if (tmp != null) tmp.delete(); } return ret; }