List of usage examples for org.apache.pdfbox.cos COSObject getGenerationNumber
public int getGenerationNumber()
From source file:ReducePDFSize.java
License:Apache License
public static void main(String[] args) throws IOException { if (2 != args.length) { throw new RuntimeException("arg0 must be input file, org1 must be output file"); }// w ww.j a v a 2 s . c o m String in = args[0]; String out = args[1]; PDDocument doc = null; try { doc = PDDocument.load(new File(in)); doc.setAllSecurityToBeRemoved(true); for (COSObject cosObject : doc.getDocument().getObjects()) { COSBase base = cosObject.getObject(); // if it's a stream: decode it, then re-write it using FLATE_DECODE if (base instanceof COSStream) { COSStream stream = (COSStream) base; byte[] bytes; try { bytes = new PDStream(stream).toByteArray(); } catch (IOException ex) { // NOTE: original example code from PDFBox just logged & "continue;"d here, 'skipping' this stream. // If this type of failure ever happens, we can (perhaps) consider (re)ignoring this type of failure? // // IIUC then that will leave the original (non-decoded / non-flated) stream in place? throw new RuntimeException("can't serialize byte[] from: " + cosObject.getObjectNumber() + " " + cosObject.getGenerationNumber() + " obj: " + ex.getMessage(), ex); } stream.removeItem(COSName.FILTER); OutputStream streamOut = stream.createOutputStream(COSName.FLATE_DECODE); streamOut.write(bytes); streamOut.close(); } } doc.getDocumentCatalog(); doc.save(out); } finally { if (doc != null) { doc.close(); } } }
From source file:cz.muni.pdfjbim.PdfImageExtractor.java
License:Apache License
/** * This method extracts images by going through all COSObjects pointed from xref table * @param is input stream containing PDF file * @param prefix output basename for images * @param password password for access to PDF if needed * @param pagesToProcess list of pages which should be processed if null given => processed all pages * -- not working yet/*from ww w . j a va 2s.c o m*/ * @param binarize -- enables processing of nonbitonal images as well (LZW is still not * processed because of output with inverted colors) * @throws PdfRecompressionException if problem to extract images from PDF */ public void extractImagesUsingPdfParser(InputStream is, String prefix, String password, Set<Integer> pagesToProcess, Boolean binarize) throws PdfRecompressionException { // checking arguments and setting appropriate variables if (binarize == null) { binarize = false; } log.debug("Extracting images (binarize set to {})", binarize); InputStream inputStream = null; if (password != null) { try (ByteArrayOutputStream decryptedOutputStream = new ByteArrayOutputStream()) { PdfReader reader = new PdfReader(is, password.getBytes(StandardCharsets.UTF_8)); PdfStamper stamper = new PdfStamper(reader, decryptedOutputStream); if (stamper != null) { stamper.close(); } inputStream = new ByteArrayInputStream(decryptedOutputStream.toByteArray()); } catch (DocumentException ex) { throw new PdfRecompressionException(ex); } catch (IOException ex) { throw new PdfRecompressionException("Reading file caused exception", ex); } } else { inputStream = is; } PDFParser parser = null; COSDocument doc = null; try { parser = new PDFParser(inputStream); parser.parse(); doc = parser.getDocument(); List<COSObject> objs = doc.getObjectsByType(COSName.XOBJECT); if (objs != null) { for (COSObject obj : objs) { COSBase subtype = obj.getItem(COSName.SUBTYPE); if (subtype.toString().equalsIgnoreCase("COSName{Image}")) { COSBase imageObj = obj.getObject(); COSBase cosNameObj = obj.getItem(COSName.NAME); String key; 
if (cosNameObj != null) { String cosNameKey = cosNameObj.toString(); int startOfKey = cosNameKey.indexOf("{") + 1; key = cosNameKey.substring(startOfKey, cosNameKey.length() - 1); } else { key = "im0"; } int objectNum = obj.getObjectNumber().intValue(); int genNum = obj.getGenerationNumber().intValue(); PDXObjectImage image = (PDXObjectImage) PDXObjectImage.createXObject(imageObj); PDStream pdStr = new PDStream(image.getCOSStream()); List<COSName> filters = pdStr.getFilters(); log.debug("Detected image with color depth: {} bits", image.getBitsPerComponent()); if (filters == null) { continue; } log.debug("Detected filters: {}", filters.toString()); if ((image.getBitsPerComponent() > 1) && (!binarize)) { log.info("It is not a bitonal image => skipping"); continue; } // at this moment for preventing bad output (bad coloring) from LZWDecode filter if (filters.contains(COSName.LZW_DECODE)) { log.info("This is LZWDecoded => skipping"); continue; } if (filters.contains(COSName.FLATE_DECODE)) { log.debug("FlateDecoded image detected"); } if (filters.contains(COSName.JBIG2_DECODE)) { if (skipJBig2Images) { log.warn("Allready compressed according to JBIG2 standard => skipping"); continue; } else { log.debug("JBIG2 image detected"); } } // detection of unsupported filters by pdfBox library if (filters.contains(COSName.JPX_DECODE)) { log.warn("Unsupported filter JPXDecode => skipping"); continue; } String name = getUniqueFileName(prefix, image.getSuffix()); log.info("Writing image: {}", name); image.write2file(name); PdfImageInformation pdfImageInfo = new PdfImageInformation(key, image.getWidth(), image.getHeight(), objectNum, genNum); originalImageInformations.add(pdfImageInfo); namesOfImages.add(name + "." 
+ image.getSuffix()); } } } } catch (IOException ex) { Tools.deleteFilesFromList(namesOfImages); throw new PdfRecompressionException("Unable to parse PDF document", ex); } catch (Exception ex) { Tools.deleteFilesFromList(namesOfImages); } finally { if (doc != null) { try { doc.close(); } catch (IOException ex) { throw new PdfRecompressionException(ex); } } } }
From source file:cz.muni.pdfjbim.PdfImageExtractor.java
License:Apache License
/** * @deprecated -- do not use doesn't work properly yet * This method extracts images by going through PDF tree structure * @param pdfFile name of input PDF file * @param prefix /*from w ww.ja va 2 s. co m*/ * @param password password for access to PDF if needed * @param pagesToProcess list of pages which should be processed if null given => processed all pages * -- not working yet // * @param silent -- if true error messages are not written to output otherwise they are * @param binarize -- enables processing of nonbitonal images as well (LZW is still not * processed because of output with inverted colors) * @throws PdfRecompressionException if problem to extract images from PDF */ public void extractImagesUsingPdfObjectAccess(String pdfFile, String prefix, String password, Set<Integer> pagesToProcess, Boolean binarize) throws PdfRecompressionException { if (binarize == null) { binarize = false; } // checking arguments and setting appropriate variables if (pdfFile == null) { throw new IllegalArgumentException("pdfFile must be defined"); } InputStream inputStream = null; if (password != null) { try { log.debug("PDF probably encrypted, trying to decrypt using given password {}", password); ByteArrayOutputStream decryptedOutputStream = new ByteArrayOutputStream(); PdfReader reader = new PdfReader(pdfFile, password.getBytes(StandardCharsets.UTF_8)); PdfStamper stamper = new PdfStamper(reader, decryptedOutputStream); stamper.close(); inputStream = new ByteArrayInputStream(decryptedOutputStream.toByteArray()); } catch (DocumentException ex) { throw new PdfRecompressionException(ex); } catch (IOException ex) { throw new PdfRecompressionException("Reading file caused exception", ex); } } else { try { inputStream = new FileInputStream(pdfFile); } catch (FileNotFoundException ex) { throw new PdfRecompressionException("File wasn't found", ex); } } // if prefix is not set then prefix set to name of pdf without .pdf // if pdfFile has unconsistent name (without suffix .pdf) 
and name longer than 4 chars then last for chars are removed // and this string set as prefix if ((prefix == null) && (pdfFile.length() > 4)) { prefix = pdfFile.substring(0, pdfFile.length() - 4); } PDFParser parser = null; PDDocument doc = null; try { parser = new PDFParser(inputStream); parser.parse(); doc = parser.getPDDocument(); AccessPermission accessPermissions = doc.getCurrentAccessPermission(); if (!accessPermissions.canExtractContent()) { throw new PdfRecompressionException("Error: You do not have permission to extract images."); } // going page by page List pages = doc.getDocumentCatalog().getAllPages(); for (int pageNumber = 0; pageNumber < pages.size(); pageNumber++) { if ((pagesToProcess != null) && (!pagesToProcess.contains(pageNumber + 1))) { continue; } PDPage page = (PDPage) pages.get(pageNumber); PDResources resources = page.getResources(); Map xobjs = resources.getXObjects(); if (xobjs != null) { Iterator xobjIter = xobjs.entrySet().iterator(); while (xobjIter.hasNext()) { Map.Entry entry = (Map.Entry) xobjIter.next(); String key = (String) entry.getKey(); PDXObject xobj = (PDXObject) entry.getValue(); Map images; if (xobj instanceof PDXObjectForm) { PDXObjectForm xform = (PDXObjectForm) xobj; images = xform.getResources().getImages(); } else { images = resources.getImages(); } // reading images from each page and saving them to file if (images != null) { Iterator imageIter = images.entrySet().iterator(); while (imageIter.hasNext()) { Map.Entry imEntry = (Map.Entry) imageIter.next(); String imKey = (String) imEntry.getKey(); PDXObjectImage image = (PDXObjectImage) imEntry.getValue(); PDStream pdStr = new PDStream(image.getCOSStream()); List<COSName> filters = pdStr.getFilters(); if (image.getBitsPerComponent() > 1 && !binarize) { log.info("It is not a bitonal image => skipping"); continue; } // at this moment for preventing bad output (bad coloring) from LZWDecode filter if (filters.contains(COSName.LZW_DECODE)) { log.info("This is LZWDecoded => 
skipping"); continue; } if (filters.contains(COSName.JBIG2_DECODE)) { if (skipJBig2Images) { log.warn("Allready compressed according to JBIG2 standard => skipping"); continue; } else { log.debug("JBIG2 image detected"); } } // detection of unsupported filters by pdfBox library if (filters.contains(COSName.JPX_DECODE)) { log.info("Unsupported filter JPXDecode => skipping"); continue; } COSObject cosObj = new COSObject(image.getCOSObject()); int objectNum = cosObj.getObjectNumber().intValue(); int genNum = cosObj.getGenerationNumber().intValue(); log.debug(objectNum + " " + genNum + " obj"); String name = getUniqueFileName(prefix + imKey, image.getSuffix()); log.debug("Writing image:" + name); image.write2file(name); PdfImageInformation pdfImageInfo = new PdfImageInformation(key, image.getWidth(), image.getHeight(), objectNum, genNum); originalImageInformations.add(pdfImageInfo); log.debug(pdfImageInfo.toString()); namesOfImages.add(name + "." + image.getSuffix()); } } } } } } catch (IOException ex) { Tools.deleteFilesFromList(namesOfImages); throw new PdfRecompressionException("Unable to parse PDF document", ex); } catch (RuntimeException ex) { Tools.deleteFilesFromList(namesOfImages); } finally { if (doc != null) { try { doc.close(); } catch (IOException ex) { throw new PdfRecompressionException(ex); } } } }
From source file:cz.muni.pdfjbim.PdfImageProcessor.java
License:Apache License
/** * This method extracts images by going through all COSObjects pointed from xref table * @param is input stream containing PDF file * @param password password for access to PDF if needed * @param pagesToProcess list of pages which should be processed if null given => processed all pages * -- not working yet/*from w w w . j av a2 s . com*/ * @param binarize -- enables processing of nonbitonal images as well (LZW is still not * processed because of output with inverted colors) * @throws PdfRecompressionException if problem to extract images from PDF */ public void extractImagesUsingPdfParser(InputStream is, String prefix, String password, Set<Integer> pagesToProcess, Boolean binarize) throws PdfRecompressionException { // checking arguments and setting appropriate variables if (binarize == null) { binarize = false; } InputStream inputStream = null; if (password != null) { try { ByteArrayOutputStream decryptedOutputStream = null; PdfReader reader = new PdfReader(is, password.getBytes()); PdfStamper stamper = new PdfStamper(reader, decryptedOutputStream); stamper.close(); inputStream = new ByteArrayInputStream(decryptedOutputStream.toByteArray()); } catch (DocumentException ex) { throw new PdfRecompressionException(ex); } catch (IOException ex) { throw new PdfRecompressionException("Reading file caused exception", ex); } } else { inputStream = is; } PDFParser parser = null; COSDocument doc = null; try { parser = new PDFParser(inputStream); parser.parse(); doc = parser.getDocument(); List<COSObject> objs = doc.getObjectsByType(COSName.XOBJECT); if (objs != null) { for (COSObject obj : objs) { COSBase subtype = obj.getItem(COSName.SUBTYPE); if (subtype.toString().equalsIgnoreCase("COSName{Image}")) { COSBase imageObj = obj.getObject(); COSBase cosNameObj = obj.getItem(COSName.NAME); String key; if (cosNameObj != null) { String cosNameKey = cosNameObj.toString(); int startOfKey = cosNameKey.indexOf("{") + 1; key = cosNameKey.substring(startOfKey, cosNameKey.length() - 
1); } else { key = "im0"; } int objectNum = obj.getObjectNumber().intValue(); int genNum = obj.getGenerationNumber().intValue(); PDXObjectImage image = (PDXObjectImage) PDXObjectImage.createXObject(imageObj); PDStream pdStr = new PDStream(image.getCOSStream()); List filters = pdStr.getFilters(); if ((image.getBitsPerComponent() > 1) && (!binarize)) { log.info("It is not a bitonal image => skipping"); continue; } // at this moment for preventing bad output (bad coloring) from LZWDecode filter if (filters.contains(COSName.LZW_DECODE.getName())) { log.info("This is LZWDecoded => skipping"); continue; } // detection of unsupported filters by pdfBox library if (filters.contains("JBIG2Decode")) { log.warn("Allready compressed according to JBIG2 standard => skipping"); continue; } if (filters.contains("JPXDecode")) { log.warn("Unsupported filter JPXDecode => skipping"); continue; } String name = getUniqueFileName(prefix, image.getSuffix()); log.info("Writing image:" + name); image.write2file(name); PdfImageInformation pdfImageInfo = new PdfImageInformation(key, image.getWidth(), image.getHeight(), objectNum, genNum); originalImageInformations.add(pdfImageInfo); namesOfImages.add(name + "." + image.getSuffix()); } // } } } } catch (IOException ex) { throw new PdfRecompressionException("Unable to parse PDF document", ex); } finally { if (doc != null) { try { doc.close(); } catch (IOException ex) { throw new PdfRecompressionException(ex); } } } }
From source file:cz.muni.pdfjbim.PdfImageProcessor.java
License:Apache License
/** * @deprecated -- do not use doesn't work properly yet * This method extracts images by going through PDF tree structure * @param pdfFile name of input PDF file * @param password password for access to PDF if needed * @param pagesToProcess list of pages which should be processed if null given => processed all pages * -- not working yet/* w ww . j a v a 2 s. c o m*/ * @param silent -- if true error messages are not written to output otherwise they are * @param binarize -- enables processing of nonbitonal images as well (LZW is still not * processed because of output with inverted colors) * @throws PdfRecompressionException if problem to extract images from PDF */ public void extractImagesUsingPdfObjectAccess(String pdfFile, String password, Set<Integer> pagesToProcess, Boolean silent, Boolean binarize) throws PdfRecompressionException { if (binarize == null) { binarize = false; } // checking arguments and setting appropriate variables if (pdfFile == null) { throw new IllegalArgumentException(pdfFile); } String prefix = null; InputStream inputStream = null; if (password != null) { try { ByteArrayOutputStream decryptedOutputStream = null; PdfReader reader = new PdfReader(pdfFile, password.getBytes()); PdfStamper stamper = new PdfStamper(reader, decryptedOutputStream); stamper.close(); inputStream = new ByteArrayInputStream(decryptedOutputStream.toByteArray()); } catch (DocumentException ex) { throw new PdfRecompressionException(ex); } catch (IOException ex) { throw new PdfRecompressionException("Reading file caused exception", ex); } } else { try { inputStream = new FileInputStream(pdfFile); } catch (FileNotFoundException ex) { throw new PdfRecompressionException("File wasn't found", ex); } } // if prefix is not set then prefix set to name of pdf without .pdf // if pdfFile has unconsistent name (without suffix .pdf) and name longer than 4 chars then last for chars are removed // and this string set as prefix if ((prefix == null) && (pdfFile.length() > 4)) { prefix 
= pdfFile.substring(0, pdfFile.length() - 4); } PDFParser parser = null; PDDocument doc = null; try { parser = new PDFParser(inputStream); parser.parse(); doc = parser.getPDDocument(); AccessPermission accessPermissions = doc.getCurrentAccessPermission(); if (!accessPermissions.canExtractContent()) { throw new PdfRecompressionException("Error: You do not have permission to extract images."); } // going page by page List pages = doc.getDocumentCatalog().getAllPages(); for (int pageNumber = 0; pageNumber < pages.size(); pageNumber++) { if ((pagesToProcess != null) && (!pagesToProcess.contains(pageNumber + 1))) { continue; } PDPage page = (PDPage) pages.get(pageNumber); PDResources resources = page.getResources(); Map xobjs = resources.getXObjects(); if (xobjs != null) { Iterator xobjIter = xobjs.keySet().iterator(); while (xobjIter.hasNext()) { String key = (String) xobjIter.next(); PDXObject xobj = (PDXObject) xobjs.get(key); Map images; if (xobj instanceof PDXObjectForm) { PDXObjectForm xform = (PDXObjectForm) xobj; images = xform.getResources().getImages(); } else { images = resources.getImages(); } // reading images from each page and saving them to file if (images != null) { Iterator imageIter = images.keySet().iterator(); while (imageIter.hasNext()) { String imKey = (String) imageIter.next(); PDXObjectImage image = (PDXObjectImage) images.get(imKey); PDStream pdStr = new PDStream(image.getCOSStream()); List filters = pdStr.getFilters(); if (image.getBitsPerComponent() > 1) { log.info("It is not a bitonal image => skipping"); continue; } // at this moment for preventing bad output (bad coloring) from LZWDecode filter if (filters.contains(COSName.LZW_DECODE.getName())) { log.info("This is LZWDecoded => skipping"); continue; } // detection of unsupported filters by pdfBox library if (filters.contains("JBIG2Decode")) { log.info("Allready compressed according to JBIG2 standard => skipping"); continue; } if (filters.contains("JPXDecode")) { log.info("Unsupported 
filter JPXDecode => skipping"); continue; } COSObject cosObj = new COSObject(image.getCOSObject()); int objectNum = cosObj.getObjectNumber().intValue(); int genNum = cosObj.getGenerationNumber().intValue(); log.debug(objectNum + " " + genNum + " obj"); String name = getUniqueFileName(prefix + imKey, image.getSuffix()); log.debug("Writing image:" + name); image.write2file(name); PdfImageInformation pdfImageInfo = new PdfImageInformation(key, image.getWidth(), image.getHeight(), objectNum, genNum); originalImageInformations.add(pdfImageInfo); log.debug(pdfImageInfo.toString()); namesOfImages.add(name + "." + image.getSuffix()); } } } } } } catch (IOException ex) { throw new PdfRecompressionException("Unable to parse PDF document", ex); } finally { if (doc != null) { try { doc.close(); } catch (IOException ex) { throw new PdfRecompressionException(ex); } } } }
From source file:org.apache.fop.render.pdf.pdfbox.PDFBoxAdapter.java
License:Apache License
private Object readCOSObject(COSObject object, Collection exclude) throws IOException { if (log.isTraceEnabled()) { log.trace("Cloning indirect object: " + object.getObjectNumber() + " " + object.getGenerationNumber()); }/*from w w w . ja va 2 s. c o m*/ Object obj = cloneForNewDocument(object.getObject(), object, exclude); if (obj instanceof PDFObject) { PDFObject pdfobj = (PDFObject) obj; //pdfDoc.registerObject(pdfobj); if (!pdfobj.hasObjectNumber()) { throw new IllegalStateException("PDF object was not registered!"); } if (log.isTraceEnabled()) { log.trace("Object registered: " + pdfobj.getObjectNumber() + " " + pdfobj.getGeneration() + " for COSObject: " + object.getObjectNumber() + " " + object.getGenerationNumber()); } } return obj; }
From source file:org.apache.fop.render.pdf.pdfbox.PDFBoxAdapter.java
License:Apache License
/**
 * Derives a key for the given base object.
 *
 * @param base the object to derive a key for
 * @return for a {@link COSObject}, the string {@code "<object number> <generation number>"};
 *         for a {@link COSDictionary}, the dictionary itself; otherwise {@code null}
 */
private Object getBaseKey(Object base) {
    if (base instanceof COSObject) {
        COSObject indirect = (COSObject) base;
        return indirect.getObjectNumber() + " " + indirect.getGenerationNumber();
    }
    // dictionaries act as their own key; anything else has none
    return (base instanceof COSDictionary) ? base : null;
}
From source file:org.apache.padaf.preflight.RetrieveMissingStream.java
License:Apache License
/**
 * Loads a PDF, collects the object keys of all stream objects, and then walks every page
 * reading its contents stream and its {@code Contents} dictionary entry.
 *
 * <p>The collected keys and the per-page values are not used further in this block; an empty
 * line is printed for each page.
 *
 * @param args a single element: the path of the PDF file to inspect
 * @throws Exception if the file cannot be opened or the document cannot be read
 */
public static void main(String[] args) throws Exception {
    if (args.length != 1) {
        System.err.println("usage : RetrieveMissingStream file");
        System.exit(233);
    }

    HashSet<COSObjectKey> listOfKeys = new HashSet<COSObjectKey>();

    FileInputStream input = new FileInputStream(args[0]);
    PDDocument document = null;
    try {
        document = PDDocument.load(input);
        List<COSObject> lCosObj = document.getDocument().getObjects();
        for (COSObject cosObject : lCosObj) {
            if (cosObject.getObject() instanceof COSStream) {
                listOfKeys.add(new COSObjectKey(cosObject.getObjectNumber().intValue(),
                        cosObject.getGenerationNumber().intValue()));
            }
        }

        PDDocumentCatalog catalog = document.getDocumentCatalog();
        List<?> pages = catalog.getAllPages();
        for (int i = 0; i < pages.size(); ++i) {
            PDPage pdp = (PDPage) pages.get(i);
            // NOTE(review): both values are fetched but never inspected — the comparison
            // against listOfKeys appears to be missing; confirm intended behaviour.
            PDStream pdStream = pdp.getContents();
            COSBase b = pdp.getCOSDictionary().getItem(COSName.getPDFName("Contents"));
            System.out.println();
        }
    } finally {
        // release the document and the underlying file handle even when reading fails
        try {
            if (document != null) {
                document.close();
            }
        } finally {
            input.close();
        }
    }
}