List of usage examples for org.apache.pdfbox.cos COSDocument getObjectsByType
public List<COSObject> getObjectsByType(COSName type)
From source file:cz.muni.pdfjbim.PdfImageExtractor.java
License:Apache License
/** * This method extracts images by going through all COSObjects pointed from xref table * @param is input stream containing PDF file * @param prefix output basename for images * @param password password for access to PDF if needed * @param pagesToProcess list of pages which should be processed if null given => processed all pages * -- not working yet/*from w w w . ja va2s. c o m*/ * @param binarize -- enables processing of nonbitonal images as well (LZW is still not * processed because of output with inverted colors) * @throws PdfRecompressionException if problem to extract images from PDF */ public void extractImagesUsingPdfParser(InputStream is, String prefix, String password, Set<Integer> pagesToProcess, Boolean binarize) throws PdfRecompressionException { // checking arguments and setting appropriate variables if (binarize == null) { binarize = false; } log.debug("Extracting images (binarize set to {})", binarize); InputStream inputStream = null; if (password != null) { try (ByteArrayOutputStream decryptedOutputStream = new ByteArrayOutputStream()) { PdfReader reader = new PdfReader(is, password.getBytes(StandardCharsets.UTF_8)); PdfStamper stamper = new PdfStamper(reader, decryptedOutputStream); if (stamper != null) { stamper.close(); } inputStream = new ByteArrayInputStream(decryptedOutputStream.toByteArray()); } catch (DocumentException ex) { throw new PdfRecompressionException(ex); } catch (IOException ex) { throw new PdfRecompressionException("Reading file caused exception", ex); } } else { inputStream = is; } PDFParser parser = null; COSDocument doc = null; try { parser = new PDFParser(inputStream); parser.parse(); doc = parser.getDocument(); List<COSObject> objs = doc.getObjectsByType(COSName.XOBJECT); if (objs != null) { for (COSObject obj : objs) { COSBase subtype = obj.getItem(COSName.SUBTYPE); if (subtype.toString().equalsIgnoreCase("COSName{Image}")) { COSBase imageObj = obj.getObject(); COSBase cosNameObj = obj.getItem(COSName.NAME); String key; if (cosNameObj != null) { String cosNameKey = cosNameObj.toString(); int startOfKey = cosNameKey.indexOf("{") + 1; key = cosNameKey.substring(startOfKey, cosNameKey.length() - 1); } else { key = "im0"; } int objectNum = obj.getObjectNumber().intValue(); int genNum = obj.getGenerationNumber().intValue(); PDXObjectImage image = (PDXObjectImage) PDXObjectImage.createXObject(imageObj); PDStream pdStr = new PDStream(image.getCOSStream()); List<COSName> filters = pdStr.getFilters(); log.debug("Detected image with color depth: {} bits", image.getBitsPerComponent()); if (filters == null) { continue; } log.debug("Detected filters: {}", filters.toString()); if ((image.getBitsPerComponent() > 1) && (!binarize)) { log.info("It is not a bitonal image => skipping"); continue; } // at this moment for preventing bad output (bad coloring) from LZWDecode filter if (filters.contains(COSName.LZW_DECODE)) { log.info("This is LZWDecoded => skipping"); continue; } if (filters.contains(COSName.FLATE_DECODE)) { log.debug("FlateDecoded image detected"); } if (filters.contains(COSName.JBIG2_DECODE)) { if (skipJBig2Images) { log.warn("Allready compressed according to JBIG2 standard => skipping"); continue; } else { log.debug("JBIG2 image detected"); } } // detection of unsupported filters by pdfBox library if (filters.contains(COSName.JPX_DECODE)) { log.warn("Unsupported filter JPXDecode => skipping"); continue; } String name = getUniqueFileName(prefix, image.getSuffix()); log.info("Writing image: {}", name); image.write2file(name); PdfImageInformation pdfImageInfo = new PdfImageInformation(key, image.getWidth(), image.getHeight(), objectNum, genNum); originalImageInformations.add(pdfImageInfo); namesOfImages.add(name + "." + image.getSuffix()); } } } } catch (IOException ex) { Tools.deleteFilesFromList(namesOfImages); throw new PdfRecompressionException("Unable to parse PDF document", ex); } catch (Exception ex) { Tools.deleteFilesFromList(namesOfImages); } finally { if (doc != null) { try { doc.close(); } catch (IOException ex) { throw new PdfRecompressionException(ex); } } } }
From source file:cz.muni.pdfjbim.PdfImageProcessor.java
License:Apache License
/** * This method extracts images by going through all COSObjects pointed from xref table * @param is input stream containing PDF file * @param password password for access to PDF if needed * @param pagesToProcess list of pages which should be processed if null given => processed all pages * -- not working yet//www . jav a2s.co m * @param binarize -- enables processing of nonbitonal images as well (LZW is still not * processed because of output with inverted colors) * @throws PdfRecompressionException if problem to extract images from PDF */ public void extractImagesUsingPdfParser(InputStream is, String prefix, String password, Set<Integer> pagesToProcess, Boolean binarize) throws PdfRecompressionException { // checking arguments and setting appropriate variables if (binarize == null) { binarize = false; } InputStream inputStream = null; if (password != null) { try { ByteArrayOutputStream decryptedOutputStream = null; PdfReader reader = new PdfReader(is, password.getBytes()); PdfStamper stamper = new PdfStamper(reader, decryptedOutputStream); stamper.close(); inputStream = new ByteArrayInputStream(decryptedOutputStream.toByteArray()); } catch (DocumentException ex) { throw new PdfRecompressionException(ex); } catch (IOException ex) { throw new PdfRecompressionException("Reading file caused exception", ex); } } else { inputStream = is; } PDFParser parser = null; COSDocument doc = null; try { parser = new PDFParser(inputStream); parser.parse(); doc = parser.getDocument(); List<COSObject> objs = doc.getObjectsByType(COSName.XOBJECT); if (objs != null) { for (COSObject obj : objs) { COSBase subtype = obj.getItem(COSName.SUBTYPE); if (subtype.toString().equalsIgnoreCase("COSName{Image}")) { COSBase imageObj = obj.getObject(); COSBase cosNameObj = obj.getItem(COSName.NAME); String key; if (cosNameObj != null) { String cosNameKey = cosNameObj.toString(); int startOfKey = cosNameKey.indexOf("{") + 1; key = cosNameKey.substring(startOfKey, cosNameKey.length() - 1); } else { key = "im0"; } int objectNum = obj.getObjectNumber().intValue(); int genNum = obj.getGenerationNumber().intValue(); PDXObjectImage image = (PDXObjectImage) PDXObjectImage.createXObject(imageObj); PDStream pdStr = new PDStream(image.getCOSStream()); List filters = pdStr.getFilters(); if ((image.getBitsPerComponent() > 1) && (!binarize)) { log.info("It is not a bitonal image => skipping"); continue; } // at this moment for preventing bad output (bad coloring) from LZWDecode filter if (filters.contains(COSName.LZW_DECODE.getName())) { log.info("This is LZWDecoded => skipping"); continue; } // detection of unsupported filters by pdfBox library if (filters.contains("JBIG2Decode")) { log.warn("Allready compressed according to JBIG2 standard => skipping"); continue; } if (filters.contains("JPXDecode")) { log.warn("Unsupported filter JPXDecode => skipping"); continue; } String name = getUniqueFileName(prefix, image.getSuffix()); log.info("Writing image:" + name); image.write2file(name); PdfImageInformation pdfImageInfo = new PdfImageInformation(key, image.getWidth(), image.getHeight(), objectNum, genNum); originalImageInformations.add(pdfImageInfo); namesOfImages.add(name + "." + image.getSuffix()); } // } } } } catch (IOException ex) { throw new PdfRecompressionException("Unable to parse PDF document", ex); } finally { if (doc != null) { try { doc.close(); } catch (IOException ex) { throw new PdfRecompressionException(ex); } } } }
From source file:org.apache.padaf.preflight.helpers.TrailerValidationHelper.java
License:Apache License
/** * Accesses and compares First and Last trailers for a PDF version higher than 1.4. * //from w w w .j av a 2 s. com * @param handler * @param result */ protected void checkTrailersForLinearizedPDF15(DocumentHandler handler, List<ValidationError> result) { PDDocument pdfDoc = handler.getDocument(); try { COSDocument cosDocument = pdfDoc.getDocument(); List<COSObject> xrefs = cosDocument.getObjectsByType(COSName.XREF); if (xrefs.isEmpty()) { // no XRef CosObject, may by this pdf file used the PDF 1.4 syntaxe checkTrailersForLinearizedPDF14(handler, result); } else { long min = Long.MAX_VALUE; long max = Long.MIN_VALUE; COSDictionary firstTrailer = null; COSDictionary lastTrailer = null; // Search First and Last trailers according to offset position. for (COSObject co : xrefs) { long offset = cosDocument.getXrefTable().get(new COSObjectKey(co)); if (offset < min) { min = offset; firstTrailer = (COSDictionary) co.getObject(); } if (offset > max) { max = offset; lastTrailer = (COSDictionary) co.getObject(); } } checkMainTrailer(pdfDoc.getDocument(), firstTrailer, result); if (!compareIds(firstTrailer, lastTrailer, pdfDoc.getDocument())) { result.add(new ValidationResult.ValidationError( ValidationConstants.ERROR_SYNTAX_TRAILER_ID_CONSISTENCY, "ID is different in the first and the last trailer")); } } } catch (IOException e) { result.add(new ValidationResult.ValidationError(ValidationConstants.ERROR_SYNTAX_TRAILER, "Unable to check PDF Trailers due to : " + e.getMessage())); } }
From source file:serock.pdfpagerestorer.PdfPageRestorer.java
License:Open Source License
private static void addPages(final PDDocument pdDoc, final COSDocument cosDoc) throws IOException { final List<COSObject> pageObjects = cosDoc.getObjectsByType(COSName.PAGE); for (COSObject pageObject : pageObjects) { final COSBase baseObject = pageObject.getObject(); final COSDictionary pageDictionary = (COSDictionary) baseObject; final PDPage page = new PDPage(pageDictionary); pdDoc.addPage(page);/*from www . j a v a2 s. com*/ } }