Example usage for org.apache.pdfbox.cos COSName JBIG2

Introduction

In this page you can find the example usage for org.apache.pdfbox.cos COSName JBIG2_DECODE.

Prototype

COSName JBIG2_DECODE

To view the source code for org.apache.pdfbox.cos COSName JBIG2_DECODE.

Click Source Link

Usage

From source file:cz.muni.pdfjbim.PdfImageExtractor.java

License:Apache License

/**
 * This method extracts images by going through all COSObjects pointed from xref table
 * @param is input stream containing PDF file
 * @param prefix output basename for images
 * @param password password for access to PDF if needed
 * @param pagesToProcess list of pages which should be processed if null given => processed all pages
 *      -- not working yet/*from  w w w  . j  a  va  2  s.c o m*/
 * @param binarize -- enables processing of nonbitonal images as well (LZW is still not
 *      processed because of output with inverted colors)
 * @throws PdfRecompressionException if problem to extract images from PDF
 */
public void extractImagesUsingPdfParser(InputStream is, String prefix, String password,
        Set<Integer> pagesToProcess, Boolean binarize) throws PdfRecompressionException {
    // checking arguments and setting appropriate variables
    if (binarize == null) {
        binarize = false;
    }

    log.debug("Extracting images (binarize set to {})", binarize);

    InputStream inputStream = null;
    if (password != null) {
        try (ByteArrayOutputStream decryptedOutputStream = new ByteArrayOutputStream()) {
            PdfReader reader = new PdfReader(is, password.getBytes(StandardCharsets.UTF_8));
            PdfStamper stamper = new PdfStamper(reader, decryptedOutputStream);
            if (stamper != null) {
                stamper.close();
            }
            inputStream = new ByteArrayInputStream(decryptedOutputStream.toByteArray());
        } catch (DocumentException ex) {
            throw new PdfRecompressionException(ex);
        } catch (IOException ex) {
            throw new PdfRecompressionException("Reading file caused exception", ex);
        }
    } else {
        inputStream = is;
    }

    PDFParser parser = null;
    COSDocument doc = null;
    try {
        parser = new PDFParser(inputStream);
        parser.parse();
        doc = parser.getDocument();

        List<COSObject> objs = doc.getObjectsByType(COSName.XOBJECT);
        if (objs != null) {
            for (COSObject obj : objs) {
                COSBase subtype = obj.getItem(COSName.SUBTYPE);
                if (subtype.toString().equalsIgnoreCase("COSName{Image}")) {
                    COSBase imageObj = obj.getObject();
                    COSBase cosNameObj = obj.getItem(COSName.NAME);
                    String key;
                    if (cosNameObj != null) {
                        String cosNameKey = cosNameObj.toString();
                        int startOfKey = cosNameKey.indexOf("{") + 1;
                        key = cosNameKey.substring(startOfKey, cosNameKey.length() - 1);
                    } else {
                        key = "im0";
                    }
                    int objectNum = obj.getObjectNumber().intValue();
                    int genNum = obj.getGenerationNumber().intValue();
                    PDXObjectImage image = (PDXObjectImage) PDXObjectImage.createXObject(imageObj);

                    PDStream pdStr = new PDStream(image.getCOSStream());
                    List<COSName> filters = pdStr.getFilters();

                    log.debug("Detected image with color depth: {} bits", image.getBitsPerComponent());
                    if (filters == null) {
                        continue;
                    }
                    log.debug("Detected filters: {}", filters.toString());

                    if ((image.getBitsPerComponent() > 1) && (!binarize)) {
                        log.info("It is not a bitonal image => skipping");
                        continue;
                    }

                    // at this moment for preventing bad output (bad coloring) from LZWDecode filter
                    if (filters.contains(COSName.LZW_DECODE)) {
                        log.info("This is LZWDecoded => skipping");
                        continue;
                    }

                    if (filters.contains(COSName.FLATE_DECODE)) {
                        log.debug("FlateDecoded image detected");
                    }

                    if (filters.contains(COSName.JBIG2_DECODE)) {
                        if (skipJBig2Images) {
                            log.warn("Allready compressed according to JBIG2 standard => skipping");
                            continue;
                        } else {
                            log.debug("JBIG2 image detected");
                        }
                    }

                    // detection of unsupported filters by pdfBox library
                    if (filters.contains(COSName.JPX_DECODE)) {
                        log.warn("Unsupported filter JPXDecode => skipping");
                        continue;
                    }

                    String name = getUniqueFileName(prefix, image.getSuffix());
                    log.info("Writing image: {}", name);
                    image.write2file(name);

                    PdfImageInformation pdfImageInfo = new PdfImageInformation(key, image.getWidth(),
                            image.getHeight(), objectNum, genNum);
                    originalImageInformations.add(pdfImageInfo);

                    namesOfImages.add(name + "." + image.getSuffix());

                }
            }
        }
    } catch (IOException ex) {
        Tools.deleteFilesFromList(namesOfImages);
        throw new PdfRecompressionException("Unable to parse PDF document", ex);
    } catch (Exception ex) {
        Tools.deleteFilesFromList(namesOfImages);
    } finally {
        if (doc != null) {
            try {
                doc.close();
            } catch (IOException ex) {
                throw new PdfRecompressionException(ex);
            }
        }
    }
}

From source file:cz.muni.pdfjbim.PdfImageExtractor.java

License:Apache License

/**
 * @deprecated -- do not use doesn't work properly yet
 * This method extracts images by going through PDF tree structure
 * @param pdfFile name of input PDF file
 * @param prefix //  w  w  w .j  a va 2 s .c  om
 * @param password password for access to PDF if needed
 * @param pagesToProcess list of pages which should be processed if null given => processed all pages
 *      -- not working yet
//    * @param silent -- if true error messages are not written to output otherwise they are
 * @param binarize -- enables processing of nonbitonal images as well (LZW is still not
 *      processed because of output with inverted colors)
 * @throws PdfRecompressionException if problem to extract images from PDF
 */
public void extractImagesUsingPdfObjectAccess(String pdfFile, String prefix, String password,
        Set<Integer> pagesToProcess, Boolean binarize) throws PdfRecompressionException {
    if (binarize == null) {
        binarize = false;
    }
    // checking arguments and setting appropriate variables
    if (pdfFile == null) {
        throw new IllegalArgumentException("pdfFile must be defined");
    }

    InputStream inputStream = null;
    if (password != null) {
        try {
            log.debug("PDF probably encrypted, trying to decrypt using given password {}", password);
            ByteArrayOutputStream decryptedOutputStream = new ByteArrayOutputStream();
            PdfReader reader = new PdfReader(pdfFile, password.getBytes(StandardCharsets.UTF_8));
            PdfStamper stamper = new PdfStamper(reader, decryptedOutputStream);
            stamper.close();
            inputStream = new ByteArrayInputStream(decryptedOutputStream.toByteArray());
        } catch (DocumentException ex) {
            throw new PdfRecompressionException(ex);
        } catch (IOException ex) {
            throw new PdfRecompressionException("Reading file caused exception", ex);
        }
    } else {
        try {
            inputStream = new FileInputStream(pdfFile);
        } catch (FileNotFoundException ex) {
            throw new PdfRecompressionException("File wasn't found", ex);
        }
    }

    // if prefix is not set then prefix set to name of pdf without .pdf
    // if pdfFile has unconsistent name (without suffix .pdf) and name longer than 4 chars then last for chars are removed
    // and this string set as prefix
    if ((prefix == null) && (pdfFile.length() > 4)) {
        prefix = pdfFile.substring(0, pdfFile.length() - 4);
    }

    PDFParser parser = null;
    PDDocument doc = null;
    try {
        parser = new PDFParser(inputStream);
        parser.parse();
        doc = parser.getPDDocument();

        AccessPermission accessPermissions = doc.getCurrentAccessPermission();

        if (!accessPermissions.canExtractContent()) {
            throw new PdfRecompressionException("Error: You do not have permission to extract images.");
        }

        // going page by page
        List pages = doc.getDocumentCatalog().getAllPages();
        for (int pageNumber = 0; pageNumber < pages.size(); pageNumber++) {
            if ((pagesToProcess != null) && (!pagesToProcess.contains(pageNumber + 1))) {
                continue;
            }
            PDPage page = (PDPage) pages.get(pageNumber);
            PDResources resources = page.getResources();
            Map xobjs = resources.getXObjects();

            if (xobjs != null) {
                Iterator xobjIter = xobjs.entrySet().iterator();
                while (xobjIter.hasNext()) {
                    Map.Entry entry = (Map.Entry) xobjIter.next();
                    String key = (String) entry.getKey();
                    PDXObject xobj = (PDXObject) entry.getValue();
                    Map images;
                    if (xobj instanceof PDXObjectForm) {
                        PDXObjectForm xform = (PDXObjectForm) xobj;
                        images = xform.getResources().getImages();
                    } else {
                        images = resources.getImages();
                    }

                    // reading images from each page and saving them to file
                    if (images != null) {
                        Iterator imageIter = images.entrySet().iterator();
                        while (imageIter.hasNext()) {
                            Map.Entry imEntry = (Map.Entry) imageIter.next();
                            String imKey = (String) imEntry.getKey();
                            PDXObjectImage image = (PDXObjectImage) imEntry.getValue();

                            PDStream pdStr = new PDStream(image.getCOSStream());
                            List<COSName> filters = pdStr.getFilters();

                            if (image.getBitsPerComponent() > 1 && !binarize) {
                                log.info("It is not a bitonal image => skipping");
                                continue;
                            }

                            // at this moment for preventing bad output (bad coloring) from LZWDecode filter
                            if (filters.contains(COSName.LZW_DECODE)) {
                                log.info("This is LZWDecoded => skipping");
                                continue;

                            }

                            if (filters.contains(COSName.JBIG2_DECODE)) {
                                if (skipJBig2Images) {
                                    log.warn("Allready compressed according to JBIG2 standard => skipping");
                                    continue;
                                } else {
                                    log.debug("JBIG2 image detected");
                                }
                            }

                            // detection of unsupported filters by pdfBox library
                            if (filters.contains(COSName.JPX_DECODE)) {
                                log.info("Unsupported filter JPXDecode => skipping");
                                continue;
                            }

                            COSObject cosObj = new COSObject(image.getCOSObject());
                            int objectNum = cosObj.getObjectNumber().intValue();
                            int genNum = cosObj.getGenerationNumber().intValue();
                            log.debug(objectNum + " " + genNum + " obj");

                            String name = getUniqueFileName(prefix + imKey, image.getSuffix());
                            log.debug("Writing image:" + name);
                            image.write2file(name);

                            PdfImageInformation pdfImageInfo = new PdfImageInformation(key, image.getWidth(),
                                    image.getHeight(), objectNum, genNum);
                            originalImageInformations.add(pdfImageInfo);
                            log.debug(pdfImageInfo.toString());

                            namesOfImages.add(name + "." + image.getSuffix());
                        }
                    }
                }
            }
        }
    } catch (IOException ex) {
        Tools.deleteFilesFromList(namesOfImages);
        throw new PdfRecompressionException("Unable to parse PDF document", ex);
    } catch (RuntimeException ex) {
        Tools.deleteFilesFromList(namesOfImages);
    } finally {
        if (doc != null) {
            try {
                doc.close();
            } catch (IOException ex) {
                throw new PdfRecompressionException(ex);
            }
        }
    }
}

From source file:org.apache.tika.parser.pdf.PDF2XHTMLPureJava.java

License:Apache License

private String getJBIG2Suffix(PDImageXObject image) {
    List<COSName> filters = image.getStream().getFilters();
    if (filters != null && filters.contains(COSName.JBIG2_DECODE)) {
        return "jb2";
    }// w  w w. j  a v a 2 s.  c  o  m
    return null;
}

Example usage for org.apache.pdfbox.cos COSName JBIG2_DECODE

Introduction

Prototype

Usage