Example usage for org.apache.pdfbox.pdmodel.common PDStream getFilters

List of usage examples for org.apache.pdfbox.pdmodel.common PDStream getFilters

Introduction

In this page you can find the example usage for org.apache.pdfbox.pdmodel.common PDStream getFilters.

Prototype

public List<COSName> getFilters() 

Source Link

Document

This will get the list of filters that are associated with this stream.

Usage

From source file:cz.muni.pdfjbim.PdfImageExtractor.java

License:Apache License

/**
 * This method extracts images by going through all COSObjects pointed from xref table
 * @param is input stream containing PDF file
 * @param prefix output basename for images
 * @param password password for access to PDF if needed
 * @param pagesToProcess list of pages which should be processed if null given => processed all pages
 *      -- not working yet//from w ww .ja va2 s  .c o  m
 * @param binarize -- enables processing of nonbitonal images as well (LZW is still not
 *      processed because of output with inverted colors)
 * @throws PdfRecompressionException if problem to extract images from PDF
 */
public void extractImagesUsingPdfParser(InputStream is, String prefix, String password,
        Set<Integer> pagesToProcess, Boolean binarize) throws PdfRecompressionException {
    // checking arguments and setting appropriate variables
    if (binarize == null) {
        binarize = false;
    }

    log.debug("Extracting images (binarize set to {})", binarize);

    InputStream inputStream = null;
    if (password != null) {
        try (ByteArrayOutputStream decryptedOutputStream = new ByteArrayOutputStream()) {
            PdfReader reader = new PdfReader(is, password.getBytes(StandardCharsets.UTF_8));
            PdfStamper stamper = new PdfStamper(reader, decryptedOutputStream);
            if (stamper != null) {
                stamper.close();
            }
            inputStream = new ByteArrayInputStream(decryptedOutputStream.toByteArray());
        } catch (DocumentException ex) {
            throw new PdfRecompressionException(ex);
        } catch (IOException ex) {
            throw new PdfRecompressionException("Reading file caused exception", ex);
        }
    } else {
        inputStream = is;
    }

    PDFParser parser = null;
    COSDocument doc = null;
    try {
        parser = new PDFParser(inputStream);
        parser.parse();
        doc = parser.getDocument();

        List<COSObject> objs = doc.getObjectsByType(COSName.XOBJECT);
        if (objs != null) {
            for (COSObject obj : objs) {
                COSBase subtype = obj.getItem(COSName.SUBTYPE);
                if (subtype.toString().equalsIgnoreCase("COSName{Image}")) {
                    COSBase imageObj = obj.getObject();
                    COSBase cosNameObj = obj.getItem(COSName.NAME);
                    String key;
                    if (cosNameObj != null) {
                        String cosNameKey = cosNameObj.toString();
                        int startOfKey = cosNameKey.indexOf("{") + 1;
                        key = cosNameKey.substring(startOfKey, cosNameKey.length() - 1);
                    } else {
                        key = "im0";
                    }
                    int objectNum = obj.getObjectNumber().intValue();
                    int genNum = obj.getGenerationNumber().intValue();
                    PDXObjectImage image = (PDXObjectImage) PDXObjectImage.createXObject(imageObj);

                    PDStream pdStr = new PDStream(image.getCOSStream());
                    List<COSName> filters = pdStr.getFilters();

                    log.debug("Detected image with color depth: {} bits", image.getBitsPerComponent());
                    if (filters == null) {
                        continue;
                    }
                    log.debug("Detected filters: {}", filters.toString());

                    if ((image.getBitsPerComponent() > 1) && (!binarize)) {
                        log.info("It is not a bitonal image => skipping");
                        continue;
                    }

                    // at this moment for preventing bad output (bad coloring) from LZWDecode filter
                    if (filters.contains(COSName.LZW_DECODE)) {
                        log.info("This is LZWDecoded => skipping");
                        continue;
                    }

                    if (filters.contains(COSName.FLATE_DECODE)) {
                        log.debug("FlateDecoded image detected");
                    }

                    if (filters.contains(COSName.JBIG2_DECODE)) {
                        if (skipJBig2Images) {
                            log.warn("Allready compressed according to JBIG2 standard => skipping");
                            continue;
                        } else {
                            log.debug("JBIG2 image detected");
                        }
                    }

                    // detection of unsupported filters by pdfBox library
                    if (filters.contains(COSName.JPX_DECODE)) {
                        log.warn("Unsupported filter JPXDecode => skipping");
                        continue;
                    }

                    String name = getUniqueFileName(prefix, image.getSuffix());
                    log.info("Writing image: {}", name);
                    image.write2file(name);

                    PdfImageInformation pdfImageInfo = new PdfImageInformation(key, image.getWidth(),
                            image.getHeight(), objectNum, genNum);
                    originalImageInformations.add(pdfImageInfo);

                    namesOfImages.add(name + "." + image.getSuffix());

                }
            }
        }
    } catch (IOException ex) {
        Tools.deleteFilesFromList(namesOfImages);
        throw new PdfRecompressionException("Unable to parse PDF document", ex);
    } catch (Exception ex) {
        Tools.deleteFilesFromList(namesOfImages);
    } finally {
        if (doc != null) {
            try {
                doc.close();
            } catch (IOException ex) {
                throw new PdfRecompressionException(ex);
            }
        }
    }
}

From source file:cz.muni.pdfjbim.PdfImageExtractor.java

License:Apache License

/**
 * @deprecated -- do not use doesn't work properly yet
 * This method extracts images by going through PDF tree structure
 * @param pdfFile name of input PDF file
 * @param prefix /*w  w w .  j ava  2s  .c o  m*/
 * @param password password for access to PDF if needed
 * @param pagesToProcess list of pages which should be processed if null given => processed all pages
 *      -- not working yet
//    * @param silent -- if true error messages are not written to output otherwise they are
 * @param binarize -- enables processing of nonbitonal images as well (LZW is still not
 *      processed because of output with inverted colors)
 * @throws PdfRecompressionException if problem to extract images from PDF
 */
public void extractImagesUsingPdfObjectAccess(String pdfFile, String prefix, String password,
        Set<Integer> pagesToProcess, Boolean binarize) throws PdfRecompressionException {
    if (binarize == null) {
        binarize = false;
    }
    // checking arguments and setting appropriate variables
    if (pdfFile == null) {
        throw new IllegalArgumentException("pdfFile must be defined");
    }

    InputStream inputStream = null;
    if (password != null) {
        try {
            log.debug("PDF probably encrypted, trying to decrypt using given password {}", password);
            ByteArrayOutputStream decryptedOutputStream = new ByteArrayOutputStream();
            PdfReader reader = new PdfReader(pdfFile, password.getBytes(StandardCharsets.UTF_8));
            PdfStamper stamper = new PdfStamper(reader, decryptedOutputStream);
            stamper.close();
            inputStream = new ByteArrayInputStream(decryptedOutputStream.toByteArray());
        } catch (DocumentException ex) {
            throw new PdfRecompressionException(ex);
        } catch (IOException ex) {
            throw new PdfRecompressionException("Reading file caused exception", ex);
        }
    } else {
        try {
            inputStream = new FileInputStream(pdfFile);
        } catch (FileNotFoundException ex) {
            throw new PdfRecompressionException("File wasn't found", ex);
        }
    }

    // if prefix is not set then prefix set to name of pdf without .pdf
    // if pdfFile has unconsistent name (without suffix .pdf) and name longer than 4 chars then last for chars are removed
    // and this string set as prefix
    if ((prefix == null) && (pdfFile.length() > 4)) {
        prefix = pdfFile.substring(0, pdfFile.length() - 4);
    }

    PDFParser parser = null;
    PDDocument doc = null;
    try {
        parser = new PDFParser(inputStream);
        parser.parse();
        doc = parser.getPDDocument();

        AccessPermission accessPermissions = doc.getCurrentAccessPermission();

        if (!accessPermissions.canExtractContent()) {
            throw new PdfRecompressionException("Error: You do not have permission to extract images.");
        }

        // going page by page
        List pages = doc.getDocumentCatalog().getAllPages();
        for (int pageNumber = 0; pageNumber < pages.size(); pageNumber++) {
            if ((pagesToProcess != null) && (!pagesToProcess.contains(pageNumber + 1))) {
                continue;
            }
            PDPage page = (PDPage) pages.get(pageNumber);
            PDResources resources = page.getResources();
            Map xobjs = resources.getXObjects();

            if (xobjs != null) {
                Iterator xobjIter = xobjs.entrySet().iterator();
                while (xobjIter.hasNext()) {
                    Map.Entry entry = (Map.Entry) xobjIter.next();
                    String key = (String) entry.getKey();
                    PDXObject xobj = (PDXObject) entry.getValue();
                    Map images;
                    if (xobj instanceof PDXObjectForm) {
                        PDXObjectForm xform = (PDXObjectForm) xobj;
                        images = xform.getResources().getImages();
                    } else {
                        images = resources.getImages();
                    }

                    // reading images from each page and saving them to file
                    if (images != null) {
                        Iterator imageIter = images.entrySet().iterator();
                        while (imageIter.hasNext()) {
                            Map.Entry imEntry = (Map.Entry) imageIter.next();
                            String imKey = (String) imEntry.getKey();
                            PDXObjectImage image = (PDXObjectImage) imEntry.getValue();

                            PDStream pdStr = new PDStream(image.getCOSStream());
                            List<COSName> filters = pdStr.getFilters();

                            if (image.getBitsPerComponent() > 1 && !binarize) {
                                log.info("It is not a bitonal image => skipping");
                                continue;
                            }

                            // at this moment for preventing bad output (bad coloring) from LZWDecode filter
                            if (filters.contains(COSName.LZW_DECODE)) {
                                log.info("This is LZWDecoded => skipping");
                                continue;

                            }

                            if (filters.contains(COSName.JBIG2_DECODE)) {
                                if (skipJBig2Images) {
                                    log.warn("Allready compressed according to JBIG2 standard => skipping");
                                    continue;
                                } else {
                                    log.debug("JBIG2 image detected");
                                }
                            }

                            // detection of unsupported filters by pdfBox library
                            if (filters.contains(COSName.JPX_DECODE)) {
                                log.info("Unsupported filter JPXDecode => skipping");
                                continue;
                            }

                            COSObject cosObj = new COSObject(image.getCOSObject());
                            int objectNum = cosObj.getObjectNumber().intValue();
                            int genNum = cosObj.getGenerationNumber().intValue();
                            log.debug(objectNum + " " + genNum + " obj");

                            String name = getUniqueFileName(prefix + imKey, image.getSuffix());
                            log.debug("Writing image:" + name);
                            image.write2file(name);

                            PdfImageInformation pdfImageInfo = new PdfImageInformation(key, image.getWidth(),
                                    image.getHeight(), objectNum, genNum);
                            originalImageInformations.add(pdfImageInfo);
                            log.debug(pdfImageInfo.toString());

                            namesOfImages.add(name + "." + image.getSuffix());
                        }
                    }
                }
            }
        }
    } catch (IOException ex) {
        Tools.deleteFilesFromList(namesOfImages);
        throw new PdfRecompressionException("Unable to parse PDF document", ex);
    } catch (RuntimeException ex) {
        Tools.deleteFilesFromList(namesOfImages);
    } finally {
        if (doc != null) {
            try {
                doc.close();
            } catch (IOException ex) {
                throw new PdfRecompressionException(ex);
            }
        }
    }
}

From source file:cz.muni.pdfjbim.PdfImageProcessor.java

License:Apache License

/**
 * This method extracts images by going through all COSObjects pointed from xref table
 * @param is input stream containing PDF file
 * @param password password for access to PDF if needed
 * @param pagesToProcess list of pages which should be processed if null given => processed all pages
 *      -- not working yet/*w w  w . j a va 2s.  c  o m*/
 * @param binarize -- enables processing of nonbitonal images as well (LZW is still not
 *      processed because of output with inverted colors)
 * @throws PdfRecompressionException if problem to extract images from PDF
 */
public void extractImagesUsingPdfParser(InputStream is, String prefix, String password,
        Set<Integer> pagesToProcess, Boolean binarize) throws PdfRecompressionException {
    // checking arguments and setting appropriate variables
    if (binarize == null) {
        binarize = false;
    }

    InputStream inputStream = null;
    if (password != null) {
        try {
            ByteArrayOutputStream decryptedOutputStream = null;
            PdfReader reader = new PdfReader(is, password.getBytes());
            PdfStamper stamper = new PdfStamper(reader, decryptedOutputStream);
            stamper.close();
            inputStream = new ByteArrayInputStream(decryptedOutputStream.toByteArray());
        } catch (DocumentException ex) {
            throw new PdfRecompressionException(ex);
        } catch (IOException ex) {
            throw new PdfRecompressionException("Reading file caused exception", ex);
        }
    } else {
        inputStream = is;
    }

    PDFParser parser = null;
    COSDocument doc = null;
    try {
        parser = new PDFParser(inputStream);
        parser.parse();
        doc = parser.getDocument();

        List<COSObject> objs = doc.getObjectsByType(COSName.XOBJECT);
        if (objs != null) {
            for (COSObject obj : objs) {
                COSBase subtype = obj.getItem(COSName.SUBTYPE);
                if (subtype.toString().equalsIgnoreCase("COSName{Image}")) {
                    COSBase imageObj = obj.getObject();
                    COSBase cosNameObj = obj.getItem(COSName.NAME);
                    String key;
                    if (cosNameObj != null) {
                        String cosNameKey = cosNameObj.toString();
                        int startOfKey = cosNameKey.indexOf("{") + 1;
                        key = cosNameKey.substring(startOfKey, cosNameKey.length() - 1);
                    } else {
                        key = "im0";
                    }
                    int objectNum = obj.getObjectNumber().intValue();
                    int genNum = obj.getGenerationNumber().intValue();
                    PDXObjectImage image = (PDXObjectImage) PDXObjectImage.createXObject(imageObj);

                    PDStream pdStr = new PDStream(image.getCOSStream());
                    List filters = pdStr.getFilters();

                    if ((image.getBitsPerComponent() > 1) && (!binarize)) {
                        log.info("It is not a bitonal image => skipping");

                        continue;
                    }

                    // at this moment for preventing bad output (bad coloring) from LZWDecode filter
                    if (filters.contains(COSName.LZW_DECODE.getName())) {
                        log.info("This is LZWDecoded => skipping");
                        continue;

                    }

                    // detection of unsupported filters by pdfBox library
                    if (filters.contains("JBIG2Decode")) {
                        log.warn("Allready compressed according to JBIG2 standard => skipping");
                        continue;
                    }

                    if (filters.contains("JPXDecode")) {
                        log.warn("Unsupported filter JPXDecode => skipping");
                        continue;
                    }

                    String name = getUniqueFileName(prefix, image.getSuffix());
                    log.info("Writing image:" + name);
                    image.write2file(name);

                    PdfImageInformation pdfImageInfo = new PdfImageInformation(key, image.getWidth(),
                            image.getHeight(), objectNum, genNum);
                    originalImageInformations.add(pdfImageInfo);

                    namesOfImages.add(name + "." + image.getSuffix());

                }
                //                    }
            }
        }
    } catch (IOException ex) {
        throw new PdfRecompressionException("Unable to parse PDF document", ex);
    } finally {
        if (doc != null) {
            try {
                doc.close();
            } catch (IOException ex) {
                throw new PdfRecompressionException(ex);
            }
        }
    }
}

From source file:cz.muni.pdfjbim.PdfImageProcessor.java

License:Apache License

/**
 * @deprecated -- do not use doesn't work properly yet
 * This method extracts images by going through PDF tree structure
 * @param pdfFile name of input PDF file
 * @param password password for access to PDF if needed
 * @param pagesToProcess list of pages which should be processed if null given => processed all pages
 *      -- not working yet//w ww  .j a v  a  2  s.  c  om
 * @param silent -- if true error messages are not written to output otherwise they are
 * @param binarize -- enables processing of nonbitonal images as well (LZW is still not
 *      processed because of output with inverted colors)
 * @throws PdfRecompressionException if problem to extract images from PDF
 */
public void extractImagesUsingPdfObjectAccess(String pdfFile, String password, Set<Integer> pagesToProcess,
        Boolean silent, Boolean binarize) throws PdfRecompressionException {
    if (binarize == null) {
        binarize = false;
    }
    // checking arguments and setting appropriate variables
    if (pdfFile == null) {
        throw new IllegalArgumentException(pdfFile);
    }

    String prefix = null;

    InputStream inputStream = null;
    if (password != null) {
        try {
            ByteArrayOutputStream decryptedOutputStream = null;
            PdfReader reader = new PdfReader(pdfFile, password.getBytes());
            PdfStamper stamper = new PdfStamper(reader, decryptedOutputStream);
            stamper.close();
            inputStream = new ByteArrayInputStream(decryptedOutputStream.toByteArray());
        } catch (DocumentException ex) {
            throw new PdfRecompressionException(ex);
        } catch (IOException ex) {
            throw new PdfRecompressionException("Reading file caused exception", ex);
        }
    } else {
        try {
            inputStream = new FileInputStream(pdfFile);
        } catch (FileNotFoundException ex) {
            throw new PdfRecompressionException("File wasn't found", ex);
        }
    }

    // if prefix is not set then prefix set to name of pdf without .pdf
    // if pdfFile has unconsistent name (without suffix .pdf) and name longer than 4 chars then last for chars are removed
    // and this string set as prefix
    if ((prefix == null) && (pdfFile.length() > 4)) {
        prefix = pdfFile.substring(0, pdfFile.length() - 4);
    }

    PDFParser parser = null;
    PDDocument doc = null;
    try {
        parser = new PDFParser(inputStream);
        parser.parse();
        doc = parser.getPDDocument();

        AccessPermission accessPermissions = doc.getCurrentAccessPermission();

        if (!accessPermissions.canExtractContent()) {
            throw new PdfRecompressionException("Error: You do not have permission to extract images.");
        }

        // going page by page
        List pages = doc.getDocumentCatalog().getAllPages();
        for (int pageNumber = 0; pageNumber < pages.size(); pageNumber++) {
            if ((pagesToProcess != null) && (!pagesToProcess.contains(pageNumber + 1))) {
                continue;
            }
            PDPage page = (PDPage) pages.get(pageNumber);
            PDResources resources = page.getResources();
            Map xobjs = resources.getXObjects();

            if (xobjs != null) {
                Iterator xobjIter = xobjs.keySet().iterator();
                while (xobjIter.hasNext()) {
                    String key = (String) xobjIter.next();
                    PDXObject xobj = (PDXObject) xobjs.get(key);
                    Map images;
                    if (xobj instanceof PDXObjectForm) {
                        PDXObjectForm xform = (PDXObjectForm) xobj;
                        images = xform.getResources().getImages();
                    } else {
                        images = resources.getImages();
                    }

                    // reading images from each page and saving them to file
                    if (images != null) {
                        Iterator imageIter = images.keySet().iterator();
                        while (imageIter.hasNext()) {
                            String imKey = (String) imageIter.next();
                            PDXObjectImage image = (PDXObjectImage) images.get(imKey);

                            PDStream pdStr = new PDStream(image.getCOSStream());
                            List filters = pdStr.getFilters();

                            if (image.getBitsPerComponent() > 1) {
                                log.info("It is not a bitonal image => skipping");
                                continue;
                            }

                            // at this moment for preventing bad output (bad coloring) from LZWDecode filter
                            if (filters.contains(COSName.LZW_DECODE.getName())) {
                                log.info("This is LZWDecoded => skipping");
                                continue;

                            }

                            // detection of unsupported filters by pdfBox library
                            if (filters.contains("JBIG2Decode")) {
                                log.info("Allready compressed according to JBIG2 standard => skipping");
                                continue;
                            }
                            if (filters.contains("JPXDecode")) {
                                log.info("Unsupported filter JPXDecode => skipping");
                                continue;
                            }

                            COSObject cosObj = new COSObject(image.getCOSObject());
                            int objectNum = cosObj.getObjectNumber().intValue();
                            int genNum = cosObj.getGenerationNumber().intValue();
                            log.debug(objectNum + " " + genNum + " obj");

                            String name = getUniqueFileName(prefix + imKey, image.getSuffix());
                            log.debug("Writing image:" + name);
                            image.write2file(name);

                            PdfImageInformation pdfImageInfo = new PdfImageInformation(key, image.getWidth(),
                                    image.getHeight(), objectNum, genNum);
                            originalImageInformations.add(pdfImageInfo);
                            log.debug(pdfImageInfo.toString());

                            namesOfImages.add(name + "." + image.getSuffix());
                        }
                    }

                }
            }

        }
    } catch (IOException ex) {
        throw new PdfRecompressionException("Unable to parse PDF document", ex);
    } finally {
        if (doc != null) {
            try {
                doc.close();
            } catch (IOException ex) {
                throw new PdfRecompressionException(ex);
            }
        }
    }
}