Example usage for org.apache.pdfbox.pdmodel PDDocument load

Introduction

On this page you can find example usages of org.apache.pdfbox.pdmodel.PDDocument.load.

Prototype

public static PDDocument load(byte[] input) throws IOException

Source Link

Document

Parses a PDF.

Usage

From source file:com.sustainalytics.crawlerfilter.PDFTitleGeneration.java

License:Apache License

/**
 * This method extracts creation date/ custom date of a PDF file
 * @param file is a File object/*from   w  w  w  . j a  v  a2 s.c o  m*/
 * @return String that contains the creation date/ custom date of the PDF
 */
public static String extractDate(File file) {
    PDDocument document = null;
    boolean isDamaged = false; //to deal with damaged pdf
    String creationDateMetaData = "";
    try {
        document = PDDocument.load(file.toString());
        /*If the PDF file is not damanged --->*/
        if (!isDamaged) {
            /*...but the file is encrypted --->*/
            if (document.isEncrypted()) {
                logger.info("File " + file.getName() + "is encrypted. Trying to decrypt...");
                try {
                    /*...then decryptt it --->*/
                    document.decrypt("");
                    document.setAllSecurityToBeRemoved(true);
                    logger.info("File " + file.getName() + "successfully decrypted!");
                } catch (CryptographyException e) {
                    logger.info("Error decrypting file " + file.getName());
                    isDamaged = true;
                }

            } /*<--work around to decrypt an encrypted pdf ends here*/

            /*Metadata extraction --->*/
            PDDocumentInformation info = document.getDocumentInformation();

            /*We are only interested in date data--->*/
            Calendar calendar = info.getCreationDate();
            int creationYear = 0, creationMonth = 0, creationDate = 0;
            if (calendar != null) {
                creationYear = calendar.get(Calendar.YEAR);
                creationMonth = calendar.get(Calendar.MONTH) + 1;
                creationDate = calendar.get(Calendar.DATE);

            } /*<---Date data extraction complete*/

            /*If creation date is not empty --->*/
            if (creationYear != 0) {
                creationDateMetaData = creationYear + "-" + creationMonth + "-" + creationDate;
            } //<--- creation date found and the date part of the title is generated
            /*No creation date is found --->*/
            else {
                SimpleDateFormat dateFormatter = new SimpleDateFormat("MM/dd/yyyy");
                Date customDate = null;
                /*But we have custom date some times --->*/
                try {
                    customDate = dateFormatter.parse(info.getCustomMetadataValue("customdate"));
                } catch (ParseException e) {
                    logger.info("Error parsing date from custom date");
                }
                calendar = Calendar.getInstance();
                calendar.setTime(customDate);
                if (calendar != null) {
                    creationYear = calendar.get(Calendar.YEAR);
                    creationMonth = calendar.get(Calendar.MONTH) + 1;
                    creationDate = calendar.get(Calendar.DATE);

                } /*<---Date data extraction complete from customdate*/
                if (creationYear != 0) {
                    creationDateMetaData = creationYear + "-" + creationMonth + "-" + creationDate;
                }
            } //<--- work around if no creation date is found

        } /*<--- Good to know that the PDF was not damaged*/
    } catch (IOException e) { /*If the PDF was not read by the system --->*/
        logger.info("Error processing file " + file.getName());
        /*... then maybe it is damaged*/
        isDamaged = true;
    } finally {
        try {
            /*If the file was good, not damaged, then please close it --->*/
            if (!isDamaged) {
                document.close();
                logger.info("File " + file.getName() + " is closed successfully!");
            }
        } catch (IOException e) {
            logger.info("Error closing file " + file.getName());
        }
    } /*<--- PDF closing done!*/
    return creationDateMetaData;
}

From source file:com.tekstosense.segmenter.Main.java

License:Open Source License

/**
 * Loads the given PDF and runs a new TextExtractor over it.
 * The text written by the extractor goes to an in-memory stream that is
 * not read afterwards; the extractor itself is the result.
 */
private TextExtractor parsePdf(File f) throws IOException {
    final PDDocument pdf = PDDocument.load(f);
    final boolean encrypted = pdf.isEncrypted();

    if (encrypted) {
        // The explicit decrypt call is disabled in this version; only the
        // security-removal flag is set on encrypted documents.
        try {
            pdf.setAllSecurityToBeRemoved(false);
        } catch (Exception cause) {
            throw new IOException("Can't decrypt document: ", cause);
        }
    }

    final TextExtractor extractor = new TextExtractor();
    final OutputStreamWriter sink = new OutputStreamWriter(new ByteArrayOutputStream());
    extractor.writeText(pdf, sink);
    return extractor;
}

From source file:com.tekstosense.segmenter.StructurePdf.PdfSections.java

License:Open Source License

/**
 * Opens the PDF in {@code f}, feeds it through a fresh TextExtractor and
 * returns that extractor. The writer handed to the extractor wraps an
 * in-memory buffer that this method does not read back.
 */
private TextExtractor parsePdf(File f) throws IOException {
    PDDocument loaded = PDDocument.load(f);

    // Encrypted input: decrypting with a password is currently switched off,
    // so only the "remove all security" flag is set (to false).
    if (loaded.isEncrypted()) {
        try {
            loaded.setAllSecurityToBeRemoved(false);
        } catch (Exception failure) {
            throw new IOException("Can't decrypt document: ", failure);
        }
    }

    TextExtractor result = new TextExtractor();
    ByteArrayOutputStream buffer = new ByteArrayOutputStream();
    result.writeText(loaded, new OutputStreamWriter(buffer));
    return result;
}

From source file:com.testautomationguru.utility.PDFUtil.java

License:Apache License

/**
* Get the page count of the document./*from w w  w  . ja v  a2 s  .c  o m*/
* 
* @param file Absolute file path
* @return int No of pages in the document.
* @throws java.io.IOException when file is not found.
*/
public int getPageCount(String file) throws IOException {
    logger.info("file :" + file);
    PDDocument doc = PDDocument.load(new File(file));
    int pageCount = doc.getNumberOfPages();
    logger.info("pageCount :" + pageCount);
    doc.close();
    return pageCount;
}

From source file:com.testautomationguru.utility.PDFUtil.java

License:Apache License

/**
* This method returns the content of the document.
*
* The requested page range is first normalised through
* updateStartAndEndPages; the text is then extracted with the configured
* stripper when one is set, otherwise with a new PDFTextStripper.
*
* @param file path of the PDF file
* @param startPage requested first page
* @param endPage requested last page
* @return the extracted text, with whitespace runs collapsed to single
*         spaces when bTrimWhiteSpace is set
* @throws IOException when loading, text extraction or closing fails
*/
private String getPDFText(String file, int startPage, int endPage) throws IOException {

    logger.info("file : " + file);
    logger.info("startPage : " + startPage);
    logger.info("endPage : " + endPage);

    PDDocument doc = PDDocument.load(new File(file));
    try {
        // Only build a default stripper when no custom one is configured.
        PDFTextStripper localStripper = (null != this.stripper) ? this.stripper : new PDFTextStripper();

        // The fields (not the parameters) hold the validated page range.
        this.updateStartAndEndPages(file, startPage, endPage);
        localStripper.setStartPage(this.startPage);
        localStripper.setEndPage(this.endPage);

        String txt = localStripper.getText(doc);
        logger.info("PDF Text before trimming : " + txt);
        if (this.bTrimWhiteSpace) {
            txt = txt.trim().replaceAll("\\s+", " ").trim();
            logger.info("PDF Text after  trimming : " + txt);
        }
        return txt;
    } finally {
        // Close the document even when extraction throws.
        doc.close();
    }
}

From source file:com.testautomationguru.utility.PDFUtil.java

License:Apache License

/**
* This method saves each page of the pdf as an image.
*
* Pages in the normalised range are rendered at 300 DPI and written as PNG
* files under imageDestinationPath. Failures are printed and swallowed, so
* the returned list holds only the images written before the failure.
*
* @param file path of the PDF file
* @param startPage requested first page
* @param endPage requested last page
* @return names of the image files that were written
*/
private List<String> saveAsImage(String file, int startPage, int endPage) throws IOException {

    logger.info("file : " + file);
    logger.info("startPage : " + startPage);
    logger.info("endPage : " + endPage);

    List<String> imgNames = new ArrayList<String>();
    PDDocument document = null;

    try {
        File sourceFile = new File(file);
        this.createImageDestinationDirectory(file);
        this.updateStartAndEndPages(file, startPage, endPage);

        String fileName = sourceFile.getName().replace(".pdf", "");

        document = PDDocument.load(sourceFile);
        PDFRenderer pdfRenderer = new PDFRenderer(document);
        // Renderer page indexes are zero based; startPage/endPage are one based.
        for (int iPage = this.startPage - 1; iPage < this.endPage; iPage++) {
            logger.info("Page No : " + (iPage + 1));
            String fname = this.imageDestinationPath + fileName + "_" + (iPage + 1) + ".png";
            BufferedImage image = pdfRenderer.renderImageWithDPI(iPage, 300, ImageType.RGB);
            ImageIOUtil.writeImage(image, fname, 300);
            imgNames.add(fname);
            logger.info("PDf Page saved as image : " + fname);
        }
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // Close the document even when rendering or writing failed.
        if (document != null) {
            try {
                document.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
    return imgNames;
}

From source file:com.testautomationguru.utility.PDFUtil.java

License:Apache License

/**
* Renders the given page range of both PDFs at 300 DPI and compares the
* images page by page.
*
* NOTE(review): an exception during loading or rendering is printed and
* swallowed, so the value returned is whatever had been computed so far
* (initially true) - confirm callers are fine with that.
*
* @param file1 path of the first PDF
* @param file2 path of the second PDF
* @param startPage first page to compare (one based)
* @param endPage last page to compare (one based)
* @return false when a compared page differs, true otherwise
* @throws IOException when closing either document fails
*/
private boolean convertToImageAndCompare(String file1, String file2, int startPage, int endPage)
        throws IOException {

    boolean result = true;

    PDDocument doc1 = null;
    PDDocument doc2 = null;

    try {

        doc1 = PDDocument.load(new File(file1));
        doc2 = PDDocument.load(new File(file2));

        PDFRenderer pdfRenderer1 = new PDFRenderer(doc1);
        PDFRenderer pdfRenderer2 = new PDFRenderer(doc2);

        for (int iPage = startPage - 1; iPage < endPage; iPage++) {
            String fileName = new File(file1).getName().replace(".pdf", "_") + (iPage + 1);
            fileName = this.getImageDestinationPath() + "/" + fileName + "_diff.png";

            logger.info("Comparing Page No : " + (iPage + 1));
            BufferedImage image1 = pdfRenderer1.renderImageWithDPI(iPage, 300, ImageType.RGB);
            BufferedImage image2 = pdfRenderer2.renderImageWithDPI(iPage, 300, ImageType.RGB);
            // Compare first so every page is processed even after a mismatch.
            result = ImageUtil.compareAndHighlight(image1, image2, fileName, this.bHighlightPdfDifference,
                    this.imgColor.getRGB()) && result;
            if (!this.bCompareAllPages && !result) {
                break;
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // Either load may have failed, leaving a null reference; the nested
        // finally makes sure doc2 is closed even if closing doc1 throws.
        try {
            if (doc1 != null) {
                doc1.close();
            }
        } finally {
            if (doc2 != null) {
                doc2.close();
            }
        }
    }
    return result;
}

From source file:com.testautomationguru.utility.PDFUtil.java

License:Apache License

/**
* This method extracts all the embedded images of the pdf document.
*
* Every image XObject found in the resources of the pages in the normalised
* range is written as a PNG file under imageDestinationPath. Failures are
* printed and swallowed, so the returned list holds only the images written
* before the failure.
*
* @param file path of the PDF file
* @param startPage requested first page
* @param endPage requested last page
* @return names of the image files that were written
*/
private List<String> extractimages(String file, int startPage, int endPage) {

    logger.info("file : " + file);
    logger.info("startPage : " + startPage);
    logger.info("endPage : " + endPage);

    List<String> imgNames = new ArrayList<String>();
    boolean bImageFound = false;
    PDDocument document = null;
    try {

        this.createImageDestinationDirectory(file);
        String fileName = this.getFileName(file).replace(".pdf", "_resource");

        document = PDDocument.load(new File(file));
        PDPageTree list = document.getPages();

        this.updateStartAndEndPages(file, startPage, endPage);

        // Running counter used to number the output files across all pages.
        int totalImages = 1;
        for (int iPage = this.startPage - 1; iPage < this.endPage; iPage++) {
            logger.info("Page No : " + (iPage + 1));
            PDResources pdResources = list.get(iPage).getResources();
            if (pdResources == null) {
                // A page without resources has no images; skip it instead of
                // aborting the whole extraction.
                continue;
            }
            for (COSName c : pdResources.getXObjectNames()) {
                PDXObject o = pdResources.getXObject(c);
                if (o instanceof org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject) {
                    bImageFound = true;
                    String fname = this.imageDestinationPath + "/" + fileName + "_" + totalImages + ".png";
                    ImageIO.write(((org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject) o).getImage(),
                            "png", new File(fname));
                    imgNames.add(fname);
                    totalImages++;
                }
            }
        }
        if (bImageFound) {
            logger.info("Images are saved @ " + this.imageDestinationPath);
        } else {
            logger.info("No images were found in the PDF");
        }
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // Close the document even when extraction failed part way through.
        if (document != null) {
            try {
                document.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
    return imgNames;
}

From source file:com.testautomationguru.utility.PDFUtil.java

License:Apache License

/**
* Stores a validated page range in the startPage / endPage fields.
*
* The start page falls back to 1 unless it lies within 1..page count. The
* end page falls back to the page count unless it lies within 1..page count
* and is not smaller than the given start value.
*
* @param file path of the PDF whose page count bounds the range
* @param start requested first page
* @param end requested last page
* @throws IOException when the document cannot be loaded or closed
*/
private void updateStartAndEndPages(String file, int start, int end) throws IOException {

    PDDocument pdf = PDDocument.load(new File(file));
    final int total = pdf.getNumberOfPages();
    logger.info("Page Count : " + total);
    logger.info("Given start page:" + start);
    logger.info("Given end   page:" + end);

    final boolean startUsable = start > 0 && start <= total;
    this.startPage = startUsable ? start : 1;

    // The end page is checked against the requested start, not the corrected one.
    final boolean endUsable = end > 0 && end >= start && end <= total;
    this.endPage = endUsable ? end : total;

    pdf.close();
    logger.info("Updated start page:" + this.startPage);
    logger.info("Updated end   page:" + this.endPage);
}

From source file:com.truckzoo.test.pdf.CustomPageDrawer.java

License:Apache License

/**
 * Renders the first page of "custom-render-demo.pdf" with MyPDFRenderer and
 * writes the result to "custom-render.png".
 */
public static void main(String[] args) throws IOException {
    PDDocument pdf = PDDocument.load(new File("custom-render-demo.pdf"));
    PDFRenderer pageRenderer = new MyPDFRenderer(pdf);

    // Page index 0 is the first page.
    BufferedImage firstPage = pageRenderer.renderImage(0);
    File target = new File("custom-render.png");
    ImageIO.write(firstPage, "PNG", target);

    pdf.close();
}