Example usage for org.apache.pdfbox.pdmodel PDDocument load

List of usage examples for org.apache.pdfbox.pdmodel PDDocument load

Introduction

On this page you can find example usages of org.apache.pdfbox.pdmodel PDDocument load.

Prototype

public static PDDocument load(byte[] input) throws IOException 

Source Link

Document

Parses a PDF.

Usage

From source file:com.sustainalytics.crawlerfilter.PDFTitleGeneration.java

License:Apache License

/**
 * This method extracts creation date/ custom date of a PDF file
 * @param file is a File object/*from   w  w  w  . j a  v  a2 s.c o  m*/
 * @return String that contains the creation date/ custom date of the PDF
 */
public static String extractDate(File file) {
    PDDocument document = null;
    boolean isDamaged = false; //to deal with damaged pdf
    String creationDateMetaData = "";
    try {
        document = PDDocument.load(file.toString());
        /*If the PDF file is not damanged --->*/
        if (!isDamaged) {
            /*...but the file is encrypted --->*/
            if (document.isEncrypted()) {
                logger.info("File " + file.getName() + "is encrypted. Trying to decrypt...");
                try {
                    /*...then decryptt it --->*/
                    document.decrypt("");
                    document.setAllSecurityToBeRemoved(true);
                    logger.info("File " + file.getName() + "successfully decrypted!");
                } catch (CryptographyException e) {
                    logger.info("Error decrypting file " + file.getName());
                    isDamaged = true;
                }

            } /*<--work around to decrypt an encrypted pdf ends here*/

            /*Metadata extraction --->*/
            PDDocumentInformation info = document.getDocumentInformation();

            /*We are only interested in date data--->*/
            Calendar calendar = info.getCreationDate();
            int creationYear = 0, creationMonth = 0, creationDate = 0;
            if (calendar != null) {
                creationYear = calendar.get(Calendar.YEAR);
                creationMonth = calendar.get(Calendar.MONTH) + 1;
                creationDate = calendar.get(Calendar.DATE);

            } /*<---Date data extraction complete*/

            /*If creation date is not empty --->*/
            if (creationYear != 0) {
                creationDateMetaData = creationYear + "-" + creationMonth + "-" + creationDate;
            } //<--- creation date found and the date part of the title is generated
            /*No creation date is found --->*/
            else {
                SimpleDateFormat dateFormatter = new SimpleDateFormat("MM/dd/yyyy");
                Date customDate = null;
                /*But we have custom date some times --->*/
                try {
                    customDate = dateFormatter.parse(info.getCustomMetadataValue("customdate"));
                } catch (ParseException e) {
                    logger.info("Error parsing date from custom date");
                }
                calendar = Calendar.getInstance();
                calendar.setTime(customDate);
                if (calendar != null) {
                    creationYear = calendar.get(Calendar.YEAR);
                    creationMonth = calendar.get(Calendar.MONTH) + 1;
                    creationDate = calendar.get(Calendar.DATE);

                } /*<---Date data extraction complete from customdate*/
                if (creationYear != 0) {
                    creationDateMetaData = creationYear + "-" + creationMonth + "-" + creationDate;
                }
            } //<--- work around if no creation date is found

        } /*<--- Good to know that the PDF was not damaged*/
    } catch (IOException e) { /*If the PDF was not read by the system --->*/
        logger.info("Error processing file " + file.getName());
        /*... then maybe it is damaged*/
        isDamaged = true;
    } finally {
        try {
            /*If the file was good, not damaged, then please close it --->*/
            if (!isDamaged) {
                document.close();
                logger.info("File " + file.getName() + " is closed successfully!");
            }
        } catch (IOException e) {
            logger.info("Error closing file " + file.getName());
        }
    } /*<--- PDF closing done!*/
    return creationDateMetaData;
}

From source file:com.tekstosense.segmenter.Main.java

License:Open Source License

/**
 * Loads the given PDF and feeds it through a fresh {@code TextExtractor}.
 *
 * @param f the PDF file to load
 * @return the extractor after {@code writeText} has been run on the document
 * @throws IOException if loading or text extraction fails
 */
private TextExtractor parsePdf(File f) throws IOException {
    final PDDocument document = PDDocument.load(f);

    if (document.isEncrypted()) {
        // No explicit decrypt call is made for encrypted documents; only the
        // security-removal flag is set (to false). Any failure is reported as
        // an IOException.
        try {
            document.setAllSecurityToBeRemoved(false);
        } catch (Exception cause) {
            throw new IOException("Can't decrypt document: ", cause);
        }
    }

    final TextExtractor extractor = new TextExtractor();
    // The written text goes to an in-memory sink that nothing reads back.
    final OutputStreamWriter sink = new OutputStreamWriter(new ByteArrayOutputStream());
    extractor.writeText(document, sink);

    // NOTE(review): the document is never closed here - confirm whether the
    // returned extractor still needs it open before adding a close().
    return extractor;
}

From source file:com.tekstosense.segmenter.StructurePdf.PdfSections.java

License:Open Source License

/**
 * Loads the given PDF and feeds it through a fresh {@code TextExtractor}.
 *
 * @param f the PDF file to load
 * @return the extractor after {@code writeText} has been run on the document
 * @throws IOException if loading or text extraction fails
 */
private TextExtractor parsePdf(File f) throws IOException {
    final PDDocument document = PDDocument.load(f);

    if (document.isEncrypted()) {
        // No explicit decrypt call is made for encrypted documents; only the
        // security-removal flag is set (to false). Any failure is reported as
        // an IOException.
        try {
            document.setAllSecurityToBeRemoved(false);
        } catch (Exception cause) {
            throw new IOException("Can't decrypt document: ", cause);
        }
    }

    final TextExtractor extractor = new TextExtractor();
    // The written text goes to an in-memory sink that nothing reads back.
    final OutputStreamWriter sink = new OutputStreamWriter(new ByteArrayOutputStream());
    extractor.writeText(document, sink);

    // NOTE(review): the document is never closed here - confirm whether the
    // returned extractor still needs it open before adding a close().
    return extractor;
}

From source file:com.testautomationguru.utility.PDFUtil.java

License:Apache License

/**
* Get the page count of the document./*from w w  w  . ja v  a2 s  .c  o m*/
* 
* @param file Absolute file path
* @return int No of pages in the document.
* @throws java.io.IOException when file is not found.
*/
public int getPageCount(String file) throws IOException {
    logger.info("file :" + file);
    PDDocument doc = PDDocument.load(new File(file));
    int pageCount = doc.getNumberOfPages();
    logger.info("pageCount :" + pageCount);
    doc.close();
    return pageCount;
}

From source file:com.testautomationguru.utility.PDFUtil.java

License:Apache License

/**
* This method returns the content of the document
*
* @param file path of the PDF to read
* @param startPage requested first page; normalised via updateStartAndEndPages
* @param endPage requested last page; normalised via updateStartAndEndPages
* @return the extracted text, whitespace-collapsed when bTrimWhiteSpace is set
* @throws IOException if the document cannot be loaded, read or closed
*/
private String getPDFText(String file, int startPage, int endPage) throws IOException {

    logger.info("file : " + file);
    logger.info("startPage : " + startPage);
    logger.info("endPage : " + endPage);

    // try-with-resources closes the document even when text extraction throws.
    try (PDDocument doc = PDDocument.load(new File(file))) {

        // Use the configured stripper when there is one; otherwise create a default.
        PDFTextStripper localStripper = (null != this.stripper) ? this.stripper : new PDFTextStripper();

        this.updateStartAndEndPages(file, startPage, endPage);
        localStripper.setStartPage(this.startPage);
        localStripper.setEndPage(this.endPage);

        String txt = localStripper.getText(doc);
        logger.info("PDF Text before trimming : " + txt);
        if (this.bTrimWhiteSpace) {
            txt = txt.trim().replaceAll("\\s+", " ").trim();
            logger.info("PDF Text after  trimming : " + txt);
        }
        return txt;
    }
}

From source file:com.testautomationguru.utility.PDFUtil.java

License:Apache License

/**
* This method saves the each page of the pdf as image
*
* @param file path of the PDF to render
* @param startPage requested first page; normalised via updateStartAndEndPages
* @param endPage requested last page; normalised via updateStartAndEndPages
* @return names of the PNG files written so far; failures are printed and the
*         partial (possibly empty) list is returned
*/
private List<String> saveAsImage(String file, int startPage, int endPage) throws IOException {

    logger.info("file : " + file);
    logger.info("startPage : " + startPage);
    logger.info("endPage : " + endPage);

    List<String> imgNames = new ArrayList<String>();

    try {
        File sourceFile = new File(file);
        this.createImageDestinationDirectory(file);
        this.updateStartAndEndPages(file, startPage, endPage);

        String fileName = sourceFile.getName().replace(".pdf", "");

        // try-with-resources closes the document even when rendering or writing fails.
        try (PDDocument document = PDDocument.load(sourceFile)) {
            PDFRenderer pdfRenderer = new PDFRenderer(document);
            for (int iPage = this.startPage - 1; iPage < this.endPage; iPage++) {
                logger.info("Page No : " + (iPage + 1));
                // NOTE(review): no separator is inserted between imageDestinationPath and the
                // file name here, unlike the other image methods - confirm the path always
                // ends with one.
                String fname = this.imageDestinationPath + fileName + "_" + (iPage + 1) + ".png";
                BufferedImage image = pdfRenderer.renderImageWithDPI(iPage, 300, ImageType.RGB);
                ImageIOUtil.writeImage(image, fname, 300);
                imgNames.add(fname);
                logger.info("PDf Page saved as image : " + fname);
            }
        }
    } catch (Exception e) {
        // Best effort: report the failure and return whatever was saved.
        e.printStackTrace();
    }
    return imgNames;
}

From source file:com.testautomationguru.utility.PDFUtil.java

License:Apache License

/**
 * Renders the given page range of both documents at 300 DPI and compares the
 * resulting images page by page.
 *
 * @param file1 path of the first PDF; its name is also used for the diff image names
 * @param file2 path of the second PDF
 * @param startPage first page to compare (1-based)
 * @param endPage last page to compare (inclusive)
 * @return true only if every compared page matched; false if a difference was found or
 *         the comparison could not be completed
 * @throws IOException if either document cannot be loaded or closed
 */
private boolean convertToImageAndCompare(String file1, String file2, int startPage, int endPage)
        throws IOException {

    boolean result = true;

    // try-with-resources closes whichever documents were opened, so a failed load no
    // longer ends in a NullPointerException from the cleanup code.
    try (PDDocument doc1 = PDDocument.load(new File(file1));
            PDDocument doc2 = PDDocument.load(new File(file2))) {

        PDFRenderer pdfRenderer1 = new PDFRenderer(doc1);
        PDFRenderer pdfRenderer2 = new PDFRenderer(doc2);
        String baseName = new File(file1).getName().replace(".pdf", "_");

        try {
            for (int iPage = startPage - 1; iPage < endPage; iPage++) {
                String fileName = this.getImageDestinationPath() + "/" + baseName + (iPage + 1) + "_diff.png";

                logger.info("Comparing Page No : " + (iPage + 1));
                BufferedImage image1 = pdfRenderer1.renderImageWithDPI(iPage, 300, ImageType.RGB);
                BufferedImage image2 = pdfRenderer2.renderImageWithDPI(iPage, 300, ImageType.RGB);
                result = ImageUtil.compareAndHighlight(image1, image2, fileName, this.bHighlightPdfDifference,
                        this.imgColor.getRGB()) && result;
                if (!this.bCompareAllPages && !result) {
                    break;
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
            // A comparison that could not be completed must not be reported as a match.
            result = false;
        }
    }
    return result;
}

From source file:com.testautomationguru.utility.PDFUtil.java

License:Apache License

/**
* This method extracts all the embedded images of the pdf document
*
* @param file path of the PDF to scan
* @param startPage requested first page; normalised via updateStartAndEndPages
* @param endPage requested last page; normalised via updateStartAndEndPages
* @return names of the PNG files written so far; failures are printed and the
*         partial (possibly empty) list is returned
*/
private List<String> extractimages(String file, int startPage, int endPage) {

    logger.info("file : " + file);
    logger.info("startPage : " + startPage);
    logger.info("endPage : " + endPage);

    List<String> imgNames = new ArrayList<String>();
    boolean bImageFound = false;
    try {

        this.createImageDestinationDirectory(file);
        String fileName = this.getFileName(file).replace(".pdf", "_resource");

        // try-with-resources closes the document even when image extraction fails.
        try (PDDocument document = PDDocument.load(new File(file))) {
            PDPageTree list = document.getPages();

            this.updateStartAndEndPages(file, startPage, endPage);

            int totalImages = 1;
            for (int iPage = this.startPage - 1; iPage < this.endPage; iPage++) {
                logger.info("Page No : " + (iPage + 1));
                PDResources pdResources = list.get(iPage).getResources();
                if (pdResources == null) {
                    // A page without resources has no images; skip it and carry on.
                    continue;
                }
                for (COSName c : pdResources.getXObjectNames()) {
                    PDXObject o = pdResources.getXObject(c);
                    if (o instanceof org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject) {
                        bImageFound = true;
                        String fname = this.imageDestinationPath + "/" + fileName + "_" + totalImages + ".png";
                        ImageIO.write(((org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject) o).getImage(),
                                "png", new File(fname));
                        imgNames.add(fname);
                        totalImages++;
                    }
                }
            }
        }
        if (bImageFound) {
            logger.info("Images are saved @ " + this.imageDestinationPath);
        } else {
            logger.info("No images were found in the PDF");
        }
    } catch (Exception e) {
        // Best effort: report the failure and return whatever was extracted.
        e.printStackTrace();
    }
    return imgNames;
}

From source file:com.testautomationguru.utility.PDFUtil.java

License:Apache License

/**
 * Normalises the requested page range against the document's page count and
 * stores the result in this.startPage and this.endPage.
 *
 * <p>A start outside 1..pageCount becomes 1. An end that is not positive, lies before
 * the requested start, or exceeds the page count becomes the page count.
 *
 * @param file path of the PDF whose page count bounds the range
 * @param start requested first page (1-based)
 * @param end requested last page (inclusive)
 * @throws IOException if the document cannot be loaded or closed
 */
private void updateStartAndEndPages(String file, int start, int end) throws IOException {

    // try-with-resources closes the document even if reading the page count fails.
    try (PDDocument document = PDDocument.load(new File(file))) {
        int pagecount = document.getNumberOfPages();
        logger.info("Page Count : " + pagecount);
        logger.info("Given start page:" + start);
        logger.info("Given end   page:" + end);

        if ((start > 0 && start <= pagecount)) {
            this.startPage = start;
        } else {
            this.startPage = 1;
        }
        if ((end > 0 && end >= start && end <= pagecount)) {
            this.endPage = end;
        } else {
            this.endPage = pagecount;
        }
    }
    logger.info("Updated start page:" + this.startPage);
    logger.info("Updated end   page:" + this.endPage);
}

From source file:com.truckzoo.test.pdf.CustomPageDrawer.java

License:Apache License

/**
 * Renders the first page of custom-render-demo.pdf with MyPDFRenderer and
 * writes it to custom-render.png.
 *
 * @param args unused
 * @throws IOException if the PDF cannot be loaded or rendered, or the image cannot be written
 */
public static void main(String[] args) throws IOException {
    File file = new File("custom-render-demo.pdf");

    // try-with-resources closes the document even if rendering or writing fails.
    try (PDDocument doc = PDDocument.load(file)) {
        PDFRenderer renderer = new MyPDFRenderer(doc);
        BufferedImage image = renderer.renderImage(0);
        ImageIO.write(image, "PNG", new File("custom-render.png"));
    }
}