Example usage for org.apache.pdfbox.pdmodel PDDocument isEncrypted

Introduction

This page collects example usages of the isEncrypted() method of org.apache.pdfbox.pdmodel.PDDocument.

Prototype

public boolean isEncrypted()

Source Link

Document

Tells whether this document is encrypted.

Usage

From source file:com.jaeksoft.searchlib.web.controller.ViewerController.java

License:Open Source License

/**
 * Opens {@code tempFile} with PDFBox, collects the highlight rectangles, paints
 * them on {@code currentImage} and records the page count in
 * {@code numberOfPages}.
 *
 * <p>An encrypted document is decrypted with the empty password, and that same
 * empty password is passed on to {@code loadGS}; for an unencrypted document
 * {@code loadGS} receives {@code null}.
 * NOTE(review): {@code loadGS} presumably renders the page image (Ghostscript?) -
 * confirm against its definition.
 *
 * <p>The document is always closed before the method returns.
 */
private void loadPdfBox() throws IOException, CryptographyException, SearchLibException, InterruptedException {
    PDDocument pdf = null;
    try {
        pdf = PDDocument.loadNonSeq(tempFile, null);

        // Only the empty password is attempted on encrypted documents.
        final String password;
        if (pdf.isEncrypted()) {
            pdf.decrypt("");
            password = "";
        } else {
            password = null;
        }
        loadGS(password);

        // Both highlight checks append to the same rectangle list.
        final List<Rectangle> highlights = new ArrayList<Rectangle>(0);
        checkPdfBoxHighlight(pdf, highlights);
        final int imageWidth = currentImage.getWidth();
        final int imageHeight = currentImage.getHeight();
        checkHocrHighlight(imageWidth, imageHeight, highlights);
        ImageUtils.yellowHighlight(currentImage, highlights, 0.1F);

        numberOfPages = pdf.getNumberOfPages();
    } finally {
        if (pdf != null) {
            IOUtils.close(pdf);
        }
    }
}

From source file:com.jt.tool.pdf.CreateBookmarks.java

License:Apache License

/**
 * Loads {@code srcFile}, installs a new document outline on its catalog and
 * saves the result to {@code targetFile}.
 *
 * <p>NOTE(review): as written, no bookmark ever reaches the saved file. The page
 * list is created empty (the catalog lookup that used to fill it is disabled),
 * so the loop body never runs, and neither the "All Pages" item nor the
 * per-page items are attached to the outline. Confirm the intended PDFBox API
 * before re-enabling any of this.
 *
 * <p>If the document is encrypted, an error is printed and the JVM exits with
 * status 1.
 *
 * @param srcFile    path of the PDF to read
 * @param targetFile path the modified PDF is saved to
 * @param reg        pattern handed to {@code matchTitle} for each page's text
 * @throws Exception if loading, text extraction or saving fails
 */
public static void createBookmark(String srcFile, String targetFile, String reg) throws Exception {
    PDDocument pdf = null;
    try {
        pdf = PDDocument.load(new File(srcFile));
        if (pdf.isEncrypted()) {
            System.err.println("Error: Cannot add bookmarks to encrypted document.");
            System.exit(1);
        }

        PDDocumentOutline rootOutline = new PDDocumentOutline();
        pdf.getDocumentCatalog().setDocumentOutline(rootOutline);

        PDOutlineItem allPagesItem = new PDOutlineItem();
        allPagesItem.setTitle("All Pages");

        // Always empty here; see the review note in the method documentation.
        List pageList = new ArrayList();

        // Scanning starts at index 12; the reason for that offset is not visible here.
        int pageIndex = 12;
        while (pageIndex < pageList.size()) {
            String text = getPageText(pdf, pageIndex + 1, 0);
            String[] titleParts = matchTitle(text, reg);
            if (makeBookmark(titleParts)) {
                PDPage targetPage = (PDPage) pageList.get(pageIndex);
                PDPageFitWidthDestination destination = new PDPageFitWidthDestination();
                destination.setPage(targetPage);

                PDOutlineItem item = new PDOutlineItem();
                item.setDestination(destination);
                item.setTitle(titleParts[0]);
                System.out.println("add " + titleParts[0]);
            }
            pageIndex++;
        }

        allPagesItem.openNode();
        rootOutline.openNode();
        pdf.save(targetFile);
    } finally {
        if (pdf != null) {
            pdf.close();
        }
    }
}

From source file:com.odc.pdfextractor.parser.CleanPdfParser.java

License:Apache License

/**
 * Loads the PDF at {@code filename} and feeds every page's content stream to
 * {@code processStream}, advancing {@code docBuilder} by one page after each
 * page (including pages without content).
 *
 * <p>An encrypted document is decrypted with the empty password; if that
 * password is rejected, an error is printed and the JVM exits with status 1.
 * Progress dots are written to standard output while pages are processed.
 *
 * @param filename path of the PDF file to process
 * @return the document produced by {@code docBuilder}
 * @throws Exception If there is an error parsing the document.
 */
public DocumentLocation processPdf(String filename) throws Exception {
    PDDocument pdf = null;
    try {
        pdf = PDDocument.load(filename);

        if (pdf.isEncrypted()) {
            try {
                pdf.decrypt("");
            } catch (InvalidPasswordException e) {
                System.err.println("Error: Document is encrypted with a password.");
                System.exit(1);
            }
        }

        List pages = pdf.getDocumentCatalog().getAllPages();
        System.out.print("Extracting text from PDF");
        for (Object entry : pages) {
            PDPage current = (PDPage) entry;
            System.out.print(".");
            PDStream pageContents = current.getContents();
            if (pageContents != null) {
                this.processStream(current, current.findResources(), current.getContents().getStream());
            }
            docBuilder.incrementPage();
        }
    } finally {
        // Terminate the progress line even when processing failed.
        System.out.println();
        if (pdf != null) {
            pdf.close();
        }
    }
    return docBuilder.getDoc();
}

From source file:com.openkm.extractor.PdfTextExtractor.java

License:Open Source License

/**
 * {@inheritDoc}
 *
 * <p>Parses the stream as a PDF, tries the empty password if the document is
 * encrypted, and strips its text layer. When OCR is forced through
 * {@code Config.SYSTEM_PDF_FORCE_OCR}, or the trimmed text is at most one
 * character long, the result is instead built by running {@code doOcr} over
 * page images: either the files produced by the external command configured in
 * {@code Config.SYSTEM_PDFIMAGES}, or, when that setting is empty, the image
 * XObjects found in each page's own resources.
 *
 * <p>The {@code type} and {@code encoding} arguments are not used by this
 * implementation. The stream is always closed before returning.
 *
 * @throws IOException if parsing, decryption, text stripping or OCR fails; any
 *         other exception raised along the way is wrapped in an IOException
 */
@SuppressWarnings("rawtypes")
public String extractText(InputStream stream, String type, String encoding) throws IOException {
    try {
        PDFParser parser = new PDFParser(new BufferedInputStream(stream));

        try {
            parser.parse();
            PDDocument document = parser.getPDDocument();

            if (document.isEncrypted()) {
                try {
                    // Only documents protected with the empty password can be opened here.
                    document.decrypt("");
                    document.setAllSecurityToBeRemoved(true);
                } catch (Exception e) {
                    throw new IOException("Unable to extract text: document encrypted", e);
                }
            }

            CharArrayWriter writer = new CharArrayWriter();
            PDFTextStripper stripper = new PDFTextStripper();
            stripper.setLineSeparator("\n");
            stripper.writeText(document, writer);
            String st = writer.toString().trim();
            log.debug("TextStripped: '{}'", st);

            if (Config.SYSTEM_PDF_FORCE_OCR || st.length() <= 1) {
                log.warn("PDF does not contains text layer");

                // Extract images from PDF
                StringBuilder sb = new StringBuilder();

                if (!Config.SYSTEM_PDFIMAGES.isEmpty()) {
                    File tmpPdf = FileUtils.createTempFile("pdf");
                    File tmpDir = new File(EnvironmentDetector.getTempDir());
                    String baseName = FileUtils.getFileName(tmpPdf.getName());
                    document.save(tmpPdf);
                    int pgNum = 1;

                    try {
                        for (PDPage page : (List<PDPage>) document.getDocumentCatalog().getAllPages()) {
                            // The external tool is run once per page so the page rotation can be applied.
                            HashMap<String, Object> hm = new HashMap<String, Object>();
                            hm.put("fileIn", tmpPdf.getPath());
                            hm.put("firstPage", pgNum);
                            hm.put("lastPage", pgNum++);
                            hm.put("imageRoot", tmpDir + File.separator + baseName);
                            String cmd = TemplateUtils.replace("SYSTEM_PDFIMAGES", Config.SYSTEM_PDFIMAGES, hm);
                            ExecutionUtils.runCmd(cmd);

                            // listFiles() returns null when the directory cannot be read.
                            File[] candidates = tmpDir.listFiles();
                            if (candidates == null) {
                                continue;
                            }

                            for (File tmp : candidates) {
                                if (tmp.getName().startsWith(baseName + "-")) {
                                    if (page.findRotation() > 0) {
                                        ImageUtils.rotate(tmp, tmp, page.findRotation());
                                    }

                                    try {
                                        String txt = doOcr(tmp);
                                        sb.append(txt).append(" ");
                                        log.debug("OCR Extracted: {}", txt);
                                    } finally {
                                        FileUtils.deleteQuietly(tmp);
                                    }
                                }
                            }
                        }
                    } finally {
                        FileUtils.deleteQuietly(tmpPdf);
                    }
                } else {
                    for (PDPage page : (List<PDPage>) document.getDocumentCatalog().getAllPages()) {
                        // A page may carry no resources of its own.
                        PDResources resources = page.getResources();
                        Map<String, PDXObject> images = (resources != null) ? resources.getXObjects() : null;

                        if (images != null) {
                            for (Map.Entry<String, PDXObject> entry : images.entrySet()) {
                                // The XObject map also holds non-image objects (e.g. forms); skip those
                                // instead of failing the whole extraction on a ClassCastException.
                                if (!(entry.getValue() instanceof PDXObjectImage)) {
                                    continue;
                                }

                                PDXObjectImage image = (PDXObjectImage) entry.getValue();
                                String prefix = "img-" + entry.getKey() + "-";
                                File pdfImg = null;

                                try {
                                    pdfImg = File.createTempFile(prefix, ".png");
                                    log.debug("Writing image: {}", pdfImg.getPath());

                                    // Won't work until PDFBox 1.8.9
                                    ImageIO.write(image.getRGBImage(), "png", pdfImg);

                                    if (page.findRotation() > 0) {
                                        ImageUtils.rotate(pdfImg, pdfImg, page.findRotation());
                                    }

                                    // Do OCR
                                    String txt = doOcr(pdfImg);
                                    sb.append(txt).append(" ");
                                    log.debug("OCR Extracted: {}", txt);
                                } finally {
                                    FileUtils.deleteQuietly(pdfImg);
                                }
                            }
                        }
                    }
                }

                return sb.toString();
            } else {
                return writer.toString();
            }
        } finally {
            try {
                PDDocument doc = parser.getPDDocument();
                if (doc != null) {
                    doc.close();
                }
            } catch (IOException e) {
                // ignore
            }
        }
    } catch (Exception e) {
        // it may happen that PDFParser throws a runtime
        // exception when parsing certain pdf documents
        log.warn("Failed to extract PDF text content", e);
        throw new IOException(e.getMessage(), e);
    } finally {
        stream.close();
    }
}

From source file:com.opensearchserver.extractor.parser.PdfBox.java

License:Apache License

/**
 * Extracts metadata and text content from the given document using PDFBox,
 * then closes the document.
 *
 * <p>An encrypted document is opened with the empty password before anything
 * is read from it. The string returned by the stripper is discarded here.
 * NOTE(review): presumably {@code Stripper} records the content as a side
 * effect of {@code getText} - confirm against its definition.
 *
 * @param pdf the document to read; it is closed by this method
 * @throws Exception if decryption, metadata extraction or text stripping fails
 */
private void parseContent(PDDocument pdf) throws Exception {
    try {
        final boolean encrypted = pdf.isEncrypted();
        if (encrypted) {
            pdf.openProtection(new StandardDecryptionMaterial(""));
        }
        extractMetaData(pdf);
        Stripper textStripper = new Stripper();
        textStripper.getText(pdf);
    } finally {
        if (pdf != null) {
            pdf.close();
        }
    }
}

From source file:com.santaanna.friendlyreader.pdfstod.GUI.PDFReader.java

License:Apache License

/**
 * This will parse a document./*from  w w  w  .j ava 2 s  . c om*/
 *
 * @param input The input stream for the document.
 *
 * @return The document.
 *
 * @throws IOException If there is an error parsing the document.
 */
private static PDDocument parseDocument(InputStream input) throws IOException {
    PDDocument document = PDDocument.load(input);
    if (document.isEncrypted()) {
        try {
            document.decrypt("");
        } catch (org.apache.pdfbox.exceptions.InvalidPasswordException e) {
            System.err.println("Error: The document is encrypted.");
        } catch (org.apache.pdfbox.exceptions.CryptographyException e) {
            e.printStackTrace();
        }
    }

    return document;
}

From source file:com.stimulus.archiva.extraction.PDFExtractor.java

License:Open Source License

/**
 * Extracts the text of a PDF into a temporary UTF-8 file and returns a reader
 * over that file.
 *
 * <p>An encrypted document is decrypted with the empty password. The temporary
 * file and the returned reader are both registered with {@code indexInfo}
 * ({@code addDeleteFile} / {@code addReader}). The {@code charset} argument is
 * not used by this implementation.
 *
 * @param is        stream containing the PDF
 * @param charset   unused
 * @param indexInfo receives the temporary file and the returned reader
 * @return a reader over the extracted text, decoded as UTF-8
 * @throws ExtractionException if parsing, decryption or text extraction fails,
 *         or if the extracted text cannot be opened for reading
 */
public Reader getText(InputStream is, Charset charset, IndexInfo indexInfo) throws ExtractionException {
    logger.debug("extracting pdf file");
    File file = null;
    PDDocument document = null;
    Writer output = null;
    try {
        PDFParser parser = new PDFParser(is);
        parser.parse();
        document = parser.getPDDocument();
        if (document.isEncrypted()) {
            DocumentEncryption decryptor = new DocumentEncryption(document);
            if (logger.isDebugEnabled()) {
                logger.debug("pdf document appears to be encrypted (will attempt decryption)");
            }
            decryptor.decryptDocument("");
        }
        file = File.createTempFile("extract_pdf", ".tmp");
        indexInfo.addDeleteFile(file);
        output = new OutputStreamWriter(new FileOutputStream(file), "UTF-8");
        PDFTextStripper stripper = new PDFTextStripper();
        stripper.writeText(document, output);
    } catch (Throwable e) {
        throw new ExtractionException("failed to extract pdf (probable password protected document)", e, logger,
                ChainedException.Level.DEBUG);
    } finally {
        // Close each resource independently so a failure on one does not leak the other.
        if (output != null) {
            try {
                output.close();
            } catch (IOException io) {
                logger.debug("failed to close pdf extraction output", io);
            }
        }
        if (document != null) {
            try {
                document.close();
            } catch (IOException io) {
                logger.debug("failed to close pdf document", io);
            }
        }
    }
    try {
        logger.debug("returning extracted PDF data");
        // The file was written as UTF-8 above, so it must be decoded as UTF-8 as well
        // rather than with the platform default charset.
        Reader outReader = new java.io.InputStreamReader(new java.io.FileInputStream(file), "UTF-8");
        indexInfo.addReader(outReader);
        return outReader;
    } catch (Exception ex) {
        throw new ExtractionException("failed to read extracted text from pdf document", ex, logger,
                ChainedException.Level.DEBUG);
    }
}

From source file:com.sustainalytics.crawlerfilter.PDFTitleGeneration.java

License:Apache License

/**
 * Extracts the creation date of a PDF file, falling back to the custom
 * metadata entry {@code customdate} (format {@code MM/dd/yyyy}) when the
 * document has no creation date.
 *
 * <p>An encrypted document is decrypted with the empty password; if that fails
 * the metadata is still read on a best-effort basis. The document is always
 * closed before returning, whatever happened while reading it.
 *
 * @param file is a File object
 * @return the date as {@code year-month-day} (no zero padding), or an empty
 *         string when the file cannot be read or carries no usable date
 */
public static String extractDate(File file) {
    PDDocument document = null;
    String creationDateMetaData = "";
    try {
        document = PDDocument.load(file.toString());

        if (document.isEncrypted()) {
            logger.info("File " + file.getName() + " is encrypted. Trying to decrypt...");
            try {
                document.decrypt("");
                document.setAllSecurityToBeRemoved(true);
                logger.info("File " + file.getName() + " successfully decrypted!");
            } catch (CryptographyException e) {
                logger.info("Error decrypting file " + file.getName());
            }
        }

        PDDocumentInformation info = document.getDocumentInformation();

        // Prefer the creation date; fall back to the custom date entry.
        Calendar calendar = info.getCreationDate();
        if (calendar == null) {
            String customValue = info.getCustomMetadataValue("customdate");
            if (customValue != null) {
                try {
                    Date customDate = new SimpleDateFormat("MM/dd/yyyy").parse(customValue);
                    calendar = Calendar.getInstance();
                    calendar.setTime(customDate);
                } catch (ParseException e) {
                    logger.info("Error parsing date from custom date");
                }
            }
        }

        if (calendar != null) {
            int creationYear = calendar.get(Calendar.YEAR);
            int creationMonth = calendar.get(Calendar.MONTH) + 1; // Calendar months are zero-based
            int creationDate = calendar.get(Calendar.DATE);
            creationDateMetaData = creationYear + "-" + creationMonth + "-" + creationDate;
        }
    } catch (IOException e) {
        // The PDF could not be read; it may be damaged.
        logger.info("Error processing file " + file.getName());
    } finally {
        // Close whenever a document was actually opened, including after a failed decryption.
        if (document != null) {
            try {
                document.close();
                logger.info("File " + file.getName() + " is closed successfully!");
            } catch (IOException e) {
                logger.info("Error closing file " + file.getName());
            }
        }
    }
    return creationDateMetaData;
}

From source file:com.tekstosense.segmenter.Main.java

License:Open Source License

/**
 * Loads the given PDF and runs a {@code TextExtractor} over it, discarding the
 * written text and returning the extractor itself.
 *
 * <p>NOTE(review): no decryption is attempted for encrypted documents; the only
 * action taken is {@code setAllSecurityToBeRemoved(false)}. The loaded document
 * is also not closed here - confirm whether the returned extractor still needs
 * it before adding a close.
 *
 * @param f the PDF file to parse
 * @return the extractor after it has processed the document
 * @throws IOException if the file cannot be loaded or processed
 */
private TextExtractor parsePdf(File f) throws IOException {
    final PDDocument pdf = PDDocument.load(f);

    final boolean encrypted = pdf.isEncrypted();
    if (encrypted) {
        try {
            // Keeps the document's security settings in place; this does not decrypt anything.
            pdf.setAllSecurityToBeRemoved(false);
        } catch (Exception cause) {
            throw new IOException("Can't decrypt document: ", cause);
        }
    }

    TextExtractor extractor = new TextExtractor();
    // The written text goes to an in-memory buffer that is never read.
    OutputStreamWriter sink = new OutputStreamWriter(new ByteArrayOutputStream());
    extractor.writeText(pdf, sink);
    return extractor;
}

From source file:com.tekstosense.segmenter.StructurePdf.PdfSections.java

License:Open Source License

/**
 * Loads the given PDF and runs a {@code TextExtractor} over it, discarding the
 * written text and returning the extractor itself.
 *
 * <p>NOTE(review): no decryption is attempted for encrypted documents; the only
 * action taken is {@code setAllSecurityToBeRemoved(false)}. The loaded document
 * is also not closed here - confirm whether the returned extractor still needs
 * it before adding a close.
 *
 * @param f the PDF file to parse
 * @return the extractor after it has processed the document
 * @throws IOException if the file cannot be loaded or processed
 */
private TextExtractor parsePdf(File f) throws IOException {
    final PDDocument pdf = PDDocument.load(f);

    final boolean encrypted = pdf.isEncrypted();
    if (encrypted) {
        try {
            // Keeps the document's security settings in place; this does not decrypt anything.
            pdf.setAllSecurityToBeRemoved(false);
        } catch (Exception cause) {
            throw new IOException("Can't decrypt document: ", cause);
        }
    }

    TextExtractor extractor = new TextExtractor();
    // The written text goes to an in-memory buffer that is never read.
    OutputStreamWriter sink = new OutputStreamWriter(new ByteArrayOutputStream());
    extractor.writeText(pdf, sink);
    return extractor;
}