Example usage for org.apache.pdfbox.pdmodel PDDocument isEncrypted

List of usage examples for org.apache.pdfbox.pdmodel PDDocument isEncrypted

Introduction

On this page you can find example usages of org.apache.pdfbox.pdmodel PDDocument isEncrypted.

Prototype

public boolean isEncrypted() 

Source Link

Document

This will tell if this document is encrypted or not.

Usage

From source file:ddf.catalog.transformer.input.pdf.PdfInputTransformer.java

License:Open Source License

/**
 * Builds the metacard for a PDF document.
 *
 * <p>The metacard is first initialised from {@code id} and {@code contentInput}. If the
 * document is encrypted, that freshly initialised metacard is returned untouched; otherwise
 * PDF metadata, a thumbnail (when the generator yields one) and a location (when the geo
 * parser returns a non-null value) are added before it is returned.
 *
 * @param id           identifier passed to {@code initializeMetacard}
 * @param pdfDocument  the loaded PDF to inspect
 * @param contentInput content passed to {@code initializeMetacard}
 * @return the populated (or, for encrypted PDFs, merely initialised) metacard
 * @throws IOException declared by the signature; raised by the helpers invoked here
 */
private Metacard transformPdf(String id, PDDocument pdfDocument, String contentInput) throws IOException {
    final MetacardImpl result = initializeMetacard(id, contentInput);

    if (!pdfDocument.isEncrypted()) {
        extractPdfMetadata(pdfDocument, result);

        pdfThumbnailGenerator.apply(pdfDocument)
                .ifPresent(result::setThumbnail);

        Optional.ofNullable(geoParser.apply(pdfDocument))
                .ifPresent(result::setLocation);
    } else {
        // Nothing can be read from an encrypted document; hand back the bare metacard.
        LOGGER.debug("Cannot transform encrypted pdf");
    }

    return result;
}

From source file:de.catma.document.source.contenthandler.PDFContentHandler.java

License:Open Source License

/**
 * Loads a PDF from the given stream, extracts its text and hands it to {@code setContent}.
 *
 * <p>Every character outside the ranges TAB, LF, CR, U+0020-U+D7FF, U+E000-U+FFFD and
 * U+10000-U+10FFFF is replaced by {@code "?"} before the content is stored, because some
 * PDFs contain invalid Unicode characters that cause problems when the text is converted
 * to HTML for GUI delivery and during indexing.
 *
 * @param is the stream to read the PDF from
 * @throws IOException if the document is encrypted, if the current access permission does
 *                     not allow content extraction, or if loading/extraction fails
 */
public void load(InputStream is) throws IOException {
    PDDocument document = null;
    try {
        document = PDDocument.load(is, false);

        if (document.isEncrypted()) {
            throw new IOException("can not open pdf document because it is encrypted");
        }

        AccessPermission ap = document.getCurrentAccessPermission();
        if (!ap.canExtractContent()) {
            throw new IOException("You do not have permission to extract text");
        }

        PDFTextStripper stripper = new PDFTextStripper("UTF-8");

        stripper.setForceParsing(false);
        stripper.setSortByPosition(false);
        stripper.setShouldSeparateByBeads(true);
        stripper.setStartPage(1);
        stripper.setEndPage(Integer.MAX_VALUE);

        // Encode and decode with the same explicit charset; relying on the platform
        // default would lose every character that default charset cannot represent.
        ByteArrayOutputStream os = new ByteArrayOutputStream();
        Writer w = new OutputStreamWriter(os, "UTF-8");
        try {
            stripper.writeText(document, w);
        } finally {
            w.close();
        }

        // some pdfs seem to include non valid unicode characters
        // and this causes problems when converting text to HTML
        // for GUI delivery and during indexing.
        // Supplementary code points must be written as \x{...}: "\\u10000" would be parsed
        // as U+1000 followed by the literal '0', which silently dropped that range.
        setContent(os.toString("UTF-8").replaceAll(
                "[^\\x09\\x0A\\x0D\\x20-\\uD7FF\\uE000-\\uFFFD\\x{10000}-\\x{10FFFF}]",
                "?"));
    } finally {
        if (document != null) {
            document.close();
        }
    }
}

From source file:de.ilias.services.lucene.index.file.PDFBoxPDFHandler.java

License:Open Source License

/**
 * @throws IOException /*from   w  w  w  . j  a  v  a  2s  . c  om*/
 * @see de.ilias.services.lucene.index.file.FileHandler#getContent(java.io.InputStream)
 */
public String getContent(InputStream is) throws FileHandlerException {

    PDDocument pddo = null;
    PDFTextStripper stripper = null;
    String str = new String("");

    try {

        pddo = PDDocument.load(is);

        if (pddo.isEncrypted()) {
            logger.warn("PDF Document is encrypted. Trying empty password...");
            return "";
        }
        stripper = new PDFTextStripper();
        str = stripper.getText(pddo);
    } catch (NumberFormatException e) {
        logger.warn("Invalid PDF version number given. Aborting");
    } catch (IOException e) {
        logger.warn(e.getMessage());
        throw new FileHandlerException(e);
    } catch (Exception e) {
        logger.error(e.getMessage());
        throw new FileHandlerException(e);
    } finally {
        try {
            if (pddo != null)
                pddo.close();
        } catch (IOException e) {
            ;
        }
    }
    return str;
}

From source file:de.tudarmstadt.ukp.dkpro.core.io.pdf.Pdf2CasConverter.java

License:Apache License

/**
 * Loads a PDF from the stream and writes its text for the given CAS.
 *
 * <p>For an unencrypted document the target CAS is remembered, the text buffer is reset
 * and processing is delegated to {@code writeText(PDDocument)}. The document is closed
 * in every case once it has been loaded.
 *
 * @param aCas the CAS stored for the subsequent processing
 * @param aIs  the stream to load the PDF from
 * @throws IOException if the document is encrypted or loading/processing fails
 */
public void writeText(final CAS aCas, final InputStream aIs) throws IOException {
    final PDDocument pdf = PDDocument.load(aIs);

    try {
        final boolean readable = !pdf.isEncrypted();
        if (readable) {
            cas = aCas;
            text = new StringBuilder();

            writeText(pdf);
        } else {
            throw new IOException("Encrypted documents currently not supported");
        }
    } finally {
        pdf.close();
    }
}

From source file:editorframework.pdfbox.PDFBoxDocumentAdaptee.java

/**
 * Loads the PDF with the given file name and, if it is encrypted, tries to decrypt it
 * with the empty password.
 *
 * <p>A {@code CryptographyException} during decryption is reported on standard error and
 * the document is returned as loaded. Any other failure after loading closes the document
 * before the exception is propagated, so the handle is not leaked.
 *
 * @param filename name of the PDF file to load
 * @return the loaded document; the caller is responsible for closing it
 * @throws IOException if loading or decrypting the document fails
 */
private static PDDocument parseDocument(String filename) throws IOException {
    PDDocument document = PDDocument.load(filename);
    boolean handedOver = false;
    try {
        if (document.isEncrypted()) {
            try {
                document.decrypt("");
            } catch (org.apache.pdfbox.exceptions.CryptographyException e) {
                // Best effort: keep the original behaviour of reporting the problem
                // and returning the document as it was loaded.
                e.printStackTrace();
            }
        }
        handedOver = true;
        return document;
    } finally {
        if (!handedOver) {
            // An exception is propagating and the caller never receives the document.
            try {
                document.close();
            } catch (IOException ignored) {
                // Do not mask the original failure with a close error.
            }
        }
    }
}

From source file:editorframework.pdfbox.testes.PDFReaderAdaptor.java

License:Apache License

/**
 * Loads a PDF from the given stream and, if it is encrypted, tries to decrypt it with the
 * empty password.
 *
 * <p>A {@code CryptographyException} during decryption is reported on standard error and
 * the document is returned as loaded. Any other failure after loading closes the document
 * before the exception is propagated, so the handle is not leaked.
 *
 * @param input stream providing the PDF data
 * @return the loaded document; the caller is responsible for closing it
 * @throws IOException if loading or decrypting the document fails
 */
private static PDDocument parseDocument(InputStream input) throws IOException {
    PDDocument document = PDDocument.load(input);
    boolean handedOver = false;
    try {
        if (document.isEncrypted()) {
            try {
                document.decrypt("");
            } catch (org.apache.pdfbox.exceptions.CryptographyException e) {
                // Best effort: keep the original behaviour of reporting the problem
                // and returning the document as it was loaded.
                e.printStackTrace();
            }
        }
        handedOver = true;
        return document;
    } finally {
        if (!handedOver) {
            // An exception is propagating and the caller never receives the document.
            try {
                document.close();
            } catch (IOException ignored) {
                // Do not mask the original failure with a close error.
            }
        }
    }
}

From source file:edu.ur.ir.index.DefaultPdfTextExtractor.java

License:Apache License

/**
 * Extract text from the PDF document/* ww w . j a  va  2  s  . c o m*/
 * @throws Exception 
 * 
 * @see edu.ur.ir.index.FileTextExtractor#getText(java.io.File)
 */
public String getText(File f) throws Exception {
    String text = null;

    // don't even try if the file is too large
    if (isFileTooLarge(f) || f.length() <= 0l) {
        return text;
    }
    PDDocument pdDoc = null;
    try {
        pdDoc = PDDocument.load(f);

        // don't do anything with decripted docs
        if (!pdDoc.isEncrypted()) {
            PDFTextStripper stripper = new PDFTextStripper();
            String myText = stripper.getText(pdDoc);

            if (myText != null && !myText.trim().equals("")) {
                text = myText;
            }

        } else {
            log.error("pdf " + f.getAbsolutePath() + " is encrypted and "
                    + " cannot be decrypted because we don't have a password");
        }

    } catch (OutOfMemoryError oome) {
        text = null;
        log.error("could not extract text", oome);
        throw (oome);
    } catch (Exception e) {
        log.error("could not extract text with other error", e);
        text = null;
        throw (e);
    } finally {
        closePDDocument(pdDoc);
        pdDoc = null;
    }

    return text;

}

From source file:edu.uwm.jiaoduan.lab.ExtractTextByArea.java

License:Apache License

/**
 * This will print the document's text in a certain area of its first page.
 *
 * <p>The file to read is taken from the single command line argument; when no argument is
 * given, {@code test.pdf} is used. With more than one argument the usage message is shown.
 * An encrypted document is decrypted with the empty password; if that password is
 * rejected, an error is printed and the program exits with status 1.
 *
 * @param args The command line arguments.
 *
 * @throws Exception If there is an error parsing the document.
 */
public static void main(String[] args) throws Exception {
    // Default to test.pdf only when nothing was supplied. Overwriting args unconditionally
    // ignored the caller's file and made the usage branch below unreachable.
    if (args.length == 0) {
        args = new String[] { "test.pdf" };
    }
    if (args.length != 1) {
        usage();
    } else {
        PDDocument document = null;
        try {
            document = PDDocument.load(args[0]);
            if (document.isEncrypted()) {
                try {
                    document.decrypt("");
                } catch (InvalidPasswordException e) {
                    System.err.println("Error: Document is encrypted with a password.");
                    System.exit(1);
                }
            }
            PDFTextStripperByArea stripper = new PDFTextStripperByArea();
            stripper.setSortByPosition(true);

            List<?> allPages = document.getDocumentCatalog().getAllPages();
            if (allPages.isEmpty()) {
                // Without a first page there is no region to extract from.
                System.err.println("Error: Document has no pages.");
                return;
            }
            PDPage firstPage = (PDPage) allPages.get(0);

            double pageHeight = firstPage.getMediaBox().getHeight();

            // convert xfdf coordinate to rectangle:
            // coords holds two opposite corners as { x0, y0, x1, y1 }
            double[] coords = new double[] { 58.50615, 500.847504, 302.919073, 552.419312 };

            Rectangle2D.Double rect = new Rectangle2D.Double();
            rect.height = coords[3] - coords[1];
            rect.width = coords[2] - coords[0];
            rect.x = coords[0];
            // the vertical position is taken relative to the page height
            rect.y = pageHeight - coords[1] - rect.height;
            System.out.println(rect);

            stripper.addRegion("class1", rect);
            stripper.extractRegions(firstPage);

            System.out.println("Text in the area:" + rect);
            System.out.println(stripper.getTextForRegion("class1"));

        } finally {
            if (document != null) {
                document.close();
            }
        }
    }
}

From source file:fr.acxio.tools.agia.file.pdf.AbstractPDDocumentFactory.java

License:Apache License

/**
 * Loads a PDF from the given stream and decrypts it when it is encrypted.
 *
 * <p>The password is read from {@code sParameters} under
 * {@code PDDocumentFactoryConstants.PARAM_PASSWORD}. If decryption fails, the loaded
 * document is closed before the exception is propagated, so the handle is not leaked.
 *
 * @param sInputStream stream providing the PDF data
 * @param sParameters  factory parameters; must not be {@code null}
 * @return the loaded (and, if necessary, decrypted) document; the caller must close it
 * @throws IOException              if loading or decrypting fails on I/O
 * @throws CryptographyException    declared by the signature; see the decryption call
 * @throws InvalidPasswordException declared by the signature; see the decryption call
 */
protected PDDocument loadDocument(InputStream sInputStream, Map<String, Object> sParameters)
        throws IOException, CryptographyException, InvalidPasswordException {
    PDDocument aDocument = PDDocument.load(sInputStream);
    boolean aHandedOver = false;
    try {
        if (aDocument.isEncrypted()) {
            aDocument.decrypt((String) sParameters.get(PDDocumentFactoryConstants.PARAM_PASSWORD));
        }
        aHandedOver = true;
        return aDocument;
    } finally {
        if (!aHandedOver) {
            // An exception is propagating and the caller never receives the document.
            try {
                aDocument.close();
            } catch (IOException ignored) {
                // Do not mask the original failure with a close error.
            }
        }
    }
}

From source file:fr.acxio.tools.agia.file.pdf.AbstractPDDocumentFactory.java

License:Apache License

/**
 * Loads a PDF from the given file and decrypts it when it is encrypted.
 *
 * <p>When {@code PARAM_NONSEQ} is {@code Boolean.TRUE} in {@code sParameters}, the document
 * is loaded through {@code PDDocument.loadNonSeq}, which receives the scratch file and the
 * password directly. Otherwise the document is loaded normally and, if encrypted,
 * decrypted with the password from {@code PDDocumentFactoryConstants.PARAM_PASSWORD}; if
 * that decryption fails, the loaded document is closed before the exception is propagated.
 *
 * @param sFile       the PDF file to load
 * @param sParameters factory parameters; must not be {@code null}
 * @return the loaded (and, if necessary, decrypted) document; the caller must close it
 * @throws IOException              if loading or decrypting fails on I/O
 * @throws CryptographyException    declared by the signature; see the decryption call
 * @throws InvalidPasswordException declared by the signature; see the decryption call
 */
protected PDDocument loadDocument(File sFile, Map<String, Object> sParameters)
        throws IOException, CryptographyException, InvalidPasswordException {
    if (Boolean.TRUE.equals(sParameters.get(PARAM_NONSEQ))) {
        return PDDocument.loadNonSeq(sFile,
                (RandomAccess) sParameters.get(PDDocumentFactoryConstants.PARAM_SCRATCHFILE),
                (String) sParameters.get(PDDocumentFactoryConstants.PARAM_PASSWORD));
    }

    PDDocument aDocument = PDDocument.load(sFile);
    boolean aHandedOver = false;
    try {
        if (aDocument.isEncrypted()) {
            aDocument.decrypt((String) sParameters.get(PDDocumentFactoryConstants.PARAM_PASSWORD));
        }
        aHandedOver = true;
        return aDocument;
    } finally {
        if (!aHandedOver) {
            // An exception is propagating and the caller never receives the document.
            try {
                aDocument.close();
            } catch (IOException ignored) {
                // Do not mask the original failure with a close error.
            }
        }
    }
}