Example usage for org.apache.pdfbox.pdmodel PDDocument getDocumentCatalog

Introduction

On this page you can find example usages of org.apache.pdfbox.pdmodel.PDDocument.getDocumentCatalog().

Prototype

public PDDocumentCatalog getDocumentCatalog()

Source Link

Document

This will get the document catalog.

Usage

From source file:com.liferay.portlet.documentlibrary.util.PDFProcessorImpl.java

License:Open Source License

/**
 * Loads a PDF from the given stream with PDFBox and generates the thumbnail
 * and/or preview images requested for the file version.
 *
 * <p>The first page is passed to the per-page generator with index {@code 0}
 * when a thumbnail is requested. When previews are requested, every page is
 * passed with its one-based page number. If previews are not requested the
 * loop stops after the first page.</p>
 *
 * @param fileVersion the file version the images are generated for
 * @param inputStream the stream the PDF document is loaded from
 * @throws Exception if loading the document or generating an image fails
 */
private void _generateImagesPB(FileVersion fileVersion, InputStream inputStream) throws Exception {

    final boolean previewRequested = _isGeneratePreview(fileVersion);
    final boolean thumbnailRequested = _isGenerateThumbnail(fileVersion);

    PDDocument pdfDocument = null;

    try {
        pdfDocument = PDDocument.load(inputStream);

        List<PDPage> pageList = pdfDocument.getDocumentCatalog().getAllPages();

        for (int pageIndex = 0; pageIndex < pageList.size(); pageIndex++) {
            PDPage currentPage = pageList.get(pageIndex);

            boolean firstPage = (pageIndex == 0);

            if (thumbnailRequested && firstPage) {
                // Index 0 is the slot used for the thumbnail call.
                _generateImagesPB(fileVersion, currentPage, pageIndex);

                if (_log.isInfoEnabled()) {
                    _log.info("PDFBox generated a thumbnail for " + fileVersion.getFileVersionId());
                }
            }

            if (previewRequested) {
                // Preview pages are numbered starting at 1.
                _generateImagesPB(fileVersion, currentPage, pageIndex + 1);
            } else {
                // Only the thumbnail was wanted; no need to visit further pages.
                break;
            }
        }

        if (_log.isInfoEnabled() && previewRequested) {
            _log.info("PDFBox generated " + getPreviewFileCount(fileVersion) + " preview pages for "
                    + fileVersion.getFileVersionId());
        }
    } finally {
        if (pdfDocument != null) {
            pdfDocument.close();
        }
    }
}

From source file:com.lp.server.system.ejbfac.SystemFacBean.java

License:Open Source License

/**
 * Converts every page of a PDF file into an image and returns the images as
 * byte arrays (one entry per page, in page order).
 *
 * <p>Each page is rendered as a {@code BufferedImage.TYPE_INT_RGB} image at
 * the given resolution and then serialised with
 * {@code Helper.imageToByteArray}.</p>
 *
 * @param pdfFile    the PDF file to load, as accepted by {@code PDDocument.load(String)}
 * @param resolution the resolution passed to {@code PDPage.convertToImage}
 * @return one byte array per page of the document
 * @throws EJBExceptionLP with code {@code FEHLER} if the document cannot be
 *         read or rendered, or if closing it fails after a successful conversion
 */
public byte[][] konvertierePDFFileInEinzelneBilder(String pdfFile, int resolution) {
    PDDocument document = null;
    // Tracks whether the conversion finished, so that a failure while closing
    // the document never replaces an exception that is already propagating.
    boolean converted = false;
    try {
        document = PDDocument.load(pdfFile);
        List pages = document.getDocumentCatalog().getAllPages();

        byte[][] oBilder = new byte[pages.size()][];

        for (int i = 0; i < pages.size(); i++) {
            PDPage page = (PDPage) pages.get(i);
            BufferedImage image = page.convertToImage(BufferedImage.TYPE_INT_RGB, resolution);
            oBilder[i] = Helper.imageToByteArray(image);
        }
        converted = true;
        return oBilder;
    } catch (IOException e) {
        e.printStackTrace();
        throw new EJBExceptionLP(EJBExceptionLP.FEHLER, e.getMessage());
    } finally {
        if (document != null) {
            try {
                document.close();
            } catch (IOException e) {
                e.printStackTrace();
                if (converted) {
                    // Only report the close failure when there is no earlier,
                    // more relevant error on its way to the caller.
                    throw new EJBExceptionLP(EJBExceptionLP.FEHLER, e.getMessage());
                }
            }
        }
    }
}

From source file:com.modemo.javase.signature.SigUtils.java

License:Apache License

/**
 * Reads the DocMDP access permission of a document.
 *
 * <p>The lookup follows catalog {@code /Perms} -> {@code /DocMDP} ->
 * {@code /Reference} and uses the first reference dictionary whose
 * {@code /TransformMethod} is {@code DocMDP} and that carries a
 * {@code /TransformParams} dictionary. See the table "Entries in the DocMDP
 * transform parameters dictionary" in the PDF specification.</p>
 *
 * @param doc the document to inspect.
 * @return 0 if no DocMDP transform parameters dictionary is found; otherwise
 * the value of its /P entry (1, 2 or 3). 2 is returned when /P is missing or
 * outside the range 1..3.
 */
public static int getMDPPermission(PDDocument doc) {
    COSBase perms = doc.getDocumentCatalog().getCOSObject().getDictionaryObject(COSName.PERMS);
    if (!(perms instanceof COSDictionary)) {
        return 0;
    }

    COSBase docMdp = ((COSDictionary) perms).getDictionaryObject(COSName.DOCMDP);
    if (!(docMdp instanceof COSDictionary)) {
        return 0;
    }

    COSBase references = ((COSDictionary) docMdp).getDictionaryObject("Reference");
    if (!(references instanceof COSArray)) {
        return 0;
    }

    COSArray referenceArray = (COSArray) references;
    for (int index = 0; index < referenceArray.size(); ++index) {
        COSBase entry = referenceArray.getObject(index);
        if (!(entry instanceof COSDictionary)) {
            continue;
        }

        COSDictionary sigRef = (COSDictionary) entry;
        if (!COSName.DOCMDP.equals(sigRef.getDictionaryObject("TransformMethod"))) {
            continue;
        }

        COSBase params = sigRef.getDictionaryObject("TransformParams");
        if (!(params instanceof COSDictionary)) {
            continue;
        }

        // Missing /P defaults to 2, and so does any out-of-range value.
        int permission = ((COSDictionary) params).getInt(COSName.P, 2);
        return (permission < 1 || permission > 3) ? 2 : permission;
    }
    return 0;
}

From source file:com.modemo.javase.signature.SigUtils.java

License:Apache License

/**
 * Set the access permissions granted for this document in the DocMDP transform parameters
 * dictionary. Details are described in the table "Entries in the DocMDP transform parameters
 * dictionary" in the PDF specification.
 *
 * @param doc The document./*w ww . j av  a2  s  .c o  m*/
 * @param signature The signature object.
 * @param accessPermissions The permission value (1, 2 or 3).
 */
static public void setMDPPermission(PDDocument doc, PDSignature signature, int accessPermissions) {
    COSDictionary sigDict = signature.getCOSObject();

    // DocMDP specific stuff
    COSDictionary transformParameters = new COSDictionary();
    transformParameters.setItem(COSName.TYPE, COSName.getPDFName("TransformParams"));
    transformParameters.setInt(COSName.P, accessPermissions);
    transformParameters.setName(COSName.V, "1.2");
    transformParameters.setNeedToBeUpdated(true);

    COSDictionary referenceDict = new COSDictionary();
    referenceDict.setItem(COSName.TYPE, COSName.getPDFName("SigRef"));
    referenceDict.setItem("TransformMethod", COSName.DOCMDP);
    referenceDict.setItem("DigestMethod", COSName.getPDFName("SHA1"));
    referenceDict.setItem("TransformParams", transformParameters);
    referenceDict.setNeedToBeUpdated(true);

    COSArray referenceArray = new COSArray();
    referenceArray.add(referenceDict);
    sigDict.setItem("Reference", referenceArray);
    referenceArray.setNeedToBeUpdated(true);

    // Catalog
    COSDictionary catalogDict = doc.getDocumentCatalog().getCOSObject();
    COSDictionary permsDict = new COSDictionary();
    catalogDict.setItem(COSName.PERMS, permsDict);
    permsDict.setItem(COSName.DOCMDP, signature);
    catalogDict.setNeedToBeUpdated(true);
    permsDict.setNeedToBeUpdated(true);
}

From source file:com.mycompany.mavenproject1.ragaiproject.PDFManipulation.java

/**
 * Collects the partial names of the fields of the document's AcroForm.
 *
 * <p>For every field the partial name is stored in the returned array and in
 * {@code arrayOfFieldNames} at the same index, and a name/value pair is
 * appended to {@code fieldLabelPairs}. If at least one field was found, the
 * given list is populated with the returned array.</p>
 *
 * @param pdfDocument the document whose form fields are read
 * @param list        the Swing list that displays the collected names
 * @return an array of length 10000 whose leading entries are the field names;
 *         unused entries are {@code null}. All entries are {@code null} when
 *         the document has no AcroForm.
 */
public String[] getFieldNames(PDDocument pdfDocument, JList list) {
    // NOTE(review): fixed capacity kept for compatibility with callers; a form
    // with more than 10000 fields would still overflow this array.
    String[] names = new String[10000];

    PDDocumentCatalog docCatalog = pdfDocument.getDocumentCatalog();
    PDAcroForm acroForm = docCatalog.getAcroForm();
    if (acroForm == null) {
        // A PDF without an interactive form has no AcroForm entry at all.
        return names;
    }

    java.util.List<PDField> fields = acroForm.getFields();

    int i = 0;
    for (PDField field : fields) {
        String partialName = field.getPartialName();
        names[i] = partialName;
        this.arrayOfFieldNames[i] = partialName;
        this.fieldLabelPairs.add(new Pair(partialName, field.getValueAsString()));
        System.out.println(this.fieldLabelPairs.size());
        i++;
    }

    // Populate the list once instead of once per field. It is left untouched
    // when there are no fields, exactly as before.
    if (i > 0) {
        list.setListData(names);
    }

    return names;
}

From source file:com.netsteadfast.greenstep.util.PdfConvertUtils.java

License:Apache License

/**
 * Renders every page of a PDF file to a PNG file in a fresh temporary
 * directory below {@code Constants.getWorkTmpDir()}.
 *
 * <p>Files are named after the one-based page number, left-padded with zeros
 * to a common width, with the extension {@code .png}.</p>
 *
 * @param pdfFile    the PDF file to convert
 * @param resolution the resolution used both for rendering and for writing the images
 * @return the generated image files, in page order
 * @throws Exception if the document cannot be loaded, rendered or written
 */
@SuppressWarnings("unchecked")
public static List<File> toImageFiles(File pdfFile, int resolution) throws Exception {
    PDDocument document = PDDocument.loadNonSeq(pdfFile, null);
    try {
        List<PDPage> pages = document.getDocumentCatalog().getAllPages();
        File tmpDir = new File(Constants.getWorkTmpDir() + "/" + PdfConvertUtils.class.getSimpleName() + "/"
                + System.currentTimeMillis() + "/");
        FileUtils.forceMkdir(tmpDir);
        List<File> files = new LinkedList<File>();
        int len = String.valueOf(pages.size() + 1).length();
        for (int i = 0; i < pages.size(); i++) {
            String name = StringUtils.leftPad(String.valueOf(i + 1), len, "0");
            BufferedImage bufImage = pages.get(i).convertToImage(BufferedImage.TYPE_INT_RGB, resolution);
            File imageFile = new File(tmpDir.getPath() + "/" + name + ".png");
            FileOutputStream fos = new FileOutputStream(imageFile);
            try {
                ImageIOUtil.writeImage(bufImage, "png", fos, resolution);
                fos.flush();
            } finally {
                // Release the file handle even when writing the image fails.
                fos.close();
            }
            files.add(imageFile);
        }
        return files;
    } finally {
        // Always release the document, also when a page fails to render.
        document.close();
    }
}

From source file:com.odc.pdfextractor.parser.CleanPdfParser.java

License:Apache License

/**
 * Loads the given PDF and processes the content stream of every page.
 *
 * <p>An encrypted document is decrypted with the empty password first. Each
 * page that has a content stream is passed to {@code processStream}, and
 * {@code docBuilder} is advanced by one page for every page of the document,
 * whether or not it had content. Progress is printed to standard output.</p>
 *
 * @param filename The PDF file to load.
 * @return The document obtained from {@code docBuilder} after all pages were processed.
 *
 * @throws Exception If there is an error parsing the document.
 */
public DocumentLocation processPdf(String filename) throws Exception {

    PDDocument document = null;
    try {
        document = PDDocument.load(filename);
        if (document.isEncrypted()) {
            try {
                document.decrypt("");
            } catch (InvalidPasswordException e) {
                // NOTE(review): this terminates the whole JVM from inside a
                // parsing method; kept as-is because callers may rely on it.
                System.err.println("Error: Document is encrypted with a password.");
                System.exit(1);
            }
        }
        List allPages = document.getDocumentCatalog().getAllPages();
        System.out.print("Extracting text from PDF");
        for (int i = 0; i < allPages.size(); i++) {
            PDPage page = (PDPage) allPages.get(i);
            System.out.print(".");
            PDStream contents = page.getContents();
            if (contents != null) {
                // Reuse the stream fetched above instead of looking it up again.
                this.processStream(page, page.findResources(), contents.getStream());
            }
            docBuilder.incrementPage();
        }
    } finally {
        // Terminates the progress line started above.
        System.out.println();
        if (document != null) {
            document.close();
        }
    }
    return docBuilder.getDoc();
}

From source file:com.openkm.extractor.PdfTextExtractor.java

License:Open Source License

/**
 * {@inheritDoc}
 *
 * <p>Parses the stream as a PDF, decrypting it with the empty password when
 * it is encrypted, and extracts its text with {@code PDFTextStripper}. When
 * OCR is forced by configuration or the stripped text is at most one
 * character long, the text is obtained by OCR instead:</p>
 * <ul>
 * <li>if {@code Config.SYSTEM_PDFIMAGES} is set, the document is saved to a
 * temporary file and the configured command is run once per page; every file
 * it produces is rotated if needed, passed to OCR and deleted;</li>
 * <li>otherwise the image XObjects of every page are written to temporary
 * PNG files, rotated if needed, passed to OCR and deleted.</li>
 * </ul>
 *
 * <p>The given stream is always closed before returning.</p>
 *
 * @throws IOException if parsing, decryption, text extraction or OCR fails;
 *         any other exception is wrapped in an {@code IOException}
 */
@SuppressWarnings("rawtypes")
public String extractText(InputStream stream, String type, String encoding) throws IOException {
    try {
        PDFParser parser = new PDFParser(new BufferedInputStream(stream));

        try {
            parser.parse();
            PDDocument document = parser.getPDDocument();

            if (document.isEncrypted()) {
                try {
                    document.decrypt("");
                    document.setAllSecurityToBeRemoved(true);
                } catch (Exception e) {
                    throw new IOException("Unable to extract text: document encrypted", e);
                }
            }

            CharArrayWriter writer = new CharArrayWriter();
            PDFTextStripper stripper = new PDFTextStripper();
            stripper.setLineSeparator("\n");
            stripper.writeText(document, writer);
            String st = writer.toString().trim();
            log.debug("TextStripped: '{}'", st);

            if (Config.SYSTEM_PDF_FORCE_OCR || st.length() <= 1) {
                log.warn("PDF does not contains text layer");

                // Extract images from PDF
                StringBuilder sb = new StringBuilder();

                if (!Config.SYSTEM_PDFIMAGES.isEmpty()) {
                    File tmpPdf = FileUtils.createTempFile("pdf");
                    File tmpDir = new File(EnvironmentDetector.getTempDir());
                    String baseName = FileUtils.getFileName(tmpPdf.getName());
                    int pgNum = 1;

                    try {
                        // Saved inside the try so the temporary file is removed
                        // even when writing it fails half-way.
                        document.save(tmpPdf);

                        for (PDPage page : (List<PDPage>) document.getDocumentCatalog().getAllPages()) {
                            HashMap<String, Object> hm = new HashMap<String, Object>();
                            hm.put("fileIn", tmpPdf.getPath());
                            hm.put("firstPage", pgNum);
                            hm.put("lastPage", pgNum++);
                            hm.put("imageRoot", tmpDir + File.separator + baseName);
                            String cmd = TemplateUtils.replace("SYSTEM_PDFIMAGES", Config.SYSTEM_PDFIMAGES, hm);
                            ExecutionUtils.runCmd(cmd);

                            // listFiles() returns null when the directory cannot be read.
                            File[] extracted = tmpDir.listFiles();

                            if (extracted != null) {
                                for (File tmp : extracted) {
                                    if (tmp.getName().startsWith(baseName + "-")) {
                                        if (page.findRotation() > 0) {
                                            ImageUtils.rotate(tmp, tmp, page.findRotation());
                                        }

                                        try {
                                            String txt = doOcr(tmp);
                                            sb.append(txt).append(" ");
                                            log.debug("OCR Extracted: {}", txt);
                                        } finally {
                                            FileUtils.deleteQuietly(tmp);
                                        }
                                    }
                                }
                            }
                        }
                    } finally {
                        FileUtils.deleteQuietly(tmpPdf);
                    }
                } else {
                    for (PDPage page : (List<PDPage>) document.getDocumentCatalog().getAllPages()) {
                        PDResources resources = page.getResources();
                        // A page is not required to carry its own resources dictionary.
                        Map<String, PDXObject> images = (resources != null) ? resources.getXObjects() : null;

                        if (images != null) {
                            for (String key : images.keySet()) {
                                PDXObject xObject = images.get(key);

                                // The XObject map also holds non-image objects (e.g. forms);
                                // only images can be rendered and passed to OCR.
                                if (!(xObject instanceof PDXObjectImage)) {
                                    continue;
                                }

                                PDXObjectImage image = (PDXObjectImage) xObject;
                                String prefix = "img-" + key + "-";
                                File pdfImg = null;

                                try {
                                    pdfImg = File.createTempFile(prefix, ".png");
                                    log.debug("Writing image: {}", pdfImg.getPath());

                                    // Won't work until PDFBox 1.8.9
                                    ImageIO.write(image.getRGBImage(), "png", pdfImg);

                                    if (page.findRotation() > 0) {
                                        ImageUtils.rotate(pdfImg, pdfImg, page.findRotation());
                                    }

                                    // Do OCR
                                    String txt = doOcr(pdfImg);
                                    sb.append(txt).append(" ");
                                    log.debug("OCR Extracted: {}", txt);
                                } finally {
                                    FileUtils.deleteQuietly(pdfImg);
                                }
                            }
                        }
                    }
                }

                return sb.toString();
            } else {
                return writer.toString();
            }
        } finally {
            try {
                PDDocument doc = parser.getPDDocument();
                if (doc != null) {
                    doc.close();
                }
            } catch (IOException e) {
                // ignore: a failure while closing must not hide the extraction result
            }
        }
    } catch (Exception e) {
        // it may happen that PDFParser throws a runtime
        // exception when parsing certain pdf documents
        log.warn("Failed to extract PDF text content", e);
        throw new IOException(e.getMessage(), e);
    } finally {
        stream.close();
    }
}

From source file:com.opensearchserver.extractor.parser.PdfBox.java

License:Apache License

/**
 * Copies the metadata of a PDF document into {@code metas}.
 *
 * <p>When the document has an information dictionary, its title, subject,
 * author, producer, keywords, creation date and modification date are added.
 * The page count is always added, and the catalog language is added when the
 * document has a catalog.</p>
 *
 * @param pdf the document to read the metadata from
 * @throws IOException declared by the method; presumably raised by the date helpers
 */
private void extractMetaData(PDDocument pdf) throws IOException {
    final PDDocumentInformation documentInfo = pdf.getDocumentInformation();
    if (documentInfo != null) {
        metas.add(TITLE, documentInfo.getTitle());
        metas.add(SUBJECT, documentInfo.getSubject());
        metas.add(AUTHOR, documentInfo.getAuthor());
        metas.add(PRODUCER, documentInfo.getProducer());
        metas.add(KEYWORDS, documentInfo.getKeywords());
        metas.add(CREATION_DATE, getDate(getCreationDate(documentInfo)));
        metas.add(MODIFICATION_DATE, getModificationDate(documentInfo));
    }

    metas.add(NUMBER_OF_PAGES, pdf.getNumberOfPages());

    final PDDocumentCatalog documentCatalog = pdf.getDocumentCatalog();
    if (documentCatalog != null) {
        metas.add(LANGUAGE, documentCatalog.getLanguage());
    }
}

From source file:com.qwazr.library.pdfbox.PdfBoxParser.java

License:Apache License

/**
 * Copies the metadata of a PDF document into the given fields builder.
 *
 * <p>The MIME type is set first. When the document has an information
 * dictionary, its title, subject, author, producer, keywords, creation date
 * and modification date are added. The page count is always added, and the
 * catalog language is added when the document has a catalog.</p>
 *
 * @param pdf   the document to read the metadata from
 * @param metas the builder receiving the extracted fields
 */
private void extractMetaData(final PDDocument pdf, final ParserFieldsBuilder metas) {
    metas.set(MIME_TYPE, DEFAULT_MIMETYPES[0]);

    final PDDocumentInformation documentInfo = pdf.getDocumentInformation();
    if (documentInfo != null) {
        metas.add(TITLE, documentInfo.getTitle());
        metas.add(SUBJECT, documentInfo.getSubject());
        metas.add(AUTHOR, documentInfo.getAuthor());
        metas.add(PRODUCER, documentInfo.getProducer());
        metas.add(KEYWORDS, documentInfo.getKeywords());
        metas.add(CREATION_DATE, documentInfo.getCreationDate());
        metas.add(MODIFICATION_DATE, documentInfo.getModificationDate());
    }

    metas.add(NUMBER_OF_PAGES, pdf.getNumberOfPages());

    final PDDocumentCatalog documentCatalog = pdf.getDocumentCatalog();
    if (documentCatalog != null) {
        metas.add(LANGUAGE, documentCatalog.getLanguage());
    }
}