Example usage for org.apache.pdfbox.cos COSDocument close

List of usage examples for org.apache.pdfbox.cos COSDocument close

Introduction

On this page you can find example usages of org.apache.pdfbox.cos COSDocument close.

Prototype

@Override
public void close() throws IOException 

Source Link

Document

This will close all storage and delete the temporary files.

Usage

From source file:ExtractTextFromPdf.java

/**
 * Loads a hard-coded PDF file, extracts its text and prints it to standard output after
 * removing every character that is not an ASCII letter, a digit, a dot or a space.
 *
 * <p>Any failure is reported via {@code printStackTrace()}; the document is always closed.
 *
 * @param args ignored
 */
public static void main(String[] args) {

    String fileName = "C:/Users/Kavya Gupta/Desktop/Texas_Title.pdf";
    PDDocument pdDoc = null;
    try {
        // The original code cast the java.io.File to RandomAccessRead before loading, which
        // always failed with a ClassCastException; the probe read was unused and is removed.
        pdDoc = PDDocument.load(new File(fileName));
        PDFTextStripper pdfStripper = new PDFTextStripper();
        String parsedText = pdfStripper.getText(pdDoc);
        System.out.println(parsedText.replaceAll("[^A-Za-z0-9. ]+", ""));
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // Close on success as well as on failure so the document's storage is released.
        if (pdDoc != null) {
            try {
                pdDoc.close();
            } catch (Exception e1) {
                e1.printStackTrace();
            }
        }
    }
}

From source file:at.gv.egiz.pdfas.lib.impl.signing.pdfbox.PADESPDFBOXSigner.java

License:EUPL

/**
 * Signs the PDF held by {@code genericPdfObject} and stores the signed bytes on that object
 * via {@code setSignedDocument}.
 *
 * <p>The method performs the following steps in order:
 * <ol>
 * <li>copies the original document into a temporary file,</li>
 * <li>evaluates an optional signature placeholder (profile and table position),</li>
 * <li>builds the {@code PDSignature} (filter, sub filter, name, date, reason, reserved size),</li>
 * <li>for visual signatures: determines the position, optionally appends a page, builds the
 * visual signature, replaces the placeholder image and adds a PDF/A output intent,</li>
 * <li>adds the signature to the document and names the resulting signature field,</li>
 * <li>adds PDF/UA structure information for the signature field,</li>
 * <li>applies the filter and saves the document incrementally into the temporary file,</li>
 * <li>reads the temporary file back as the signed document.</li>
 * </ol>
 * The document is closed and the temporary file deleted in every case.
 *
 * @param genericPdfObject   the document to sign; must be a {@code PDFBOXObject}
 * @param requestedSignature the signature request; its profile ID, signing date and signature
 *                           position are updated while signing
 * @param genericSigner      the signer; must be a {@code PDFASPDFBOXSignatureInterface}
 * @throws PdfAsException if one of the arguments has the wrong implementation type, if an
 *                        {@code IOException}, {@code SignatureException} or
 *                        {@code COSVisitorException} occurs ("error.pdf.sig.01"), if the output
 *                        intent cannot be added, or if PDF/UA is required by the profile and the
 *                        structure information cannot be created ("error.pdf.sig.pdfua.1")
 */
public void signPDF(PDFObject genericPdfObject, RequestedSignature requestedSignature,
        PDFASSignatureInterface genericSigner) throws PdfAsException {
    String fisTmpFile = null;

    PDFAsVisualSignatureProperties properties = null;

    if (!(genericPdfObject instanceof PDFBOXObject)) {
        // TODO: provide a meaningful error
        throw new PdfAsException();
    }

    PDFBOXObject pdfObject = (PDFBOXObject) genericPdfObject;

    if (!(genericSigner instanceof PDFASPDFBOXSignatureInterface)) {
        // TODO: provide a meaningful error
        throw new PdfAsException();
    }

    PDFASPDFBOXSignatureInterface signer = (PDFASPDFBOXSignatureInterface) genericSigner;

    TempFileHelper helper = pdfObject.getStatus().getTempFileHelper();
    PDDocument doc = pdfObject.getDocument();
    SignatureOptions options = new SignatureOptions();
    // NOTE(review): never read afterwards; presumably kept to hold a strong reference to the
    // visual signature document while it is in use - confirm before removing.
    COSDocument visualSignatureDocumentGuard = null;
    try {
        fisTmpFile = helper.getStaticFilename();

        FileOutputStream tmpOutputStream = null;
        try {
            // write the original document to the temporary file
            tmpOutputStream = new FileOutputStream(new File(fisTmpFile));
            InputStream tmpis = null;
            try {
                tmpis = pdfObject.getOriginalDocument().getInputStream();
                IOUtils.copy(tmpis, tmpOutputStream);
                tmpis.close();
            } finally {
                IOUtils.closeQuietly(tmpis);
            }

            SignaturePlaceholderData signaturePlaceholderData = PlaceholderFilter
                    .checkPlaceholderSignature(pdfObject.getStatus(), pdfObject.getStatus().getSettings());

            TablePos tablePos = null;

            if (signaturePlaceholderData != null) {
                // Placeholder found!
                logger.info("Placeholder data found.");
                if (signaturePlaceholderData.getProfile() != null) {
                    logger.debug("Placeholder Profile set to: " + signaturePlaceholderData.getProfile());
                    requestedSignature.setSignatureProfileID(signaturePlaceholderData.getProfile());
                }

                tablePos = signaturePlaceholderData.getTablePos();
                if (tablePos != null) {

                    SignatureProfileConfiguration signatureProfileConfiguration = pdfObject.getStatus()
                            .getSignatureProfileConfiguration(requestedSignature.getSignatureProfileID());

                    float minWidth = signatureProfileConfiguration.getMinWidth();

                    // widen the placeholder to the configured minimum width
                    if (minWidth > 0) {
                        if (tablePos.getWidth() < minWidth) {
                            tablePos.width = minWidth;
                            logger.debug("Correcting placeholder with to minimum width {}", minWidth);
                        }
                    }
                    logger.debug("Placeholder Position set to: " + tablePos.toString());
                }
            }

            PDSignature signature = new PDSignature();
            // default filter
            signature.setFilter(COSName.getPDFName(signer.getPDFFilter()));
            signature.setSubFilter(COSName.getPDFName(signer.getPDFSubFilter()));

            SignatureProfileSettings signatureProfileSettings = TableFactory.createProfile(
                    requestedSignature.getSignatureProfileID(), pdfObject.getStatus().getSettings());

            ValueResolver resolver = new ValueResolver(requestedSignature, pdfObject.getStatus());
            String signerName = resolver.resolve("SIG_SUBJECT",
                    signatureProfileSettings.getValue("SIG_SUBJECT"), signatureProfileSettings);

            signature.setName(signerName);

            // take signing time from provided signer...
            signature.setSignDate(signer.getSigningDate());
            // ...and update operation status in order to use exactly this date for the complete signing process
            requestedSignature.getStatus().setSigningDate(signer.getSigningDate());

            String signerReason = signatureProfileSettings.getSigningReason();

            if (signerReason == null) {
                signerReason = "PAdES Signature";
            }

            signature.setReason(signerReason);
            logger.debug("Signing reason: " + signerReason);

            logger.debug("Signing @ " + signer.getSigningDate().getTime().toString());

            signer.setPDSignature(signature);

            // reserved size for the signature; falls back to 0x1000 if the configured value is
            // missing or not a number
            int signatureSize = 0x1000;
            try {
                String reservedSignatureSizeString = signatureProfileSettings.getValue(SIG_RESERVED_SIZE);
                if (reservedSignatureSizeString != null) {
                    signatureSize = Integer.parseInt(reservedSignatureSizeString);
                }
                logger.debug("Reserving {} bytes for signature", signatureSize);
            } catch (NumberFormatException e) {
                logger.warn("Invalid configuration value: {} should be a number using 0x1000",
                        SIG_RESERVED_SIZE);
            }
            options.setPreferedSignatureSize(signatureSize);

            // Is visible Signature
            if (requestedSignature.isVisual()) {
                logger.debug("Creating visual signature block");

                SignatureProfileConfiguration signatureProfileConfiguration = pdfObject.getStatus()
                        .getSignatureProfileConfiguration(requestedSignature.getSignatureProfileID());

                if (tablePos == null) {
                    // ================================================================
                    // PositioningStage (visual) -> find position or use
                    // fixed
                    // position

                    String posString = pdfObject.getStatus().getSignParamter().getSignaturePosition();

                    TablePos signaturePos = null;

                    String signaturePosString = signatureProfileConfiguration.getDefaultPositioning();

                    if (signaturePosString != null) {
                        // log the configured string; signaturePos is still null at this point
                        logger.debug("using signature Positioning: " + signaturePosString);
                        signaturePos = new TablePos(signaturePosString);
                    }

                    logger.debug("using Positioning: " + posString);

                    if (posString != null) {
                        // Merge Signature Position
                        tablePos = new TablePos(posString, signaturePos);
                    } else {
                        // Fallback to signature Position!
                        tablePos = signaturePos;
                    }

                    if (tablePos == null) {
                        // Last Fallback default position
                        tablePos = new TablePos();
                    }
                }
                boolean legacy32Position = signatureProfileConfiguration.getLegacy32Positioning();
                boolean legacy40Position = signatureProfileConfiguration.getLegacy40Positioning();

                // create Table description
                Table main = TableFactory.createSigTable(signatureProfileSettings, MAIN, pdfObject.getStatus(),
                        requestedSignature);

                IPDFStamper stamper = StamperFactory.createDefaultStamper(pdfObject.getStatus().getSettings());

                IPDFVisualObject visualObject = stamper.createVisualPDFObject(pdfObject, main);

                PositioningInstruction positioningInstruction = Positioning.determineTablePositioning(tablePos,
                        "", doc, visualObject, legacy32Position, legacy40Position);

                logger.debug("Positioning: {}", positioningInstruction.toString());

                if (positioningInstruction.isMakeNewPage()) {
                    // append a page that copies media box and rotation of the current last page
                    int last = doc.getNumberOfPages() - 1;
                    PDDocumentCatalog root = doc.getDocumentCatalog();
                    PDPageNode rootPages = root.getPages();
                    List<PDPage> kids = new ArrayList<PDPage>();
                    rootPages.getAllKids(kids);
                    PDPage lastPage = kids.get(last);
                    rootPages.getCOSObject().setNeedToBeUpdate(true);
                    PDPage p = new PDPage(lastPage.findMediaBox());
                    p.setResources(new PDResources());
                    p.setRotation(lastPage.findRotation());
                    doc.addPage(p);
                }

                // handle rotated page
                PDDocumentCatalog documentCatalog = doc.getDocumentCatalog();
                PDPageNode documentPages = documentCatalog.getPages();
                List<PDPage> documentPagesKids = new ArrayList<PDPage>();
                documentPages.getAllKids(documentPagesKids);
                int targetPageNumber = positioningInstruction.getPage();
                logger.debug("Target Page: " + targetPageNumber);
                // page numbers from the positioning instruction are 1-based
                PDPage targetPage = documentPagesKids.get(targetPageNumber - 1);
                int rot = targetPage.findRotation();
                logger.debug("Page rotation: " + rot);
                // positioningInstruction.setRotation(positioningInstruction.getRotation()
                // + rot);
                logger.debug("resulting Sign rotation: " + positioningInstruction.getRotation());

                SignaturePositionImpl position = new SignaturePositionImpl();
                position.setX(positioningInstruction.getX());
                position.setY(positioningInstruction.getY());
                position.setPage(positioningInstruction.getPage());
                position.setHeight(visualObject.getHeight());
                position.setWidth(visualObject.getWidth());

                requestedSignature.setSignaturePosition(position);

                properties = new PDFAsVisualSignatureProperties(pdfObject.getStatus().getSettings(), pdfObject,
                        (PdfBoxVisualObject) visualObject, positioningInstruction, signatureProfileSettings);

                properties.buildSignature();

                if (signaturePlaceholderData != null) {
                    // Placeholder found!
                    // replace placeholder
                    InputStream is = null;
                    try {
                        is = PADESPDFBOXSigner.class.getResourceAsStream("/placeholder/empty.jpg");
                        PDJpeg img = new PDJpeg(doc, is);

                        img.getCOSObject().setNeedToBeUpdate(true);

                        PDDocumentCatalog root = doc.getDocumentCatalog();
                        PDPageNode rootPages = root.getPages();
                        List<PDPage> kids = new ArrayList<PDPage>();
                        rootPages.getAllKids(kids);
                        int pageNumber = positioningInstruction.getPage();
                        PDPage page = kids.get(pageNumber - 1);

                        logger.info("Placeholder name: " + signaturePlaceholderData.getPlaceholderName());
                        COSDictionary xobjectsDictionary = (COSDictionary) page.findResources()
                                .getCOSDictionary().getDictionaryObject(COSName.XOBJECT);
                        xobjectsDictionary.setItem(signaturePlaceholderData.getPlaceholderName(), img);
                        xobjectsDictionary.setNeedToBeUpdate(true);
                        page.findResources().getCOSObject().setNeedToBeUpdate(true);
                        logger.info("Placeholder name: " + signaturePlaceholderData.getPlaceholderName());
                    } finally {
                        IOUtils.closeQuietly(is);
                    }
                }

                if (signatureProfileSettings.isPDFA()) {
                    PDDocumentCatalog root = doc.getDocumentCatalog();
                    COSBase base = root.getCOSDictionary().getItem(COSName.OUTPUT_INTENTS);
                    // only add an output intent if the document does not have one yet
                    if (base == null) {
                        InputStream colorProfile = null;
                        try {
                            colorProfile = PDDocumentCatalog.class
                                    .getResourceAsStream("/icm/sRGB Color Space Profile.icm");

                            try {
                                PDOutputIntent oi = new PDOutputIntent(doc, colorProfile);
                                oi.setInfo("sRGB IEC61966-2.1");
                                oi.setOutputCondition("sRGB IEC61966-2.1");
                                oi.setOutputConditionIdentifier("sRGB IEC61966-2.1");
                                oi.setRegistryName("http://www.color.org");

                                root.addOutputIntent(oi);
                                root.getCOSObject().setNeedToBeUpdate(true);
                                logger.info("added Output Intent");
                            } catch (Throwable e) {
                                throw new PdfAsException("Failed to add Output Intent", e);
                            }
                        } finally {
                            IOUtils.closeQuietly(colorProfile);
                        }
                    }
                }

                options.setPage(positioningInstruction.getPage());

                options.setVisualSignature(properties.getVisibleSignature());
            }

            visualSignatureDocumentGuard = options.getVisualSignature();

            doc.addSignature(signature, signer, options);

            // set need to update indirect fields array in acro form
            COSDictionary trailer = doc.getDocument().getTrailer();
            if (trailer != null) {
                COSDictionary troot = (COSDictionary) trailer.getDictionaryObject(COSName.ROOT);
                if (troot != null) {
                    COSDictionary acroForm = (COSDictionary) troot.getDictionaryObject(COSName.ACRO_FORM);
                    if (acroForm != null) {
                        COSArray tfields = (COSArray) acroForm.getDictionaryObject(COSName.FIELDS);
                        if (tfields != null && !tfields.isDirect()) {
                            tfields.setNeedToBeUpdate(true);
                        }
                    }
                }
            }

            String sigFieldName = signatureProfileSettings.getSignFieldValue();

            if (sigFieldName == null) {
                sigFieldName = "PDF-AS Signatur";
            }

            // the number of signatures counted for this name is appended as suffix
            int count = PdfBoxUtils.countSignatures(doc, sigFieldName);

            sigFieldName = sigFieldName + count;

            PDAcroForm acroFormm = doc.getDocumentCatalog().getAcroForm();

            // locate the field that carries the signature dictionary just added
            PDSignatureField signatureField = null;
            if (acroFormm != null) {
                @SuppressWarnings("unchecked")
                List<PDField> fields = acroFormm.getFields();

                if (fields != null) {
                    for (PDField pdField : fields) {
                        if (pdField != null) {
                            if (pdField instanceof PDSignatureField) {
                                PDSignatureField tmpSigField = (PDSignatureField) pdField;

                                if (tmpSigField.getSignature() != null
                                        && tmpSigField.getSignature().getDictionary() != null) {
                                    if (tmpSigField.getSignature().getDictionary()
                                            .equals(signature.getDictionary())) {
                                        signatureField = (PDSignatureField) pdField;

                                    }
                                }
                            }
                        }
                    }
                } else {
                    logger.warn("Failed to name Signature Field! [Cannot find Field list in acroForm!]");
                }

                if (signatureField != null) {
                    signatureField.setPartialName(sigFieldName);
                    // The alternate name is set inside the null guard: previously it was set
                    // unconditionally and failed with a NullPointerException when no matching
                    // signature field had been found.
                    if (properties != null) {
                        signatureField.setAlternateFieldName(properties.getAlternativeTableCaption());
                    } else {
                        signatureField.setAlternateFieldName(sigFieldName);
                    }
                } else if (fields != null) {
                    logger.warn("Failed to name Signature Field! [Cannot find Signature Field in acroForm!]");
                }
            } else {
                logger.warn("Failed to name Signature Field! [Cannot find acroForm!]");
            }

            // PDF-UA
            // Any failure in this section (including a missing signature field or structure
            // element) ends up in the catch below and is fatal only for PDF/UA profiles.
            logger.info("Adding pdf/ua content.");
            try {
                PDDocumentCatalog root = doc.getDocumentCatalog();
                PDStructureTreeRoot structureTreeRoot = root.getStructureTreeRoot();
                if (structureTreeRoot != null) {
                    logger.info("Tree Root: {}", structureTreeRoot.toString());
                    List<Object> kids = structureTreeRoot.getKids();

                    if (kids == null) {
                        logger.info("No kid-elements in structure tree Root, maybe not PDF/UA document");
                    }

                    // the first structure element below the root becomes the parent of the
                    // signature block
                    PDStructureElement docElement = null;
                    for (Object k : kids) {
                        if (k instanceof PDStructureElement) {
                            docElement = (PDStructureElement) k;
                            break;

                        }
                    }

                    PDStructureElement sigBlock = new PDStructureElement("Form", docElement);

                    // create object dictionary and add as child element
                    COSDictionary objectDic = new COSDictionary();
                    objectDic.setName("Type", "OBJR");
                    objectDic.setItem("Pg", signatureField.getWidget().getPage());
                    objectDic.setItem("Obj", signatureField.getWidget());

                    List<Object> l = new ArrayList<Object>();
                    l.add(objectDic);
                    sigBlock.setKids(l);
                    sigBlock.setPage(signatureField.getWidget().getPage());

                    sigBlock.setTitle("Signature Table");
                    sigBlock.setParent(docElement);
                    docElement.appendKid(sigBlock);

                    // Create and add Attribute dictionary to mitigate PAC
                    // warning
                    COSDictionary sigBlockDic = (COSDictionary) sigBlock.getCOSObject();
                    COSDictionary sub = new COSDictionary();

                    sub.setName("O", "Layout");
                    sub.setName("Placement", "Block");
                    sigBlockDic.setItem(COSName.A, sub);
                    sigBlockDic.setNeedToBeUpdate(true);

                    // Modify number tree
                    PDNumberTreeNode ntn = structureTreeRoot.getParentTree();
                    int parentTreeNextKey = structureTreeRoot.getParentTreeNextKey();
                    if (ntn == null) {
                        ntn = new PDNumberTreeNode(objectDic, null);
                        logger.info("No number-tree-node found!");
                    }

                    COSArray ntnKids = (COSArray) ntn.getCOSDictionary().getDictionaryObject(COSName.KIDS);
                    COSArray ntnNumbers = (COSArray) ntn.getCOSDictionary().getDictionaryObject(COSName.NUMS);

                    if (ntnNumbers == null && ntnKids != null) {//no number array, so continue with the kids array

                        //create dictionary with limits and nums array
                        COSDictionary pTreeEntry = new COSDictionary();
                        COSArray limitsArray = new COSArray();
                        //limits for exact one entry
                        limitsArray.add(COSInteger.get(parentTreeNextKey));
                        limitsArray.add(COSInteger.get(parentTreeNextKey));

                        COSArray numsArray = new COSArray();
                        numsArray.add(COSInteger.get(parentTreeNextKey));
                        numsArray.add(sigBlock);

                        pTreeEntry.setItem(COSName.NUMS, numsArray);
                        pTreeEntry.setItem(COSName.LIMITS, limitsArray);

                        PDNumberTreeNode newKidsElement = new PDNumberTreeNode(pTreeEntry,
                                PDNumberTreeNode.class);

                        ntnKids.add(newKidsElement);
                        ntnKids.setNeedToBeUpdate(true);

                    } else if (ntnNumbers != null && ntnKids == null) {

                        // append the key/value pair at the end of the existing number array
                        int arrindex = ntnNumbers.size();

                        ntnNumbers.add(arrindex, COSInteger.get(parentTreeNextKey));
                        ntnNumbers.add(arrindex + 1, sigBlock.getCOSObject());

                        ntnNumbers.getCOSObject().setNeedToBeUpdate(true);

                        structureTreeRoot.setParentTree(ntn);

                    } else if (ntnNumbers == null && ntnKids == null) {
                        //document is not pdfua conform before signature creation
                        throw new PdfAsException("error.pdf.sig.pdfua.1");
                    } else {
                        //this is not allowed
                        throw new PdfAsException("error.pdf.sig.pdfua.1");
                    }

                    // set StructureParent for signature field annotation
                    signatureField.getWidget().setStructParent(parentTreeNextKey);

                    //Increase the next Key value in the structure tree root
                    structureTreeRoot.setParentTreeNextKey(parentTreeNextKey + 1);

                    // add the Tabs /S Element for Tabbing through annots
                    PDPage p = signatureField.getWidget().getPage();
                    p.getCOSDictionary().setName("Tabs", "S");
                    p.getCOSObject().setNeedToBeUpdate(true);

                    //check alternative signature field name
                    if (signatureField != null) {
                        if (signatureField.getAlternateFieldName().equals(""))
                            signatureField.setAlternateFieldName(sigFieldName);
                    }

                    ntn.getCOSDictionary().setNeedToBeUpdate(true);
                    sigBlock.getCOSObject().setNeedToBeUpdate(true);
                    structureTreeRoot.getCOSObject().setNeedToBeUpdate(true);
                    objectDic.getCOSObject().setNeedToBeUpdate(true);
                    docElement.getCOSObject().setNeedToBeUpdate(true);

                }

            } catch (Throwable e) {
                if (signatureProfileSettings.isPDFUA() == true) {
                    logger.error("Could not create PDF-UA conform document!");
                    throw new PdfAsException("error.pdf.sig.pdfua.1", e);
                } else {
                    logger.info("Could not create PDF-UA conform signature");
                }
            }

            try {
                applyFilter(doc, requestedSignature);
            } catch (PDFASError e) {
                throw new PdfAsErrorCarrier(e);
            }

            FileInputStream tmpFileIs = null;

            try {
                // the temporary file is read as input while the incremental update is written
                // to the output stream opened on the same file above
                tmpFileIs = new FileInputStream(new File(fisTmpFile));
                synchronized (doc) {
                    doc.saveIncremental(tmpFileIs, tmpOutputStream);
                }
                tmpFileIs.close();
            } finally {
                IOUtils.closeQuietly(tmpFileIs);
                if (options != null) {
                    if (options.getVisualSignature() != null) {
                        options.getVisualSignature().close();
                    }
                }
            }
            tmpOutputStream.flush();
            tmpOutputStream.close();
        } finally {
            IOUtils.closeQuietly(tmpOutputStream);
            // also release the visual signature document when an earlier step failed
            COSDocument visualSignature = options.getVisualSignature();
            if (visualSignature != null) {
                visualSignature.close();
            }
        }

        FileInputStream readReadyFile = null;
        try {
            readReadyFile = new FileInputStream(new File(fisTmpFile));

            // hand the signed bytes back to the caller via the pdf object
            pdfObject.setSignedDocument(StreamUtils.inputStreamToByteArray(readReadyFile));
            readReadyFile.close();
        } finally {
            IOUtils.closeQuietly(readReadyFile);
        }
    } catch (IOException e) {
        logger.info(MessageResolver.resolveMessage("error.pdf.sig.01") + ": {}", e.toString());
        throw new PdfAsException("error.pdf.sig.01", e);
    } catch (SignatureException e) {
        logger.info(MessageResolver.resolveMessage("error.pdf.sig.01") + ": {}", e.toString());
        throw new PdfAsException("error.pdf.sig.01", e);
    } catch (COSVisitorException e) {
        logger.info(MessageResolver.resolveMessage("error.pdf.sig.01") + ": {}", e.toString());
        throw new PdfAsException("error.pdf.sig.01", e);
    } finally {
        if (doc != null) {
            try {
                doc.close();
            } catch (IOException e) {
                logger.debug("Failed to close COS Doc!", e);
                // Ignore
            }
        }

        if (fisTmpFile != null) {
            helper.deleteFile(fisTmpFile);
        }
        logger.debug("Signature done!");

    }
}

From source file:com.cisco.iwe.services.util.EmailMonitor.java

/**
 * /*ww w . ja  v  a2  s  .co  m*/
 * @param fileDir
 * @return
 */
/* This method is used to scan the uploaded expense receipt in .pdf format and extract the text embedded in it. */
public String scanPDF(String fileDir) {
    PDFParser parser;
    String parsedText = null;
    PDFTextStripper pdfStripper = null;
    PDDocument pdDoc = null;
    COSDocument cosDoc = null;
    File file = new File(fileDir);
    if (!file.isFile()) {
        System.err.println("File " + fileDir + " does not exist.");
        return null;
    }
    try {
        parser = new PDFParser(new FileInputStream(file));
    } catch (IOException e) {
        System.err.println("Unable to open PDF Parser. " + e.getMessage());
        return null;
    }
    try {
        parser.parse();
        cosDoc = parser.getDocument();
        pdfStripper = new PDFTextStripper();
        pdDoc = new PDDocument(cosDoc);
        pdfStripper.setStartPage(1);
        pdfStripper.setEndPage(pdDoc.getNumberOfPages());
        parsedText = pdfStripper.getText(pdDoc);
    } catch (Exception e) {
        System.err.println("An exception occured in parsing the PDF Document." + e.getMessage());
    } finally {
        try {
            if (cosDoc != null)
                cosDoc.close();
            if (pdDoc != null)
                pdDoc.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
    return parsedText;
}

From source file:com.lanacion.adminsiteln.services.PdfIndexerService.PdfIndexerService.java

/**
 * Private helper for indexing: extracts the text of a single page of a PDF file.
 *
 * @param fileName path of the PDF file to read
 * @param pagina   number of the page to extract; used as both start and end page of the
 *                 text stripper
 * @return the text of that page, or {@code null} if {@code fileName} is not an existing
 *         regular file, the parser cannot be created, or parsing / text extraction fails
 */
private String pdftoText(String fileName, int pagina) {

    File file = new File(fileName);
    if (!file.isFile()) {
        System.err.println("File " + fileName + " does not exist.");
        return null;
    }

    FileInputStream input = null;
    PDFParser parser;
    try {
        input = new FileInputStream(file);
        parser = new PDFParser(input);
    } catch (IOException e) {
        System.err.println("Unable to open PDF Parser. " + e.getMessage());
        // do not leak the stream if it was opened but the parser could not be created
        if (input != null) {
            try {
                input.close();
            } catch (IOException ignored) {
                // nothing more can be done; the open failure is already reported
            }
        }
        return null;
    }

    String parsedText = null;
    PDDocument pdDoc = null;
    COSDocument cosDoc = null;
    try {
        parser.parse();
        cosDoc = parser.getDocument();
        PDFTextStripper pdfStripper = new PDFTextStripper();
        pdDoc = new PDDocument(cosDoc);
        pdfStripper.setStartPage(pagina);
        pdfStripper.setEndPage(pagina);
        parsedText = pdfStripper.getText(pdDoc);
    } catch (Exception e) {
        System.err.println("An exception occured in parsing the PDF Document." + e.getMessage());
    } finally {
        // Each resource is closed in its own try block so that a failure while closing one
        // of them does not prevent the others from being closed.
        try {
            if (cosDoc != null) {
                cosDoc.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        try {
            if (pdDoc != null) {
                pdDoc.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        try {
            input.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    return parsedText;

}

From source file:com.lanacion.adminsiteln.services.PdfIndexerService.PdfIndexerService.java

/**
 * Returns the number of pages of a PDF file.
 *
 * @param fileName path of the PDF file to inspect
 * @return the page count reported by the parsed document, or {@code 0} when the file does not
 *         exist, the parser cannot be opened, or parsing fails
 */
private int pdfgetPages(String fileName) {
    File file = new File(fileName);
    if (!file.isFile()) {
        System.err.println("File " + fileName + " does not exist.");
        return 0;
    }

    FileInputStream input = null;
    PDFParser parser;
    try {
        input = new FileInputStream(file);
        parser = new PDFParser(input);
    } catch (IOException e) {
        System.err.println("Unable to open PDF Parser. " + e.getMessage());
        // No parser exists to take care of the stream, so release it before bailing out.
        if (input != null) {
            try {
                input.close();
            } catch (IOException closeFailure) {
                closeFailure.printStackTrace();
            }
        }
        return 0;
    }

    int numero_paginas = 0;
    PDDocument pdDoc = null;
    COSDocument cosDoc = null;
    try {
        parser.parse();
        cosDoc = parser.getDocument();
        // No text stripper is created here: counting pages does not need one, and a failure
        // while constructing it would otherwise turn a valid page count into 0.
        pdDoc = new PDDocument(cosDoc);
        numero_paginas = pdDoc.getNumberOfPages();
    } catch (Exception e) {
        System.err.println("An exception occured in parsing the PDF Document." + e.getMessage());
    } finally {
        // Every close gets its own try so that one failure cannot skip the remaining ones.
        try {
            if (cosDoc != null) {
                cosDoc.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        try {
            if (pdDoc != null) {
                pdDoc.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        try {
            // Closing an already closed stream has no effect, so this is safe even if the
            // parser closed it on its own.
            input.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    return numero_paginas;
}

From source file:com.validation.manager.core.server.core.AttachmentServerTest.java

License:Apache License

/**
 * Test of addFile method, of class AttachmentServer.
 *
 * <p>Stores a two-line text file as an attachment, retrieves it and compares it line by line
 * with what was written. Then converts the retrieved file to PDF, stores and retrieves that
 * as well, and prints the text of its first page.
 */
@Test
public void testAddRetrieveTextFile() {
    try {
        System.out.println("add text File");
        File f = new File("target/Test.txt");
        f.deleteOnExit();
        List<String> lines = Arrays.asList("The first line", "The second line");
        Path file = Paths.get(f.getAbsolutePath());
        Files.write(file, lines, Charset.forName("UTF-8"));
        AttachmentServer instance = new AttachmentServer();
        instance.addFile(f, f.getName());
        instance.write2DB();
        //Delete the file
        FileUtils.delete(f.getAbsolutePath());
        assertEquals(1, (int) instance.getAttachmentType().getId());//Text file
        System.out.println("retrieveFile");
        AttachmentServer temp = new AttachmentServer(instance.getAttachmentPK());
        File loadedFile = temp.getAttachedFile("target/loaded/");
        int count = 0;
        // Read with the charset the file was written in, and release the reader before the
        // file is handed to the PDF conversion below.
        try (BufferedReader br = Files.newBufferedReader(loadedFile.toPath(), Charset.forName("UTF-8"))) {
            String line;
            while ((line = br.readLine()) != null) {
                assertEquals(lines.get(count), line);
                System.out.println(line);
                count++;
            }
        }
        assertEquals(lines.size(), count);
        //Create pdf file
        System.out.println("add pdf File");
        File pdf = Tool.convertToPDF(loadedFile, "target/Text.pdf");
        pdf.deleteOnExit();
        instance = new AttachmentServer();
        instance.addFile(pdf, pdf.getName());
        instance.write2DB();
        //Delete the file
        FileUtils.delete(pdf.getAbsolutePath());
        assertEquals(2, (int) instance.getAttachmentType().getId());//PDF file
        System.out.println("retrieveFile");
        temp = new AttachmentServer(instance.getAttachmentPK());
        loadedFile = temp.getAttachedFile("target/loaded/");
        PDDocument pdDoc = null;
        COSDocument cosDoc = null;
        RandomAccessBufferedFileInputStream source = null;
        try {
            source = new RandomAccessBufferedFileInputStream(loadedFile);
            PDFParser parser = new PDFParser(source);
            parser.parse();
            cosDoc = parser.getDocument();
            PDFTextStripper pdfStripper = new PDFTextStripper();
            pdDoc = new PDDocument(cosDoc);
            pdfStripper.setStartPage(1);
            pdfStripper.setEndPage(1);
            String parsedText = pdfStripper.getText(pdDoc);
            System.out.println(parsedText);
        } catch (IOException ex) {
            Exceptions.printStackTrace(ex);
            fail();
        } finally {
            // Nested finally blocks: a failing close must not prevent the later ones.
            try {
                if (cosDoc != null) {
                    cosDoc.close();
                }
            } finally {
                try {
                    if (pdDoc != null) {
                        pdDoc.close();
                    }
                } finally {
                    // The document was built from the COSDocument alone, so nothing else in
                    // this method releases the file handle held by the parser's source.
                    if (source != null) {
                        source.close();
                    }
                }
            }
        }
    } catch (IOException | VMException ex) {
        Exceptions.printStackTrace(ex);
        fail();
    }
}

From source file:cz.muni.pdfjbim.PdfImageExtractor.java

License:Apache License

/**
 * This method extracts images by going through all COSObjects pointed from xref table
 * @param is input stream containing PDF file
 * @param prefix output basename for images
 * @param password password for access to PDF if needed
 * @param pagesToProcess list of pages which should be processed if null given => processed all pages
 *      -- not working yet/* ww w  .  ja v  a  2  s.  c  o  m*/
 * @param binarize -- enables processing of nonbitonal images as well (LZW is still not
 *      processed because of output with inverted colors)
 * @throws PdfRecompressionException if problem to extract images from PDF
 */
public void extractImagesUsingPdfParser(InputStream is, String prefix, String password,
        Set<Integer> pagesToProcess, Boolean binarize) throws PdfRecompressionException {
    // checking arguments and setting appropriate variables
    if (binarize == null) {
        binarize = false;
    }

    log.debug("Extracting images (binarize set to {})", binarize);

    InputStream inputStream = null;
    if (password != null) {
        try (ByteArrayOutputStream decryptedOutputStream = new ByteArrayOutputStream()) {
            PdfReader reader = new PdfReader(is, password.getBytes(StandardCharsets.UTF_8));
            PdfStamper stamper = new PdfStamper(reader, decryptedOutputStream);
            if (stamper != null) {
                stamper.close();
            }
            inputStream = new ByteArrayInputStream(decryptedOutputStream.toByteArray());
        } catch (DocumentException ex) {
            throw new PdfRecompressionException(ex);
        } catch (IOException ex) {
            throw new PdfRecompressionException("Reading file caused exception", ex);
        }
    } else {
        inputStream = is;
    }

    PDFParser parser = null;
    COSDocument doc = null;
    try {
        parser = new PDFParser(inputStream);
        parser.parse();
        doc = parser.getDocument();

        List<COSObject> objs = doc.getObjectsByType(COSName.XOBJECT);
        if (objs != null) {
            for (COSObject obj : objs) {
                COSBase subtype = obj.getItem(COSName.SUBTYPE);
                if (subtype.toString().equalsIgnoreCase("COSName{Image}")) {
                    COSBase imageObj = obj.getObject();
                    COSBase cosNameObj = obj.getItem(COSName.NAME);
                    String key;
                    if (cosNameObj != null) {
                        String cosNameKey = cosNameObj.toString();
                        int startOfKey = cosNameKey.indexOf("{") + 1;
                        key = cosNameKey.substring(startOfKey, cosNameKey.length() - 1);
                    } else {
                        key = "im0";
                    }
                    int objectNum = obj.getObjectNumber().intValue();
                    int genNum = obj.getGenerationNumber().intValue();
                    PDXObjectImage image = (PDXObjectImage) PDXObjectImage.createXObject(imageObj);

                    PDStream pdStr = new PDStream(image.getCOSStream());
                    List<COSName> filters = pdStr.getFilters();

                    log.debug("Detected image with color depth: {} bits", image.getBitsPerComponent());
                    if (filters == null) {
                        continue;
                    }
                    log.debug("Detected filters: {}", filters.toString());

                    if ((image.getBitsPerComponent() > 1) && (!binarize)) {
                        log.info("It is not a bitonal image => skipping");
                        continue;
                    }

                    // at this moment for preventing bad output (bad coloring) from LZWDecode filter
                    if (filters.contains(COSName.LZW_DECODE)) {
                        log.info("This is LZWDecoded => skipping");
                        continue;
                    }

                    if (filters.contains(COSName.FLATE_DECODE)) {
                        log.debug("FlateDecoded image detected");
                    }

                    if (filters.contains(COSName.JBIG2_DECODE)) {
                        if (skipJBig2Images) {
                            log.warn("Allready compressed according to JBIG2 standard => skipping");
                            continue;
                        } else {
                            log.debug("JBIG2 image detected");
                        }
                    }

                    // detection of unsupported filters by pdfBox library
                    if (filters.contains(COSName.JPX_DECODE)) {
                        log.warn("Unsupported filter JPXDecode => skipping");
                        continue;
                    }

                    String name = getUniqueFileName(prefix, image.getSuffix());
                    log.info("Writing image: {}", name);
                    image.write2file(name);

                    PdfImageInformation pdfImageInfo = new PdfImageInformation(key, image.getWidth(),
                            image.getHeight(), objectNum, genNum);
                    originalImageInformations.add(pdfImageInfo);

                    namesOfImages.add(name + "." + image.getSuffix());

                }
            }
        }
    } catch (IOException ex) {
        Tools.deleteFilesFromList(namesOfImages);
        throw new PdfRecompressionException("Unable to parse PDF document", ex);
    } catch (Exception ex) {
        Tools.deleteFilesFromList(namesOfImages);
    } finally {
        if (doc != null) {
            try {
                doc.close();
            } catch (IOException ex) {
                throw new PdfRecompressionException(ex);
            }
        }
    }
}

From source file:cz.muni.pdfjbim.PdfImageProcessor.java

License:Apache License

/**
 * This method extracts images by going through all COSObjects pointed from xref table
 * @param is input stream containing PDF file
 * @param password password for access to PDF if needed
 * @param pagesToProcess list of pages which should be processed if null given => processed all pages
 *      -- not working yet/* ww  w.j  a  va2 s . c  o  m*/
 * @param binarize -- enables processing of nonbitonal images as well (LZW is still not
 *      processed because of output with inverted colors)
 * @throws PdfRecompressionException if problem to extract images from PDF
 */
public void extractImagesUsingPdfParser(InputStream is, String prefix, String password,
        Set<Integer> pagesToProcess, Boolean binarize) throws PdfRecompressionException {
    // checking arguments and setting appropriate variables
    if (binarize == null) {
        binarize = false;
    }

    InputStream inputStream = null;
    if (password != null) {
        try {
            ByteArrayOutputStream decryptedOutputStream = null;
            PdfReader reader = new PdfReader(is, password.getBytes());
            PdfStamper stamper = new PdfStamper(reader, decryptedOutputStream);
            stamper.close();
            inputStream = new ByteArrayInputStream(decryptedOutputStream.toByteArray());
        } catch (DocumentException ex) {
            throw new PdfRecompressionException(ex);
        } catch (IOException ex) {
            throw new PdfRecompressionException("Reading file caused exception", ex);
        }
    } else {
        inputStream = is;
    }

    PDFParser parser = null;
    COSDocument doc = null;
    try {
        parser = new PDFParser(inputStream);
        parser.parse();
        doc = parser.getDocument();

        List<COSObject> objs = doc.getObjectsByType(COSName.XOBJECT);
        if (objs != null) {
            for (COSObject obj : objs) {
                COSBase subtype = obj.getItem(COSName.SUBTYPE);
                if (subtype.toString().equalsIgnoreCase("COSName{Image}")) {
                    COSBase imageObj = obj.getObject();
                    COSBase cosNameObj = obj.getItem(COSName.NAME);
                    String key;
                    if (cosNameObj != null) {
                        String cosNameKey = cosNameObj.toString();
                        int startOfKey = cosNameKey.indexOf("{") + 1;
                        key = cosNameKey.substring(startOfKey, cosNameKey.length() - 1);
                    } else {
                        key = "im0";
                    }
                    int objectNum = obj.getObjectNumber().intValue();
                    int genNum = obj.getGenerationNumber().intValue();
                    PDXObjectImage image = (PDXObjectImage) PDXObjectImage.createXObject(imageObj);

                    PDStream pdStr = new PDStream(image.getCOSStream());
                    List filters = pdStr.getFilters();

                    if ((image.getBitsPerComponent() > 1) && (!binarize)) {
                        log.info("It is not a bitonal image => skipping");

                        continue;
                    }

                    // at this moment for preventing bad output (bad coloring) from LZWDecode filter
                    if (filters.contains(COSName.LZW_DECODE.getName())) {
                        log.info("This is LZWDecoded => skipping");
                        continue;

                    }

                    // detection of unsupported filters by pdfBox library
                    if (filters.contains("JBIG2Decode")) {
                        log.warn("Allready compressed according to JBIG2 standard => skipping");
                        continue;
                    }

                    if (filters.contains("JPXDecode")) {
                        log.warn("Unsupported filter JPXDecode => skipping");
                        continue;
                    }

                    String name = getUniqueFileName(prefix, image.getSuffix());
                    log.info("Writing image:" + name);
                    image.write2file(name);

                    PdfImageInformation pdfImageInfo = new PdfImageInformation(key, image.getWidth(),
                            image.getHeight(), objectNum, genNum);
                    originalImageInformations.add(pdfImageInfo);

                    namesOfImages.add(name + "." + image.getSuffix());

                }
                //                    }
            }
        }
    } catch (IOException ex) {
        throw new PdfRecompressionException("Unable to parse PDF document", ex);
    } finally {
        if (doc != null) {
            try {
                doc.close();
            } catch (IOException ex) {
                throw new PdfRecompressionException(ex);
            }
        }
    }
}

From source file:cz.mzk.editor.server.handler.GetOcrFromPdfHandler.java

License:Open Source License

/**
 * Extracts the text of a PDF file.
 *
 * @param fileName path of the PDF file to read
 * @return the text extracted from the document
 * @throws ActionException if the file does not exist, the parser cannot be opened, or parsing
 *         or text extraction fails
 */
private String pdftoText(String fileName) throws ActionException {

    File pdfFile = new File(fileName);

    if (!pdfFile.isFile()) {
        LOGGER.error("The file: " + fileName + " does not exist.");
        throw new ActionException("Unable to parse the pdf file.");
    }

    RandomAccessBufferedFileInputStream source = null;
    PDFParser parser;
    try {
        // Built from the file itself instead of wrapping a FileInputStream around it.
        source = new RandomAccessBufferedFileInputStream(pdfFile);
        parser = new PDFParser(source);
    } catch (Exception e) {
        LOGGER.error("Unable to open PDF Parser.: " + e, e);
        // No parser exists to take care of the source, so release it before bailing out.
        if (source != null) {
            try {
                source.close();
            } catch (IOException closeFailure) {
                LOGGER.error("Unable to close the PDF source.", closeFailure);
            }
        }
        throw new ActionException("Unable to parse the pdf file.");
    }

    COSDocument cosDoc = null;
    PDDocument pdDoc = null;
    String parsedText;
    try {
        parser.parse();
        cosDoc = parser.getDocument();
        PDFTextStripper pdfStripper = new PDFTextStripper();
        pdDoc = new PDDocument(cosDoc);
        parsedText = pdfStripper.getText(pdDoc);
    } catch (Exception e) {
        LOGGER.error("An exception occured in parsing the PDF Document.", e);
        throw new ActionException("Unable to parse the pdf file. " + e);
    } finally {
        // Every close gets its own try so that one failure cannot skip the remaining ones.
        try {
            if (cosDoc != null) {
                cosDoc.close();
            }
        } catch (Exception e) {
            LOGGER.error("Unable to close the COS document.", e);
        }
        try {
            if (pdDoc != null) {
                pdDoc.close();
            }
        } catch (Exception e) {
            LOGGER.error("Unable to close the PDF document.", e);
        }
        try {
            // The document was built from the COSDocument alone, so nothing else in this
            // method releases the file handle held by the parser's source.
            source.close();
        } catch (Exception e) {
            LOGGER.error("Unable to close the PDF source.", e);
        }
    }

    return parsedText;
}

From source file:de.hsmannheim.ss15.alr.searchengine.PDFParser.java

/**
 * Extracts the text of a PDF document held in memory.
 *
 * @param in the raw bytes of the PDF document
 * @return the text the stripper produces for the parsed document
 * @throws Exception if parsing, text extraction, or closing one of the documents fails
 */
public String getTextOfPDF(byte[] in) throws Exception {

    ByteArrayInputStream input = new ByteArrayInputStream(in);

    // The parser type is spelled out in full, as in the original declaration.
    org.apache.pdfbox.pdfparser.PDFParser parser = new NonSequentialPDFParser(input);

    PDDocument pdDoc = null;
    COSDocument cosDoc = null;

    //parse PDF
    try {
        parser.parse();
        cosDoc = parser.getDocument();
        PDFTextStripper pdfStripper = new PDFTextStripper();
        pdDoc = new PDDocument(cosDoc);

        return pdfStripper.getText(pdDoc);
    } finally {
        // Nested finally: a failure while closing the COS document must not prevent the
        // PDDocument from being closed as well.
        try {
            if (cosDoc != null) {
                cosDoc.close();
            }
        } finally {
            if (pdDoc != null) {
                pdDoc.close();
            }
        }
    }
}