Example usage for org.apache.pdfbox.io RandomAccessFile RandomAccessFile

List of usage examples for org.apache.pdfbox.io RandomAccessFile RandomAccessFile

Introduction

On this page you can find example usages of the org.apache.pdfbox.io.RandomAccessFile constructor RandomAccessFile(File, String).

Prototype

public RandomAccessFile(File file, String mode) throws FileNotFoundException 

Source Link

Document

Constructor.

Usage

From source file:org.apache.tika.parser.pdf.PDFParser.java

License:Apache License

/**
 * Loads a PDF from the given stream with PDFBox, records PDF metadata and,
 * when a handler is supplied, emits the document content to it.
 *
 * <p>The parser configuration is read from {@code context}, falling back to
 * {@code defaultConfig}. If the stream is a {@code TikaInputStream} backed by
 * a file, PDFBox is given a temporary scratch file; otherwise parsing happens
 * in memory. A bad password surfaces as an {@code EncryptedDocumentException}
 * regardless of which PDFBox parser was used.
 *
 * @param stream   the PDF input; it is wrapped in a {@code CloseShieldInputStream}
 *                 before being handed to PDFBox
 * @param handler  receiver of the extracted content; when {@code null} only
 *                 metadata is extracted
 * @param metadata receives {@code pdf:encrypted}, the content type and whatever
 *                 {@code extractMetadata} adds; also passed to the password
 *                 lookup and the access checker
 * @param context  source of the {@code PDFParserConfig} and of the password
 * @throws IOException   if PDFBox fails to read the document for a reason
 *                       other than a bad password
 * @throws SAXException  declared for the content-handling calls
 * @throws TikaException declared for the Tika-level calls made here
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {

    PDDocument pdfDocument = null;
    TemporaryResources tmp = new TemporaryResources();
    // Config from context, or the default if not set via context.
    PDFParserConfig localConfig = context.get(PDFParserConfig.class, defaultConfig);
    try {
        // PDFBox can process entirely in memory, or can use a temp file
        // for unpacked / processed resources. Decide which to do based on
        // whether we are already reading from a file.
        TikaInputStream tstream = TikaInputStream.cast(stream);
        String password = getPassword(metadata, context);
        boolean useNonSequentialParser = localConfig.getUseNonSequentialParser();

        if (tstream != null && tstream.hasFile()) {
            // File based, take that as a cue to use a temporary file.
            RandomAccess scratchFile = new RandomAccessFile(tmp.createTemporaryFile(), "rw");
            if (useNonSequentialParser) {
                pdfDocument = PDDocument.loadNonSeq(new CloseShieldInputStream(stream), scratchFile, password);
            } else {
                pdfDocument = PDDocument.load(new CloseShieldInputStream(stream), scratchFile, true);
            }
        } else {
            // Go for the normal, stream based in-memory parsing.
            if (useNonSequentialParser) {
                pdfDocument = PDDocument.loadNonSeq(new CloseShieldInputStream(stream),
                        new RandomAccessBuffer(), password);
            } else {
                pdfDocument = PDDocument.load(new CloseShieldInputStream(stream), true);
            }
        }
        metadata.set("pdf:encrypted", Boolean.toString(pdfDocument.isEncrypted()));

        // If using the classic parser and the doc is encrypted, we must manually decrypt.
        if (!useNonSequentialParser && pdfDocument.isEncrypted()) {
            pdfDocument.decrypt(password);
        }

        metadata.set(Metadata.CONTENT_TYPE, "application/pdf");
        extractMetadata(pdfDocument, metadata);

        AccessChecker checker = localConfig.getAccessChecker();
        checker.check(metadata);
        if (handler != null) {
            if (shouldHandleXFAOnly(pdfDocument, localConfig)) {
                handleXFAOnly(pdfDocument, handler, metadata);
            } else {
                PDF2XHTML.process(pdfDocument, handler, context, metadata, localConfig);
            }
        }

    } catch (CryptographyException e) {
        // The sequential parser throws CryptographyException for a bad password.
        throw new EncryptedDocumentException(e);
    } catch (IOException e) {
        // The non-sequential parser throws IOException for a bad password.
        // At the Tika level, we want the same exception to be thrown.
        String message = e.getMessage();
        if (message != null && message.contains("Error (CryptographyException)")) {
            metadata.set("pdf:encrypted", Boolean.toString(true));
            throw new EncryptedDocumentException(e);
        }
        // Rethrow any other IOExceptions.
        throw e;
    } finally {
        // Nested so that a failure while closing the document cannot skip
        // temp-file disposal or the font cache reset.
        try {
            if (pdfDocument != null) {
                pdfDocument.close();
            }
        } finally {
            try {
                tmp.dispose();
            } finally {
                //TODO: once we migrate to PDFBox 2.0, remove this (PDFBOX-2200)
                PDFont.clearResources();
            }
        }
    }
}

From source file:org.wangwei.pdf.AddImageToPDF.java

License:Apache License

/**
 * Add an image to the first page of an existing PDF document and save the
 * result to a new file.
 *
 * <p>The image type is chosen from the file extension (case-insensitively):
 * {@code .jpg} is embedded as a JPEG, {@code .tif}/{@code .tiff} as CCITT,
 * and anything else is decoded through {@code ImageIO}.
 *
 * @param inputFile The input PDF to add the image to.
 * @param image The filename of the image to put in the PDF.
 * @param outputFile The file to write the PDF to.
 * @throws IOException If there is an error reading the inputs or writing the
 *         data, or if {@code ImageIO} cannot decode the image.
 * @throws COSVisitorException If there is an error writing the PDF.
 */
public void createPDFFromImage(String inputFile, String image, String outputFile)
        throws IOException, COSVisitorException {
    PDDocument doc = null;
    // Image sources are tracked here so they can be released in the finally block.
    FileInputStream jpegStream = null;
    RandomAccessFile tiffFile = null;
    try {
        doc = PDDocument.load(inputFile);

        // We will add the image to the first page.
        PDPage page = (PDPage) doc.getDocumentCatalog().getAllPages().get(0);

        // Locale.ROOT keeps the extension match independent of the default
        // locale (e.g. Turkish lower-casing of 'I').
        String lowerCaseName = image.toLowerCase(java.util.Locale.ROOT);

        PDXObjectImage ximage;
        if (lowerCaseName.endsWith(".jpg")) {
            jpegStream = new FileInputStream(image);
            ximage = new PDJpeg(doc, jpegStream);
        } else if (lowerCaseName.endsWith(".tif") || lowerCaseName.endsWith(".tiff")) {
            tiffFile = new RandomAccessFile(new File(image), "r");
            ximage = new PDCcitt(doc, tiffFile);
        } else {
            BufferedImage awtImage = ImageIO.read(new File(image));
            if (awtImage == null) {
                // ImageIO.read returns null when no registered reader understands the file.
                throw new IOException("Unsupported or unreadable image file: " + image);
            }
            ximage = new PDPixelMap(doc, awtImage);
        }

        PDPageContentStream contentStream = new PDPageContentStream(doc, page, true, true);
        try {
            // contentStream.drawImage(ximage, 20, 20 );
            // better method inspired by http://stackoverflow.com/a/22318681/535646
            float scale = 1f; // reduce this value if the image is too large
            contentStream.drawXObject(ximage, 20, 20, ximage.getWidth() * scale, ximage.getHeight() * scale);
        } finally {
            contentStream.close();
        }
        doc.save(outputFile);
    } finally {
        // Release everything even if an earlier close fails.
        try {
            if (doc != null) {
                doc.close();
            }
        } finally {
            try {
                if (jpegStream != null) {
                    jpegStream.close();
                }
            } finally {
                if (tiffFile != null) {
                    tiffFile.close();
                }
            }
        }
    }
}

From source file:pdf.to.info.PDF.java

/**
 * Creating a PDDocument object/*ww  w  . j ava 2s.com*/
 *
 * @param filePath
 * @return
 * @throws java.io.IOException
 */
private PDDocument ReadPDDoc(String filePath) throws IOException {
    File file = new File(filePath);
    PDFParser parser = new PDFParser(new RandomAccessFile(file, "r")); // update for PDFBox V 2.0
    parser.parse();
    COSDocument cosDoc = parser.getDocument();
    PDFTextStripper pdfStripper = new PDFTextStripper();
    PDDocument pdDoc = new PDDocument(cosDoc);
    pdDoc.getNumberOfPages();
    pdfStripper.setStartPage(1);
    pdfStripper.setEndPage(1);
    // for reading all pages of pdf file
    // pdfStripper.setEndPage(pdDoc.getNumberOfPages());
    return pdDoc;
}

From source file:uk.bl.dpt.qa.flint.wrappers.PDFBoxWrapper.java

License:Apache License

/**
 * Check if a PDF file has DRM or not/* ww w .j ava2s.c om*/
 * @param pFile file to check
 * @return whether the file is had DRM or not
 */
public boolean hasDRM(File pFile) {
    boolean ret = false;

    File tmp = null;
    try {
        System.setProperty("org.apache.pdfbox.baseParser.pushBackSize", "1024768");
        // NOTE: we use loadNonSeq here as it is the latest parser
        // load() and parser.parse() have hung on test files
        tmp = File.createTempFile("flint-", ".tmp");
        tmp.deleteOnExit();
        RandomAccess scratchFile = new RandomAccessFile(tmp, "rw");
        PDDocument doc = PDDocument.loadNonSeq(new FileInputStream(pFile), scratchFile);
        ret = doc.isEncrypted();
        doc.close();

    } catch (IOException e) {

        // This may occur when a suitable security handler cannot be found
        if (e.getMessage().contains("BadSecurityHandlerException")) {
            // if this happens then there must be some sort of DRM here
            ret = true;
        }

    } catch (Exception e) {

        e.printStackTrace();

        // See comments in https://issues.apache.org/jira/browse/PDFBOX-1757
        // PDFBox state that these files have errors and their parser is correct
        // The only way to find out that the parser doesn't like it is to catch
        // a general Exception.

        // If we reach this point then we have no idea of whether the file contains
        // DRM or not.  Return false and hope it is detected elsewhere.

        ret = false;
    } finally {
        if (tmp != null)
            tmp.delete();
    }
    return ret;
}

From source file:uk.bl.dpt.qa.flint.wrappers.PDFBoxWrapper.java

License:Apache License

/**
 * Check for encryption with Apache PDFBox
 * -> query the encryption dictionary (might allow more granular checks of protection).
 *
 * <p>The file is loaded with PDFBox's non-sequential parser using a temporary
 * scratch file. When the document exposes an encryption dictionary, the
 * permissions derived from it are printed to standard output and the file is
 * reported as having DRM. Any failure is logged and yields {@code false}.
 *
 * <p>Note: this sets the system property
 * {@code org.apache.pdfbox.baseParser.pushBackSize} as a side effect.
 *
 * @param pPDF pdf file to check
 * @return whether or not the file has DRM
 */
public boolean hasDRMGranular(File pPDF) {

    boolean ret = false;

    File tmp = null;
    RandomAccess scratchFile = null;
    FileInputStream input = null;
    PDDocument doc = null;
    try {
        System.setProperty("org.apache.pdfbox.baseParser.pushBackSize", "1024768");
        // NOTE: we use loadNonSeq here as it is the latest parser
        // load() and parser.parse() have hung on test files
        tmp = File.createTempFile("flint-", ".tmp");
        tmp.deleteOnExit();
        scratchFile = new RandomAccessFile(tmp, "rw");
        input = new FileInputStream(pPDF);
        doc = PDDocument.loadNonSeq(input, scratchFile);

        PDEncryptionDictionary dict = doc.getEncryptionDictionary();
        if (dict != null) {

            // An encryption dictionary is present, so report the file as
            // protected. Previously the result was never assigned and the
            // method always returned false.
            ret = true;

            //this feature in pdfbox is currently broken, see: https://issues.apache.org/jira/browse/PDFBOX-1651
            //AccessPermission perms = parser.getPDDocument().getCurrentAccessPermission();
            //this is a work around; creating a new object from the data
            AccessPermission perms = new AccessPermission(dict.getPermissions());

            boolean debug = true;

            if (debug) {

                System.out.println("canAssembleDocument()        : " + perms.canAssembleDocument());
                System.out.println("canExtractContent()          : " + perms.canExtractContent());
                System.out.println("canExtractForAccessibility() : " + perms.canExtractForAccessibility());
                System.out.println("canFillInForm()              : " + perms.canFillInForm());
                System.out.println("canModify()                  : " + perms.canModify());
                System.out.println("canModifyAnnotations()       : " + perms.canModifyAnnotations());
                System.out.println("canPrint()                   : " + perms.canPrint());
                System.out.println("canPrintDegraded()           : " + perms.canPrintDegraded());
                System.out.println("isOwnerPermission()          : " + perms.isOwnerPermission());
                System.out.println("isReadOnly()                 : " + perms.isReadOnly());

            }
        }

    } catch (Exception e) {
        // The throwable is passed as the trailing argument so the logger
        // records it with its stack trace; no placeholder is needed for it.
        LOGGER.warn("Exception while doing granular DRM checks leads to invalidity", e);
    } finally {
        // Best-effort cleanup: close every handle before deleting the scratch
        // file. Close failures are ignored because the result is already known.
        if (doc != null) {
            try {
                doc.close();
            } catch (IOException ignored) {
                // nothing useful can be done here
            }
        }
        if (input != null) {
            try {
                input.close();
            } catch (IOException ignored) {
                // nothing useful can be done here
            }
        }
        if (scratchFile != null) {
            try {
                scratchFile.close();
            } catch (IOException ignored) {
                // nothing useful can be done here
            }
        }
        if (tmp != null) {
            tmp.delete();
        }
    }

    return ret;
}