Example usage for org.apache.pdfbox.pdmodel PDDocument load

Introduction

This page collects example usages of org.apache.pdfbox.pdmodel.PDDocument.load taken from open-source projects.

Prototype

public static PDDocument load(byte[] input) throws IOException

Source Link

Document

Parses a PDF.

Usage

From source file:org.apache.fop.render.pdf.StructureTreeMergerTestCase.java

License:Apache License

/**
 * Runs {@link StructureTreeMerger#copyStructure} against the page parent tree of the first
 * page of the {@code MissingOBJR} test document.
 *
 * <p>All assertions are currently commented out, so the test only verifies that the merge
 * completes without throwing.
 *
 * @throws IOException if the test PDF cannot be loaded or closed
 */
@Test
public void testOBJRCorrectPosition() throws IOException {
    setUp();
    PDDocument doc = PDDocument.load(new File(getClass().getResource(MissingOBJR).getFile()));
    try {
        PDPage srcPage = doc.getPage(0);
        PageParentTreeFinder finder = new PageParentTreeFinder(srcPage);
        COSArray markedContentParents = finder.getPageParentTreeArray(doc);
        PDFStructElem elem = new PDFStructElem();
        elem.setObjectNumber(2);
        adapter = new PDFBoxAdapter(pdfPage, new HashMap(), new HashMap<Integer, PDFArray>());
        PDFLogicalStructureHandler handler = setUpPDFLogicalStructureHandler();
        StructureTreeMerger merger = new StructureTreeMerger(elem, handler, adapter, srcPage);
        merger.copyStructure(markedContentParents);

        // NOTE(review): the OBJR position checks below are disabled in the original source;
        // confirm whether they should be re-enabled or removed.
        //        PDFArray array = handler.getPageParentTree();

        //        PDFStructElem kid = (PDFStructElem)array.get(0);
        //        PDFReference reference = (PDFReference) kid.get("P");
        //        PDFStructElem parent = (PDFStructElem)reference.getObject();
        //        List<PDFObject> kids = parent.getKids();
        //        PDFDictionary first = (PDFDictionary) kids.get(0);

        //        Assert.assertEquals(first.get("Type").toString(), "/OBJR");
        //        PDFDictionary last = (PDFDictionary) kids.get(2);
        //        Assert.assertEquals(last.get("Type").toString(), "/OBJR");

        //        PDFStructElem middle = (PDFStructElem) kids.get(1);
        //        Assert.assertEquals(middle.get("Type").toString(), "/StructElem");
    } finally {
        // The loaded document was previously never closed; release it even if the merge fails.
        doc.close();
    }
}

From source file:org.apache.fop.render.pdf.StructureTreeMergerTestCase.java

License:Apache License

/**
 * Replaces the {@code Obj} entry of one kid in the source page parent tree with a
 * {@link COSObject} wrapping {@code null}, merges the structure, and checks that the
 * corresponding merged dictionary has no {@code "Obj"} entry.
 *
 * @throws IOException if the test PDF cannot be loaded or closed
 */
@Test
public void testCheckNullCOSObject() throws IOException {
    setUp();
    PDDocument doc = PDDocument.load(new File(getClass().getResource(BrokenLink).getFile()));
    try {
        PDPage srcPage = doc.getPage(0);
        PageParentTreeFinder finder = new PageParentTreeFinder(srcPage);
        COSArray markedContentParents = finder.getPageParentTreeArray(doc);

        // Indirect object whose wrapped value is null, standing in for a broken reference.
        COSObject nullObj = new COSObject(null);
        nullObj.setObjectNumber(100);
        nullObj.setGenerationNumber(0);

        PDFStructElem elem = new PDFStructElem();
        elem.setObjectNumber(2);

        // Point the second kid of the second parent at the null object.
        COSObject parent = (COSObject) markedContentParents.get(1);
        COSArray kids = (COSArray) parent.getDictionaryObject(COSName.K);
        COSDictionary kid = (COSDictionary) kids.get(1);
        kid.setItem(COSName.OBJ, nullObj);

        adapter = new PDFBoxAdapter(pdfPage, new HashMap(), new HashMap<Integer, PDFArray>());
        PDFLogicalStructureHandler handler = setUpPDFLogicalStructureHandler();
        StructureTreeMerger merger = new StructureTreeMerger(elem, handler, adapter, srcPage);
        merger.copyStructure(markedContentParents);

        PDFArray array = handler.getPageParentTree();
        PDFStructElem parentElem = (PDFStructElem) array.get(1);
        PDFDictionary objrDict = (PDFDictionary) parentElem.getKids().get(1);
        Assert.assertNull(objrDict.get("Obj"));
    } finally {
        // The loaded document was previously never closed; release it even if an assertion fails.
        doc.close();
    }
}

From source file:org.apache.fop.render.pdf.TaggedPDFConductorTestCase.java

License:Apache License

/**
 * Loads the named PDF resource and lets a {@link TaggedPDFConductor} handle the logical
 * structure of its first page, using {@code elem} (renumbered to object 2) as the
 * structure element passed to the conductor.
 *
 * @param pdf  classpath resource name of the PDF, resolved relative to this class
 * @param elem structure element handed to the conductor; its object number is set to 2
 * @throws IOException if the PDF cannot be loaded
 */
private void runConductor(String pdf, PDFStructElem elem) throws IOException {
    setUp();
    File pdfFile = new File(getClass().getResource(pdf).getFile());
    PDDocument document = PDDocument.load(pdfFile);
    PDPage firstPage = document.getPage(0);
    elem.setObjectNumber(2);
    PDFBoxAdapter boxAdapter = new PDFBoxAdapter(pdfPage, new HashMap(), new HashMap<Integer, PDFArray>());
    PDFLogicalStructureHandler structureHandler = setUpPDFLogicalStructureHandler();
    TaggedPDFConductor conductor = new TaggedPDFConductor(elem, structureHandler, firstPage, boxAdapter);
    // NOTE(review): the document is left open, as in the original — confirm whether callers
    // rely on it staying open while they inspect the results.
    conductor.handleLogicalStructure(document);
}

From source file:org.apache.james.mailbox.store.search.PDFTextExtractor.java

License:Apache License

/**
 * Extracts the text of a PDF read from the given stream.
 *
 * @param inputStream stream positioned at the start of the PDF data
 * @return parsed content holding the extracted text (empty if the stripper returned
 *         {@code null}) and an empty metadata map
 * @throws IOException if the PDF cannot be loaded or its text cannot be extracted
 */
private ParsedContent extractTextFromPDF(InputStream inputStream) throws IOException {
    PDFTextStripper stripper = new PDFTextStripper();
    // The document was previously never closed; close it once the text has been read.
    try (PDDocument document = PDDocument.load(inputStream)) {
        String text = stripper.getText(document);
        return new ParsedContent(Optional.ofNullable(text), ImmutableMap.of());
    }
}

From source file:org.apache.padaf.preflight.ExtractStream.java

License:Apache License

/**
 * Command-line entry point: writes the unfiltered content of one stream object of a PDF
 * to the file {@code stream.out} in the current directory.
 *
 * <p>Nothing is written if the referenced object is not a {@code COSStream}.
 *
 * @param args {@code file objNum objGen}: path of the PDF, object number and generation
 *             number of the object to extract
 * @throws Exception if the PDF cannot be read, the numbers cannot be parsed, or the output
 *                   file cannot be written
 */
public static void main(String[] args) throws Exception {
    if (args.length != 3) {
        System.err.println("usage : ExtractStream file objNum objGen");
        // Previously execution fell through to args[0] after printing the usage.
        // Same exit status as RetrieveMissingStream uses for a usage error.
        System.exit(233);
    }
    PDDocument document = PDDocument.load(new FileInputStream(args[0]));
    try {
        COSObjectKey key = new COSObjectKey(Integer.parseInt(args[1]), Integer.parseInt(args[2]));
        COSObject obj = document.getDocument().getObjectFromPool(key);
        if (obj.getObject() instanceof COSStream) {
            COSStream stream = (COSStream) obj.getObject();
            InputStream is = stream.getUnfilteredStream();
            try {
                FileOutputStream out = new FileOutputStream("stream.out");
                try {
                    IOUtils.copyLarge(is, out);
                } finally {
                    // Close the output even when the copy fails.
                    IOUtils.closeQuietly(out);
                }
            } finally {
                IOUtils.closeQuietly(is);
            }
        }
    } finally {
        // The document was previously never closed.
        document.close();
    }
}

From source file:org.apache.padaf.preflight.PdfA1bValidator.java

License:Apache License

/**
 * Validates the PDF provided by {@code source} and returns the collected result.
 *
 * <p>The data source is read three times: once by the JavaCC {@code PDFParser} (syntax
 * check), once by PDFBox to build the {@code PDDocument}, and once by the
 * {@code ExtractorTokenManager}. After that every helper in {@code priorHelpers} and then
 * every helper in {@code standHelpers} is run through {@code runValidation}.
 *
 * <p>On success the loaded {@code PDDocument} and the handler's metadata are attached to
 * the returned result; neither the document nor the handler is closed by this method on
 * that path. The handler is closed here only when a {@code ValidationException} is thrown.
 *
 * @param source data source of the PDF to validate
 * @return a valid result if no errors were collected, otherwise a result carrying all
 *         collected errors
 * @throws ValidationException if any of the three parsing passes fails with an
 *         {@code IOException}, or if one is raised from within the try block
 */
public synchronized ValidationResult validate(DataSource source) throws ValidationException {
    DocumentHandler handler = createDocumentHandler(source);
    try {
        ArrayList<ValidationError> allErrors = new ArrayList<ValidationError>();

        // syntax (javacc) validation
        try {
            InputStreamReader reader = new InputStreamReader(source.getInputStream(), encoding);

            PDFParser parser = new PDFParser(reader);
            handler.setParser(parser);
            parser.PDF();
        } catch (IOException e) {
            throw new ValidationException("Failed to parse datasource due to : " + e.getMessage(), e);
        } catch (ParseException e) {
            // A syntax error is recorded as validation errors; validation continues below.
            allErrors.addAll(createErrorResult(e).getErrorsList());
        }

        // if here is reached, validate with helpers
        // init PDF Box document
        PDDocument document = null;
        try {
            // Reads from the handler's source rather than the 'source' parameter.
            document = PDDocument.load(handler.getSource().getInputStream());
            handler.setDocument(document);
        } catch (IOException e) {
            throw new ValidationException("PDFBox failed to parse datasource", e);
        }

        // init PDF Extractor
        try {
            SimpleCharStream scs = new SimpleCharStream(source.getInputStream());
            ExtractorTokenManager extractor = new ExtractorTokenManager(scs);
            extractor.parse();
            handler.setPdfExtractor(extractor);
        } catch (IOException e) {
            throw new ValidationException("PDF ExtractorTokenMng failed to parse datasource", e);
        }

        /* 
         * call all helpers
         */

        // Execute priority helpers.
        for (AbstractValidationHelper helper : priorHelpers) {
            runValidation(handler, helper, allErrors);
        }

        // Execute other helpers.
        for (AbstractValidationHelper helper : standHelpers) {
            runValidation(handler, helper, allErrors);
        }

        // check result
        ValidationResult valRes = null;
        if (allErrors.size() == 0) {
            valRes = new ValidationResult(true);
        } else {
            // there are some errors
            valRes = new ValidationResult(allErrors);
        }

        // attach the parsed document and metadata to the result to avoid a second file parsing
        valRes.setPdf(document);
        valRes.setXmpMetaData(handler.getMetadata());
        return valRes;
    } catch (ValidationException e) {
        // ---- Close all open resources if an error occurs.
        // NOTE(review): only ValidationException triggers this cleanup; other exception
        // types propagate without handler.close() being called here.
        handler.close();
        throw e;
    }
}

From source file:org.apache.padaf.preflight.RetrieveMissingStream.java

License:Apache License

/**
 * Command-line entry point: loads a PDF, collects the keys of all objects that are
 * {@code COSStream}s, then walks every page of the catalog.
 *
 * <p>NOTE(review): the collected keys and the per-page values are not used for anything
 * yet, and one empty line is printed per page. The tool looks unfinished; confirm the
 * intended output before relying on it.
 *
 * @param args a single element: the path of the PDF file
 * @throws Exception if the file cannot be read or parsed
 */
public static void main(String[] args) throws Exception {
    if (args.length != 1) {
        System.err.println("usage : RetrieveMissingStream file");
        System.exit(233);
    }

    HashSet<COSObjectKey> listOfKeys = new HashSet<COSObjectKey>();

    PDDocument document = PDDocument.load(new FileInputStream(args[0]));
    try {
        List<COSObject> lCosObj = document.getDocument().getObjects();
        for (COSObject cosObject : lCosObj) {

            if (cosObject.getObject() instanceof COSStream) {
                listOfKeys.add(new COSObjectKey(cosObject.getObjectNumber().intValue(),
                        cosObject.getGenerationNumber().intValue()));
            }

        }

        PDDocumentCatalog catalog = document.getDocumentCatalog();
        List<?> pages = catalog.getAllPages();
        for (int i = 0; i < pages.size(); ++i) {
            PDPage pdp = (PDPage) pages.get(i);
            // Both values are looked up but currently unused (kept from the original).
            PDStream pdStream = pdp.getContents();

            COSBase b = pdp.getCOSDictionary().getItem(COSName.getPDFName("Contents"));
            System.out.println();
        }
    } finally {
        // The document was previously never closed.
        document.close();
    }
}

From source file:org.apache.pdflens.Main.java

License:Apache License

/**
 * Parses a document and, if it is encrypted, attempts to decrypt it with the empty
 * password. A failed decryption is only reported (message on stderr or stack trace);
 * the document is returned either way.
 *
 * @param input The input stream for the document.
 *
 * @return The document.
 *
 * @throws IOException If there is an error parsing the document.
 */
private static PDDocument parseDocument(InputStream input) throws IOException {
    PDDocument parsed = PDDocument.load(input);
    if (!parsed.isEncrypted()) {
        return parsed;
    }

    try {
        parsed.decrypt("");
    } catch (InvalidPasswordException e) {
        System.err.println("Error: The document is encrypted.");
    } catch (org.apache.pdfbox.exceptions.CryptographyException e) {
        e.printStackTrace();
    }
    return parsed;
}

From source file:org.apache.syncope.client.console.wicket.markup.html.form.preview.BinaryPDFPreviewer.java

License:Apache License

/**
 * Builds the preview component for an uploaded PDF: the rendered first page when it can
 * be produced, otherwise the "no preview" fragment.
 *
 * <p>No image is produced when the document is encrypted or when loading/rendering fails
 * with an {@code IOException}; the failure is logged.
 *
 * @param uploadedBytes raw bytes of the uploaded PDF
 * @return the result of adding or replacing the preview container on this component
 */
@Override
public Component preview(final byte[] uploadedBytes) {
    firstPage = null;

    PDDocument pdf = null;
    try {
        pdf = PDDocument.load(new ByteArrayInputStream(uploadedBytes));
        if (!pdf.isEncrypted()) {
            firstPage = new PDFRenderer(pdf).renderImage(0, RESOLUTION, IMAGE_TYPE);
        } else {
            LOG.info("Document is encrypted, no preview is possible");
        }
    } catch (IOException e) {
        LOG.error("While generating thumbnail from first page", e);
    } finally {
        IOUtils.closeQuietly(pdf);
    }

    final Fragment content;
    if (firstPage != null) {
        content = new Fragment("preview", "previewFragment", this);
        content.add(new NonCachingImage("previewImage", new ThumbnailImageResource(firstPage)));
    } else {
        content = new Fragment("preview", "noPreviewFragment", this);
    }

    WebMarkupContainer container = new WebMarkupContainer("previewContainer");
    container.setOutputMarkupId(true);
    container.add(content);
    return this.addOrReplace(container);
}

From source file:org.argrr.extractor.gdrive.downloader.ChartsDownloader.java

License:Open Source License

/**
 * Writes every image found in the page resources of {@code <path>/<fileName>.pdf} to
 * {@code ChartsDownloader.rootOutputPathCharts}, naming the outputs
 * {@code <fileName>-1}, {@code <fileName>-2}, ... in page order.
 *
 * @param path     directory containing the PDF
 * @param fileName name of the PDF without the {@code .pdf} extension; also used as the
 *                 prefix of the written image files
 * @throws IOException if the PDF cannot be loaded or an image cannot be written
 */
public static void extractPictures(String path, String fileName) throws IOException {
    // A load failure used to be printed and swallowed, which then caused a
    // NullPointerException on the null document; let the IOException propagate instead.
    PDDocument document = PDDocument.load(path + "/" + fileName + ".pdf");
    try {
        List<?> pages = document.getDocumentCatalog().getAllPages();
        int i = 1;

        for (Object pageObject : pages) {
            PDPage page = (PDPage) pageObject;
            PDResources resources = page.getResources();
            if (resources == null) {
                // Page without a resources dictionary: nothing to extract.
                continue;
            }
            Map<?, ?> pageImages = resources.getImages();
            if (pageImages == null) {
                continue;
            }
            for (Object imageObject : pageImages.values()) {
                PDXObjectImage image = (PDXObjectImage) imageObject;
                image.write2file(ChartsDownloader.rootOutputPathCharts + "/" + fileName + "-" + i);
                i++;
            }
        }
    } finally {
        // The document was previously never closed.
        document.close();
    }
}