Example usage for org.apache.pdfbox.pdmodel PDDocument load

List of usage examples for org.apache.pdfbox.pdmodel PDDocument load

Introduction

On this page you can find example usages of the org.apache.pdfbox.pdmodel PDDocument load method.

Prototype

public static PDDocument load(byte[] input) throws IOException 

Source Link

Document

Parses a PDF.

Usage

From source file:org.apache.fop.render.pdf.StructureTreeMergerTestCase.java

License:Apache License

/**
 * Runs a structure-tree merge over the first page of the {@code MissingOBJR} resource.
 *
 * <p>The test currently passes as long as {@code copyStructure} does not throw; the
 * positional assertions are disabled (see the note inside the body).
 *
 * @throws IOException if the source PDF cannot be loaded or closed
 */
@Test
public void testOBJRCorrectPosition() throws IOException {
    setUp();
    PDDocument doc = PDDocument.load(new File(getClass().getResource(MissingOBJR).getFile()));
    try {
        PDPage srcPage = doc.getPage(0);
        PageParentTreeFinder finder = new PageParentTreeFinder(srcPage);
        COSArray markedContentParents = finder.getPageParentTreeArray(doc);
        PDFStructElem elem = new PDFStructElem();
        elem.setObjectNumber(2);
        adapter = new PDFBoxAdapter(pdfPage, new HashMap(), new HashMap<Integer, PDFArray>());
        PDFLogicalStructureHandler handler = setUpPDFLogicalStructureHandler();
        StructureTreeMerger merger = new StructureTreeMerger(elem, handler, adapter, srcPage);
        merger.copyStructure(markedContentParents);

        // NOTE(review): the assertions below were already commented out in the original
        // test; they are kept verbatim so they can be re-enabled once they pass.
        //        PDFArray array = handler.getPageParentTree();
        //        PDFStructElem kid = (PDFStructElem)array.get(0);
        //        PDFReference reference = (PDFReference) kid.get("P");
        //        PDFStructElem parent = (PDFStructElem)reference.getObject();
        //        List<PDFObject> kids = parent.getKids();
        //        PDFDictionary first = (PDFDictionary) kids.get(0);
        //        Assert.assertEquals(first.get("Type").toString(), "/OBJR");
        //        PDFDictionary last = (PDFDictionary) kids.get(2);
        //        Assert.assertEquals(last.get("Type").toString(), "/OBJR");
        //        PDFStructElem middle = (PDFStructElem) kids.get(1);
        //        Assert.assertEquals(middle.get("Type").toString(), "/StructElem");
    } finally {
        // Release the parsed source document even when the merge throws.
        doc.close();
    }
}

From source file:org.apache.fop.render.pdf.StructureTreeMergerTestCase.java

License:Apache License

/**
 * Verifies that a {@code COSObject} wrapping {@code null}, planted as the {@code Obj}
 * entry of a kid dictionary in the {@code BrokenLink} resource, results in no
 * {@code "Obj"} entry on the corresponding merged dictionary.
 *
 * @throws IOException if the source PDF cannot be loaded or closed
 */
@Test
public void testCheckNullCOSObject() throws IOException {
    setUp();
    PDDocument doc = PDDocument.load(new File(getClass().getResource(BrokenLink).getFile()));
    try {
        PDPage srcPage = doc.getPage(0);
        PageParentTreeFinder finder = new PageParentTreeFinder(srcPage);
        COSArray markedContentParents = finder.getPageParentTreeArray(doc);

        // Indirect object with no underlying value; it replaces a kid's Obj entry below.
        COSObject nullObj = new COSObject(null);
        nullObj.setObjectNumber(100);
        nullObj.setGenerationNumber(0);

        PDFStructElem elem = new PDFStructElem();
        elem.setObjectNumber(2);

        // Second parent entry -> its K array -> second kid gets the null-backed Obj.
        COSObject parent = (COSObject) markedContentParents.get(1);
        COSArray kids = (COSArray) parent.getDictionaryObject(COSName.K);
        COSDictionary kid = (COSDictionary) kids.get(1);
        kid.setItem(COSName.OBJ, nullObj);

        adapter = new PDFBoxAdapter(pdfPage, new HashMap(), new HashMap<Integer, PDFArray>());
        PDFLogicalStructureHandler handler = setUpPDFLogicalStructureHandler();
        StructureTreeMerger merger = new StructureTreeMerger(elem, handler, adapter, srcPage);
        merger.copyStructure(markedContentParents);

        PDFArray array = handler.getPageParentTree();
        PDFStructElem parentElem = (PDFStructElem) array.get(1);
        PDFDictionary objrDict = (PDFDictionary) parentElem.getKids().get(1);
        Assert.assertNull(objrDict.get("Obj"));
    } finally {
        // Release the parsed source document even when an assertion or the merge fails.
        doc.close();
    }
}

From source file:org.apache.fop.render.pdf.TaggedPDFConductorTestCase.java

License:Apache License

/**
 * Loads the named PDF resource and runs a {@code TaggedPDFConductor} over its first page.
 *
 * @param pdf  resource name of the PDF, resolved relative to this class
 * @param elem structure element handed to the conductor; its object number is set to 2
 * @throws IOException if the PDF cannot be loaded or processed
 */
private void runConductor(String pdf, PDFStructElem elem) throws IOException {
    setUp();
    File pdfFile = new File(getClass().getResource(pdf).getFile());
    PDDocument sourceDocument = PDDocument.load(pdfFile);
    PDPage firstPage = sourceDocument.getPage(0);
    elem.setObjectNumber(2);
    PDFBoxAdapter boxAdapter = new PDFBoxAdapter(pdfPage, new HashMap(), new HashMap<Integer, PDFArray>());
    PDFLogicalStructureHandler structureHandler = setUpPDFLogicalStructureHandler();
    TaggedPDFConductor conductor = new TaggedPDFConductor(elem, structureHandler, firstPage, boxAdapter);
    conductor.handleLogicalStructure(sourceDocument);
}

From source file:org.apache.james.mailbox.store.search.PDFTextExtractor.java

License:Apache License

/**
 * Extracts the text of a PDF read from the given stream.
 *
 * <p>The parsed {@code PDDocument} is closed before returning, whether or not
 * text extraction succeeds.
 *
 * @param inputStream stream positioned at the start of the PDF data
 * @return parsed content holding the extracted text (if any) and an empty metadata map
 * @throws IOException if the PDF cannot be parsed or its text cannot be extracted
 */
private ParsedContent extractTextFromPDF(InputStream inputStream) throws IOException {
    try (PDDocument document = PDDocument.load(inputStream)) {
        String text = new PDFTextStripper().getText(document);
        return new ParsedContent(Optional.ofNullable(text), ImmutableMap.of());
    }
}

From source file:org.apache.padaf.preflight.ExtractStream.java

License:Apache License

/**
 * Command-line entry point: writes the unfiltered content of one stream object of a
 * PDF to the file {@code stream.out} in the working directory.
 *
 * <p>Nothing is written when the addressed object is not a stream.
 *
 * @param args {@code file objNum objGen} - the PDF path, then the object and generation
 *             numbers of the stream to extract
 * @throws Exception if the PDF cannot be read, the numbers cannot be parsed, or the
 *                   output file cannot be written
 */
public static void main(String[] args) throws Exception {
    if (args.length != 3) {
        System.err.println("usage : ExtractStream file objNum objGen");
        // Stop here; continuing would fail on the args[...] accesses below.
        // Exit code matches the sibling RetrieveMissingStream tool.
        System.exit(233);
    }
    PDDocument document = PDDocument.load(new FileInputStream(args[0]));
    try {
        COSObject obj = document.getDocument()
                .getObjectFromPool(new COSObjectKey(Integer.parseInt(args[1]), Integer.parseInt(args[2])));
        if (obj.getObject() instanceof COSStream) {
            COSStream stream = (COSStream) obj.getObject();
            InputStream is = stream.getUnfilteredStream();
            FileOutputStream out = null;
            try {
                out = new FileOutputStream("stream.out");
                IOUtils.copyLarge(is, out);
            } finally {
                // Close both ends even when the copy fails.
                IOUtils.closeQuietly(out);
                IOUtils.closeQuietly(is);
            }
        }
    } finally {
        document.close();
    }
}

From source file:org.apache.padaf.preflight.PdfA1bValidator.java

License:Apache License

/**
 * Validates the given data source and returns the collected result.
 *
 * <p>Stages, in order: javacc syntax parsing, loading the document with PDFBox,
 * running the PDF extractor, then running the priority helpers followed by the
 * standard helpers. Syntax {@code ParseException}s are recorded as validation
 * errors and validation continues; I/O failures in any stage abort with a
 * {@code ValidationException}.
 *
 * <p>The loaded {@code PDDocument} and the handler's metadata are attached to the
 * returned result, so the document is still open when this method returns.
 *
 * @param source the data source providing the PDF content
 * @return a valid result when no errors were collected, otherwise a result
 *         carrying all collected errors
 * @throws ValidationException if the source cannot be read or parsed by one of
 *                             the stages; the handler is closed before rethrowing
 */
public synchronized ValidationResult validate(DataSource source) throws ValidationException {
    DocumentHandler handler = createDocumentHandler(source);
    try {
        ArrayList<ValidationError> allErrors = new ArrayList<ValidationError>();

        // syntax (javacc) validation
        // NOTE(review): every stage requests a new input stream from the source and none
        // of them is closed here - presumably each call yields a fresh stream; confirm.
        try {
            InputStreamReader reader = new InputStreamReader(source.getInputStream(), encoding);

            PDFParser parser = new PDFParser(reader);
            handler.setParser(parser);
            parser.PDF();
        } catch (IOException e) {
            throw new ValidationException("Failed to parse datasource due to : " + e.getMessage(), e);
        } catch (ParseException e) {
            // A syntax error is a validation finding, not a fatal failure: record it and go on.
            allErrors.addAll(createErrorResult(e).getErrorsList());
        }

        // if here is reached, validate with helpers
        // init PDF Box document
        PDDocument document = null;
        try {
            document = PDDocument.load(handler.getSource().getInputStream());
            handler.setDocument(document);
        } catch (IOException e) {
            throw new ValidationException("PDFBox failed to parse datasource", e);
        }

        // init PDF Extractor
        try {
            SimpleCharStream scs = new SimpleCharStream(source.getInputStream());
            ExtractorTokenManager extractor = new ExtractorTokenManager(scs);
            extractor.parse();
            handler.setPdfExtractor(extractor);
        } catch (IOException e) {
            throw new ValidationException("PDF ExtractorTokenMng failed to parse datasource", e);
        }

        /*
         * call all helpers
         */

        // Execute priority helpers.
        for (AbstractValidationHelper helper : priorHelpers) {
            runValidation(handler, helper, allErrors);
        }

        // Execute other helpers.
        for (AbstractValidationHelper helper : standHelpers) {
            runValidation(handler, helper, allErrors);
        }

        // check result
        ValidationResult valRes = null;
        if (allErrors.size() == 0) {
            valRes = new ValidationResult(true);
        } else {
            // there are some errors
            valRes = new ValidationResult(allErrors);
        }

        // attach the parsed objects to the result to avoid a second file parsing
        valRes.setPdf(document);
        valRes.setXmpMetaData(handler.getMetadata());
        return valRes;
    } catch (ValidationException e) {
        // ---- Close all open resources if an error occurs.
        // NOTE(review): only ValidationException triggers this cleanup; an unchecked
        // exception from a helper would leave the handler open - confirm this is intended.
        handler.close();
        throw e;
    }
}

From source file:org.apache.padaf.preflight.RetrieveMissingStream.java

License:Apache License

/**
 * Command-line entry point: loads a PDF, collects the keys of all objects that are
 * streams, and then walks every page reading its content stream.
 *
 * <p>NOTE(review): the collected keys and the per-page values are never used and the
 * loop only prints blank lines, so the tool looks unfinished; the reads are kept
 * because they may throw on a malformed file.
 *
 * @param args a single element: the path of the PDF file to inspect
 * @throws Exception if the PDF cannot be read or parsed
 */
public static void main(String[] args) throws Exception {
    if (args.length != 1) {
        System.err.println("usage : RetrieveMissingStream file");
        System.exit(233);
    }

    HashSet<COSObjectKey> listOfKeys = new HashSet<COSObjectKey>();

    PDDocument document = PDDocument.load(new FileInputStream(args[0]));
    try {
        List<COSObject> lCosObj = document.getDocument().getObjects();
        for (COSObject cosObject : lCosObj) {
            if (cosObject.getObject() instanceof COSStream) {
                listOfKeys.add(new COSObjectKey(cosObject.getObjectNumber().intValue(),
                        cosObject.getGenerationNumber().intValue()));
            }
        }

        PDDocumentCatalog catalog = document.getDocumentCatalog();
        List<?> pages = catalog.getAllPages();
        for (Object page : pages) {
            PDPage pdp = (PDPage) page;
            PDStream pdStream = pdp.getContents();

            COSBase b = pdp.getCOSDictionary().getItem(COSName.getPDFName("Contents"));
            System.out.println();
        }
    } finally {
        // Release the parsed document even when inspection fails.
        document.close();
    }
}

From source file:org.apache.pdflens.Main.java

License:Apache License

/**
 * Loads a PDF document from a stream and, if it is encrypted, attempts to decrypt
 * it with the empty password.
 *
 * <p>A failed decryption is only reported on standard error; the document is
 * returned regardless.
 *
 * @param input The input stream for the document.
 *
 * @return The loaded document.
 *
 * @throws IOException If there is an error parsing the document.
 */
private static PDDocument parseDocument(InputStream input) throws IOException {
    final PDDocument parsed = PDDocument.load(input);
    if (!parsed.isEncrypted()) {
        return parsed;
    }

    try {
        parsed.decrypt("");
    } catch (InvalidPasswordException e) {
        System.err.println("Error: The document is encrypted.");
    } catch (org.apache.pdfbox.exceptions.CryptographyException e) {
        e.printStackTrace();
    }
    return parsed;
}

From source file:org.apache.syncope.client.console.wicket.markup.html.form.preview.BinaryPDFPreviewer.java

License:Apache License

/**
 * Builds the preview component for an uploaded PDF by rendering its first page.
 *
 * <p>When the document is encrypted or cannot be read, no image is rendered and the
 * "no preview" fragment is shown instead.
 *
 * @param uploadedBytes raw bytes of the uploaded PDF
 * @return the result of adding or replacing the preview container on this component
 */
@Override
public Component preview(final byte[] uploadedBytes) {
    firstPage = null;

    PDDocument pdf = null;
    try {
        pdf = PDDocument.load(new ByteArrayInputStream(uploadedBytes));
        if (!pdf.isEncrypted()) {
            firstPage = new PDFRenderer(pdf).renderImage(0, RESOLUTION, IMAGE_TYPE);
        } else {
            LOG.info("Document is encrypted, no preview is possible");
        }
    } catch (IOException e) {
        LOG.error("While generating thumbnail from first page", e);
    } finally {
        IOUtils.closeQuietly(pdf);
    }

    // Pick the markup fragment depending on whether a thumbnail could be rendered.
    final boolean rendered = firstPage != null;
    Fragment fragment = new Fragment("preview", rendered ? "previewFragment" : "noPreviewFragment", this);
    if (rendered) {
        fragment.add(new NonCachingImage("previewImage", new ThumbnailImageResource(firstPage)));
    }

    WebMarkupContainer container = new WebMarkupContainer("previewContainer");
    container.setOutputMarkupId(true);
    container.add(fragment);
    return this.addOrReplace(container);
}

From source file:org.argrr.extractor.gdrive.downloader.ChartsDownloader.java

License:Open Source License

/**
 * Writes every image found in the page resources of {@code <path>/<fileName>.pdf} to
 * {@code <rootOutputPathCharts>/<fileName>-<n>}, where {@code n} counts images from 1
 * across all pages.
 *
 * <p>A failure to load the PDF is propagated to the caller. Pages without resources
 * or without images are skipped.
 *
 * @param path     directory containing the PDF
 * @param fileName PDF file name without the {@code .pdf} extension; also used as the
 *                 prefix of the written image files
 * @throws IOException if the PDF cannot be loaded, an image cannot be written, or the
 *                     document cannot be closed
 */
public static void extractPictures(String path, String fileName) throws IOException {
    // Let a load failure propagate: continuing without a document could only end in a
    // NullPointerException.
    PDDocument document = PDDocument.load(path + "/" + fileName + ".pdf");
    try {
        List<?> pages = document.getDocumentCatalog().getAllPages();
        int imageIndex = 1;

        for (Object pageObject : pages) {
            PDPage page = (PDPage) pageObject;
            PDResources resources = page.getResources();
            if (resources == null) {
                continue;
            }
            Map<?, ?> pageImages = resources.getImages();
            if (pageImages == null) {
                continue;
            }
            for (Object imageObject : pageImages.values()) {
                PDXObjectImage image = (PDXObjectImage) imageObject;
                image.write2file(ChartsDownloader.rootOutputPathCharts + "/" + fileName + "-" + imageIndex);
                imageIndex++;
            }
        }
    } finally {
        // Release the parsed document even when writing an image fails.
        document.close();
    }
}