Example usage for org.apache.pdfbox.pdmodel PDDocument load

List of usage examples for org.apache.pdfbox.pdmodel PDDocument load

Introduction

On this page you can find example usages of org.apache.pdfbox.pdmodel PDDocument load.

Prototype

public static PDDocument load(byte[] input) throws IOException 

Source Link

Document

Parses a PDF.

Usage

From source file:org.mycore.iview2.frontend.MCRPDFTools.java

License:Open Source License

/**
 * Renders the first page of a PDF file and, if requested, fits it centered into a square
 * thumbnail.
 *
 * @param pdfFile       path of the PDF file to render
 * @param thumbnailSize edge length in pixels of the square thumbnail; only used when
 *                      {@code centered} is {@code true}
 * @param centered      if {@code false} the rendered first page is returned unscaled; if
 *                      {@code true} the page is scaled (keeping its aspect ratio) and drawn
 *                      centered on a {@code thumbnailSize x thumbnailSize} ARGB image
 * @return the rendered page or the centered thumbnail
 * @throws IOException if the file cannot be read or rendered
 */
static BufferedImage getThumbnail(Path pdfFile, int thumbnailSize, boolean centered) throws IOException {
    // Both the file stream and the document are closed on every exit path.
    try (InputStream fileIS = Files.newInputStream(pdfFile); PDDocument pdf = PDDocument.load(fileIS)) {
        PDFRenderer pdfRenderer = new PDFRenderer(pdf);
        BufferedImage level1Image = pdfRenderer.renderImage(0);

        if (!centered) {
            return level1Image;
        }
        final double width = level1Image.getWidth();
        final double height = level1Image.getHeight();
        LOGGER.info("new PDFBox: " + width + "x" + height);
        LOGGER.info("temporary image dimensions: " + width + "x" + height);
        // The longer edge becomes thumbnailSize, the shorter one is scaled proportionally.
        final int newWidth = width < height ? (int) Math.ceil(thumbnailSize * width / height) : thumbnailSize;
        final int newHeight = width < height ? thumbnailSize : (int) Math.ceil(thumbnailSize * height / width);
        // centered is always true from here on, so the target is thumbnailSize x thumbnailSize
        final BufferedImage bicubic = new BufferedImage(thumbnailSize, thumbnailSize,
                BufferedImage.TYPE_INT_ARGB);
        LOGGER.info("target image dimensions: " + bicubic.getWidth() + "x" + bicubic.getHeight());
        final Graphics2D bg = bicubic.createGraphics();
        try {
            bg.setRenderingHint(RenderingHints.KEY_INTERPOLATION, RenderingHints.VALUE_INTERPOLATION_BICUBIC);
            int x = (thumbnailSize - newWidth) / 2;
            int y = (thumbnailSize - newHeight) / 2;
            // Defensive check: one of newWidth/newHeight equals thumbnailSize, so one offset is 0.
            if (x != 0 && y != 0) {
                LOGGER.warn("Writing at position " + x + "," + y);
            }
            bg.drawImage(level1Image, x, y, x + newWidth, y + newHeight, 0, 0, (int) Math.ceil(width),
                    (int) Math.ceil(height), null);
        } finally {
            // Release the graphics context even if drawing fails.
            bg.dispose();
        }
        return bicubic;
    }
}

From source file:org.mycore.media.MCRMediaPDFParser.java

License:Open Source License

/**
 * Parses a PDF file and stores its metadata in a new {@link MCRPDFObject}.
 * <p>
 * Recorded are the file name, size and folder, the number of pages, the rounded media box
 * dimensions of the first page and, if present, the document information tags.
 *
 * @param file the PDF file to parse
 * @return the populated media object
 * @throws Exception if the file does not exist, cannot be loaded or cannot be inspected; the
 *                   original failure is attached as the cause
 * @see MCRMediaObject#clone()
 */
@SuppressWarnings("unchecked")
public synchronized MCRMediaObject parse(File file) throws Exception {
    if (!file.exists())
        throw new IOException("File \"" + file.getName() + "\" doesn't exists!");

    MCRPDFObject media = new MCRPDFObject();

    LOGGER.info("parse " + file.getName() + "...");

    PDDocument pdf = PDDocument.load(file);
    try {
        final String name = file.getName();
        final String absolutePath = file.getAbsolutePath();

        media.fileName = name;
        media.fileSize = file.length();
        // Strip only the trailing file name; String.replace would also remove the name
        // wherever it happens to occur inside the directory part of the path.
        media.folderName = absolutePath.endsWith(name)
                ? absolutePath.substring(0, absolutePath.length() - name.length())
                : absolutePath.replace(name, "");

        PDPageTree pages = pdf.getDocumentCatalog().getPages();

        media.numPages = pdf.getNumberOfPages();

        // Page dimensions are taken from the first page only.
        PDPage page = pages.get(0);
        PDRectangle rect = page.getMediaBox();

        media.width = Math.round(rect.getWidth());
        media.height = Math.round(rect.getHeight());

        PDDocumentInformation info = pdf.getDocumentInformation();
        if (info != null) {
            media.tags = new MCRMediaTagObject();
            media.tags.author = info.getAuthor();
            media.tags.creator = info.getCreator();
            media.tags.producer = info.getProducer();
            media.tags.title = info.getTitle();
            media.tags.subject = info.getSubject();
            media.tags.keywords = info.getKeywords();
        }
    } catch (Exception e) {
        // Keep the stack trace in the log and chain the cause so callers can diagnose it.
        LOGGER.error(e.getMessage(), e);
        throw new Exception(e.getMessage(), e);
    } finally {
        pdf.close();
    }

    return media;
}

From source file:org.nines.NinesStatementHandlerTest.java

License:Apache License

/**
 * Loads the sample PDF, checks its page count and verifies that text can be extracted.
 * <p>
 * Exceptions propagate to the test runner so that I/O or parse errors fail the test
 * instead of being printed and ignored.
 */
@Test
public void testPdfStrip() throws Exception {
    FileInputStream is = new FileInputStream(new File("test_data/sample.pdf"));
    try {
        PDDocument pdfDoc = PDDocument.load(is);
        try {
            assertEquals(2, pdfDoc.getNumberOfPages());
            PDFTextStripper pdfStrip = new PDFTextStripper();
            String text = pdfStrip.getText(pdfDoc);

            assertNotNull(text);
            System.out.println(text);
        } finally {
            pdfDoc.close();
        }
    } finally {
        is.close();
    }
}

From source file:org.nines.RdfTextSpider.java

License:Apache License

/**
 * Downloads the PDF at the given URI and extracts its text.
 *
 * @param uri location of the PDF, fetched with an HTTP GET
 * @return the extracted text, encoded with the platform default charset
 * @throws IOException if the server does not answer with status 200 or the PDF cannot be
 *                     read or parsed
 */
private byte[] scrapeExternalPDF(final String uri) throws IOException {
    InputStream is = null;
    PDDocument pdfDoc = null;
    GetMethod get = new GetMethod(uri);
    try {
        int result = httpClient.executeMethod(get);
        if (result != 200) {
            throw new IOException(result + " code returned for URL: " + uri);
        }
        is = get.getResponseBodyAsStream();
        pdfDoc = PDDocument.load(is);
        PDFTextStripper pdfStrip = new PDFTextStripper();
        // NOTE(review): getBytes() uses the platform default charset; confirm callers expect that.
        return pdfStrip.getText(pdfDoc).getBytes();
    } finally {
        // Best-effort cleanup: each resource is released independently so that a failure
        // in one step cannot skip the remaining ones.
        try {
            get.releaseConnection();
        } catch (Exception ignored) {
            // nothing useful can be done if releasing the connection fails
        }
        IOUtils.closeQuietly(is);
        if (pdfDoc != null) {
            try {
                pdfDoc.close();
            } catch (Exception ignored) {
                // the text has already been extracted; a failed close is not fatal
            }
        }
    }
}

From source file:org.nuxeo.ecm.core.convert.plugins.tests.PDFEncodingWarn.java

License:Apache License

/**
 * Extracts the text of the test PDF twice with the patched text stripper.
 * <p>
 * The test passes if neither extraction throws; the document is closed afterwards.
 */
@Test
public void extract() throws IOException {
    URL url = getClass().getResource("/test-docs/nutcracker.pdf");
    PDDocument doc = PDDocument.load(url);
    try {
        PDFTextStripper stripper = new PDF2TextConverter.PatchedPDFTextStripper();
        // The same stripper instance is deliberately run twice on the same document.
        stripper.getText(doc);
        stripper.getText(doc);
    } finally {
        doc.close();
    }
}

From source file:org.nuxeo.ecm.core.convert.plugins.text.extractors.PDF2TextConverter.java

License:Apache License

/**
 * Converts the PDF held by the given blob holder to a plain text blob.
 * <p>
 * If the document's access permissions do not allow content extraction, an empty blob is
 * returned instead of attempting the extraction.
 *
 * @param blobHolder holder of the PDF blob to convert
 * @param parameters conversion parameters (not used by this converter)
 * @return a blob holder with the extracted text as UTF-8 {@code text/plain}
 * @throws ConversionException if reading the PDF or writing the text fails
 */
@Override
public BlobHolder convert(BlobHolder blobHolder, Map<String, Serializable> parameters)
        throws ConversionException {

    PDDocument document = null;
    File f = null;
    // The source stream is closed here; PDDocument.load is not relied upon to close it.
    try (java.io.InputStream in = blobHolder.getBlob().getStream()) {
        document = PDDocument.load(in);
        // NXP-1556: if document is protected an IOException will be raised
        // Instead of catching the exception based on its message string
        // lets avoid sending messages that will generate this error
        // code taken from PDFTextStripper.writeText source.
        // only care about standard encryption and if it was decrypted with
        // the user password
        AccessPermission permission = document.getCurrentAccessPermission();
        if (!permission.canExtractContent()) {
            return new SimpleCachableBlobHolder(Blobs.createBlob(""));
        }

        PatchedPDFTextStripper textStripper = new PatchedPDFTextStripper();

        // use the position information to heuristically organize the
        // extracted paragraphs. This is also important for
        // right-to-left languages.
        textStripper.setSortByPosition(true);

        String text = textStripper.getText(document);
        // replace non breaking space by regular spaces (why?)
        // text = text.replace("\u00a0", " ");
        f = Framework.createTempFile("pdfboplugin", ".txt");
        // Close the writer before the file is read back.
        try (OutputStream fas = new FileOutputStream(f)) {
            fas.write(text.getBytes("UTF-8"));
        }
        try (FileInputStream is = new FileInputStream(f)) {
            Blob blob = Blobs.createBlob(is, "text/plain", "UTF-8");
            return new SimpleCachableBlobHolder(blob);
        }
    } catch (IOException e) {
        throw new ConversionException("Error during text extraction with PDFBox", e);
    } finally {
        if (document != null) {
            try {
                document.close();
            } catch (IOException e) {
                log.error("Error while closing PDFBox document", e);
            }
        }
        if (f != null) {
            f.delete();
        }
    }
}

From source file:org.nuxeo.ecm.platform.convert.tests.BaseConverterTest.java

License:Apache License

/**
 * Extracts the text from a PDF file.
 *
 * @param pdfFile the PDF file to read
 * @return the document content as plain text, trimmed
 * @throws IOException if the file cannot be loaded or its text cannot be extracted
 */
public static String readPdfText(File pdfFile) throws IOException {
    PDFTextStripper textStripper = new PDFTextStripper();
    PDDocument document = PDDocument.load(pdfFile);
    try {
        return textStripper.getText(document).trim();
    } finally {
        // Close the document even when text extraction fails.
        document.close();
    }
}

From source file:org.nuxeo.ecm.platform.convert.tests.BaseConverterTest.java

License:Apache License

/**
 * Checks whether the XMP metadata of a PDF file declares PDF/A conformance level "A".
 *
 * @param pdfFile the PDF file to inspect
 * @return {@code true} if the first {@code pdfaid:conformance} element has the text "A";
 *         {@code false} if the document has no XMP metadata, no such element, or another value
 * @throws Exception if the file cannot be loaded or its metadata cannot be parsed
 */
public static boolean isPDFA(File pdfFile) throws Exception {
    PDDocument pddoc = PDDocument.load(pdfFile);
    try {
        // A document without XMP metadata cannot declare PDF/A conformance.
        if (pddoc.getDocumentCatalog().getMetadata() == null) {
            return false;
        }
        XMPMetadata xmp = pddoc.getDocumentCatalog().getMetadata().exportXMPMetadata();
        Document doc = xmp.getXMPDocument();
        // <rdf:Description xmlns:pdfaid="http://www.aiim.org/pdfa/ns/id/"
        // rdf:about="">
        // <pdfaid:part>1</pdfaid:part>
        // <pdfaid:conformance>A</pdfaid:conformance>
        // </rdf:Description>
        NodeList list = doc.getElementsByTagName("pdfaid:conformance");
        // item(0) is null for an empty list, so check the length before dereferencing.
        return list != null && list.getLength() > 0 && "A".equals(list.item(0).getTextContent());
    } finally {
        pddoc.close();
    }
}

From source file:org.nuxeo.ecm.platform.convert.tests.DocumentUTUtils.java

License:Open Source License

/**
 * Extracts the text from a PDF file./*from   w  ww . j  a va2s. c om*/
 *
 * @return the document content as plain text
 */
public static String readPdfText(File pdfFile) throws IOException {
    PDFTextStripper textStripper = new PDFTextStripper();
    PDDocument document = PDDocument.load(pdfFile);
    String text = textStripper.getText(document);
    document.close();
    return text.trim();
}

From source file:org.nuxeo.ecm.platform.picture.core.test.TestMagickExecutors.java

License:Apache License

/**
 * Converts a JPEG test image to PDF and checks that the result can be loaded by PDFBox.
 * <p>
 * The loaded document is closed and the temporary output file is deleted whether or not
 * the assertions pass.
 */
@Test
public void testConverterToPDF() throws Exception {
    File file = FileUtils.getResourceFileFromContext("images/test.jpg");
    File out = Framework.createTempFile(TMP_FILE_PREFIX, ".document.pdf");
    try {
        ImageConverter.convert(file.getAbsolutePath(), out.getAbsolutePath());

        assertEquals("pdf", FilenameUtils.getExtension(out.getAbsolutePath()));
        PDDocument doc = PDDocument.load(out);
        try {
            assertNotNull(doc);
        } finally {
            // Close before deleting: an open document may keep the file locked.
            if (doc != null) {
                doc.close();
            }
        }
    } finally {
        out.delete();
    }
}