Example usage for org.apache.pdfbox.pdmodel PDDocument load

Introduction

On this page you can find example usages of the load method of org.apache.pdfbox.pdmodel.PDDocument.

Prototype

public static PDDocument load(byte[] input) throws IOException

Source Link

Document

Parses a PDF.

Usage

From source file:org.mycore.iview2.frontend.MCRPDFTools.java

License:Open Source License

/**
 * Renders page index 0 of a PDF and optionally fits it into a square thumbnail.
 *
 * <p>When {@code centered} is {@code false} the rendered page is returned unscaled. Otherwise the
 * page is scaled so that its longer side equals {@code thumbnailSize} and is drawn centered on a
 * {@code thumbnailSize} x {@code thumbnailSize} ARGB image.
 *
 * @param pdfFile path of the PDF file to read
 * @param thumbnailSize edge length in pixels of the square thumbnail (only used when centered)
 * @param centered whether to produce the centered square thumbnail
 * @return the rendered page, or the square thumbnail when {@code centered} is {@code true}
 * @throws IOException if the file cannot be opened, parsed or rendered
 */
static BufferedImage getThumbnail(Path pdfFile, int thumbnailSize, boolean centered) throws IOException {
    // try-with-resources closes the document and then the file stream on every exit path;
    // previously the file stream was never closed.
    try (InputStream fileIS = Files.newInputStream(pdfFile); PDDocument pdf = PDDocument.load(fileIS)) {
        PDFRenderer pdfRenderer = new PDFRenderer(pdf);
        BufferedImage level1Image = pdfRenderer.renderImage(0);
        if (!centered) {
            return level1Image;
        }

        // From here on "centered" is always true, so the target is always a square image.
        final int imageType = BufferedImage.TYPE_INT_ARGB;
        final double width = level1Image.getWidth();
        final double height = level1Image.getHeight();
        LOGGER.info("new PDFBox: " + width + "x" + height);
        LOGGER.info("temporary image dimensions: " + width + "x" + height);

        // Scale so that the longer side becomes thumbnailSize while keeping the aspect ratio.
        final int newWidth = width < height ? (int) Math.ceil(thumbnailSize * width / height) : thumbnailSize;
        final int newHeight = width < height ? thumbnailSize : (int) Math.ceil(thumbnailSize * height / width);

        final BufferedImage bicubic = new BufferedImage(thumbnailSize, thumbnailSize, imageType);
        LOGGER.info("target image dimensions: " + bicubic.getWidth() + "x" + bicubic.getHeight());
        final Graphics2D bg = bicubic.createGraphics();
        bg.setRenderingHint(RenderingHints.KEY_INTERPOLATION, RenderingHints.VALUE_INTERPOLATION_BICUBIC);

        // Offsets that center the scaled page inside the square.
        final int x = (thumbnailSize - newWidth) / 2;
        final int y = (thumbnailSize - newHeight) / 2;
        // NOTE(review): either newWidth or newHeight always equals thumbnailSize, so one of
        // x and y is always 0 and this warning can never fire; "||" may have been intended.
        if (x != 0 && y != 0) {
            LOGGER.warn("Writing at position " + x + "," + y);
        }
        bg.drawImage(level1Image, x, y, x + newWidth, y + newHeight, 0, 0, (int) Math.ceil(width),
                (int) Math.ceil(height), null);
        bg.dispose();
        return bicubic;
    }
}

From source file:org.mycore.media.MCRMediaPDFParser.java

License:Open Source License

/**
 * Parses a PDF file and stores its metadata in a new {@link MCRPDFObject}.
 *
 * <p>Recorded values are the file name, size and containing folder, the page count, the rounded
 * media-box width and height of the first page and, when the document carries an information
 * dictionary, its author, creator, producer, title, subject and keywords.
 *
 * @param file the PDF file to inspect
 * @return MCRMediaObject
 *              can be held any MCRMediaObject
 * @throws IOException if {@code file} does not exist
 * @throws Exception if the document cannot be loaded or read; a failure while reading an
 *             already loaded document is wrapped with the original exception as cause
 * @see MCRMediaObject#clone()
 */
public synchronized MCRMediaObject parse(File file) throws Exception {
    if (!file.exists())
        throw new IOException("File \"" + file.getName() + "\" doesn't exists!");

    MCRPDFObject media = new MCRPDFObject();

    LOGGER.info("parse " + file.getName() + "...");

    PDDocument pdf = PDDocument.load(file);
    try {
        media.fileName = file.getName();
        media.fileSize = file.length();

        // Strip only the trailing file name. String.replace would also remove the same text
        // wherever it occurs in a parent directory name.
        String absolutePath = file.getAbsolutePath();
        media.folderName = absolutePath.substring(0, absolutePath.length() - file.getName().length());

        PDPageTree pages = pdf.getDocumentCatalog().getPages();

        media.numPages = pdf.getNumberOfPages();

        // Dimensions are taken from the first page only.
        PDPage page = pages.get(0);
        PDRectangle rect = page.getMediaBox();

        media.width = Math.round(rect.getWidth());
        media.height = Math.round(rect.getHeight());

        PDDocumentInformation info = pdf.getDocumentInformation();
        if (info != null) {
            media.tags = new MCRMediaTagObject();
            media.tags.author = info.getAuthor();
            media.tags.creator = info.getCreator();
            media.tags.producer = info.getProducer();
            media.tags.title = info.getTitle();
            media.tags.subject = info.getSubject();
            media.tags.keywords = info.getKeywords();
        }
    } catch (Exception e) {
        // Keep the stack trace in the log and the original exception as the cause.
        LOGGER.error(e.getMessage(), e);
        throw new Exception(e.getMessage(), e);
    } finally {
        pdf.close();
    }

    return media;
}

From source file:org.nines.NinesStatementHandlerTest.java

License:Apache License

/**
 * Checks that the sample PDF has two pages and that text can be extracted from it.
 *
 * <p>Exceptions propagate so that a missing or unreadable sample file fails the test instead
 * of only printing a stack trace.
 */
@Test
public void testPdfStrip() throws Exception {
    FileInputStream is = new FileInputStream(new File("test_data/sample.pdf"));
    try {
        PDDocument pdfDoc = PDDocument.load(is);
        try {
            assertEquals(2, pdfDoc.getNumberOfPages());
            PDFTextStripper pdfStrip = new PDFTextStripper();
            String text = pdfStrip.getText(pdfDoc);

            assertNotNull(text);
            System.out.println(text);
        } finally {
            pdfDoc.close();
        }
    } finally {
        is.close();
    }
}

From source file:org.nines.RdfTextSpider.java

License:Apache License

/**
 * Downloads the PDF at the given URI and extracts its text.
 *
 * @param uri location of the PDF, fetched with an HTTP GET
 * @return the extracted text encoded with the platform default charset
 * @throws IOException if the response status is not 200, the response has no body, or the
 *             PDF cannot be parsed
 */
private byte[] scrapeExternalPDF(final String uri) throws IOException {
    final GetMethod get = new GetMethod(uri);
    InputStream is = null;
    PDDocument pdfDoc = null;
    try {
        final int result = httpClient.executeMethod(get);
        if (result != 200) {
            throw new IOException(result + " code returned for URL: " + uri);
        }
        is = get.getResponseBodyAsStream();
        if (is == null) {
            // Fail with a descriptive IOException rather than a NullPointerException in the parser.
            throw new IOException("No response body returned for URL: " + uri);
        }
        pdfDoc = PDDocument.load(is);
        final PDFTextStripper pdfStrip = new PDFTextStripper();
        // NOTE(review): getBytes() uses the platform default charset; confirm that callers
        // decode the result the same way before switching to an explicit charset.
        return pdfStrip.getText(pdfDoc).getBytes();
    } finally {
        // Cleanup is best-effort. Each step is isolated so that a failure in one
        // does not skip the others or mask an exception from the body above.
        try {
            get.releaseConnection();
        } catch (Exception ignored) {
            // nothing useful can be done if the connection cannot be released
        }
        IOUtils.closeQuietly(is);
        if (pdfDoc != null) {
            try {
                pdfDoc.close();
            } catch (Exception ignored) {
                // the text has already been extracted; a failed close is not actionable here
            }
        }
    }
}

From source file:org.nuxeo.ecm.core.convert.plugins.tests.PDFEncodingWarn.java

License:Apache License

/**
 * Loads the nutcracker test PDF and runs the patched text stripper over it twice.
 */
@Test
public void extract() throws IOException {
    URL url = getClass().getResource("/test-docs/nutcracker.pdf");
    PDDocument doc = PDDocument.load(url);
    try {
        PDFTextStripper stripper = new PDF2TextConverter.PatchedPDFTextStripper();
        // NOTE(review): the text is extracted twice on the same stripper; presumably this
        // checks that repeated extraction works -- confirm the intent.
        stripper.getText(doc);
        stripper.getText(doc);
    } finally {
        // Release the document even when extraction throws.
        doc.close();
    }
}

From source file:org.nuxeo.ecm.core.convert.plugins.text.extractors.PDF2TextConverter.java

License:Apache License

/**
 * Extracts the text of the PDF held by {@code blobHolder} into a UTF-8 plain-text blob.
 *
 * <p>If the document's current access permission does not allow content extraction, a holder
 * with an empty blob is returned instead.
 *
 * @param blobHolder holder whose blob is the PDF to convert
 * @param parameters conversion parameters; not read by this method
 * @return a holder for the extracted text, or for an empty blob when extraction is not permitted
 * @throws ConversionException if loading the PDF, extracting the text or writing the
 *             temporary file fails
 */
@Override
public BlobHolder convert(BlobHolder blobHolder, Map<String, Serializable> parameters)
        throws ConversionException {

    PDDocument document = null;
    File f = null;
    try {
        // NOTE(review): the stream returned by getStream() is not closed in this method;
        // confirm whether PDDocument.load or the blob implementation takes care of it.
        document = PDDocument.load(blobHolder.getBlob().getStream());
        // NXP-1556: if document is protected an IOException will be raised
        // Instead of catching the exception based on its message string
        // lets avoid sending messages that will generate this error
        // code taken from PDFTextStripper.writeText source.
        // only care about standard encryption and if it was decrypted with
        // the user password
        AccessPermission permission = document.getCurrentAccessPermission();
        if (!permission.canExtractContent()) {
            return new SimpleCachableBlobHolder(Blobs.createBlob(""));
        }

        PatchedPDFTextStripper textStripper = new PatchedPDFTextStripper();

        // use the position information to heuristically organize the
        // extracted paragraphs. This is also important for
        // right-to-left languages.
        textStripper.setSortByPosition(true);

        String text = textStripper.getText(document);
        // replace non breaking space by regular spaces (why?)
        // text = text.replace("\u00a0", " ");
        f = Framework.createTempFile("pdfboplugin", ".txt");

        // Close the writer before the file is read back. A failed close now surfaces as a
        // ConversionException instead of being logged while a possibly truncated file is used.
        try (FileOutputStream fas = new FileOutputStream(f)) {
            fas.write(text.getBytes("UTF-8"));
        }
        try (FileInputStream is = new FileInputStream(f)) {
            // presumably createBlob copies the stream content, since the temporary file is
            // deleted in the finally block below -- verify against Blobs.createBlob
            Blob blob = Blobs.createBlob(is, "text/plain", "UTF-8");
            return new SimpleCachableBlobHolder(blob);
        }
    } catch (IOException e) {
        throw new ConversionException("Error during text extraction with PDFBox", e);
    } finally {
        if (document != null) {
            try {
                document.close();
            } catch (IOException e) {
                log.error("Error while closing PDFBox document", e);
            }
        }
        if (f != null) {
            f.delete();
        }
    }
}

From source file:org.nuxeo.ecm.platform.convert.tests.BaseConverterTest.java

License:Apache License

/**
 * Extracts the text from a PDF file.
 *
 * @param pdfFile the PDF file to read
 * @return the document content as plain text, with leading and trailing whitespace trimmed
 * @throws IOException if the file cannot be loaded or its text cannot be extracted
 */
public static String readPdfText(File pdfFile) throws IOException {
    PDFTextStripper textStripper = new PDFTextStripper();
    PDDocument document = PDDocument.load(pdfFile);
    try {
        return textStripper.getText(document).trim();
    } finally {
        // Close the document even when text extraction throws.
        document.close();
    }
}

From source file:org.nuxeo.ecm.platform.convert.tests.BaseConverterTest.java

License:Apache License

/**
 * Tells whether the XMP metadata of a PDF declares PDF/A conformance level "A".
 *
 * <p>The check looks at the first {@code pdfaid:conformance} element of the XMP document:
 * <pre>
 * &lt;rdf:Description xmlns:pdfaid="http://www.aiim.org/pdfa/ns/id/" rdf:about=""&gt;
 *   &lt;pdfaid:part&gt;1&lt;/pdfaid:part&gt;
 *   &lt;pdfaid:conformance&gt;A&lt;/pdfaid:conformance&gt;
 * &lt;/rdf:Description&gt;
 * </pre>
 *
 * @param pdfFile the PDF file to inspect
 * @return {@code true} if the first {@code pdfaid:conformance} element has the text "A";
 *         {@code false} if it has another value, or if the document has no metadata or no
 *         such element
 * @throws Exception if the file cannot be loaded or its metadata cannot be parsed
 */
public static boolean isPDFA(File pdfFile) throws Exception {
    PDDocument pddoc = PDDocument.load(pdfFile);
    try {
        // A document without a metadata stream cannot declare PDF/A conformance.
        if (pddoc.getDocumentCatalog().getMetadata() == null) {
            return false;
        }
        XMPMetadata xmp = pddoc.getDocumentCatalog().getMetadata().exportXMPMetadata();
        Document doc = xmp.getXMPDocument();
        NodeList list = doc.getElementsByTagName("pdfaid:conformance");
        // An empty NodeList returns null from item(0), so check the length before reading it.
        return list != null && list.getLength() > 0 && "A".equals(list.item(0).getTextContent());
    } finally {
        pddoc.close();
    }
}

From source file:org.nuxeo.ecm.platform.convert.tests.DocumentUTUtils.java

License:Open Source License

/**
 * Extracts the text from a PDF file./*from   w  ww . j  a va2s. c om*/
 *
 * @return the document content as plain text
 */
public static String readPdfText(File pdfFile) throws IOException {
    PDFTextStripper textStripper = new PDFTextStripper();
    PDDocument document = PDDocument.load(pdfFile);
    String text = textStripper.getText(document);
    document.close();
    return text.trim();
}

From source file:org.nuxeo.ecm.platform.picture.core.test.TestMagickExecutors.java

License:Apache License

/**
 * Converts the test JPEG to a temporary PDF file and checks that PDFBox can load the result.
 */
@Test
public void testConverterToPDF() throws Exception {
    File file = FileUtils.getResourceFileFromContext("images/test.jpg");
    File out = Framework.createTempFile(TMP_FILE_PREFIX, ".document.pdf");
    try {
        ImageConverter.convert(file.getAbsolutePath(), out.getAbsolutePath());

        assertEquals("pdf", FilenameUtils.getExtension(out.getAbsolutePath()));
        PDDocument doc = PDDocument.load(out);
        try {
            assertNotNull(doc);
        } finally {
            // Close the document before the temporary file is deleted.
            if (doc != null) {
                doc.close();
            }
        }
    } finally {
        // Remove the temporary file even when the conversion or an assertion fails.
        out.delete();
    }
}