Example usage for org.apache.pdfbox.pdmodel PDDocument load

Introduction

On this page you can find example usages of org.apache.pdfbox.pdmodel.PDDocument.load.

Prototype

public static PDDocument load(byte[] input) throws IOException

Source Link

Document

Parses a PDF.

Usage

From source file:nominas.sei.NominasSEI.java

/**
 * Reads {@code .\NOMINAS.pdf}, extracts a name from the text of every page, sorts the pages
 * using the natural ordering of {@code PaginaNomina} and writes them, in that order, to
 * {@code NominasOrdenadas.pdf}. Each name is also printed to standard output.
 *
 * <p>Any failure aborts the run and prints the exception message; both documents are closed
 * in every case.
 *
 * @param args the command line arguments (not used)
 */
public static void main(String[] args) {
    // Path of the PDF to read.
    String ruta = ".\\NOMINAS.pdf";

    List<PaginaNomina> paginasNomina = new ArrayList<PaginaNomina>();
    PDDocument pd = null;
    PDDocument document = null;

    try {
        pd = PDDocument.load(ruta);
        List<?> pages = pd.getDocumentCatalog().getAllPages();

        for (Object pageObject : pages) {
            PDPage page = (PDPage) pageObject;

            // The format of the first page is used for every page.
            PageFormat pageFormat = pd.getPageFormat(0);
            // NOTE(review): the region width is derived from the page HEIGHT, exactly as the
            // original code did; kept unchanged so the extracted text stays the same - confirm
            // whether getWidth() was intended.
            int regionWidth = (int) pageFormat.getHeight();
            int regionHeight = 1024;

            // Extract the text found inside the rectangular region of this page.
            PDFTextStripperByArea stripper = new PDFTextStripperByArea();
            stripper.addRegion("area1", new Rectangle(0, 0, regionWidth, regionHeight));
            stripper.extractRegions(page);

            String contenido = stripper.getTextForRegion("area1");
            String[] lines = contenido.split("[\\r\\n]+");
            // The name sits on the second line, after a 28-character prefix and before a
            // 10-character suffix. A page that does not match this layout throws here and
            // aborts the whole run via the catch below.
            String nombre = lines[1].substring(28, lines[1].length() - 10);
            paginasNomina.add(new PaginaNomina(page, nombre));
        }

        Collections.sort(paginasNomina);

        // Build the output document from the sorted pages.
        document = new PDDocument();
        for (PaginaNomina nomina : paginasNomina) {
            System.out.println(nomina.getNombre());
            document.addPage(nomina.getPagina());
        }
        document.save("NominasOrdenadas.pdf");
    } catch (Exception e) {
        System.out.println(e.getMessage());
    } finally {
        // Close the output document first, then the source, even when processing failed
        // part-way; a failing close must not prevent the other document from being closed.
        for (PDDocument doc : new PDDocument[] { document, pd }) {
            if (doc != null) {
                try {
                    doc.close();
                } catch (Exception e) {
                    System.out.println(e.getMessage());
                }
            }
        }
    }
}

From source file:noprint.NoPrint.java

/**
 * Loads {@code input.pdf} and, unless it is already encrypted, protects it with a policy that
 * grants every permission except printing and saves the result as {@code output.pdf}.
 *
 * @param args the command line arguments (not used)
 * @throws IOException if the input file can't be read or the output can't be written
 * @throws org.apache.pdfbox.pdmodel.encryption.BadSecurityHandlerException
 * @throws org.apache.pdfbox.exceptions.COSVisitorException
 */
public static void main(String[] args) throws IOException, BadSecurityHandlerException, COSVisitorException {
    String infile = "input.pdf";
    String outfile = "output.pdf";

    String ownerPass = "";
    String userPass = "";
    /**
     * TODO: read up what the actual difference is between
     * userpassword and ownerpassword.
     */
    int keylength = 40;

    AccessPermission ap = new AccessPermission();
    ap.setCanAssembleDocument(true);
    ap.setCanExtractContent(true);
    ap.setCanExtractForAccessibility(true);
    ap.setCanFillInForm(true);
    ap.setCanModify(true);
    ap.setCanModifyAnnotations(true);
    ap.setCanPrintDegraded(true);

    ap.setCanPrint(false);
    // YOU CAN'T PRINT
    // at least not when your PDFreader adheres to DRM (some don't)
    // also this is trivial to remove

    PDDocument document = PDDocument.load(infile);
    try {
        // An already encrypted document is left untouched and nothing is written.
        if (!document.isEncrypted()) {
            StandardProtectionPolicy spp = new StandardProtectionPolicy(ownerPass, userPass, ap);
            spp.setEncryptionKeyLength(keylength);
            document.protect(spp);
            document.save(outfile);
        }
    } finally {
        // Close the document even when protecting or saving fails.
        document.close();
    }
}

From source file:nz.co.testamation.core.reader.pdf.PdfContentReaderImpl.java

License:Apache License

/**
 * Parses the body of the given HTTP response as a PDF and returns its text with every run of
 * whitespace collapsed to a single space. The parsed document is closed before returning.
 *
 * @param response HTTP response whose entity content is the PDF
 * @return the whitespace-normalised text of the PDF
 * @throws IOException if the content cannot be read or parsed
 */
private String getPdfText(CloseableHttpResponse response) throws IOException {
    final PDDocument document = PDDocument.load(response.getEntity().getContent());
    try {
        final String rawText = new PDFTextStripper().getText(document);
        return rawText.replaceAll("\\s+", " ");
    } finally {
        document.close();
    }
}

From source file:org.ala.harvester.ExtractPubfSciNamesAndImages.java

License:Apache License

/**
 * Loads the PDF named by the single command line argument, decrypts it with an empty password
 * when it is encrypted, and hands it to {@code extractSciNameAndImages}. With any other
 * number of arguments only {@code usage()} is called.
 *
 * @param args The command line arguments; exactly one is expected, the PDF to load.
 *
 * @throws Exception If there is an error parsing the document.
 */
public static void main(String[] args) throws Exception {
    if (args.length != 1) {
        usage();
        return;
    }

    PDDocument document = PDDocument.load(args[0]);
    try {
        if (document.isEncrypted()) {
            try {
                document.decrypt("");
            } catch (InvalidPasswordException e) {
                // The empty password was rejected, so the document cannot be read.
                System.err.println("Error: Document is encrypted with a password.");
                System.exit(1);
            }
        }

        extractSciNameAndImages(document);
    } finally {
        document.close();
    }
}

From source file:org.alfresco.extension.pdftoolkit.repo.action.executer.PDFAppendActionExecuter.java

License:Apache License

/**
 * Appends the PDF read from {@code reader} to the PDF read from {@code targetContentReader},
 * saves the result into a temporary directory and stores every file found there as a new node
 * in the destination folder.
 *
 * @param ruleAction the action being executed; supplies {@code PARAM_DESTINATION_FOLDER}
 * @param actionedUponNodeRef node whose id names the temporary directory and which is passed
 *        on to {@code createDestinationNode}
 * @param targetNodeRef not used by this method
 * @param reader reader of the PDF that is appended; its encoding is reused for the output
 * @param targetContentReader reader of the PDF that is appended to
 * @param options action options; {@code PARAM_DESTINATION_NAME} gives the output file name
 * @throws AlfrescoRuntimeException if reading, merging, saving or closing fails
 */
protected final void action(Action ruleAction, NodeRef actionedUponNodeRef, NodeRef targetNodeRef,
        ContentReader reader, ContentReader targetContentReader, Map<String, Object> options) {
    PDDocument pdf = null;
    PDDocument pdfTarget = null;
    InputStream is = null;
    InputStream tis = null;
    File tempDir = null;

    try {
        is = reader.getContentInputStream();
        tis = targetContentReader.getContentInputStream();
        // stream the documents in
        pdf = PDDocument.load(is);
        pdfTarget = PDDocument.load(tis);

        // Append the PDFs: pdfTarget is the destination, pdf the source
        PDFMergerUtility merger = new PDFMergerUtility();
        merger.appendDocument(pdfTarget, pdf);

        String fileName = options.get(PARAM_DESTINATION_NAME).toString();
        // NOTE(review): no sources are added to the merger before this call, so it is unclear
        // whether mergeDocuments() contributes anything here - kept as in the original.
        merger.setDestinationFileName(fileName);
        merger.mergeDocuments();

        // build a temp dir name based on the ID of the noderef we are
        // importing
        File alfTempDir = TempFileProvider.getTempDir();
        tempDir = new File(alfTempDir.getPath() + File.separatorChar + actionedUponNodeRef.getId());
        tempDir.mkdir();

        pdfTarget.save(tempDir.getPath() + File.separatorChar + fileName + FILE_EXTENSION);

        // listFiles() returns null when the directory cannot be listed
        File[] tempFiles = tempDir.listFiles();
        if (tempFiles != null) {
            for (File file : tempFiles) {
                try {
                    if (file.isFile()) {
                        // Get a writer and prep it for putting it back into the
                        // repo
                        NodeRef destinationNode = createDestinationNode(file.getName(),
                                (NodeRef) ruleAction.getParameterValue(PARAM_DESTINATION_FOLDER),
                                actionedUponNodeRef);
                        ContentWriter writer = serviceRegistry.getContentService().getWriter(destinationNode,
                                ContentModel.PROP_CONTENT, true);

                        writer.setEncoding(reader.getEncoding()); // original encoding
                        writer.setMimetype(FILE_MIMETYPE);

                        // Put it in the repo
                        writer.putContent(file);

                        // Clean up
                        file.delete();
                    }
                } catch (FileExistsException e) {
                    throw new AlfrescoRuntimeException("Failed to process file.", e);
                }
            }
        }
    } catch (COSVisitorException e) {
        throw new AlfrescoRuntimeException(e.getMessage(), e);
    } catch (IOException e) {
        throw new AlfrescoRuntimeException(e.getMessage(), e);
    } finally {
        // Release every resource even when one close fails; the first failure is reported
        // only after all cleanup has been attempted.
        IOException closeFailure = null;

        for (PDDocument doc : new PDDocument[] { pdf, pdfTarget }) {
            if (doc != null) {
                try {
                    doc.close();
                } catch (IOException e) {
                    if (closeFailure == null) {
                        closeFailure = e;
                    }
                }
            }
        }
        for (InputStream stream : new InputStream[] { is, tis }) {
            if (stream != null) {
                try {
                    stream.close();
                } catch (IOException e) {
                    if (closeFailure == null) {
                        closeFailure = e;
                    }
                }
            }
        }

        if (tempDir != null) {
            tempDir.delete();
        }

        if (closeFailure != null) {
            throw new AlfrescoRuntimeException(closeFailure.getMessage(), closeFailure);
        }
    }
}

From source file:org.alfresco.extension.pdftoolkit.repo.action.executer.PDFInsertAtPageActionExecuter.java

License:Apache License

/**
 * Inserts the PDF read from {@code insertReader} into the PDF read from {@code reader}, saves
 * the result into a temporary directory and stores every file found there as a new node in
 * the destination folder.
 *
 * <p>The source document is split into chunks of {@code PARAM_INSERT_AT_PAGE - 1} pages. The
 * inserted content is appended to the first chunk and all remaining chunks are appended after
 * it, so no pages of the source document are dropped.
 *
 * @param ruleAction the action being executed; supplies {@code PARAM_DESTINATION_FOLDER}
 * @param actionedUponNodeRef node whose id names the temporary directory and which is passed
 *        on to {@code createDestinationNode}
 * @param reader reader of the PDF that receives the content; its encoding is reused for the
 *        output
 * @param insertReader reader of the PDF whose pages are inserted
 * @param options action options; {@code PARAM_INSERT_AT_PAGE} (a numeric string) and
 *        {@code PARAM_DESTINATION_NAME} are read
 * @throws AlfrescoRuntimeException if reading, merging, saving or closing fails
 */
protected final void action(Action ruleAction, NodeRef actionedUponNodeRef, ContentReader reader,
        ContentReader insertReader, Map<String, Object> options) {
    PDDocument pdf = null;
    PDDocument insertContentPDF = null;
    List<PDDocument> pdfs = null;
    InputStream is = null;
    InputStream cis = null;
    File tempDir = null;

    try {

        int insertAt = Integer.valueOf((String) options.get(PARAM_INSERT_AT_PAGE)).intValue();

        // Get contentReader inputStream
        is = reader.getContentInputStream();
        // Get insertContentReader inputStream
        cis = insertReader.getContentInputStream();
        // stream the target document in
        pdf = PDDocument.load(is);
        // stream the insert content document in
        insertContentPDF = PDDocument.load(cis);

        // split the PDF and put the pages in a list
        Splitter splitter = new Splitter();
        // Need to adjust the input value to get the split at the right page
        // NOTE(review): insertAt = 1 gives a split size of 0 - confirm how Splitter handles it.
        splitter.setSplitAtPage(insertAt - 1);

        // Split the pages; the splitter may produce more than two chunks
        pdfs = splitter.split(pdf);

        // Build the output PDF: first chunk, inserted content, then every remaining chunk
        PDDocument completePDF = pdfs.get(0);
        PDFMergerUtility merger = new PDFMergerUtility();
        merger.appendDocument(completePDF, insertContentPDF);
        for (PDDocument remainder : pdfs.subList(1, pdfs.size())) {
            merger.appendDocument(completePDF, remainder);
        }

        String fileName = options.get(PARAM_DESTINATION_NAME).toString();
        // NOTE(review): no sources are added to the merger before this call, so it is unclear
        // whether mergeDocuments() contributes anything here - kept as in the original.
        merger.setDestinationFileName(fileName);
        merger.mergeDocuments();

        // build a temp dir, name based on the ID of the noderef we are
        // importing
        File alfTempDir = TempFileProvider.getTempDir();
        tempDir = new File(alfTempDir.getPath() + File.separatorChar + actionedUponNodeRef.getId());
        tempDir.mkdir();

        completePDF.save(tempDir.getPath() + File.separatorChar + fileName + FILE_EXTENSION);

        // listFiles() returns null when the directory cannot be listed
        File[] tempFiles = tempDir.listFiles();
        if (tempFiles != null) {
            for (File file : tempFiles) {
                try {
                    if (file.isFile()) {

                        // Get a writer and prep it for putting it back into the
                        // repo
                        NodeRef destinationNode = createDestinationNode(file.getName(),
                                (NodeRef) ruleAction.getParameterValue(PARAM_DESTINATION_FOLDER),
                                actionedUponNodeRef);
                        ContentWriter writer = serviceRegistry.getContentService().getWriter(destinationNode,
                                ContentModel.PROP_CONTENT, true);

                        writer.setEncoding(reader.getEncoding()); // original encoding
                        writer.setMimetype(FILE_MIMETYPE);

                        // Put it in the repo
                        writer.putContent(file);

                        // Clean up
                        file.delete();
                    }
                } catch (FileExistsException e) {
                    throw new AlfrescoRuntimeException("Failed to process file.", e);
                }
            }
        }
    }
    // TODO add better handling
    catch (COSVisitorException e) {
        throw new AlfrescoRuntimeException(e.getMessage(), e);
    } catch (IOException e) {
        throw new AlfrescoRuntimeException(e.getMessage(), e);
    } finally {
        // Release every resource even when one close fails; the first failure is reported
        // only after all cleanup has been attempted.
        IOException closeFailure = null;

        // The split chunks include the merged output document.
        if (pdfs != null) {
            for (PDDocument part : pdfs) {
                try {
                    part.close();
                } catch (IOException e) {
                    if (closeFailure == null) {
                        closeFailure = e;
                    }
                }
            }
        }
        for (PDDocument doc : new PDDocument[] { insertContentPDF, pdf }) {
            if (doc != null) {
                try {
                    doc.close();
                } catch (IOException e) {
                    if (closeFailure == null) {
                        closeFailure = e;
                    }
                }
            }
        }
        for (InputStream stream : new InputStream[] { cis, is }) {
            if (stream != null) {
                try {
                    stream.close();
                } catch (IOException e) {
                    if (closeFailure == null) {
                        closeFailure = e;
                    }
                }
            }
        }

        if (tempDir != null) {
            tempDir.delete();
        }

        if (closeFailure != null) {
            throw new AlfrescoRuntimeException(closeFailure.getMessage(), closeFailure);
        }
    }
}

From source file:org.alfresco.repo.content.transform.TextToPdfContentTransformerTest.java

License:Open Source License

/**
 * Transforms the given text to PDF, reads the PDF text back and asserts that the cleaned
 * round-tripped text equals {@code checkText}.
 *
 * @param text the text to transform
 * @param encoding name of the charset used to build the content reader
 * @param checkText the text expected after the round trip
 * @throws IOException if the PDF cannot be read back
 */
private void transformTextAndCheck(String text, String encoding, String checkText) throws IOException {
    // Get a reader for the text
    ContentReader reader = buildContentReader(text, Charset.forName(encoding));

    // And a temp writer
    File out = TempFileProvider.createTempFile("AlfrescoTest_", ".pdf");
    ContentWriter writer = new FileContentWriter(out);
    writer.setMimetype("application/pdf");

    // Transform to PDF
    transformer.transform(reader, writer);

    // Read back in the PDF and check it
    StringWriter textWriter = new StringWriter();
    PDDocument doc = PDDocument.load(out);
    try {
        PDFTextStripper textStripper = new PDFTextStripper();
        textStripper.writeText(doc, textWriter);
    } finally {
        // Close the document even when text extraction fails.
        doc.close();
    }

    String roundTrip = clean(textWriter.toString());

    assertEquals("Incorrect text in PDF when starting from text in " + encoding, checkText, roundTrip);
}

From source file:org.apache.camel.component.fop.FopComponentTest.java

License:Apache License

/**
 * Sends an XML data file through the route and checks that the generated PDF contains text
 * from both the XSL template and the XML data, and that the "foo" header is preserved.
 */
@Test
public void createPdfUsingXmlDataAndXsltTransformation() throws Exception {
    resultEndpoint.expectedMessageCount(1);
    FileInputStream inputStream = new FileInputStream("src/test/data/xml/data.xml");
    try {
        template.sendBody(inputStream);
        resultEndpoint.assertIsSatisfied();

        PDDocument document = PDDocument.load("target/data/result.pdf");
        try {
            String pdfText = FopHelper.extractTextFrom(document);
            assertTrue(pdfText.contains("Project")); //from xsl template
            assertTrue(pdfText.contains("John Doe")); //from data xml
        } finally {
            // Close the parsed PDF even when an assertion fails.
            document.close();
        }

        // assert on the header "foo" being populated
        Exchange exchange = resultEndpoint.getReceivedExchanges().get(0);
        assertEquals("Header value is lost!", "bar", exchange.getIn().getHeader("foo"));
    } finally {
        // Close the input file once the test is finished with it.
        inputStream.close();
    }
}

From source file:org.apache.camel.component.fop.FopEndpointTest.java

License:Apache License

/**
 * Parses the out-message body of the exchange, read as an {@link InputStream}, into a PDF
 * document.
 *
 * @param exchange exchange whose out-message body holds the PDF
 * @return the parsed document
 * @throws IOException if the body cannot be parsed
 */
private PDDocument getDocumentFrom(Exchange exchange) throws IOException {
    return PDDocument.load(exchange.getOut().getBody(InputStream.class));
}

From source file:org.apache.camel.component.pdf.PdfAppendTest.java

License:Apache License

/**
 * Sends a one-page PDF containing "Test" together with the text "Append" to the route and
 * checks that the resulting PDF has two pages and contains both texts.
 */
@Test
public void testAppend() throws Exception {
    final String originalText = "Test";
    final String textToAppend = "Append";
    PDDocument document = new PDDocument();
    PDPage page = new PDPage(PDPage.PAGE_SIZE_A4);
    document.addPage(page);
    PDPageContentStream contentStream = new PDPageContentStream(document, page);
    contentStream.setFont(PDType1Font.HELVETICA, 12);
    contentStream.beginText();
    contentStream.moveTextPositionByAmount(20, 400);
    contentStream.drawString(originalText);
    contentStream.endText();
    contentStream.close();

    template.sendBodyAndHeader("direct:start", textToAppend, PdfHeaderConstants.PDF_DOCUMENT_HEADER_NAME,
            document);

    resultEndpoint.setExpectedMessageCount(1);
    resultEndpoint.expectedMessagesMatches(new Predicate() {
        @Override
        public boolean matches(Exchange exchange) {
            Object body = exchange.getIn().getBody();
            assertThat(body, instanceOf(ByteArrayOutputStream.class));
            try {
                PDDocument doc = PDDocument
                        .load(new ByteArrayInputStream(((ByteArrayOutputStream) body).toByteArray()));
                try {
                    PDFTextStripper pdfTextStripper = new PDFTextStripper();
                    String text = pdfTextStripper.getText(doc);
                    assertEquals(2, doc.getNumberOfPages());
                    assertThat(text, containsString(originalText));
                    assertThat(text, containsString(textToAppend));
                } finally {
                    // Close the re-parsed result even when an assertion fails.
                    doc.close();
                }
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
            return true;
        }
    });
    resultEndpoint.assertIsSatisfied();

}