Example usage for org.apache.pdfbox.pdmodel PDDocument load

List of usage examples for org.apache.pdfbox.pdmodel PDDocument load

Introduction

On this page you can find example usages of org.apache.pdfbox.pdmodel PDDocument load.

Prototype

public static PDDocument load(byte[] input, String password) throws IOException 

Source Link

Document

Parses a PDF.

Usage

From source file:dk.defxws.fedoragsearch.server.TransformerToText.java

License:Open Source License

/**
 * Extracts the text of a PDF document held in memory.
 *
 * @param doc     raw bytes of the PDF document
 * @param pageNum decimal page number to extract; {@code "-1"} leaves the
 *                stripper's page range untouched. A non-numeric value makes
 *                {@link Integer#parseInt(String)} throw a
 *                {@link NumberFormatException}, which is not wrapped.
 * @return the text produced by the text stripper
 * @throws Exception if an {@link IOException} occurs while creating the
 *                   stripper, loading the document or extracting its text;
 *                   the {@code IOException} is attached as the cause
 */
private StringBuffer getTextFromPDF(byte[] doc, String pageNum) throws Exception {
    // the document is always opened with an empty password
    final String password = "";
    PDDocument pdDoc = null;

    try {
        PDFTextStripper textStripper = new PDFTextStripper();

        int requestedPage = Integer.parseInt(pageNum);
        boolean singlePage = requestedPage != -1;
        if (singlePage) {
            // start == end restricts extraction to exactly one page
            textStripper.setStartPage(requestedPage);
            textStripper.setEndPage(requestedPage);
        }

        pdDoc = PDDocument.load(new ByteArrayInputStream(doc), password);
        return new StringBuffer(textStripper.getText(pdDoc));
    } catch (IOException e) {
        throw new Exception("Cannot parse PDF document", e);
    } finally {
        // runs on success and on failure; pdDoc is still null if loading failed
        closePDDocument(pdDoc);
    }
}

From source file:edu.ist.psu.sagnik.research.pdfbox2playground.javatest.ExtractImages.java

License:Apache License

/**
 * Opens a PDF file and runs an {@code ImageGraphicsEngine} over each of its pages.
 *
 * @param pdfFile  path of the PDF file to open
 * @param password password handed to {@code PDDocument.load}
 * @throws IOException if the document cannot be loaded or processed, or if its
 *                     current access permission does not allow content extraction
 */
private void extract(String pdfFile, String password) throws IOException {
    PDDocument pdf = null;
    try {
        pdf = PDDocument.load(new File(pdfFile), password);

        // refuse to continue when the document forbids content extraction
        if (!pdf.getCurrentAccessPermission().canExtractContent()) {
            throw new IOException("You do not have permission to extract images");
        }

        // pages are addressed by zero-based index
        int pageIndex = 0;
        while (pageIndex < pdf.getNumberOfPages()) {
            new ImageGraphicsEngine(pdf.getPage(pageIndex)).run();
            pageIndex++;
        }
    } finally {
        if (pdf != null) {
            pdf.close();
        }
    }
}

From source file:merge_split.MergeSplit.java

License:Apache License

/**
 * Handles the convert button: renders the selected pages of the chosen PDF at
 * 300 DPI and writes each one to an image file.
 *
 * <p>The output name is built from the destination field, a backslash, the
 * name field, {@code -<page number>} and the extension of the selected image
 * format button (PNG when none of the other buttons is selected). Pages are
 * selected through the set returned by {@code findPages}, which is looked up
 * with one-based page numbers.
 *
 * <p>Problems are reported to the user with warning dialogs. A document that
 * cannot be opened aborts the conversion; a page that cannot be rendered or
 * written is skipped and the remaining pages are still processed.
 *
 * @param evt the action event that triggered the conversion (not used)
 */
private void ConvertButtonActionPerformed(java.awt.event.ActionEvent evt) {//GEN-FIRST:event_ConvertButtonActionPerformed
    PDDocument document;
    try {
        document = PDDocument.load(new File((String) ConvertFileField.getText()), convertcode);
    } catch (IOException ex) {
        JOptionPane.showMessageDialog(null, "Problem opening pdf.", "Problem opening pdf",
                JOptionPane.WARNING_MESSAGE);
        // Without a document there is nothing to render; going on would dereference null.
        return;
    }

    try {
        TreeSet<?> selectedPages = findPages((String) ConvertPagesField.getText());

        // Neither the target name nor the format changes between pages, so compute them once.
        // suffix in filename will be used as the file format
        String destination = ConvertDestinationField.getText() + "\\" + ConvertNameField.getText();
        String image = ".png";
        if (pngbutton.isSelected()) {
            image = ".png";
        } else if (bmpbutton.isSelected()) {
            image = ".bmp";
        } else if (gifbutton.isSelected()) {
            image = ".gif";
        } else if (jpgbutton.isSelected()) {
            image = ".jpg";
        }

        PDFRenderer pdfRenderer = new PDFRenderer(document);
        int pageCount = document.getNumberOfPages();
        for (int page = 0; page < pageCount; ++page) {
            int pageNumber = page + 1;
            // Rendering is the expensive step, so skip pages the user did not ask for.
            if (!selectedPages.contains(pageNumber)) {
                continue;
            }

            BufferedImage bim;
            try {
                bim = pdfRenderer.renderImageWithDPI(page, 300, ImageType.RGB);
            } catch (IOException ex) {
                JOptionPane.showMessageDialog(null, "Problem rendering image.", "Problem rendering image",
                        JOptionPane.WARNING_MESSAGE);
                // No image was produced for this page; move on to the next one.
                continue;
            }

            try {
                ImageIOUtil.writeImage(bim, destination + "-" + pageNumber + image, 300);
            } catch (IOException ex) {
                JOptionPane.showMessageDialog(null, "Problem output image.", "Problem output image",
                        JOptionPane.WARNING_MESSAGE);
                java.util.logging.Logger.getLogger(MergeSplit.class.getName()).log(java.util.logging.Level.SEVERE,
                        null, ex);
            }
        }
    } finally {
        // Close the document even when an unexpected runtime exception escapes the loop.
        try {
            document.close();
        } catch (IOException ex) {
            java.util.logging.Logger.getLogger(MergeSplit.class.getName()).log(java.util.logging.Level.SEVERE,
                    null, ex);
        }
    }
}

From source file:model.util.pdf.PDFUtils.java

License:Apache License

/**
 * Renders one page of a PDF to an image, writes the image to disk and returns it.
 *
 * <p>Adapted by Julius Huelsmann from a command-line main method written by
 * Ben Litchfield, which is why the input is still parsed like a command line:
 * option names are compared against the constants {@code PASSWORD},
 * {@code START_PAGE}, {@code END_PAGE}, {@code PAGE}, {@code IMAGE_TYPE},
 * {@code FORMAT}, {@code OUTPUT_PREFIX}, {@code PREFIX}, {@code COLOR},
 * {@code RESOLUTION}, {@code DPI}, {@code CROPBOX} and {@code TIME}; the first
 * argument that is not an option is taken as the path of the PDF file.
 *
 * <p>Only the first page of the requested range is rendered. It is written to
 * {@code <output prefix><page number>.<image format>}, where the output prefix
 * defaults to the PDF path without its extension.
 *
 * @param _parameters command line style arguments; must name the PDF file
 *
 * @return the rendered page, or {@code null} if no PDF file was given or the
 *         requested page range is empty
 *
 * @throws IOException If there is an error parsing the document.
 */
public static BufferedImage pdf2image(final String[] _parameters) throws IOException {

    // suppress the Dock icon on OS X if called from outside
    // the paint - project.
    System.setProperty("apple.awt.UIElement", "true");

    String password = "";
    String pdfFile = null;
    String outputPrefix = null;
    String imageFormat = "jpg";
    int startPage = 1;
    int endPage = Integer.MAX_VALUE;
    String color = "rgb";
    int dpi;
    float cropBoxLowerLeftX = 0;
    float cropBoxLowerLeftY = 0;
    float cropBoxUpperRightX = 0;
    float cropBoxUpperRightY = 0;
    boolean showTime = false;
    try {
        dpi = Toolkit.getDefaultToolkit().getScreenResolution();
    } catch (HeadlessException e) {
        // no display available: fall back to a fixed resolution
        dpi = 96;
    }

    // NOTE(review): the branches below read _parameters[i] right after usage();
    // if usage() can return, a missing option value ends in an
    // ArrayIndexOutOfBoundsException - confirm that usage() never returns.
    for (int i = 0; i < _parameters.length; i++) {
        if (_parameters[i].equals(PASSWORD)) {
            i++;
            if (i >= _parameters.length) {
                usage();
            }
            password = _parameters[i];
        } else if (_parameters[i].equals(START_PAGE)) {
            i++;
            if (i >= _parameters.length) {
                usage();
            }
            startPage = Integer.parseInt(_parameters[i]);
        } else if (_parameters[i].equals(END_PAGE)) {
            i++;
            if (i >= _parameters.length) {
                usage();
            }
            endPage = Integer.parseInt(_parameters[i]);
        } else if (_parameters[i].equals(PAGE)) {
            i++;
            if (i >= _parameters.length) {
                usage();
            }
            startPage = Integer.parseInt(_parameters[i]);
            endPage = Integer.parseInt(_parameters[i]);
        } else if (_parameters[i].equals(IMAGE_TYPE) || _parameters[i].equals(FORMAT)) {
            i++;
            imageFormat = _parameters[i];
        } else if (_parameters[i].equals(OUTPUT_PREFIX) || _parameters[i].equals(PREFIX)) {
            i++;
            outputPrefix = _parameters[i];
        } else if (_parameters[i].equals(COLOR)) {
            i++;
            color = _parameters[i];
        } else if (_parameters[i].equals(RESOLUTION) || _parameters[i].equals(DPI)) {
            i++;
            dpi = Integer.parseInt(_parameters[i]);
        } else if (_parameters[i].equals(CROPBOX)) {
            // the crop box takes four consecutive values
            i++;
            cropBoxLowerLeftX = Float.parseFloat(_parameters[i]);
            i++;
            cropBoxLowerLeftY = Float.parseFloat(_parameters[i]);
            i++;
            cropBoxUpperRightX = Float.parseFloat(_parameters[i]);
            i++;
            cropBoxUpperRightY = Float.parseFloat(_parameters[i]);
        } else if (_parameters[i].equals(TIME)) {
            showTime = true;
        } else {
            // only the first free-standing argument is used as the file name
            if (pdfFile == null) {
                pdfFile = _parameters[i];
            }
        }
    }

    if (pdfFile == null) {
        usage();
        return null;
    }

    if (outputPrefix == null) {
        // Cut off the extension only if the file name has one. A path without
        // a dot, or with a dot only in a directory name, is used as it is.
        int lastSeparator = Math.max(pdfFile.lastIndexOf('/'), pdfFile.lastIndexOf('\\'));
        int lastDot = pdfFile.lastIndexOf('.');
        outputPrefix = lastDot > lastSeparator ? pdfFile.substring(0, lastDot) : pdfFile;
    }

    PDDocument document = null;
    try {
        document = PDDocument.load(new File(pdfFile), password);

        ImageType imageType = null;
        if ("bilevel".equalsIgnoreCase(color)) {
            imageType = ImageType.BINARY;
        } else if ("gray".equalsIgnoreCase(color)) {
            imageType = ImageType.GRAY;
        } else if ("rgb".equalsIgnoreCase(color)) {
            imageType = ImageType.RGB;
        } else if ("rgba".equalsIgnoreCase(color)) {
            imageType = ImageType.ARGB;
        }

        if (imageType == null) {
            System.err.println("Error: Invalid color.");
            System.exit(2);
        }

        //if a CropBox has been specified, update the CropBox:
        //changeCropBoxes(PDDocument document,float a, float b, float c,float d)
        if (cropBoxLowerLeftX != 0 || cropBoxLowerLeftY != 0 || cropBoxUpperRightX != 0
                || cropBoxUpperRightY != 0) {
            changeCropBox(document, cropBoxLowerLeftX, cropBoxLowerLeftY, cropBoxUpperRightX,
                    cropBoxUpperRightY);
        }

        long startTime = System.nanoTime();

        // render the first page of the requested range, if the range is not empty
        endPage = Math.min(endPage, document.getNumberOfPages());
        PDFRenderer renderer = new PDFRenderer(document);
        BufferedImage image = null;
        int firstPageIndex = startPage - 1;
        if (firstPageIndex < endPage) {
            image = renderer.renderImageWithDPI(firstPageIndex, dpi, imageType);
            String fileName = outputPrefix + startPage + "." + imageFormat;
            // The image is still returned when it could not be written, but
            // the failure is reported instead of being dropped silently.
            if (!ImageIOUtil.writeImage(image, fileName, dpi)) {
                System.err.println("Error: no writer found for image format '" + imageFormat + "'");
            }
        }

        // performance stats
        if (showTime) {
            long duration = System.nanoTime() - startTime;
            int count = image == null ? 0 : 1;
            System.err.printf("Rendered %d page%s in %dms\n", count, count == 1 ? "" : "s",
                    duration / 1000000);
        }

        return image;
    } finally {
        if (document != null) {
            document.close();
        }
    }
}

From source file:net.dstserbak.dataindexer.tokenizer.PDFTokenizer.java

/**
 * Splits text from PDF URL to words and returns them as TokensMap object.
 * @param url Input PDF URL/*w w w  .  j a  v  a  2  s  . com*/
 * @return Map that contains tokens, which are belong to PDF document
 * @throws IOException If an I/O error occurs
 */
public static TokensMap tokenizePdf(URL url) throws IOException {
    checkTempFileExistance();
    try (RandomAccessFile scratchFile = new RandomAccessFile(PDF_SCRATCH_FILE, "rw")) {
        try (PDDocument pd = PDDocument.load(url, scratchFile)) {
            return tokenizeInput(pd);
        }
    }
}

From source file:net.homeip.donaldm.doxmentor4j.indexers.PDFIndexer.java

License:Open Source License

/**
 * Extracts the text of a PDF into a temporary file and returns a reader over that file.
 *
 * <p>The document is loaded from the local file returned by
 * {@code Utils.uri2File(uri)} when that is not {@code null}, otherwise
 * directly from the URI's URL. The extracted text is stored as UTF-8 and read
 * back as UTF-8, so it does not depend on the platform default charset. The
 * temporary text file is removed when the JVM exits.
 *
 * @param uri   location of the PDF document
 * @param page  page to extract when {@code >= 0}; any negative value extracts
 *              pages 1 through the last page of the document
 * @param title if not {@code null}, receives the document title, or the path
 *              of {@code uri} when the document has no title
 * @return a reader over the extracted text; the caller has to close it
 * @throws FileNotFoundException if the temporary text file cannot be opened
 * @throws MalformedURLException if {@code uri} cannot be converted to a URL
 * @throws IOException           if loading the document or extracting or
 *                               storing its text fails
 */
@Override
public Reader getText(URI uri, int page, StringBuilder title)
        throws FileNotFoundException, MalformedURLException, IOException
//-----------------------------------------------------------------------------------------
{
    java.io.Writer writer = null;
    PDDocument pdf = null;
    PDFTextStripper stripper = null;
    java.io.File tmpPdf = null;
    try {
        tmpPdf = Utils.uri2File(uri);
        if (tmpPdf != null)
            pdf = PDDocument.load(tmpPdf.getAbsolutePath(), true);
        else
            pdf = PDDocument.load(uri.toURL(), true);
        PDDocumentInformation pdfInfo = pdf.getDocumentInformation();
        String s = pdfInfo.getTitle();
        if ((s == null) || (s.length() == 0))
            s = uri.getPath();
        if (title != null)
            title.append(s);
        stripper = new PDFTextStripper();
        if (page >= 0) {
            stripper.setStartPage(page);
            stripper.setEndPage(page);
        } else {
            stripper.setStartPage(1);
            stripper.setEndPage(pdf.getNumberOfPages());
        }
        java.io.File f = java.io.File.createTempFile("pdf", ".tmp");
        // The returned reader keeps using the file after this method has
        // finished, so the earliest safe point to remove it is JVM exit.
        f.deleteOnExit();
        // Fixed charset for writing and reading: the platform default may not
        // be able to represent every character of the extracted text.
        java.nio.charset.Charset utf8 = java.nio.charset.Charset.forName("UTF-8");
        writer = new java.io.OutputStreamWriter(new java.io.FileOutputStream(f), utf8);
        stripper.writeText(pdf, writer);
        // close() flushes the remaining text; a failure here means the file is
        // incomplete, so it is reported instead of returning a truncated reader.
        writer.close();
        writer = null;
        return new java.io.InputStreamReader(new java.io.FileInputStream(f), utf8);
    } finally {
        // best-effort cleanup: failures must not hide the result or the original exception
        if (stripper != null)
            try {
                stripper.resetEngine();
            } catch (Exception ignored) {
            }
        if (pdf != null)
            try {
                pdf.close();
            } catch (Exception ignored) {
            }
        if (writer != null)
            try {
                writer.close();
            } catch (Exception ignored) {
            }
        if (tmpPdf != null) {
            // Delete the source file only if it lives in the temp directory.
            // Both sides are lower-cased so the comparison still matches when
            // the temp directory name contains upper-case characters.
            String tmpDir = System.getProperty("java.io.tmpdir");
            if ((tmpDir != null) && (tmpPdf.getAbsolutePath().toLowerCase(java.util.Locale.ROOT)
                    .indexOf(tmpDir.toLowerCase(java.util.Locale.ROOT)) >= 0))
                tmpPdf.delete();
        }
    }
}

From source file:net.homeip.donaldm.doxmentor4j.indexers.PDFIndexer.java

License:Open Source License

/**
 * Extracts the text of a PDF and adds it to the index.
 *
 * <p>When the page count is known, one index document is added per page; its
 * {@code page} field holds the page number. Otherwise the whole text is
 * extracted into a temporary UTF-8 file and added as a single document whose
 * {@code page} field is {@code "-1"}. Every document also gets the fields
 * {@code path} (the given {@code href}), {@code title} (the document title, or
 * the path of {@code uri} when there is none) and {@code contents}.
 *
 * @param href        value stored in the {@code path} field
 * @param uri         location of the PDF document
 * @param followLinks not used by this method
 * @param extraParams not used by this method
 * @return {@code 1} on success; {@code -1} if the index writer is
 *         {@code null} or any exception occurred (the exception is logged)
 * @throws IOException declared by the signature; exceptions raised while
 *                     indexing are caught and logged inside the method
 */
@Override
public long index(String href, URI uri, boolean followLinks, Object... extraParams) throws IOException
//-----------------------------------------------------------------------------------------------------
{
    if (m_indexWriter == null) {
        logger.error("PDFIndexer: index writer is null");
        return -1;
    }
    PDDocument pdf = null;
    PDFTextStripper stripper = null;
    Reader reader = null;
    Writer writer = null;
    java.io.File tmpPdf = null;
    java.io.File tmpText = null;
    try {
        tmpPdf = Utils.uri2File(uri);
        if (tmpPdf != null)
            pdf = PDDocument.load(tmpPdf.getAbsolutePath(), true);
        else
            pdf = PDDocument.load(uri.toURL(), true);
        PDDocumentInformation pdfInfo = pdf.getDocumentInformation();
        String title = pdfInfo.getTitle();
        if ((title == null) || (title.isEmpty()))
            title = uri.getPath();
        stripper = new PDFTextStripper();
        int noPages = pdf.getNumberOfPages();
        stripper.setSuppressDuplicateOverlappingText(false);
        if (noPages != PDDocument.UNKNOWN_NUMBER_OF_PAGES) {
            // one index document per page, extracted through an in-memory buffer
            for (int page = 1; page <= noPages; page++) {
                stripper.setStartPage(page);
                stripper.setEndPage(page);
                writer = new StringWriter();
                stripper.writeText(pdf, writer);
                reader = new StringReader(writer.toString());
                Document doc = new Document();
                doc.add(new Field("path", href, Field.Store.YES, Field.Index.NO));
                doc.add(new Field("title", title, Field.Store.YES, Field.Index.ANALYZED));
                doc.add(new Field("contents", reader, Field.TermVector.WITH_POSITIONS_OFFSETS));
                doc.add(new Field("page", Integer.toString(page), Field.Store.YES, Field.Index.NO));
                if (addDocument(doc))
                    AjaxIndexer.incrementCount();
                try {
                    writer.close();
                    writer = null;
                } catch (Exception ignored) {
                }
                try {
                    reader.close();
                    reader = null;
                } catch (Exception ignored) {
                }
                if ((page % 50) == 0) {
                    // request a collection every 50 pages
                    try {
                        System.runFinalization();
                        System.gc();
                    } catch (Exception ignored) {
                    }
                }
            }
        } else {
            // page count unknown: extract the whole text into a temporary file
            tmpText = java.io.File.createTempFile("pdf", ".tmp");
            // Fixed charset for writing and reading: the platform default may
            // not be able to represent every character of the extracted text.
            java.nio.charset.Charset utf8 = java.nio.charset.Charset.forName("UTF-8");
            writer = new java.io.OutputStreamWriter(new java.io.FileOutputStream(tmpText), utf8);
            stripper.writeText(pdf, writer);
            // close() flushes the remaining text; a failure means the file is
            // incomplete and is handled by the catch block below.
            writer.close();
            writer = null;
            reader = new java.io.InputStreamReader(new java.io.FileInputStream(tmpText), utf8);
            Document doc = new Document();
            doc.add(new Field("path", href, Field.Store.YES, Field.Index.NO));
            doc.add(new Field("title", title, Field.Store.YES, Field.Index.ANALYZED));
            doc.add(new Field("contents", reader, Field.TermVector.WITH_POSITIONS_OFFSETS));
            doc.add(new Field("page", "-1", Field.Store.YES, Field.Index.NO));
            if (addDocument(doc))
                AjaxIndexer.incrementCount();
            try {
                reader.close();
                reader = null;
            } catch (Exception ignored) {
            }
            try {
                System.runFinalization();
                System.gc();
            } catch (Exception ignored) {
            }
        }
        return 1;
    } catch (Exception e) {
        logger.error("Error indexing PDF text from " + uri.toString(), e);
        return -1;
    } finally {
        // best-effort cleanup: failures must not replace the return value
        if (stripper != null)
            try {
                stripper.resetEngine();
            } catch (Exception ignored) {
            }
        if (pdf != null)
            try {
                pdf.close();
            } catch (Exception ignored) {
            }
        if (writer != null)
            try {
                writer.close();
            } catch (Exception ignored) {
            }
        if (reader != null)
            try {
                reader.close();
            } catch (Exception ignored) {
            }
        // the extracted text is not needed any more once reader and writer are closed
        if (tmpText != null)
            tmpText.delete();
        if (tmpPdf != null) {
            // Delete the source file only if it lives in the temp directory.
            // Both sides are lower-cased so the comparison still matches when
            // the temp directory name contains upper-case characters.
            String tmpDir = System.getProperty("java.io.tmpdir");
            if ((tmpDir != null) && (tmpPdf.getAbsolutePath().toLowerCase(java.util.Locale.ROOT)
                    .indexOf(tmpDir.toLowerCase(java.util.Locale.ROOT)) >= 0))
                tmpPdf.delete();
        }
    }
}

From source file:net.sourceforge.docfetcher.model.parse.PdfParser.java

License:Open Source License

/**
 * Extracts the text and the title, author, subject and keywords of a PDF stream.
 *
 * <p>At the start of every page the current page number and the page count are
 * passed to the reporter of {@code context}. At the end of every page the
 * cancelable of {@code context} is checked; once it is canceled the end page
 * of the stripper is set to 0.
 *
 * @param in      stream to read the PDF from
 * @param context supplies the progress reporter and the cancelable
 * @return the extracted text together with the document metadata
 * @throws ParseException if the document cannot be read or its text cannot be
 *                        extracted; when the cause of the I/O failure is a
 *                        {@code CryptographyException} the exception carries
 *                        the "password protected" message instead
 */
@Override
protected ParseResult parse(@NotNull InputStream in, @NotNull final ParseContext context)
        throws ParseException {
    PDDocument document = null;
    try {
        /*
         * TODO post-release-1.1: check if 'force' argument in PDDocument/Stripper increases
         * number of parsed PDF files
         */
        document = PDDocument.load(in, true);

        PDDocumentInformation info;
        final int pageCount;
        try {
            info = document.getDocumentInformation();
            pageCount = document.getNumberOfPages();
        } catch (ClassCastException e) {
            // Bug #3529070 and #3528345
            throw new ParseException(e);
        }

        /*
         * If the PDF file is encrypted, the PDF stripper will automatically
         * try an empty password.
         *
         * In contrast to the paging PDF parser that is used for the
         * preview, we do not need to call setSortByPosition(true) here
         * because the extracted text will be digested by Lucene anyway.
         */
        PDFTextStripper textStripper = new PDFTextStripper() {
            protected void startPage(PDPage page) throws IOException {
                // progress report: current page out of the total
                context.getReporter().subInfo(getCurrentPageNo(), pageCount);
            }

            protected void endPage(PDPage page) throws IOException {
                if (context.getCancelable().isCanceled())
                    setEndPage(0);
            }
        };
        textStripper.setForceParsing(true);

        StringWriter textOut = new StringWriter();
        try {
            textStripper.writeText(document, textOut);
        } catch (RuntimeException e) {
            /*
             * PDFTextStripper.writeText can throw various
             * RuntimeExceptions, see bugs #3446010, #3448272, #3444887.
             */
            throw new ParseException(e);
        }

        return new ParseResult(textOut.getBuffer())
                .setTitle(info.getTitle())
                .addAuthor(info.getAuthor())
                .addMiscMetadata(info.getSubject())
                .addMiscMetadata(info.getKeywords());
    } catch (IOException e) {
        // a cryptography failure underneath the I/O error gets its own message
        throw (e.getCause() instanceof CryptographyException)
                ? new ParseException(Msg.doc_pw_protected.get())
                : new ParseException(e);
    } finally {
        close(document);
    }
}

From source file:net.sourceforge.vaticanfetcher.model.parse.PdfParser.java

License:Open Source License

/**
 * Parses a PDF stream into its text plus title, author, subject and keywords.
 *
 * <p>While the text is extracted, the reporter of {@code context} receives the
 * current page number and the page count at the start of each page, and the
 * cancelable of {@code context} is polled at the end of each page; once it is
 * canceled the end page of the stripper is set to 0.
 *
 * @param in      stream to read the PDF from
 * @param context supplies the progress reporter and the cancelable
 * @return the extracted text together with the document metadata
 * @throws ParseException if loading the document or extracting its text fails;
 *                        an I/O failure caused by a {@code CryptographyException}
 *                        is reported with the "password protected" message
 */
@Override
protected ParseResult parse(@NotNull InputStream in, @NotNull final ParseContext context)
        throws ParseException {
    PDDocument pdf = null;
    try {
        /* TODO post-release-1.1: check if 'force' argument in PDDocument/Stripper increases number of parsed PDF files */
        pdf = PDDocument.load(in, true);

        PDDocumentInformation metadata;
        final int totalPages;
        try {
            metadata = pdf.getDocumentInformation();
            totalPages = pdf.getNumberOfPages();
        } catch (ClassCastException e) {
            // Bug #3529070 and #3528345
            throw new ParseException(e);
        }

        /*
         * If the PDF file is encrypted, the PDF stripper will automatically try an empty password.
         *
         * In contrast to the paging PDF parser that is used for the preview, we do not need to call
         * setSortByPosition(true) here because the extracted text will be digested by Lucene anyway.
         */
        PDFTextStripper pageStripper = new PDFTextStripper() {
            protected void startPage(PDPage page) throws IOException {
                context.getReporter().subInfo(getCurrentPageNo(), totalPages);
            }

            protected void endPage(PDPage page) throws IOException {
                boolean canceled = context.getCancelable().isCanceled();
                if (canceled)
                    setEndPage(0);
            }
        };
        pageStripper.setForceParsing(true);

        StringWriter extracted = new StringWriter();
        try {
            pageStripper.writeText(pdf, extracted);
        } catch (RuntimeException e) {
            /* PDFTextStripper.writeText can throw various RuntimeExceptions, see bugs #3446010, #3448272, #3444887. */
            throw new ParseException(e);
        }

        return new ParseResult(extracted.getBuffer())
                .setTitle(metadata.getTitle())
                .addAuthor(metadata.getAuthor())
                .addMiscMetadata(metadata.getSubject())
                .addMiscMetadata(metadata.getKeywords());
    } catch (IOException e) {
        if (!(e.getCause() instanceof CryptographyException)) {
            throw new ParseException(e);
        }
        // the document could not be decrypted
        throw new ParseException(Msg.doc_pw_protected.get());
    } finally {
        close(pdf);
    }
}

From source file:org.apache.tika.parser.pdf.EnhancedPDFParser.java

License:Apache License

/**
 * Parses a PDF stream: records its metadata and, if a handler is given, emits its content.
 *
 * <p>The document is loaded with the password returned by
 * {@code getPassword(metadata, context)} when the configuration asks for the
 * non-sequential parser, and with the {@code force} flag otherwise. If
 * {@code extractXFAText} returns XFA text, that text is passed to a
 * {@code TXTParser}; otherwise the document is processed by
 * {@code EnhancedPDF2XHTML}.
 *
 * @param stream   stream to read the PDF from; it is wrapped in a
 *                 {@code CloseShieldInputStream} before it is handed to PDFBox
 * @param handler  receives the extracted content; when {@code null} only the
 *                 metadata is filled in
 * @param metadata receives {@code pdf:encrypted}, the content type and the
 *                 values set by {@code extractMetadata}
 * @param context  parse context; may carry a {@code PDFParserConfig}, else the
 *                 default configuration is used
 * @throws IOException   if the document cannot be read
 * @throws SAXException  if the handler fails to process the content
 * @throws TikaException if parsing fails
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {

    //config from context, or default if not set via context
    PDFParserConfig localConfig = context.get(PDFParserConfig.class, defaultConfig);
    PDDocument pdfDocument = null;
    try {
        String password = getPassword(metadata, context);
        // File-backed and purely in-memory streams are loaded in exactly the
        // same way, so the only choice left is which parser entry point to use.
        if (localConfig.getUseNonSequentialParser()) {
            pdfDocument = PDDocument.load(new CloseShieldInputStream(stream), password);
        } else {
            pdfDocument = PDDocument.load(new CloseShieldInputStream(stream), true);
        }
        metadata.set("pdf:encrypted", Boolean.toString(pdfDocument.isEncrypted()));

        pdfDocument.setAllSecurityToBeRemoved(true);

        metadata.set(Metadata.CONTENT_TYPE, "application/pdf");
        extractMetadata(pdfDocument, metadata);
        if (handler != null) {
            String xfaXml = extractXFAText(pdfDocument);
            if (xfaXml != null) {
                // Encode with a fixed charset so the bytes given to the text
                // parser do not depend on the platform default encoding.
                try (BufferedInputStream is = new BufferedInputStream(
                        new ByteArrayInputStream(xfaXml.getBytes(java.nio.charset.StandardCharsets.UTF_8)))) {
                    new TXTParser().parse(is, handler, metadata, context);
                }
                // set again after the text parser has run - presumably because
                // TXTParser replaces the content type; confirm
                metadata.set(Metadata.CONTENT_TYPE, "application/pdf");
            } else {
                EnhancedPDF2XHTML.process(pdfDocument, handler, context, metadata, localConfig);
            }
        }
    } finally {
        if (pdfDocument != null) {
            pdfDocument.close();
        }
    }
}