Example usage for org.apache.pdfbox.pdmodel PDDocument load

Introduction

In this page you can find the example usage for org.apache.pdfbox.pdmodel PDDocument load.

Prototype

public static PDDocument load(byte[] input, String password) throws IOException

Source Link

Document

Parses a PDF.

Usage

From source file:dk.defxws.fedoragsearch.server.TransformerToText.java

License:Open Source License

/**
 * //  w w w .j a  va 2 s. com
 *
 * @throws Exception.
 */
private StringBuffer getTextFromPDF(byte[] doc, String pageNum) throws Exception {
    StringBuffer docText = new StringBuffer();
    PDDocument pdDoc = null;
    String password = "";

    // extract PDF document's textual content
    try {
        PDFTextStripper stripper = new PDFTextStripper(/*"UTF-8"*/);
        int page = Integer.parseInt(pageNum);
        if (page != -1) {
            stripper.setStartPage(page);
            stripper.setEndPage(page);
        }
        //password
        pdDoc = PDDocument.load(new ByteArrayInputStream(doc), password); // new PDDocument(cosDoc);
        docText = new StringBuffer(stripper.getText(pdDoc));
    } catch (IOException e) {
        throw new Exception("Cannot parse PDF document", e);
    } finally {
        closePDDocument(pdDoc);
    }
    return docText;
}

From source file:edu.ist.psu.sagnik.research.pdfbox2playground.javatest.ExtractImages.java

License:Apache License

private void extract(String pdfFile, String password) throws IOException {
    PDDocument document = null;//from w w  w  .ja v a 2  s .co m
    try {
        document = PDDocument.load(new File(pdfFile), password);
        AccessPermission ap = document.getCurrentAccessPermission();
        if (!ap.canExtractContent()) {
            throw new IOException("You do not have permission to extract images");
        }

        for (int i = 0; i < document.getNumberOfPages(); i++) // todo: ITERATOR would be much better
        {
            PDPage page = document.getPage(i);
            ImageGraphicsEngine extractor = new ImageGraphicsEngine(page);
            extractor.run();
        }
    } finally {
        if (document != null) {
            document.close();
        }
    }
}

From source file:merge_split.MergeSplit.java

License:Apache License

private void ConvertButtonActionPerformed(java.awt.event.ActionEvent evt) {//GEN-FIRST:event_ConvertButtonActionPerformed
    PDDocument document = null;//from  w  ww.  ja va2s.  com
    try {
        document = PDDocument.load(new File((String) ConvertFileField.getText()), convertcode);
    } catch (IOException ex) {
        JOptionPane.showMessageDialog(null, "Problem opening pdf.", "Problem opening pdf",
                JOptionPane.WARNING_MESSAGE);
    }
    TreeSet tree = findPages((String) ConvertPagesField.getText());

    PDFRenderer pdfRenderer = new PDFRenderer(document);
    for (int page = 0; page < document.getNumberOfPages(); ++page) {
        BufferedImage bim = null;
        try {
            bim = pdfRenderer.renderImageWithDPI(page, 300, ImageType.RGB);
        } catch (IOException ex) {
            JOptionPane.showMessageDialog(null, "Problem rendering image.", "Problem rendering image",
                    JOptionPane.WARNING_MESSAGE);

        }

        // suffix in filename will be used as the file format
        String destination = ConvertDestinationField.getText() + "\\" + ConvertNameField.getText();
        String image = ".png";
        if (pngbutton.isSelected()) {
            image = ".png";
        } else if (bmpbutton.isSelected()) {
            image = ".bmp";
        } else if (gifbutton.isSelected()) {
            image = ".gif";
        } else if (jpgbutton.isSelected()) {
            image = ".jpg";
        }
        try {
            if (tree.contains(page + 1)) {

                ImageIOUtil.writeImage(bim, destination + "-" + (page + 1) + image, 300);
            }
        } catch (IOException ex) {
            JOptionPane.showMessageDialog(null, "Problem output image.", "Problem output image",
                    JOptionPane.WARNING_MESSAGE);
            java.util.logging.Logger.getLogger(MergeSplit.class.getName()).log(java.util.logging.Level.SEVERE,
                    null, ex);

        }
    }
    try {
        document.close();
    } catch (IOException ex) {
        Logger.getLogger(MergeSplit.class.getName()).log(Level.SEVERE, null, ex);

    }
}

From source file:model.util.pdf.PDFUtils.java

License:Apache License

/**
 * Infamous main method.// www.ja  v  a 2 s  . c  om
 * Adapted by Julius Huelsmann.
 * 
 * @author Ben Litchfield
 *
 * @param args Command line arguments, should be one and a reference to a file.
 *
 * @throws IOException If there is an error parsing the document.
 */
public static BufferedImage pdf2image(final String[] _parameters) throws IOException {

    // suppress the Dock icon on OS X if called from outside
    // the paint - project.
    System.setProperty("apple.awt.UIElement", "true");

    String password = "";
    String pdfFile = null;
    String outputPrefix = null;
    String imageFormat = "jpg";
    int startPage = 1;
    int endPage = Integer.MAX_VALUE;
    String color = "rgb";
    int dpi;
    float cropBoxLowerLeftX = 0;
    float cropBoxLowerLeftY = 0;
    float cropBoxUpperRightX = 0;
    float cropBoxUpperRightY = 0;
    boolean showTime = false;
    try {
        dpi = Toolkit.getDefaultToolkit().getScreenResolution();
    } catch (HeadlessException e) {
        dpi = 96;
    }
    for (int i = 0; i < _parameters.length; i++) {
        if (_parameters[i].equals(PASSWORD)) {
            i++;
            if (i >= _parameters.length) {
                usage();
            }
            password = _parameters[i];
        } else if (_parameters[i].equals(START_PAGE)) {
            i++;
            if (i >= _parameters.length) {
                usage();
            }
            startPage = Integer.parseInt(_parameters[i]);
        } else if (_parameters[i].equals(END_PAGE)) {
            i++;
            if (i >= _parameters.length) {
                usage();
            }
            endPage = Integer.parseInt(_parameters[i]);
        } else if (_parameters[i].equals(PAGE)) {
            i++;
            if (i >= _parameters.length) {
                usage();
            }
            startPage = Integer.parseInt(_parameters[i]);
            endPage = Integer.parseInt(_parameters[i]);
        } else if (_parameters[i].equals(IMAGE_TYPE) || _parameters[i].equals(FORMAT)) {
            i++;
            imageFormat = _parameters[i];
        } else if (_parameters[i].equals(OUTPUT_PREFIX) || _parameters[i].equals(PREFIX)) {
            i++;
            outputPrefix = _parameters[i];
        } else if (_parameters[i].equals(COLOR)) {
            i++;
            color = _parameters[i];
        } else if (_parameters[i].equals(RESOLUTION) || _parameters[i].equals(DPI)) {
            i++;
            dpi = Integer.parseInt(_parameters[i]);
        } else if (_parameters[i].equals(CROPBOX)) {
            i++;
            cropBoxLowerLeftX = Float.valueOf(_parameters[i]);
            i++;
            cropBoxLowerLeftY = Float.valueOf(_parameters[i]);
            i++;
            cropBoxUpperRightX = Float.valueOf(_parameters[i]);
            i++;
            cropBoxUpperRightY = Float.valueOf(_parameters[i]);
        } else if (_parameters[i].equals(TIME)) {
            showTime = true;
        } else {
            if (pdfFile == null) {
                pdfFile = _parameters[i];
            }
        }
    }
    if (pdfFile == null) {
        usage();
    } else {
        if (outputPrefix == null) {
            outputPrefix = pdfFile.substring(0, pdfFile.lastIndexOf('.'));
        }

        PDDocument document = null;
        try {
            document = PDDocument.load(new File(pdfFile), password);

            ImageType imageType = null;
            if ("bilevel".equalsIgnoreCase(color)) {
                imageType = ImageType.BINARY;
            } else if ("gray".equalsIgnoreCase(color)) {
                imageType = ImageType.GRAY;
            } else if ("rgb".equalsIgnoreCase(color)) {
                imageType = ImageType.RGB;
            } else if ("rgba".equalsIgnoreCase(color)) {
                imageType = ImageType.ARGB;
            }

            if (imageType == null) {
                System.err.println("Error: Invalid color.");
                System.exit(2);
            }

            //if a CropBox has been specified, update the CropBox:
            //changeCropBoxes(PDDocument document,float a, float b, float c,float d)
            if (cropBoxLowerLeftX != 0 || cropBoxLowerLeftY != 0 || cropBoxUpperRightX != 0
                    || cropBoxUpperRightY != 0) {
                changeCropBox(document, cropBoxLowerLeftX, cropBoxLowerLeftY, cropBoxUpperRightX,
                        cropBoxUpperRightY);
            }

            long startTime = System.nanoTime();

            // render the pages
            boolean success = true;
            endPage = Math.min(endPage, document.getNumberOfPages());
            PDFRenderer renderer = new PDFRenderer(document);
            for (int i = startPage - 1; i < endPage;) {
                BufferedImage image = renderer.renderImageWithDPI(i, dpi, imageType);
                String fileName = outputPrefix + (i + 1) + "." + imageFormat;
                success &= ImageIOUtil.writeImage(image, fileName, dpi);
                return image;
            }

            // performance stats
            long endTime = System.nanoTime();
            long duration = endTime - startTime;
            int count = 1 + endPage - startPage;
            if (showTime) {
                System.err.printf("Rendered %d page%s in %dms\n", count, count == 1 ? "" : "s",
                        duration / 1000000);
            }

            if (!success) {
                System.err.println("Error: no writer found for image format '" + imageFormat + "'");
                System.exit(1);
            }
        } finally {
            if (document != null) {
                document.close();
            }
        }
        return null;
    }
    return null;
}

From source file:net.dstserbak.dataindexer.tokenizer.PDFTokenizer.java

/**
 * Splits text from PDF URL to words and returns them as TokensMap object.
 * @param url Input PDF URL/*w w w  .  j a  v  a  2  s  . com*/
 * @return Map that contains tokens, which are belong to PDF document
 * @throws IOException If an I/O error occurs
 */
public static TokensMap tokenizePdf(URL url) throws IOException {
    checkTempFileExistance();
    try (RandomAccessFile scratchFile = new RandomAccessFile(PDF_SCRATCH_FILE, "rw")) {
        try (PDDocument pd = PDDocument.load(url, scratchFile)) {
            return tokenizeInput(pd);
        }
    }
}

From source file:net.homeip.donaldm.doxmentor4j.indexers.PDFIndexer.java

License:Open Source License

@Override
public Reader getText(URI uri, int page, StringBuilder title)
        throws FileNotFoundException, MalformedURLException, IOException
//-----------------------------------------------------------------------------------------
{
    FileWriter writer = null;//from   w w  w  .j  a va 2  s  .  c  o  m
    PDDocument pdf = null;
    PDFTextStripper stripper = null;
    java.io.File tmpPdf = null;
    try {
        tmpPdf = Utils.uri2File(uri);
        if (tmpPdf != null)
            pdf = PDDocument.load(tmpPdf.getAbsolutePath(), true);
        else
            pdf = PDDocument.load(uri.toURL(), true);
        PDDocumentInformation pdfInfo = pdf.getDocumentInformation();
        String s = pdfInfo.getTitle();
        if ((s == null) || (s.length() == 0))
            s = uri.getPath();
        if (title != null)
            title.append(s);
        stripper = new PDFTextStripper();
        if (page >= 0) {
            stripper.setStartPage(page);
            stripper.setEndPage(page);
        } else {
            stripper.setStartPage(1);
            stripper.setEndPage(pdf.getNumberOfPages());
        }
        java.io.File f = java.io.File.createTempFile("pdf", ".tmp");
        writer = new FileWriter(f);
        stripper.writeText(pdf, writer);
        try {
            writer.close();
            writer = null;
        } catch (Exception _e) {
        }
        stripper.resetEngine();
        return new FileReader(f);
    } finally {
        if (stripper != null)
            try {
                stripper.resetEngine();
            } catch (Exception _e) {
            }
        if (pdf != null)
            try {
                pdf.close();
            } catch (Exception _e) {
            }
        if (writer != null)
            try {
                writer.close();
            } catch (Exception _e) {
            }
        if ((tmpPdf != null)
                && (tmpPdf.getAbsolutePath().toLowerCase().indexOf(System.getProperty("java.io.tmpdir")) >= 0))
            tmpPdf.delete();
    }
}

From source file:net.homeip.donaldm.doxmentor4j.indexers.PDFIndexer.java

License:Open Source License

@Override
public long index(String href, URI uri, boolean followLinks, Object... extraParams) throws IOException
//-----------------------------------------------------------------------------------------------------
{
    if (m_indexWriter == null) {
        logger.error("PDFIndexer: index writer is null");
        return -1;
    }// ww w  . j  a va 2 s  .  co  m
    PDDocument pdf = null;
    PDFTextStripper stripper = null;
    Reader reader = null;
    Writer writer = null;
    java.io.File tmpPdf = null;
    try {
        tmpPdf = Utils.uri2File(uri);
        if (tmpPdf != null)
            pdf = PDDocument.load(tmpPdf.getAbsolutePath(), true);
        else
            pdf = PDDocument.load(uri.toURL(), true);
        PDDocumentInformation pdfInfo = pdf.getDocumentInformation();
        String title = pdfInfo.getTitle();
        if ((title == null) || (title.isEmpty()))
            title = uri.getPath();
        stripper = new PDFTextStripper();
        int noPages = pdf.getNumberOfPages();
        stripper.setSuppressDuplicateOverlappingText(false);
        if (noPages != PDDocument.UNKNOWN_NUMBER_OF_PAGES) {
            for (int page = 1; page <= noPages; page++) {
                stripper.setStartPage(page);
                stripper.setEndPage(page);
                writer = new StringWriter();
                stripper.writeText(pdf, writer);
                reader = new StringReader(writer.toString());
                Document doc = new Document();
                doc.add(new Field("path", href, Field.Store.YES, Field.Index.NO));
                doc.add(new Field("title", title.toString(), Field.Store.YES, Field.Index.ANALYZED));
                doc.add(new Field("contents", reader, Field.TermVector.WITH_POSITIONS_OFFSETS));
                doc.add(new Field("page", Integer.toString(page), Field.Store.YES, Field.Index.NO));
                if (addDocument(doc))
                    AjaxIndexer.incrementCount();
                try {
                    writer.close();
                    writer = null;
                } catch (Exception _e) {
                }
                try {
                    reader.close();
                    reader = null;
                } catch (Exception _e) {
                }
                if ((page % 50) == 0) {
                    try {
                        System.runFinalization();
                        System.gc();
                    } catch (Exception _e) {
                    }
                }
            }
        } else {
            java.io.File f = java.io.File.createTempFile("pdf", ".tmp");
            writer = new FileWriter(f);
            stripper.writeText(pdf, writer);
            try {
                writer.close();
                writer = null;
            } catch (Exception _e) {
            }
            reader = new FileReader(f);
            Document doc = new Document();
            doc.add(new Field("path", href, Field.Store.YES, Field.Index.NO));
            doc.add(new Field("title", title.toString(), Field.Store.YES, Field.Index.ANALYZED));
            doc.add(new Field("contents", reader, Field.TermVector.WITH_POSITIONS_OFFSETS));
            doc.add(new Field("page", "-1", Field.Store.YES, Field.Index.NO));
            if (addDocument(doc))
                AjaxIndexer.incrementCount();
            try {
                reader.close();
                reader = null;
            } catch (Exception _e) {
            }
            try {
                System.runFinalization();
                System.gc();
            } catch (Exception _e) {
            }
        }
        return 1;
    } catch (Exception e) {
        logger.error("Error indexing PDF text from " + uri.toString(), e);
        return -1;
    } finally {
        if (stripper != null)
            try {
                stripper.resetEngine();
            } catch (Exception _e) {
            }
        if (pdf != null)
            try {
                pdf.close();
            } catch (Exception _e) {
            }
        if (writer != null)
            try {
                writer.close();
            } catch (Exception _e) {
            }
        if (reader != null)
            try {
                reader.close();
            } catch (Exception _e) {
            }
        if ((tmpPdf != null)
                && (tmpPdf.getAbsolutePath().toLowerCase().indexOf(System.getProperty("java.io.tmpdir")) >= 0))
            tmpPdf.delete();
    }
}

From source file:net.sourceforge.docfetcher.model.parse.PdfParser.java

License:Open Source License

@Override
protected ParseResult parse(@NotNull InputStream in, @NotNull final ParseContext context)
        throws ParseException {
    PDDocument pdfDoc = null;/*  www . j a va2 s.c  om*/
    try {
        /*
         * TODO post-release-1.1: check if 'force' argument in PDDocument/Stripper increases
         * number of parsed PDF files
         */
        pdfDoc = PDDocument.load(in, true);
        PDDocumentInformation pdInfo;
        final int pageCount;
        try {
            pdInfo = pdfDoc.getDocumentInformation();
            pageCount = pdfDoc.getNumberOfPages();
        } catch (ClassCastException e) {
            // Bug #3529070 and #3528345
            throw new ParseException(e);
        }
        StringWriter writer = new StringWriter();

        /*
         * If the PDF file is encrypted, the PDF stripper will automatically
         * try an empty password.
         * 
         * In contrast to the paging PDF parser that is used for the
         * preview, we do not need to call setSortByPosition(true) here
         * because the extracted text will be digested by Lucene anyway.
         */
        PDFTextStripper stripper = new PDFTextStripper() {
            protected void startPage(PDPage page) throws IOException {
                context.getReporter().subInfo(getCurrentPageNo(), pageCount);
            }

            protected void endPage(PDPage page) throws IOException {
                if (context.getCancelable().isCanceled())
                    setEndPage(0);
            }
        };
        stripper.setForceParsing(true);

        try {
            stripper.writeText(pdfDoc, writer);
        } catch (RuntimeException e) {
            /*
             * PDFTextStripper.writeText can throw various
             * RuntimeExceptions, see bugs #3446010, #3448272, #3444887.
             */
            throw new ParseException(e);
        }

        return new ParseResult(writer.getBuffer()).setTitle(pdInfo.getTitle()).addAuthor(pdInfo.getAuthor())
                .addMiscMetadata(pdInfo.getSubject()).addMiscMetadata(pdInfo.getKeywords());
    } catch (IOException e) {
        if (e.getCause() instanceof CryptographyException)
            throw new ParseException(Msg.doc_pw_protected.get());
        throw new ParseException(e);
    } finally {
        close(pdfDoc);
    }
}

From source file:net.sourceforge.vaticanfetcher.model.parse.PdfParser.java

License:Open Source License

@Override
protected ParseResult parse(@NotNull InputStream in, @NotNull final ParseContext context)
        throws ParseException {
    PDDocument pdfDoc = null;/*from  w ww .  j a  va  2  s  .  c  om*/
    try {
        /* TODO post-release-1.1: check if 'force' argument in PDDocument/Stripper increases number of parsed PDF files */
        pdfDoc = PDDocument.load(in, true);
        PDDocumentInformation pdInfo;
        final int pageCount;
        try {
            pdInfo = pdfDoc.getDocumentInformation();
            pageCount = pdfDoc.getNumberOfPages();
        } catch (ClassCastException e) {
            // Bug #3529070 and #3528345
            throw new ParseException(e);
        }
        StringWriter writer = new StringWriter();

        /*
         * If the PDF file is encrypted, the PDF stripper will automatically try an empty password.
         * 
         * In contrast to the paging PDF parser that is used for the preview, we do not need to call 
         * setSortByPosition(true) here because the extracted text will be digested by Lucene anyway.
         */
        PDFTextStripper stripper = new PDFTextStripper() {
            protected void startPage(PDPage page) throws IOException {
                context.getReporter().subInfo(getCurrentPageNo(), pageCount);
            }

            protected void endPage(PDPage page) throws IOException {
                if (context.getCancelable().isCanceled())
                    setEndPage(0);
            }
        };
        stripper.setForceParsing(true);

        try {
            stripper.writeText(pdfDoc, writer);
        } catch (RuntimeException e) {
            /* PDFTextStripper.writeText can throw various RuntimeExceptions, see bugs #3446010, #3448272, #3444887. */
            throw new ParseException(e);
        }

        return new ParseResult(writer.getBuffer()).setTitle(pdInfo.getTitle()).addAuthor(pdInfo.getAuthor())
                .addMiscMetadata(pdInfo.getSubject()).addMiscMetadata(pdInfo.getKeywords());
    } catch (IOException e) {
        if (e.getCause() instanceof CryptographyException)
            throw new ParseException(Msg.doc_pw_protected.get());
        throw new ParseException(e);
    } finally {
        close(pdfDoc);
    }
}

From source file:org.apache.tika.parser.pdf.EnhancedPDFParser.java

License:Apache License

public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {

    PDDocument pdfDocument = null;//from w  w  w  . j  av a2s  .  c  o m
    //config from context, or default if not set via context
    PDFParserConfig localConfig = context.get(PDFParserConfig.class, defaultConfig);
    String password = "";
    try {
        // PDFBox can process entirely in memory, or can use a temp file
        //  for unpacked / processed resources
        // Decide which to do based on if we're reading from a file or not already
        TikaInputStream tstream = TikaInputStream.cast(stream);
        password = getPassword(metadata, context);
        if (tstream != null && tstream.hasFile()) {
            // File based, take that as a cue to use a temporary file
            if (localConfig.getUseNonSequentialParser() == true) {
                pdfDocument = PDDocument.load(new CloseShieldInputStream(stream), password);
            } else {
                pdfDocument = PDDocument.load(new CloseShieldInputStream(stream), true);
            }
        } else {
            // Go for the normal, stream based in-memory parsing
            if (localConfig.getUseNonSequentialParser() == true) {
                pdfDocument = PDDocument.load(new CloseShieldInputStream(stream), password);
            } else {
                pdfDocument = PDDocument.load(new CloseShieldInputStream(stream), true);
            }
        }
        metadata.set("pdf:encrypted", Boolean.toString(pdfDocument.isEncrypted()));

        pdfDocument.setAllSecurityToBeRemoved(true);

        metadata.set(Metadata.CONTENT_TYPE, "application/pdf");
        extractMetadata(pdfDocument, metadata);
        if (handler != null) {
            String xfaXml = extractXFAText(pdfDocument);
            if (xfaXml != null) {
                try (BufferedInputStream is = new BufferedInputStream(
                        new ByteArrayInputStream(xfaXml.getBytes()))) {
                    new TXTParser().parse(is, handler, metadata, context);
                }
                metadata.set(Metadata.CONTENT_TYPE, "application/pdf");
            } else {
                EnhancedPDF2XHTML.process(pdfDocument, handler, context, metadata, localConfig);
            }
        }
    } finally {
        if (pdfDocument != null) {
            pdfDocument.close();
        }
    }
}