List of usage examples for org.apache.pdfbox.pdmodel PDDocument load
public static PDDocument load(byte[] input, String password) throws IOException
From source file:dk.defxws.fedoragsearch.server.TransformerToText.java
License:Open Source License
/** * // w w w .j a va 2 s. com * * @throws Exception. */ private StringBuffer getTextFromPDF(byte[] doc, String pageNum) throws Exception { StringBuffer docText = new StringBuffer(); PDDocument pdDoc = null; String password = ""; // extract PDF document's textual content try { PDFTextStripper stripper = new PDFTextStripper(/*"UTF-8"*/); int page = Integer.parseInt(pageNum); if (page != -1) { stripper.setStartPage(page); stripper.setEndPage(page); } //password pdDoc = PDDocument.load(new ByteArrayInputStream(doc), password); // new PDDocument(cosDoc); docText = new StringBuffer(stripper.getText(pdDoc)); } catch (IOException e) { throw new Exception("Cannot parse PDF document", e); } finally { closePDDocument(pdDoc); } return docText; }
From source file:edu.ist.psu.sagnik.research.pdfbox2playground.javatest.ExtractImages.java
License:Apache License
private void extract(String pdfFile, String password) throws IOException { PDDocument document = null;//from w w w .ja v a 2 s .co m try { document = PDDocument.load(new File(pdfFile), password); AccessPermission ap = document.getCurrentAccessPermission(); if (!ap.canExtractContent()) { throw new IOException("You do not have permission to extract images"); } for (int i = 0; i < document.getNumberOfPages(); i++) // todo: ITERATOR would be much better { PDPage page = document.getPage(i); ImageGraphicsEngine extractor = new ImageGraphicsEngine(page); extractor.run(); } } finally { if (document != null) { document.close(); } } }
From source file:merge_split.MergeSplit.java
License:Apache License
private void ConvertButtonActionPerformed(java.awt.event.ActionEvent evt) {//GEN-FIRST:event_ConvertButtonActionPerformed PDDocument document = null;//from w ww. ja va2s. com try { document = PDDocument.load(new File((String) ConvertFileField.getText()), convertcode); } catch (IOException ex) { JOptionPane.showMessageDialog(null, "Problem opening pdf.", "Problem opening pdf", JOptionPane.WARNING_MESSAGE); } TreeSet tree = findPages((String) ConvertPagesField.getText()); PDFRenderer pdfRenderer = new PDFRenderer(document); for (int page = 0; page < document.getNumberOfPages(); ++page) { BufferedImage bim = null; try { bim = pdfRenderer.renderImageWithDPI(page, 300, ImageType.RGB); } catch (IOException ex) { JOptionPane.showMessageDialog(null, "Problem rendering image.", "Problem rendering image", JOptionPane.WARNING_MESSAGE); } // suffix in filename will be used as the file format String destination = ConvertDestinationField.getText() + "\\" + ConvertNameField.getText(); String image = ".png"; if (pngbutton.isSelected()) { image = ".png"; } else if (bmpbutton.isSelected()) { image = ".bmp"; } else if (gifbutton.isSelected()) { image = ".gif"; } else if (jpgbutton.isSelected()) { image = ".jpg"; } try { if (tree.contains(page + 1)) { ImageIOUtil.writeImage(bim, destination + "-" + (page + 1) + image, 300); } } catch (IOException ex) { JOptionPane.showMessageDialog(null, "Problem output image.", "Problem output image", JOptionPane.WARNING_MESSAGE); java.util.logging.Logger.getLogger(MergeSplit.class.getName()).log(java.util.logging.Level.SEVERE, null, ex); } } try { document.close(); } catch (IOException ex) { Logger.getLogger(MergeSplit.class.getName()).log(Level.SEVERE, null, ex); } }
From source file:model.util.pdf.PDFUtils.java
License:Apache License
/** * Infamous main method.// www.ja v a 2 s . c om * Adapted by Julius Huelsmann. * * @author Ben Litchfield * * @param args Command line arguments, should be one and a reference to a file. * * @throws IOException If there is an error parsing the document. */ public static BufferedImage pdf2image(final String[] _parameters) throws IOException { // suppress the Dock icon on OS X if called from outside // the paint - project. System.setProperty("apple.awt.UIElement", "true"); String password = ""; String pdfFile = null; String outputPrefix = null; String imageFormat = "jpg"; int startPage = 1; int endPage = Integer.MAX_VALUE; String color = "rgb"; int dpi; float cropBoxLowerLeftX = 0; float cropBoxLowerLeftY = 0; float cropBoxUpperRightX = 0; float cropBoxUpperRightY = 0; boolean showTime = false; try { dpi = Toolkit.getDefaultToolkit().getScreenResolution(); } catch (HeadlessException e) { dpi = 96; } for (int i = 0; i < _parameters.length; i++) { if (_parameters[i].equals(PASSWORD)) { i++; if (i >= _parameters.length) { usage(); } password = _parameters[i]; } else if (_parameters[i].equals(START_PAGE)) { i++; if (i >= _parameters.length) { usage(); } startPage = Integer.parseInt(_parameters[i]); } else if (_parameters[i].equals(END_PAGE)) { i++; if (i >= _parameters.length) { usage(); } endPage = Integer.parseInt(_parameters[i]); } else if (_parameters[i].equals(PAGE)) { i++; if (i >= _parameters.length) { usage(); } startPage = Integer.parseInt(_parameters[i]); endPage = Integer.parseInt(_parameters[i]); } else if (_parameters[i].equals(IMAGE_TYPE) || _parameters[i].equals(FORMAT)) { i++; imageFormat = _parameters[i]; } else if (_parameters[i].equals(OUTPUT_PREFIX) || _parameters[i].equals(PREFIX)) { i++; outputPrefix = _parameters[i]; } else if (_parameters[i].equals(COLOR)) { i++; color = _parameters[i]; } else if (_parameters[i].equals(RESOLUTION) || _parameters[i].equals(DPI)) { i++; dpi = Integer.parseInt(_parameters[i]); } else if (_parameters[i].equals(CROPBOX)) { i++; cropBoxLowerLeftX = Float.valueOf(_parameters[i]); i++; cropBoxLowerLeftY = Float.valueOf(_parameters[i]); i++; cropBoxUpperRightX = Float.valueOf(_parameters[i]); i++; cropBoxUpperRightY = Float.valueOf(_parameters[i]); } else if (_parameters[i].equals(TIME)) { showTime = true; } else { if (pdfFile == null) { pdfFile = _parameters[i]; } } } if (pdfFile == null) { usage(); } else { if (outputPrefix == null) { outputPrefix = pdfFile.substring(0, pdfFile.lastIndexOf('.')); } PDDocument document = null; try { document = PDDocument.load(new File(pdfFile), password); ImageType imageType = null; if ("bilevel".equalsIgnoreCase(color)) { imageType = ImageType.BINARY; } else if ("gray".equalsIgnoreCase(color)) { imageType = ImageType.GRAY; } else if ("rgb".equalsIgnoreCase(color)) { imageType = ImageType.RGB; } else if ("rgba".equalsIgnoreCase(color)) { imageType = ImageType.ARGB; } if (imageType == null) { System.err.println("Error: Invalid color."); System.exit(2); } //if a CropBox has been specified, update the CropBox: //changeCropBoxes(PDDocument document,float a, float b, float c,float d) if (cropBoxLowerLeftX != 0 || cropBoxLowerLeftY != 0 || cropBoxUpperRightX != 0 || cropBoxUpperRightY != 0) { changeCropBox(document, cropBoxLowerLeftX, cropBoxLowerLeftY, cropBoxUpperRightX, cropBoxUpperRightY); } long startTime = System.nanoTime(); // render the pages boolean success = true; endPage = Math.min(endPage, document.getNumberOfPages()); PDFRenderer renderer = new PDFRenderer(document); for (int i = startPage - 1; i < endPage;) { BufferedImage image = renderer.renderImageWithDPI(i, dpi, imageType); String fileName = outputPrefix + (i + 1) + "." + imageFormat; success &= ImageIOUtil.writeImage(image, fileName, dpi); return image; } // performance stats long endTime = System.nanoTime(); long duration = endTime - startTime; int count = 1 + endPage - startPage; if (showTime) { System.err.printf("Rendered %d page%s in %dms\n", count, count == 1 ? "" : "s", duration / 1000000); } if (!success) { System.err.println("Error: no writer found for image format '" + imageFormat + "'"); System.exit(1); } } finally { if (document != null) { document.close(); } } return null; } return null; }
From source file:net.dstserbak.dataindexer.tokenizer.PDFTokenizer.java
/** * Splits text from PDF URL to words and returns them as TokensMap object. * @param url Input PDF URL/*w w w . j a v a 2 s . com*/ * @return Map that contains tokens, which are belong to PDF document * @throws IOException If an I/O error occurs */ public static TokensMap tokenizePdf(URL url) throws IOException { checkTempFileExistance(); try (RandomAccessFile scratchFile = new RandomAccessFile(PDF_SCRATCH_FILE, "rw")) { try (PDDocument pd = PDDocument.load(url, scratchFile)) { return tokenizeInput(pd); } } }
From source file:net.homeip.donaldm.doxmentor4j.indexers.PDFIndexer.java
License:Open Source License
@Override public Reader getText(URI uri, int page, StringBuilder title) throws FileNotFoundException, MalformedURLException, IOException //----------------------------------------------------------------------------------------- { FileWriter writer = null;//from w w w .j a va 2 s . c o m PDDocument pdf = null; PDFTextStripper stripper = null; java.io.File tmpPdf = null; try { tmpPdf = Utils.uri2File(uri); if (tmpPdf != null) pdf = PDDocument.load(tmpPdf.getAbsolutePath(), true); else pdf = PDDocument.load(uri.toURL(), true); PDDocumentInformation pdfInfo = pdf.getDocumentInformation(); String s = pdfInfo.getTitle(); if ((s == null) || (s.length() == 0)) s = uri.getPath(); if (title != null) title.append(s); stripper = new PDFTextStripper(); if (page >= 0) { stripper.setStartPage(page); stripper.setEndPage(page); } else { stripper.setStartPage(1); stripper.setEndPage(pdf.getNumberOfPages()); } java.io.File f = java.io.File.createTempFile("pdf", ".tmp"); writer = new FileWriter(f); stripper.writeText(pdf, writer); try { writer.close(); writer = null; } catch (Exception _e) { } stripper.resetEngine(); return new FileReader(f); } finally { if (stripper != null) try { stripper.resetEngine(); } catch (Exception _e) { } if (pdf != null) try { pdf.close(); } catch (Exception _e) { } if (writer != null) try { writer.close(); } catch (Exception _e) { } if ((tmpPdf != null) && (tmpPdf.getAbsolutePath().toLowerCase().indexOf(System.getProperty("java.io.tmpdir")) >= 0)) tmpPdf.delete(); } }
From source file:net.homeip.donaldm.doxmentor4j.indexers.PDFIndexer.java
License:Open Source License
@Override public long index(String href, URI uri, boolean followLinks, Object... extraParams) throws IOException //----------------------------------------------------------------------------------------------------- { if (m_indexWriter == null) { logger.error("PDFIndexer: index writer is null"); return -1; }// ww w . j a va 2 s . co m PDDocument pdf = null; PDFTextStripper stripper = null; Reader reader = null; Writer writer = null; java.io.File tmpPdf = null; try { tmpPdf = Utils.uri2File(uri); if (tmpPdf != null) pdf = PDDocument.load(tmpPdf.getAbsolutePath(), true); else pdf = PDDocument.load(uri.toURL(), true); PDDocumentInformation pdfInfo = pdf.getDocumentInformation(); String title = pdfInfo.getTitle(); if ((title == null) || (title.isEmpty())) title = uri.getPath(); stripper = new PDFTextStripper(); int noPages = pdf.getNumberOfPages(); stripper.setSuppressDuplicateOverlappingText(false); if (noPages != PDDocument.UNKNOWN_NUMBER_OF_PAGES) { for (int page = 1; page <= noPages; page++) { stripper.setStartPage(page); stripper.setEndPage(page); writer = new StringWriter(); stripper.writeText(pdf, writer); reader = new StringReader(writer.toString()); Document doc = new Document(); doc.add(new Field("path", href, Field.Store.YES, Field.Index.NO)); doc.add(new Field("title", title.toString(), Field.Store.YES, Field.Index.ANALYZED)); doc.add(new Field("contents", reader, Field.TermVector.WITH_POSITIONS_OFFSETS)); doc.add(new Field("page", Integer.toString(page), Field.Store.YES, Field.Index.NO)); if (addDocument(doc)) AjaxIndexer.incrementCount(); try { writer.close(); writer = null; } catch (Exception _e) { } try { reader.close(); reader = null; } catch (Exception _e) { } if ((page % 50) == 0) { try { System.runFinalization(); System.gc(); } catch (Exception _e) { } } } } else { java.io.File f = java.io.File.createTempFile("pdf", ".tmp"); writer = new FileWriter(f); stripper.writeText(pdf, writer); try { writer.close(); writer = null; } catch (Exception _e) { } reader = new FileReader(f); Document doc = new Document(); doc.add(new Field("path", href, Field.Store.YES, Field.Index.NO)); doc.add(new Field("title", title.toString(), Field.Store.YES, Field.Index.ANALYZED)); doc.add(new Field("contents", reader, Field.TermVector.WITH_POSITIONS_OFFSETS)); doc.add(new Field("page", "-1", Field.Store.YES, Field.Index.NO)); if (addDocument(doc)) AjaxIndexer.incrementCount(); try { reader.close(); reader = null; } catch (Exception _e) { } try { System.runFinalization(); System.gc(); } catch (Exception _e) { } } return 1; } catch (Exception e) { logger.error("Error indexing PDF text from " + uri.toString(), e); return -1; } finally { if (stripper != null) try { stripper.resetEngine(); } catch (Exception _e) { } if (pdf != null) try { pdf.close(); } catch (Exception _e) { } if (writer != null) try { writer.close(); } catch (Exception _e) { } if (reader != null) try { reader.close(); } catch (Exception _e) { } if ((tmpPdf != null) && (tmpPdf.getAbsolutePath().toLowerCase().indexOf(System.getProperty("java.io.tmpdir")) >= 0)) tmpPdf.delete(); } }
From source file:net.sourceforge.docfetcher.model.parse.PdfParser.java
License:Open Source License
@Override protected ParseResult parse(@NotNull InputStream in, @NotNull final ParseContext context) throws ParseException { PDDocument pdfDoc = null;/* www . j a va2 s.c om*/ try { /* * TODO post-release-1.1: check if 'force' argument in PDDocument/Stripper increases * number of parsed PDF files */ pdfDoc = PDDocument.load(in, true); PDDocumentInformation pdInfo; final int pageCount; try { pdInfo = pdfDoc.getDocumentInformation(); pageCount = pdfDoc.getNumberOfPages(); } catch (ClassCastException e) { // Bug #3529070 and #3528345 throw new ParseException(e); } StringWriter writer = new StringWriter(); /* * If the PDF file is encrypted, the PDF stripper will automatically * try an empty password. * * In contrast to the paging PDF parser that is used for the * preview, we do not need to call setSortByPosition(true) here * because the extracted text will be digested by Lucene anyway. */ PDFTextStripper stripper = new PDFTextStripper() { protected void startPage(PDPage page) throws IOException { context.getReporter().subInfo(getCurrentPageNo(), pageCount); } protected void endPage(PDPage page) throws IOException { if (context.getCancelable().isCanceled()) setEndPage(0); } }; stripper.setForceParsing(true); try { stripper.writeText(pdfDoc, writer); } catch (RuntimeException e) { /* * PDFTextStripper.writeText can throw various * RuntimeExceptions, see bugs #3446010, #3448272, #3444887. */ throw new ParseException(e); } return new ParseResult(writer.getBuffer()).setTitle(pdInfo.getTitle()).addAuthor(pdInfo.getAuthor()) .addMiscMetadata(pdInfo.getSubject()).addMiscMetadata(pdInfo.getKeywords()); } catch (IOException e) { if (e.getCause() instanceof CryptographyException) throw new ParseException(Msg.doc_pw_protected.get()); throw new ParseException(e); } finally { close(pdfDoc); } }
From source file:net.sourceforge.vaticanfetcher.model.parse.PdfParser.java
License:Open Source License
@Override protected ParseResult parse(@NotNull InputStream in, @NotNull final ParseContext context) throws ParseException { PDDocument pdfDoc = null;/*from w ww . j a va 2 s . c om*/ try { /* TODO post-release-1.1: check if 'force' argument in PDDocument/Stripper increases number of parsed PDF files */ pdfDoc = PDDocument.load(in, true); PDDocumentInformation pdInfo; final int pageCount; try { pdInfo = pdfDoc.getDocumentInformation(); pageCount = pdfDoc.getNumberOfPages(); } catch (ClassCastException e) { // Bug #3529070 and #3528345 throw new ParseException(e); } StringWriter writer = new StringWriter(); /* * If the PDF file is encrypted, the PDF stripper will automatically try an empty password. * * In contrast to the paging PDF parser that is used for the preview, we do not need to call * setSortByPosition(true) here because the extracted text will be digested by Lucene anyway. */ PDFTextStripper stripper = new PDFTextStripper() { protected void startPage(PDPage page) throws IOException { context.getReporter().subInfo(getCurrentPageNo(), pageCount); } protected void endPage(PDPage page) throws IOException { if (context.getCancelable().isCanceled()) setEndPage(0); } }; stripper.setForceParsing(true); try { stripper.writeText(pdfDoc, writer); } catch (RuntimeException e) { /* PDFTextStripper.writeText can throw various RuntimeExceptions, see bugs #3446010, #3448272, #3444887. */ throw new ParseException(e); } return new ParseResult(writer.getBuffer()).setTitle(pdInfo.getTitle()).addAuthor(pdInfo.getAuthor()) .addMiscMetadata(pdInfo.getSubject()).addMiscMetadata(pdInfo.getKeywords()); } catch (IOException e) { if (e.getCause() instanceof CryptographyException) throw new ParseException(Msg.doc_pw_protected.get()); throw new ParseException(e); } finally { close(pdfDoc); } }
From source file:org.apache.tika.parser.pdf.EnhancedPDFParser.java
License:Apache License
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { PDDocument pdfDocument = null;//from w w w . j av a2s . c o m //config from context, or default if not set via context PDFParserConfig localConfig = context.get(PDFParserConfig.class, defaultConfig); String password = ""; try { // PDFBox can process entirely in memory, or can use a temp file // for unpacked / processed resources // Decide which to do based on if we're reading from a file or not already TikaInputStream tstream = TikaInputStream.cast(stream); password = getPassword(metadata, context); if (tstream != null && tstream.hasFile()) { // File based, take that as a cue to use a temporary file if (localConfig.getUseNonSequentialParser() == true) { pdfDocument = PDDocument.load(new CloseShieldInputStream(stream), password); } else { pdfDocument = PDDocument.load(new CloseShieldInputStream(stream), true); } } else { // Go for the normal, stream based in-memory parsing if (localConfig.getUseNonSequentialParser() == true) { pdfDocument = PDDocument.load(new CloseShieldInputStream(stream), password); } else { pdfDocument = PDDocument.load(new CloseShieldInputStream(stream), true); } } metadata.set("pdf:encrypted", Boolean.toString(pdfDocument.isEncrypted())); pdfDocument.setAllSecurityToBeRemoved(true); metadata.set(Metadata.CONTENT_TYPE, "application/pdf"); extractMetadata(pdfDocument, metadata); if (handler != null) { String xfaXml = extractXFAText(pdfDocument); if (xfaXml != null) { try (BufferedInputStream is = new BufferedInputStream( new ByteArrayInputStream(xfaXml.getBytes()))) { new TXTParser().parse(is, handler, metadata, context); } metadata.set(Metadata.CONTENT_TYPE, "application/pdf"); } else { EnhancedPDF2XHTML.process(pdfDocument, handler, context, metadata, localConfig); } } } finally { if (pdfDocument != null) { pdfDocument.close(); } } }