List of usage examples for org.apache.pdfbox.pdmodel PDDocument load
public static PDDocument load(byte[] input) throws IOException
From source file:net.sourceforge.docfetcher.model.parse.PagingPdfParser.java
License:Open Source License
/**
 * Loads the PDF in {@code file} and writes its text to {@code writer} using a
 * {@link PagingStripper} with forced parsing and position sorting enabled.
 *
 * @throws ParseException if any {@link Exception} is raised while loading or stripping
 * @throws CheckedOutOfMemoryError if an {@link OutOfMemoryError} is raised while loading or stripping
 */
public final void run() throws ParseException, CheckedOutOfMemoryError {
    PDDocument pdf = null;
    try {
        pdf = PDDocument.load(file);

        PagingStripper pagingStripper = new PagingStripper();
        pagingStripper.setForceParsing(true);
        pagingStripper.setSortByPosition(true);
        pagingStripper.writeText(pdf, writer);
    } catch (OutOfMemoryError outOfMemory) {
        // Converted to a checked type so callers are forced to deal with it.
        throw new CheckedOutOfMemoryError(outOfMemory);
    } catch (Exception failure) {
        throw new ParseException(failure);
    } finally {
        // Runs on every path; pdf may still be null if loading failed.
        PdfParser.close(pdf);
    }
}
From source file:net.sourceforge.docfetcher.model.parse.TestParseFromZip.java
License:Open Source License
@Test public void testZippedPdf() throws Exception { new ZipAndRun(TestFiles.multi_page_pdf) { protected void handleInputStream(InputStream in) throws Exception { PDDocument pdfDoc = PDDocument.load(in); PDFTextStripper stripper = new PDFTextStripper(); StringWriter writer = new StringWriter(); stripper.setForceParsing(true); stripper.setSortByPosition(true); stripper.writeText(pdfDoc, writer); // Will handle encryption with empty password PDDocumentInformation pdInfo = pdfDoc.getDocumentInformation(); ParseResult result = new ParseResult(writer.getBuffer()).setTitle(pdInfo.getTitle()) .addAuthor(pdInfo.getAuthor()).addMiscMetadata(pdInfo.getSubject()) .addMiscMetadata(pdInfo.getKeywords()); String expectedContents = Util.join(Util.LS, "page 1", "page 2", "page 3"); String actualContents = result.getContent().toString().trim(); assertEquals(expectedContents, actualContents); }/*from w ww .j a v a 2 s .c o m*/ }; }
From source file:net.sourceforge.docfetcher.parse.PDFParser.java
License:Open Source License
public String renderText(File file) throws ParseException { PDDocument pdfDoc = null;//from w w w .ja va2 s. c o m try { pdfDoc = PDDocument.load(file); if (pdfDoc.isEncrypted()) { try { pdfDoc.openProtection(new StandardDecryptionMaterial("")); } catch (Exception e) { throw new ParseException(file, Msg.no_extraction_permission.value()); } } PDFTextStripper stripper = new PDFTextStripper(); StringWriter writer = new StringWriter(); stripper.writeText(pdfDoc, writer); return writer.toString(); } catch (IOException e) { throw new ParseException(file, Msg.file_not_readable.value()); } finally { if (pdfDoc != null) { try { pdfDoc.close(); } catch (IOException e) { e.printStackTrace(); } } } }
From source file:net.sourceforge.docfetcher.parse.PDFParser.java
License:Open Source License
public Document parse(File file) throws ParseException { PDDocument pdfDoc = null;/* w ww . j a v a2s.co m*/ try { // Check if PDF file is encrypted pdfDoc = PDDocument.load(file); if (pdfDoc.isEncrypted()) { try { pdfDoc.openProtection(new StandardDecryptionMaterial("")); } catch (Exception e) { throw new ParseException(file, Msg.no_extraction_permission.value()); } } // Get tags and contents PDFTextStripper stripper = new PDFTextStripper(); StringWriter writer = new StringWriter(); stripper.writeText(pdfDoc, writer); DocFetcher.getInstance().setExceptionHandlerEnabled(true); PDDocumentInformation pdInfo = pdfDoc.getDocumentInformation(); String[] metaData = new String[] { pdInfo.getTitle(), pdInfo.getAuthor(), pdInfo.getSubject(), pdInfo.getKeywords(), }; for (String field : metaData) if (field != null) writer.append(" ").append(field); //$NON-NLS-1$ return new Document(file, metaData[0], writer.getBuffer()).addAuthor(metaData[1]); } catch (IOException e) { throw new ParseException(file, Msg.file_not_readable.value()); } finally { if (pdfDoc != null) { try { pdfDoc.close(); } catch (IOException e) { e.printStackTrace(); } } } }
From source file:net.yacy.cora.util.Html2Image.java
License:Open Source License
/** * convert a pdf (first page) to an image. proper values are i.e. width = 1024, height = 1024, density = 300, quality = 75 * using internal pdf library or external command line tool on linux or mac * @param pdf input pdf file/*w ww .j a va 2 s . c o m*/ * @param image output jpg file * @param width * @param height * @param density (dpi) * @param quality * @return */ public static boolean pdf2image(File pdf, File image, int width, int height, int density, int quality) { final File convert = convertMac1.exists() ? convertMac1 : convertMac2.exists() ? convertMac2 : convertDebian; // convert pdf to jpg using internal pdfbox capability if (OS.isWindows || !convert.exists()) { try { PDDocument pdoc = PDDocument.load(pdf); BufferedImage bi = new PDFRenderer(pdoc).renderImageWithDPI(0, density, ImageType.RGB); return ImageIO.write(bi, "jpg", image); } catch (IOException ex) { } } // convert on mac or linux using external command line utility try { // i.e. convert -density 300 -trim yacy.pdf[0] -trim -resize 1024x -crop x1024+0+0 -quality 75% yacy-convert-300.jpg // note: both -trim are necessary, otherwise it is trimmed only on one side. The [0] selects the first page of the pdf String command = convert.getAbsolutePath() + " -density " + density + " -trim " + pdf.getAbsolutePath() + "[0] -trim -resize " + width + "x -crop x" + height + "+0+0 -quality " + quality + "% " + image.getAbsolutePath(); List<String> message = OS.execSynchronous(command); if (image.exists()) return true; ConcurrentLog.warn("Html2Image", "failed to create image with command: " + command); for (String m : message) ConcurrentLog.warn("Html2Image", ">> " + m); // another try for mac: use Image Events using AppleScript in osacript commands... 
// the following command overwrites a pdf with an png, so we must make a copy first if (!OS.isMacArchitecture) return false; File pngFile = new File(pdf.getAbsolutePath() + ".tmp.pdf"); org.apache.commons.io.FileUtils.copyFile(pdf, pngFile); String[] commandx = { "osascript", "-e", "set ImgFile to \"" + pngFile.getAbsolutePath() + "\"", "-e", "tell application \"Image Events\"", "-e", "set Img to open file ImgFile", "-e", "save Img as PNG", "-e", "end tell" }; //ConcurrentLog.warn("Html2Image", "failed to create image with command: " + commandx); message = OS.execSynchronous(commandx); for (String m : message) ConcurrentLog.warn("Html2Image", ">> " + m); // now we must read and convert this file to a jpg with the target size 1024x1024 try { File newPngFile = new File(pngFile.getAbsolutePath() + ".png"); pngFile.renameTo(newPngFile); Image img = ImageParser.parse(pngFile.getAbsolutePath(), FileUtils.read(newPngFile)); final Image scaled = img.getScaledInstance(width, height, Image.SCALE_AREA_AVERAGING); final MediaTracker mediaTracker = new MediaTracker(new Container()); mediaTracker.addImage(scaled, 0); try { mediaTracker.waitForID(0); } catch (final InterruptedException e) { } // finally write the image final BufferedImage bi = new BufferedImage(width, height, BufferedImage.TYPE_INT_RGB); bi.createGraphics().drawImage(scaled, 0, 0, width, height, null); ImageIO.write(bi, "jpg", image); newPngFile.delete(); return image.exists(); } catch (IOException e) { ConcurrentLog.logException(e); return false; } } catch (IOException e) { e.printStackTrace(); return false; } }
From source file:net.yacy.document.parser.pdfParser.java
License:Open Source License
/**
 * Parses a PDF from {@code source} into one or more {@link Document}s.
 *
 * <p>Steps, in order: request memory, load the PDF (at minimum thread priority),
 * try to open encrypted documents with an empty password, read the document
 * information, extract links, then extract text. When {@code individualPages} is
 * set, one document per page is produced with a virtual URL carrying
 * {@code individualPagePropertyname=<page>}; otherwise a single document is
 * produced for the whole file.
 *
 * <p>The {@code charset}, {@code scraper} and {@code timezoneOffset} parameters are
 * not referenced in this method body; the produced documents always use "UTF-8".
 *
 * @param location URL of the document; used for failure reporting, as fallback title and as document URL
 * @param mimeType mime type passed through to the created documents
 * @param source stream the PDF is loaded from
 * @return the created documents. NOTE(review): {@code result} stays {@code null} when
 *         anything is thrown during link/text extraction, because that exception is
 *         swallowed below - confirm callers handle a null return.
 * @throws Parser.Failure if memory cannot be reserved, the PDF cannot be loaded, or
 *         an encrypted document cannot be decrypted / does not permit content extraction
 * @throws InterruptedException declared, but NOTE(review): an interrupt raised by
 *         {@code t.join(3000)} is caught by the enclosing {@code catch (Throwable)} -
 *         confirm whether it is meant to propagate.
 */
@Override
public Document[] parse(
        final AnchorURL location,
        final String mimeType,
        final String charset,
        final VocabularyScraper scraper,
        final int timezoneOffset,
        final InputStream source) throws Parser.Failure, InterruptedException {

    // check memory for parser
    if (!MemoryControl.request(200 * 1024 * 1024, false))
        throw new Parser.Failure("Not enough Memory available for pdf parser: " + MemoryControl.available(), location);

    // create a pdf parser
    PDDocument pdfDoc;
    //final PDFParser pdfParser;
    try {
        Thread.currentThread().setPriority(Thread.MIN_PRIORITY); // the pdfparser is a big pain
        pdfDoc = PDDocument.load(source);
        //PDFParser pdfParser = new PDFParser(source);
        //pdfParser.parse();
        //pdfDoc = pdfParser.getPDDocument();
    } catch (final IOException e) {
        throw new Parser.Failure(e.getMessage(), location);
    } finally {
        // priority is reset to NORM_PRIORITY regardless of what it was before the call
        Thread.currentThread().setPriority(Thread.NORM_PRIORITY);
    }

    // encrypted documents: try the empty password; on every failure path the
    // document is closed (close errors ignored) before Parser.Failure is thrown
    if (pdfDoc.isEncrypted()) {
        try {
            pdfDoc.openProtection(new StandardDecryptionMaterial(""));
        } catch (final BadSecurityHandlerException e) {
            try {pdfDoc.close();} catch (final IOException ee) {}
            throw new Parser.Failure("Document is encrypted (1): " + e.getMessage(), location);
        } catch (final IOException e) {
            try {pdfDoc.close();} catch (final IOException ee) {}
            throw new Parser.Failure("Document is encrypted (2): " + e.getMessage(), location);
        } catch (final CryptographyException e) {
            try {pdfDoc.close();} catch (final IOException ee) {}
            throw new Parser.Failure("Document is encrypted (3): " + e.getMessage(), location);
        }
        final AccessPermission perm = pdfDoc.getCurrentAccessPermission();
        if (perm == null || !perm.canExtractContent()) {
            try {pdfDoc.close();} catch (final IOException ee) {}
            throw new Parser.Failure("Document is encrypted and cannot be decrypted", location);
        }
    }

    // extracting some metadata
    PDDocumentInformation info = pdfDoc.getDocumentInformation();
    String docTitle = null, docSubject = null, docAuthor = null, docPublisher = null, docKeywordStr = null;
    Date docDate = new Date(); // defaults to "now" when no modification date can be read
    if (info != null) {
        docTitle = info.getTitle();
        docSubject = info.getSubject();
        docAuthor = info.getAuthor();
        docPublisher = info.getProducer();
        if (docPublisher == null || docPublisher.isEmpty()) docPublisher = info.getCreator();
        docKeywordStr = info.getKeywords();
        try {if (info.getModificationDate() != null) docDate = info.getModificationDate().getTime();} catch (IOException e) {}
        // unused:
        // info.getTrapped());
    }
    info = null;

    // title fallback: file name from the URL, then the subject
    if (docTitle == null || docTitle.isEmpty()) {
        docTitle = MultiProtocolURL.unescape(location.getFileName());
    }
    if (docTitle == null) {
        docTitle = docSubject;
    }
    String[] docKeywords = null;
    if (docKeywordStr != null) {
        docKeywords = docKeywordStr.split(" |,"); // split on single spaces or commas
    }

    Collection<AnchorURL>[] pdflinks = null;
    Document[] result = null;
    try {
        // get the links
        pdflinks = extractPdfLinks(pdfDoc);

        // get the fulltext (either per document or for each page)
        final PDFTextStripper stripper = new PDFTextStripper("UTF-8");

        if (individualPages) {
            // this is a hack which stores individual pages of the source pdf into individual index documents
            // the new documents will get a virtual link with a post argument page=X appended to the original url

            // collect text
            int pagecount = pdfDoc.getNumberOfPages();
            String[] pages = new String[pagecount];
            for (int page = 1; page <= pagecount; page++) {
                stripper.setStartPage(page);
                stripper.setEndPage(page);
                pages[page - 1] = stripper.getText(pdfDoc);
                //System.out.println("PAGE " + page + ": " + pages[page - 1]);
            }

            // create individual documents for each page
            assert pages.length == pdflinks.length : "pages.length = " + pages.length + ", pdflinks.length = " + pdflinks.length;
            result = new Document[Math.min(pages.length, pdflinks.length)];
            String loc = location.toNormalform(true);
            for (int page = 0; page < result.length; page++) {
                // NOTE(review): the guard below uses "page > pages.length" while the links guard
                // uses ">="; the loop bound already keeps page < pages.length, so it looks
                // harmless, but the two guards are inconsistent - confirm intent.
                result[page] = new Document(
                        new AnchorURL(loc + (loc.indexOf('?') > 0 ? '&' : '?') + individualPagePropertyname + '=' + (page + 1)), // these are virtual new pages; we cannot combine them with '#' as that would be removed when computing the urlhash
                        mimeType,
                        "UTF-8",
                        this,
                        null,
                        docKeywords,
                        singleList(docTitle),
                        docAuthor,
                        docPublisher,
                        null,
                        null,
                        0.0f, 0.0f,
                        pages == null || page > pages.length ? new byte[0] : UTF8.getBytes(pages[page]),
                        pdflinks == null || page >= pdflinks.length ? null : pdflinks[page],
                        null,
                        null,
                        false,
                        docDate);
            }
        } else {
            // collect the whole text at once
            final CharBuffer writer = new CharBuffer(odtParser.MAX_DOCSIZE);
            byte[] contentBytes = new byte[0];
            stripper.setEndPage(3); // get first 3 pages (always)
            writer.append(stripper.getText(pdfDoc));
            contentBytes = writer.getBytes(); // remember text in case of interrupting thread

            if (pdfDoc.getNumberOfPages() > 3) { // spare creating/starting thread if all pages read
                stripper.setStartPage(4); // continue with page 4 (terminated, resulting in no text)
                stripper.setEndPage(Integer.MAX_VALUE); // set to default
                // we start the pdf parsing in a separate thread to ensure that it can be terminated
                final PDDocument pdfDocC = pdfDoc;
                final Thread t = new Thread() {
                    @Override
                    public void run() {
                        Thread.currentThread().setName("pdfParser.getText:" + location);
                        try {
                            writer.append(stripper.getText(pdfDocC));
                        } catch (final Throwable e) {}
                    }
                };
                t.start();
                t.join(3000); // pdfbox likes to forget to terminate ... (quite often)
                // after at most 3 seconds the worker is interrupted; whatever it has
                // appended to the writer by then is what gets used below
                if (t.isAlive()) t.interrupt();
            }
            contentBytes = writer.getBytes(); // get final text before closing writer

            // merge the per-page link collections into one set for the single document
            Collection<AnchorURL> pdflinksCombined = new HashSet<AnchorURL>();
            for (Collection<AnchorURL> pdflinksx : pdflinks) if (pdflinksx != null) pdflinksCombined.addAll(pdflinksx);
            result = new Document[] { new Document(
                    location,
                    mimeType,
                    "UTF-8",
                    this,
                    null,
                    docKeywords,
                    singleList(docTitle),
                    docAuthor,
                    docPublisher,
                    null,
                    null,
                    0.0f, 0.0f,
                    contentBytes,
                    pdflinksCombined,
                    null,
                    null,
                    false,
                    docDate) };
        }
    } catch (final Throwable e) {
        // every failure during extraction is swallowed here; result keeps whatever
        // value it had (possibly null)
        //close the writer (in finally)
        //throw new Parser.Failure(e.getMessage(), location);
    } finally {
        try {pdfDoc.close();} catch (final Throwable e) {}
    }

    // clear resources in pdfbox. they say that is resolved but it's not. see:
    // https://issues.apache.org/jira/browse/PDFBOX-313
    // https://issues.apache.org/jira/browse/PDFBOX-351
    // https://issues.apache.org/jira/browse/PDFBOX-441
    // pdfbox still generates an enormous number of object allocations and does not delete them
    // the following objects are statically stored and never flushed:
    // COSFloat, COSArray, COSInteger, COSObjectKey, COSObject, COSDictionary,
    // COSStream, COSString, COSName, COSDocument, COSInteger[], COSNull
    // the great number of these objects can easily be seen in Java Visual VM
    // we try to get them out of memory here by forced clear calls and hope for the best
    pdfDoc = null;
    clean_up_idiotic_PDFParser_font_cache_which_eats_up_tons_of_megabytes();

    return result;
}
From source file:neuralclassification.Classificator.java
/**
 * Reads the full text of the PDF located at {@code filepath + "/" + name}.
 *
 * @param filepath directory containing the PDF
 * @param name file name of the PDF inside {@code filepath}
 * @return the text returned by {@link PDFTextStripper#getText}
 * @throws RuntimeException wrapping any {@link IOException} raised while loading,
 *         stripping, or closing the document; a close failure is thrown from the
 *         {@code finally} block and therefore replaces any earlier exception
 */
String readText(String filepath, String name) {
    final File source = new File(filepath + "/" + name);
    PDDocument document = null;
    try {
        document = PDDocument.load(source);
        return new PDFTextStripper().getText(document);
    } catch (IOException readFailure) {
        throw new RuntimeException(readFailure);
    } finally {
        if (document != null) {
            try {
                document.close();
            } catch (IOException closeFailure) {
                throw new RuntimeException(closeFailure);
            }
        }
    }
}
From source file:neuralclassification.Trainer.java
String readText(String name) { PDDocument pdfDocument = null;// w w w.java 2 s .com String paper = null; try { pdfDocument = PDDocument.load(new File(filepath + "/" + name)); PDFTextStripper stripper = new PDFTextStripper(); paper = stripper.getText(pdfDocument); } catch (IOException e) { throw new RuntimeException(e); } finally { if (pdfDocument != null) try { pdfDocument.close(); } catch (IOException e) { throw new RuntimeException(e); } } return paper; }
From source file:no.digipost.print.validate.PdfValidator.java
License:Apache License
/** * @param pdfStream the input stream for reading the PDF. It will be closed before returning from * this method/*from w w w. jav a 2 s.c o m*/ * @param readStrategy decides if PDF is completely read into memory or not */ private PdfValidationResult validerForPrint(InputStream pdfStream, PdfValidationSettings printValideringsinnstillinger, PdfValidateStrategy readStrategy) { int antallSider = -1; try { List<PdfValidationError> errors; try { if (readStrategy == NON_SEQUENTIALLY) { try (EnhancedNonSequentialPDFParser dpostNonSequentialPDFParser = new EnhancedNonSequentialPDFParser( pdfStream)) { antallSider = dpostNonSequentialPDFParser.getNumberOfPages(); errors = validerStreamForPrint(dpostNonSequentialPDFParser, printValideringsinnstillinger); } } else if (readStrategy == FULLY_IN_MEMORY) { try (PDDocument pdDoc = PDDocument.load(pdfStream)) { antallSider = pdDoc.getNumberOfPages(); errors = validerDokumentForPrint(pdDoc, printValideringsinnstillinger); } } else { throw new IllegalArgumentException( "Unknown " + PdfValidateStrategy.class.getSimpleName() + ": " + readStrategy); } } catch (Exception e) { errors = asList(PdfValidationError.PDF_PARSE_ERROR); LOG.info("PDF-en kunne ikke parses. (" + e.getMessage() + ")"); LOG.debug(e.getMessage(), e); } return new PdfValidationResult(errors, antallSider); } finally { IOUtils.closeQuietly(pdfStream); } }
From source file:nominas.sei.form.Principal.java
private void ordenaNominas(String rutaEntrada, String rutaSalida) { ArrayList<PaginaNomina> paginasNomina = new ArrayList<PaginaNomina>(); for (int x = 0; x < 1; x++) {//RECORREMOS EL ARREGLO CON LOS NOMBRES DE ARCHIVO try {/* w ww. ja va 2 s.c o m*/ PDDocument pd = PDDocument.load(rutaEntrada); //CARGAR EL PDF List l = pd.getDocumentCatalog().getAllPages();//NUMERO LAS PAGINAS DEL ARCHIVO Object[] obj = l.toArray();//METO EN UN OBJETO LA LISTA DE PAGINAS PARA MANIPULARLA for (int i = 0; i < l.size(); i++) { PDPage page = (PDPage) obj[i];//PAGE ES LA PAGINA 1 DE LA QUE CONSTA EL ARCHIVO PageFormat pageFormat = pd.getPageFormat(0);//PROPIEDADES DE LA PAGINA (FORMATO) Double d1 = new Double(pageFormat.getHeight());//ALTO Double d2 = new Double(pageFormat.getWidth());//ANCHO int width = d1.intValue();//ANCHO int eigth = 1024;//ALTO PDFTextStripperByArea stripper = new PDFTextStripperByArea();//COMPONENTE PARA ACCESO AL TEXTO Rectangle rect = new Rectangle(0, 0, width, eigth);//DEFNIR AREA DONDE SE BUSCARA EL TEXTO stripper.addRegion("area1", rect);//REGISTRAMOS LA REGION CON UN NOMBRE stripper.extractRegions(page);//EXTRAE TEXTO DEL AREA String contenido = new String();//CONTENIDO = A LO QUE CONTENGA EL AREA O REGION contenido = (stripper.getTextForRegion("area1")); String[] lines = contenido.split("[\\r\\n]+"); String nombre = lines[1].substring(28, lines[1].length() - 10);//Separamos el nombre PaginaNomina nomina = new PaginaNomina(page, nombre); paginasNomina.add(nomina); } Collections.sort(paginasNomina); // Create a new empty document PDDocument document = new PDDocument(); for (int i = 0; i < paginasNomina.size(); i++) { System.out.println(paginasNomina.get(i).getNombre()); document.addPage(paginasNomina.get(i).getPagina()); } // Save the newly created document document.save(rutaSalida); // finally make sure that the document is properly // closed. 
document.close(); pd.close();//CERRAMOS OBJETO ACROBAT } catch (Exception e) { System.out.println(e.getMessage()); } //CATCH } //FOR }