List of usage examples for org.apache.pdfbox.pdfparser PDFParser getDocument
public COSDocument getDocument() throws IOException
From source file:com.amandine.NewEmptyJUnitTest.java
public String pdflookbook() throws IOException { String filePath = "C:\\Users\\janitha\\OneDrive\\Documents\\lookbookSS2016.pdf"; InputStream inputStream = null; String statementPDF = null;//from w ww .j a va 2 s. c o m try { inputStream = new FileInputStream(filePath); PDFParser parser = new PDFParser(inputStream); // This will parse the stream and populate the COSDocument object. parser.parse(); // Get the document that was parsed. COSDocument cosDoc = parser.getDocument(); // This class will take a pdf document and strip out all of the text and // ignore the formatting and such. PDFTextStripper pdfStripper = new PDFTextStripper(); // This is the in-memory representation of the PDF document PDDocument pdDoc = new PDDocument(cosDoc); pdfStripper.setStartPage(3); pdfStripper.setEndPage(pdDoc.getNumberOfPages() - 1); assertEquals(41, pdDoc.getNumberOfPages() - 1); // This will return the text of a document. statementPDF = pdfStripper.getText(pdDoc); // System.out.println(statementPDF); // String [] statementPDFArray = statementPDF.split("\\n"); // assertEquals(256, statementPDFArray.length); } catch (Exception e) { //Syste String errorMessage = "\nUnexpected Exception: " + e.getClass() + "\n" + e.getMessage(); for (StackTraceElement trace : e.getStackTrace()) { errorMessage += "\n\t" + trace; } System.out.println(errorMessage); } finally { if (inputStream != null) { inputStream.close(); } } return statementPDF; }
From source file:com.cisco.iwe.services.util.EmailMonitor.java
/**
 * Scans an uploaded expense receipt in PDF format and extracts the text embedded in it.
 *
 * <p>Problems are reported on standard error; the method itself does not throw for a missing
 * file or for a parsing failure.
 *
 * @param fileDir path of the PDF file to scan
 * @return the text of all pages, or {@code null} if the path is not a regular file, the parser
 *         could not be opened, or parsing/extraction failed
 */
public String scanPDF(String fileDir) {
    File receipt = new File(fileDir);
    if (!receipt.isFile()) {
        System.err.println("File " + fileDir + " does not exist.");
        return null;
    }

    PDFParser pdfParser;
    try {
        pdfParser = new PDFParser(new FileInputStream(receipt));
    } catch (IOException openFailure) {
        System.err.println("Unable to open PDF Parser. " + openFailure.getMessage());
        return null;
    }

    String extractedText = null;
    COSDocument lowLevelDoc = null;
    PDDocument document = null;
    try {
        pdfParser.parse();
        lowLevelDoc = pdfParser.getDocument();
        PDFTextStripper stripper = new PDFTextStripper();
        document = new PDDocument(lowLevelDoc);
        // Extract every page: 1 .. page count.
        stripper.setStartPage(1);
        stripper.setEndPage(document.getNumberOfPages());
        extractedText = stripper.getText(document);
    } catch (Exception parseFailure) {
        System.err.println("An exception occured in parsing the PDF Document." + parseFailure.getMessage());
    } finally {
        try {
            if (lowLevelDoc != null) {
                lowLevelDoc.close();
            }
            if (document != null) {
                document.close();
            }
        } catch (Exception closeFailure) {
            closeFailure.printStackTrace();
        }
    }
    return extractedText;
}
From source file:com.exlibris.dps.repository.plugin.riskExtractor.drmlint.PDFBoxWrapper.java
License:Apache License
/** * Check if a PDF file is valid or not/* w ww. j av a2s.c o m*/ * @param pFile file to check * @return whether the file is valid or not */ public static boolean isValid(File pFile) { boolean ret = false; try { PDFParser parser = new PDFParser(new FileInputStream(pFile)); parser.parse(); File temp = File.createTempFile("drmlint-temp-", ".pdf"); parser.getPDDocument().save(temp); parser.getDocument().close(); temp.delete(); ret = true; } catch (FileNotFoundException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (COSVisitorException e) { // TODO Auto-generated catch block ret = false; } return ret; }
From source file:com.exlibris.dps.repository.plugin.riskExtractor.drmlint.PDFBoxWrapper.java
License:Apache License
/** * Check if a PDF file has DRM or not//from w w w . j av a2 s . c o m * @param pFile file to check * @return whether the file is had DRM or not */ public static boolean hasDRM(File pFile) { boolean ret = false; try { PDFParser parser = new PDFParser(new FileInputStream(pFile)); parser.parse(); ret = parser.getDocument().isEncrypted(); parser.getDocument().close(); } catch (FileNotFoundException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } return ret; }
From source file:com.exlibris.dps.repository.plugin.riskExtractor.drmlint.PDFBoxWrapper.java
License:Apache License
/** * Check for encryption with Apache PDFBox * -> query the encryption dictionary (might allow more granular checks of protection) * @param pPDF pdf file to check//from ww w.j av a 2 s . c o m * @return whether or not the file has DRM */ public static boolean hasDRMGranular(File pPDF) { boolean ret = false; try { PDFParser parser = new PDFParser(new FileInputStream(pPDF)); parser.parse(); COSDictionary dict = parser.getDocument().getEncryptionDictionary(); if (dict != null) { //print encryption dictionary // for(COSName key:dict.keySet()) { // System.out.print(key.getName()); // String value = dict.getString(key); // if(value!=null){ // System.out.println(": "+value); // } else { // System.out.println(": "+dict.getLong(key)); // } // } //this feaure in pdfbox is currently broken, see: https://issues.apache.org/jira/browse/PDFBOX-1651 //AccessPermission perms = parser.getPDDocument().getCurrentAccessPermission(); //this is a work around; creating a new object from the data AccessPermission perms = new AccessPermission(dict.getInt("P")); boolean debug = false; if (debug) { System.out.println("canAssembleDocument() : " + perms.canAssembleDocument()); System.out.println("canExtractContent() : " + perms.canExtractContent()); System.out.println("canExtractForAccessibility() : " + perms.canExtractForAccessibility()); System.out.println("canFillInForm() : " + perms.canFillInForm()); System.out.println("canModify() : " + perms.canModify()); System.out.println("canModifyAnnotations() : " + perms.canModifyAnnotations()); System.out.println("canPrint() : " + perms.canPrint()); System.out.println("canPrintDegraded() : " + perms.canPrintDegraded()); System.out.println("isOwnerPermission() : " + perms.isOwnerPermission()); System.out.println("isReadOnly() : " + perms.isReadOnly()); } } parser.getDocument().close(); } catch (FileNotFoundException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block 
e.printStackTrace(); } return ret; }
From source file:com.lanacion.adminsiteln.services.PdfIndexerService.PdfIndexerService.java
/** * Metodos privados para la indexacin/* ww w. java 2s .c o m*/ */ private String pdftoText(String fileName, int pagina) { PDFParser parser; String parsedText = null; ; PDFTextStripper pdfStripper = null; //pdfStripper.setStartPage(0); //pdfStripper.setEndPage(0); PDDocument pdDoc = null; COSDocument cosDoc = null; File file = new File(fileName); if (!file.isFile()) { System.err.println("File " + fileName + " does not exist."); return null; } try { parser = new PDFParser(new FileInputStream(file)); } catch (IOException e) { System.err.println("Unable to open PDF Parser. " + e.getMessage()); return null; } try { parser.parse(); cosDoc = parser.getDocument(); pdfStripper = new PDFTextStripper(); pdDoc = new PDDocument(cosDoc); pdfStripper.setStartPage(pagina); pdfStripper.setEndPage(pagina); parsedText = pdfStripper.getText(pdDoc); } catch (Exception e) { System.err.println("An exception occured in parsing the PDF Document." + e.getMessage()); } finally { try { if (cosDoc != null) { cosDoc.close(); } if (pdDoc != null) { pdDoc.close(); } } catch (Exception e) { e.printStackTrace(); } } return parsedText; }
From source file:com.lanacion.adminsiteln.services.PdfIndexerService.PdfIndexerService.java
private int pdfgetPages(String fileName) { int numero_paginas = 0; PDFParser parser; String parsedText = null;//from w ww.j av a 2s .com ; PDFTextStripper pdfStripper = null; //pdfStripper.setStartPage(0); //pdfStripper.setEndPage(0); PDDocument pdDoc = null; COSDocument cosDoc = null; File file = new File(fileName); if (!file.isFile()) { System.err.println("File " + fileName + " does not exist."); return 0; } try { parser = new PDFParser(new FileInputStream(file)); } catch (IOException e) { System.err.println("Unable to open PDF Parser. " + e.getMessage()); return 0; } try { parser.parse(); cosDoc = parser.getDocument(); pdfStripper = new PDFTextStripper(); pdDoc = new PDDocument(cosDoc); numero_paginas = pdDoc.getNumberOfPages(); } catch (Exception e) { System.err.println("An exception occured in parsing the PDF Document." + e.getMessage()); } finally { try { if (cosDoc != null) { cosDoc.close(); } if (pdDoc != null) { pdDoc.close(); } } catch (Exception e) { e.printStackTrace(); } } return numero_paginas; }
From source file:com.sastix.cms.common.services.htmltopdf.PdfTest.java
License:Apache License
@Test public void testPdfFromStringTo() throws Exception { // GIVEN an html template containing special characters that java stores in utf-16 internally Pdf pdf = pdfBuilder.build();//from w w w.jav a 2 s.co m pdf.addPage("<html><head><meta charset=\"utf-8\"></head><h1>Mller</h1></html>", PageType.htmlAsString); String tempFolder = temporaryFolder.newFolder().getPath(); pdf.saveAs(tempFolder + "/output.pdf"); // WHEN byte[] pdfBytes = pdf.getPDF(); PDFParser parser = new PDFParser( new RandomAccessBufferedFileInputStream(new ByteArrayInputStream(pdfBytes))); // that is a valid PDF (otherwise an IOException occurs) parser.parse(); PDFTextStripper pdfTextStripper = new PDFTextStripper(); String pdfText = pdfTextStripper.getText(new PDDocument(parser.getDocument())); assertThat("document should contain the creditorName", pdfText, containsString("Mller")); }
From source file:com.validation.manager.core.server.core.AttachmentServerTest.java
License:Apache License
/** * Test of addFile method, of class AttachmentServer. *///from w w w . java 2 s.co m @Test public void testAddRetrieveTextFile() { try { System.out.println("add text File"); File f = new File("target/Test.txt"); f.deleteOnExit(); List<String> lines = Arrays.asList("The first line", "The second line"); Path file = Paths.get(f.getAbsolutePath()); Files.write(file, lines, Charset.forName("UTF-8")); AttachmentServer instance = new AttachmentServer(); instance.addFile(f, f.getName()); instance.write2DB(); //Delete the file FileUtils.delete(f.getAbsolutePath()); assertEquals(1, (int) instance.getAttachmentType().getId());//Text file System.out.println("retrieveFile"); AttachmentServer temp = new AttachmentServer(instance.getAttachmentPK()); File loadedFile = temp.getAttachedFile("target/loaded/"); BufferedReader br = new BufferedReader(new FileReader(loadedFile)); String line; int count = 0; while ((line = br.readLine()) != null) { assertEquals(lines.get(count), line); System.out.println(line); count++; } assertEquals(lines.size(), count); //Create pdf file System.out.println("add pdf File"); File pdf = Tool.convertToPDF(loadedFile, "target/Text.pdf"); pdf.deleteOnExit(); instance = new AttachmentServer(); instance.addFile(pdf, pdf.getName()); instance.write2DB(); //Delete the file FileUtils.delete(pdf.getAbsolutePath()); assertEquals(2, (int) instance.getAttachmentType().getId());//PDF file System.out.println("retrieveFile"); temp = new AttachmentServer(instance.getAttachmentPK()); loadedFile = temp.getAttachedFile("target/loaded/"); PDFTextStripper pdfStripper; PDDocument pdDoc = null; COSDocument cosDoc = null; try { PDFParser parser = new PDFParser(new RandomAccessBufferedFileInputStream(loadedFile)); parser.parse(); cosDoc = parser.getDocument(); pdfStripper = new PDFTextStripper(); pdDoc = new PDDocument(cosDoc); pdfStripper.setStartPage(1); pdfStripper.setEndPage(1); String parsedText = pdfStripper.getText(pdDoc); System.out.println(parsedText); } catch 
(IOException ex) { Exceptions.printStackTrace(ex); fail(); } finally { if (cosDoc != null) { cosDoc.close(); } if (pdDoc != null) { pdDoc.close(); } } } catch (IOException | VMException ex) { Exceptions.printStackTrace(ex); fail(); } }
From source file:cz.muni.pdfjbim.PdfImageExtractor.java
License:Apache License
/** * This method extracts images by going through all COSObjects pointed from xref table * @param is input stream containing PDF file * @param prefix output basename for images * @param password password for access to PDF if needed * @param pagesToProcess list of pages which should be processed if null given => processed all pages * -- not working yet//from www. ja v a 2s. c om * @param binarize -- enables processing of nonbitonal images as well (LZW is still not * processed because of output with inverted colors) * @throws PdfRecompressionException if problem to extract images from PDF */ public void extractImagesUsingPdfParser(InputStream is, String prefix, String password, Set<Integer> pagesToProcess, Boolean binarize) throws PdfRecompressionException { // checking arguments and setting appropriate variables if (binarize == null) { binarize = false; } log.debug("Extracting images (binarize set to {})", binarize); InputStream inputStream = null; if (password != null) { try (ByteArrayOutputStream decryptedOutputStream = new ByteArrayOutputStream()) { PdfReader reader = new PdfReader(is, password.getBytes(StandardCharsets.UTF_8)); PdfStamper stamper = new PdfStamper(reader, decryptedOutputStream); if (stamper != null) { stamper.close(); } inputStream = new ByteArrayInputStream(decryptedOutputStream.toByteArray()); } catch (DocumentException ex) { throw new PdfRecompressionException(ex); } catch (IOException ex) { throw new PdfRecompressionException("Reading file caused exception", ex); } } else { inputStream = is; } PDFParser parser = null; COSDocument doc = null; try { parser = new PDFParser(inputStream); parser.parse(); doc = parser.getDocument(); List<COSObject> objs = doc.getObjectsByType(COSName.XOBJECT); if (objs != null) { for (COSObject obj : objs) { COSBase subtype = obj.getItem(COSName.SUBTYPE); if (subtype.toString().equalsIgnoreCase("COSName{Image}")) { COSBase imageObj = obj.getObject(); COSBase cosNameObj = obj.getItem(COSName.NAME); String key; if 
(cosNameObj != null) { String cosNameKey = cosNameObj.toString(); int startOfKey = cosNameKey.indexOf("{") + 1; key = cosNameKey.substring(startOfKey, cosNameKey.length() - 1); } else { key = "im0"; } int objectNum = obj.getObjectNumber().intValue(); int genNum = obj.getGenerationNumber().intValue(); PDXObjectImage image = (PDXObjectImage) PDXObjectImage.createXObject(imageObj); PDStream pdStr = new PDStream(image.getCOSStream()); List<COSName> filters = pdStr.getFilters(); log.debug("Detected image with color depth: {} bits", image.getBitsPerComponent()); if (filters == null) { continue; } log.debug("Detected filters: {}", filters.toString()); if ((image.getBitsPerComponent() > 1) && (!binarize)) { log.info("It is not a bitonal image => skipping"); continue; } // at this moment for preventing bad output (bad coloring) from LZWDecode filter if (filters.contains(COSName.LZW_DECODE)) { log.info("This is LZWDecoded => skipping"); continue; } if (filters.contains(COSName.FLATE_DECODE)) { log.debug("FlateDecoded image detected"); } if (filters.contains(COSName.JBIG2_DECODE)) { if (skipJBig2Images) { log.warn("Allready compressed according to JBIG2 standard => skipping"); continue; } else { log.debug("JBIG2 image detected"); } } // detection of unsupported filters by pdfBox library if (filters.contains(COSName.JPX_DECODE)) { log.warn("Unsupported filter JPXDecode => skipping"); continue; } String name = getUniqueFileName(prefix, image.getSuffix()); log.info("Writing image: {}", name); image.write2file(name); PdfImageInformation pdfImageInfo = new PdfImageInformation(key, image.getWidth(), image.getHeight(), objectNum, genNum); originalImageInformations.add(pdfImageInfo); namesOfImages.add(name + "." 
+ image.getSuffix()); } } } } catch (IOException ex) { Tools.deleteFilesFromList(namesOfImages); throw new PdfRecompressionException("Unable to parse PDF document", ex); } catch (Exception ex) { Tools.deleteFilesFromList(namesOfImages); } finally { if (doc != null) { try { doc.close(); } catch (IOException ex) { throw new PdfRecompressionException(ex); } } } }