Example usage for org.apache.pdfbox.pdfparser PDFParser getDocument

List of usage examples for org.apache.pdfbox.pdfparser PDFParser getDocument

Introduction

On this page you can find example usages of the getDocument method of org.apache.pdfbox.pdfparser.PDFParser.

Prototype

public COSDocument getDocument() throws IOException 

Source Link

Document

This will get the document that was parsed.

Usage

From source file:com.amandine.NewEmptyJUnitTest.java

/**
 * Extracts the text of the look-book PDF stored at a fixed local path, starting at page 3 and
 * ending at the second-to-last page.
 *
 * <p>Any {@link Exception} raised while parsing or extracting is printed to standard output
 * (class, message and stack trace) and {@code null} is returned. A failed page-count
 * assertion is an {@code Error}, so it is not caught here and propagates to the caller.
 *
 * @return the extracted text, or {@code null} if an exception was caught
 * @throws IOException if closing the document or the input stream fails
 */
public String pdflookbook() throws IOException {
    String filePath = "C:\\Users\\janitha\\OneDrive\\Documents\\lookbookSS2016.pdf";
    InputStream inputStream = null;
    COSDocument cosDoc = null;
    PDDocument pdDoc = null;
    String statementPDF = null;
    try {
        inputStream = new FileInputStream(filePath);
        PDFParser parser = new PDFParser(inputStream);

        // Parse the stream and populate the COSDocument object.
        parser.parse();

        // Get the document that was parsed.
        cosDoc = parser.getDocument();

        // Strips the text out of a PDF document, ignoring formatting.
        PDFTextStripper pdfStripper = new PDFTextStripper();

        // In-memory representation of the PDF document.
        pdDoc = new PDDocument(cosDoc);
        pdfStripper.setStartPage(3);
        pdfStripper.setEndPage(pdDoc.getNumberOfPages() - 1);
        assertEquals(41, pdDoc.getNumberOfPages() - 1);

        statementPDF = pdfStripper.getText(pdDoc);
    } catch (Exception e) {
        // Build the report with a StringBuilder instead of repeated String concatenation.
        StringBuilder errorMessage = new StringBuilder("\nUnexpected Exception: ")
                .append(e.getClass()).append("\n").append(e.getMessage());
        for (StackTraceElement trace : e.getStackTrace()) {
            errorMessage.append("\n\t").append(trace);
        }
        System.out.println(errorMessage);
    } finally {
        try {
            // Per the PDFBox API docs, closing the PDDocument also closes the COSDocument it
            // wraps; fall back to the COSDocument when the wrapper was never created.
            if (pdDoc != null) {
                pdDoc.close();
            } else if (cosDoc != null) {
                cosDoc.close();
            }
        } finally {
            // Release the file handle even if closing the document failed.
            if (inputStream != null) {
                inputStream.close();
            }
        }
    }
    return statementPDF;
}

From source file:com.cisco.iwe.services.util.EmailMonitor.java

/**
 * Scans an uploaded expense receipt in PDF format and extracts the text embedded in it.
 *
 * <p>Failures are reported on {@code System.err}; nothing is thrown for I/O or parsing
 * problems.
 *
 * @param fileDir path of the PDF file to read
 * @return the text of all pages, or {@code null} if the path is not a regular file, the file
 *         cannot be opened, or parsing/extraction fails
 */
public String scanPDF(String fileDir) {
    File file = new File(fileDir);
    if (!file.isFile()) {
        System.err.println("File " + fileDir + " does not exist.");
        return null;
    }

    String parsedText = null;
    FileInputStream input = null;
    COSDocument cosDoc = null;
    PDDocument pdDoc = null;
    try {
        PDFParser parser;
        try {
            // Keep a reference to the stream so it can be released in the finally block
            // no matter how far parsing gets.
            input = new FileInputStream(file);
            parser = new PDFParser(input);
        } catch (IOException e) {
            System.err.println("Unable to open PDF Parser. " + e.getMessage());
            return null;
        }
        parser.parse();
        cosDoc = parser.getDocument();
        PDFTextStripper pdfStripper = new PDFTextStripper();
        pdDoc = new PDDocument(cosDoc);
        pdfStripper.setStartPage(1);
        pdfStripper.setEndPage(pdDoc.getNumberOfPages());
        parsedText = pdfStripper.getText(pdDoc);
    } catch (Exception e) {
        System.err.println("An exception occured in parsing the PDF Document." + e.getMessage());
    } finally {
        // Close each resource independently so one failure does not skip the other.
        try {
            // Per the PDFBox API docs, closing the PDDocument also closes its COSDocument.
            if (pdDoc != null) {
                pdDoc.close();
            } else if (cosDoc != null) {
                cosDoc.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        try {
            if (input != null) {
                input.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    return parsedText;
}

From source file:com.exlibris.dps.repository.plugin.riskExtractor.drmlint.PDFBoxWrapper.java

License:Apache License

/**
 * Check if a PDF file is valid or not/*  w ww.  j  av a2s.c o m*/
 * @param pFile file to check
 * @return whether the file is valid or not
 */
public static boolean isValid(File pFile) {
    boolean ret = false;
    try {
        PDFParser parser = new PDFParser(new FileInputStream(pFile));
        parser.parse();
        File temp = File.createTempFile("drmlint-temp-", ".pdf");
        parser.getPDDocument().save(temp);
        parser.getDocument().close();
        temp.delete();
        ret = true;
    } catch (FileNotFoundException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    } catch (IOException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    } catch (COSVisitorException e) {
        // TODO Auto-generated catch block
        ret = false;
    }
    return ret;
}

From source file:com.exlibris.dps.repository.plugin.riskExtractor.drmlint.PDFBoxWrapper.java

License:Apache License

/**
 * Reports whether a PDF file is flagged as encrypted by PDFBox.
 *
 * <p>I/O problems (including a missing file) are printed via {@code printStackTrace()} and do
 * not propagate to the caller.
 *
 * @param pFile file to check
 * @return the value of {@code isEncrypted()} for the parsed document, or {@code false} if the
 *         file could not be opened or parsed
 */
public static boolean hasDRM(File pFile) {
    boolean encrypted = false;
    try {
        PDFParser pdfParser = new PDFParser(new FileInputStream(pFile));
        pdfParser.parse();
        encrypted = pdfParser.getDocument().isEncrypted();
        pdfParser.getDocument().close();
    } catch (IOException failure) {
        // FileNotFoundException is an IOException, so a single handler covers both cases.
        failure.printStackTrace();
    }
    return encrypted;
}

From source file:com.exlibris.dps.repository.plugin.riskExtractor.drmlint.PDFBoxWrapper.java

License:Apache License

/**
 * Checks for encryption with Apache PDFBox by querying the document's encryption dictionary.
 *
 * <p>The previous implementation never assigned its result and therefore always returned
 * {@code false}. The file is now reported as having DRM whenever an encryption dictionary is
 * present.
 *
 * <p>NOTE(review): finer-grained checks could inspect the permission bits stored under the
 * {@code "P"} key of the encryption dictionary (e.g. via {@code AccessPermission}). The original
 * author noted that {@code PDDocument.getCurrentAccessPermission()} was broken at the time,
 * see https://issues.apache.org/jira/browse/PDFBOX-1651 - confirm before relying on it.
 *
 * <p>I/O problems (including a missing file) are printed via {@code printStackTrace()} and do
 * not propagate to the caller.
 *
 * @param pPDF pdf file to check
 * @return {@code true} if the document has an encryption dictionary; {@code false} if it has
 *         none or the file could not be opened or parsed
 */
public static boolean hasDRMGranular(File pPDF) {
    boolean ret = false;
    FileInputStream in = null;
    PDFParser parser = null;
    // getDocument() is only safe to call once parse() has completed, so track that explicitly.
    boolean parsed = false;

    try {
        in = new FileInputStream(pPDF);
        parser = new PDFParser(in);
        parser.parse();
        parsed = true;

        COSDictionary dict = parser.getDocument().getEncryptionDictionary();
        ret = dict != null;
    } catch (IOException e) {
        // Also covers FileNotFoundException, which was handled identically before.
        e.printStackTrace();
    } finally {
        if (parsed) {
            try {
                parser.getDocument().close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        if (in != null) {
            try {
                in.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    return ret;
}

From source file:com.lanacion.adminsiteln.services.PdfIndexerService.PdfIndexerService.java

/**
 * Metodos privados para la indexacin/*  ww  w.  java 2s .c o m*/
 */
private String pdftoText(String fileName, int pagina) {

    PDFParser parser;
    String parsedText = null;
    ;
    PDFTextStripper pdfStripper = null;
    //pdfStripper.setStartPage(0);
    //pdfStripper.setEndPage(0);
    PDDocument pdDoc = null;
    COSDocument cosDoc = null;
    File file = new File(fileName);
    if (!file.isFile()) {
        System.err.println("File " + fileName + " does not exist.");
        return null;
    }
    try {
        parser = new PDFParser(new FileInputStream(file));
    } catch (IOException e) {
        System.err.println("Unable to open PDF Parser. " + e.getMessage());
        return null;
    }
    try {
        parser.parse();
        cosDoc = parser.getDocument();
        pdfStripper = new PDFTextStripper();
        pdDoc = new PDDocument(cosDoc);
        pdfStripper.setStartPage(pagina);
        pdfStripper.setEndPage(pagina);
        parsedText = pdfStripper.getText(pdDoc);
    } catch (Exception e) {
        System.err.println("An exception occured in parsing the PDF Document." + e.getMessage());
    } finally {
        try {
            if (cosDoc != null) {
                cosDoc.close();
            }
            if (pdDoc != null) {
                pdDoc.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
    return parsedText;

}

From source file:com.lanacion.adminsiteln.services.PdfIndexerService.PdfIndexerService.java

/**
 * Returns the number of pages of a PDF file.
 *
 * <p>Failures are reported on {@code System.err}; nothing is thrown for I/O or parsing
 * problems.
 *
 * @param fileName path of the PDF file to inspect
 * @return the page count, or {@code 0} if the path is not a regular file, the file cannot be
 *         opened, or parsing fails
 */
private int pdfgetPages(String fileName) {
    File file = new File(fileName);
    if (!file.isFile()) {
        System.err.println("File " + fileName + " does not exist.");
        return 0;
    }

    int numero_paginas = 0;
    FileInputStream input = null;
    COSDocument cosDoc = null;
    PDDocument pdDoc = null;
    try {
        PDFParser parser;
        try {
            // Keep a reference to the stream so it can be released in the finally block
            // no matter how far parsing gets.
            input = new FileInputStream(file);
            parser = new PDFParser(input);
        } catch (IOException e) {
            System.err.println("Unable to open PDF Parser. " + e.getMessage());
            return 0;
        }
        parser.parse();
        cosDoc = parser.getDocument();
        // No text stripper is needed just to count pages; the one that used to be created
        // here was never used.
        pdDoc = new PDDocument(cosDoc);
        numero_paginas = pdDoc.getNumberOfPages();
    } catch (Exception e) {
        System.err.println("An exception occured in parsing the PDF Document." + e.getMessage());
    } finally {
        // Close each resource independently so one failure does not skip the other.
        try {
            // Per the PDFBox API docs, closing the PDDocument also closes its COSDocument.
            if (pdDoc != null) {
                pdDoc.close();
            } else if (cosDoc != null) {
                cosDoc.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        try {
            if (input != null) {
                input.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    return numero_paginas;
}

From source file:com.sastix.cms.common.services.htmltopdf.PdfTest.java

License:Apache License

/**
 * Verifies that a PDF rendered from an HTML string keeps a non-ASCII character intact: the
 * generated bytes must parse as a PDF and the extracted text must contain the original word.
 *
 * <p>The umlaut is written as a Unicode escape so the fixture does not depend on the encoding
 * of this source file (the literal had lost the character, leaving "Mller").
 */
@Test
public void testPdfFromStringTo() throws Exception {

    // GIVEN an html template containing special characters that java stores in utf-16 internally
    Pdf pdf = pdfBuilder.build();
    pdf.addPage("<html><head><meta charset=\"utf-8\"></head><h1>M\u00fcller</h1></html>",
            PageType.htmlAsString);

    String tempFolder = temporaryFolder.newFolder().getPath();
    pdf.saveAs(tempFolder + "/output.pdf");

    // WHEN
    byte[] pdfBytes = pdf.getPDF();

    String pdfText;
    RandomAccessBufferedFileInputStream source =
            new RandomAccessBufferedFileInputStream(new ByteArrayInputStream(pdfBytes));
    PDDocument document = null;
    try {
        PDFParser parser = new PDFParser(source);

        // that is a valid PDF (otherwise an IOException occurs)
        parser.parse();
        PDFTextStripper pdfTextStripper = new PDFTextStripper();
        document = new PDDocument(parser.getDocument());
        pdfText = pdfTextStripper.getText(document);
    } finally {
        // Release the parsed document and then the random-access source backing the parser.
        try {
            if (document != null) {
                document.close();
            }
        } finally {
            source.close();
        }
    }

    assertThat("document should contain the creditorName", pdfText, containsString("M\u00fcller"));
}

From source file:com.validation.manager.core.server.core.AttachmentServerTest.java

License:Apache License

/**
 * Test of addFile method, of class AttachmentServer.
 *
 * <p>Stores a two-line text file as an attachment, reloads it and compares it line by line;
 * then converts the reloaded file to PDF, stores and reloads that, and extracts the text of
 * its first page. Any {@code IOException} or {@code VMException} fails the test.
 */
@Test
public void testAddRetrieveTextFile() {
    try {
        System.out.println("add text File");
        File f = new File("target/Test.txt");
        f.deleteOnExit();
        List<String> lines = Arrays.asList("The first line", "The second line");
        Path file = Paths.get(f.getAbsolutePath());
        Files.write(file, lines, Charset.forName("UTF-8"));
        AttachmentServer instance = new AttachmentServer();
        instance.addFile(f, f.getName());
        instance.write2DB();
        //Delete the file
        FileUtils.delete(f.getAbsolutePath());
        assertEquals(1, (int) instance.getAttachmentType().getId());//Text file
        System.out.println("retrieveFile");
        AttachmentServer temp = new AttachmentServer(instance.getAttachmentPK());
        File loadedFile = temp.getAttachedFile("target/loaded/");
        int count = 0;
        // Read back with the same charset the file was written in, and close the reader so
        // the file handle is released before the file is reused below.
        try (BufferedReader br = Files.newBufferedReader(loadedFile.toPath(),
                Charset.forName("UTF-8"))) {
            String line;
            while ((line = br.readLine()) != null) {
                assertEquals(lines.get(count), line);
                System.out.println(line);
                count++;
            }
        }
        assertEquals(lines.size(), count);
        //Create pdf file
        System.out.println("add pdf File");
        File pdf = Tool.convertToPDF(loadedFile, "target/Text.pdf");
        pdf.deleteOnExit();
        instance = new AttachmentServer();
        instance.addFile(pdf, pdf.getName());
        instance.write2DB();
        //Delete the file
        FileUtils.delete(pdf.getAbsolutePath());
        assertEquals(2, (int) instance.getAttachmentType().getId());//PDF file
        System.out.println("retrieveFile");
        temp = new AttachmentServer(instance.getAttachmentPK());
        loadedFile = temp.getAttachedFile("target/loaded/");
        PDFTextStripper pdfStripper;
        PDDocument pdDoc = null;
        COSDocument cosDoc = null;
        RandomAccessBufferedFileInputStream source = null;
        try {
            source = new RandomAccessBufferedFileInputStream(loadedFile);
            PDFParser parser = new PDFParser(source);
            parser.parse();
            cosDoc = parser.getDocument();
            pdfStripper = new PDFTextStripper();
            pdDoc = new PDDocument(cosDoc);
            pdfStripper.setStartPage(1);
            pdfStripper.setEndPage(1);
            String parsedText = pdfStripper.getText(pdDoc);
            System.out.println(parsedText);
        } catch (IOException ex) {
            Exceptions.printStackTrace(ex);
            fail();
        } finally {
            try {
                if (cosDoc != null) {
                    cosDoc.close();
                }
                if (pdDoc != null) {
                    pdDoc.close();
                }
            } finally {
                // Release the random-access source even if closing the document failed.
                if (source != null) {
                    source.close();
                }
            }
        }
    } catch (IOException | VMException ex) {
        Exceptions.printStackTrace(ex);
        fail();
    }
}

From source file:cz.muni.pdfjbim.PdfImageExtractor.java

License:Apache License

/**
 * Extracts images by going through all XObject COSObjects found in the parsed document and
 * writing each accepted image to a file.
 *
 * <p>An image is skipped when it has no filters, when it is not bitonal and {@code binarize}
 * is off, when it uses the LZWDecode or JPXDecode filter, or when it is JBIG2-encoded and
 * {@code skipJBig2Images} is set. For every written image its file name is added to
 * {@code namesOfImages} and its metadata to {@code originalImageInformations}.
 *
 * <p>If anything fails during extraction, the files listed in {@code namesOfImages} are
 * deleted. An {@code IOException} is rethrown as {@code PdfRecompressionException}; any other
 * exception is logged and the method returns normally.
 *
 * @param is input stream containing PDF file
 * @param prefix output basename for images
 * @param password password for access to PDF if needed; {@code null} means the stream is
 *      parsed as-is
 * @param pagesToProcess list of pages which should be processed if null given => processed all pages
 *      -- not working yet (the parameter is currently not read)
 * @param binarize -- enables processing of nonbitonal images as well (LZW is still not
 *      processed because of output with inverted colors); {@code null} is treated as false
 * @throws PdfRecompressionException if problem to extract images from PDF
 */
public void extractImagesUsingPdfParser(InputStream is, String prefix, String password,
        Set<Integer> pagesToProcess, Boolean binarize) throws PdfRecompressionException {
    // checking arguments and setting appropriate variables
    if (binarize == null) {
        binarize = false;
    }

    log.debug("Extracting images (binarize set to {})", binarize);

    InputStream inputStream = null;
    if (password != null) {
        // Re-write the password-protected input through a stamper into memory and parse
        // the resulting bytes instead of the original stream.
        try (ByteArrayOutputStream decryptedOutputStream = new ByteArrayOutputStream()) {
            PdfReader reader = new PdfReader(is, password.getBytes(StandardCharsets.UTF_8));
            PdfStamper stamper = new PdfStamper(reader, decryptedOutputStream);
            // A freshly constructed stamper is never null, so it is closed unconditionally.
            stamper.close();
            inputStream = new ByteArrayInputStream(decryptedOutputStream.toByteArray());
        } catch (DocumentException ex) {
            throw new PdfRecompressionException(ex);
        } catch (IOException ex) {
            throw new PdfRecompressionException("Reading file caused exception", ex);
        }
    } else {
        inputStream = is;
    }

    PDFParser parser = null;
    COSDocument doc = null;
    try {
        parser = new PDFParser(inputStream);
        parser.parse();
        doc = parser.getDocument();

        List<COSObject> objs = doc.getObjectsByType(COSName.XOBJECT);
        if (objs != null) {
            for (COSObject obj : objs) {
                COSBase subtype = obj.getItem(COSName.SUBTYPE);
                // An XObject without a Subtype entry is simply not an image. Without the
                // null check it raised a NullPointerException that aborted the whole
                // extraction and deleted every image written so far.
                if (subtype != null && subtype.toString().equalsIgnoreCase("COSName{Image}")) {
                    COSBase imageObj = obj.getObject();
                    COSBase cosNameObj = obj.getItem(COSName.NAME);
                    String key;
                    if (cosNameObj != null) {
                        // Take the text between the first '{' and the final character of
                        // the object's string form as the key.
                        String cosNameKey = cosNameObj.toString();
                        int startOfKey = cosNameKey.indexOf("{") + 1;
                        key = cosNameKey.substring(startOfKey, cosNameKey.length() - 1);
                    } else {
                        key = "im0";
                    }
                    int objectNum = obj.getObjectNumber().intValue();
                    int genNum = obj.getGenerationNumber().intValue();
                    PDXObjectImage image = (PDXObjectImage) PDXObjectImage.createXObject(imageObj);

                    PDStream pdStr = new PDStream(image.getCOSStream());
                    List<COSName> filters = pdStr.getFilters();

                    log.debug("Detected image with color depth: {} bits", image.getBitsPerComponent());
                    if (filters == null) {
                        continue;
                    }
                    log.debug("Detected filters: {}", filters);

                    if ((image.getBitsPerComponent() > 1) && (!binarize)) {
                        log.info("It is not a bitonal image => skipping");
                        continue;
                    }

                    // at this moment for preventing bad output (bad coloring) from LZWDecode filter
                    if (filters.contains(COSName.LZW_DECODE)) {
                        log.info("This is LZWDecoded => skipping");
                        continue;
                    }

                    if (filters.contains(COSName.FLATE_DECODE)) {
                        log.debug("FlateDecoded image detected");
                    }

                    if (filters.contains(COSName.JBIG2_DECODE)) {
                        if (skipJBig2Images) {
                            log.warn("Allready compressed according to JBIG2 standard => skipping");
                            continue;
                        } else {
                            log.debug("JBIG2 image detected");
                        }
                    }

                    // detection of unsupported filters by pdfBox library
                    if (filters.contains(COSName.JPX_DECODE)) {
                        log.warn("Unsupported filter JPXDecode => skipping");
                        continue;
                    }

                    String name = getUniqueFileName(prefix, image.getSuffix());
                    log.info("Writing image: {}", name);
                    image.write2file(name);

                    PdfImageInformation pdfImageInfo = new PdfImageInformation(key, image.getWidth(),
                            image.getHeight(), objectNum, genNum);
                    originalImageInformations.add(pdfImageInfo);

                    namesOfImages.add(name + "." + image.getSuffix());

                }
            }
        }
    } catch (IOException ex) {
        Tools.deleteFilesFromList(namesOfImages);
        throw new PdfRecompressionException("Unable to parse PDF document", ex);
    } catch (Exception ex) {
        Tools.deleteFilesFromList(namesOfImages);
        // Previously swallowed without a trace. The method still returns normally so
        // existing callers are unaffected, but the failure is now visible in the log.
        // NOTE(review): consider rethrowing as PdfRecompressionException - confirm with callers.
        log.warn("Image extraction aborted by an unexpected error; extracted files were deleted", ex);
    } finally {
        if (doc != null) {
            try {
                doc.close();
            } catch (IOException ex) {
                throw new PdfRecompressionException(ex);
            }
        }
    }
}