Example usage for org.apache.pdfbox.pdfparser PDFParser parse

Introduction

In this page you can find the example usage for org.apache.pdfbox.pdfparser PDFParser parse.

Prototype

public void parse() throws IOException

Source Link

Document

This will parse the stream and populate the COSDocument object.

Usage

From source file:com.amandine.NewEmptyJUnitTest.java

public String pdflookbook() throws IOException {
    String filePath = "C:\\Users\\janitha\\OneDrive\\Documents\\lookbookSS2016.pdf";
    InputStream inputStream = null;
    String statementPDF = null;/* www  .  j a  va  2  s  . c  om*/
    try {
        inputStream = new FileInputStream(filePath);
        PDFParser parser = new PDFParser(inputStream);

        // This will parse the stream and populate the COSDocument object.
        parser.parse();

        // Get the document that was parsed.
        COSDocument cosDoc = parser.getDocument();

        // This class will take a pdf document and strip out all of the text and 
        // ignore the formatting and such.
        PDFTextStripper pdfStripper = new PDFTextStripper();

        // This is the in-memory representation of the PDF document
        PDDocument pdDoc = new PDDocument(cosDoc);
        pdfStripper.setStartPage(3);
        pdfStripper.setEndPage(pdDoc.getNumberOfPages() - 1);
        assertEquals(41, pdDoc.getNumberOfPages() - 1);

        // This will return the text of a document.
        statementPDF = pdfStripper.getText(pdDoc);
        //            System.out.println(statementPDF);

        //            String [] statementPDFArray = statementPDF.split("\\n");
        //            assertEquals(256, statementPDFArray.length);
    } catch (Exception e) {
        //Syste
        String errorMessage = "\nUnexpected Exception: " + e.getClass() + "\n" + e.getMessage();
        for (StackTraceElement trace : e.getStackTrace()) {
            errorMessage += "\n\t" + trace;
        }
        System.out.println(errorMessage);
    } finally {
        if (inputStream != null) {
            inputStream.close();
        }
    }
    return statementPDF;
}

From source file:com.aurel.track.lucene.index.associatedFields.textExctractor.PdfExtractor.java

License:Open Source License

/**
 * Gets the text from file content /*from   ww  w  .  ja va2  s.co  m*/
 * @param file
 * @param fileExtension
 * @return
 */
@Override
public String getText(File file, String fileExtension) {
    FileInputStream fis = null;
    PDDocument pdDoc = null;
    StringWriter stringWriter = null;
    try {
        fis = new FileInputStream(file);
        PDFParser parser = new PDFParser(fis);
        parser.parse();
        pdDoc = parser.getPDDocument();
        PDFTextStripper stripper = new PDFTextStripper();
        stripper.setLineSeparator("\n");
        stringWriter = new StringWriter();
        stripper.writeText(pdDoc, stringWriter);
        return stringWriter.toString();
    } catch (Exception e) {
        if (LOGGER.isDebugEnabled()) {
            LOGGER.debug(
                    "Extracting text from the .pdf  file " + file.getName() + " failed with " + e.getMessage());
            LOGGER.debug(ExceptionUtils.getStackTrace(e));
        }
    } finally {
        try {
            if (stringWriter != null) {
                stringWriter.close();
            }
        } catch (Exception e) {
        }
        try {
            if (pdDoc != null) {
                pdDoc.close();
            }
        } catch (Exception e) {
            LOGGER.info("Closing pdDoc for " + file + " failed with " + e.getMessage());
            LOGGER.debug(ExceptionUtils.getStackTrace(e));
        }
        try {
            if (fis != null) {
                fis.close();
            }
        } catch (Exception e) {
            LOGGER.info("Closing the FileInputStream for " + file + " failed with " + e.getMessage());
        }
    }
    return null;
}

From source file:com.cisco.iwe.services.util.EmailMonitor.java

/**
 * /*from ww w.j  ava  2  s  .co  m*/
 * @param fileDir
 * @return
 */
/* This method is used to scan the uploaded expense receipt in .pdf format and extract the text embedded in it. */
public String scanPDF(String fileDir) {
    PDFParser parser;
    String parsedText = null;
    PDFTextStripper pdfStripper = null;
    PDDocument pdDoc = null;
    COSDocument cosDoc = null;
    File file = new File(fileDir);
    if (!file.isFile()) {
        System.err.println("File " + fileDir + " does not exist.");
        return null;
    }
    try {
        parser = new PDFParser(new FileInputStream(file));
    } catch (IOException e) {
        System.err.println("Unable to open PDF Parser. " + e.getMessage());
        return null;
    }
    try {
        parser.parse();
        cosDoc = parser.getDocument();
        pdfStripper = new PDFTextStripper();
        pdDoc = new PDDocument(cosDoc);
        pdfStripper.setStartPage(1);
        pdfStripper.setEndPage(pdDoc.getNumberOfPages());
        parsedText = pdfStripper.getText(pdDoc);
    } catch (Exception e) {
        System.err.println("An exception occured in parsing the PDF Document." + e.getMessage());
    } finally {
        try {
            if (cosDoc != null)
                cosDoc.close();
            if (pdDoc != null)
                pdDoc.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
    return parsedText;
}

From source file:com.exlibris.dps.repository.plugin.riskExtractor.drmlint.PDFBoxWrapper.java

License:Apache License

/**
 * Check if a PDF file is valid or not//from   w ww.  j  a  v  a  2 s .com
 * @param pFile file to check
 * @return whether the file is valid or not
 */
public static boolean isValid(File pFile) {
    boolean ret = false;
    try {
        PDFParser parser = new PDFParser(new FileInputStream(pFile));
        parser.parse();
        File temp = File.createTempFile("drmlint-temp-", ".pdf");
        parser.getPDDocument().save(temp);
        parser.getDocument().close();
        temp.delete();
        ret = true;
    } catch (FileNotFoundException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    } catch (IOException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    } catch (COSVisitorException e) {
        // TODO Auto-generated catch block
        ret = false;
    }
    return ret;
}

From source file:com.exlibris.dps.repository.plugin.riskExtractor.drmlint.PDFBoxWrapper.java

License:Apache License

/**
 * Check if a PDF file has DRM or not/* w w  w.j  a v a 2s  .co m*/
 * @param pFile file to check
 * @return whether the file is had DRM or not
 */
public static boolean hasDRM(File pFile) {
    boolean ret = false;
    try {
        PDFParser parser = new PDFParser(new FileInputStream(pFile));
        parser.parse();
        ret = parser.getDocument().isEncrypted();
        parser.getDocument().close();

    } catch (FileNotFoundException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    } catch (IOException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }
    return ret;
}

From source file:com.exlibris.dps.repository.plugin.riskExtractor.drmlint.PDFBoxWrapper.java

License:Apache License

/**
 * Check for encryption with Apache PDFBox
 * -> query the encryption dictionary (might allow more granular checks of protection)
 * @param pPDF pdf file to check//from ww w  . j  a va  2s  .  c  o  m
 * @return whether or not the file has DRM
 */
public static boolean hasDRMGranular(File pPDF) {

    boolean ret = false;

    try {
        PDFParser parser = new PDFParser(new FileInputStream(pPDF));
        parser.parse();

        COSDictionary dict = parser.getDocument().getEncryptionDictionary();
        if (dict != null) {

            //print encryption dictionary
            //            for(COSName key:dict.keySet()) {
            //               System.out.print(key.getName());
            //               String value = dict.getString(key);
            //               if(value!=null){
            //                  System.out.println(": "+value);
            //               } else {
            //                  System.out.println(": "+dict.getLong(key));
            //               }
            //            }

            //this feaure in pdfbox is currently broken, see: https://issues.apache.org/jira/browse/PDFBOX-1651
            //AccessPermission perms = parser.getPDDocument().getCurrentAccessPermission();
            //this is a work around; creating a new object from the data
            AccessPermission perms = new AccessPermission(dict.getInt("P"));

            boolean debug = false;

            if (debug) {

                System.out.println("canAssembleDocument()        : " + perms.canAssembleDocument());
                System.out.println("canExtractContent()          : " + perms.canExtractContent());
                System.out.println("canExtractForAccessibility() : " + perms.canExtractForAccessibility());
                System.out.println("canFillInForm()              : " + perms.canFillInForm());
                System.out.println("canModify()                  : " + perms.canModify());
                System.out.println("canModifyAnnotations()       : " + perms.canModifyAnnotations());
                System.out.println("canPrint()                   : " + perms.canPrint());
                System.out.println("canPrintDegraded()           : " + perms.canPrintDegraded());
                System.out.println("isOwnerPermission()          : " + perms.isOwnerPermission());
                System.out.println("isReadOnly()                 : " + perms.isReadOnly());

            }
        }

        parser.getDocument().close();

    } catch (FileNotFoundException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    } catch (IOException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }

    return ret;
}

From source file:com.lanacion.adminsiteln.services.PdfIndexerService.PdfIndexerService.java

/**
 * Metodos privados para la indexacin/*from w  w  w. j a v  a 2  s  .  c om*/
 */
private String pdftoText(String fileName, int pagina) {

    PDFParser parser;
    String parsedText = null;
    ;
    PDFTextStripper pdfStripper = null;
    //pdfStripper.setStartPage(0);
    //pdfStripper.setEndPage(0);
    PDDocument pdDoc = null;
    COSDocument cosDoc = null;
    File file = new File(fileName);
    if (!file.isFile()) {
        System.err.println("File " + fileName + " does not exist.");
        return null;
    }
    try {
        parser = new PDFParser(new FileInputStream(file));
    } catch (IOException e) {
        System.err.println("Unable to open PDF Parser. " + e.getMessage());
        return null;
    }
    try {
        parser.parse();
        cosDoc = parser.getDocument();
        pdfStripper = new PDFTextStripper();
        pdDoc = new PDDocument(cosDoc);
        pdfStripper.setStartPage(pagina);
        pdfStripper.setEndPage(pagina);
        parsedText = pdfStripper.getText(pdDoc);
    } catch (Exception e) {
        System.err.println("An exception occured in parsing the PDF Document." + e.getMessage());
    } finally {
        try {
            if (cosDoc != null) {
                cosDoc.close();
            }
            if (pdDoc != null) {
                pdDoc.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
    return parsedText;

}

From source file:com.lanacion.adminsiteln.services.PdfIndexerService.PdfIndexerService.java

private int pdfgetPages(String fileName) {

    int numero_paginas = 0;
    PDFParser parser;
    String parsedText = null;//from  w w w.j  a  v  a 2  s . c  om
    ;
    PDFTextStripper pdfStripper = null;
    //pdfStripper.setStartPage(0);
    //pdfStripper.setEndPage(0);
    PDDocument pdDoc = null;
    COSDocument cosDoc = null;
    File file = new File(fileName);
    if (!file.isFile()) {
        System.err.println("File " + fileName + " does not exist.");
        return 0;
    }
    try {
        parser = new PDFParser(new FileInputStream(file));
    } catch (IOException e) {
        System.err.println("Unable to open PDF Parser. " + e.getMessage());
        return 0;
    }
    try {
        parser.parse();
        cosDoc = parser.getDocument();
        pdfStripper = new PDFTextStripper();
        pdDoc = new PDDocument(cosDoc);
        numero_paginas = pdDoc.getNumberOfPages();
    } catch (Exception e) {
        System.err.println("An exception occured in parsing the PDF Document." + e.getMessage());
    } finally {
        try {
            if (cosDoc != null) {
                cosDoc.close();
            }
            if (pdDoc != null) {
                pdDoc.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
    return numero_paginas;
}

From source file:com.liferay.portal.util.LuceneFields.java

License:Open Source License

public static Field getFile(String field, File file, String fileExt) throws IOException {

    fileExt = fileExt.toLowerCase();//ww w  .  j  a  v a2 s.c  om

    FileInputStream fis = new FileInputStream(file);
    Reader reader = new BufferedReader(new InputStreamReader(fis));

    String text = null;

    if (fileExt.equals(".doc")) {
        try {
            WordDocument wordDocument = new WordDocument(fis);

            StringWriter stringWriter = new StringWriter();

            wordDocument.writeAllText(stringWriter);

            text = stringWriter.toString();

            stringWriter.close();
        } catch (Exception e) {
            _log.error(e.getMessage());
        }
    } else if (fileExt.equals(".htm") || fileExt.equals(".html")) {
        try {
            DefaultStyledDocument dsd = new DefaultStyledDocument();

            HTMLEditorKit htmlEditorKit = new HTMLEditorKit();
            htmlEditorKit.read(reader, dsd, 0);

            text = dsd.getText(0, dsd.getLength());
        } catch (Exception e) {
            _log.error(e.getMessage());
        }
    } else if (fileExt.equals(".pdf")) {
        try {
            PDFParser parser = new PDFParser(fis);
            parser.parse();

            PDDocument pdDoc = parser.getPDDocument();

            StringWriter stringWriter = new StringWriter();

            PDFTextStripper stripper = new PDFTextStripper();
            stripper.setLineSeparator("\n");
            stripper.writeText(pdDoc, stringWriter);

            text = stringWriter.toString();

            stringWriter.close();
            pdDoc.close();
        } catch (Exception e) {
            _log.error(e.getMessage());
        }
    } else if (fileExt.equals(".rtf")) {
        try {
            DefaultStyledDocument dsd = new DefaultStyledDocument();

            RTFEditorKit rtfEditorKit = new RTFEditorKit();
            rtfEditorKit.read(reader, dsd, 0);

            text = dsd.getText(0, dsd.getLength());
        } catch (Exception e) {
            _log.error(e.getMessage());
        }
    } else if (fileExt.equals(".xls")) {
        try {
            XLSTextStripper stripper = new XLSTextStripper(fis);

            text = stripper.getText();
        } catch (Exception e) {
            _log.error(e.getMessage());
        }
    }

    if (text != null) {
        return new Field(field, text, Field.Store.YES, Field.Index.NOT_ANALYZED);
    } else {
        return new Field(field, reader);
    }
}

From source file:com.openkm.extractor.PdfTextExtractor.java

License:Open Source License

/**
 * {@inheritDoc}/*from  w ww .j av  a  2s  .c o  m*/
 */
@SuppressWarnings("rawtypes")
public String extractText(InputStream stream, String type, String encoding) throws IOException {
    try {
        PDFParser parser = new PDFParser(new BufferedInputStream(stream));

        try {
            parser.parse();
            PDDocument document = parser.getPDDocument();

            if (document.isEncrypted()) {
                try {
                    document.decrypt("");
                    document.setAllSecurityToBeRemoved(true);
                } catch (Exception e) {
                    throw new IOException("Unable to extract text: document encrypted", e);
                }
            }

            CharArrayWriter writer = new CharArrayWriter();
            PDFTextStripper stripper = new PDFTextStripper();
            stripper.setLineSeparator("\n");
            stripper.writeText(document, writer);
            String st = writer.toString().trim();
            log.debug("TextStripped: '{}'", st);

            if (Config.SYSTEM_PDF_FORCE_OCR || st.length() <= 1) {
                log.warn("PDF does not contains text layer");

                // Extract images from PDF
                StringBuilder sb = new StringBuilder();

                if (!Config.SYSTEM_PDFIMAGES.isEmpty()) {
                    File tmpPdf = FileUtils.createTempFile("pdf");
                    File tmpDir = new File(EnvironmentDetector.getTempDir());
                    String baseName = FileUtils.getFileName(tmpPdf.getName());
                    document.save(tmpPdf);
                    int pgNum = 1;

                    try {
                        for (PDPage page : (List<PDPage>) document.getDocumentCatalog().getAllPages()) {
                            HashMap<String, Object> hm = new HashMap<String, Object>();
                            hm.put("fileIn", tmpPdf.getPath());
                            hm.put("firstPage", pgNum);
                            hm.put("lastPage", pgNum++);
                            hm.put("imageRoot", tmpDir + File.separator + baseName);
                            String cmd = TemplateUtils.replace("SYSTEM_PDFIMAGES", Config.SYSTEM_PDFIMAGES, hm);
                            ExecutionUtils.runCmd(cmd);

                            for (File tmp : tmpDir.listFiles()) {
                                if (tmp.getName().startsWith(baseName + "-")) {
                                    if (page.findRotation() > 0) {
                                        ImageUtils.rotate(tmp, tmp, page.findRotation());
                                    }

                                    try {
                                        String txt = doOcr(tmp);
                                        sb.append(txt).append(" ");
                                        log.debug("OCR Extracted: {}", txt);
                                    } finally {
                                        FileUtils.deleteQuietly(tmp);
                                    }
                                }
                            }
                        }
                    } finally {
                        FileUtils.deleteQuietly(tmpPdf);
                    }
                } else {
                    for (PDPage page : (List<PDPage>) document.getDocumentCatalog().getAllPages()) {
                        PDResources resources = page.getResources();
                        Map<String, PDXObject> images = resources.getXObjects();

                        if (images != null) {
                            for (String key : images.keySet()) {
                                PDXObjectImage image = (PDXObjectImage) images.get(key);
                                String prefix = "img-" + key + "-";
                                File pdfImg = null;

                                try {
                                    pdfImg = File.createTempFile(prefix, ".png");
                                    log.debug("Writing image: {}", pdfImg.getPath());

                                    // Won't work until PDFBox 1.8.9
                                    ImageIO.write(image.getRGBImage(), "png", pdfImg);

                                    if (page.findRotation() > 0) {
                                        ImageUtils.rotate(pdfImg, pdfImg, page.findRotation());
                                    }

                                    // Do OCR
                                    String txt = doOcr(pdfImg);
                                    sb.append(txt).append(" ");
                                    log.debug("OCR Extracted: {}", txt);
                                } finally {
                                    FileUtils.deleteQuietly(pdfImg);
                                }
                            }
                        }
                    }
                }

                return sb.toString();
            } else {
                return writer.toString();
            }
        } finally {
            try {
                PDDocument doc = parser.getPDDocument();
                if (doc != null) {
                    doc.close();
                }
            } catch (IOException e) {
                // ignore
            }
        }
    } catch (Exception e) {
        // it may happen that PDFParser throws a runtime
        // exception when parsing certain pdf documents
        log.warn("Failed to extract PDF text content", e);
        throw new IOException(e.getMessage(), e);
    } finally {
        stream.close();
    }
}