Example usage for org.apache.pdfbox.pdfparser PDFParser PDFParser

List of usage examples for org.apache.pdfbox.pdfparser PDFParser PDFParser

Introduction

On this page you can find example usages of the org.apache.pdfbox.pdfparser PDFParser constructor.

Prototype

public PDFParser(RandomAccessRead source) throws IOException 

Source Link

Document

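Constructor. Note that the examples below call the older overload that takes a java.io.InputStream rather than the RandomAccessRead overload shown in the prototype.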

Usage

From source file:com.amandine.NewEmptyJUnitTest.java

/**
 * Parses the lookbook PDF at a fixed local path and extracts its text from page 3
 * up to the second-to-last page.
 *
 * <p>Any {@link Exception} raised while parsing or extracting is printed to standard
 * output together with its stack trace and is not rethrown.
 *
 * @return the extracted text, or {@code null} if parsing or extraction failed
 * @throws IOException if closing the document or the input stream fails
 */
public String pdflookbook() throws IOException {
    String filePath = "C:\\Users\\janitha\\OneDrive\\Documents\\lookbookSS2016.pdf";
    InputStream inputStream = null;
    PDDocument pdDoc = null;
    String statementPDF = null;
    try {
        inputStream = new FileInputStream(filePath);
        PDFParser parser = new PDFParser(inputStream);

        // Parse the stream and populate the COSDocument object.
        parser.parse();
        COSDocument cosDoc = parser.getDocument();

        // Wrap the low-level document immediately so the finally block can release it
        // even if a later step throws.
        pdDoc = new PDDocument(cosDoc);

        // Strips the text out of a PDF document, ignoring formatting.
        PDFTextStripper pdfStripper = new PDFTextStripper();
        pdfStripper.setStartPage(3);
        pdfStripper.setEndPage(pdDoc.getNumberOfPages() - 1);
        assertEquals(41, pdDoc.getNumberOfPages() - 1);

        statementPDF = pdfStripper.getText(pdDoc);
    } catch (Exception e) {
        StringBuilder errorMessage = new StringBuilder();
        errorMessage.append("\nUnexpected Exception: ").append(e.getClass()).append("\n").append(e.getMessage());
        for (StackTraceElement trace : e.getStackTrace()) {
            errorMessage.append("\n\t").append(trace);
        }
        System.out.println(errorMessage);
    } finally {
        // Close the document first; the nested finally guarantees the stream is closed
        // even when closing the document fails.
        try {
            if (pdDoc != null) {
                pdDoc.close();
            }
        } finally {
            if (inputStream != null) {
                inputStream.close();
            }
        }
    }
    return statementPDF;
}

From source file:com.aurel.track.lucene.index.associatedFields.textExctractor.PdfExtractor.java

License:Open Source License

/**
 * Gets the text from file content /*from ww  w  . ja  v  a 2s.c o m*/
 * @param file
 * @param fileExtension
 * @return
 */
@Override
public String getText(File file, String fileExtension) {
    FileInputStream fis = null;
    PDDocument pdDoc = null;
    StringWriter stringWriter = null;
    try {
        fis = new FileInputStream(file);
        PDFParser parser = new PDFParser(fis);
        parser.parse();
        pdDoc = parser.getPDDocument();
        PDFTextStripper stripper = new PDFTextStripper();
        stripper.setLineSeparator("\n");
        stringWriter = new StringWriter();
        stripper.writeText(pdDoc, stringWriter);
        return stringWriter.toString();
    } catch (Exception e) {
        if (LOGGER.isDebugEnabled()) {
            LOGGER.debug(
                    "Extracting text from the .pdf  file " + file.getName() + " failed with " + e.getMessage());
            LOGGER.debug(ExceptionUtils.getStackTrace(e));
        }
    } finally {
        try {
            if (stringWriter != null) {
                stringWriter.close();
            }
        } catch (Exception e) {
        }
        try {
            if (pdDoc != null) {
                pdDoc.close();
            }
        } catch (Exception e) {
            LOGGER.info("Closing pdDoc for " + file + " failed with " + e.getMessage());
            LOGGER.debug(ExceptionUtils.getStackTrace(e));
        }
        try {
            if (fis != null) {
                fis.close();
            }
        } catch (Exception e) {
            LOGGER.info("Closing the FileInputStream for " + file + " failed with " + e.getMessage());
        }
    }
    return null;
}

From source file:com.cisco.iwe.services.util.EmailMonitor.java

/**
 * /*from w w w  . j a  v  a 2s.com*/
 * @param fileDir
 * @return
 */
/* This method is used to scan the uploaded expense receipt in .pdf format and extract the text embedded in it. */
public String scanPDF(String fileDir) {
    PDFParser parser;
    String parsedText = null;
    PDFTextStripper pdfStripper = null;
    PDDocument pdDoc = null;
    COSDocument cosDoc = null;
    File file = new File(fileDir);
    if (!file.isFile()) {
        System.err.println("File " + fileDir + " does not exist.");
        return null;
    }
    try {
        parser = new PDFParser(new FileInputStream(file));
    } catch (IOException e) {
        System.err.println("Unable to open PDF Parser. " + e.getMessage());
        return null;
    }
    try {
        parser.parse();
        cosDoc = parser.getDocument();
        pdfStripper = new PDFTextStripper();
        pdDoc = new PDDocument(cosDoc);
        pdfStripper.setStartPage(1);
        pdfStripper.setEndPage(pdDoc.getNumberOfPages());
        parsedText = pdfStripper.getText(pdDoc);
    } catch (Exception e) {
        System.err.println("An exception occured in parsing the PDF Document." + e.getMessage());
    } finally {
        try {
            if (cosDoc != null)
                cosDoc.close();
            if (pdDoc != null)
                pdDoc.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
    return parsedText;
}

From source file:com.exlibris.dps.repository.plugin.riskExtractor.drmlint.PDFBoxWrapper.java

License:Apache License

/**
 * Check if a PDF file is valid or not/* w  w w  .  j  ava 2s.co m*/
 * @param pFile file to check
 * @return whether the file is valid or not
 */
public static boolean isValid(File pFile) {
    boolean ret = false;
    try {
        PDFParser parser = new PDFParser(new FileInputStream(pFile));
        parser.parse();
        File temp = File.createTempFile("drmlint-temp-", ".pdf");
        parser.getPDDocument().save(temp);
        parser.getDocument().close();
        temp.delete();
        ret = true;
    } catch (FileNotFoundException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    } catch (IOException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    } catch (COSVisitorException e) {
        // TODO Auto-generated catch block
        ret = false;
    }
    return ret;
}

From source file:com.exlibris.dps.repository.plugin.riskExtractor.drmlint.PDFBoxWrapper.java

License:Apache License

/**
 * Check if a PDF file has DRM or not, based on whether the parsed document
 * reports itself as encrypted.
 *
 * @param pFile file to check
 * @return whether the file has DRM or not; {@code false} if the file could not be
 *         opened or parsed
 */
public static boolean hasDRM(File pFile) {
    boolean ret = false;
    FileInputStream input = null;
    PDFParser parser = null;
    // Tracks whether parse() completed, i.e. whether there is a document to close.
    boolean parsed = false;
    try {
        input = new FileInputStream(pFile);
        parser = new PDFParser(input);
        parser.parse();
        parsed = true;
        ret = parser.getDocument().isEncrypted();
    } catch (FileNotFoundException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        if (parsed) {
            try {
                parser.getDocument().close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        if (input != null) {
            try {
                input.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
    return ret;
}

From source file:com.exlibris.dps.repository.plugin.riskExtractor.drmlint.PDFBoxWrapper.java

License:Apache License

/**
 * Check for encryption with Apache PDFBox by querying the encryption dictionary
 * (which might allow more granular checks of protection).
 *
 * <p>The file is reported as having DRM when the parsed document carries an
 * encryption dictionary. The individual permission flags are only printed when the
 * local {@code debug} switch is turned on; they do not influence the result.
 *
 * @param pPDF pdf file to check
 * @return whether or not the file has DRM; {@code false} if the file could not be
 *         opened or parsed
 */
public static boolean hasDRMGranular(File pPDF) {

    boolean ret = false;
    FileInputStream input = null;
    PDFParser parser = null;
    // Tracks whether parse() completed, i.e. whether there is a document to close.
    boolean parsed = false;

    try {
        input = new FileInputStream(pPDF);
        parser = new PDFParser(input);
        parser.parse();
        parsed = true;

        COSDictionary dict = parser.getDocument().getEncryptionDictionary();
        if (dict != null) {

            // An encryption dictionary is present, so the document is protected.
            // Previously the result was never assigned and the method always returned false.
            ret = true;

            //this feature in pdfbox is currently broken, see: https://issues.apache.org/jira/browse/PDFBOX-1651
            //AccessPermission perms = parser.getPDDocument().getCurrentAccessPermission();
            //this is a work around; creating a new object from the data
            AccessPermission perms = new AccessPermission(dict.getInt("P"));

            boolean debug = false;

            if (debug) {

                System.out.println("canAssembleDocument()        : " + perms.canAssembleDocument());
                System.out.println("canExtractContent()          : " + perms.canExtractContent());
                System.out.println("canExtractForAccessibility() : " + perms.canExtractForAccessibility());
                System.out.println("canFillInForm()              : " + perms.canFillInForm());
                System.out.println("canModify()                  : " + perms.canModify());
                System.out.println("canModifyAnnotations()       : " + perms.canModifyAnnotations());
                System.out.println("canPrint()                   : " + perms.canPrint());
                System.out.println("canPrintDegraded()           : " + perms.canPrintDegraded());
                System.out.println("isOwnerPermission()          : " + perms.isOwnerPermission());
                System.out.println("isReadOnly()                 : " + perms.isReadOnly());

            }
        }

    } catch (FileNotFoundException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        if (parsed) {
            try {
                parser.getDocument().close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        if (input != null) {
            try {
                input.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    return ret;
}

From source file:com.iqtb.validacion.util.LeerPDF.java

/**
 * Converts a PDF held in memory to plain text.
 *
 * <p>The parser, the documents, the stripper and the resulting text are stored in
 * variables declared outside this method (presumably instance fields).
 * The documents opened by this call are closed before returning, whether extraction
 * succeeded, failed with an {@link IOException}, or failed with an unchecked exception.
 * A failure while closing is logged and does not discard text that was already extracted.
 *
 * @param bytesPdf the raw bytes of the PDF document
 * @return the extracted text, or {@code null} if the parser could not be created or an
 *         {@link IOException} occurred while parsing or extracting
 */
public String pdftoText(byte[] bytesPdf) {

    InputStream in = new ByteArrayInputStream(bytesPdf);

    // Check whether the parser can be created for the stream.
    try {
        parser = new PDFParser(in);
    } catch (IOException e) {
        logger.error("No se puede abrir. ERROR " + e);
        return null;
    }

    // Drop references left over from an earlier call so the cleanup below only
    // touches documents opened by this call.
    cosDoc = null;
    pdDoc = null;

    // Parse the PDF, extract its text, and always close the documents afterwards.
    try {
        parser.parse();
        cosDoc = parser.getDocument();
        pdfStripper = new PDFTextStripper();
        pdDoc = new PDDocument(cosDoc);
        parsedText = pdfStripper.getText(pdDoc);
    } catch (IOException e) {
        logger.error("Ocurri un error. ERROR " + e);
        return null;
    } finally {
        try {
            if (cosDoc != null) {
                cosDoc.close();
            }
            if (pdDoc != null) {
                pdDoc.close();
            }
        } catch (IOException e1) {
            logger.error("Ocurri un error. ERROR " + e1);
        }
    }

    return parsedText;
}

From source file:com.lanacion.adminsiteln.services.PdfIndexerService.PdfIndexerService.java

/**
 * Metodos privados para la indexacin/*from www.  j  a  v  a  2  s . c om*/
 */
private String pdftoText(String fileName, int pagina) {

    PDFParser parser;
    String parsedText = null;
    ;
    PDFTextStripper pdfStripper = null;
    //pdfStripper.setStartPage(0);
    //pdfStripper.setEndPage(0);
    PDDocument pdDoc = null;
    COSDocument cosDoc = null;
    File file = new File(fileName);
    if (!file.isFile()) {
        System.err.println("File " + fileName + " does not exist.");
        return null;
    }
    try {
        parser = new PDFParser(new FileInputStream(file));
    } catch (IOException e) {
        System.err.println("Unable to open PDF Parser. " + e.getMessage());
        return null;
    }
    try {
        parser.parse();
        cosDoc = parser.getDocument();
        pdfStripper = new PDFTextStripper();
        pdDoc = new PDDocument(cosDoc);
        pdfStripper.setStartPage(pagina);
        pdfStripper.setEndPage(pagina);
        parsedText = pdfStripper.getText(pdDoc);
    } catch (Exception e) {
        System.err.println("An exception occured in parsing the PDF Document." + e.getMessage());
    } finally {
        try {
            if (cosDoc != null) {
                cosDoc.close();
            }
            if (pdDoc != null) {
                pdDoc.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
    return parsedText;

}

From source file:com.lanacion.adminsiteln.services.PdfIndexerService.PdfIndexerService.java

/**
 * Returns the number of pages of a PDF file.
 *
 * <p>Problems are reported on standard error; no exception is propagated to the caller.
 *
 * @param fileName path of the PDF file to read
 * @return the page count, or {@code 0} if the path is not a regular file, the parser
 *         could not be created, or parsing failed
 */
private int pdfgetPages(String fileName) {

    File file = new File(fileName);
    if (!file.isFile()) {
        System.err.println("File " + fileName + " does not exist.");
        return 0;
    }

    FileInputStream input = null;
    PDFParser parser;
    try {
        input = new FileInputStream(file);
        parser = new PDFParser(input);
    } catch (IOException e) {
        System.err.println("Unable to open PDF Parser. " + e.getMessage());
        if (input != null) {
            try {
                input.close();
            } catch (IOException ignored) {
                // Best effort: the failure to open the parser has already been reported.
            }
        }
        return 0;
    }

    int numero_paginas = 0;
    PDDocument pdDoc = null;
    COSDocument cosDoc = null;
    try {
        parser.parse();
        cosDoc = parser.getDocument();
        // No text stripper is created here: counting pages only needs the document itself.
        pdDoc = new PDDocument(cosDoc);
        numero_paginas = pdDoc.getNumberOfPages();
    } catch (Exception e) {
        System.err.println("An exception occured in parsing the PDF Document." + e.getMessage());
    } finally {
        // Each resource is closed independently so that a failure in one close
        // does not skip the remaining ones.
        try {
            if (cosDoc != null) {
                cosDoc.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        try {
            if (pdDoc != null) {
                pdDoc.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        try {
            if (input != null) {
                input.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
    return numero_paginas;
}

From source file:com.liferay.portal.util.LuceneFields.java

License:Open Source License

/**
 * Builds an index field from the contents of a file.
 *
 * <p>For the extensions {@code .doc}, {@code .htm}/{@code .html}, {@code .pdf},
 * {@code .rtf} and {@code .xls} the text is extracted with a format-specific reader.
 * Extraction errors are logged and leave the text unset. When text was extracted, the
 * file stream is closed and a stored, non-analyzed field holding the text is returned.
 * Otherwise a field backed by a reader over the file is returned; in that case the
 * stream is intentionally left open because the returned field still reads from it.
 *
 * @param field name of the field to create
 * @param file file whose contents are indexed
 * @param fileExt file extension including the leading dot; compared case-insensitively
 * @return a field containing the extracted text, or a reader-backed field when no text
 *         could be extracted
 * @throws IOException if the file cannot be opened
 */
public static Field getFile(String field, File file, String fileExt) throws IOException {

    fileExt = fileExt.toLowerCase();

    // Note: the reader wraps the same stream that the binary extractors read directly.
    FileInputStream fis = new FileInputStream(file);
    Reader reader = new BufferedReader(new InputStreamReader(fis));

    String text = null;

    if (fileExt.equals(".doc")) {
        try {
            WordDocument wordDocument = new WordDocument(fis);

            StringWriter stringWriter = new StringWriter();

            wordDocument.writeAllText(stringWriter);

            text = stringWriter.toString();

            stringWriter.close();
        } catch (Exception e) {
            _log.error(e.getMessage());
        }
    } else if (fileExt.equals(".htm") || fileExt.equals(".html")) {
        try {
            DefaultStyledDocument dsd = new DefaultStyledDocument();

            HTMLEditorKit htmlEditorKit = new HTMLEditorKit();
            htmlEditorKit.read(reader, dsd, 0);

            text = dsd.getText(0, dsd.getLength());
        } catch (Exception e) {
            _log.error(e.getMessage());
        }
    } else if (fileExt.equals(".pdf")) {
        PDDocument pdDoc = null;
        try {
            PDFParser parser = new PDFParser(fis);
            parser.parse();

            pdDoc = parser.getPDDocument();

            StringWriter stringWriter = new StringWriter();

            PDFTextStripper stripper = new PDFTextStripper();
            stripper.setLineSeparator("\n");
            stripper.writeText(pdDoc, stringWriter);

            text = stringWriter.toString();

            stringWriter.close();
        } catch (Exception e) {
            _log.error(e.getMessage());
        } finally {
            // Close the document even when text stripping failed.
            if (pdDoc != null) {
                try {
                    pdDoc.close();
                } catch (Exception e) {
                    _log.error(e.getMessage());
                }
            }
        }
    } else if (fileExt.equals(".rtf")) {
        try {
            DefaultStyledDocument dsd = new DefaultStyledDocument();

            RTFEditorKit rtfEditorKit = new RTFEditorKit();
            rtfEditorKit.read(reader, dsd, 0);

            text = dsd.getText(0, dsd.getLength());
        } catch (Exception e) {
            _log.error(e.getMessage());
        }
    } else if (fileExt.equals(".xls")) {
        try {
            XLSTextStripper stripper = new XLSTextStripper(fis);

            text = stripper.getText();
        } catch (Exception e) {
            _log.error(e.getMessage());
        }
    }

    if (text != null) {
        // The text is fully in memory, so the file is no longer needed; closing the
        // reader also closes the underlying file stream.
        try {
            reader.close();
        } catch (IOException e) {
            _log.error(e.getMessage());
        }
        return new Field(field, text, Field.Store.YES, Field.Index.NOT_ANALYZED);
    } else {
        return new Field(field, reader);
    }
}