List of usage examples for the org.apache.pdfbox.pdfparser.PDFParser constructor PDFParser
public PDFParser(RandomAccessRead source) throws IOException
From source file:com.amandine.NewEmptyJUnitTest.java
public String pdflookbook() throws IOException { String filePath = "C:\\Users\\janitha\\OneDrive\\Documents\\lookbookSS2016.pdf"; InputStream inputStream = null; String statementPDF = null;/* ww w .j a va 2 s . c o m*/ try { inputStream = new FileInputStream(filePath); PDFParser parser = new PDFParser(inputStream); // This will parse the stream and populate the COSDocument object. parser.parse(); // Get the document that was parsed. COSDocument cosDoc = parser.getDocument(); // This class will take a pdf document and strip out all of the text and // ignore the formatting and such. PDFTextStripper pdfStripper = new PDFTextStripper(); // This is the in-memory representation of the PDF document PDDocument pdDoc = new PDDocument(cosDoc); pdfStripper.setStartPage(3); pdfStripper.setEndPage(pdDoc.getNumberOfPages() - 1); assertEquals(41, pdDoc.getNumberOfPages() - 1); // This will return the text of a document. statementPDF = pdfStripper.getText(pdDoc); // System.out.println(statementPDF); // String [] statementPDFArray = statementPDF.split("\\n"); // assertEquals(256, statementPDFArray.length); } catch (Exception e) { //Syste String errorMessage = "\nUnexpected Exception: " + e.getClass() + "\n" + e.getMessage(); for (StackTraceElement trace : e.getStackTrace()) { errorMessage += "\n\t" + trace; } System.out.println(errorMessage); } finally { if (inputStream != null) { inputStream.close(); } } return statementPDF; }
From source file:com.aurel.track.lucene.index.associatedFields.textExctractor.PdfExtractor.java
License:Open Source License
/** * Gets the text from file content /*from ww w . ja v a 2s.c o m*/ * @param file * @param fileExtension * @return */ @Override public String getText(File file, String fileExtension) { FileInputStream fis = null; PDDocument pdDoc = null; StringWriter stringWriter = null; try { fis = new FileInputStream(file); PDFParser parser = new PDFParser(fis); parser.parse(); pdDoc = parser.getPDDocument(); PDFTextStripper stripper = new PDFTextStripper(); stripper.setLineSeparator("\n"); stringWriter = new StringWriter(); stripper.writeText(pdDoc, stringWriter); return stringWriter.toString(); } catch (Exception e) { if (LOGGER.isDebugEnabled()) { LOGGER.debug( "Extracting text from the .pdf file " + file.getName() + " failed with " + e.getMessage()); LOGGER.debug(ExceptionUtils.getStackTrace(e)); } } finally { try { if (stringWriter != null) { stringWriter.close(); } } catch (Exception e) { } try { if (pdDoc != null) { pdDoc.close(); } } catch (Exception e) { LOGGER.info("Closing pdDoc for " + file + " failed with " + e.getMessage()); LOGGER.debug(ExceptionUtils.getStackTrace(e)); } try { if (fis != null) { fis.close(); } } catch (Exception e) { LOGGER.info("Closing the FileInputStream for " + file + " failed with " + e.getMessage()); } } return null; }
From source file:com.cisco.iwe.services.util.EmailMonitor.java
/** * /*from w w w . j a v a 2s.com*/ * @param fileDir * @return */ /* This method is used to scan the uploaded expense receipt in .pdf format and extract the text embedded in it. */ public String scanPDF(String fileDir) { PDFParser parser; String parsedText = null; PDFTextStripper pdfStripper = null; PDDocument pdDoc = null; COSDocument cosDoc = null; File file = new File(fileDir); if (!file.isFile()) { System.err.println("File " + fileDir + " does not exist."); return null; } try { parser = new PDFParser(new FileInputStream(file)); } catch (IOException e) { System.err.println("Unable to open PDF Parser. " + e.getMessage()); return null; } try { parser.parse(); cosDoc = parser.getDocument(); pdfStripper = new PDFTextStripper(); pdDoc = new PDDocument(cosDoc); pdfStripper.setStartPage(1); pdfStripper.setEndPage(pdDoc.getNumberOfPages()); parsedText = pdfStripper.getText(pdDoc); } catch (Exception e) { System.err.println("An exception occured in parsing the PDF Document." + e.getMessage()); } finally { try { if (cosDoc != null) cosDoc.close(); if (pdDoc != null) pdDoc.close(); } catch (Exception e) { e.printStackTrace(); } } return parsedText; }
From source file:com.exlibris.dps.repository.plugin.riskExtractor.drmlint.PDFBoxWrapper.java
License:Apache License
/** * Check if a PDF file is valid or not/* w w w . j ava 2s.co m*/ * @param pFile file to check * @return whether the file is valid or not */ public static boolean isValid(File pFile) { boolean ret = false; try { PDFParser parser = new PDFParser(new FileInputStream(pFile)); parser.parse(); File temp = File.createTempFile("drmlint-temp-", ".pdf"); parser.getPDDocument().save(temp); parser.getDocument().close(); temp.delete(); ret = true; } catch (FileNotFoundException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (COSVisitorException e) { // TODO Auto-generated catch block ret = false; } return ret; }
From source file:com.exlibris.dps.repository.plugin.riskExtractor.drmlint.PDFBoxWrapper.java
License:Apache License
/** * Check if a PDF file has DRM or not// w w w . j ava2s . co m * @param pFile file to check * @return whether the file is had DRM or not */ public static boolean hasDRM(File pFile) { boolean ret = false; try { PDFParser parser = new PDFParser(new FileInputStream(pFile)); parser.parse(); ret = parser.getDocument().isEncrypted(); parser.getDocument().close(); } catch (FileNotFoundException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } return ret; }
From source file:com.exlibris.dps.repository.plugin.riskExtractor.drmlint.PDFBoxWrapper.java
License:Apache License
/** * Check for encryption with Apache PDFBox * -> query the encryption dictionary (might allow more granular checks of protection) * @param pPDF pdf file to check//from ww w . ja v a2s . c o m * @return whether or not the file has DRM */ public static boolean hasDRMGranular(File pPDF) { boolean ret = false; try { PDFParser parser = new PDFParser(new FileInputStream(pPDF)); parser.parse(); COSDictionary dict = parser.getDocument().getEncryptionDictionary(); if (dict != null) { //print encryption dictionary // for(COSName key:dict.keySet()) { // System.out.print(key.getName()); // String value = dict.getString(key); // if(value!=null){ // System.out.println(": "+value); // } else { // System.out.println(": "+dict.getLong(key)); // } // } //this feaure in pdfbox is currently broken, see: https://issues.apache.org/jira/browse/PDFBOX-1651 //AccessPermission perms = parser.getPDDocument().getCurrentAccessPermission(); //this is a work around; creating a new object from the data AccessPermission perms = new AccessPermission(dict.getInt("P")); boolean debug = false; if (debug) { System.out.println("canAssembleDocument() : " + perms.canAssembleDocument()); System.out.println("canExtractContent() : " + perms.canExtractContent()); System.out.println("canExtractForAccessibility() : " + perms.canExtractForAccessibility()); System.out.println("canFillInForm() : " + perms.canFillInForm()); System.out.println("canModify() : " + perms.canModify()); System.out.println("canModifyAnnotations() : " + perms.canModifyAnnotations()); System.out.println("canPrint() : " + perms.canPrint()); System.out.println("canPrintDegraded() : " + perms.canPrintDegraded()); System.out.println("isOwnerPermission() : " + perms.isOwnerPermission()); System.out.println("isReadOnly() : " + perms.isReadOnly()); } } parser.getDocument().close(); } catch (FileNotFoundException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block 
e.printStackTrace(); } return ret; }
From source file:com.iqtb.validacion.util.LeerPDF.java
public String pdftoText(byte[] bytesPdf) { InputStream in = new ByteArrayInputStream(bytesPdf); // Se verifica si se puede abrir el InputStream try {//from w w w .j a v a 2 s . c om parser = new PDFParser(in); } catch (IOException e) { logger.error("No se puede abrir. ERROR " + e); return null; } // En este proceso se abre, convierte y se cierra // el archivo PDF try { parser.parse(); cosDoc = parser.getDocument(); pdfStripper = new PDFTextStripper(); pdDoc = new PDDocument(cosDoc); parsedText = pdfStripper.getText(pdDoc); cosDoc.close(); pdDoc.close(); } catch (IOException e) { logger.error("Ocurri un error. ERROR " + e); try { if (cosDoc != null) { cosDoc.close(); } if (pdDoc != null) { pdDoc.close(); } } catch (IOException e1) { logger.error("Ocurri un error. ERROR " + e1); } return null; } return parsedText; }
From source file:com.lanacion.adminsiteln.services.PdfIndexerService.PdfIndexerService.java
/** * Metodos privados para la indexacin/*from www. j a v a 2 s . c om*/ */ private String pdftoText(String fileName, int pagina) { PDFParser parser; String parsedText = null; ; PDFTextStripper pdfStripper = null; //pdfStripper.setStartPage(0); //pdfStripper.setEndPage(0); PDDocument pdDoc = null; COSDocument cosDoc = null; File file = new File(fileName); if (!file.isFile()) { System.err.println("File " + fileName + " does not exist."); return null; } try { parser = new PDFParser(new FileInputStream(file)); } catch (IOException e) { System.err.println("Unable to open PDF Parser. " + e.getMessage()); return null; } try { parser.parse(); cosDoc = parser.getDocument(); pdfStripper = new PDFTextStripper(); pdDoc = new PDDocument(cosDoc); pdfStripper.setStartPage(pagina); pdfStripper.setEndPage(pagina); parsedText = pdfStripper.getText(pdDoc); } catch (Exception e) { System.err.println("An exception occured in parsing the PDF Document." + e.getMessage()); } finally { try { if (cosDoc != null) { cosDoc.close(); } if (pdDoc != null) { pdDoc.close(); } } catch (Exception e) { e.printStackTrace(); } } return parsedText; }
From source file:com.lanacion.adminsiteln.services.PdfIndexerService.PdfIndexerService.java
private int pdfgetPages(String fileName) { int numero_paginas = 0; PDFParser parser;/*w ww . j av a 2 s . com*/ String parsedText = null; ; PDFTextStripper pdfStripper = null; //pdfStripper.setStartPage(0); //pdfStripper.setEndPage(0); PDDocument pdDoc = null; COSDocument cosDoc = null; File file = new File(fileName); if (!file.isFile()) { System.err.println("File " + fileName + " does not exist."); return 0; } try { parser = new PDFParser(new FileInputStream(file)); } catch (IOException e) { System.err.println("Unable to open PDF Parser. " + e.getMessage()); return 0; } try { parser.parse(); cosDoc = parser.getDocument(); pdfStripper = new PDFTextStripper(); pdDoc = new PDDocument(cosDoc); numero_paginas = pdDoc.getNumberOfPages(); } catch (Exception e) { System.err.println("An exception occured in parsing the PDF Document." + e.getMessage()); } finally { try { if (cosDoc != null) { cosDoc.close(); } if (pdDoc != null) { pdDoc.close(); } } catch (Exception e) { e.printStackTrace(); } } return numero_paginas; }
From source file:com.liferay.portal.util.LuceneFields.java
License:Open Source License
public static Field getFile(String field, File file, String fileExt) throws IOException { fileExt = fileExt.toLowerCase();//from w ww .ja v a 2s . co m FileInputStream fis = new FileInputStream(file); Reader reader = new BufferedReader(new InputStreamReader(fis)); String text = null; if (fileExt.equals(".doc")) { try { WordDocument wordDocument = new WordDocument(fis); StringWriter stringWriter = new StringWriter(); wordDocument.writeAllText(stringWriter); text = stringWriter.toString(); stringWriter.close(); } catch (Exception e) { _log.error(e.getMessage()); } } else if (fileExt.equals(".htm") || fileExt.equals(".html")) { try { DefaultStyledDocument dsd = new DefaultStyledDocument(); HTMLEditorKit htmlEditorKit = new HTMLEditorKit(); htmlEditorKit.read(reader, dsd, 0); text = dsd.getText(0, dsd.getLength()); } catch (Exception e) { _log.error(e.getMessage()); } } else if (fileExt.equals(".pdf")) { try { PDFParser parser = new PDFParser(fis); parser.parse(); PDDocument pdDoc = parser.getPDDocument(); StringWriter stringWriter = new StringWriter(); PDFTextStripper stripper = new PDFTextStripper(); stripper.setLineSeparator("\n"); stripper.writeText(pdDoc, stringWriter); text = stringWriter.toString(); stringWriter.close(); pdDoc.close(); } catch (Exception e) { _log.error(e.getMessage()); } } else if (fileExt.equals(".rtf")) { try { DefaultStyledDocument dsd = new DefaultStyledDocument(); RTFEditorKit rtfEditorKit = new RTFEditorKit(); rtfEditorKit.read(reader, dsd, 0); text = dsd.getText(0, dsd.getLength()); } catch (Exception e) { _log.error(e.getMessage()); } } else if (fileExt.equals(".xls")) { try { XLSTextStripper stripper = new XLSTextStripper(fis); text = stripper.getText(); } catch (Exception e) { _log.error(e.getMessage()); } } if (text != null) { return new Field(field, text, Field.Store.YES, Field.Index.NOT_ANALYZED); } else { return new Field(field, reader); } }