List of usage examples for org.apache.pdfbox.pdfparser PDFParser parse
public void parse() throws IOException
From source file:com.amandine.NewEmptyJUnitTest.java
public String pdflookbook() throws IOException { String filePath = "C:\\Users\\janitha\\OneDrive\\Documents\\lookbookSS2016.pdf"; InputStream inputStream = null; String statementPDF = null;/* www . j a va 2 s . c om*/ try { inputStream = new FileInputStream(filePath); PDFParser parser = new PDFParser(inputStream); // This will parse the stream and populate the COSDocument object. parser.parse(); // Get the document that was parsed. COSDocument cosDoc = parser.getDocument(); // This class will take a pdf document and strip out all of the text and // ignore the formatting and such. PDFTextStripper pdfStripper = new PDFTextStripper(); // This is the in-memory representation of the PDF document PDDocument pdDoc = new PDDocument(cosDoc); pdfStripper.setStartPage(3); pdfStripper.setEndPage(pdDoc.getNumberOfPages() - 1); assertEquals(41, pdDoc.getNumberOfPages() - 1); // This will return the text of a document. statementPDF = pdfStripper.getText(pdDoc); // System.out.println(statementPDF); // String [] statementPDFArray = statementPDF.split("\\n"); // assertEquals(256, statementPDFArray.length); } catch (Exception e) { //Syste String errorMessage = "\nUnexpected Exception: " + e.getClass() + "\n" + e.getMessage(); for (StackTraceElement trace : e.getStackTrace()) { errorMessage += "\n\t" + trace; } System.out.println(errorMessage); } finally { if (inputStream != null) { inputStream.close(); } } return statementPDF; }
From source file:com.aurel.track.lucene.index.associatedFields.textExctractor.PdfExtractor.java
License:Open Source License
/** * Gets the text from file content /*from ww w . ja va2 s.co m*/ * @param file * @param fileExtension * @return */ @Override public String getText(File file, String fileExtension) { FileInputStream fis = null; PDDocument pdDoc = null; StringWriter stringWriter = null; try { fis = new FileInputStream(file); PDFParser parser = new PDFParser(fis); parser.parse(); pdDoc = parser.getPDDocument(); PDFTextStripper stripper = new PDFTextStripper(); stripper.setLineSeparator("\n"); stringWriter = new StringWriter(); stripper.writeText(pdDoc, stringWriter); return stringWriter.toString(); } catch (Exception e) { if (LOGGER.isDebugEnabled()) { LOGGER.debug( "Extracting text from the .pdf file " + file.getName() + " failed with " + e.getMessage()); LOGGER.debug(ExceptionUtils.getStackTrace(e)); } } finally { try { if (stringWriter != null) { stringWriter.close(); } } catch (Exception e) { } try { if (pdDoc != null) { pdDoc.close(); } } catch (Exception e) { LOGGER.info("Closing pdDoc for " + file + " failed with " + e.getMessage()); LOGGER.debug(ExceptionUtils.getStackTrace(e)); } try { if (fis != null) { fis.close(); } } catch (Exception e) { LOGGER.info("Closing the FileInputStream for " + file + " failed with " + e.getMessage()); } } return null; }
From source file:com.cisco.iwe.services.util.EmailMonitor.java
/** * /*from ww w.j ava 2 s .co m*/ * @param fileDir * @return */ /* This method is used to scan the uploaded expense receipt in .pdf format and extract the text embedded in it. */ public String scanPDF(String fileDir) { PDFParser parser; String parsedText = null; PDFTextStripper pdfStripper = null; PDDocument pdDoc = null; COSDocument cosDoc = null; File file = new File(fileDir); if (!file.isFile()) { System.err.println("File " + fileDir + " does not exist."); return null; } try { parser = new PDFParser(new FileInputStream(file)); } catch (IOException e) { System.err.println("Unable to open PDF Parser. " + e.getMessage()); return null; } try { parser.parse(); cosDoc = parser.getDocument(); pdfStripper = new PDFTextStripper(); pdDoc = new PDDocument(cosDoc); pdfStripper.setStartPage(1); pdfStripper.setEndPage(pdDoc.getNumberOfPages()); parsedText = pdfStripper.getText(pdDoc); } catch (Exception e) { System.err.println("An exception occured in parsing the PDF Document." + e.getMessage()); } finally { try { if (cosDoc != null) cosDoc.close(); if (pdDoc != null) pdDoc.close(); } catch (Exception e) { e.printStackTrace(); } } return parsedText; }
From source file:com.exlibris.dps.repository.plugin.riskExtractor.drmlint.PDFBoxWrapper.java
License:Apache License
/** * Check if a PDF file is valid or not//from w ww. j a v a 2 s .com * @param pFile file to check * @return whether the file is valid or not */ public static boolean isValid(File pFile) { boolean ret = false; try { PDFParser parser = new PDFParser(new FileInputStream(pFile)); parser.parse(); File temp = File.createTempFile("drmlint-temp-", ".pdf"); parser.getPDDocument().save(temp); parser.getDocument().close(); temp.delete(); ret = true; } catch (FileNotFoundException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (COSVisitorException e) { // TODO Auto-generated catch block ret = false; } return ret; }
From source file:com.exlibris.dps.repository.plugin.riskExtractor.drmlint.PDFBoxWrapper.java
License:Apache License
/** * Check if a PDF file has DRM or not/* w w w.j a v a 2s .co m*/ * @param pFile file to check * @return whether the file is had DRM or not */ public static boolean hasDRM(File pFile) { boolean ret = false; try { PDFParser parser = new PDFParser(new FileInputStream(pFile)); parser.parse(); ret = parser.getDocument().isEncrypted(); parser.getDocument().close(); } catch (FileNotFoundException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } return ret; }
From source file:com.exlibris.dps.repository.plugin.riskExtractor.drmlint.PDFBoxWrapper.java
License:Apache License
/** * Check for encryption with Apache PDFBox * -> query the encryption dictionary (might allow more granular checks of protection) * @param pPDF pdf file to check//from ww w . j a va 2s . c o m * @return whether or not the file has DRM */ public static boolean hasDRMGranular(File pPDF) { boolean ret = false; try { PDFParser parser = new PDFParser(new FileInputStream(pPDF)); parser.parse(); COSDictionary dict = parser.getDocument().getEncryptionDictionary(); if (dict != null) { //print encryption dictionary // for(COSName key:dict.keySet()) { // System.out.print(key.getName()); // String value = dict.getString(key); // if(value!=null){ // System.out.println(": "+value); // } else { // System.out.println(": "+dict.getLong(key)); // } // } //this feaure in pdfbox is currently broken, see: https://issues.apache.org/jira/browse/PDFBOX-1651 //AccessPermission perms = parser.getPDDocument().getCurrentAccessPermission(); //this is a work around; creating a new object from the data AccessPermission perms = new AccessPermission(dict.getInt("P")); boolean debug = false; if (debug) { System.out.println("canAssembleDocument() : " + perms.canAssembleDocument()); System.out.println("canExtractContent() : " + perms.canExtractContent()); System.out.println("canExtractForAccessibility() : " + perms.canExtractForAccessibility()); System.out.println("canFillInForm() : " + perms.canFillInForm()); System.out.println("canModify() : " + perms.canModify()); System.out.println("canModifyAnnotations() : " + perms.canModifyAnnotations()); System.out.println("canPrint() : " + perms.canPrint()); System.out.println("canPrintDegraded() : " + perms.canPrintDegraded()); System.out.println("isOwnerPermission() : " + perms.isOwnerPermission()); System.out.println("isReadOnly() : " + perms.isReadOnly()); } } parser.getDocument().close(); } catch (FileNotFoundException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } return ret; }
From source file:com.lanacion.adminsiteln.services.PdfIndexerService.PdfIndexerService.java
/** * Metodos privados para la indexacin/*from w w w. j a v a 2 s . c om*/ */ private String pdftoText(String fileName, int pagina) { PDFParser parser; String parsedText = null; ; PDFTextStripper pdfStripper = null; //pdfStripper.setStartPage(0); //pdfStripper.setEndPage(0); PDDocument pdDoc = null; COSDocument cosDoc = null; File file = new File(fileName); if (!file.isFile()) { System.err.println("File " + fileName + " does not exist."); return null; } try { parser = new PDFParser(new FileInputStream(file)); } catch (IOException e) { System.err.println("Unable to open PDF Parser. " + e.getMessage()); return null; } try { parser.parse(); cosDoc = parser.getDocument(); pdfStripper = new PDFTextStripper(); pdDoc = new PDDocument(cosDoc); pdfStripper.setStartPage(pagina); pdfStripper.setEndPage(pagina); parsedText = pdfStripper.getText(pdDoc); } catch (Exception e) { System.err.println("An exception occured in parsing the PDF Document." + e.getMessage()); } finally { try { if (cosDoc != null) { cosDoc.close(); } if (pdDoc != null) { pdDoc.close(); } } catch (Exception e) { e.printStackTrace(); } } return parsedText; }
From source file:com.lanacion.adminsiteln.services.PdfIndexerService.PdfIndexerService.java
private int pdfgetPages(String fileName) { int numero_paginas = 0; PDFParser parser; String parsedText = null;//from w w w.j a v a 2 s . c om ; PDFTextStripper pdfStripper = null; //pdfStripper.setStartPage(0); //pdfStripper.setEndPage(0); PDDocument pdDoc = null; COSDocument cosDoc = null; File file = new File(fileName); if (!file.isFile()) { System.err.println("File " + fileName + " does not exist."); return 0; } try { parser = new PDFParser(new FileInputStream(file)); } catch (IOException e) { System.err.println("Unable to open PDF Parser. " + e.getMessage()); return 0; } try { parser.parse(); cosDoc = parser.getDocument(); pdfStripper = new PDFTextStripper(); pdDoc = new PDDocument(cosDoc); numero_paginas = pdDoc.getNumberOfPages(); } catch (Exception e) { System.err.println("An exception occured in parsing the PDF Document." + e.getMessage()); } finally { try { if (cosDoc != null) { cosDoc.close(); } if (pdDoc != null) { pdDoc.close(); } } catch (Exception e) { e.printStackTrace(); } } return numero_paginas; }
From source file:com.liferay.portal.util.LuceneFields.java
License:Open Source License
public static Field getFile(String field, File file, String fileExt) throws IOException { fileExt = fileExt.toLowerCase();//ww w . j a v a2 s.c om FileInputStream fis = new FileInputStream(file); Reader reader = new BufferedReader(new InputStreamReader(fis)); String text = null; if (fileExt.equals(".doc")) { try { WordDocument wordDocument = new WordDocument(fis); StringWriter stringWriter = new StringWriter(); wordDocument.writeAllText(stringWriter); text = stringWriter.toString(); stringWriter.close(); } catch (Exception e) { _log.error(e.getMessage()); } } else if (fileExt.equals(".htm") || fileExt.equals(".html")) { try { DefaultStyledDocument dsd = new DefaultStyledDocument(); HTMLEditorKit htmlEditorKit = new HTMLEditorKit(); htmlEditorKit.read(reader, dsd, 0); text = dsd.getText(0, dsd.getLength()); } catch (Exception e) { _log.error(e.getMessage()); } } else if (fileExt.equals(".pdf")) { try { PDFParser parser = new PDFParser(fis); parser.parse(); PDDocument pdDoc = parser.getPDDocument(); StringWriter stringWriter = new StringWriter(); PDFTextStripper stripper = new PDFTextStripper(); stripper.setLineSeparator("\n"); stripper.writeText(pdDoc, stringWriter); text = stringWriter.toString(); stringWriter.close(); pdDoc.close(); } catch (Exception e) { _log.error(e.getMessage()); } } else if (fileExt.equals(".rtf")) { try { DefaultStyledDocument dsd = new DefaultStyledDocument(); RTFEditorKit rtfEditorKit = new RTFEditorKit(); rtfEditorKit.read(reader, dsd, 0); text = dsd.getText(0, dsd.getLength()); } catch (Exception e) { _log.error(e.getMessage()); } } else if (fileExt.equals(".xls")) { try { XLSTextStripper stripper = new XLSTextStripper(fis); text = stripper.getText(); } catch (Exception e) { _log.error(e.getMessage()); } } if (text != null) { return new Field(field, text, Field.Store.YES, Field.Index.NOT_ANALYZED); } else { return new Field(field, reader); } }
From source file:com.openkm.extractor.PdfTextExtractor.java
License:Open Source License
/** * {@inheritDoc}/*from w ww .j av a 2s .c o m*/ */ @SuppressWarnings("rawtypes") public String extractText(InputStream stream, String type, String encoding) throws IOException { try { PDFParser parser = new PDFParser(new BufferedInputStream(stream)); try { parser.parse(); PDDocument document = parser.getPDDocument(); if (document.isEncrypted()) { try { document.decrypt(""); document.setAllSecurityToBeRemoved(true); } catch (Exception e) { throw new IOException("Unable to extract text: document encrypted", e); } } CharArrayWriter writer = new CharArrayWriter(); PDFTextStripper stripper = new PDFTextStripper(); stripper.setLineSeparator("\n"); stripper.writeText(document, writer); String st = writer.toString().trim(); log.debug("TextStripped: '{}'", st); if (Config.SYSTEM_PDF_FORCE_OCR || st.length() <= 1) { log.warn("PDF does not contains text layer"); // Extract images from PDF StringBuilder sb = new StringBuilder(); if (!Config.SYSTEM_PDFIMAGES.isEmpty()) { File tmpPdf = FileUtils.createTempFile("pdf"); File tmpDir = new File(EnvironmentDetector.getTempDir()); String baseName = FileUtils.getFileName(tmpPdf.getName()); document.save(tmpPdf); int pgNum = 1; try { for (PDPage page : (List<PDPage>) document.getDocumentCatalog().getAllPages()) { HashMap<String, Object> hm = new HashMap<String, Object>(); hm.put("fileIn", tmpPdf.getPath()); hm.put("firstPage", pgNum); hm.put("lastPage", pgNum++); hm.put("imageRoot", tmpDir + File.separator + baseName); String cmd = TemplateUtils.replace("SYSTEM_PDFIMAGES", Config.SYSTEM_PDFIMAGES, hm); ExecutionUtils.runCmd(cmd); for (File tmp : tmpDir.listFiles()) { if (tmp.getName().startsWith(baseName + "-")) { if (page.findRotation() > 0) { ImageUtils.rotate(tmp, tmp, page.findRotation()); } try { String txt = doOcr(tmp); sb.append(txt).append(" "); log.debug("OCR Extracted: {}", txt); } finally { FileUtils.deleteQuietly(tmp); } } } } } finally { FileUtils.deleteQuietly(tmpPdf); } } else { for (PDPage page : (List<PDPage>) document.getDocumentCatalog().getAllPages()) { PDResources resources = page.getResources(); Map<String, PDXObject> images = resources.getXObjects(); if (images != null) { for (String key : images.keySet()) { PDXObjectImage image = (PDXObjectImage) images.get(key); String prefix = "img-" + key + "-"; File pdfImg = null; try { pdfImg = File.createTempFile(prefix, ".png"); log.debug("Writing image: {}", pdfImg.getPath()); // Won't work until PDFBox 1.8.9 ImageIO.write(image.getRGBImage(), "png", pdfImg); if (page.findRotation() > 0) { ImageUtils.rotate(pdfImg, pdfImg, page.findRotation()); } // Do OCR String txt = doOcr(pdfImg); sb.append(txt).append(" "); log.debug("OCR Extracted: {}", txt); } finally { FileUtils.deleteQuietly(pdfImg); } } } } } return sb.toString(); } else { return writer.toString(); } } finally { try { PDDocument doc = parser.getPDDocument(); if (doc != null) { doc.close(); } } catch (IOException e) { // ignore } } } catch (Exception e) { // it may happen that PDFParser throws a runtime // exception when parsing certain pdf documents log.warn("Failed to extract PDF text content", e); throw new IOException(e.getMessage(), e); } finally { stream.close(); } }