List of usage examples for org.apache.pdfbox.text PDFTextStripper getText
public String getText(PDDocument doc) throws IOException
From source file:ExtractTextFromPdf.java
public static void main(String[] args) { PDFParser parser = null;//from ww w . jav a 2s. com PDDocument pdDoc = null; COSDocument cosDoc = null; PDFTextStripper pdfStripper; String parsedText; String fileName = "C:/Users/Kavya Gupta/Desktop/Texas_Title.pdf"; File file = new File(fileName); try { byte data[] = new byte[1024]; ((RandomAccessRead) file).read(data, 0, 1024); pdDoc = PDDocument.load(new File(fileName)); pdfStripper = new PDFTextStripper(); parsedText = pdfStripper.getText(pdDoc); System.out.println(parsedText.replaceAll("[^A-Za-z0-9. ]+", "")); } catch (Exception e) { e.printStackTrace(); try { if (cosDoc != null) cosDoc.close(); if (pdDoc != null) pdDoc.close(); } catch (Exception e1) { e.printStackTrace(); } } }
From source file:com.enginkutuk.pdfboxsample.PDFBoxSample.java
/**
 * Loads the PDF at {@code path}, extracts its text, prints the text to
 * standard output and shows it in a message dialog.
 *
 * <p>Encrypted documents are skipped without any output. Any failure is
 * reported by printing the stack trace.
 *
 * @param path file-system path of the PDF to read
 */
public void readPdfFile(String path) {
    // try-with-resources releases the document even when extraction fails.
    try (PDDocument document = PDDocument.load(new File(path))) {
        if (!document.isEncrypted()) {
            PDFTextStripper textStripper = new PDFTextStripper();
            String st = textStripper.getText(document);
            System.out.println(st);
            JOptionPane.showMessageDialog(null, st);
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}
From source file:com.jt.tool.pdf.CreateBookmarks.java
License:Apache License
/**
 * Extracts the text of a page range from an already-open document.
 *
 * <p>The stripper's start page is set to {@code start} and its end page to
 * {@code start + offset}. The document is not closed here; it stays owned by
 * the caller.
 *
 * @param document the document to read from
 * @param start    value passed to the stripper as the start page
 * @param offset   added to {@code start} to obtain the end page
 * @return the text extracted from the selected pages
 * @throws Exception if the stripper cannot be created or extraction fails
 */
public static String getPageText(PDDocument document, int start, int offset) throws Exception {
    final int lastPage = start + offset;
    final PDFTextStripper textStripper = new PDFTextStripper();
    textStripper.setStartPage(start);
    textStripper.setEndPage(lastPage);
    final String pageText = textStripper.getText(document);
    return pageText;
}
From source file:com.liferay.faces.bridge.test.integration.demo.JSFExportPDFPortletTester.java
License:Open Source License
private String getPDFText(InputStream inputStream) throws IOException { String text = ""; PDDocument pdDocument = null;//from w w w . j a v a 2 s. c o m try { pdDocument = PDDocument.load(inputStream); PDFTextStripper pdfTextStripper = new PDFTextStripper(); text = pdfTextStripper.getText(pdDocument); } finally { ClosableUtil.close(pdDocument); ClosableUtil.close(inputStream); } return text; }
From source file:com.proquest.demo.allinone.PDFLBase.java
/**
 * Extracts the text of a PDF using PDFBox (instead of APDFL), replaces every
 * match of the configured clean-up regex with a single space, and returns the
 * result encoded as UTF-8.
 *
 * @param fileNamePath path of the PDF file to read
 * @return the cleaned text as UTF-8 bytes
 * @throws Exception if the file cannot be loaded, read or closed; the
 *                   underlying {@link IOException} is attached as the cause
 */
public byte[] extractTextPDFBox(final String fileNamePath) throws Exception {
    final String BLANK_SPACE = " ";
    final String UTF_8 = "UTF-8";
    final PropertyReaderLib libPropertyReaderLib = new PropertyReaderLib(PropertyFileNames.PDFLIBRARY);
    final String regex = libPropertyReaderLib
            .getPropertyValue(PdfLibraryKeys.REGEX_TO_REMOVE_FROM_EXTRACTEDTEXT.toString());

    // try-with-resources closes the document even when extraction fails.
    try (PDDocument pdfDoc = PDDocument.load(new File(fileNamePath))) {
        final String textFromPDF = new PDFTextStripper().getText(pdfDoc);
        // Clean the String directly and encode once. The previous round trip
        // (UTF-8 bytes -> platform-charset String -> platform-charset bytes)
        // corrupted non-ASCII text on platforms whose default is not UTF-8.
        return textFromPDF.replaceAll(regex, BLANK_SPACE).getBytes(UTF_8);
    } catch (IOException e) {
        // Keep the original exception as the cause so the stack trace survives.
        throw new Exception(e.getMessage(), e);
    }
}
From source file:com.sastix.cms.common.services.htmltopdf.PdfTest.java
License:Apache License
@Test public void testPdfFromStringTo() throws Exception { // GIVEN an html template containing special characters that java stores in utf-16 internally Pdf pdf = pdfBuilder.build();// w w w. ja v a2s . co m pdf.addPage("<html><head><meta charset=\"utf-8\"></head><h1>Mller</h1></html>", PageType.htmlAsString); String tempFolder = temporaryFolder.newFolder().getPath(); pdf.saveAs(tempFolder + "/output.pdf"); // WHEN byte[] pdfBytes = pdf.getPDF(); PDFParser parser = new PDFParser( new RandomAccessBufferedFileInputStream(new ByteArrayInputStream(pdfBytes))); // that is a valid PDF (otherwise an IOException occurs) parser.parse(); PDFTextStripper pdfTextStripper = new PDFTextStripper(); String pdfText = pdfTextStripper.getText(new PDDocument(parser.getDocument())); assertThat("document should contain the creditorName", pdfText, containsString("Mller")); }
From source file:com.testautomationguru.utility.PDFUtil.java
License:Apache License
/**
 * Returns the text content of the given PDF file for a page range.
 *
 * <p>The requested range is first passed to {@code updateStartAndEndPages};
 * the resulting {@code this.startPage} / {@code this.endPage} values are the
 * ones applied to the stripper. When {@code this.stripper} is set it is used,
 * otherwise a new {@link PDFTextStripper} is created. When
 * {@code bTrimWhiteSpace} is enabled the text is trimmed and every run of
 * whitespace is collapsed into a single space.
 *
 * @param file      path of the PDF file
 * @param startPage requested first page
 * @param endPage   requested last page
 * @return the extracted (and optionally whitespace-normalised) text
 * @throws IOException if the file cannot be loaded or read
 */
private String getPDFText(String file, int startPage, int endPage) throws IOException {
    logger.info("file : " + file);
    logger.info("startPage : " + startPage);
    logger.info("endPage : " + endPage);

    // try-with-resources closes the document even when extraction throws.
    try (PDDocument doc = PDDocument.load(new File(file))) {
        // Only create a stripper when no pre-configured one is available.
        PDFTextStripper localStripper = (null != this.stripper) ? this.stripper : new PDFTextStripper();

        this.updateStartAndEndPages(file, startPage, endPage);
        localStripper.setStartPage(this.startPage);
        localStripper.setEndPage(this.endPage);

        String txt = localStripper.getText(doc);
        logger.info("PDF Text before trimming : " + txt);
        if (this.bTrimWhiteSpace) {
            txt = txt.trim().replaceAll("\\s+", " ").trim();
            logger.info("PDF Text after trimming : " + txt);
        }
        return txt;
    }
}
From source file:com.validation.manager.core.server.core.AttachmentServerTest.java
License:Apache License
/** * Test of addFile method, of class AttachmentServer. *//*from w ww . j a va 2s . c om*/ @Test public void testAddRetrieveTextFile() { try { System.out.println("add text File"); File f = new File("target/Test.txt"); f.deleteOnExit(); List<String> lines = Arrays.asList("The first line", "The second line"); Path file = Paths.get(f.getAbsolutePath()); Files.write(file, lines, Charset.forName("UTF-8")); AttachmentServer instance = new AttachmentServer(); instance.addFile(f, f.getName()); instance.write2DB(); //Delete the file FileUtils.delete(f.getAbsolutePath()); assertEquals(1, (int) instance.getAttachmentType().getId());//Text file System.out.println("retrieveFile"); AttachmentServer temp = new AttachmentServer(instance.getAttachmentPK()); File loadedFile = temp.getAttachedFile("target/loaded/"); BufferedReader br = new BufferedReader(new FileReader(loadedFile)); String line; int count = 0; while ((line = br.readLine()) != null) { assertEquals(lines.get(count), line); System.out.println(line); count++; } assertEquals(lines.size(), count); //Create pdf file System.out.println("add pdf File"); File pdf = Tool.convertToPDF(loadedFile, "target/Text.pdf"); pdf.deleteOnExit(); instance = new AttachmentServer(); instance.addFile(pdf, pdf.getName()); instance.write2DB(); //Delete the file FileUtils.delete(pdf.getAbsolutePath()); assertEquals(2, (int) instance.getAttachmentType().getId());//PDF file System.out.println("retrieveFile"); temp = new AttachmentServer(instance.getAttachmentPK()); loadedFile = temp.getAttachedFile("target/loaded/"); PDFTextStripper pdfStripper; PDDocument pdDoc = null; COSDocument cosDoc = null; try { PDFParser parser = new PDFParser(new RandomAccessBufferedFileInputStream(loadedFile)); parser.parse(); cosDoc = parser.getDocument(); pdfStripper = new PDFTextStripper(); pdDoc = new PDDocument(cosDoc); pdfStripper.setStartPage(1); pdfStripper.setEndPage(1); String parsedText = pdfStripper.getText(pdDoc); System.out.println(parsedText); } catch 
(IOException ex) { Exceptions.printStackTrace(ex); fail(); } finally { if (cosDoc != null) { cosDoc.close(); } if (pdDoc != null) { pdDoc.close(); } } } catch (IOException | VMException ex) { Exceptions.printStackTrace(ex); fail(); } }
From source file:converter.PDFPac.java
/**
 * Returns the text of a PDF file.
 *
 * @param url file-system path of the PDF (passed to {@link File})
 * @return the extracted text, or an empty string if the file could not be
 *         loaded or read (a warning is logged in that case)
 */
private String getTextFromPDF(String url) {
    String text = "";
    // try-with-resources closes the document even when extraction fails.
    try (PDDocument pdDoc = PDDocument.load(new File(url))) {
        PDFTextStripper pdfStripper = new PDFTextStripper();
        text = pdfStripper.getText(pdDoc);
    } catch (IOException ex) {
        logger.warning("PDFPac soubor nebyl nalezen " + url + "chyba " + ex);
    }
    return text;
}
From source file:cz.incad.kramerius.k5indexer.KrameriusPDFDocument.java
/**
 * Returns the XML-escaped text of one page of {@code pdDoc}.
 *
 * <p>When {@code page} is {@code -1} no page range is set on the stripper, so
 * its default range applies. Extraction failures do not propagate: they are
 * logged and an empty string is returned.
 *
 * @param page page number to extract, or {@code -1} to leave the range unset
 * @return the escaped page text, or {@code ""} if extraction failed
 * @throws Exception declared for callers; failures inside the extraction are
 *                   caught and logged instead of being thrown
 */
public String getPage(int page) throws Exception {
    logger.log(Level.INFO, "Getting page {0}", page);
    try {
        PDFTextStripper stripper = new PDFTextStripper(/*"UTF-8"*/);
        if (page != -1) {
            stripper.setStartPage(page);
            stripper.setEndPage(page);
        }
        return StringEscapeUtils.escapeXml(stripper.getText(pdDoc));
    } catch (Exception ex) {
        // Best effort: keep returning an empty page, but record the failure
        // instead of discarding it.
        logger.log(Level.WARNING, "Failed to extract text from page " + page, ex);
        return "";
    }
}