List of usage examples for the org.apache.pdfbox.text.PDFTextStripper constructor PDFTextStripper()
public PDFTextStripper() throws IOException
From source file:com.proquest.demo.allinone.PDFLBase.java
/** * Extract Text using PDF BOX, instead APDFL * * @param fileNamePath/*from www. ja v a2 s . c om*/ * @return * @throws Exception */ public byte[] extractTextPDFBox(final String fileNamePath) throws Exception { final String BLANK_SPACE = " "; final String UTF_8 = "UTF-8"; final PropertyReaderLib libPropertyReaderLib = new PropertyReaderLib(PropertyFileNames.PDFLIBRARY); final String regex = libPropertyReaderLib .getPropertyValue(PdfLibraryKeys.REGEX_TO_REMOVE_FROM_EXTRACTEDTEXT.toString()); byte[] bytesToReturn = null; try { final File file = new File(fileNamePath); final PDDocument pdfDoc = PDDocument.load(file); final PDFTextStripper pdfStripper = new PDFTextStripper(); final String textFromPDF = pdfStripper.getText(pdfDoc); pdfDoc.close(); bytesToReturn = textFromPDF.getBytes(UTF_8); final String textStr = new String(bytesToReturn).replaceAll(regex, BLANK_SPACE); bytesToReturn = textStr.getBytes(); } catch (IOException e) { throw new Exception(e.getMessage()); } return bytesToReturn; }
From source file:com.sastix.cms.common.services.htmltopdf.PdfTest.java
License:Apache License
@Test public void testPdfFromStringTo() throws Exception { // GIVEN an html template containing special characters that java stores in utf-16 internally Pdf pdf = pdfBuilder.build();// ww w . jav a 2 s . co m pdf.addPage("<html><head><meta charset=\"utf-8\"></head><h1>Mller</h1></html>", PageType.htmlAsString); String tempFolder = temporaryFolder.newFolder().getPath(); pdf.saveAs(tempFolder + "/output.pdf"); // WHEN byte[] pdfBytes = pdf.getPDF(); PDFParser parser = new PDFParser( new RandomAccessBufferedFileInputStream(new ByteArrayInputStream(pdfBytes))); // that is a valid PDF (otherwise an IOException occurs) parser.parse(); PDFTextStripper pdfTextStripper = new PDFTextStripper(); String pdfText = pdfTextStripper.getText(new PDDocument(parser.getDocument())); assertThat("document should contain the creditorName", pdfText, containsString("Mller")); }
From source file:com.testautomationguru.utility.PDFUtil.java
License:Apache License
/** * This method returns the content of the document *///from ww w .j av a 2s . c o m private String getPDFText(String file, int startPage, int endPage) throws IOException { logger.info("file : " + file); logger.info("startPage : " + startPage); logger.info("endPage : " + endPage); PDDocument doc = PDDocument.load(new File(file)); PDFTextStripper localStripper = new PDFTextStripper(); if (null != this.stripper) { localStripper = this.stripper; } this.updateStartAndEndPages(file, startPage, endPage); localStripper.setStartPage(this.startPage); localStripper.setEndPage(this.endPage); String txt = localStripper.getText(doc); logger.info("PDF Text before trimming : " + txt); if (this.bTrimWhiteSpace) { txt = txt.trim().replaceAll("\\s+", " ").trim(); logger.info("PDF Text after trimming : " + txt); } doc.close(); return txt; }
From source file:com.validation.manager.core.server.core.AttachmentServerTest.java
License:Apache License
/** * Test of addFile method, of class AttachmentServer. *//*from w w w .j ava2 s . c o m*/ @Test public void testAddRetrieveTextFile() { try { System.out.println("add text File"); File f = new File("target/Test.txt"); f.deleteOnExit(); List<String> lines = Arrays.asList("The first line", "The second line"); Path file = Paths.get(f.getAbsolutePath()); Files.write(file, lines, Charset.forName("UTF-8")); AttachmentServer instance = new AttachmentServer(); instance.addFile(f, f.getName()); instance.write2DB(); //Delete the file FileUtils.delete(f.getAbsolutePath()); assertEquals(1, (int) instance.getAttachmentType().getId());//Text file System.out.println("retrieveFile"); AttachmentServer temp = new AttachmentServer(instance.getAttachmentPK()); File loadedFile = temp.getAttachedFile("target/loaded/"); BufferedReader br = new BufferedReader(new FileReader(loadedFile)); String line; int count = 0; while ((line = br.readLine()) != null) { assertEquals(lines.get(count), line); System.out.println(line); count++; } assertEquals(lines.size(), count); //Create pdf file System.out.println("add pdf File"); File pdf = Tool.convertToPDF(loadedFile, "target/Text.pdf"); pdf.deleteOnExit(); instance = new AttachmentServer(); instance.addFile(pdf, pdf.getName()); instance.write2DB(); //Delete the file FileUtils.delete(pdf.getAbsolutePath()); assertEquals(2, (int) instance.getAttachmentType().getId());//PDF file System.out.println("retrieveFile"); temp = new AttachmentServer(instance.getAttachmentPK()); loadedFile = temp.getAttachedFile("target/loaded/"); PDFTextStripper pdfStripper; PDDocument pdDoc = null; COSDocument cosDoc = null; try { PDFParser parser = new PDFParser(new RandomAccessBufferedFileInputStream(loadedFile)); parser.parse(); cosDoc = parser.getDocument(); pdfStripper = new PDFTextStripper(); pdDoc = new PDDocument(cosDoc); pdfStripper.setStartPage(1); pdfStripper.setEndPage(1); String parsedText = pdfStripper.getText(pdDoc); System.out.println(parsedText); } catch 
(IOException ex) { Exceptions.printStackTrace(ex); fail(); } finally { if (cosDoc != null) { cosDoc.close(); } if (pdDoc != null) { pdDoc.close(); } } } catch (IOException | VMException ex) { Exceptions.printStackTrace(ex); fail(); } }
From source file:converter.PDFPac.java
/** * vrati text z pdf/*from www . jav a 2 s. c om*/ * * @param url * @return */ private String getTextFromPDF(String url) { String text = ""; try { PDDocument pdDoc = PDDocument.load(new File(url)); PDFTextStripper pdfStripper = new PDFTextStripper(); text = pdfStripper.getText(pdDoc); pdDoc.close(); } catch (IOException ex) { logger.warning("PDFPac soubor nebyl nalezen " + url + "chyba " + ex); } return text; }
From source file:cz.incad.kramerius.k5indexer.KrameriusPDFDocument.java
public String getPage(int page) throws Exception { logger.log(Level.INFO, "Getting page {0}", page); try {// w w w . j a v a 2s. co m PDFTextStripper stripper = new PDFTextStripper(/*"UTF-8"*/); if (page != -1) { stripper.setStartPage(page); stripper.setEndPage(page); } return StringEscapeUtils.escapeXml(stripper.getText(pdDoc)); } catch (Exception ex) { return ""; } }
From source file:cz.mzk.editor.server.handler.GetOcrFromPdfHandler.java
License:Open Source License
/**
 * Extracts the text of the PDF file at the given path.
 *
 * @param fileName path of the PDF file
 * @return the text extracted from the document
 * @throws ActionException if the path is not an existing file, or the PDF
 *         cannot be opened or parsed
 */
private String pdftoText(String fileName) throws ActionException {
    File pdfFile = new File(fileName);
    if (!pdfFile.isFile()) {
        LOGGER.error("The file: " + fileName + " does not exist.");
        throw new ActionException("Unable to parse the pdf file.");
    }
    PDFParser parser = null;
    COSDocument cosDoc = null;
    PDFTextStripper pdfStripper;
    PDDocument pdDoc = null;
    String parsedText;
    try {
        // Hand the File to PDFBox directly rather than wrapping it in a
        // FileInputStream first.
        parser = new PDFParser(new RandomAccessBufferedFileInputStream(pdfFile));
    } catch (Exception e) {
        LOGGER.error("Unable to open PDF Parser.: " + e);
        e.printStackTrace();
        throw new ActionException("Unable to parse the pdf file.");
    }
    try {
        parser.parse();
        cosDoc = parser.getDocument();
        pdfStripper = new PDFTextStripper();
        pdDoc = new PDDocument(cosDoc);
        parsedText = pdfStripper.getText(pdDoc);
    } catch (Exception e) {
        LOGGER.error("An exception occured in parsing the PDF Document.");
        e.printStackTrace();
        throw new ActionException("Unable to parse the pdf file. " + e);
    } finally {
        // Close each handle in its own try so that a failure while closing one
        // does not prevent the other from being closed.
        try {
            if (cosDoc != null) {
                cosDoc.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        try {
            if (pdDoc != null) {
                pdDoc.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
    return parsedText;
}
From source file:data.PDFManager.java
/** * /* w ww . j ava2 s. c o m*/ * @return String do conteudo do pdf * @throws IOException */ public String ToText() throws IOException { this.pdfStripper = null; this.pdDoc = null; this.cosDoc = null; file = new File(filePath); parser = new PDFParser(new RandomAccessFile(file, "r")); parser.parse(); cosDoc = parser.getDocument(); pdfStripper = new PDFTextStripper(); pdDoc = new PDDocument(cosDoc); pdDoc.getNumberOfPages(); pdfStripper.setStartPage(1); pdfStripper.setEndPage(pdDoc.getNumberOfPages()); Text = pdfStripper.getText(pdDoc); return Text; }
From source file:dk.defxws.fedoragsearch.server.TransformerToText.java
License:Open Source License
public static StringBuffer getTextFromPDF(PDDocument pdDoc, String pageNum) throws Exception { StringBuffer docText = new StringBuffer(); String password = ""; // extract PDF document's textual content try {/*from www. ja va 2 s . co m*/ PDFTextStripper stripper = new PDFTextStripper(/*"UTF-8"*/); int page = Integer.parseInt(pageNum); if (page != -1) { stripper.setStartPage(page); stripper.setEndPage(page); } docText = new StringBuffer(stripper.getText(pdDoc)); } catch (IOException e) { throw new Exception("Cannot parse PDF document", e); } return docText; }
From source file:dk.defxws.fedoragsearch.server.TransformerToText.java
License:Open Source License
/** * //from w w w . jav a2 s. c om * * @throws Exception. */ private StringBuffer getTextFromPDF(byte[] doc, String pageNum) throws Exception { StringBuffer docText = new StringBuffer(); PDDocument pdDoc = null; String password = ""; // extract PDF document's textual content try { PDFTextStripper stripper = new PDFTextStripper(/*"UTF-8"*/); int page = Integer.parseInt(pageNum); if (page != -1) { stripper.setStartPage(page); stripper.setEndPage(page); } //password pdDoc = PDDocument.load(new ByteArrayInputStream(doc), password); // new PDDocument(cosDoc); docText = new StringBuffer(stripper.getText(pdDoc)); } catch (IOException e) { throw new Exception("Cannot parse PDF document", e); } finally { closePDDocument(pdDoc); } return docText; }