List of usage examples for org.apache.pdfbox.text.PDFTextStripper#getText
public String getText(PDDocument doc) throws IOException
From source file:org.knime.ext.textprocessing.nodes.source.parser.pdf.PDFDocumentParser.java
License:Open Source License
private Document parseInternal(final InputStream is) throws Exception { m_currentDoc = new DocumentBuilder(m_tokenizerName); m_currentDoc.setDocumentFile(new File(m_docPath)); m_currentDoc.setDocumentType(m_type); m_currentDoc.addDocumentCategory(m_category); m_currentDoc.addDocumentSource(m_source); if (m_charset == null) { m_charset = Charset.defaultCharset(); }/*w w w . j a v a 2 s . c om*/ PDDocument document = null; try { document = PDDocument.load(is); // extract text from pdf PDFTextStripper stripper = new PDFTextStripper(); stripper.setSortByPosition(true); String text = stripper.getText(document); m_currentDoc.addSection(text, SectionAnnotation.UNKNOWN); // extract meta data from pdf String title = null; String authors = null; if (m_filenameAsTitle) { title = m_docPath.toString().trim(); } PDDocumentInformation information = document.getDocumentInformation(); if (information != null) { if (!checkTitle(title)) { title = information.getTitle(); } authors = information.getAuthor(); } // if title meta data does not exist use first sentence if (!checkTitle(title)) { List<Section> sections = m_currentDoc.getSections(); if (sections.size() > 0) { try { title = sections.get(0).getParagraphs().get(0).getSentences().get(0).getText().trim(); } catch (IndexOutOfBoundsException e) { LOGGER.debug("Parsed PDF document " + m_docPath + " is empty."); title = ""; } } } // if no useful first sentence exist use filename if (!checkTitle(title)) { title = m_docPath.toString().trim(); } m_currentDoc.addTitle(title); // use author meta data if (authors != null) { Set<Author> authSet = AuthorUtil.parseAuthors(authors); for (Author a : authSet) { m_currentDoc.addAuthor(a); } } // add document to list return m_currentDoc.createDocument(); } finally { if (document != null) { document.close(); } } }
From source file:org.titans.fyp.webcrawler.PageCollector.java
License:Open Source License
private static void pdfToText(String pdfURL) { pdfURL = "https://" + pdfURL.split("://")[1]; // System.out.println(pdfURL); try {/*from w w w. j a v a 2s. co m*/ PDDocument pddDocument = PDDocument.load((new URL(pdfURL)).openStream()); PDFTextStripper textStripper = new PDFTextStripper(); String doc = textStripper.getText(pddDocument); pddDocument.close(); System.out.println(doc); } catch (Exception e) { e.getMessage(); } }
From source file:org.vesalainen.ham.pdf.RfaxTest.java
License:Open Source License
/**
 * Extracts the text of {@code rfax.pdf} and writes it to {@code src/main/resources/rfax.txt}.
 * Nothing is written when the PDF is encrypted.
 *
 * @throws IOException if the PDF cannot be read or the text file cannot be written
 */
public void test() throws IOException {
    // try-with-resources closes the document even when extraction or writing throws.
    try (PDDocument document = PDDocument.load(new File("rfax.pdf"))) {
        if (document.isEncrypted()) {
            return;
        }
        PDFTextStripper stripper = new PDFTextStripper();
        String text = stripper.getText(document);
        try (BufferedWriter bw = Files.newBufferedWriter(Paths.get("src", "main", "resources", "rfax.txt"))) {
            bw.write(text);
        }
    }
}
From source file:PDF.PDFTest.java
License:Apache License
public static void constrainText(String start, String end, File file) throws IOException { PDDocument doc = PDDocument.load(file); PDFTextStripper stripper = new PDFTextStripper(); String text = stripper.getText(doc); // we get the text of the entire // document into a String String[] split_on_start = text.split(start); // split on the start // parameter, take upper // bound// ww w . j av a2s . c om String[] split_on_end = split_on_start[1].split(end); // split on end // parameter, // take lower // bound String constrained_string = start; constrained_string += split_on_end[0]; // the final string will be the // area in between start and end doc.close(); System.out.print(constrained_string); }
From source file:pdf.to.info.PDF.java
/**
 * Reads the text of a PDF file.
 *
 * <p>NOTE(review): the document obtained from {@code ReadPDDoc} is not closed here; confirm
 * that its owner releases it.
 *
 * @param filePath path handed to {@code ReadPDDoc} to obtain the document
 * @return the text extracted from the document
 * @throws java.io.IOException if the stripper cannot be created or extraction fails
 */
public String ReadText(String filePath) throws IOException {
    return new PDFTextStripper().getText(ReadPDDoc(filePath));
}
From source file:PDSL.PDFProcessor.java
public void pdfToText(String dirFrom, String dirTo) throws IOException { File pdfFolder = new File(dirFrom); File[] listOfPDF = pdfFolder.listFiles(); for (File thePDF : listOfPDF) { PDDocument pdDoc = PDDocument.load(thePDF); PDFTextStripper pdfStripper = new PDFTextStripper(); String parsedText = pdfStripper.getText(pdDoc); PrintWriter out = new PrintWriter(dirTo + "/" + thePDF.getName().replace(".pdf", ".txt")); out.write(parsedText);//ww w. j a v a 2 s. c o m out.close(); } }
From source file:src.controller.DocumentController.java
public void convertToDocX(File filepath, File filename) { PDDocument documentpdf;//from w ww.j a va 2 s . c om try { documentpdf = PDDocument.load(filepath); PDFTextStripper pdfStripper = new PDFTextStripper(); pdfStripper.setStartPage(1); //pdfStripper.setEndPage( 1 ); String parsedText = pdfStripper.getText(documentpdf); System.out.println(parsedText); // enregistrement du document dans un fichier FileOutputStream out = new FileOutputStream(filepath + filename.toString() + ".docx"); XWPFDocument document = new XWPFDocument(); //create Paragraph XWPFParagraph paragraph = document.createParagraph(); XWPFRun run = paragraph.createRun(); run.setText(parsedText); document.write(out); out.close(); } catch (IOException ex) { Logger.getLogger(DocumentController.class.getName()).log(Level.SEVERE, null, ex); } }
From source file:uk.org.openeyes.DICOMCommonFunctions.java
/** * * @param binData//from w w w . j a v a 2 s . co m * @return */ protected String parsePDFData(byte[] binData) { PDFParser parser = null; PDDocument pdDoc = null; COSDocument cosDoc = null; PDFTextStripper pdfStripper; String parsedText = ""; RandomAccessRead pdfData = new RandomAccessBuffer(binData); try { parser = new PDFParser(pdfData); parser.parse(); //cosDoc = parser.getDocument(); pdfStripper = new PDFTextStripper(); pdDoc = parser.getPDDocument(); if (debug) { PDFFunctions PDFHelper = new PDFFunctions(); debugMessage("<<<<<<< PDFDATA >>>>>>>>\n"); //PDFHelper.dumpPDFStructure(pdDoc); PDFHelper.dumpPDFContent(pdDoc); debugMessage("<<<<<<< /PDFDATA >>>>>>>>\n"); } parsedText = pdfStripper.getText(pdDoc); //debugMessage("<<<<<<< PDFDATA >>>>>>>>\n"+parsedText+"<<<<<<< /PDFDATA >>>>>>>>\n"); } catch (Exception e) { e.printStackTrace(); try { if (cosDoc != null) cosDoc.close(); if (pdDoc != null) pdDoc.close(); } catch (Exception e1) { e.printStackTrace(); } } return parsedText; }
From source file:uk.org.openeyes.DICOMHFAVF.java
/** * * @param Attrs//from www. ja v a2s. c o m * @throws IOException */ public void collectData(Attributes Attrs) throws IOException { if (Attrs.contains(parser.getTagInteger("00420011"))) { byte[] pdfbytes = Attrs.getBytes(parser.getTagInteger("00420011")); FileOutputStream pdffos = new FileOutputStream("d:\\work\\wombex\\WombexUK\\AcrossHealth\\" + Attrs.getString(parser.getTagInteger("00420010")) + ".pdf"); pdffos.write(pdfbytes); pdffos.close(); PDFParser parser = null; PDDocument pdDoc = null; COSDocument cosDoc = null; PDFTextStripper pdfStripper; String parsedText = ""; RandomAccessRead pdfData = new RandomAccessBuffer(pdfbytes); try { parser = new PDFParser(pdfData); parser.parse(); cosDoc = parser.getDocument(); pdfStripper = new PDFTextStripper(); pdDoc = new PDDocument(cosDoc); parsedText = pdfStripper.getText(pdDoc); System.out.println(parsedText); debugMessage(parsedText); } catch (Exception e) { e.printStackTrace(); try { if (cosDoc != null) cosDoc.close(); if (pdDoc != null) pdDoc.close(); } catch (Exception e1) { e.printStackTrace(); } } } // this is just a test function for HFA files // 2 main groups: 7717 and 0301 can be extracted if (Attrs.contains(parser.getTagInteger("03010010"))) { debugMessage("Extracting 0301 group..."); debugMessage("Test Type: " + Attrs.getString(parser.getTagInteger("03011000"))); debugMessage("Test strategy: " + Attrs.getString(parser.getTagInteger("03011001"))); debugMessage("Test Pattern: " + Attrs.getString(parser.getTagInteger("03011002"))); debugMessage("Screening Mode: " + Attrs.getString(parser.getTagInteger("03011003"))); debugMessage("Stimulus Color: " + Attrs.getString(parser.getTagInteger("03011004"))); debugMessage("Stimulus Size: " + Attrs.getString(parser.getTagInteger("03011005"))); debugMessage("Blue Yellow: " + Attrs.getString(parser.getTagInteger("03011006"))); debugMessage("PDB Version: " + Attrs.getString(parser.getTagInteger("03011007"))); debugMessage("HFA Raw Data: "); byte[] rawbytes; try { 
rawbytes = Attrs.getBytes(parser.getTagInteger("03011008")); // TODO: move this part into a function, and this is a hack now!!! byte[] correctedBytes = new byte[rawbytes.length - 2]; int j = 0; for (int i = 0; i < rawbytes.length - 1; i++) { if (!String.format("%02X", rawbytes[i]).equals("04")) { correctedBytes[j] = rawbytes[i]; j++; } } FileOutputStream fos = new FileOutputStream( "d:\\work\\wombex\\WombexUK\\AcrossHealth\\byte_test_new.xml"); fos.write(correctedBytes); fos.close(); DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance(); DocumentBuilder dBuilder; Document doc = null; String encapsulatedBinaryData = ""; try { dBuilder = dbFactory.newDocumentBuilder(); try { doc = dBuilder.parse(new ByteArrayInputStream(correctedBytes)); //new FileInputStream("c:\\work\\wombex\\WombexUK\\AcrossHealth\\byte_test.xml")); NodeList binaryNodes; binaryNodes = doc.getElementsByTagName("xio:hfa_II_serial_binhex"); if (binaryNodes.getLength() > 0) { encapsulatedBinaryData = binaryNodes.item(0).getTextContent(); } } catch (SAXException ex) { Logger.getLogger(DICOMParser.class.getName()).log(Level.SEVERE, null, ex); } } catch (ParserConfigurationException ex) { Logger.getLogger(DICOMParser.class.getName()).log(Level.SEVERE, null, ex); } debugMessage(encapsulatedBinaryData); /*FileOutputStream decf = new FileOutputStream("c:\\work\\wombex\\WombexUK\\AcrossHealth\\decompressed.bin"); String todecompress = encapsulatedBinaryData.substring(413,500); debugMessage(todecompress); decompressor.decompress(new ByteArrayInputStream(todecompress.getBytes()), decf); decf.close(); */ extractEncapsulatedBinData(encapsulatedBinaryData); } catch (IOException ex) { Logger.getLogger(DICOMParser.class.getName()).log(Level.SEVERE, null, ex); } debugMessage(Attrs.getString(parser.getTagInteger("03011008"))); } else if (Attrs.contains(parser.getTagInteger("77170010"))) { debugMessage("Extracting 7717 group..."); debugMessage("Test name: " + 
Attrs.getString(parser.getTagInteger("77171001"))); debugMessage("Test strategy: " + Attrs.getString(parser.getTagInteger("77171002"))); debugMessage("Stimulus Size: " + Attrs.getString(parser.getTagInteger("77171003"))); debugMessage("Stimulus Color: " + Attrs.getString(parser.getTagInteger("77171004"))); debugMessage("Background State: " + Attrs.getString(parser.getTagInteger("77171005"))); debugMessage("Foveal Result: " + Attrs.getString(parser.getTagInteger("77171006"))); debugMessage("Screening Mode: " + Attrs.getString(parser.getTagInteger("77171007"))); debugMessage("Fixation Trials: " + Attrs.getString(parser.getTagInteger("77171008"))); debugMessage("Fixation Errors: " + Attrs.getString(parser.getTagInteger("77171009"))); debugMessage("False Positive Percent: " + Attrs.getString(parser.getTagInteger("77171010"))); debugMessage("False Positive Trials : " + Attrs.getString(parser.getTagInteger("77171011"))); debugMessage("False Positive Errors: " + Attrs.getString(parser.getTagInteger("77171012"))); debugMessage("False Negative Percent: " + Attrs.getString(parser.getTagInteger("77171013"))); debugMessage("False Negative Trials : " + Attrs.getString(parser.getTagInteger("77171014"))); debugMessage("False Negative Errors: " + Attrs.getString(parser.getTagInteger("77171015"))); debugMessage("Mean Deviation: " + Attrs.getString(parser.getTagInteger("77171016"))); debugMessage("Mean Deviation Probability: " + Attrs.getString(parser.getTagInteger("77171017"))); debugMessage("Pattern Standard Deviation: " + Attrs.getString(parser.getTagInteger("77171018"))); debugMessage( "Pattern Standard Deviation Probability: " + Attrs.getString(parser.getTagInteger("77171019"))); debugMessage("Short Term Fluctuation: " + Attrs.getString(parser.getTagInteger("77171020"))); debugMessage( "Corrected Pattern Standard Deviation: " + Attrs.getString(parser.getTagInteger("77171021"))); debugMessage("Corrected Pattern Standard Deviation Probability: " + 
Attrs.getString(parser.getTagInteger("77171022"))); debugMessage("Glaucoma Hemifield Test: " + Attrs.getString(parser.getTagInteger("77171023"))); debugMessage("Fixation Monitor: " + Attrs.getString(parser.getTagInteger("77171024"))); debugMessage("Fixation Target: " + Attrs.getString(parser.getTagInteger("77171025"))); debugMessage("Pupil Diameter (in mm): " + Attrs.getString(parser.getTagInteger("77171026"))); debugMessage("Sphere: " + Attrs.getString(parser.getTagInteger("77171027"))); debugMessage("Cylinder: " + Attrs.getString(parser.getTagInteger("77171028"))); debugMessage("Axis: " + Attrs.getString(parser.getTagInteger("77171029"))); debugMessage("Visual Acuity: " + Attrs.getString(parser.getTagInteger("77171030"))); debugMessage("Short Term Fluctuation Probabilit: " + Attrs.getString(parser.getTagInteger("77171031"))); debugMessage("Visual Field Index: " + Attrs.getString(parser.getTagInteger("77171034"))); debugMessage("VFM Sequence:"); if (Attrs.contains(parser.getTagInteger("77171040"))) { Sequence Seq = Attrs.getSequence(parser.getTagInteger("77171040")); for (int sq = 0; sq < Seq.size(); sq++) { Attributes AttrData = (Attributes) Seq.get(sq); debugMessage("> Private creator: " + AttrData.getString(parser.getTagInteger("77170010"))); debugMessage(">> Section Number: " + AttrData.getString(parser.getTagInteger("77171041"))); debugMessage(">> Section Value: " + AttrData.getString(parser.getTagInteger("77171042"))); } } } }