List of usage examples for org.apache.pdfbox.pdmodel PDDocument PDDocument
public PDDocument(COSDocument doc)
From source file:org.nuxeo.typeDocPkg.TestPdfBoxN.java
License:Apache License
/**
 * Opens the given PDF file and populates the {@code file}, {@code parser}, {@code cosDoc},
 * {@code pdfStripper} and {@code pdDoc} fields from it.
 *
 * @param FileName path of the PDF file to load
 * @return {@code true} when the document was parsed and the fields were populated;
 *         {@code false} when the file does not exist, cannot be opened, or fails to parse
 * @throws Exception kept for interface compatibility; I/O and parse failures handled here
 *         are reported on stderr and signalled through the return value
 */
private boolean setMain(String FileName) throws Exception {
    file = new File(FileName);
    if (!file.isFile()) {
        // Report the path that was actually requested, not a fixed placeholder name.
        System.err.println("File " + FileName + " does not exist.");
        return false;
    }

    FileInputStream input = null;
    try {
        input = new FileInputStream(file);
        parser = new PDFParser(input);
    } catch (IOException e) {
        System.err.println("Unable to open PDF Parser. " + e.getMessage());
        if (input != null) {
            try {
                input.close();
            } catch (IOException ignored) {
                // The open failure above is the error worth reporting.
            }
        }
        return false;
    }

    try {
        parser.parse();
        cosDoc = parser.getDocument();
        pdfStripper = new PDFTextStripper();
        pdDoc = new PDDocument(cosDoc);
    } catch (Exception e) {
        // Previously swallowed silently; say why loading failed before giving up.
        System.err.println("Unable to parse PDF document " + FileName + ". " + e.getMessage());
        try {
            input.close();
        } catch (IOException ignored) {
            // The parse failure above is the error worth reporting.
        }
        return false;
    }
    return true;
}
From source file:org.wso2.carbon.apimgt.impl.indexing.indexer.DocumentIndexer.java
License:Open Source License
/** * Write document content to document artifact as its raw content * * @param registry/*from www . j a v a 2 s . c om*/ * @param documentResource * @return * @throws RegistryException * @throws IOException * @throws APIManagementException */ private String fetchDocumentContent(Registry registry, Resource documentResource) throws RegistryException, IOException, APIManagementException { GenericArtifactManager docArtifactManager = APIUtil.getArtifactManager(registry, APIConstants.DOCUMENTATION_KEY); GenericArtifact documentArtifact = docArtifactManager.getGenericArtifact(documentResource.getUUID()); String sourceType = documentArtifact.getAttribute(APIConstants.DOC_SOURCE_TYPE); String contentString = null; if (Documentation.DocumentSourceType.FILE.name().equals(sourceType)) { Association fileAssociations[] = registry.getAssociations(documentResource.getPath(), APIConstants.DOCUMENTATION_FILE_ASSOCIATION); Association fileAssociation; if (fileAssociations.length < 1) { String error = "No document associated to API"; log.error(error); throw new APIManagementException(error); } //a file document can have one file association fileAssociation = fileAssociations[0]; String contentPath = fileAssociation.getDestinationPath(); if (!registry.resourceExists(contentPath)) { String error = "API not found at " + contentPath; log.error(error); throw new APIManagementException(error); } Resource contentResource = registry.get(contentPath); String fileName = ((ResourceImpl) contentResource).getName(); String extension = FilenameUtils.getExtension(fileName); InputStream inputStream = null; try { inputStream = contentResource.getContentStream(); switch (extension) { case APIConstants.PDF_EXTENSION: PDFParser pdfParser = new PDFParser(inputStream); pdfParser.parse(); COSDocument cosDocument = pdfParser.getDocument(); PDFTextStripper stripper = new PDFTextStripper(); contentString = stripper.getText(new PDDocument(cosDocument)); break; case APIConstants.DOC_EXTENSION: { POIFSFileSystem 
pfs = new POIFSFileSystem(inputStream); WordExtractor msWord2003Extractor = new WordExtractor(pfs); contentString = msWord2003Extractor.getText(); break; } case APIConstants.DOCX_EXTENSION: XWPFDocument doc = new XWPFDocument(inputStream); XWPFWordExtractor msWord2007Extractor = new XWPFWordExtractor(doc); contentString = msWord2007Extractor.getText(); break; case APIConstants.XLS_EXTENSION: { POIFSFileSystem pfs = new POIFSFileSystem(inputStream); ExcelExtractor extractor = new ExcelExtractor(pfs); contentString = extractor.getText(); break; } case APIConstants.XLSX_EXTENSION: XSSFWorkbook xssfSheets = new XSSFWorkbook(inputStream); XSSFExcelExtractor xssfExcelExtractor = new XSSFExcelExtractor(xssfSheets); contentString = xssfExcelExtractor.getText(); break; case APIConstants.PPT_EXTENSION: { POIFSFileSystem fs = new POIFSFileSystem(inputStream); PowerPointExtractor extractor = new PowerPointExtractor(fs); contentString = extractor.getText(); break; } case APIConstants.PPTX_EXTENSION: XMLSlideShow xmlSlideShow = new XMLSlideShow(inputStream); XSLFPowerPointExtractor xslfPowerPointExtractor = new XSLFPowerPointExtractor(xmlSlideShow); contentString = xslfPowerPointExtractor.getText(); break; case APIConstants.TXT_EXTENSION: case APIConstants.WSDL_EXTENSION: case APIConstants.XML_DOC_EXTENSION: BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream)); String line; StringBuilder contentBuilder = new StringBuilder(); while ((line = reader.readLine()) != null) { contentBuilder.append(line); } contentString = contentBuilder.toString(); break; } } finally { IOUtils.closeQuietly(inputStream); } } else if (Documentation.DocumentSourceType.INLINE.name().equals(sourceType)) { Association contentAssociations[] = registry.getAssociations(documentResource.getPath(), APIConstants.DOCUMENTATION_CONTENT_ASSOCIATION); Association contentAssociation; //an inline document can have one or no content associations if (contentAssociations.length == 1) { 
contentAssociation = contentAssociations[0]; String contentPath = contentAssociation.getDestinationPath(); if (registry.resourceExists(contentPath)) { Resource contentResource = registry.get(contentPath); InputStream instream = null; BufferedReader reader = null; String line; try { instream = contentResource.getContentStream(); reader = new BufferedReader(new InputStreamReader(instream)); StringBuilder contentBuilder = new StringBuilder(); while ((line = reader.readLine()) != null) { contentBuilder.append(line); } contentString = contentBuilder.toString(); } finally { if (reader != null) { IOUtils.closeQuietly(reader); } } } } } return contentString; }
From source file:pdf.to.info.PDF.java
/** * Creating a PDDocument object/* ww w.java2 s. co m*/ * * @param filePath * @return * @throws java.io.IOException */ private PDDocument ReadPDDoc(String filePath) throws IOException { File file = new File(filePath); PDFParser parser = new PDFParser(new RandomAccessFile(file, "r")); // update for PDFBox V 2.0 parser.parse(); COSDocument cosDoc = parser.getDocument(); PDFTextStripper pdfStripper = new PDFTextStripper(); PDDocument pdDoc = new PDDocument(cosDoc); pdDoc.getNumberOfPages(); pdfStripper.setStartPage(1); pdfStripper.setEndPage(1); // for reading all pages of pdf file // pdfStripper.setEndPage(pdDoc.getNumberOfPages()); return pdDoc; }
From source file:pdftotext.Pdfprac2.java
public String pdftoText(String fileName) { PDFParser parser;// w w w.j a v a 2 s . c o m String parsedText = null; ; String location = fileName.replaceAll(".pdf", ".txt"); //System.out.println(location); PDFTextStripper pdfStripper = null; PDDocument pdDoc = null; COSDocument cosDoc = null; File file = new File(fileName); if (!file.isFile()) { System.err.println("File " + fileName + " does not exist."); return null; } try { parser = new PDFParser(new FileInputStream(file)); } catch (IOException e) { System.err.println("Unable to open PDF Parser. " + e.getMessage()); return null; } try { parser.parse(); cosDoc = parser.getDocument(); pdfStripper = new PDFTextStripper(); pdDoc = new PDDocument(cosDoc); //pdfStripper.setStartPage(1); //pdfStripper.setEndPage(5); parsedText = pdfStripper.getText(pdDoc); try (PrintWriter out = new PrintWriter(new BufferedWriter(new FileWriter(location, true)))) { out.println(parsedText); } } catch (IOException e) { System.err.println("An exception occured in parsing the PDF Document." + e.getMessage()); } finally { try { if (cosDoc != null) cosDoc.close(); if (pdDoc != null) pdDoc.close(); } catch (Exception e) { e.printStackTrace(); } } return location; }
From source file:pln.Pln.java
/**
 * Reads a PDF file and returns the text of all of its pages.
 *
 * @param fileName path of the PDF file to read
 * @return the extracted text, or {@code null} when the file does not exist, the parser cannot
 *         be opened, or extraction fails (failures are reported on stderr)
 */
static String pdftoText(String fileName) {
    File source = new File(fileName);
    if (!source.isFile()) {
        System.err.println("File " + fileName + " does not exist.");
        return null;
    }

    PDFParser pdfParser;
    try {
        pdfParser = new PDFParser(new FileInputStream(source));
    } catch (IOException openFailure) {
        System.err.println("Unable to open PDF Parser. " + openFailure.getMessage());
        return null;
    }

    String extracted = null;
    COSDocument lowLevelDoc = null;
    PDDocument document = null;
    try {
        pdfParser.parse();
        lowLevelDoc = pdfParser.getDocument();
        PDFTextStripper stripper = new PDFTextStripper();
        document = new PDDocument(lowLevelDoc);
        // Cover the whole document, first page through last.
        stripper.setStartPage(1);
        stripper.setEndPage(document.getNumberOfPages());
        extracted = stripper.getText(document);
    } catch (Exception parseFailure) {
        System.err.println("An exception occured in parsing the PDF Document." + parseFailure.getMessage());
    } finally {
        try {
            if (lowLevelDoc != null) {
                lowLevelDoc.close();
            }
            if (document != null) {
                document.close();
            }
        } catch (Exception closeFailure) {
            closeFailure.printStackTrace();
        }
    }
    return extracted;
}
From source file:plnwork.PLNwork.java
public static String pdfToText(String path) { PDFTextStripper pdfStripper = null;// w ww .j av a2 s. c o m PDDocument pdDoc = null; COSDocument cosDoc = null; String text = null; File file = new File(path); try { PDFParser parser = new PDFParser(new FileInputStream(file)); parser.parse(); cosDoc = parser.getDocument(); pdfStripper = new PDFTextStripper(); pdDoc = new PDDocument(cosDoc); text = pdfStripper.getText(pdDoc); } catch (IOException e) { e.printStackTrace(); } return text; }
From source file:steffen.haertlein.file.FileObject.java
License:Apache License
private void readPDFDocument() { try {//from w w w . j av a 2 s. co m FileInputStream fs = new FileInputStream(f); String text = ""; PDFParser parser = new PDFParser(fs); parser.parse(); COSDocument cosDoc = parser.getDocument(); PDFTextStripper pdfStripper = new PDFTextStripper(); PDDocument pdDoc = new PDDocument(cosDoc); text = pdfStripper.getText(pdDoc); String[] docxLines = text.split(System.lineSeparator()); for (String line : docxLines) { lines.add(line); } fs.close(); } catch (Exception e) { JOptionPane.showMessageDialog(null, "Fehler in readPDFDocument", "Fehler", JOptionPane.ERROR_MESSAGE); e.printStackTrace(); } }
From source file:uk.org.openeyes.DICOMHFAVF.java
/** * * @param Attrs//from www .j a va2 s .co m * @throws IOException */ public void collectData(Attributes Attrs) throws IOException { if (Attrs.contains(parser.getTagInteger("00420011"))) { byte[] pdfbytes = Attrs.getBytes(parser.getTagInteger("00420011")); FileOutputStream pdffos = new FileOutputStream("d:\\work\\wombex\\WombexUK\\AcrossHealth\\" + Attrs.getString(parser.getTagInteger("00420010")) + ".pdf"); pdffos.write(pdfbytes); pdffos.close(); PDFParser parser = null; PDDocument pdDoc = null; COSDocument cosDoc = null; PDFTextStripper pdfStripper; String parsedText = ""; RandomAccessRead pdfData = new RandomAccessBuffer(pdfbytes); try { parser = new PDFParser(pdfData); parser.parse(); cosDoc = parser.getDocument(); pdfStripper = new PDFTextStripper(); pdDoc = new PDDocument(cosDoc); parsedText = pdfStripper.getText(pdDoc); System.out.println(parsedText); debugMessage(parsedText); } catch (Exception e) { e.printStackTrace(); try { if (cosDoc != null) cosDoc.close(); if (pdDoc != null) pdDoc.close(); } catch (Exception e1) { e.printStackTrace(); } } } // this is just a test function for HFA files // 2 main groups: 7717 and 0301 can be extracted if (Attrs.contains(parser.getTagInteger("03010010"))) { debugMessage("Extracting 0301 group..."); debugMessage("Test Type: " + Attrs.getString(parser.getTagInteger("03011000"))); debugMessage("Test strategy: " + Attrs.getString(parser.getTagInteger("03011001"))); debugMessage("Test Pattern: " + Attrs.getString(parser.getTagInteger("03011002"))); debugMessage("Screening Mode: " + Attrs.getString(parser.getTagInteger("03011003"))); debugMessage("Stimulus Color: " + Attrs.getString(parser.getTagInteger("03011004"))); debugMessage("Stimulus Size: " + Attrs.getString(parser.getTagInteger("03011005"))); debugMessage("Blue Yellow: " + Attrs.getString(parser.getTagInteger("03011006"))); debugMessage("PDB Version: " + Attrs.getString(parser.getTagInteger("03011007"))); debugMessage("HFA Raw Data: "); byte[] rawbytes; try { 
rawbytes = Attrs.getBytes(parser.getTagInteger("03011008")); // TODO: move this part into a function, and this is a hack now!!! byte[] correctedBytes = new byte[rawbytes.length - 2]; int j = 0; for (int i = 0; i < rawbytes.length - 1; i++) { if (!String.format("%02X", rawbytes[i]).equals("04")) { correctedBytes[j] = rawbytes[i]; j++; } } FileOutputStream fos = new FileOutputStream( "d:\\work\\wombex\\WombexUK\\AcrossHealth\\byte_test_new.xml"); fos.write(correctedBytes); fos.close(); DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance(); DocumentBuilder dBuilder; Document doc = null; String encapsulatedBinaryData = ""; try { dBuilder = dbFactory.newDocumentBuilder(); try { doc = dBuilder.parse(new ByteArrayInputStream(correctedBytes)); //new FileInputStream("c:\\work\\wombex\\WombexUK\\AcrossHealth\\byte_test.xml")); NodeList binaryNodes; binaryNodes = doc.getElementsByTagName("xio:hfa_II_serial_binhex"); if (binaryNodes.getLength() > 0) { encapsulatedBinaryData = binaryNodes.item(0).getTextContent(); } } catch (SAXException ex) { Logger.getLogger(DICOMParser.class.getName()).log(Level.SEVERE, null, ex); } } catch (ParserConfigurationException ex) { Logger.getLogger(DICOMParser.class.getName()).log(Level.SEVERE, null, ex); } debugMessage(encapsulatedBinaryData); /*FileOutputStream decf = new FileOutputStream("c:\\work\\wombex\\WombexUK\\AcrossHealth\\decompressed.bin"); String todecompress = encapsulatedBinaryData.substring(413,500); debugMessage(todecompress); decompressor.decompress(new ByteArrayInputStream(todecompress.getBytes()), decf); decf.close(); */ extractEncapsulatedBinData(encapsulatedBinaryData); } catch (IOException ex) { Logger.getLogger(DICOMParser.class.getName()).log(Level.SEVERE, null, ex); } debugMessage(Attrs.getString(parser.getTagInteger("03011008"))); } else if (Attrs.contains(parser.getTagInteger("77170010"))) { debugMessage("Extracting 7717 group..."); debugMessage("Test name: " + 
Attrs.getString(parser.getTagInteger("77171001"))); debugMessage("Test strategy: " + Attrs.getString(parser.getTagInteger("77171002"))); debugMessage("Stimulus Size: " + Attrs.getString(parser.getTagInteger("77171003"))); debugMessage("Stimulus Color: " + Attrs.getString(parser.getTagInteger("77171004"))); debugMessage("Background State: " + Attrs.getString(parser.getTagInteger("77171005"))); debugMessage("Foveal Result: " + Attrs.getString(parser.getTagInteger("77171006"))); debugMessage("Screening Mode: " + Attrs.getString(parser.getTagInteger("77171007"))); debugMessage("Fixation Trials: " + Attrs.getString(parser.getTagInteger("77171008"))); debugMessage("Fixation Errors: " + Attrs.getString(parser.getTagInteger("77171009"))); debugMessage("False Positive Percent: " + Attrs.getString(parser.getTagInteger("77171010"))); debugMessage("False Positive Trials : " + Attrs.getString(parser.getTagInteger("77171011"))); debugMessage("False Positive Errors: " + Attrs.getString(parser.getTagInteger("77171012"))); debugMessage("False Negative Percent: " + Attrs.getString(parser.getTagInteger("77171013"))); debugMessage("False Negative Trials : " + Attrs.getString(parser.getTagInteger("77171014"))); debugMessage("False Negative Errors: " + Attrs.getString(parser.getTagInteger("77171015"))); debugMessage("Mean Deviation: " + Attrs.getString(parser.getTagInteger("77171016"))); debugMessage("Mean Deviation Probability: " + Attrs.getString(parser.getTagInteger("77171017"))); debugMessage("Pattern Standard Deviation: " + Attrs.getString(parser.getTagInteger("77171018"))); debugMessage( "Pattern Standard Deviation Probability: " + Attrs.getString(parser.getTagInteger("77171019"))); debugMessage("Short Term Fluctuation: " + Attrs.getString(parser.getTagInteger("77171020"))); debugMessage( "Corrected Pattern Standard Deviation: " + Attrs.getString(parser.getTagInteger("77171021"))); debugMessage("Corrected Pattern Standard Deviation Probability: " + 
Attrs.getString(parser.getTagInteger("77171022"))); debugMessage("Glaucoma Hemifield Test: " + Attrs.getString(parser.getTagInteger("77171023"))); debugMessage("Fixation Monitor: " + Attrs.getString(parser.getTagInteger("77171024"))); debugMessage("Fixation Target: " + Attrs.getString(parser.getTagInteger("77171025"))); debugMessage("Pupil Diameter (in mm): " + Attrs.getString(parser.getTagInteger("77171026"))); debugMessage("Sphere: " + Attrs.getString(parser.getTagInteger("77171027"))); debugMessage("Cylinder: " + Attrs.getString(parser.getTagInteger("77171028"))); debugMessage("Axis: " + Attrs.getString(parser.getTagInteger("77171029"))); debugMessage("Visual Acuity: " + Attrs.getString(parser.getTagInteger("77171030"))); debugMessage("Short Term Fluctuation Probabilit: " + Attrs.getString(parser.getTagInteger("77171031"))); debugMessage("Visual Field Index: " + Attrs.getString(parser.getTagInteger("77171034"))); debugMessage("VFM Sequence:"); if (Attrs.contains(parser.getTagInteger("77171040"))) { Sequence Seq = Attrs.getSequence(parser.getTagInteger("77171040")); for (int sq = 0; sq < Seq.size(); sq++) { Attributes AttrData = (Attributes) Seq.get(sq); debugMessage("> Private creator: " + AttrData.getString(parser.getTagInteger("77170010"))); debugMessage(">> Section Number: " + AttrData.getString(parser.getTagInteger("77171041"))); debugMessage(">> Section Value: " + AttrData.getString(parser.getTagInteger("77171042"))); } } } }
From source file:vortext.TextHighlight.java
License:Apache License
public static void main(final String args[]) throws Exception { if (args.length != 3) { usage();// w ww .j a v a 2 s . c o m } PDDocument pdDoc = null; final File file = new File(args[0]); if (!file.isFile()) { System.err.println("File " + args[0] + " does not exist."); return; } final PDFParser parser = new PDFParser(new FileInputStream(file)); parser.parse(); pdDoc = new PDDocument(parser.getDocument()); final TextHighlight pdfHighlight = new TextHighlight("UTF-8"); // depends on what you want to match, but this creates a long string // without newlines pdfHighlight.setSkipAllWhitespace(true); pdfHighlight.setNormalizeText(true); pdfHighlight.initialize(pdDoc); List<PDAnnotationTextMarkup> highlightDefault = pdfHighlight.highlightDefault(args[2]); pdDoc.save(args[1]); try { if (parser.getDocument() != null) { parser.getDocument().close(); } if (pdDoc != null) { pdDoc.close(); } } catch (final Exception e) { e.printStackTrace(); } }