Example usage for the org.apache.pdfbox.pdmodel.PDDocument constructor PDDocument(COSDocument)

Introduction

On this page you can find example usages of the PDDocument(COSDocument) constructor of org.apache.pdfbox.pdmodel.PDDocument.

Prototype

public PDDocument(COSDocument doc)

Source Link

Document

Constructor that uses an existing document.

Usage

From source file:org.nuxeo.typeDocPkg.TestPdfBoxN.java

License:Apache License

/**
 * Opens the PDF at the given path and initialises the {@code file}, {@code parser},
 * {@code cosDoc}, {@code pdfStripper} and {@code pdDoc} members declared outside this method.
 *
 * @param FileName path of the PDF file to load
 * @return {@code true} when the file was parsed and all members were set; {@code false} when
 *         the file does not exist or cannot be opened or parsed
 * @throws Exception declared for callers; the failures handled here are reported on stderr
 *         and turned into a {@code false} result instead
 */
private boolean setMain(String FileName) throws Exception {
    file = new File(FileName);
    if (!file.isFile()) {
        // Report the path that was actually requested rather than a hard-coded name.
        System.err.println("File " + FileName + " does not exist.");
        return false;
    }
    try {
        parser = new PDFParser(new FileInputStream(file));
    } catch (IOException e) {
        System.err.println("Unable to open PDF Parser. " + e.getMessage());
        return false;
    }

    try {
        parser.parse();
        cosDoc = parser.getDocument();
        pdfStripper = new PDFTextStripper();
        pdDoc = new PDDocument(cosDoc);
    } catch (Exception e) {
        // This failure used to be swallowed silently; say why loading failed before giving up.
        System.err.println("Unable to parse PDF document " + FileName + ". " + e.getMessage());
        return false;
    }

    return true;
}

From source file:org.wso2.carbon.apimgt.impl.indexing.indexer.DocumentIndexer.java

License:Open Source License

/**
 * Write document content to document artifact as its raw content
 *
 * @param registry/*from  www . j a v a 2  s . c  om*/
 * @param documentResource
 * @return
 * @throws RegistryException
 * @throws IOException
 * @throws APIManagementException
 */
private String fetchDocumentContent(Registry registry, Resource documentResource)
        throws RegistryException, IOException, APIManagementException {
    GenericArtifactManager docArtifactManager = APIUtil.getArtifactManager(registry,
            APIConstants.DOCUMENTATION_KEY);
    GenericArtifact documentArtifact = docArtifactManager.getGenericArtifact(documentResource.getUUID());
    String sourceType = documentArtifact.getAttribute(APIConstants.DOC_SOURCE_TYPE);

    String contentString = null;
    if (Documentation.DocumentSourceType.FILE.name().equals(sourceType)) {
        Association fileAssociations[] = registry.getAssociations(documentResource.getPath(),
                APIConstants.DOCUMENTATION_FILE_ASSOCIATION);
        Association fileAssociation;

        if (fileAssociations.length < 1) {
            String error = "No document associated to API";
            log.error(error);
            throw new APIManagementException(error);
        }

        //a file document can have one file association
        fileAssociation = fileAssociations[0];
        String contentPath = fileAssociation.getDestinationPath();

        if (!registry.resourceExists(contentPath)) {
            String error = "API not found at " + contentPath;
            log.error(error);
            throw new APIManagementException(error);
        }

        Resource contentResource = registry.get(contentPath);

        String fileName = ((ResourceImpl) contentResource).getName();
        String extension = FilenameUtils.getExtension(fileName);
        InputStream inputStream = null;
        try {
            inputStream = contentResource.getContentStream();
            switch (extension) {
            case APIConstants.PDF_EXTENSION:
                PDFParser pdfParser = new PDFParser(inputStream);
                pdfParser.parse();
                COSDocument cosDocument = pdfParser.getDocument();
                PDFTextStripper stripper = new PDFTextStripper();
                contentString = stripper.getText(new PDDocument(cosDocument));
                break;
            case APIConstants.DOC_EXTENSION: {
                POIFSFileSystem pfs = new POIFSFileSystem(inputStream);
                WordExtractor msWord2003Extractor = new WordExtractor(pfs);
                contentString = msWord2003Extractor.getText();
                break;
            }
            case APIConstants.DOCX_EXTENSION:
                XWPFDocument doc = new XWPFDocument(inputStream);
                XWPFWordExtractor msWord2007Extractor = new XWPFWordExtractor(doc);
                contentString = msWord2007Extractor.getText();
                break;
            case APIConstants.XLS_EXTENSION: {
                POIFSFileSystem pfs = new POIFSFileSystem(inputStream);
                ExcelExtractor extractor = new ExcelExtractor(pfs);
                contentString = extractor.getText();
                break;
            }
            case APIConstants.XLSX_EXTENSION:
                XSSFWorkbook xssfSheets = new XSSFWorkbook(inputStream);
                XSSFExcelExtractor xssfExcelExtractor = new XSSFExcelExtractor(xssfSheets);
                contentString = xssfExcelExtractor.getText();
                break;
            case APIConstants.PPT_EXTENSION: {
                POIFSFileSystem fs = new POIFSFileSystem(inputStream);
                PowerPointExtractor extractor = new PowerPointExtractor(fs);
                contentString = extractor.getText();
                break;
            }
            case APIConstants.PPTX_EXTENSION:
                XMLSlideShow xmlSlideShow = new XMLSlideShow(inputStream);
                XSLFPowerPointExtractor xslfPowerPointExtractor = new XSLFPowerPointExtractor(xmlSlideShow);
                contentString = xslfPowerPointExtractor.getText();
                break;
            case APIConstants.TXT_EXTENSION:
            case APIConstants.WSDL_EXTENSION:
            case APIConstants.XML_DOC_EXTENSION:
                BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream));
                String line;
                StringBuilder contentBuilder = new StringBuilder();
                while ((line = reader.readLine()) != null) {
                    contentBuilder.append(line);
                }
                contentString = contentBuilder.toString();
                break;
            }
        } finally {
            IOUtils.closeQuietly(inputStream);
        }

    } else if (Documentation.DocumentSourceType.INLINE.name().equals(sourceType)) {
        Association contentAssociations[] = registry.getAssociations(documentResource.getPath(),
                APIConstants.DOCUMENTATION_CONTENT_ASSOCIATION);
        Association contentAssociation;

        //an inline document can have one or no content associations
        if (contentAssociations.length == 1) {
            contentAssociation = contentAssociations[0];
            String contentPath = contentAssociation.getDestinationPath();

            if (registry.resourceExists(contentPath)) {
                Resource contentResource = registry.get(contentPath);

                InputStream instream = null;
                BufferedReader reader = null;
                String line;
                try {
                    instream = contentResource.getContentStream();
                    reader = new BufferedReader(new InputStreamReader(instream));
                    StringBuilder contentBuilder = new StringBuilder();
                    while ((line = reader.readLine()) != null) {
                        contentBuilder.append(line);
                    }
                    contentString = contentBuilder.toString();
                } finally {
                    if (reader != null) {
                        IOUtils.closeQuietly(reader);
                    }
                }
            }
        }
    }
    return contentString;
}

From source file:pdf.to.info.PDF.java

/**
 * Creating a PDDocument object/*  ww  w.java2  s.  co  m*/
 *
 * @param filePath
 * @return
 * @throws java.io.IOException
 */
private PDDocument ReadPDDoc(String filePath) throws IOException {
    File file = new File(filePath);
    PDFParser parser = new PDFParser(new RandomAccessFile(file, "r")); // update for PDFBox V 2.0
    parser.parse();
    COSDocument cosDoc = parser.getDocument();
    PDFTextStripper pdfStripper = new PDFTextStripper();
    PDDocument pdDoc = new PDDocument(cosDoc);
    pdDoc.getNumberOfPages();
    pdfStripper.setStartPage(1);
    pdfStripper.setEndPage(1);
    // for reading all pages of pdf file
    // pdfStripper.setEndPage(pdDoc.getNumberOfPages());
    return pdDoc;
}

From source file:pdftotext.Pdfprac2.java

/**
 * Extracts the text of a PDF file and appends it to a text file next to it.
 * <p>
 * The output path is the input path with a trailing ".pdf" (any letter case) replaced by
 * ".txt"; when the input has no such suffix, ".txt" is appended so the output can never be
 * the PDF itself.
 *
 * @param fileName path of the PDF file to convert
 * @return the path of the text file, or {@code null} when the input file does not exist or
 *         the parser cannot be opened; the path is also returned when extraction fails, in
 *         which case the failure is reported on stderr
 */
public String pdftoText(String fileName) {
    // Only the extension is swapped. The previous replaceAll(".pdf", ".txt") treated the
    // pattern as a regex (the dot matching any character) and rewrote every match in the path.
    final String pdfSuffix = ".pdf";
    final int suffixStart = fileName.length() - pdfSuffix.length();
    String location;
    if (fileName.regionMatches(true, suffixStart, pdfSuffix, 0, pdfSuffix.length())) {
        location = fileName.substring(0, suffixStart) + ".txt";
    } else {
        location = fileName + ".txt";
    }

    File file = new File(fileName);
    if (!file.isFile()) {
        System.err.println("File " + fileName + " does not exist.");
        return null;
    }

    PDFParser parser;
    try {
        parser = new PDFParser(new FileInputStream(file));
    } catch (IOException e) {
        System.err.println("Unable to open PDF Parser. " + e.getMessage());
        return null;
    }

    PDDocument pdDoc = null;
    COSDocument cosDoc = null;
    try {
        parser.parse();
        cosDoc = parser.getDocument();
        PDFTextStripper pdfStripper = new PDFTextStripper();
        pdDoc = new PDDocument(cosDoc);
        String parsedText = pdfStripper.getText(pdDoc);
        // Append mode: repeated conversions accumulate in the same text file.
        try (PrintWriter out = new PrintWriter(new BufferedWriter(new FileWriter(location, true)))) {
            out.println(parsedText);
        }
    } catch (IOException e) {
        System.err.println("An exception occured in parsing the PDF Document." + e.getMessage());
    } finally {
        try {
            // Closing the PDDocument also closes the COSDocument it wraps; fall back to the
            // COSDocument only when the wrapper was never created.
            if (pdDoc != null) {
                pdDoc.close();
            } else if (cosDoc != null) {
                cosDoc.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
    return location;
}

From source file:pln.Pln.java

/**
 * Reads a PDF file and returns the text of all of its pages.
 *
 * @param fileName path of the PDF file to read
 * @return the extracted text, or {@code null} when the file does not exist, the parser
 *         cannot be opened, or extraction fails (failures are reported on stderr)
 */
static String pdftoText(String fileName) {
    File pdfFile = new File(fileName);
    if (!pdfFile.isFile()) {
        System.err.println("File " + fileName + " does not exist.");
        return null;
    }

    PDFParser pdfParser;
    try {
        pdfParser = new PDFParser(new FileInputStream(pdfFile));
    } catch (IOException openFailure) {
        System.err.println("Unable to open PDF Parser. " + openFailure.getMessage());
        return null;
    }

    String extracted = null;
    COSDocument lowLevelDoc = null;
    PDDocument document = null;
    try {
        pdfParser.parse();
        lowLevelDoc = pdfParser.getDocument();
        PDFTextStripper stripper = new PDFTextStripper();
        document = new PDDocument(lowLevelDoc);
        // Cover the whole document: first page through the last one.
        stripper.setStartPage(1);
        stripper.setEndPage(document.getNumberOfPages());
        extracted = stripper.getText(document);
    } catch (Exception parseFailure) {
        System.err.println("An exception occured in parsing the PDF Document." + parseFailure.getMessage());
    } finally {
        // Release whatever was opened, whether or not extraction succeeded.
        try {
            if (lowLevelDoc != null) {
                lowLevelDoc.close();
            }
            if (document != null) {
                document.close();
            }
        } catch (Exception closeFailure) {
            closeFailure.printStackTrace();
        }
    }
    return extracted;
}

From source file:plnwork.PLNwork.java

/**
 * Reads a PDF file and returns its text.
 *
 * @param path path of the PDF file to read
 * @return the extracted text, or {@code null} when the file cannot be opened or parsed
 *         (the stack trace of the failure is printed)
 */
public static String pdfToText(String path) {
    PDDocument pdDoc = null;
    COSDocument cosDoc = null;
    String text = null;

    File file = new File(path);
    try {
        PDFParser parser = new PDFParser(new FileInputStream(file));
        parser.parse();
        cosDoc = parser.getDocument();
        PDFTextStripper pdfStripper = new PDFTextStripper();
        pdDoc = new PDDocument(cosDoc);

        text = pdfStripper.getText(pdDoc);
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // The document was previously never closed. Closing the PDDocument also closes the
        // COSDocument it wraps; fall back to the COSDocument if the wrapper was never created.
        try {
            if (pdDoc != null) {
                pdDoc.close();
            } else if (cosDoc != null) {
                cosDoc.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    return text;
}

From source file:steffen.haertlein.file.FileObject.java

License:Apache License

/**
 * Extracts the text of the PDF referenced by {@code f} and adds it, line by line, to
 * {@code lines}. Any failure is shown in an error dialog and its stack trace is printed.
 */
private void readPDFDocument() {
    FileInputStream fs = null;
    COSDocument cosDoc = null;
    PDDocument pdDoc = null;
    try {
        fs = new FileInputStream(f);
        PDFParser parser = new PDFParser(fs);
        parser.parse();
        cosDoc = parser.getDocument();
        PDFTextStripper pdfStripper = new PDFTextStripper();
        pdDoc = new PDDocument(cosDoc);
        String text = pdfStripper.getText(pdDoc);
        for (String line : text.split(System.lineSeparator())) {
            lines.add(line);
        }
    } catch (Exception e) {
        JOptionPane.showMessageDialog(null, "Fehler in readPDFDocument", "Fehler", JOptionPane.ERROR_MESSAGE);
        e.printStackTrace();
    } finally {
        // Release the document and the stream on every path; previously the stream was only
        // closed on success and the document was never closed.
        try {
            if (pdDoc != null) {
                pdDoc.close();
            } else if (cosDoc != null) {
                cosDoc.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        try {
            if (fs != null) {
                fs.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

From source file:uk.org.openeyes.DICOMHFAVF.java

/**
 * Dumps the data found in the given attribute set through {@code debugMessage}.
 * <p>
 * Three things are looked at, each only when its tag is present:
 * <ul>
 * <li>tag 00420011: the bytes are written to a ".pdf" file named after tag 00420010, then
 * parsed as a PDF whose text is printed;</li>
 * <li>group 0301: the test settings are printed and the raw data of tag 03011008 is
 * filtered, written to an XML file, parsed, and the text of its first
 * "xio:hfa_II_serial_binhex" element is passed to {@code extractEncapsulatedBinData};</li>
 * <li>group 7717 (only when group 0301 is absent): the individual test values and the
 * items of sequence 77171040 are printed.</li>
 * </ul>
 *
 * @param Attrs attribute set to inspect
 * @throws IOException for I/O failures outside the guarded sections, e.g. when writing the
 *         extracted PDF file fails; failures inside the PDF and 0301 sections are printed
 *         or logged instead
 */
public void collectData(Attributes Attrs) throws IOException {

    if (Attrs.contains(parser.getTagInteger("00420011"))) {
        byte[] pdfbytes = Attrs.getBytes(parser.getTagInteger("00420011"));

        FileOutputStream pdffos = new FileOutputStream("d:\\work\\wombex\\WombexUK\\AcrossHealth\\"
                + Attrs.getString(parser.getTagInteger("00420010")) + ".pdf");
        try {
            pdffos.write(pdfbytes);
        } finally {
            // Close the file even when the write fails.
            pdffos.close();
        }

        // Named pdfParser so it no longer shadows the "parser" member used for tag lookups.
        PDFParser pdfParser = null;
        PDDocument pdDoc = null;
        COSDocument cosDoc = null;
        PDFTextStripper pdfStripper;
        String parsedText = "";

        RandomAccessRead pdfData = new RandomAccessBuffer(pdfbytes);
        try {
            pdfParser = new PDFParser(pdfData);
            pdfParser.parse();
            cosDoc = pdfParser.getDocument();
            pdfStripper = new PDFTextStripper();
            pdDoc = new PDDocument(cosDoc);
            parsedText = pdfStripper.getText(pdDoc);
            System.out.println(parsedText);
            debugMessage(parsedText);
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Close on every path; previously the documents were only closed after a failure.
            // Closing the PDDocument also closes the COSDocument it wraps.
            try {
                if (pdDoc != null) {
                    pdDoc.close();
                } else if (cosDoc != null) {
                    cosDoc.close();
                }
            } catch (Exception e1) {
                e1.printStackTrace();
            }
        }

    }

    // this is just a test function for HFA files
    // 2 main groups: 7717 and 0301 can be extracted
    if (Attrs.contains(parser.getTagInteger("03010010"))) {
        debugMessage("Extracting 0301 group...");
        debugMessage("Test Type: " + Attrs.getString(parser.getTagInteger("03011000")));
        debugMessage("Test strategy: " + Attrs.getString(parser.getTagInteger("03011001")));
        debugMessage("Test Pattern: " + Attrs.getString(parser.getTagInteger("03011002")));
        debugMessage("Screening Mode: " + Attrs.getString(parser.getTagInteger("03011003")));

        debugMessage("Stimulus Color: " + Attrs.getString(parser.getTagInteger("03011004")));
        debugMessage("Stimulus Size: " + Attrs.getString(parser.getTagInteger("03011005")));
        debugMessage("Blue Yellow: " + Attrs.getString(parser.getTagInteger("03011006")));
        debugMessage("PDB Version: " + Attrs.getString(parser.getTagInteger("03011007")));

        debugMessage("HFA Raw Data: ");
        byte[] rawbytes;

        try {
            rawbytes = Attrs.getBytes(parser.getTagInteger("03011008"));
            // TODO: move this part into a function, and this is a hack now!!!
            // Copies every byte except 0x04 bytes; the last byte is never examined or copied.
            // NOTE(review): the target array has rawbytes.length - 2 slots, so this only fits
            // when at least one of the examined bytes is 0x04 - confirm that always holds.
            byte[] correctedBytes = new byte[rawbytes.length - 2];
            int j = 0;
            for (int i = 0; i < rawbytes.length - 1; i++) {
                if (!String.format("%02X", rawbytes[i]).equals("04")) {
                    correctedBytes[j] = rawbytes[i];
                    j++;
                }
            }

            FileOutputStream fos = new FileOutputStream(
                    "d:\\work\\wombex\\WombexUK\\AcrossHealth\\byte_test_new.xml");
            try {
                fos.write(correctedBytes);
            } finally {
                // Close the file even when the write fails.
                fos.close();
            }

            DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
            DocumentBuilder dBuilder;
            Document doc = null;
            String encapsulatedBinaryData = "";
            try {
                dBuilder = dbFactory.newDocumentBuilder();
                try {
                    doc = dBuilder.parse(new ByteArrayInputStream(correctedBytes));
                    //new FileInputStream("c:\\work\\wombex\\WombexUK\\AcrossHealth\\byte_test.xml"));
                    NodeList binaryNodes;
                    binaryNodes = doc.getElementsByTagName("xio:hfa_II_serial_binhex");
                    if (binaryNodes.getLength() > 0) {
                        encapsulatedBinaryData = binaryNodes.item(0).getTextContent();
                    }

                } catch (SAXException ex) {
                    Logger.getLogger(DICOMParser.class.getName()).log(Level.SEVERE, null, ex);
                }
            } catch (ParserConfigurationException ex) {
                Logger.getLogger(DICOMParser.class.getName()).log(Level.SEVERE, null, ex);
            }

            debugMessage(encapsulatedBinaryData);

            /*FileOutputStream decf = new FileOutputStream("c:\\work\\wombex\\WombexUK\\AcrossHealth\\decompressed.bin");
            String todecompress = encapsulatedBinaryData.substring(413,500);
            debugMessage(todecompress);
            decompressor.decompress(new ByteArrayInputStream(todecompress.getBytes()), decf);
            decf.close();
            */
            // Runs even when XML parsing failed, in which case the data is the empty string.
            extractEncapsulatedBinData(encapsulatedBinaryData);

        } catch (IOException ex) {
            Logger.getLogger(DICOMParser.class.getName()).log(Level.SEVERE, null, ex);
        }

        debugMessage(Attrs.getString(parser.getTagInteger("03011008")));

    } else if (Attrs.contains(parser.getTagInteger("77170010"))) {
        debugMessage("Extracting 7717 group...");
        debugMessage("Test name: " + Attrs.getString(parser.getTagInteger("77171001")));
        debugMessage("Test strategy: " + Attrs.getString(parser.getTagInteger("77171002")));
        debugMessage("Stimulus Size: " + Attrs.getString(parser.getTagInteger("77171003")));
        debugMessage("Stimulus Color: " + Attrs.getString(parser.getTagInteger("77171004")));

        debugMessage("Background State: " + Attrs.getString(parser.getTagInteger("77171005")));
        debugMessage("Foveal Result: " + Attrs.getString(parser.getTagInteger("77171006")));
        debugMessage("Screening Mode: " + Attrs.getString(parser.getTagInteger("77171007")));
        debugMessage("Fixation Trials: " + Attrs.getString(parser.getTagInteger("77171008")));
        debugMessage("Fixation Errors: " + Attrs.getString(parser.getTagInteger("77171009")));

        debugMessage("False Positive Percent: " + Attrs.getString(parser.getTagInteger("77171010")));
        debugMessage("False Positive Trials : " + Attrs.getString(parser.getTagInteger("77171011")));
        debugMessage("False Positive Errors: " + Attrs.getString(parser.getTagInteger("77171012")));
        debugMessage("False Negative Percent: " + Attrs.getString(parser.getTagInteger("77171013")));
        debugMessage("False Negative Trials : " + Attrs.getString(parser.getTagInteger("77171014")));
        debugMessage("False Negative Errors: " + Attrs.getString(parser.getTagInteger("77171015")));
        debugMessage("Mean Deviation: " + Attrs.getString(parser.getTagInteger("77171016")));
        debugMessage("Mean Deviation Probability: " + Attrs.getString(parser.getTagInteger("77171017")));
        debugMessage("Pattern Standard Deviation: " + Attrs.getString(parser.getTagInteger("77171018")));
        debugMessage(
                "Pattern Standard Deviation Probability: " + Attrs.getString(parser.getTagInteger("77171019")));
        debugMessage("Short Term Fluctuation: " + Attrs.getString(parser.getTagInteger("77171020")));
        debugMessage(
                "Corrected Pattern Standard Deviation: " + Attrs.getString(parser.getTagInteger("77171021")));
        debugMessage("Corrected Pattern Standard Deviation Probability: "
                + Attrs.getString(parser.getTagInteger("77171022")));
        debugMessage("Glaucoma Hemifield Test: " + Attrs.getString(parser.getTagInteger("77171023")));

        debugMessage("Fixation Monitor: " + Attrs.getString(parser.getTagInteger("77171024")));
        debugMessage("Fixation Target: " + Attrs.getString(parser.getTagInteger("77171025")));
        debugMessage("Pupil Diameter (in mm): " + Attrs.getString(parser.getTagInteger("77171026")));
        debugMessage("Sphere: " + Attrs.getString(parser.getTagInteger("77171027")));
        debugMessage("Cylinder: " + Attrs.getString(parser.getTagInteger("77171028")));
        debugMessage("Axis: " + Attrs.getString(parser.getTagInteger("77171029")));
        debugMessage("Visual Acuity: " + Attrs.getString(parser.getTagInteger("77171030")));
        debugMessage("Short Term Fluctuation Probabilit: " + Attrs.getString(parser.getTagInteger("77171031")));
        debugMessage("Visual Field Index: " + Attrs.getString(parser.getTagInteger("77171034")));
        debugMessage("VFM Sequence:");
        if (Attrs.contains(parser.getTagInteger("77171040"))) {
            Sequence Seq = Attrs.getSequence(parser.getTagInteger("77171040"));
            for (int sq = 0; sq < Seq.size(); sq++) {
                Attributes AttrData = (Attributes) Seq.get(sq);
                debugMessage("> Private creator: " + AttrData.getString(parser.getTagInteger("77170010")));
                debugMessage(">> Section Number: " + AttrData.getString(parser.getTagInteger("77171041")));
                debugMessage(">> Section Value: " + AttrData.getString(parser.getTagInteger("77171042")));
            }
        }
    }
}

From source file:vortext.TextHighlight.java

License:Apache License

/**
 * Command-line entry point: highlights matches in a PDF and saves the result.
 *
 * @param args exactly three values: the input PDF path, the output PDF path, and the
 *             argument handed to {@code highlightDefault}
 * @throws Exception if parsing, highlighting or saving the document fails
 */
public static void main(final String args[]) throws Exception {
    if (args.length != 3) {
        usage();
        // Stop here in case usage() returns; the code below indexes args[0..2].
        return;
    }
    final File file = new File(args[0]);

    if (!file.isFile()) {
        System.err.println("File " + args[0] + " does not exist.");
        return;
    }

    final PDFParser parser = new PDFParser(new FileInputStream(file));
    PDDocument pdDoc = null;
    try {
        parser.parse();
        pdDoc = new PDDocument(parser.getDocument());

        final TextHighlight pdfHighlight = new TextHighlight("UTF-8");
        // depends on what you want to match, but this creates a long string
        // without newlines
        pdfHighlight.setSkipAllWhitespace(true);
        pdfHighlight.setNormalizeText(true);
        pdfHighlight.initialize(pdDoc);

        // Called for its effect on the document; the returned list is not needed here.
        pdfHighlight.highlightDefault(args[2]);

        pdDoc.save(args[1]);
    } finally {
        // Close even when highlighting or saving fails. Closing the PDDocument also closes
        // the COSDocument obtained from the parser.
        try {
            if (pdDoc != null) {
                pdDoc.close();
            }
        } catch (final Exception e) {
            e.printStackTrace();
        }
    }
}