Example usage for org.apache.pdfbox.pdmodel.interactive.action PDActionURI getURI

List of usage examples for org.apache.pdfbox.pdmodel.interactive.action PDActionURI getURI

Introduction

In this page you can find the example usage for org.apache.pdfbox.pdmodel.interactive.action PDActionURI getURI.

Prototype

public String getURI() 

Source Link

Document

This will get the uniform resource identifier to resolve.

Usage

From source file:org.apache.tika.parser.pdf.AbstractPDF2XHTML.java

License:Apache License

@Override
protected void endPage(PDPage page) throws IOException {

    try {/*www.  j  av a2s.  c  o  m*/
        for (PDAnnotation annotation : page.getAnnotations()) {

            if (annotation instanceof PDAnnotationFileAttachment) {
                PDAnnotationFileAttachment fann = (PDAnnotationFileAttachment) annotation;
                PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) fann.getFile();
                try {
                    AttributesImpl attributes = new AttributesImpl();
                    attributes.addAttribute("", "source", "source", "CDATA", "annotation");
                    extractMultiOSPDEmbeddedFiles(fann.getAttachmentName(), fileSpec, attributes);
                } catch (SAXException e) {
                    throw new IOExceptionWithCause("file embedded in annotation sax exception", e);
                } catch (TikaException e) {
                    throw new IOExceptionWithCause("file embedded in annotation tika exception", e);
                } catch (IOException e) {
                    handleCatchableIOE(e);
                }
            } else if (annotation instanceof PDAnnotationWidget) {
                handleWidget((PDAnnotationWidget) annotation);
            }
            // TODO: remove once PDFBOX-1143 is fixed:
            if (config.getExtractAnnotationText()) {
                if (annotation instanceof PDAnnotationLink) {
                    PDAnnotationLink annotationlink = (PDAnnotationLink) annotation;
                    if (annotationlink.getAction() != null) {
                        PDAction action = annotationlink.getAction();
                        if (action instanceof PDActionURI) {
                            //can't currently associate link to text.
                            //for now, extract link and repeat the link as if it
                            //were the visible text
                            PDActionURI uri = (PDActionURI) action;
                            String link = uri.getURI();
                            if (link != null && link.trim().length() > 0) {
                                xhtml.startElement("div", "class", "annotation");
                                xhtml.startElement("a", "href", link);
                                xhtml.characters(link);
                                xhtml.endElement("a");
                                xhtml.endElement("div");
                            }
                        }
                    }
                }

                if (annotation instanceof PDAnnotationMarkup) {
                    PDAnnotationMarkup annotationMarkup = (PDAnnotationMarkup) annotation;
                    String title = annotationMarkup.getTitlePopup();
                    String subject = annotationMarkup.getSubject();
                    String contents = annotationMarkup.getContents();
                    // TODO: maybe also annotationMarkup.getRichContents()?
                    if (title != null || subject != null || contents != null) {
                        xhtml.startElement("div", "class", "annotation");

                        if (title != null) {
                            xhtml.startElement("div", "class", "annotationTitle");
                            xhtml.characters(title);
                            xhtml.endElement("div");
                        }

                        if (subject != null) {
                            xhtml.startElement("div", "class", "annotationSubject");
                            xhtml.characters(subject);
                            xhtml.endElement("div");
                        }

                        if (contents != null) {
                            xhtml.startElement("div", "class", "annotationContents");
                            xhtml.characters(contents);
                            xhtml.endElement("div");
                        }

                        xhtml.endElement("div");
                    }
                }
            }
        }
        if (config.getOcrStrategy().equals(PDFParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION)) {
            doOCROnCurrentPage();
        }

        PDPageAdditionalActions pageActions = page.getActions();
        if (pageActions != null) {
            handleDestinationOrAction(pageActions.getC(), ActionTrigger.PAGE_CLOSE);
            handleDestinationOrAction(pageActions.getO(), ActionTrigger.PAGE_OPEN);
        }
        xhtml.endElement("div");
    } catch (SAXException | TikaException e) {
        throw new IOExceptionWithCause("Unable to end a page", e);
    } catch (IOException e) {
        exceptions.add(e);
    } finally {
        pageIndex++;
    }
}

From source file:org.apache.tika.parser.pdf.AbstractPDF2XHTMLPureJava.java

License:Apache License

@Override
protected void endPage(PDPage page) throws IOException {

    try {//from   w w  w.j  av  a 2s  . c om
        for (PDAnnotation annotation : page.getAnnotations()) {

            if (annotation instanceof PDAnnotationFileAttachment) {
                PDAnnotationFileAttachment fann = (PDAnnotationFileAttachment) annotation;
                PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) fann.getFile();
                try {
                    AttributesImpl attributes = new AttributesImpl();
                    attributes.addAttribute("", "source", "source", "CDATA", "annotation");
                    extractMultiOSPDEmbeddedFiles(fann.getAttachmentName(), fileSpec, attributes);
                } catch (SAXException e) {
                    throw new IOExceptionWithCause("file embedded in annotation sax exception", e);
                } catch (TikaException e) {
                    throw new IOExceptionWithCause("file embedded in annotation tika exception", e);
                } catch (IOException e) {
                    handleCatchableIOE(e);
                }
            } else if (annotation instanceof PDAnnotationWidget) {
                handleWidget((PDAnnotationWidget) annotation);
            }
            // TODO: remove once PDFBOX-1143 is fixed:
            if (config.getExtractAnnotationText()) {
                if (annotation instanceof PDAnnotationLink) {
                    PDAnnotationLink annotationlink = (PDAnnotationLink) annotation;
                    if (annotationlink.getAction() != null) {
                        PDAction action = annotationlink.getAction();
                        if (action instanceof PDActionURI) {
                            //can't currently associate link to text.
                            //for now, extract link and repeat the link as if it
                            //were the visible text
                            PDActionURI uri = (PDActionURI) action;
                            String link = uri.getURI();
                            if (link != null && link.trim().length() > 0) {
                                xhtml.startElement("div", "class", "annotation");
                                xhtml.startElement("a", "href", link);
                                xhtml.characters(link);
                                xhtml.endElement("a");
                                xhtml.endElement("div");
                            }
                        }
                    }
                }

                if (annotation instanceof PDAnnotationMarkup) {
                    PDAnnotationMarkup annotationMarkup = (PDAnnotationMarkup) annotation;
                    String title = annotationMarkup.getTitlePopup();
                    String subject = annotationMarkup.getSubject();
                    String contents = annotationMarkup.getContents();
                    // TODO: maybe also annotationMarkup.getRichContents()?
                    if (title != null || subject != null || contents != null) {
                        xhtml.startElement("div", "class", "annotation");

                        if (title != null) {
                            xhtml.startElement("div", "class", "annotationTitle");
                            xhtml.characters(title);
                            xhtml.endElement("div");
                        }

                        if (subject != null) {
                            xhtml.startElement("div", "class", "annotationSubject");
                            xhtml.characters(subject);
                            xhtml.endElement("div");
                        }

                        if (contents != null) {
                            xhtml.startElement("div", "class", "annotationContents");
                            xhtml.characters(contents);
                            xhtml.endElement("div");
                        }

                        xhtml.endElement("div");
                    }
                }
            }
        }

        PDPageAdditionalActions pageActions = page.getActions();
        if (pageActions != null) {
            handleDestinationOrAction(pageActions.getC(), ActionTrigger.PAGE_CLOSE);
            handleDestinationOrAction(pageActions.getO(), ActionTrigger.PAGE_OPEN);
        }
        xhtml.endElement("div");
    } catch (SAXException | TikaException e) {
        throw new IOExceptionWithCause("Unable to end a page", e);
    } catch (IOException e) {
        exceptions.add(e);
    } finally {
        pageIndex++;
    }
}

From source file:org.apache.tika.parser.pdf.EnhancedPDF2XHTML.java

License:Apache License

@Override
protected void endPage(PDPage page) throws IOException {
    try {//from  www .  j a v  a2  s.c o m
        writeParagraphEnd();

        extractImages(page.getResources());

        EmbeddedDocumentExtractor extractor = getEmbeddedDocumentExtractor();
        for (PDAnnotation annotation : page.getAnnotations()) {

            if (annotation instanceof PDAnnotationFileAttachment) {
                PDAnnotationFileAttachment fann = (PDAnnotationFileAttachment) annotation;
                PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) fann.getFile();
                try {
                    extractMultiOSPDEmbeddedFiles("", fileSpec, extractor);
                } catch (SAXException e) {
                    throw new IOExceptionWithCause("file embedded in annotation sax exception", e);
                } catch (TikaException e) {
                    throw new IOExceptionWithCause("file embedded in annotation tika exception", e);
                }
            }
            // TODO: remove once PDFBOX-1143 is fixed:
            if (config.getExtractAnnotationText()) {
                if (annotation instanceof PDAnnotationLink) {
                    PDAnnotationLink annotationlink = (PDAnnotationLink) annotation;
                    if (annotationlink.getAction() != null) {
                        PDAction action = annotationlink.getAction();
                        if (action instanceof PDActionURI) {
                            PDActionURI uri = (PDActionURI) action;
                            String link = uri.getURI();
                            if (link != null) {
                                handler.startElement("div", "class", "annotation");
                                handler.startElement("a", "href", link);
                                handler.endElement("a");
                                handler.endElement("div");
                            }
                        }
                    }
                }

                if (annotation instanceof PDAnnotationMarkup) {
                    PDAnnotationMarkup annotationMarkup = (PDAnnotationMarkup) annotation;
                    String title = annotationMarkup.getTitlePopup();
                    String subject = annotationMarkup.getSubject();
                    String contents = annotationMarkup.getContents();
                    // TODO: maybe also annotationMarkup.getRichContents()?
                    if (title != null || subject != null || contents != null) {
                        handler.startElement("div", "class", "annotation");

                        if (title != null) {
                            handler.startElement("div", "class", "annotationTitle");
                            handler.characters(title);
                            handler.endElement("div");
                        }

                        if (subject != null) {
                            handler.startElement("div", "class", "annotationSubject");
                            handler.characters(subject);
                            handler.endElement("div");
                        }

                        if (contents != null) {
                            handler.startElement("div", "class", "annotationContents");
                            handler.characters(contents);
                            handler.endElement("div");
                        }

                        handler.endElement("div");
                    }
                }
            }
        }

        handler.endElement("div");
    } catch (SAXException e) {
        throw new IOExceptionWithCause("Unable to end a page", e);
    }
}

From source file:org.xwiki.test.misc.PDFTest.java

License:Open Source License

private Map<String, String> extractURLs(URL url) throws Exception {
    Map<String, String> urls = new HashMap<String, String>();
    PDDocument document = null;//from  w  w w. j  ava2s. co m
    try {
        document = PDDocument.load(IOUtils.toByteArray(url));
        for (Map.Entry<String, PDAction> entry : extractLinks(document).entrySet()) {
            if (entry.getValue() instanceof PDActionURI) {
                PDActionURI uri = (PDActionURI) entry.getValue();
                urls.put(entry.getKey(), uri.getURI());
            }
        }
    } finally {
        if (document != null) {
            document.close();
        }
    }
    return urls;
}

From source file:uk.ac.leeds.ccg.andyt.rdl.web.RDL_ParsePDF.java

/**
 * https://svn.apache.org/viewvc/pdfbox/trunk/examples/ Based on
 * https://svn.apache.org/viewvc/pdfbox/trunk/examples/src/main/java/org/apache/pdfbox/examples/pdmodel/PrintURLs.java?view=markup&pathrev=1703066
 *
 * @param f/*from   ww  w . j ava  2 s  . c  o  m*/
 * @param filter
 * @param fis
 * @return
 * @throws IOException
 * @throws TikaException
 * @throws SAXException
 */
public static ArrayList<String[]> parseForLinks(File f, String filter, FileInputStream fis)
        throws IOException, TikaException, SAXException {
    ArrayList<String[]> result;
    result = new ArrayList<String[]>();
    PDDocument doc = PDDocument.load(f);
    int pageNum = 0;
    for (PDPage page : doc.getPages()) {
        pageNum++;

        //            if (pageNum == 11) { //Degug test hack
        System.out.println("Parsing page " + pageNum);
        PDFTextStripperByArea stripper = new PDFTextStripperByArea();
        List<PDAnnotation> annotations = page.getAnnotations();
        //first setup text extraction regions
        for (int j = 0; j < annotations.size(); j++) {
            PDAnnotation annot = annotations.get(j);
            if (annot instanceof PDAnnotationLink) {
                PDAnnotationLink link = (PDAnnotationLink) annot;
                PDRectangle rect = link.getRectangle();
                //need to reposition link rectangle to match text space
                float x = rect.getLowerLeftX();
                float y = rect.getUpperRightY();
                float width = rect.getWidth();
                float height = rect.getHeight();
                int rotation = page.getRotation();
                if (rotation == 0) {
                    PDRectangle pageSize = page.getMediaBox();
                    y = pageSize.getHeight() - y;
                } else if (rotation == 90) {
                    //do nothing
                }

                //Rectangle2D.Float awtRect = new Rectangle2D.Float(x, y, width, height);
                // Rounding here could be a problem!
                Rectangle2D.Double awtRect = new Rectangle2D.Double(x, y, width, height);
                stripper.addRegion("" + j, awtRect);
            }
        }

        stripper.extractRegions(page);

        for (int j = 0; j < annotations.size(); j++) {
            PDAnnotation annot = annotations.get(j);
            if (annot instanceof PDAnnotationLink) {
                PDAnnotationLink link = (PDAnnotationLink) annot;
                PDAction action = link.getAction();
                if (action == null) {
                    System.out.println(link.getContents());
                    System.out.println(annot.getClass().getName());
                    System.out.println(annot.getAnnotationName());
                    //System.out.println(annot.getNormalAppearanceStream().toString());
                    System.out.println(annot.getContents());
                    System.out.println(annot.getSubtype());
                } else {
                    String urlText = stripper.getTextForRegion("" + j);
                    if (action instanceof PDActionURI) {
                        PDActionURI uri = (PDActionURI) action;
                        String url;
                        url = uri.getURI();
                        if (url.contains(filter)) {
                            String[] partResult;
                            partResult = new String[3];
                            partResult[0] = "Page " + pageNum;
                            partResult[1] = "urlText " + urlText;
                            partResult[2] = "URL " + uri.getURI();
                            System.out.println(partResult[0]);
                            System.out.println(partResult[1]);
                            System.out.println(partResult[2]);
                            System.out.println("URL " + uri.getURI());
                            result.add(partResult);
                        } else {
                            System.out.println("URL " + uri.getURI());
                        }
                    } else {
                        System.out.println(action.getType());
                    }
                }
            } else {
                System.out.println(annot.getClass().getName());
                System.out.println(annot.getAnnotationName());
                System.out.println(annot.getContents());
                System.out.println(annot.getSubtype());
            }
        }

        //}
    }
    //       PDDocument doc = PDDocument.load(f);
    //        int pageNum = 0;
    //        for (PDPage page : doc.getPages()) {
    //            pageNum++;
    //            List<PDAnnotation> annotations = page.getAnnotations();
    //
    //            for (PDAnnotation annotation : annotations) {
    //                PDAnnotation annot = annotation;
    //                if (annot instanceof PDAnnotationLink) {
    //                    PDAnnotationLink link = (PDAnnotationLink) annot;
    //                    PDAction action = link.getAction();
    //                    if (action instanceof PDActionURI) {
    //                        PDActionURI uri = (PDActionURI) action;
    //                        String oldURI = uri.getURI();
    //                        String name = annot.getAnnotationName();
    //                        String contents = annot.getContents();
    //                        PDAppearanceStream a = annot.getNormalAppearanceStream();
    //                        //String newURI = "http://pdfbox.apache.org";
    //                        System.out.println(oldURI + " " + name + " " + contents);
    //                        //uri.setURI(newURI);
    //                    }
    //                }
    //            }
    //        }

    //        result = parseWithTika(fis);
    //XMPSchema schema;
    //schema = new XMPSchema();
    //List<String> XMPBagOrSeqList;
    //XMPBagOrSeqList = getXMPBagOrSeqList(XMPSchema schema, String name) {

    //        PDDocument tPDDocument;
    //        tPDDocument = PDDocument.load(f);
    //        COSDocument tCOSDocument;
    //        tCOSDocument = tPDDocument.getDocument();

    //        String header;
    //        header = tCOSDocument.getHeaderString();
    //        System.out.println(header);

    //        PDDocumentCatalog tPDDocumentCatalog;
    //        tPDDocumentCatalog = tPDDocument.getDocumentCatalog();
    //        PDDocumentNameDictionary tPDDocumentNameDictionary;
    //        tPDDocumentNameDictionary = tPDDocumentCatalog.getNames();

    //        COSDictionary tCOSDictionary;
    //        tCOSDictionary = tPDDocumentNameDictionary.getCOSDictionary();
    //tCOSDictionary.
    //        PDPageNode tPDPageNode;
    //        tPDPageNode = tPDDocumentCatalog.getPages();

    //        List<COSObject> tCOSObjects;
    //        tCOSObjects = tCOSDocument.getObjects();
    //        int n;
    //        n = tCOSObjects.size();
    //        System.out.println(n);
    //        COSObject aCOSObject;
    //        String s;
    //        for (int i = 0; i < n; i++) {
    //            aCOSObject = tCOSObjects.get(i);
    //            s = aCOSObject.toString();
    //            System.out.println(s);
    //        }

    //        XMPMetadata tXMPMetadata;
    //        tXMPMetadata = getXMPMetadata(tPDDocument);

    //        Document XMPDocument;
    //        XMPDocument = tXMPMetadata.getXMPDocument();
    //        Node n;
    //        n = XMPDocument.getFirstChild();
    //        parseNode(n);
    return result;
}