List of usage examples for org.apache.pdfbox.pdmodel.interactive.action PDActionURI getURI
public String getURI()
From source file:org.apache.tika.parser.pdf.AbstractPDF2XHTML.java
License:Apache License
@Override protected void endPage(PDPage page) throws IOException { try {/*www. j av a2s. c o m*/ for (PDAnnotation annotation : page.getAnnotations()) { if (annotation instanceof PDAnnotationFileAttachment) { PDAnnotationFileAttachment fann = (PDAnnotationFileAttachment) annotation; PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) fann.getFile(); try { AttributesImpl attributes = new AttributesImpl(); attributes.addAttribute("", "source", "source", "CDATA", "annotation"); extractMultiOSPDEmbeddedFiles(fann.getAttachmentName(), fileSpec, attributes); } catch (SAXException e) { throw new IOExceptionWithCause("file embedded in annotation sax exception", e); } catch (TikaException e) { throw new IOExceptionWithCause("file embedded in annotation tika exception", e); } catch (IOException e) { handleCatchableIOE(e); } } else if (annotation instanceof PDAnnotationWidget) { handleWidget((PDAnnotationWidget) annotation); } // TODO: remove once PDFBOX-1143 is fixed: if (config.getExtractAnnotationText()) { if (annotation instanceof PDAnnotationLink) { PDAnnotationLink annotationlink = (PDAnnotationLink) annotation; if (annotationlink.getAction() != null) { PDAction action = annotationlink.getAction(); if (action instanceof PDActionURI) { //can't currently associate link to text. //for now, extract link and repeat the link as if it //were the visible text PDActionURI uri = (PDActionURI) action; String link = uri.getURI(); if (link != null && link.trim().length() > 0) { xhtml.startElement("div", "class", "annotation"); xhtml.startElement("a", "href", link); xhtml.characters(link); xhtml.endElement("a"); xhtml.endElement("div"); } } } } if (annotation instanceof PDAnnotationMarkup) { PDAnnotationMarkup annotationMarkup = (PDAnnotationMarkup) annotation; String title = annotationMarkup.getTitlePopup(); String subject = annotationMarkup.getSubject(); String contents = annotationMarkup.getContents(); // TODO: maybe also annotationMarkup.getRichContents()? if (title != null || subject != null || contents != null) { xhtml.startElement("div", "class", "annotation"); if (title != null) { xhtml.startElement("div", "class", "annotationTitle"); xhtml.characters(title); xhtml.endElement("div"); } if (subject != null) { xhtml.startElement("div", "class", "annotationSubject"); xhtml.characters(subject); xhtml.endElement("div"); } if (contents != null) { xhtml.startElement("div", "class", "annotationContents"); xhtml.characters(contents); xhtml.endElement("div"); } xhtml.endElement("div"); } } } } if (config.getOcrStrategy().equals(PDFParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION)) { doOCROnCurrentPage(); } PDPageAdditionalActions pageActions = page.getActions(); if (pageActions != null) { handleDestinationOrAction(pageActions.getC(), ActionTrigger.PAGE_CLOSE); handleDestinationOrAction(pageActions.getO(), ActionTrigger.PAGE_OPEN); } xhtml.endElement("div"); } catch (SAXException | TikaException e) { throw new IOExceptionWithCause("Unable to end a page", e); } catch (IOException e) { exceptions.add(e); } finally { pageIndex++; } }
From source file:org.apache.tika.parser.pdf.AbstractPDF2XHTMLPureJava.java
License:Apache License
@Override protected void endPage(PDPage page) throws IOException { try {//from w w w.j av a 2s . c om for (PDAnnotation annotation : page.getAnnotations()) { if (annotation instanceof PDAnnotationFileAttachment) { PDAnnotationFileAttachment fann = (PDAnnotationFileAttachment) annotation; PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) fann.getFile(); try { AttributesImpl attributes = new AttributesImpl(); attributes.addAttribute("", "source", "source", "CDATA", "annotation"); extractMultiOSPDEmbeddedFiles(fann.getAttachmentName(), fileSpec, attributes); } catch (SAXException e) { throw new IOExceptionWithCause("file embedded in annotation sax exception", e); } catch (TikaException e) { throw new IOExceptionWithCause("file embedded in annotation tika exception", e); } catch (IOException e) { handleCatchableIOE(e); } } else if (annotation instanceof PDAnnotationWidget) { handleWidget((PDAnnotationWidget) annotation); } // TODO: remove once PDFBOX-1143 is fixed: if (config.getExtractAnnotationText()) { if (annotation instanceof PDAnnotationLink) { PDAnnotationLink annotationlink = (PDAnnotationLink) annotation; if (annotationlink.getAction() != null) { PDAction action = annotationlink.getAction(); if (action instanceof PDActionURI) { //can't currently associate link to text. //for now, extract link and repeat the link as if it //were the visible text PDActionURI uri = (PDActionURI) action; String link = uri.getURI(); if (link != null && link.trim().length() > 0) { xhtml.startElement("div", "class", "annotation"); xhtml.startElement("a", "href", link); xhtml.characters(link); xhtml.endElement("a"); xhtml.endElement("div"); } } } } if (annotation instanceof PDAnnotationMarkup) { PDAnnotationMarkup annotationMarkup = (PDAnnotationMarkup) annotation; String title = annotationMarkup.getTitlePopup(); String subject = annotationMarkup.getSubject(); String contents = annotationMarkup.getContents(); // TODO: maybe also annotationMarkup.getRichContents()? if (title != null || subject != null || contents != null) { xhtml.startElement("div", "class", "annotation"); if (title != null) { xhtml.startElement("div", "class", "annotationTitle"); xhtml.characters(title); xhtml.endElement("div"); } if (subject != null) { xhtml.startElement("div", "class", "annotationSubject"); xhtml.characters(subject); xhtml.endElement("div"); } if (contents != null) { xhtml.startElement("div", "class", "annotationContents"); xhtml.characters(contents); xhtml.endElement("div"); } xhtml.endElement("div"); } } } } PDPageAdditionalActions pageActions = page.getActions(); if (pageActions != null) { handleDestinationOrAction(pageActions.getC(), ActionTrigger.PAGE_CLOSE); handleDestinationOrAction(pageActions.getO(), ActionTrigger.PAGE_OPEN); } xhtml.endElement("div"); } catch (SAXException | TikaException e) { throw new IOExceptionWithCause("Unable to end a page", e); } catch (IOException e) { exceptions.add(e); } finally { pageIndex++; } }
From source file:org.apache.tika.parser.pdf.EnhancedPDF2XHTML.java
License:Apache License
@Override protected void endPage(PDPage page) throws IOException { try {//from www . j a v a2 s.c o m writeParagraphEnd(); extractImages(page.getResources()); EmbeddedDocumentExtractor extractor = getEmbeddedDocumentExtractor(); for (PDAnnotation annotation : page.getAnnotations()) { if (annotation instanceof PDAnnotationFileAttachment) { PDAnnotationFileAttachment fann = (PDAnnotationFileAttachment) annotation; PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) fann.getFile(); try { extractMultiOSPDEmbeddedFiles("", fileSpec, extractor); } catch (SAXException e) { throw new IOExceptionWithCause("file embedded in annotation sax exception", e); } catch (TikaException e) { throw new IOExceptionWithCause("file embedded in annotation tika exception", e); } } // TODO: remove once PDFBOX-1143 is fixed: if (config.getExtractAnnotationText()) { if (annotation instanceof PDAnnotationLink) { PDAnnotationLink annotationlink = (PDAnnotationLink) annotation; if (annotationlink.getAction() != null) { PDAction action = annotationlink.getAction(); if (action instanceof PDActionURI) { PDActionURI uri = (PDActionURI) action; String link = uri.getURI(); if (link != null) { handler.startElement("div", "class", "annotation"); handler.startElement("a", "href", link); handler.endElement("a"); handler.endElement("div"); } } } } if (annotation instanceof PDAnnotationMarkup) { PDAnnotationMarkup annotationMarkup = (PDAnnotationMarkup) annotation; String title = annotationMarkup.getTitlePopup(); String subject = annotationMarkup.getSubject(); String contents = annotationMarkup.getContents(); // TODO: maybe also annotationMarkup.getRichContents()? if (title != null || subject != null || contents != null) { handler.startElement("div", "class", "annotation"); if (title != null) { handler.startElement("div", "class", "annotationTitle"); handler.characters(title); handler.endElement("div"); } if (subject != null) { handler.startElement("div", "class", "annotationSubject"); handler.characters(subject); handler.endElement("div"); } if (contents != null) { handler.startElement("div", "class", "annotationContents"); handler.characters(contents); handler.endElement("div"); } handler.endElement("div"); } } } } handler.endElement("div"); } catch (SAXException e) { throw new IOExceptionWithCause("Unable to end a page", e); } }
From source file:org.xwiki.test.misc.PDFTest.java
License:Open Source License
private Map<String, String> extractURLs(URL url) throws Exception { Map<String, String> urls = new HashMap<String, String>(); PDDocument document = null;//from w w w. j ava2s. co m try { document = PDDocument.load(IOUtils.toByteArray(url)); for (Map.Entry<String, PDAction> entry : extractLinks(document).entrySet()) { if (entry.getValue() instanceof PDActionURI) { PDActionURI uri = (PDActionURI) entry.getValue(); urls.put(entry.getKey(), uri.getURI()); } } } finally { if (document != null) { document.close(); } } return urls; }
From source file:uk.ac.leeds.ccg.andyt.rdl.web.RDL_ParsePDF.java
/** * https://svn.apache.org/viewvc/pdfbox/trunk/examples/ Based on * https://svn.apache.org/viewvc/pdfbox/trunk/examples/src/main/java/org/apache/pdfbox/examples/pdmodel/PrintURLs.java?view=markup&pathrev=1703066 * * @param f/*from ww w . j ava 2 s . c o m*/ * @param filter * @param fis * @return * @throws IOException * @throws TikaException * @throws SAXException */ public static ArrayList<String[]> parseForLinks(File f, String filter, FileInputStream fis) throws IOException, TikaException, SAXException { ArrayList<String[]> result; result = new ArrayList<String[]>(); PDDocument doc = PDDocument.load(f); int pageNum = 0; for (PDPage page : doc.getPages()) { pageNum++; // if (pageNum == 11) { //Degug test hack System.out.println("Parsing page " + pageNum); PDFTextStripperByArea stripper = new PDFTextStripperByArea(); List<PDAnnotation> annotations = page.getAnnotations(); //first setup text extraction regions for (int j = 0; j < annotations.size(); j++) { PDAnnotation annot = annotations.get(j); if (annot instanceof PDAnnotationLink) { PDAnnotationLink link = (PDAnnotationLink) annot; PDRectangle rect = link.getRectangle(); //need to reposition link rectangle to match text space float x = rect.getLowerLeftX(); float y = rect.getUpperRightY(); float width = rect.getWidth(); float height = rect.getHeight(); int rotation = page.getRotation(); if (rotation == 0) { PDRectangle pageSize = page.getMediaBox(); y = pageSize.getHeight() - y; } else if (rotation == 90) { //do nothing } //Rectangle2D.Float awtRect = new Rectangle2D.Float(x, y, width, height); // Rounding here could be a problem! Rectangle2D.Double awtRect = new Rectangle2D.Double(x, y, width, height); stripper.addRegion("" + j, awtRect); } } stripper.extractRegions(page); for (int j = 0; j < annotations.size(); j++) { PDAnnotation annot = annotations.get(j); if (annot instanceof PDAnnotationLink) { PDAnnotationLink link = (PDAnnotationLink) annot; PDAction action = link.getAction(); if (action == null) { System.out.println(link.getContents()); System.out.println(annot.getClass().getName()); System.out.println(annot.getAnnotationName()); //System.out.println(annot.getNormalAppearanceStream().toString()); System.out.println(annot.getContents()); System.out.println(annot.getSubtype()); } else { String urlText = stripper.getTextForRegion("" + j); if (action instanceof PDActionURI) { PDActionURI uri = (PDActionURI) action; String url; url = uri.getURI(); if (url.contains(filter)) { String[] partResult; partResult = new String[3]; partResult[0] = "Page " + pageNum; partResult[1] = "urlText " + urlText; partResult[2] = "URL " + uri.getURI(); System.out.println(partResult[0]); System.out.println(partResult[1]); System.out.println(partResult[2]); System.out.println("URL " + uri.getURI()); result.add(partResult); } else { System.out.println("URL " + uri.getURI()); } } else { System.out.println(action.getType()); } } } else { System.out.println(annot.getClass().getName()); System.out.println(annot.getAnnotationName()); System.out.println(annot.getContents()); System.out.println(annot.getSubtype()); } } //} } // PDDocument doc = PDDocument.load(f); // int pageNum = 0; // for (PDPage page : doc.getPages()) { // pageNum++; // List<PDAnnotation> annotations = page.getAnnotations(); // // for (PDAnnotation annotation : annotations) { // PDAnnotation annot = annotation; // if (annot instanceof PDAnnotationLink) { // PDAnnotationLink link = (PDAnnotationLink) annot; // PDAction action = link.getAction(); // if (action instanceof PDActionURI) { // PDActionURI uri = (PDActionURI) action; // String oldURI = uri.getURI(); // String name = annot.getAnnotationName(); // String contents = annot.getContents(); // PDAppearanceStream a = annot.getNormalAppearanceStream(); // //String newURI = "http://pdfbox.apache.org"; // System.out.println(oldURI + " " + name + " " + contents); // //uri.setURI(newURI); // } // } // } // } // result = parseWithTika(fis); //XMPSchema schema; //schema = new XMPSchema(); //List<String> XMPBagOrSeqList; //XMPBagOrSeqList = getXMPBagOrSeqList(XMPSchema schema, String name) { // PDDocument tPDDocument; // tPDDocument = PDDocument.load(f); // COSDocument tCOSDocument; // tCOSDocument = tPDDocument.getDocument(); // String header; // header = tCOSDocument.getHeaderString(); // System.out.println(header); // PDDocumentCatalog tPDDocumentCatalog; // tPDDocumentCatalog = tPDDocument.getDocumentCatalog(); // PDDocumentNameDictionary tPDDocumentNameDictionary; // tPDDocumentNameDictionary = tPDDocumentCatalog.getNames(); // COSDictionary tCOSDictionary; // tCOSDictionary = tPDDocumentNameDictionary.getCOSDictionary(); //tCOSDictionary. // PDPageNode tPDPageNode; // tPDPageNode = tPDDocumentCatalog.getPages(); // List<COSObject> tCOSObjects; // tCOSObjects = tCOSDocument.getObjects(); // int n; // n = tCOSObjects.size(); // System.out.println(n); // COSObject aCOSObject; // String s; // for (int i = 0; i < n; i++) { // aCOSObject = tCOSObjects.get(i); // s = aCOSObject.toString(); // System.out.println(s); // } // XMPMetadata tXMPMetadata; // tXMPMetadata = getXMPMetadata(tPDDocument); // Document XMPDocument; // XMPDocument = tXMPMetadata.getXMPDocument(); // Node n; // n = XMPDocument.getFirstChild(); // parseNode(n); return result; }