List of usage examples for org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink.getRectangle()
public PDRectangle getRectangle()
From source file:com.vns.pdf.impl.PdfDocument.java
License:Apache License
private List<Annotation> parseAnnotation(PDPage pdPage) throws IOException { List<Annotation> annotations = new ArrayList<>(); for (PDAnnotation annt : pdPage.getAnnotations()) { if (annt instanceof PDAnnotationLink) { PDAnnotationLink link = (PDAnnotationLink) annt; PDRectangle rect = link.getRectangle(); float x = rect.getLowerLeftX(); float y = rect.getUpperRightY(); float width = rect.getWidth(); float height = rect.getHeight(); int rotation = pdPage.getRotation(); if (rotation == 0) { PDRectangle pageSize = pdPage.getMediaBox(); y = pageSize.getHeight() - y; } else if (rotation == 90) { //do nothing }/*from ww w .j a v a 2s . c o m*/ ActionData actionData = parsePDAction(link.getAction()); if (actionData == null) { actionData = parsePDDestination(link.getDestination()); } if (actionData != null) { Annotation a = new Annotation(x, y, width, height, actionData.destX, actionData.destY, actionData.destPage, actionData.destZoom); annotations.add(a); } } } return annotations; }
From source file:com.zilbo.flamingSailor.TE.PDFParser.java
License:Apache License
@Override protected void endPage(PDPage page) throws IOException { super.endPage(page); int pieceID = 0; Map<String, Map<Integer, Long>> fontCounts = new HashMap<>(); List<TextPiece> wordsOfThisPage = new ArrayList<>(); for (List<TextPosition> aCharactersByArticle : charactersByArticle) { // int len = aCharactersByArticle.size(); for (TextPosition t : aCharactersByArticle) { // copy information TextPiece w = new TextPiece(pieceID++); PDFont font = t.getFont();//from w ww . j a v a 2 s.co m PDFontDescriptor fontDescriptor = font.getFontDescriptor(); // w.setFontDescriptor(fontDescriptor); if (fontDescriptor == null) { w.setFontName("UNKNOWN"); } else { w.setFontName(fontDescriptor.getFontName()); } /* * 100: a simple step to fix the font size to the normal range, for those documents in unknown codes that PDFBox can not process now */ if (t.getFontSize() < 0.3 && t.getYScale() <= 1.0) { w.setFontSize(t.getFontSize() * 100); w.setHeight(Math.max(t.getYScale(), t.getFontSize()) * 100); w.setXScale(t.getXScale()); w.setYScale(t.getYScale()); } else { if (t.getYScale() < 0.3 && t.getFontSize() <= 1.0) { w.setYScale(t.getYScale() * 100); w.setXScale(t.getXScale() * 100); w.setHeight(Math.max(t.getYScale() * 100, t.getFontSize())); } else { w.setFontSize(t.getFontSize()); w.setHeight(Math.max(t.getYScale(), t.getFontSize())); w.setXScale(t.getXScale()); w.setYScale(t.getYScale()); } } Map<Integer, Long> counts = fontCounts.get(w.getFontName()); if (counts == null) { counts = new HashMap<>(); fontCounts.put(w.getFontName(), counts); } Long count = counts.get((int) Math.round(w.getHeight())); if (count == null) { count = 1L; } else { count += 1L; } counts.put((int) Math.round(w.getHeight()), count); w.setWidth(Math.abs(t.getWidth())); w.setGeom(t.getX(), t.getY(), w.getWidth(), w.getHeight()); w.setText(t.getCharacter()); w.setWidthOfSpace(t.getWidthOfSpace()); wordsOfThisPage.add(w); } } currentPage.processPage(wordsOfThisPage, fontCounts); 
currentPage.setText(outString.getBuffer().toString()); outString.getBuffer().setLength(0); List<PDAnnotation> annotations = page.getAnnotations(); for (PDAnnotation annotation : annotations) { if (annotation instanceof PDAnnotationLink) { PDAnnotationLink l = (PDAnnotationLink) annotation; PDRectangle rect = l.getRectangle(); PDDestination dest = l.getDestination(); if (dest instanceof PDPageXYZDestination) { PDPageXYZDestination xyzDestination = (PDPageXYZDestination) dest; PDPage pageDest = ((PDPageXYZDestination) dest).getPage(); if (rect != null) { if (xyzDestination.getPageNumber() < 0) { int pageNumber = allpages.indexOf(pageDest) + 1; Rectangle2D hotbox = new Rectangle2D.Double(rect.getLowerLeftX(), rect.getLowerLeftY(), (rect.getUpperRightX() - rect.getLowerLeftX()), (rect.getUpperRightY() - rect.getLowerLeftY())); Point2D toPoint = new Point2D.Double(xyzDestination.getLeft(), xyzDestination.getTop()); currentPage.addLink(new PDLink(hotbox, pageNumber, toPoint)); } } } } } /* The following code is REALLY raw. initial testing seemed to show memory leaks, and was REALLY slow. PDResources r = page.getResources(); Map<String, PDXObjectImage> images = r.getImages(); for (Map.Entry<String, PDXObjectImage> e : images.entrySet()) { BufferedImage bi = null; try { // currentPage.addImage(bi); // (e.getValue()).write2file("/tmp/II" + e.getKey()); if (e.getValue() instanceof PDJpeg) { PDJpeg jpg = (PDJpeg) e.getValue(); bi = jpg.getRGBImage(); ColorSpace cs = bi.getColorModel().getColorSpace(); File jpgFile = new File("/tmp/II" + e.getKey() + ".jpg"); if (cs instanceof ColorSpaceCMYK) { logger.info("Ignoring image with CMYK color space"); } else { // ImageIO.write(bi, "jpg", jpgFile); jpg.write2file("/tmp/II"+ e.getKey()); } } else { (e.getValue()).write2file("/tmp/II" + e.getKey()); } } catch (Exception ee) { logger.info("can't read image ;-(", ee); } } */ textPageList.add(currentPage); currentPage = null; }
From source file:org.nuxeo.pdf.PDFLinks.java
License:Apache License
protected void loadAndPreflightPdf() throws NuxeoException { if (pdfDoc == null) { pdfDoc = PDFUtils.load(pdfBlob, password); @SuppressWarnings("unchecked") List<PDPage> allPages = pdfDoc.getDocumentCatalog().getAllPages(); try {//www . java 2 s .co m stripper = new PDFTextStripperByArea(); for (PDPage page : allPages) { List<PDAnnotation> annotations = page.getAnnotations(); for (int j = 0; j < annotations.size(); j++) { PDAnnotation annot = (PDAnnotation) annotations.get(j); if (annot instanceof PDAnnotationLink) { PDAnnotationLink link = (PDAnnotationLink) annot; PDRectangle rect = link.getRectangle(); // need to reposition link rectangle to match text space float x = rect.getLowerLeftX(); float y = rect.getUpperRightY(); float width = rect.getWidth(); float height = rect.getHeight(); int rotation = page.findRotation(); if (rotation == 0) { PDRectangle pageSize = page.findMediaBox(); y = pageSize.getHeight() - y; } else if (rotation == 90) { // do nothing } Rectangle2D.Float awtRect = new Rectangle2D.Float(x, y, width, height); stripper.addRegion("" + j, awtRect); } } } } catch (IOException e) { throw new NuxeoException("Cannot prefilght and prepare regions", e); } } }
From source file:org.paxle.parser.pdf.impl.PdfParser.java
License:Open Source License
/** * A function to extract embedded URIs from the PDF-document. * // w ww . j a v a 2 s. c om */ protected void extractURLs(IParserDocument parserDoc, PDDocument pddDoc) throws IOException { final PDDocumentCatalog pddDocCatalog = pddDoc.getDocumentCatalog(); if (pddDocCatalog == null) return; @SuppressWarnings("unchecked") final List<PDPage> allPages = pddDocCatalog.getAllPages(); if (allPages == null || allPages.isEmpty()) return; for (int i = 0; i < allPages.size(); i++) { final PDFTextStripperByArea stripper = new PDFTextStripperByArea(); final PDPage page = (PDPage) allPages.get(i); @SuppressWarnings("unchecked") final List<PDAnnotation> annotations = page.getAnnotations(); if (annotations == null || annotations.isEmpty()) return; //first setup text extraction regions for (int j = 0; j < annotations.size(); j++) { final PDAnnotation annot = (PDAnnotation) annotations.get(j); if (annot instanceof PDAnnotationLink) { final PDAnnotationLink link = (PDAnnotationLink) annot; final PDRectangle rect = link.getRectangle(); //need to reposition link rectangle to match text space float x = rect.getLowerLeftX(); float y = rect.getUpperRightY(); float width = rect.getWidth(); float height = rect.getHeight(); int rotation = page.findRotation(); if (rotation == 0) { PDRectangle pageSize = page.findMediaBox(); y = pageSize.getHeight() - y; } else if (rotation == 90) { //do nothing } Rectangle2D.Float awtRect = new Rectangle2D.Float(x, y, width, height); stripper.addRegion("" + j, awtRect); } } stripper.extractRegions(page); for (int j = 0; j < annotations.size(); j++) { final PDAnnotation annot = (PDAnnotation) annotations.get(j); if (annot instanceof PDAnnotationLink) { final PDAnnotationLink link = (PDAnnotationLink) annot; final PDAction action = link.getAction(); final String urlText = stripper.getTextForRegion("" + j); if (action instanceof PDActionURI) { final PDActionURI embeddedUri = (PDActionURI) action; final URI temp = URI.create(embeddedUri.getURI()); 
parserDoc.addReference(temp, urlText, Constants.SERVICE_PID + ":" + PID); } } } } }
From source file:org.xwiki.test.misc.PDFTest.java
License:Open Source License
/** * Code adapted from http://www.docjar.com/html/api/org/apache/pdfbox/examples/pdmodel/PrintURLs.java.html *///from www . j a v a 2 s. c o m private Map<String, PDAction> extractLinks(PDPage page) throws Exception { Map<String, PDAction> links = new HashMap<String, PDAction>(); PDFTextStripperByArea stripper = new PDFTextStripperByArea(); List<PDAnnotation> annotations = page.getAnnotations(); // First setup the text extraction regions. for (int j = 0; j < annotations.size(); j++) { PDAnnotation annotation = annotations.get(j); if (annotation instanceof PDAnnotationLink) { PDAnnotationLink link = (PDAnnotationLink) annotation; PDRectangle rect = link.getRectangle(); // Need to reposition link rectangle to match text space. float x = rect.getLowerLeftX(); float y = rect.getUpperRightY(); float width = rect.getWidth(); float height = rect.getHeight(); int rotation = page.getRotation(); if (rotation == 0) { PDRectangle pageSize = page.getMediaBox(); y = pageSize.getHeight() - y; } else if (rotation == 90) { // Do nothing. } Rectangle2D.Float awtRect = new Rectangle2D.Float(x, y, width, height); stripper.addRegion(String.valueOf(j), awtRect); } } stripper.extractRegions(page); for (int j = 0; j < annotations.size(); j++) { PDAnnotation annotation = annotations.get(j); if (annotation instanceof PDAnnotationLink) { PDAnnotationLink link = (PDAnnotationLink) annotation; String label = stripper.getTextForRegion(String.valueOf(j)).trim(); links.put(label, link.getAction()); } } return links; }
From source file:uk.ac.leeds.ccg.andyt.rdl.web.RDL_ParsePDF.java
/** * https://svn.apache.org/viewvc/pdfbox/trunk/examples/ Based on * https://svn.apache.org/viewvc/pdfbox/trunk/examples/src/main/java/org/apache/pdfbox/examples/pdmodel/PrintURLs.java?view=markup&pathrev=1703066 * * @param f//from w w w. j a v a2s. c om * @param filter * @param fis * @return * @throws IOException * @throws TikaException * @throws SAXException */ public static ArrayList<String[]> parseForLinks(File f, String filter, FileInputStream fis) throws IOException, TikaException, SAXException { ArrayList<String[]> result; result = new ArrayList<String[]>(); PDDocument doc = PDDocument.load(f); int pageNum = 0; for (PDPage page : doc.getPages()) { pageNum++; // if (pageNum == 11) { //Degug test hack System.out.println("Parsing page " + pageNum); PDFTextStripperByArea stripper = new PDFTextStripperByArea(); List<PDAnnotation> annotations = page.getAnnotations(); //first setup text extraction regions for (int j = 0; j < annotations.size(); j++) { PDAnnotation annot = annotations.get(j); if (annot instanceof PDAnnotationLink) { PDAnnotationLink link = (PDAnnotationLink) annot; PDRectangle rect = link.getRectangle(); //need to reposition link rectangle to match text space float x = rect.getLowerLeftX(); float y = rect.getUpperRightY(); float width = rect.getWidth(); float height = rect.getHeight(); int rotation = page.getRotation(); if (rotation == 0) { PDRectangle pageSize = page.getMediaBox(); y = pageSize.getHeight() - y; } else if (rotation == 90) { //do nothing } //Rectangle2D.Float awtRect = new Rectangle2D.Float(x, y, width, height); // Rounding here could be a problem! 
Rectangle2D.Double awtRect = new Rectangle2D.Double(x, y, width, height); stripper.addRegion("" + j, awtRect); } } stripper.extractRegions(page); for (int j = 0; j < annotations.size(); j++) { PDAnnotation annot = annotations.get(j); if (annot instanceof PDAnnotationLink) { PDAnnotationLink link = (PDAnnotationLink) annot; PDAction action = link.getAction(); if (action == null) { System.out.println(link.getContents()); System.out.println(annot.getClass().getName()); System.out.println(annot.getAnnotationName()); //System.out.println(annot.getNormalAppearanceStream().toString()); System.out.println(annot.getContents()); System.out.println(annot.getSubtype()); } else { String urlText = stripper.getTextForRegion("" + j); if (action instanceof PDActionURI) { PDActionURI uri = (PDActionURI) action; String url; url = uri.getURI(); if (url.contains(filter)) { String[] partResult; partResult = new String[3]; partResult[0] = "Page " + pageNum; partResult[1] = "urlText " + urlText; partResult[2] = "URL " + uri.getURI(); System.out.println(partResult[0]); System.out.println(partResult[1]); System.out.println(partResult[2]); System.out.println("URL " + uri.getURI()); result.add(partResult); } else { System.out.println("URL " + uri.getURI()); } } else { System.out.println(action.getType()); } } } else { System.out.println(annot.getClass().getName()); System.out.println(annot.getAnnotationName()); System.out.println(annot.getContents()); System.out.println(annot.getSubtype()); } } //} } // PDDocument doc = PDDocument.load(f); // int pageNum = 0; // for (PDPage page : doc.getPages()) { // pageNum++; // List<PDAnnotation> annotations = page.getAnnotations(); // // for (PDAnnotation annotation : annotations) { // PDAnnotation annot = annotation; // if (annot instanceof PDAnnotationLink) { // PDAnnotationLink link = (PDAnnotationLink) annot; // PDAction action = link.getAction(); // if (action instanceof PDActionURI) { // PDActionURI uri = (PDActionURI) action; // String oldURI = 
uri.getURI(); // String name = annot.getAnnotationName(); // String contents = annot.getContents(); // PDAppearanceStream a = annot.getNormalAppearanceStream(); // //String newURI = "http://pdfbox.apache.org"; // System.out.println(oldURI + " " + name + " " + contents); // //uri.setURI(newURI); // } // } // } // } // result = parseWithTika(fis); //XMPSchema schema; //schema = new XMPSchema(); //List<String> XMPBagOrSeqList; //XMPBagOrSeqList = getXMPBagOrSeqList(XMPSchema schema, String name) { // PDDocument tPDDocument; // tPDDocument = PDDocument.load(f); // COSDocument tCOSDocument; // tCOSDocument = tPDDocument.getDocument(); // String header; // header = tCOSDocument.getHeaderString(); // System.out.println(header); // PDDocumentCatalog tPDDocumentCatalog; // tPDDocumentCatalog = tPDDocument.getDocumentCatalog(); // PDDocumentNameDictionary tPDDocumentNameDictionary; // tPDDocumentNameDictionary = tPDDocumentCatalog.getNames(); // COSDictionary tCOSDictionary; // tCOSDictionary = tPDDocumentNameDictionary.getCOSDictionary(); //tCOSDictionary. // PDPageNode tPDPageNode; // tPDPageNode = tPDDocumentCatalog.getPages(); // List<COSObject> tCOSObjects; // tCOSObjects = tCOSDocument.getObjects(); // int n; // n = tCOSObjects.size(); // System.out.println(n); // COSObject aCOSObject; // String s; // for (int i = 0; i < n; i++) { // aCOSObject = tCOSObjects.get(i); // s = aCOSObject.toString(); // System.out.println(s); // } // XMPMetadata tXMPMetadata; // tXMPMetadata = getXMPMetadata(tPDDocument); // Document XMPDocument; // XMPDocument = tXMPMetadata.getXMPDocument(); // Node n; // n = XMPDocument.getFirstChild(); // parseNode(n); return result; }