List of usage examples for org.apache.pdfbox.pdmodel PDPage getAnnotations
public List<PDAnnotation> getAnnotations() throws IOException
From source file:com.vns.pdf.impl.PdfDocument.java
License:Apache License
private List<Annotation> parseAnnotation(PDPage pdPage) throws IOException { List<Annotation> annotations = new ArrayList<>(); for (PDAnnotation annt : pdPage.getAnnotations()) { if (annt instanceof PDAnnotationLink) { PDAnnotationLink link = (PDAnnotationLink) annt; PDRectangle rect = link.getRectangle(); float x = rect.getLowerLeftX(); float y = rect.getUpperRightY(); float width = rect.getWidth(); float height = rect.getHeight(); int rotation = pdPage.getRotation(); if (rotation == 0) { PDRectangle pageSize = pdPage.getMediaBox(); y = pageSize.getHeight() - y; } else if (rotation == 90) { //do nothing }/*w ww . j ava2s . c om*/ ActionData actionData = parsePDAction(link.getAction()); if (actionData == null) { actionData = parsePDDestination(link.getDestination()); } if (actionData != null) { Annotation a = new Annotation(x, y, width, height, actionData.destX, actionData.destY, actionData.destPage, actionData.destZoom); annotations.add(a); } } } return annotations; }
From source file:com.zilbo.flamingSailor.TE.PDFParser.java
License:Apache License
@Override protected void endPage(PDPage page) throws IOException { super.endPage(page); int pieceID = 0; Map<String, Map<Integer, Long>> fontCounts = new HashMap<>(); List<TextPiece> wordsOfThisPage = new ArrayList<>(); for (List<TextPosition> aCharactersByArticle : charactersByArticle) { // int len = aCharactersByArticle.size(); for (TextPosition t : aCharactersByArticle) { // copy information TextPiece w = new TextPiece(pieceID++); PDFont font = t.getFont();/* ww w.j a va 2s . c o m*/ PDFontDescriptor fontDescriptor = font.getFontDescriptor(); // w.setFontDescriptor(fontDescriptor); if (fontDescriptor == null) { w.setFontName("UNKNOWN"); } else { w.setFontName(fontDescriptor.getFontName()); } /* * 100: a simple step to fix the font size to the normal range, for those documents in unknown codes that PDFBox can not process now */ if (t.getFontSize() < 0.3 && t.getYScale() <= 1.0) { w.setFontSize(t.getFontSize() * 100); w.setHeight(Math.max(t.getYScale(), t.getFontSize()) * 100); w.setXScale(t.getXScale()); w.setYScale(t.getYScale()); } else { if (t.getYScale() < 0.3 && t.getFontSize() <= 1.0) { w.setYScale(t.getYScale() * 100); w.setXScale(t.getXScale() * 100); w.setHeight(Math.max(t.getYScale() * 100, t.getFontSize())); } else { w.setFontSize(t.getFontSize()); w.setHeight(Math.max(t.getYScale(), t.getFontSize())); w.setXScale(t.getXScale()); w.setYScale(t.getYScale()); } } Map<Integer, Long> counts = fontCounts.get(w.getFontName()); if (counts == null) { counts = new HashMap<>(); fontCounts.put(w.getFontName(), counts); } Long count = counts.get((int) Math.round(w.getHeight())); if (count == null) { count = 1L; } else { count += 1L; } counts.put((int) Math.round(w.getHeight()), count); w.setWidth(Math.abs(t.getWidth())); w.setGeom(t.getX(), t.getY(), w.getWidth(), w.getHeight()); w.setText(t.getCharacter()); w.setWidthOfSpace(t.getWidthOfSpace()); wordsOfThisPage.add(w); } } currentPage.processPage(wordsOfThisPage, fontCounts); 
currentPage.setText(outString.getBuffer().toString()); outString.getBuffer().setLength(0); List<PDAnnotation> annotations = page.getAnnotations(); for (PDAnnotation annotation : annotations) { if (annotation instanceof PDAnnotationLink) { PDAnnotationLink l = (PDAnnotationLink) annotation; PDRectangle rect = l.getRectangle(); PDDestination dest = l.getDestination(); if (dest instanceof PDPageXYZDestination) { PDPageXYZDestination xyzDestination = (PDPageXYZDestination) dest; PDPage pageDest = ((PDPageXYZDestination) dest).getPage(); if (rect != null) { if (xyzDestination.getPageNumber() < 0) { int pageNumber = allpages.indexOf(pageDest) + 1; Rectangle2D hotbox = new Rectangle2D.Double(rect.getLowerLeftX(), rect.getLowerLeftY(), (rect.getUpperRightX() - rect.getLowerLeftX()), (rect.getUpperRightY() - rect.getLowerLeftY())); Point2D toPoint = new Point2D.Double(xyzDestination.getLeft(), xyzDestination.getTop()); currentPage.addLink(new PDLink(hotbox, pageNumber, toPoint)); } } } } } /* The following code is REALLY raw. initial testing seemed to show memory leaks, and was REALLY slow. PDResources r = page.getResources(); Map<String, PDXObjectImage> images = r.getImages(); for (Map.Entry<String, PDXObjectImage> e : images.entrySet()) { BufferedImage bi = null; try { // currentPage.addImage(bi); // (e.getValue()).write2file("/tmp/II" + e.getKey()); if (e.getValue() instanceof PDJpeg) { PDJpeg jpg = (PDJpeg) e.getValue(); bi = jpg.getRGBImage(); ColorSpace cs = bi.getColorModel().getColorSpace(); File jpgFile = new File("/tmp/II" + e.getKey() + ".jpg"); if (cs instanceof ColorSpaceCMYK) { logger.info("Ignoring image with CMYK color space"); } else { // ImageIO.write(bi, "jpg", jpgFile); jpg.write2file("/tmp/II"+ e.getKey()); } } else { (e.getValue()).write2file("/tmp/II" + e.getKey()); } } catch (Exception ee) { logger.info("can't read image ;-(", ee); } } */ textPageList.add(currentPage); currentPage = null; }
From source file:de.berber.kindle.annotator.lib.Annotation.java
License:Apache License
/**
 * Adds this annotation to {@code page} when it belongs to page {@code currentPageNumber}.
 *
 * <p>Nothing happens when the annotation belongs to a different page or when the conversion
 * yields {@code null}. An {@link IOException} is logged (with its cause) and not rethrown.
 *
 * @param currentPageNumber
 *            The page number of {@code page}.
 * @param documentOutline
 *            The pdf outline
 * @param page
 *            The pdf page.
 */
@SuppressWarnings("unchecked")
public void toPDAnnotation(final @Nonnegative int currentPageNumber,
        final @Nonnull PDDocumentOutline documentOutline, final @Nonnull PDPage page) {
    if (this.page != currentPageNumber) {
        return;
    }

    try {
        // The cast is kept as in the original code, which suppresses an unchecked warning for it.
        final List<PDAnnotation> annotations = ((List<PDAnnotation>) page.getAnnotations());
        final PDAnnotation annotation = toPDAnnotation(documentOutline, page);
        if (annotation != null) {
            annotations.add(annotation);
        }
    } catch (IOException e) {
        // Pass the exception along so the log shows why reading failed.
        LOG.error("Cannot read annotations from PDF.", e);
    }
}
From source file:dev.ztgnrw.ExtractEmbeddedFiles.java
License:Apache License
/** * This is the main method./*ww w .j a v a 2 s . co m*/ * * @param args The command line arguments. * * @throws IOException If there is an error parsing the document. */ public static void extractEmbeddedFiles(String file) throws IOException { PDDocument document = null; try { File pdfFile = new File(file); String filePath = pdfFile.getParent() + System.getProperty("file.separator"); document = PDDocument.load(pdfFile); PDDocumentNameDictionary namesDictionary = new PDDocumentNameDictionary(document.getDocumentCatalog()); PDEmbeddedFilesNameTreeNode efTree = namesDictionary.getEmbeddedFiles(); if (efTree != null) { Map<String, PDComplexFileSpecification> names = efTree.getNames(); if (names != null) { extractFiles(names, filePath); } else { List<PDNameTreeNode<PDComplexFileSpecification>> kids = efTree.getKids(); for (PDNameTreeNode<PDComplexFileSpecification> node : kids) { names = node.getNames(); extractFiles(names, filePath); } } } // extract files from annotations for (PDPage page : document.getPages()) { for (PDAnnotation annotation : page.getAnnotations()) { if (annotation instanceof PDAnnotationFileAttachment) { PDAnnotationFileAttachment annotationFileAttachment = (PDAnnotationFileAttachment) annotation; PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) annotationFileAttachment .getFile(); PDEmbeddedFile embeddedFile = getEmbeddedFile(fileSpec); extractFile(filePath, fileSpec.getFilename(), embeddedFile); } } } } finally { if (document != null) { document.close(); } } }
From source file:fi.riista.feature.permit.invoice.pdf.PermitHarvestInvoicePdfBuilder.java
private void addFormFieldData() throws IOException { textField("iban", model.getInvoiceAccountDetails().getCombinedBankNameAndIbanString()); textField("bic", model.getInvoiceAccountDetails().getBic().toString()); textField("saaja", model.getPaymentRecipient()); textField("maksaja", Joiner.on('\n').join(model.getInvoiceRecipient().formatAsLines())); textField("summa", model.getAmountText()); textField("viitenumero", model.getInvoiceReferenceForHuman()); textField("lisatiedot", model.getInvoiceAdditionalInfo()); this.acroForm.setNeedAppearances(false); // Fix annotations for (PDPage page : this.pdfDocument.getPages()) { for (PDAnnotation annot : page.getAnnotations()) { annot.setPage(page);/*w ww. j a v a 2 s. c om*/ } } // Define font resources names used in PDF template final PDResources dr = new PDResources(); dr.put(COSName.getPDFName("Helv"), PDType1Font.HELVETICA); dr.put(COSName.getPDFName("HeBo"), PDType1Font.HELVETICA_BOLD); this.acroForm.setDefaultResources(dr); // Convert form fields to text this.acroForm.flatten(); }
From source file:helper.pdfpreprocessing.pdf.TextHighlight.java
License:Apache License
/**
 * Marks every bounding box of a match on the page.
 *
 * <p>Per bounding box, depending on the arguments:
 * <ul>
 * <li>empty {@code comment}: a filled rectangle is drawn in {@code color};</li>
 * <li>{@code withId}: the matched string is written at the box's upper right corner;</li>
 * <li>non-empty {@code comment}, not {@code commentOnly}: a highlight annotation carrying the
 * comment is added to the page;</li>
 * <li>non-empty {@code comment} and {@code commentOnly}: existing annotations whose contents
 * hold the match string in parentheses get the comment appended.</li>
 * </ul>
 *
 * @param color fill and highlight colour
 * @param contentStream stream the rectangle and id text are drawn on
 * @param markingMatch the match whose positions are marked
 * @param height height of the drawn rectangle
 * @param withId whether the matched string is written next to the box
 * @param page page that receives or holds the annotations
 * @param comment comment text; an empty string selects plain rectangle drawing
 * @param commentOnly whether existing annotations are extended instead of adding new ones
 * @return {@code true} if the match had at least one bounding box, {@code false} otherwise
 * @throws IOException if drawing or reading the annotations fails
 */
private boolean markupMatch(Color color, PDPageContentStream contentStream, Match markingMatch, int height,
        boolean withId, PDPage page, String comment, boolean commentOnly) throws IOException {
    final List<PDRectangle> textBoundingBoxes = getTextBoundingBoxes(markingMatch.positions);
    if (textBoundingBoxes.isEmpty()) {
        return false;
    }

    contentStream.setNonStrokingColor(color);
    for (PDRectangle textBoundingBox : textBoundingBoxes) {
        // The mark is never narrower than 10 units.
        final float markWidth = Math
                .max(Math.abs(textBoundingBox.getUpperRightX() - textBoundingBox.getLowerLeftX()), 10);

        if (comment.isEmpty()) {
            contentStream.addRect(textBoundingBox.getLowerLeftX(), textBoundingBox.getLowerLeftY(), markWidth,
                    height);
            contentStream.fill();
        }

        if (withId) {
            PDFont font = PDType1Font.HELVETICA;
            contentStream.beginText();
            contentStream.setFont(font, 5);
            contentStream.newLineAtOffset(textBoundingBox.getUpperRightX(), textBoundingBox.getUpperRightY());
            contentStream.showText(markingMatch.str);
            contentStream.endText();
        }

        if (!comment.isEmpty() && !commentOnly) {
            PDAnnotationTextMarkup txtMark = new PDAnnotationTextMarkup(
                    PDAnnotationTextMarkup.SUB_TYPE_HIGHLIGHT);

            PDRectangle position = new PDRectangle();
            position.setLowerLeftX(textBoundingBox.getLowerLeftX());
            position.setLowerLeftY(textBoundingBox.getLowerLeftY());
            position.setUpperRightX(textBoundingBox.getLowerLeftX() + markWidth);
            position.setUpperRightY(textBoundingBox.getLowerLeftY() + 10);
            txtMark.setRectangle(position);

            float[] quads = new float[8];
            quads[0] = position.getLowerLeftX(); // x1
            quads[1] = position.getUpperRightY() - 2; // y1
            quads[2] = position.getUpperRightX(); // x2
            quads[3] = quads[1]; // y2
            quads[4] = quads[0]; // x3
            quads[5] = position.getLowerLeftY() - 2; // y3
            quads[6] = quads[2]; // x4
            quads[7] = quads[5]; // y4
            txtMark.setQuadPoints(quads);

            txtMark.setConstantOpacity((float) 0.5);
            txtMark.setContents("Missing Assumption/s (" + markingMatch.str + "):\n" + comment);

            float[] colorArray = new float[] { 0, 0, 0 };
            colorArray = color.getColorComponents(colorArray);
            PDColor hColor = new PDColor(colorArray, PDDeviceRGB.INSTANCE);
            txtMark.setColor(hColor);

            txtMark.setCreationDate(Calendar.getInstance());
            txtMark.setTitlePopup("Assumption Error");
            page.getAnnotations().add(txtMark);
        } else if (!comment.isEmpty() && commentOnly) {
            for (int i = 0; i < page.getAnnotations().size(); i++) {
                String extractedComment = page.getAnnotations().get(i).getContents();
                if (extractedComment == null) {
                    continue;
                }
                int idStart = extractedComment.indexOf("(") + 1;
                int idEnd = extractedComment.indexOf(")");
                if (idEnd < idStart) {
                    // No "(id)" part in these contents (e.g. an annotation created elsewhere);
                    // substring would throw, so leave this annotation untouched.
                    continue;
                }
                String commentID = extractedComment.substring(idStart, idEnd);
                // NOTE(review): the comment is appended only when the contents already contain it,
                // which looks inverted - confirm the intended condition before changing it.
                if (markingMatch.str.equals(commentID) && extractedComment.contains(comment)) {
                    page.getAnnotations().get(i).setContents(extractedComment + "\n" + comment);
                }
            }
        }
    }
    return true;
}
From source file:hightlighting.PDFTextAnnotator.java
License:Apache License
/** * Highlights a pattern within the PDF with the default color * Returns the list of added annotations for further modification * Note: it will process every page, but cannot process patterns that span multiple pages * Note: it will not work for top-bottom text (such as Chinese) * /*from ww w . j a v a 2s .c o m*/ * @param pdf * PDDocument * @param pattern * Pattern (regex) * @throws Exception */ public List<PDAnnotationTextMarkup> highlight(PDDocument pdf, Pattern pattern) throws Exception { if (textCache == null) { throw new Exception("TextCache was not initilized, please run initialize on the document first"); } List<PDPage> pages = pdf.getDocumentCatalog().getAllPages(); ArrayList<PDAnnotationTextMarkup> highligts = new ArrayList<PDAnnotationTextMarkup>(); for (int pageIndex = getStartPage() - 1; pageIndex < getEndPage() && pageIndex < pages.size(); pageIndex++) { PDPage page = pages.get(pageIndex); List<PDAnnotation> annotations = page.getAnnotations(); List<Match> matches = this.textCache.getTextPositions(pageIndex + 1, pattern); for (Match match : matches) { List<PDRectangle> textBoundingBoxes = getTextBoundingBoxes(match.positions); if (textBoundingBoxes.size() > 0) { float[] quads = new float[8]; int cursor = 0; for (PDRectangle rect : textBoundingBoxes) { PDAnnotationTextMarkup markup = new PDAnnotationTextMarkup( PDAnnotationTextMarkup.SUB_TYPE_HIGHLIGHT); markup.setRectangle(rect); float[] tmp = computeQuads(rect); for (int i = 0; i < tmp.length; i++) { quads[cursor + i] = tmp[i]; } //cursor = cursor + 8; markup.setQuadPoints(quads); markup.setConstantOpacity((float) 0.8); markup.setColour(getDefaultColor()); markup.setPrinted(true); markup.setContents(match.str); annotations.add(markup); highligts.add(markup); } } } } return highligts; }
From source file:mj.ocraptor.extraction.tika.parser.pdf.PDF2XHTML.java
License:Apache License
@Override protected void endPage(PDPage page) throws IOException { try {// www . ja va 2 s . c om writeParagraphEnd(); // TODO: remove once PDFBOX-1143 is fixed: if (config.getExtractAnnotationText()) { for (Object o : page.getAnnotations()) { if (o instanceof PDAnnotationLink) { PDAnnotationLink annotationlink = (PDAnnotationLink) o; if (annotationlink.getAction() != null) { PDAction action = annotationlink.getAction(); if (action instanceof PDActionURI) { PDActionURI uri = (PDActionURI) action; String link = uri.getURI(); if (link != null) { handler.startElement("div", "class", "annotation"); handler.startElement("a", "href", link); handler.endElement("a"); handler.endElement("div"); } } } } if (o instanceof PDAnnotationMarkup) { PDAnnotationMarkup annot = (PDAnnotationMarkup) o; String title = annot.getTitlePopup(); String subject = annot.getSubject(); String contents = annot.getContents(); // TODO: maybe also annot.getRichContents()? if (title != null || subject != null || contents != null) { handler.startElement("div", "class", "annotation"); if (title != null) { handler.startElement("div", "class", "annotationTitle"); handler.characters(title); handler.endElement("div"); } if (subject != null) { handler.startElement("div", "class", "annotationSubject"); handler.characters(subject); handler.endElement("div"); } if (contents != null) { handler.startElement("div", "class", "annotationContents"); handler.characters(contents); handler.endElement("div"); } handler.endElement("div"); } } } } handler.endElement("div"); } catch (SAXException e) { throw new IOExceptionWithCause("Unable to end a page", e); } }
From source file:net.bookinaction.ExtractAnnotations.java
License:Apache License
public void doJob(String job, Float[] pA) throws IOException { PDDocument document = null;//from ww w.j av a 2 s . co m Stamper s = new Stamper(); // utility class final String job_file = job + ".pdf"; final String dic_file = job + "-dict.txt"; final String new_job = job + "-new.pdf"; PrintWriter writer = new PrintWriter(dic_file); ImageLocationListener imageLocationsListener = new ImageLocationListener(); AnnotationMaker annotMaker = new AnnotationMaker(); try { document = PDDocument.load(new File(job_file)); int pageNum = 0; for (PDPage page : document.getPages()) { pageNum++; PDRectangle cropBox = page.getCropBox(); List<PDAnnotation> annotations = page.getAnnotations(); // extract image locations List<Rectangle2D> imageRects = new ArrayList<Rectangle2D>(); imageLocationsListener.setImageRects(imageRects); imageLocationsListener.processPage(page); int im = 0; for (Rectangle2D pdImageRect : imageRects) { s.recordImage(writer, pageNum, "[im" + im + "]", (Rectangle2D.Float) pdImageRect); annotations.add(annotMaker.squareAnnotation(Color.YELLOW, (Rectangle2D.Float) pdImageRect, "[im" + im + "]")); im++; } PDFTextStripperByArea stripper = new PDFTextStripperByArea(); int j = 0; List<PDAnnotation> viableAnnots = new ArrayList(); for (PDAnnotation annot : annotations) { if (annot instanceof PDAnnotationTextMarkup || annot instanceof PDAnnotationLink) { stripper.addRegion(Integer.toString(j++), s.getAwtRect( s.adjustedRect(annot.getRectangle(), pA[0], pA[1], pA[2], pA[3]), cropBox)); viableAnnots.add(annot); } else if (annot instanceof PDAnnotationPopup || annot instanceof PDAnnotationText) { viableAnnots.add(annot); } } stripper.extractRegions(page); List<PDRectangle> rects = new ArrayList<PDRectangle>(); List<String> comments = new ArrayList<String>(); List<String> highlightTexts = new ArrayList<String>(); j = 0; for (PDAnnotation viableAnnot : viableAnnots) { if (viableAnnot instanceof PDAnnotationTextMarkup) { String highlightText = 
stripper.getTextForRegion(Integer.toString(j++)); String withoutCR = highlightText.replace((char) 0x0A, '^'); String comment = viableAnnot.getContents(); String colorString = String.format("%06x", viableAnnot.getColor().toRGB()); PDRectangle aRect = s.adjustedRect(viableAnnot.getRectangle(), pA[4], pA[5], pA[6], pA[7]); rects.add(aRect); comments.add(comment); highlightTexts.add(highlightText); s.recordTextMarkup(writer, pageNum, comment, withoutCR, aRect, colorString); } else if (viableAnnot instanceof PDAnnotationText) { String comment = viableAnnot.getContents(); String colorString = String.format("%06x", viableAnnot.getColor().toRGB()); for (Rectangle2D pdImageRect : imageRects) { if (pdImageRect.contains(viableAnnot.getRectangle().getLowerLeftX(), viableAnnot.getRectangle().getLowerLeftY())) { s.recordTextMarkup(writer, pageNum, comment, "", (Rectangle2D.Float) pdImageRect, colorString); annotations.add(annotMaker.squareAnnotation(Color.GREEN, (Rectangle2D.Float) pdImageRect, comment)); } ; } } } PDPageContentStream canvas = new PDPageContentStream(document, page, true, true, true); int i = 0; for (PDRectangle pdRect : rects) { String comment = comments.get(i); String highlightText = highlightTexts.get(i); //annotations.add(linkAnnotation(pdRect, comment, highlightText)); //annotations.add(annotationSquareCircle(pdRect, BLUE)); s.showBox(canvas, new Rectangle2D.Float(pdRect.getLowerLeftX(), pdRect.getUpperRightY(), pdRect.getWidth(), pdRect.getHeight()), cropBox, Color.BLUE); i++; } canvas.close(); } writer.close(); document.save(new_job); } finally { if (document != null) { document.close(); } } }
From source file:net.padaf.preflight.helpers.PagesValidationHelper.java
License:Apache License
/** * /*from w ww .j ava 2 s. c o m*/ * @param page * @param handler * @param result * @return * @throws ValidationException */ protected boolean validateAnnotation(PDPage page, DocumentHandler handler, List<ValidationError> result) throws ValidationException { try { List<?> lAnnots = page.getAnnotations(); for (Object object : lAnnots) { if (object instanceof PDAnnotation) { COSDictionary cosAnnot = ((PDAnnotation) object).getDictionary(); AnnotationValidator validator = this.annotFact.getAnnotationValidator(cosAnnot, handler, result); if (validator != null) { return validator.validate(result); } } } } catch (IOException e) { throw new ValidationException("Unable to access Annotation", e); } // --- No annotations, validation OK return true; }