List of usage examples for org.apache.pdfbox.pdmodel PDPage getThreadBeads
public List<PDThreadBead> getThreadBeads()
From source file:aplicacion.sistema.indexer.test.PDFTextStripperOrg.java
License:Apache License
/** * This will process the contents of a page. * * @param page The page to process./*from ww w. j a v a2 s .co m*/ * @param content The contents of the page. * * @throws IOException If there is an error processing the page. */ protected void processPage(PDPage page, COSStream content) throws IOException { if (currentPageNo >= startPage && currentPageNo <= endPage && (startBookmarkPageNumber == -1 || currentPageNo >= startBookmarkPageNumber) && (endBookmarkPageNumber == -1 || currentPageNo <= endBookmarkPageNumber)) { startPage(page); pageArticles = page.getThreadBeads(); int numberOfArticleSections = 1 + pageArticles.size() * 2; if (!shouldSeparateByBeads) { numberOfArticleSections = 1; } int originalSize = charactersByArticle.size(); charactersByArticle.setSize(numberOfArticleSections); for (int i = 0; i < numberOfArticleSections; i++) { if (numberOfArticleSections < originalSize) { ((List) charactersByArticle.get(i)).clear(); } else { charactersByArticle.set(i, new ArrayList()); } } characterListMapping.clear(); processStream(page, page.findResources(), content); writePage(); endPage(page); } }
From source file:com.repeatability.pdf.PDFTextStripper.java
License:Apache License
private void fillBeadRectangles(PDPage page) { beadRectangles = new ArrayList<PDRectangle>(); for (PDThreadBead bead : page.getThreadBeads()) { if (bead == null) { // can't skip, because of null entry handling in processTextPosition() beadRectangles.add(null);/*from w w w . j a v a 2 s. c o m*/ continue; } PDRectangle rect = bead.getRectangle(); // bead rectangle is in PDF coordinates (y=0 is bottom), // glyphs are in image coordinates (y=0 is top), // so we must flip PDRectangle mediaBox = page.getMediaBox(); float upperRightY = mediaBox.getUpperRightY() - rect.getLowerLeftY(); float lowerLeftY = mediaBox.getUpperRightY() - rect.getUpperRightY(); rect.setLowerLeftY(lowerLeftY); rect.setUpperRightY(upperRightY); // adjust for cropbox PDRectangle cropBox = page.getCropBox(); if (cropBox.getLowerLeftX() != 0 || cropBox.getLowerLeftY() != 0) { rect.setLowerLeftX(rect.getLowerLeftX() - cropBox.getLowerLeftX()); rect.setLowerLeftY(rect.getLowerLeftY() - cropBox.getLowerLeftY()); rect.setUpperRightX(rect.getUpperRightX() - cropBox.getLowerLeftX()); rect.setUpperRightY(rect.getUpperRightY() - cropBox.getLowerLeftY()); } beadRectangles.add(rect); } }
From source file:com.yiyihealth.tools.test.DrawPrintTextLocations.java
License:Apache License
private void stripPage(int page) throws IOException { PDFRenderer pdfRenderer = new PDFRenderer(document); image = pdfRenderer.renderImage(page, SCALE); PDPage pdPage = document.getPage(page); PDRectangle cropBox = pdPage.getCropBox(); // flip y-axis flipAT = new AffineTransform(); flipAT.translate(0, pdPage.getBBox().getHeight()); flipAT.scale(1, -1);//from ww w .j a v a 2s. c o m // page may be rotated rotateAT = new AffineTransform(); int rotation = pdPage.getRotation(); if (rotation != 0) { PDRectangle mediaBox = pdPage.getMediaBox(); switch (rotation) { case 90: rotateAT.translate(mediaBox.getHeight(), 0); break; case 270: rotateAT.translate(0, mediaBox.getWidth()); break; case 180: rotateAT.translate(mediaBox.getWidth(), mediaBox.getHeight()); break; default: break; } rotateAT.rotate(Math.toRadians(rotation)); } g2d = image.createGraphics(); g2d.setStroke(new BasicStroke(0.1f)); g2d.scale(SCALE, SCALE); setStartPage(page + 1); setEndPage(page + 1); Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream()); writeText(document, dummy); // beads in green g2d.setStroke(new BasicStroke(0.4f)); List<PDThreadBead> pageArticles = pdPage.getThreadBeads(); for (PDThreadBead bead : pageArticles) { PDRectangle r = bead.getRectangle(); GeneralPath p = r .transform(Matrix.getTranslateInstance(-cropBox.getLowerLeftX(), cropBox.getLowerLeftY())); Shape s = flipAT.createTransformedShape(p); s = rotateAT.createTransformedShape(s); g2d.setColor(Color.green); g2d.draw(s); } g2d.dispose(); String imageFilename = filename; int pt = imageFilename.lastIndexOf('.'); imageFilename = imageFilename.substring(0, pt) + "-marked-" + (page + 1) + ".png"; ImageIO.write(image, "png", new File(imageFilename)); }
From source file:de.tudarmstadt.ukp.dkpro.core.io.pdf.PdfLayoutEventStripper.java
License:Apache License
/** * This will process the contents of a page. * //from www . j ava2s . c o m * @param page * The page to process. * @param content * The contents of the page. * * @throws IOException * If there is an error processing the page. */ protected void processPage(final PDPage page, final COSStream content) throws IOException { if ((currentPageNo >= startPage) && (currentPageNo <= endPage)) { startPage(startPage, Math.min(maxPage, endPage), currentPageNo, page); pageArticles = page.getThreadBeads(); int numberOfArticleSections = 1 + pageArticles.size() * 2; if (!shouldSeparateByBeads) { numberOfArticleSections = 1; } final int originalSize = charactersByArticle.size(); charactersByArticle.setSize(numberOfArticleSections); for (int i = 0; i < numberOfArticleSections; i++) { if (numberOfArticleSections < originalSize) { charactersByArticle.get(i).clear(); } else { charactersByArticle.set(i, new ArrayList<TextPosition>()); } } characterListMapping.clear(); // processStream will call showCharacter were we will simply // collect all the TextPositions for the page processStream(page, page.findResources(), content); // Now we do the real processing for (int i = 0; i < charactersByArticle.size(); i++) { processArticle(charactersByArticle.get(i)); } endPage(startPage, endPage, currentPageNo, page); } }
From source file:edu.isi.bmkeg.lapdf.extraction.LAPDFTextStripper.java
License:Apache License
/**
 * Processes the contents of one page. Nothing happens unless the current
 * page number falls within the start/end page range and within the bookmark
 * range (a bookmark bound of -1 is treated as unset).
 *
 * @param page The page to process.
 * @param content The contents of the page.
 *
 * @throws IOException If there is an error processing the page.
 */
protected void processPage(PDPage page, COSStream content) throws IOException {
    if (currentPageNo < startPage || currentPageNo > endPage) {
        return;
    }
    if (startBookmarkPageNumber != -1 && currentPageNo < startBookmarkPageNumber) {
        return;
    }
    if (endBookmarkPageNumber != -1 && currentPageNo > endBookmarkPageNumber) {
        return;
    }

    startPage(page);
    pageArticles = page.getThreadBeads();

    int beadCount = pageArticles.size();
    int sectionCount = 1;
    if (shouldSeparateByBeads) {
        sectionCount = 1 + beadCount * 2;
    }

    int previousSize = charactersByArticle.size();
    charactersByArticle.setSize(sectionCount);

    if (sectionCount < previousSize) {
        // The vector shrank: the remaining lists are emptied in place.
        for (int slot = 0; slot < sectionCount; slot++) {
            ((List<TextPosition>) charactersByArticle.get(slot)).clear();
        }
    } else {
        // Same size or larger: every slot gets a brand-new list.
        for (int slot = 0; slot < sectionCount; slot++) {
            charactersByArticle.set(slot, new ArrayList<TextPosition>());
        }
    }

    characterListMapping.clear();
    processStream(page, page.findResources(), content);
    writePage();
    endPage(page);
}
From source file:onyx.core.parser.PDFTextStripper.java
License:Apache License
/** * This will process the contents of a page. * * @param page The page to process./* ww w . jav a2s . c o m*/ * @param content The contents of the page. * * @throws IOException If there is an error processing the page. */ protected void processPage(PDPage page, COSStream content) throws IOException { if (currentPageNo >= startPage && currentPageNo <= endPage && (startBookmarkPageNumber == -1 || currentPageNo >= startBookmarkPageNumber) && (endBookmarkPageNumber == -1 || currentPageNo <= endBookmarkPageNumber)) { startPage(page); pageArticles = page.getThreadBeads(); int numberOfArticleSections = 1 + pageArticles.size() * 2; if (!shouldSeparateByBeads) { numberOfArticleSections = 1; } int originalSize = charactersByArticle.size(); charactersByArticle.setSize(numberOfArticleSections); for (int i = 0; i < numberOfArticleSections; i++) { if (numberOfArticleSections < originalSize) { ((List<TextPosition>) charactersByArticle.get(i)).clear(); } else { charactersByArticle.set(i, new ArrayList<TextPosition>()); } } characterListMapping.clear(); processStream(page, page.findResources(), content); writePage(); endPage(page); } }