Example usage for org.apache.pdfbox.pdmodel PDPage getThreadBeads

Introduction

On this page you can find example usages of org.apache.pdfbox.pdmodel.PDPage.getThreadBeads().

Prototype

public List<PDThreadBead> getThreadBeads()

Source Link

Document

This will get a list of PDThreadBead objects, which are article threads in the document.

Usage

From source file:aplicacion.sistema.indexer.test.PDFTextStripperOrg.java

License:Apache License

/**
 * This will process the contents of a page.
 *
 * @param page The page to process.
 * @param content The contents of the page.
 *
 * @throws IOException If there is an error processing the page.
 */
protected void processPage(PDPage page, COSStream content) throws IOException {
    // Nothing to do for pages outside the requested page range or outside the
    // bookmark-delimited range (a bookmark page number of -1 disables that bound).
    boolean beforeRange = currentPageNo < startPage
            || (startBookmarkPageNumber != -1 && currentPageNo < startBookmarkPageNumber);
    boolean afterRange = currentPageNo > endPage
            || (endBookmarkPageNumber != -1 && currentPageNo > endBookmarkPageNumber);
    if (beforeRange || afterRange) {
        return;
    }

    startPage(page);
    pageArticles = page.getThreadBeads();

    // 1 + 2 * (number of beads) sections when separating by beads, a single section otherwise.
    int sectionsForBeads = 1 + pageArticles.size() * 2;
    int sectionCount = shouldSeparateByBeads ? sectionsForBeads : 1;

    int previousCount = charactersByArticle.size();
    charactersByArticle.setSize(sectionCount);

    // When the vector shrank, the surviving lists are emptied and reused;
    // otherwise every slot receives a brand-new list.
    boolean reuseExisting = sectionCount < previousCount;
    for (int section = 0; section < sectionCount; section++) {
        if (reuseExisting) {
            ((List) charactersByArticle.get(section)).clear();
        } else {
            charactersByArticle.set(section, new ArrayList());
        }
    }

    characterListMapping.clear();
    processStream(page, page.findResources(), content);
    writePage();
    endPage(page);
}

From source file:com.repeatability.pdf.PDFTextStripper.java

License:Apache License

/**
 * Rebuilds {@code beadRectangles} from the thread beads of the given page.
 *
 * One entry is added per bead, in the order returned by {@code page.getThreadBeads()}.
 * Each rectangle is flipped vertically against the media box and, when the crop box
 * does not start at the origin, shifted by the crop box's lower-left corner.
 * A {@code null} entry is added for a {@code null} bead and for a bead that has no
 * rectangle, so that list positions keep matching bead positions.
 *
 * @param page the page whose thread beads are converted
 */
private void fillBeadRectangles(PDPage page) {
    beadRectangles = new ArrayList<PDRectangle>();

    // Both boxes depend only on the page, so they are looked up once instead of per bead.
    PDRectangle mediaBox = page.getMediaBox();
    PDRectangle cropBox = page.getCropBox();
    float pageTop = mediaBox.getUpperRightY();
    float cropX = cropBox.getLowerLeftX();
    float cropY = cropBox.getLowerLeftY();
    boolean cropShifted = cropX != 0 || cropY != 0;

    for (PDThreadBead bead : page.getThreadBeads()) {
        PDRectangle rect = bead == null ? null : bead.getRectangle();
        if (rect == null) {
            // can't skip, because of null entry handling in processTextPosition();
            // a bead without a rectangle is treated like a missing bead instead of
            // failing with a NullPointerException below
            beadRectangles.add(null);
            continue;
        }

        // bead rectangle is in PDF coordinates (y=0 is bottom),
        // glyphs are in image coordinates (y=0 is top),
        // so we must flip
        float upperRightY = pageTop - rect.getLowerLeftY();
        float lowerLeftY = pageTop - rect.getUpperRightY();
        rect.setLowerLeftY(lowerLeftY);
        rect.setUpperRightY(upperRightY);

        // adjust for cropbox
        if (cropShifted) {
            rect.setLowerLeftX(rect.getLowerLeftX() - cropX);
            rect.setLowerLeftY(rect.getLowerLeftY() - cropY);
            rect.setUpperRightX(rect.getUpperRightX() - cropX);
            rect.setUpperRightY(rect.getUpperRightY() - cropY);
        }

        beadRectangles.add(rect);
    }
}

From source file:com.yiyihealth.tools.test.DrawPrintTextLocations.java

License:Apache License

/**
 * Renders one page to an image, outlines the page's thread beads on it in green and
 * saves the result as a PNG file.
 *
 * The output name is {@code filename} with its extension (if any) replaced by
 * {@code -marked-<page + 1>.png}.
 *
 * @param page zero-based page index; {@code page + 1} is used for the start/end page
 *             settings and in the output file name
 * @throws IOException if rendering, text extraction or writing the image fails
 */
private void stripPage(int page) throws IOException {
    PDFRenderer pdfRenderer = new PDFRenderer(document);
    image = pdfRenderer.renderImage(page, SCALE);

    PDPage pdPage = document.getPage(page);
    PDRectangle cropBox = pdPage.getCropBox();

    // flip y-axis
    flipAT = new AffineTransform();
    flipAT.translate(0, pdPage.getBBox().getHeight());
    flipAT.scale(1, -1);

    // page may be rotated
    rotateAT = new AffineTransform();
    int rotation = pdPage.getRotation();
    if (rotation != 0) {
        PDRectangle mediaBox = pdPage.getMediaBox();
        switch (rotation) {
        case 90:
            rotateAT.translate(mediaBox.getHeight(), 0);
            break;
        case 270:
            rotateAT.translate(0, mediaBox.getWidth());
            break;
        case 180:
            rotateAT.translate(mediaBox.getWidth(), mediaBox.getHeight());
            break;
        default:
            break;
        }
        rotateAT.rotate(Math.toRadians(rotation));
    }

    g2d = image.createGraphics();
    try {
        g2d.setStroke(new BasicStroke(0.1f));
        g2d.scale(SCALE, SCALE);

        setStartPage(page + 1);
        setEndPage(page + 1);

        // The extracted text itself is discarded.
        // NOTE(review): presumably writeText() triggers callbacks that draw onto g2d - confirm.
        Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream());
        writeText(document, dummy);

        // beads in green
        g2d.setStroke(new BasicStroke(0.4f));
        List<PDThreadBead> pageArticles = pdPage.getThreadBeads();
        for (PDThreadBead bead : pageArticles) {
            // the bead list may contain null entries, and a bead may lack a rectangle;
            // there is nothing to draw in either case
            PDRectangle r = bead == null ? null : bead.getRectangle();
            if (r == null) {
                continue;
            }
            GeneralPath p = r
                    .transform(Matrix.getTranslateInstance(-cropBox.getLowerLeftX(), cropBox.getLowerLeftY()));

            Shape s = flipAT.createTransformedShape(p);
            s = rotateAT.createTransformedShape(s);
            g2d.setColor(Color.green);
            g2d.draw(s);
        }
    } finally {
        // release the graphics context even when text extraction or drawing fails
        g2d.dispose();
    }

    // a filename without any '.' keeps its full name instead of failing in substring()
    int pt = filename.lastIndexOf('.');
    String baseName = pt < 0 ? filename : filename.substring(0, pt);
    String imageFilename = baseName + "-marked-" + (page + 1) + ".png";
    ImageIO.write(image, "png", new File(imageFilename));
}

From source file:de.tudarmstadt.ukp.dkpro.core.io.pdf.PdfLayoutEventStripper.java

License:Apache License

/**
 * This will process the contents of a page.
 *
 * @param page
 *            The page to process.
 * @param content
 *            The contents of the page.
 * 
 * @throws IOException
 *             If there is an error processing the page.
 */
protected void processPage(final PDPage page, final COSStream content) throws IOException {
    // Pages outside [startPage, endPage] are ignored entirely.
    if ((currentPageNo < startPage) || (currentPageNo > endPage)) {
        return;
    }

    startPage(startPage, Math.min(maxPage, endPage), currentPageNo, page);
    pageArticles = page.getThreadBeads();

    // 1 + 2 * (number of beads) sections when separating by beads, one section otherwise.
    final int sectionsForBeads = 1 + pageArticles.size() * 2;
    final int sectionCount = shouldSeparateByBeads ? sectionsForBeads : 1;

    final int previousCount = charactersByArticle.size();
    charactersByArticle.setSize(sectionCount);
    if (sectionCount < previousCount) {
        // the vector shrank: empty and reuse the lists that remain
        for (int section = 0; section < sectionCount; section++) {
            charactersByArticle.get(section).clear();
        }
    } else {
        // same size or grown: give every slot a fresh list
        for (int section = 0; section < sectionCount; section++) {
            charactersByArticle.set(section, new ArrayList<TextPosition>());
        }
    }

    characterListMapping.clear();

    // Pass 1: run the content stream for this page.
    processStream(page, page.findResources(), content);

    // Pass 2: hand each article section to processArticle(); the size is re-read on
    // every iteration, exactly as an index-based for loop would do.
    int article = 0;
    while (article < charactersByArticle.size()) {
        processArticle(charactersByArticle.get(article));
        article++;
    }

    endPage(startPage, endPage, currentPageNo, page);
}

From source file:edu.isi.bmkeg.lapdf.extraction.LAPDFTextStripper.java

License:Apache License

/**
 * This will process the contents of a page.
 *
 * @param page The page to process.
 * @param content The contents of the page.
 *
 * @throws IOException If there is an error processing the page.
 */
protected void processPage(PDPage page, COSStream content) throws IOException {
    // A page is handled only when it lies inside the page range and inside the
    // bookmark range; a bookmark page number of -1 switches that bound off.
    boolean withinPageRange = currentPageNo >= startPage && currentPageNo <= endPage;
    boolean withinBookmarkRange = (startBookmarkPageNumber == -1 || currentPageNo >= startBookmarkPageNumber)
            && (endBookmarkPageNumber == -1 || currentPageNo <= endBookmarkPageNumber);

    if (withinPageRange && withinBookmarkRange) {
        startPage(page);
        pageArticles = page.getThreadBeads();

        // a single section unless separating by beads, in which case 1 + 2 * beads
        int sectionsForBeads = 1 + pageArticles.size() * 2;
        int sectionTotal = shouldSeparateByBeads ? sectionsForBeads : 1;

        int sizeBefore = charactersByArticle.size();
        charactersByArticle.setSize(sectionTotal);

        // reuse (clear) the existing lists only when the vector has shrunk
        boolean shrunk = sectionTotal < sizeBefore;
        int slot = 0;
        while (slot < sectionTotal) {
            if (shrunk) {
                ((List<TextPosition>) charactersByArticle.get(slot)).clear();
            } else {
                charactersByArticle.set(slot, new ArrayList<TextPosition>());
            }
            slot++;
        }

        characterListMapping.clear();
        processStream(page, page.findResources(), content);
        writePage();
        endPage(page);
    }
}

From source file:onyx.core.parser.PDFTextStripper.java

License:Apache License

/**
 * This will process the contents of a page.
 *
 * @param page The page to process.
 * @param content The contents of the page.
 *
 * @throws IOException If there is an error processing the page.
 */
protected void processPage(PDPage page, COSStream content) throws IOException {
    // Guard 1: the page must lie inside [startPage, endPage].
    if (currentPageNo < startPage || currentPageNo > endPage) {
        return;
    }
    // Guard 2: the page must not precede the start bookmark (-1 means no start bookmark bound).
    if (startBookmarkPageNumber != -1 && currentPageNo < startBookmarkPageNumber) {
        return;
    }
    // Guard 3: the page must not follow the end bookmark (-1 means no end bookmark bound).
    if (endBookmarkPageNumber != -1 && currentPageNo > endBookmarkPageNumber) {
        return;
    }

    startPage(page);
    pageArticles = page.getThreadBeads();

    // 1 + 2 * (number of beads) when separating by beads, otherwise exactly one section
    int beadBasedSections = 1 + pageArticles.size() * 2;
    int wantedSections = shouldSeparateByBeads ? beadBasedSections : 1;

    int formerSections = charactersByArticle.size();
    charactersByArticle.setSize(wantedSections);
    if (wantedSections < formerSections) {
        // fewer sections than before: wipe the lists that are kept
        for (int idx = 0; idx < wantedSections; idx++) {
            ((List<TextPosition>) charactersByArticle.get(idx)).clear();
        }
    } else {
        // otherwise install a new, empty list in every slot
        for (int idx = 0; idx < wantedSections; idx++) {
            charactersByArticle.set(idx, new ArrayList<TextPosition>());
        }
    }

    characterListMapping.clear();
    processStream(page, page.findResources(), content);
    writePage();
    endPage(page);
}