Example usage for org.apache.pdfbox.pdmodel PDPage getThreadBeads

List of usage examples for org.apache.pdfbox.pdmodel PDPage getThreadBeads

Introduction

On this page you can find example usages of org.apache.pdfbox.pdmodel PDPage getThreadBeads.

Prototype

public List<PDThreadBead> getThreadBeads() 

Source Link

Document

This will get a list of PDThreadBead objects, which are article threads in the document.

Usage

From source file:aplicacion.sistema.indexer.test.PDFTextStripperOrg.java

License:Apache License

/**
 * This will process the contents of a page.
 *
 * @param page The page to process./*from   ww  w. j a  v a2  s  .co  m*/
 * @param content The contents of the page.
 *
 * @throws IOException If there is an error processing the page.
 */
protected void processPage(PDPage page, COSStream content) throws IOException {
    if (currentPageNo >= startPage && currentPageNo <= endPage
            && (startBookmarkPageNumber == -1 || currentPageNo >= startBookmarkPageNumber)
            && (endBookmarkPageNumber == -1 || currentPageNo <= endBookmarkPageNumber)) {
        startPage(page);
        pageArticles = page.getThreadBeads();
        int numberOfArticleSections = 1 + pageArticles.size() * 2;
        if (!shouldSeparateByBeads) {
            numberOfArticleSections = 1;
        }
        int originalSize = charactersByArticle.size();
        charactersByArticle.setSize(numberOfArticleSections);
        for (int i = 0; i < numberOfArticleSections; i++) {
            if (numberOfArticleSections < originalSize) {
                ((List) charactersByArticle.get(i)).clear();
            } else {
                charactersByArticle.set(i, new ArrayList());
            }
        }

        characterListMapping.clear();
        processStream(page, page.findResources(), content);
        writePage();
        endPage(page);
    }

}

From source file:com.repeatability.pdf.PDFTextStripper.java

License:Apache License

/**
 * Rebuilds {@code beadRectangles} with one entry per thread bead of the page.
 *
 * <p>Each bead rectangle is flipped vertically against the media box and then
 * shifted by the crop box origin. A {@code null} entry is stored for a bead
 * that is {@code null} or has no rectangle, so that the list indices stay
 * aligned with the list returned by {@code page.getThreadBeads()}.
 *
 * @param page the page whose thread beads are read
 */
private void fillBeadRectangles(PDPage page) {
    beadRectangles = new ArrayList<PDRectangle>();

    // Both boxes belong to the page, not to a bead: look them up once.
    PDRectangle mediaBox = page.getMediaBox();
    PDRectangle cropBox = page.getCropBox();
    boolean cropBoxShifted = cropBox.getLowerLeftX() != 0 || cropBox.getLowerLeftY() != 0;

    for (PDThreadBead bead : page.getThreadBeads()) {
        // A bead without a rectangle would otherwise cause a
        // NullPointerException in the coordinate flip below.
        PDRectangle rect = bead == null ? null : bead.getRectangle();
        if (rect == null) {
            // can't skip, because of null entry handling in processTextPosition()
            beadRectangles.add(null);
            continue;
        }

        // bead rectangle is in PDF coordinates (y=0 is bottom),
        // glyphs are in image coordinates (y=0 is top),
        // so we must flip
        float upperRightY = mediaBox.getUpperRightY() - rect.getLowerLeftY();
        float lowerLeftY = mediaBox.getUpperRightY() - rect.getUpperRightY();
        rect.setLowerLeftY(lowerLeftY);
        rect.setUpperRightY(upperRightY);

        // adjust for cropbox
        if (cropBoxShifted) {
            rect.setLowerLeftX(rect.getLowerLeftX() - cropBox.getLowerLeftX());
            rect.setLowerLeftY(rect.getLowerLeftY() - cropBox.getLowerLeftY());
            rect.setUpperRightX(rect.getUpperRightX() - cropBox.getLowerLeftX());
            rect.setUpperRightY(rect.getUpperRightY() - cropBox.getLowerLeftY());
        }

        beadRectangles.add(rect);
    }
}

From source file:com.yiyihealth.tools.test.DrawPrintTextLocations.java

License:Apache License

/**
 * Renders one page to an image, runs text extraction for that page, outlines
 * the page's thread beads in green and writes the result as a PNG file.
 *
 * <p>The output name is {@code filename} without its extension (or the whole
 * name when it has none) followed by {@code -marked-<page + 1>.png}.
 *
 * @param page page index as accepted by {@code document.getPage(int)};
 *             {@code page + 1} is used as start and end page for extraction
 * @throws IOException if rendering, text extraction or writing the image fails
 */
private void stripPage(int page) throws IOException {
    PDFRenderer pdfRenderer = new PDFRenderer(document);
    image = pdfRenderer.renderImage(page, SCALE);

    PDPage pdPage = document.getPage(page);
    PDRectangle cropBox = pdPage.getCropBox();

    // flip y-axis
    flipAT = new AffineTransform();
    flipAT.translate(0, pdPage.getBBox().getHeight());
    flipAT.scale(1, -1);

    // page may be rotated
    rotateAT = new AffineTransform();
    int rotation = pdPage.getRotation();
    if (rotation != 0) {
        PDRectangle mediaBox = pdPage.getMediaBox();
        switch (rotation) {
        case 90:
            rotateAT.translate(mediaBox.getHeight(), 0);
            break;
        case 270:
            rotateAT.translate(0, mediaBox.getWidth());
            break;
        case 180:
            rotateAT.translate(mediaBox.getWidth(), mediaBox.getHeight());
            break;
        default:
            break;
        }
        rotateAT.rotate(Math.toRadians(rotation));
    }

    g2d = image.createGraphics();
    try {
        g2d.setStroke(new BasicStroke(0.1f));
        g2d.scale(SCALE, SCALE);

        setStartPage(page + 1);
        setEndPage(page + 1);

        // The extracted text itself is discarded; writeText is run for the
        // callbacks it triggers while g2d is set.
        Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream());
        writeText(document, dummy);

        // beads in green
        g2d.setStroke(new BasicStroke(0.4f));
        List<PDThreadBead> pageArticles = pdPage.getThreadBeads();
        for (PDThreadBead bead : pageArticles) {
            // A missing bead or a bead without a rectangle has nothing to draw.
            PDRectangle r = bead == null ? null : bead.getRectangle();
            if (r == null) {
                continue;
            }
            GeneralPath p = r
                    .transform(Matrix.getTranslateInstance(-cropBox.getLowerLeftX(), cropBox.getLowerLeftY()));

            Shape s = flipAT.createTransformedShape(p);
            s = rotateAT.createTransformedShape(s);
            g2d.setColor(Color.green);
            g2d.draw(s);
        }
    } finally {
        // Release the graphics context even when extraction or drawing fails.
        g2d.dispose();
    }

    String imageFilename = filename;
    int pt = imageFilename.lastIndexOf('.');
    // lastIndexOf returns -1 for a name without extension; keep the full name then.
    String baseName = pt < 0 ? imageFilename : imageFilename.substring(0, pt);
    imageFilename = baseName + "-marked-" + (page + 1) + ".png";
    ImageIO.write(image, "png", new File(imageFilename));
}

From source file:de.tudarmstadt.ukp.dkpro.core.io.pdf.PdfLayoutEventStripper.java

License:Apache License

/**
 * Handles one page of the document.
 *
 * <p>Pages whose number is outside {@code startPage}..{@code endPage} are
 * ignored. For any other page the per-article character lists are reset, the
 * content stream is processed, and every list in {@code charactersByArticle}
 * is handed to {@code processArticle}, framed by the {@code startPage} and
 * {@code endPage} callbacks.
 *
 * @param page
 *            The page to process.
 * @param content
 *            The contents of the page.
 *
 * @throws IOException
 *             If there is an error processing the page.
 */
protected void processPage(final PDPage page, final COSStream content) throws IOException {
    if (currentPageNo < startPage || currentPageNo > endPage) {
        return;
    }

    startPage(startPage, Math.min(maxPage, endPage), currentPageNo, page);
    pageArticles = page.getThreadBeads();

    final int beadSections = 1 + pageArticles.size() * 2;
    final int sectionCount = shouldSeparateByBeads ? beadSections : 1;
    final int previousCount = charactersByArticle.size();
    charactersByArticle.setSize(sectionCount);

    if (sectionCount < previousCount) {
        // Fewer sections than before: empty the lists that remain.
        for (int idx = 0; idx < sectionCount; idx++) {
            charactersByArticle.get(idx).clear();
        }
    } else {
        // Same number or more: install a fresh list in every slot.
        for (int idx = 0; idx < sectionCount; idx++) {
            charactersByArticle.set(idx, new ArrayList<TextPosition>());
        }
    }

    characterListMapping.clear();

    // Content stream processing fills the per-article lists
    // (per the original note, via showCharacter).
    processStream(page, page.findResources(), content);

    // Second pass: work through the characters collected for each article.
    for (int idx = 0; idx < charactersByArticle.size(); idx++) {
        processArticle(charactersByArticle.get(idx));
    }

    endPage(startPage, endPage, currentPageNo, page);
}

From source file:edu.isi.bmkeg.lapdf.extraction.LAPDFTextStripper.java

License:Apache License

/**
 * Handles one page of the document.
 *
 * <p>The page is skipped when {@code currentPageNo} falls before
 * {@code startPage} or after {@code endPage}, or outside the bookmark page
 * range (a bookmark bound of -1 is not checked). Otherwise the per-article
 * character lists are reset, the content stream is processed and
 * {@code writePage()} is called, framed by {@code startPage(page)} and
 * {@code endPage(page)}.
 *
 * @param page The page to process.
 * @param content The contents of the page.
 *
 * @throws IOException If there is an error processing the page.
 */
protected void processPage(PDPage page, COSStream content) throws IOException {
    boolean beforeRange = currentPageNo < startPage
            || (startBookmarkPageNumber != -1 && currentPageNo < startBookmarkPageNumber);
    boolean afterRange = currentPageNo > endPage
            || (endBookmarkPageNumber != -1 && currentPageNo > endBookmarkPageNumber);
    if (beforeRange || afterRange) {
        return;
    }

    startPage(page);
    pageArticles = page.getThreadBeads();

    int beadSections = 1 + pageArticles.size() * 2;
    int sectionCount = shouldSeparateByBeads ? beadSections : 1;
    int previousCount = charactersByArticle.size();
    charactersByArticle.setSize(sectionCount);

    if (sectionCount < previousCount) {
        // Fewer sections than before: empty the lists that remain.
        for (int idx = 0; idx < sectionCount; idx++) {
            ((List<TextPosition>) charactersByArticle.get(idx)).clear();
        }
    } else {
        // Same number or more: install a fresh list in every slot.
        for (int idx = 0; idx < sectionCount; idx++) {
            charactersByArticle.set(idx, new ArrayList<TextPosition>());
        }
    }

    characterListMapping.clear();
    processStream(page, page.findResources(), content);
    writePage();
    endPage(page);
}

From source file:onyx.core.parser.PDFTextStripper.java

License:Apache License

/**
 * This will process the contents of a page.
 *
 * @param page The page to process./* ww w  .  jav a2s .  c o  m*/
 * @param content The contents of the page.
 *
 * @throws IOException If there is an error processing the page.
 */
protected void processPage(PDPage page, COSStream content) throws IOException {
    if (currentPageNo >= startPage && currentPageNo <= endPage
            && (startBookmarkPageNumber == -1 || currentPageNo >= startBookmarkPageNumber)
            && (endBookmarkPageNumber == -1 || currentPageNo <= endBookmarkPageNumber)) {
        startPage(page);
        pageArticles = page.getThreadBeads();
        int numberOfArticleSections = 1 + pageArticles.size() * 2;
        if (!shouldSeparateByBeads) {
            numberOfArticleSections = 1;
        }
        int originalSize = charactersByArticle.size();
        charactersByArticle.setSize(numberOfArticleSections);
        for (int i = 0; i < numberOfArticleSections; i++) {
            if (numberOfArticleSections < originalSize) {
                ((List<TextPosition>) charactersByArticle.get(i)).clear();
            } else {
                charactersByArticle.set(i, new ArrayList<TextPosition>());
            }
        }

        characterListMapping.clear();
        processStream(page, page.findResources(), content);
        writePage();
        endPage(page);
    }

}