Example usage for org.apache.pdfbox.pdmodel.common PDRectangle contains

List of usage examples for org.apache.pdfbox.pdmodel.common PDRectangle contains

Introduction

In this page you can find the example usage for org.apache.pdfbox.pdmodel.common PDRectangle contains.

Prototype

public boolean contains(float x, float y) 

Source Link

Document

Method to determine if the x/y point is inside this rectangle.

Usage

From source file:airviewer.AbstractDocumentCommandWrapper.java

License:Apache License

/**
 * This method finds and returns the "last" (upper most) annotation in
 * annotations that contains the specified x and y coordinates
 *
 * @param annotations A list of candidate annotations
 * @param x An X coordinate in the PDF coordinate system
 * @param y A Y coordinate in the PDF coordinate system
 * @return The annotation if any that was found at {x,y} on the page with
 * pageIndex/* w  w  w .  java 2 s. com*/
 */
protected PDAnnotation getLastAnnotationAtPoint(List<PDAnnotation> annotations, float x, float y) {
    PDAnnotation result = null;

    for (int i = annotations.size() - 1; i >= 0; --i) {
        PDAnnotation candidate = annotations.get(i);
        if (!candidate.isHidden() && !candidate.isReadOnly()) {
            PDRectangle boundingBox = candidate.getRectangle();

            if (boundingBox.contains(x, y)) {
                return candidate;

            }
        }
    }

    return result;
}

From source file:aplicacion.sistema.indexer.test.PDFTextStripperOrg.java

License:Apache License

/**
 * This will process a TextPosition object and add the
 * text to the list of characters on a page.  It takes care of
 * overlapping text.// ww w  .j  ava2s.  c  om
 *
 * @param text The text to process.
 */
protected void processTextPosition(TextPosition text) {
    boolean showCharacter = true;
    if (suppressDuplicateOverlappingText) {
        showCharacter = false;
        String textCharacter = text.getCharacter();
        float textX = text.getX();
        float textY = text.getY();
        List sameTextCharacters = (List) characterListMapping.get(textCharacter);
        if (sameTextCharacters == null) {
            sameTextCharacters = new ArrayList();
            characterListMapping.put(textCharacter, sameTextCharacters);
        }

        // RDD - Here we compute the value that represents the end of the rendered
        // text.  This value is used to determine whether subsequent text rendered
        // on the same line overwrites the current text.
        //
        // We subtract any positive padding to handle cases where extreme amounts
        // of padding are applied, then backed off (not sure why this is done, but there
        // are cases where the padding is on the order of 10x the character width, and
        // the TJ just backs up to compensate after each character).  Also, we subtract
        // an amount to allow for kerning (a percentage of the width of the last
        // character).
        //
        boolean suppressCharacter = false;
        float tolerance = (text.getWidth() / textCharacter.length()) / 3.0f;
        for (int i = 0; i < sameTextCharacters.size() && textCharacter != null; i++) {
            TextPosition character = (TextPosition) sameTextCharacters.get(i);
            String charCharacter = character.getCharacter();
            float charX = character.getX();
            float charY = character.getY();
            //only want to suppress

            if (charCharacter != null &&
            //charCharacter.equals( textCharacter ) &&
                    within(charX, textX, tolerance) && within(charY, textY, tolerance)) {
                suppressCharacter = true;
            }
        }
        if (!suppressCharacter) {
            sameTextCharacters.add(text);
            showCharacter = true;
        }
    }

    if (showCharacter) {
        //if we are showing the character then we need to determine which
        //article it belongs to.
        int foundArticleDivisionIndex = -1;
        int notFoundButFirstLeftAndAboveArticleDivisionIndex = -1;
        int notFoundButFirstLeftArticleDivisionIndex = -1;
        int notFoundButFirstAboveArticleDivisionIndex = -1;
        float x = text.getX();
        float y = text.getY();
        if (shouldSeparateByBeads) {
            for (int i = 0; i < pageArticles.size() && foundArticleDivisionIndex == -1; i++) {
                PDThreadBead bead = (PDThreadBead) pageArticles.get(i);
                if (bead != null) {
                    PDRectangle rect = bead.getRectangle();
                    if (rect.contains(x, y)) {
                        foundArticleDivisionIndex = i * 2 + 1;
                    } else if ((x < rect.getLowerLeftX() || y < rect.getUpperRightY())
                            && notFoundButFirstLeftAndAboveArticleDivisionIndex == -1) {
                        notFoundButFirstLeftAndAboveArticleDivisionIndex = i * 2;
                    } else if (x < rect.getLowerLeftX() && notFoundButFirstLeftArticleDivisionIndex == -1) {
                        notFoundButFirstLeftArticleDivisionIndex = i * 2;
                    } else if (y < rect.getUpperRightY() && notFoundButFirstAboveArticleDivisionIndex == -1) {
                        notFoundButFirstAboveArticleDivisionIndex = i * 2;
                    }
                } else {
                    foundArticleDivisionIndex = 0;
                }
            }
        } else {
            foundArticleDivisionIndex = 0;
        }
        int articleDivisionIndex = -1;
        if (foundArticleDivisionIndex != -1) {
            articleDivisionIndex = foundArticleDivisionIndex;
        } else if (notFoundButFirstLeftAndAboveArticleDivisionIndex != -1) {
            articleDivisionIndex = notFoundButFirstLeftAndAboveArticleDivisionIndex;
        } else if (notFoundButFirstLeftArticleDivisionIndex != -1) {
            articleDivisionIndex = notFoundButFirstLeftArticleDivisionIndex;
        } else if (notFoundButFirstAboveArticleDivisionIndex != -1) {
            articleDivisionIndex = notFoundButFirstAboveArticleDivisionIndex;
        } else {
            articleDivisionIndex = charactersByArticle.size() - 1;
        }

        List textList = (List) charactersByArticle.get(articleDivisionIndex);

        /* In the wild, some PDF encoded documents put diacritics (accents on
         * top of characters) into a separate Tj element.  When displaying them
         * graphically, the two chunks get overlayed.  With text output though,
         * we need to do the overlay. This code recombines the diacritic with
         * its associated character if the two are consecutive.
         */
        if (textList.isEmpty()) {
            textList.add(text);
        } else {
            /* test if we overlap the previous entry.  
             * Note that we are making an assumption that we need to only look back
             * one TextPosition to find what we are overlapping.  
             * This may not always be true. */
            TextPosition previousTextPosition = (TextPosition) textList.get(textList.size() - 1);
            if (text.isDiacritic() && previousTextPosition.contains(text)) {
                previousTextPosition.mergeDiacritic(text, normalize);
            }
            /* If the previous TextPosition was the diacritic, merge it into this
             * one and remove it from the list. */
            else if (previousTextPosition.isDiacritic() && text.contains(previousTextPosition)) {
                text.mergeDiacritic(previousTextPosition, normalize);
                textList.remove(textList.size() - 1);
                textList.add(text);
            } else {
                textList.add(text);
            }
        }
    }
}

From source file:com.repeatability.pdf.PDFTextStripper.java

License:Apache License

/**
 * This will process a TextPosition object and add the text to the list of characters on a page. It takes care of
 * overlapping text./*from   w  w w  . j  a v  a 2 s. c  om*/
 *
 * @param text The text to process.
 */
@Override
protected void processTextPosition(TextPosition text) {
    boolean showCharacter = true;
    if (suppressDuplicateOverlappingText) {
        showCharacter = false;
        String textCharacter = text.getUnicode();
        float textX = text.getX();
        float textY = text.getY();
        TreeMap<Float, TreeSet<Float>> sameTextCharacters = characterListMapping.get(textCharacter);
        if (sameTextCharacters == null) {
            sameTextCharacters = new TreeMap<Float, TreeSet<Float>>();
            characterListMapping.put(textCharacter, sameTextCharacters);
        }
        // RDD - Here we compute the value that represents the end of the rendered
        // text. This value is used to determine whether subsequent text rendered
        // on the same line overwrites the current text.
        //
        // We subtract any positive padding to handle cases where extreme amounts
        // of padding are applied, then backed off (not sure why this is done, but there
        // are cases where the padding is on the order of 10x the character width, and
        // the TJ just backs up to compensate after each character). Also, we subtract
        // an amount to allow for kerning (a percentage of the width of the last
        // character).
        boolean suppressCharacter = false;
        float tolerance = text.getWidth() / textCharacter.length() / 3.0f;

        SortedMap<Float, TreeSet<Float>> xMatches = sameTextCharacters.subMap(textX - tolerance,
                textX + tolerance);
        for (TreeSet<Float> xMatch : xMatches.values()) {
            SortedSet<Float> yMatches = xMatch.subSet(textY - tolerance, textY + tolerance);
            if (!yMatches.isEmpty()) {
                suppressCharacter = true;
                break;
            }
        }
        if (!suppressCharacter) {
            TreeSet<Float> ySet = sameTextCharacters.get(textX);
            if (ySet == null) {
                ySet = new TreeSet<Float>();
                sameTextCharacters.put(textX, ySet);
            }
            ySet.add(textY);
            showCharacter = true;
        }
    }
    if (showCharacter) {
        // if we are showing the character then we need to determine which article it belongs to
        int foundArticleDivisionIndex = -1;
        int notFoundButFirstLeftAndAboveArticleDivisionIndex = -1;
        int notFoundButFirstLeftArticleDivisionIndex = -1;
        int notFoundButFirstAboveArticleDivisionIndex = -1;
        float x = text.getX();
        float y = text.getY();
        if (shouldSeparateByBeads) {
            for (int i = 0; i < beadRectangles.size() && foundArticleDivisionIndex == -1; i++) {
                PDRectangle rect = beadRectangles.get(i);
                if (rect != null) {
                    if (rect.contains(x, y)) {
                        foundArticleDivisionIndex = i * 2 + 1;
                    } else if ((x < rect.getLowerLeftX() || y < rect.getUpperRightY())
                            && notFoundButFirstLeftAndAboveArticleDivisionIndex == -1) {
                        notFoundButFirstLeftAndAboveArticleDivisionIndex = i * 2;
                    } else if (x < rect.getLowerLeftX() && notFoundButFirstLeftArticleDivisionIndex == -1) {
                        notFoundButFirstLeftArticleDivisionIndex = i * 2;
                    } else if (y < rect.getUpperRightY() && notFoundButFirstAboveArticleDivisionIndex == -1) {
                        notFoundButFirstAboveArticleDivisionIndex = i * 2;
                    }
                } else {
                    foundArticleDivisionIndex = 0;
                }
            }
        } else {
            foundArticleDivisionIndex = 0;
        }
        int articleDivisionIndex;
        if (foundArticleDivisionIndex != -1) {
            articleDivisionIndex = foundArticleDivisionIndex;
        } else if (notFoundButFirstLeftAndAboveArticleDivisionIndex != -1) {
            articleDivisionIndex = notFoundButFirstLeftAndAboveArticleDivisionIndex;
        } else if (notFoundButFirstLeftArticleDivisionIndex != -1) {
            articleDivisionIndex = notFoundButFirstLeftArticleDivisionIndex;
        } else if (notFoundButFirstAboveArticleDivisionIndex != -1) {
            articleDivisionIndex = notFoundButFirstAboveArticleDivisionIndex;
        } else {
            articleDivisionIndex = charactersByArticle.size() - 1;
        }

        List<TextPosition> textList = charactersByArticle.get(articleDivisionIndex);

        // In the wild, some PDF encoded documents put diacritics (accents on
        // top of characters) into a separate Tj element. When displaying them
        // graphically, the two chunks get overlayed. With text output though,
        // we need to do the overlay. This code recombines the diacritic with
        // its associated character if the two are consecutive.
        if (textList.isEmpty()) {
            textList.add(text);
        } else {
            // test if we overlap the previous entry.
            // Note that we are making an assumption that we need to only look back
            // one TextPosition to find what we are overlapping.
            // This may not always be true. */
            TextPosition previousTextPosition = textList.get(textList.size() - 1);
            if (text.isDiacritic() && previousTextPosition.contains(text)) {
                previousTextPosition.mergeDiacritic(text);
            }
            // If the previous TextPosition was the diacritic, merge it into this
            // one and remove it from the list.
            else if (previousTextPosition.isDiacritic() && text.contains(previousTextPosition)) {
                text.mergeDiacritic(previousTextPosition);
                textList.remove(textList.size() - 1);
                textList.add(text);
            } else {
                textList.add(text);
            }
        }
    }
}

From source file:de.tudarmstadt.ukp.dkpro.core.io.pdf.PdfLayoutEventStripper.java

License:Apache License

/**
 * This will show add a character to the list of characters to be printed to the text file.
 * /*from w  w w  .ja  v a 2  s .c om*/
 * @param text
 *            The description of the character to display.
 */
@Override
protected void processTextPosition(final TextPosition text) {
    boolean showCharacter = true;
    if (suppressDuplicateOverlappingText) {
        showCharacter = false;
        final String textCharacter = text.getCharacter();
        final float textX = text.getX();
        final float textY = text.getY();
        List<TextPosition> sameTextCharacters = characterListMapping.get(textCharacter);
        if (sameTextCharacters == null) {
            sameTextCharacters = new ArrayList<TextPosition>();
            characterListMapping.put(textCharacter, sameTextCharacters);
        }

        // RDD - Here we compute the value that represents the end of the
        // rendered
        // text. This value is used to determine whether subsequent text
        // rendered
        // on the same line overwrites the current text.
        //
        // We subtract any positive padding to handle cases where extreme
        // amounts
        // of padding are applied, then backed off (not sure why this is
        // done, but there
        // are cases where the padding is on the order of 10x the character
        // width, and
        // the TJ just backs up to compensate after each character). Also,
        // we subtract
        // an amount to allow for kerning (a percentage of the width of the
        // last
        // character).
        //
        boolean suppressCharacter = false;
        final float tolerance = (text.getWidth() / textCharacter.length()) / 3.0f;
        for (int i = 0; i < sameTextCharacters.size() && textCharacter != null; i++) {
            final TextPosition character = sameTextCharacters.get(i);
            final String charCharacter = character.getCharacter();
            final float charX = character.getX();
            final float charY = character.getY();
            // only want to suppress

            if (charCharacter != null &&
            // charCharacter.equals( textCharacter ) &&
                    within(charX, textX, tolerance) && within(charY, textY, tolerance)) {
                suppressCharacter = true;
            }
        }
        if (!suppressCharacter && (text.getCharacter() != null) && (text.getCharacter().length() > 0)) {
            sameTextCharacters.add(text);
            showCharacter = true;
        }
    }

    if (showCharacter) {
        // if we are showing the character then we need to determine which
        // article it belongs to.
        int foundArticleDivisionIndex = -1;
        int notFoundButFirstLeftAndAboveArticleDivisionIndex = -1;
        int notFoundButFirstLeftArticleDivisionIndex = -1;
        int notFoundButFirstAboveArticleDivisionIndex = -1;
        final float x = text.getX();
        final float y = text.getY();
        if (shouldSeparateByBeads) {
            for (int i = 0; i < pageArticles.size() && foundArticleDivisionIndex == -1; i++) {
                final PDThreadBead bead = pageArticles.get(i);
                if (bead != null) {
                    final PDRectangle rect = bead.getRectangle();
                    if (rect.contains(x, y)) {
                        foundArticleDivisionIndex = i * 2 + 1;
                    } else if ((x < rect.getLowerLeftX() || y < rect.getUpperRightY())
                            && notFoundButFirstLeftAndAboveArticleDivisionIndex == -1) {
                        notFoundButFirstLeftAndAboveArticleDivisionIndex = i * 2;
                    } else if (x < rect.getLowerLeftX() && notFoundButFirstLeftArticleDivisionIndex == -1) {
                        notFoundButFirstLeftArticleDivisionIndex = i * 2;
                    } else if (y < rect.getUpperRightY() && notFoundButFirstAboveArticleDivisionIndex == -1) {
                        notFoundButFirstAboveArticleDivisionIndex = i * 2;
                    }
                } else {
                    foundArticleDivisionIndex = 0;
                }
            }
        } else {
            foundArticleDivisionIndex = 0;
        }
        int articleDivisionIndex = -1;
        if (foundArticleDivisionIndex != -1) {
            articleDivisionIndex = foundArticleDivisionIndex;
        } else if (notFoundButFirstLeftAndAboveArticleDivisionIndex != -1) {
            articleDivisionIndex = notFoundButFirstLeftAndAboveArticleDivisionIndex;
        } else if (notFoundButFirstLeftArticleDivisionIndex != -1) {
            articleDivisionIndex = notFoundButFirstLeftArticleDivisionIndex;
        } else if (notFoundButFirstAboveArticleDivisionIndex != -1) {
            articleDivisionIndex = notFoundButFirstAboveArticleDivisionIndex;
        } else {
            articleDivisionIndex = charactersByArticle.size() - 1;
        }
        final List<TextPosition> textList = charactersByArticle.get(articleDivisionIndex);
        textList.add(text);
    }
}

From source file:edu.isi.bmkeg.lapdf.extraction.LAPDFTextStripper.java

License:Apache License

/**
 * This will process a TextPosition object and add the
 * text to the list of characters on a page.  It takes care of
 * overlapping text.//w  w w.j a  va 2 s . co  m
 *
 * @param text The text to process.
 */
protected void processTextPosition(TextPosition text) {
    boolean showCharacter = true;
    if (suppressDuplicateOverlappingText) {
        showCharacter = false;
        String textCharacter = text.getCharacter();
        float textX = text.getX();
        float textY = text.getY();
        TreeMap<Float, TreeSet<Float>> sameTextCharacters = characterListMapping.get(textCharacter);
        if (sameTextCharacters == null) {
            sameTextCharacters = new TreeMap<Float, TreeSet<Float>>();
            characterListMapping.put(textCharacter, sameTextCharacters);
        }
        // RDD - Here we compute the value that represents the end of the rendered
        // text.  This value is used to determine whether subsequent text rendered
        // on the same line overwrites the current text.
        //
        // We subtract any positive padding to handle cases where extreme amounts
        // of padding are applied, then backed off (not sure why this is done, but there
        // are cases where the padding is on the order of 10x the character width, and
        // the TJ just backs up to compensate after each character).  Also, we subtract
        // an amount to allow for kerning (a percentage of the width of the last
        // character).
        //
        boolean suppressCharacter = false;
        float tolerance = (text.getWidth() / textCharacter.length()) / 3.0f;

        SortedMap<Float, TreeSet<Float>> xMatches = sameTextCharacters.subMap(textX - tolerance,
                textX + tolerance);
        for (TreeSet<Float> xMatch : xMatches.values()) {
            SortedSet<Float> yMatches = xMatch.subSet(textY - tolerance, textY + tolerance);
            if (!yMatches.isEmpty()) {
                suppressCharacter = true;
                break;
            }
        }
        if (!suppressCharacter) {
            TreeSet<Float> ySet = sameTextCharacters.get(textX);
            if (ySet == null) {
                ySet = new TreeSet<Float>();
                sameTextCharacters.put(textX, ySet);
            }
            ySet.add(textY);
            showCharacter = true;
        }
    }
    if (showCharacter) {
        //if we are showing the character then we need to determine which
        //article it belongs to.
        int foundArticleDivisionIndex = -1;
        int notFoundButFirstLeftAndAboveArticleDivisionIndex = -1;
        int notFoundButFirstLeftArticleDivisionIndex = -1;
        int notFoundButFirstAboveArticleDivisionIndex = -1;
        float x = text.getX();
        float y = text.getY();
        if (shouldSeparateByBeads) {
            for (int i = 0; i < pageArticles.size() && foundArticleDivisionIndex == -1; i++) {
                PDThreadBead bead = (PDThreadBead) pageArticles.get(i);
                if (bead != null) {
                    PDRectangle rect = bead.getRectangle();
                    if (rect.contains(x, y)) {
                        foundArticleDivisionIndex = i * 2 + 1;
                    } else if ((x < rect.getLowerLeftX() || y < rect.getUpperRightY())
                            && notFoundButFirstLeftAndAboveArticleDivisionIndex == -1) {
                        notFoundButFirstLeftAndAboveArticleDivisionIndex = i * 2;
                    } else if (x < rect.getLowerLeftX() && notFoundButFirstLeftArticleDivisionIndex == -1) {
                        notFoundButFirstLeftArticleDivisionIndex = i * 2;
                    } else if (y < rect.getUpperRightY() && notFoundButFirstAboveArticleDivisionIndex == -1) {
                        notFoundButFirstAboveArticleDivisionIndex = i * 2;
                    }
                } else {
                    foundArticleDivisionIndex = 0;
                }
            }
        } else {
            foundArticleDivisionIndex = 0;
        }
        int articleDivisionIndex = -1;
        if (foundArticleDivisionIndex != -1) {
            articleDivisionIndex = foundArticleDivisionIndex;
        } else if (notFoundButFirstLeftAndAboveArticleDivisionIndex != -1) {
            articleDivisionIndex = notFoundButFirstLeftAndAboveArticleDivisionIndex;
        } else if (notFoundButFirstLeftArticleDivisionIndex != -1) {
            articleDivisionIndex = notFoundButFirstLeftArticleDivisionIndex;
        } else if (notFoundButFirstAboveArticleDivisionIndex != -1) {
            articleDivisionIndex = notFoundButFirstAboveArticleDivisionIndex;
        } else {
            articleDivisionIndex = charactersByArticle.size() - 1;
        }

        List<TextPosition> textList = (List<TextPosition>) charactersByArticle.get(articleDivisionIndex);

        /* In the wild, some PDF encoded documents put diacritics (accents on
         * top of characters) into a separate Tj element.  When displaying them
         * graphically, the two chunks get overlayed.  With text output though,
         * we need to do the overlay. This code recombines the diacritic with
         * its associated character if the two are consecutive.
         */
        if (textList.isEmpty()) {
            textList.add(text);
        } else {
            /* test if we overlap the previous entry.  
             * Note that we are making an assumption that we need to only look back
             * one TextPosition to find what we are overlapping.  
             * This may not always be true. */
            TextPosition previousTextPosition = (TextPosition) textList.get(textList.size() - 1);
            if (text.isDiacritic() && previousTextPosition.contains(text)) {
                previousTextPosition.mergeDiacritic(text, normalize);
            }
            /* If the previous TextPosition was the diacritic, merge it into this
             * one and remove it from the list. */
            else if (previousTextPosition.isDiacritic() && text.contains(previousTextPosition)) {
                text.mergeDiacritic(previousTextPosition, normalize);
                textList.remove(textList.size() - 1);
                textList.add(text);
            } else {
                textList.add(text);
            }
        }
    }
}

From source file:onyx.core.parser.PDFTextStripper.java

License:Apache License

/**
 * This will process a TextPosition object and add the
 * text to the list of characters on a page.  It takes care of
 * overlapping text./*  www.  j a  v  a  2  s . co  m*/
 *
 * @param text The text to process.
 */
protected void processTextPosition(TextPosition text) {
    boolean showCharacter = true;
    if (suppressDuplicateOverlappingText) {
        showCharacter = false;
        String textCharacter = text.getCharacter();
        float textX = text.getX();
        float textY = text.getY();
        TreeMap<Float, TreeSet<Float>> sameTextCharacters = characterListMapping.get(textCharacter);
        if (sameTextCharacters == null) {
            sameTextCharacters = new TreeMap<Float, TreeSet<Float>>();
            characterListMapping.put(textCharacter, sameTextCharacters);
        }

        // RDD - Here we compute the value that represents the end of the rendered
        // text.  This value is used to determine whether subsequent text rendered
        // on the same line overwrites the current text.
        //
        // We subtract any positive padding to handle cases where extreme amounts
        // of padding are applied, then backed off (not sure why this is done, but there
        // are cases where the padding is on the order of 10x the character width, and
        // the TJ just backs up to compensate after each character).  Also, we subtract
        // an amount to allow for kerning (a percentage of the width of the last
        // character).
        //
        boolean suppressCharacter = false;
        float tolerance = (text.getWidth() / textCharacter.length()) / 3.0f;

        SortedMap<Float, TreeSet<Float>> xMatches = sameTextCharacters.subMap(textX - tolerance,
                textX + tolerance);
        for (TreeSet<Float> xMatch : xMatches.values()) {
            SortedSet<Float> yMatches = xMatch.subSet(textY - tolerance, textY + tolerance);
            if (!yMatches.isEmpty()) {
                suppressCharacter = true;
                break;
            }
        }

        if (!suppressCharacter) {
            TreeSet<Float> ySet = sameTextCharacters.get(textX);
            if (ySet == null) {
                ySet = new TreeSet<Float>();
                sameTextCharacters.put(textX, ySet);
            }
            ySet.add(textY);
            showCharacter = true;
        }
    }

    if (showCharacter) {
        //if we are showing the character then we need to determine which
        //article it belongs to.
        int foundArticleDivisionIndex = -1;
        int notFoundButFirstLeftAndAboveArticleDivisionIndex = -1;
        int notFoundButFirstLeftArticleDivisionIndex = -1;
        int notFoundButFirstAboveArticleDivisionIndex = -1;
        float x = text.getX();
        float y = text.getY();
        if (shouldSeparateByBeads) {
            for (int i = 0; i < pageArticles.size() && foundArticleDivisionIndex == -1; i++) {
                PDThreadBead bead = (PDThreadBead) pageArticles.get(i);
                if (bead != null) {
                    PDRectangle rect = bead.getRectangle();
                    if (rect.contains(x, y)) {
                        foundArticleDivisionIndex = i * 2 + 1;
                    } else if ((x < rect.getLowerLeftX() || y < rect.getUpperRightY())
                            && notFoundButFirstLeftAndAboveArticleDivisionIndex == -1) {
                        notFoundButFirstLeftAndAboveArticleDivisionIndex = i * 2;
                    } else if (x < rect.getLowerLeftX() && notFoundButFirstLeftArticleDivisionIndex == -1) {
                        notFoundButFirstLeftArticleDivisionIndex = i * 2;
                    } else if (y < rect.getUpperRightY() && notFoundButFirstAboveArticleDivisionIndex == -1) {
                        notFoundButFirstAboveArticleDivisionIndex = i * 2;
                    }
                } else {
                    foundArticleDivisionIndex = 0;
                }
            }
        } else {
            foundArticleDivisionIndex = 0;
        }
        int articleDivisionIndex = -1;
        if (foundArticleDivisionIndex != -1) {
            articleDivisionIndex = foundArticleDivisionIndex;
        } else if (notFoundButFirstLeftAndAboveArticleDivisionIndex != -1) {
            articleDivisionIndex = notFoundButFirstLeftAndAboveArticleDivisionIndex;
        } else if (notFoundButFirstLeftArticleDivisionIndex != -1) {
            articleDivisionIndex = notFoundButFirstLeftArticleDivisionIndex;
        } else if (notFoundButFirstAboveArticleDivisionIndex != -1) {
            articleDivisionIndex = notFoundButFirstAboveArticleDivisionIndex;
        } else {
            articleDivisionIndex = charactersByArticle.size() - 1;
        }

        List<TextPosition> textList = (List<TextPosition>) charactersByArticle.get(articleDivisionIndex);

        /* In the wild, some PDF encoded documents put diacritics (accents on
         * top of characters) into a separate Tj element.  When displaying them
         * graphically, the two chunks get overlayed.  With text output though,
         * we need to do the overlay. This code recombines the diacritic with
         * its associated character if the two are consecutive.
         */
        if (textList.isEmpty()) {
            textList.add(text);
        } else {
            /* test if we overlap the previous entry.  
             * Note that we are making an assumption that we need to only look back
             * one TextPosition to find what we are overlapping.  
             * This may not always be true. */
            TextPosition previousTextPosition = (TextPosition) textList.get(textList.size() - 1);
            if (text.isDiacritic() && previousTextPosition.contains(text)) {
                previousTextPosition.mergeDiacritic(text, normalize);
            }
            /* If the previous TextPosition was the diacritic, merge it into this
             * one and remove it from the list. */
            else if (previousTextPosition.isDiacritic() && text.contains(previousTextPosition)) {
                text.mergeDiacritic(previousTextPosition, normalize);
                textList.remove(textList.size() - 1);
                textList.add(text);
            } else {
                textList.add(text);
            }
        }
    }
}

From source file:org.polarsys.kitalpha.doc.doc2model.tikaparsing.pdf.Doc2ModelTextStripper.java

License:Apache License

/**
 * This will process a TextPosition object and add the
 * text to the list of characters on a page.  It takes care of
 * overlapping text./*from   w ww .  ja  v  a  2 s . c om*/
 *
 * @param text The text to process.
 */
protected void processTextPosition(TextPosition rawtext) {

    StylizedTextPosition text = (StylizedTextPosition) processStyle(rawtext);

    boolean showCharacter = true;
    if (suppressDuplicateOverlappingText) {
        showCharacter = false;
        String textCharacter = text.getCharacter();
        float textX = text.getX();
        float textY = text.getY();
        List<TextPosition> sameTextCharacters = (List<TextPosition>) characterListMapping.get(textCharacter);
        if (sameTextCharacters == null) {
            sameTextCharacters = new ArrayList<TextPosition>();
            characterListMapping.put(textCharacter, sameTextCharacters);
        }

        // RDD - Here we compute the value that represents the end of the rendered
        // text.  This value is used to determine whether subsequent text rendered
        // on the same line overwrites the current text.
        //
        // We subtract any positive padding to handle cases where extreme amounts
        // of padding are applied, then backed off (not sure why this is done, but there
        // are cases where the padding is on the order of 10x the character width, and
        // the TJ just backs up to compensate after each character).  Also, we subtract
        // an amount to allow for kerning (a percentage of the width of the last
        // character).
        //
        boolean suppressCharacter = false;
        float tolerance = (text.getWidth() / textCharacter.length()) / 3.0f;
        for (int i = 0; i < sameTextCharacters.size() && textCharacter != null; i++) {
            TextPosition character = sameTextCharacters.get(i);
            String charCharacter = character.getCharacter();
            float charX = character.getX();
            float charY = character.getY();
            //only want to suppress

            if (charCharacter != null &&
            //charCharacter.equals( textCharacter ) &&
                    within(charX, textX, tolerance) && within(charY, textY, tolerance)) {
                suppressCharacter = true;
            }
        }
        if (!suppressCharacter) {
            sameTextCharacters.add(text);
            showCharacter = true;
        }
    }

    if (showCharacter) {
        //if we are showing the character then we need to determine which
        //article it belongs to.
        int foundArticleDivisionIndex = -1;
        int notFoundButFirstLeftAndAboveArticleDivisionIndex = -1;
        int notFoundButFirstLeftArticleDivisionIndex = -1;
        int notFoundButFirstAboveArticleDivisionIndex = -1;
        float x = text.getX();
        float y = text.getY();
        if (shouldSeparateByBeads) {
            for (int i = 0; i < pageArticles.size() && foundArticleDivisionIndex == -1; i++) {
                PDThreadBead bead = (PDThreadBead) pageArticles.get(i);
                if (bead != null) {
                    PDRectangle rect = bead.getRectangle();
                    if (rect.contains(x, y)) {
                        foundArticleDivisionIndex = i * 2 + 1;
                    } else if ((x < rect.getLowerLeftX() || y < rect.getUpperRightY())
                            && notFoundButFirstLeftAndAboveArticleDivisionIndex == -1) {
                        notFoundButFirstLeftAndAboveArticleDivisionIndex = i * 2;
                    } else if (x < rect.getLowerLeftX() && notFoundButFirstLeftArticleDivisionIndex == -1) {
                        notFoundButFirstLeftArticleDivisionIndex = i * 2;
                    } else if (y < rect.getUpperRightY() && notFoundButFirstAboveArticleDivisionIndex == -1) {
                        notFoundButFirstAboveArticleDivisionIndex = i * 2;
                    }
                } else {
                    foundArticleDivisionIndex = 0;
                }
            }
        } else {
            foundArticleDivisionIndex = 0;
        }
        int articleDivisionIndex = -1;
        if (foundArticleDivisionIndex != -1) {
            articleDivisionIndex = foundArticleDivisionIndex;
        } else if (notFoundButFirstLeftAndAboveArticleDivisionIndex != -1) {
            articleDivisionIndex = notFoundButFirstLeftAndAboveArticleDivisionIndex;
        } else if (notFoundButFirstLeftArticleDivisionIndex != -1) {
            articleDivisionIndex = notFoundButFirstLeftArticleDivisionIndex;
        } else if (notFoundButFirstAboveArticleDivisionIndex != -1) {
            articleDivisionIndex = notFoundButFirstAboveArticleDivisionIndex;
        } else {
            articleDivisionIndex = charactersByArticle.size() - 1;
        }

        List<TextPosition> textList = (List<TextPosition>) charactersByArticle.get(articleDivisionIndex);

        /* In the wild, some PDF encoded documents put diacritics (accents on
         * top of characters) into a separate Tj element.  When displaying them
         * graphically, the two chunks get overlayed.  With text output though,
         * we need to do the overlay. This code recombines the diacritic with
         * its associated character if the two are consecutive.
         */
        if (textList.isEmpty()) {
            textList.add(text);
        } else {
            /* test if we overlap the previous entry.  
             * Note that we are making an assumption that we need to only look back
             * one TextPosition to find what we are overlapping.  
             * This may not always be true. */
            TextPosition previousTextPosition = (TextPosition) textList.get(textList.size() - 1);
            if (text.isDiacritic() && previousTextPosition.contains(text)) {
                previousTextPosition.mergeDiacritic(text, normalize);
            }
            /* If the previous TextPosition was the diacritic, merge it into this
             * one and remove it from the list. */
            else if (previousTextPosition.isDiacritic() && text.contains(previousTextPosition)) {
                text.mergeDiacritic(previousTextPosition, normalize);
                textList.remove(textList.size() - 1);
                textList.add(text);
            } else {
                textList.add(text);
            }
        }
    }
}