Example usage of org.apache.pdfbox.text.TextPosition.mergeDiacritic

Introduction

This page shows example usages of org.apache.pdfbox.text.TextPosition.mergeDiacritic.

Prototype

public void mergeDiacritic(TextPosition diacritic)

Source Link

Document

Merge a single character TextPosition into the current object.

Usage

From source file:com.repeatability.pdf.PDFTextStripper.java

License:Apache License

/**
 * This will process a TextPosition object and add the text to the list of characters on a page. It takes care of
 * overlapping text.
 *
 * @param text The text to process.
 */
@Override
protected void processTextPosition(TextPosition text) {
    // Step 1: optional duplicate suppression. A glyph is dropped when the same
    // unicode string was already recorded within a small tolerance of this position
    // (some PDFs draw the same text several times, e.g. to fake bold).
    if (suppressDuplicateOverlappingText) {
        String glyph = text.getUnicode();
        float glyphX = text.getX();
        float glyphY = text.getY();

        // Positions already seen for this glyph: x -> set of y values.
        TreeMap<Float, TreeSet<Float>> seenPositions = characterListMapping.get(glyph);
        if (seenPositions == null) {
            seenPositions = new TreeMap<Float, TreeSet<Float>>();
            characterListMapping.put(glyph, seenPositions);
        }

        // Tolerance is one third of the average per-character width of this
        // TextPosition, which leaves some room for kerning differences.
        float tolerance = text.getWidth() / glyph.length() / 3.0f;

        SortedMap<Float, TreeSet<Float>> nearbyColumns = seenPositions.subMap(glyphX - tolerance,
                glyphX + tolerance);
        for (TreeSet<Float> recordedYs : nearbyColumns.values()) {
            SortedSet<Float> nearbyYs = recordedYs.subSet(glyphY - tolerance, glyphY + tolerance);
            if (!nearbyYs.isEmpty()) {
                // Same glyph already rendered at (nearly) the same spot: suppress it.
                return;
            }
        }

        // First occurrence at this position: remember it so later duplicates are caught.
        TreeSet<Float> ysAtThisX = seenPositions.get(glyphX);
        if (ysAtThisX == null) {
            ysAtThisX = new TreeSet<Float>();
            seenPositions.put(glyphX, ysAtThisX);
        }
        ysAtThisX.add(glyphY);
    }

    // Step 2: the character will be shown, so work out which article it belongs to.
    int containingIndex = -1;
    int leftAndAboveIndex = -1;
    int leftIndex = -1;
    int aboveIndex = -1;
    float x = text.getX();
    float y = text.getY();
    if (!shouldSeparateByBeads) {
        containingIndex = 0;
    } else {
        for (int i = 0; i < beadRectangles.size(); i++) {
            PDRectangle bead = beadRectangles.get(i);
            if (bead == null) {
                containingIndex = 0;
                break;
            }
            if (bead.contains(x, y)) {
                // Odd slots hold the text inside bead i.
                containingIndex = i * 2 + 1;
                break;
            }
            // Even slots hold text that falls before bead i. Only the first
            // matching bead is remembered for each category.
            // NOTE(review): as written, the left-only and above-only indices can only be
            // set once the combined index is already set, so the combined index always
            // wins in the fallback below - confirm whether '||' was meant to be '&&'.
            if ((x < bead.getLowerLeftX() || y < bead.getUpperRightY()) && leftAndAboveIndex == -1) {
                leftAndAboveIndex = i * 2;
            } else if (x < bead.getLowerLeftX() && leftIndex == -1) {
                leftIndex = i * 2;
            } else if (y < bead.getUpperRightY() && aboveIndex == -1) {
                aboveIndex = i * 2;
            }
        }
    }

    // Fall back through the candidates in priority order; the last article is the
    // catch-all when nothing matched.
    int articleIndex = containingIndex;
    if (articleIndex == -1) {
        articleIndex = leftAndAboveIndex;
    }
    if (articleIndex == -1) {
        articleIndex = leftIndex;
    }
    if (articleIndex == -1) {
        articleIndex = aboveIndex;
    }
    if (articleIndex == -1) {
        articleIndex = charactersByArticle.size() - 1;
    }

    List<TextPosition> articleText = charactersByArticle.get(articleIndex);

    // Step 3: recombine diacritics. Some PDFs emit an accent as its own Tj element
    // and rely on graphical overlay; for text output the accent has to be merged
    // with its base character when the two are consecutive.
    if (articleText.isEmpty()) {
        articleText.add(text);
        return;
    }

    // Only the immediately preceding TextPosition is examined; this is an
    // assumption that may not hold for every document.
    int lastIndex = articleText.size() - 1;
    TextPosition previous = articleText.get(lastIndex);
    if (text.isDiacritic() && previous.contains(text)) {
        // The new entry is an accent sitting on the previous character.
        previous.mergeDiacritic(text);
        return;
    }
    if (previous.isDiacritic() && text.contains(previous)) {
        // The previous entry was the accent: fold it into this character and
        // drop it from the list before appending the merged character.
        text.mergeDiacritic(previous);
        articleText.remove(lastIndex);
    }
    articleText.add(text);
}

From source file:org.fit.pdfdom.PDFBoxTree.java

License:Open Source License

@Override
protected void processTextPosition(TextPosition text) {
    // A diacritic is only remembered here; it is merged into the next
    // non-blank character that contains it.
    if (text.isDiacritic()) {
        lastDia = text;
        return;
    }

    // Whitespace-only text is ignored entirely.
    if (text.getUnicode().trim().isEmpty()) {
        return;
    }

    // Consume the pending diacritic (if any): merge it when it overlaps this
    // character, and forget it either way.
    if (lastDia != null) {
        if (text.contains(lastDia)) {
            text.mergeDiacritic(lastDia);
        }
        lastDia = null;
    }

    cur_x = text.getX();
    cur_y = text.getY();

    // Decide whether this character starts a new box: there is no previous
    // character, the gap to the previous one is too large (or it overlaps too
    // far backwards), the baseline moved, or the text direction flipped.
    boolean startNewBox;
    if (lastText == null) {
        startNewBox = true;
    } else {
        float gapX = text.getX() - (lastText.getX() + lastText.getWidth());
        float gapY = text.getY() - lastText.getY();
        startNewBox = gapX > 1.0f || gapX < -6.0f || Math.abs(gapY) > 1.0f
                || isReversed(getTextDirectionality(text)) != isReversed(getTextDirectionality(lastText));
    }

    // A style change also forces a new box. The style is refreshed and compared
    // on every call, regardless of the geometric decision above.
    updateStyle(style, text);
    startNewBox |= !style.equals(curstyle);

    if (startNewBox) {
        // Flush the box in progress (if any) before starting the next one.
        if (lastText != null) {
            finishBox();
        }
        curstyle = new BoxStyle(style);
    }

    // Accumulate the character into the current line and its metrics.
    textLine.append(text.getUnicode());
    if (textMetrics != null) {
        textMetrics.append(text);
    } else {
        textMetrics = new TextMetrics(text);
    }
    lastText = text;
}