Example usage for org.apache.pdfbox.text TextPosition mergeDiacritic

List of usage examples for org.apache.pdfbox.text TextPosition mergeDiacritic

Introduction

On this page you can find example usage for org.apache.pdfbox.text TextPosition mergeDiacritic.

Prototype

public void mergeDiacritic(TextPosition diacritic) 

Source Link

Document

Merges a single-character TextPosition into the current object.

Usage

From source file:com.repeatability.pdf.PDFTextStripper.java

License:Apache License

/**
 * This will process a TextPosition object and add the text to the list of characters on a page. It takes care of
 * overlapping text./*from   w  w w. j a  va2  s . c  om*/
 *
 * @param text The text to process.
 */
@Override
protected void processTextPosition(TextPosition text) {
    boolean showCharacter = true;
    if (suppressDuplicateOverlappingText) {
        showCharacter = false;
        String textCharacter = text.getUnicode();
        float textX = text.getX();
        float textY = text.getY();
        TreeMap<Float, TreeSet<Float>> sameTextCharacters = characterListMapping.get(textCharacter);
        if (sameTextCharacters == null) {
            sameTextCharacters = new TreeMap<Float, TreeSet<Float>>();
            characterListMapping.put(textCharacter, sameTextCharacters);
        }
        // RDD - Here we compute the value that represents the end of the rendered
        // text. This value is used to determine whether subsequent text rendered
        // on the same line overwrites the current text.
        //
        // We subtract any positive padding to handle cases where extreme amounts
        // of padding are applied, then backed off (not sure why this is done, but there
        // are cases where the padding is on the order of 10x the character width, and
        // the TJ just backs up to compensate after each character). Also, we subtract
        // an amount to allow for kerning (a percentage of the width of the last
        // character).
        boolean suppressCharacter = false;
        float tolerance = text.getWidth() / textCharacter.length() / 3.0f;

        SortedMap<Float, TreeSet<Float>> xMatches = sameTextCharacters.subMap(textX - tolerance,
                textX + tolerance);
        for (TreeSet<Float> xMatch : xMatches.values()) {
            SortedSet<Float> yMatches = xMatch.subSet(textY - tolerance, textY + tolerance);
            if (!yMatches.isEmpty()) {
                suppressCharacter = true;
                break;
            }
        }
        if (!suppressCharacter) {
            TreeSet<Float> ySet = sameTextCharacters.get(textX);
            if (ySet == null) {
                ySet = new TreeSet<Float>();
                sameTextCharacters.put(textX, ySet);
            }
            ySet.add(textY);
            showCharacter = true;
        }
    }
    if (showCharacter) {
        // if we are showing the character then we need to determine which article it belongs to
        int foundArticleDivisionIndex = -1;
        int notFoundButFirstLeftAndAboveArticleDivisionIndex = -1;
        int notFoundButFirstLeftArticleDivisionIndex = -1;
        int notFoundButFirstAboveArticleDivisionIndex = -1;
        float x = text.getX();
        float y = text.getY();
        if (shouldSeparateByBeads) {
            for (int i = 0; i < beadRectangles.size() && foundArticleDivisionIndex == -1; i++) {
                PDRectangle rect = beadRectangles.get(i);
                if (rect != null) {
                    if (rect.contains(x, y)) {
                        foundArticleDivisionIndex = i * 2 + 1;
                    } else if ((x < rect.getLowerLeftX() || y < rect.getUpperRightY())
                            && notFoundButFirstLeftAndAboveArticleDivisionIndex == -1) {
                        notFoundButFirstLeftAndAboveArticleDivisionIndex = i * 2;
                    } else if (x < rect.getLowerLeftX() && notFoundButFirstLeftArticleDivisionIndex == -1) {
                        notFoundButFirstLeftArticleDivisionIndex = i * 2;
                    } else if (y < rect.getUpperRightY() && notFoundButFirstAboveArticleDivisionIndex == -1) {
                        notFoundButFirstAboveArticleDivisionIndex = i * 2;
                    }
                } else {
                    foundArticleDivisionIndex = 0;
                }
            }
        } else {
            foundArticleDivisionIndex = 0;
        }
        int articleDivisionIndex;
        if (foundArticleDivisionIndex != -1) {
            articleDivisionIndex = foundArticleDivisionIndex;
        } else if (notFoundButFirstLeftAndAboveArticleDivisionIndex != -1) {
            articleDivisionIndex = notFoundButFirstLeftAndAboveArticleDivisionIndex;
        } else if (notFoundButFirstLeftArticleDivisionIndex != -1) {
            articleDivisionIndex = notFoundButFirstLeftArticleDivisionIndex;
        } else if (notFoundButFirstAboveArticleDivisionIndex != -1) {
            articleDivisionIndex = notFoundButFirstAboveArticleDivisionIndex;
        } else {
            articleDivisionIndex = charactersByArticle.size() - 1;
        }

        List<TextPosition> textList = charactersByArticle.get(articleDivisionIndex);

        // In the wild, some PDF encoded documents put diacritics (accents on
        // top of characters) into a separate Tj element. When displaying them
        // graphically, the two chunks get overlayed. With text output though,
        // we need to do the overlay. This code recombines the diacritic with
        // its associated character if the two are consecutive.
        if (textList.isEmpty()) {
            textList.add(text);
        } else {
            // test if we overlap the previous entry.
            // Note that we are making an assumption that we need to only look back
            // one TextPosition to find what we are overlapping.
            // This may not always be true. */
            TextPosition previousTextPosition = textList.get(textList.size() - 1);
            if (text.isDiacritic() && previousTextPosition.contains(text)) {
                previousTextPosition.mergeDiacritic(text);
            }
            // If the previous TextPosition was the diacritic, merge it into this
            // one and remove it from the list.
            else if (previousTextPosition.isDiacritic() && text.contains(previousTextPosition)) {
                text.mergeDiacritic(previousTextPosition);
                textList.remove(textList.size() - 1);
                textList.add(text);
            } else {
                textList.add(text);
            }
        }
    }
}

From source file:org.fit.pdfdom.PDFBoxTree.java

License:Open Source License

/**
 * Handles one piece of positioned text: remembers a diacritic for the next glyph, ignores
 * blank text, and otherwise appends the text to the current box, starting a new box when the
 * position, direction or style changes.
 *
 * @param text the text position to process
 */
@Override
protected void processTextPosition(TextPosition text) {
    // A diacritic is only remembered; it is merged when the next non-blank text arrives.
    if (text.isDiacritic()) {
        lastDia = text;
        return;
    }
    // Text that is empty after trimming is ignored entirely.
    if (text.getUnicode().trim().isEmpty()) {
        return;
    }

    // Fold a pending diacritic into this text if it lies within it; drop it either way.
    if (lastDia != null) {
        if (text.contains(lastDia)) {
            text.mergeDiacritic(lastDia);
        }
        lastDia = null;
    }

    cur_x = text.getX();
    cur_y = text.getY();

    // Horizontal gap from the end of the previous text and vertical offset from its origin.
    float gapX = 0;
    float gapY = 0;
    if (lastText != null) {
        float previousEndX = lastText.getX() + lastText.getWidth();
        gapX = text.getX() - previousEndX;
        gapY = text.getY() - lastText.getY();
    }

    // A new box starts when there is no previous text, the gap is too large in either
    // direction, the vertical position shifts, or the text direction flips.
    boolean startNewBox = lastText == null
            || gapX > 1.0f
            || gapX < -6.0f
            || Math.abs(gapY) > 1.0f
            || isReversed(getTextDirectionality(text)) != isReversed(getTextDirectionality(lastText));

    // A style change also forces a new box.
    updateStyle(style, text);
    if (!style.equals(curstyle)) {
        startNewBox = true;
    }

    if (startNewBox) {
        // Close the box in progress, if any, before opening the next one.
        if (lastText != null) {
            finishBox();
        }
        curstyle = new BoxStyle(style);
    }

    textLine.append(text.getUnicode());
    if (textMetrics != null) {
        textMetrics.append(text);
    } else {
        textMetrics = new TextMetrics(text);
    }
    lastText = text;
}