Example usage for org.apache.pdfbox.text TextPosition getX

Introduction

On this page you can find example usages of org.apache.pdfbox.text.TextPosition.getX().

Prototype

public float getX()

Source Link

Document

This will get the page-rotation-adjusted x position of the character.

Usage

From source file:at.knowcenter.wag.egov.egiz.pdfbox2.pdf.PDFPage.java

License:EUPL

/**
 * Event hook that gives a subclass the chance to react whenever a character
 * is about to be displayed. Here it is used to track the largest
 * rotation-adjusted y position of any non-whitespace character that does not
 * exceed the effective page height. This is a misinterpretation of the
 * hook's purpose, but it is the only way PDFBox offers to obtain that value.
 *
 * @param text
 *            the character to be displayed; its y position is evaluated.
 */
protected void showCharacter(TextPosition text) {
    final float unrotatedY = text.getY();
    final String glyph = text.toString();

    // Whitespace-only characters never influence the tracked position.
    if (at.gv.egiz.pdfas.common.utils.StringUtils.whiteSpaceTrim(glyph).isEmpty()) {
        return;
    }

    final int rotation = this.getCurrentPage().getRotation();
    final float yPos;
    switch (rotation) {
    case 90:
        yPos = text.getX();
        break;
    case 180:
        yPos = this.getCurrentPage().getMediaBox().getHeight() - text.getY();
        break;
    case 270:
        // NOTE(review): the media box *height* is combined with the x
        // coordinate here - confirm that this is intended.
        yPos = this.getCurrentPage().getMediaBox().getHeight() - text.getX();
        break;
    default:
        // Rotation 0 and any unexpected value use the plain y coordinate.
        yPos = unrotatedY;
        break;
    }

    // Characters beyond the effective page height are ignored; otherwise
    // remember the largest y position seen so far.
    if (!(yPos > this.effectivePageHeight) && yPos > this.max_character_ypos) {
        this.max_character_ypos = yPos;
    }
}

From source file:com.repeatability.pdf.PDFTextStripper.java

License:Apache License

/**
 * This will print the text of the processed page to "output". It will estimate, based on the coordinates of the
 * text, where newlines and word spacings should be placed. The text will be sorted only if that feature was
 * enabled.
 *
 * @throws IOException If there is an error writing the text.
 */
protected void writePage() throws IOException {
    float maxYForLine = MAX_Y_FOR_LINE_RESET_VALUE;
    // Tracked for the current line but not consulted anywhere in this method (see the XXX note below).
    float minYTopForLine = MIN_Y_TOP_FOR_LINE_RESET_VALUE;
    float endOfLastTextX = END_OF_LAST_TEXT_X_RESET_VALUE;
    float lastWordSpacing = LAST_WORD_SPACING_RESET_VALUE;
    float maxHeightForLine = MAX_HEIGHT_FOR_LINE_RESET_VALUE;
    PositionWrapper lastPosition = null;
    PositionWrapper lastLineStartPosition = null;

    boolean startOfPage = true; // flag to indicate start of page
    boolean startOfArticle;
    if (!charactersByArticle.isEmpty()) {
        writePageStart();
    }

    for (List<TextPosition> textList : charactersByArticle) {
        if (getSortByPosition()) {
            TextPositionComparator comparator = new TextPositionComparator();

            // because the TextPositionComparator is not transitive, but
            // JDK7+ enforces transitivity on comparators, we need to use
            // a custom quicksort implementation (which is slower, unfortunately).
            if (useCustomQuickSort) {
                QuickSort.sort(textList, comparator);
            } else {
                Collections.sort(textList, comparator);
            }
        }

        startArticle();
        startOfArticle = true;

        // Now cycle through to print the text.
        // We queue up a line at a time before we print so that we can convert
        // the line from presentation form to logical form (if needed).
        List<LineItem> line = new ArrayList<LineItem>();

        // The iterator is created once, after the (optional) sort, so it always
        // sees the final ordering of the list.
        Iterator<TextPosition> textIter = textList.iterator();
        // PDF files don't always store spaces. We will need to guess where we should add
        // spaces based on the distances between TextPositions. Historically, this was done
        // based on the size of the space character provided by the font. In general, this
        // worked but there were cases where it did not work. Calculating the average character
        // width and using that as a metric works better in some cases but fails in some cases
        // where the spacing worked. So we use both. NOTE: Adobe reader also fails on some of
        // these examples.

        // Keeps track of the previous average character width
        float previousAveCharWidth = -1;
        while (textIter.hasNext()) {
            TextPosition position = textIter.next();
            PositionWrapper current = new PositionWrapper(position);
            String characterValue = position.getUnicode();

            // Resets the average character width when we see a change in font
            // or a change in the font size
            if (lastPosition != null && (position.getFont() != lastPosition.getTextPosition().getFont()
                    || position.getFontSize() != lastPosition.getTextPosition().getFontSize())) {
                previousAveCharWidth = -1;
            }

            float positionX;
            float positionY;
            float positionWidth;
            float positionHeight;

            // If we are sorting, then we need to use the text direction
            // adjusted coordinates, because they were used in the sorting.
            if (getSortByPosition()) {
                positionX = position.getXDirAdj();
                positionY = position.getYDirAdj();
                positionWidth = position.getWidthDirAdj();
                positionHeight = position.getHeightDir();
            } else {
                positionX = position.getX();
                positionY = position.getY();
                positionWidth = position.getWidth();
                positionHeight = position.getHeight();
            }

            // The current amount of characters in a word
            int wordCharCount = position.getIndividualWidths().length;

            // Estimate the expected width of the space based on the
            // space character with some margin.
            float wordSpacing = position.getWidthOfSpace();
            float deltaSpace;
            if (wordSpacing == 0 || Float.isNaN(wordSpacing)) {
                deltaSpace = Float.MAX_VALUE;
            } else {
                if (lastWordSpacing < 0) {
                    deltaSpace = wordSpacing * getSpacingTolerance();
                } else {
                    deltaSpace = (wordSpacing + lastWordSpacing) / 2f * getSpacingTolerance();
                }
            }

            // Estimate the expected width of the space based on the average character width
            // with some margin. This calculation does not make a true average (average of
            // averages) but we found that it gave the best results after numerous experiments.
            // Based on experiments we also found that .3 worked well.
            float averageCharWidth;
            if (previousAveCharWidth < 0) {
                averageCharWidth = positionWidth / wordCharCount;
            } else {
                averageCharWidth = (previousAveCharWidth + positionWidth / wordCharCount) / 2f;
            }
            float deltaCharWidth = averageCharWidth * getAverageCharTolerance();

            // Compares the values obtained by the average method and the wordSpacing method
            // and picks the smaller number.
            float expectedStartOfNextWordX = EXPECTED_START_OF_NEXT_WORD_X_RESET_VALUE;
            if (endOfLastTextX != END_OF_LAST_TEXT_X_RESET_VALUE) {
                if (deltaCharWidth > deltaSpace) {
                    expectedStartOfNextWordX = endOfLastTextX + deltaSpace;
                } else {
                    expectedStartOfNextWordX = endOfLastTextX + deltaCharWidth;
                }
            }

            if (lastPosition != null) {
                if (startOfArticle) {
                    lastPosition.setArticleStart();
                    startOfArticle = false;
                }
                // RDD - Here we determine whether this text object is on the current
                // line. We use the lastBaselineFontSize to handle the superscript
                // case, and the size of the current font to handle the subscript case.
                // Text must overlap with the last rendered baseline text by at least
                // a small amount in order to be considered as being on the same line.

                // XXX BC: In theory, this check should really check if the next char is in
                // full range seen in this line. This is what I tried to do with minYTopForLine,
                // but this caused a lot of regression test failures. So, I'm leaving it be for
                // now
                if (!overlap(positionY, positionHeight, maxYForLine, maxHeightForLine)) {
                    writeLine(normalize(line));
                    line.clear();
                    lastLineStartPosition = handleLineSeparation(current, lastPosition, lastLineStartPosition,
                            maxHeightForLine);
                    expectedStartOfNextWordX = EXPECTED_START_OF_NEXT_WORD_X_RESET_VALUE;
                    maxYForLine = MAX_Y_FOR_LINE_RESET_VALUE;
                    maxHeightForLine = MAX_HEIGHT_FOR_LINE_RESET_VALUE;
                    minYTopForLine = MIN_Y_TOP_FOR_LINE_RESET_VALUE;
                }
                // test if our TextPosition starts after a new word would be expected to start
                if (expectedStartOfNextWordX != EXPECTED_START_OF_NEXT_WORD_X_RESET_VALUE
                        && expectedStartOfNextWordX < positionX &&
                        // only bother adding a space if the last character was not a space
                        lastPosition.getTextPosition().getUnicode() != null
                        && !lastPosition.getTextPosition().getUnicode().endsWith(" ")) {
                    line.add(LineItem.getWordSeparator());
                }
            }
            if (positionY >= maxYForLine) {
                maxYForLine = positionY;
            }
            // RDD - endX is what PDF considers to be the x coordinate of the
            // end position of the text. We use it in computing our metrics below.
            endOfLastTextX = positionX + positionWidth;

            // add it to the list
            if (characterValue != null) {
                if (startOfPage && lastPosition == null) {
                    writeParagraphStart();// not sure this is correct for RTL?
                }
                line.add(new LineItem(position));
            }
            maxHeightForLine = Math.max(maxHeightForLine, positionHeight);
            minYTopForLine = Math.min(minYTopForLine, positionY - positionHeight);
            lastPosition = current;
            if (startOfPage) {
                lastPosition.setParagraphStart();
                lastPosition.setLineStart();
                lastLineStartPosition = lastPosition;
                startOfPage = false;
            }
            lastWordSpacing = wordSpacing;
            previousAveCharWidth = averageCharWidth;
        }
        // print the final line
        if (line.size() > 0) {
            writeLine(normalize(line));
            writeParagraphEnd();
        }
        endArticle();
    }
    writePageEnd();
}

From source file:com.repeatability.pdf.PDFTextStripper.java

License:Apache License

/**
 * This will process a TextPosition object and add the text to the list of characters on a page. It takes care of
 * overlapping text./*from   ww  w  .  j a v a 2 s.com*/
 *
 * @param text The text to process.
 */
@Override
protected void processTextPosition(TextPosition text) {
    boolean showCharacter = true;
    if (suppressDuplicateOverlappingText) {
        showCharacter = false;
        String textCharacter = text.getUnicode();
        float textX = text.getX();
        float textY = text.getY();
        TreeMap<Float, TreeSet<Float>> sameTextCharacters = characterListMapping.get(textCharacter);
        if (sameTextCharacters == null) {
            sameTextCharacters = new TreeMap<Float, TreeSet<Float>>();
            characterListMapping.put(textCharacter, sameTextCharacters);
        }
        // RDD - Here we compute the value that represents the end of the rendered
        // text. This value is used to determine whether subsequent text rendered
        // on the same line overwrites the current text.
        //
        // We subtract any positive padding to handle cases where extreme amounts
        // of padding are applied, then backed off (not sure why this is done, but there
        // are cases where the padding is on the order of 10x the character width, and
        // the TJ just backs up to compensate after each character). Also, we subtract
        // an amount to allow for kerning (a percentage of the width of the last
        // character).
        boolean suppressCharacter = false;
        float tolerance = text.getWidth() / textCharacter.length() / 3.0f;

        SortedMap<Float, TreeSet<Float>> xMatches = sameTextCharacters.subMap(textX - tolerance,
                textX + tolerance);
        for (TreeSet<Float> xMatch : xMatches.values()) {
            SortedSet<Float> yMatches = xMatch.subSet(textY - tolerance, textY + tolerance);
            if (!yMatches.isEmpty()) {
                suppressCharacter = true;
                break;
            }
        }
        if (!suppressCharacter) {
            TreeSet<Float> ySet = sameTextCharacters.get(textX);
            if (ySet == null) {
                ySet = new TreeSet<Float>();
                sameTextCharacters.put(textX, ySet);
            }
            ySet.add(textY);
            showCharacter = true;
        }
    }
    if (showCharacter) {
        // if we are showing the character then we need to determine which article it belongs to
        int foundArticleDivisionIndex = -1;
        int notFoundButFirstLeftAndAboveArticleDivisionIndex = -1;
        int notFoundButFirstLeftArticleDivisionIndex = -1;
        int notFoundButFirstAboveArticleDivisionIndex = -1;
        float x = text.getX();
        float y = text.getY();
        if (shouldSeparateByBeads) {
            for (int i = 0; i < beadRectangles.size() && foundArticleDivisionIndex == -1; i++) {
                PDRectangle rect = beadRectangles.get(i);
                if (rect != null) {
                    if (rect.contains(x, y)) {
                        foundArticleDivisionIndex = i * 2 + 1;
                    } else if ((x < rect.getLowerLeftX() || y < rect.getUpperRightY())
                            && notFoundButFirstLeftAndAboveArticleDivisionIndex == -1) {
                        notFoundButFirstLeftAndAboveArticleDivisionIndex = i * 2;
                    } else if (x < rect.getLowerLeftX() && notFoundButFirstLeftArticleDivisionIndex == -1) {
                        notFoundButFirstLeftArticleDivisionIndex = i * 2;
                    } else if (y < rect.getUpperRightY() && notFoundButFirstAboveArticleDivisionIndex == -1) {
                        notFoundButFirstAboveArticleDivisionIndex = i * 2;
                    }
                } else {
                    foundArticleDivisionIndex = 0;
                }
            }
        } else {
            foundArticleDivisionIndex = 0;
        }
        int articleDivisionIndex;
        if (foundArticleDivisionIndex != -1) {
            articleDivisionIndex = foundArticleDivisionIndex;
        } else if (notFoundButFirstLeftAndAboveArticleDivisionIndex != -1) {
            articleDivisionIndex = notFoundButFirstLeftAndAboveArticleDivisionIndex;
        } else if (notFoundButFirstLeftArticleDivisionIndex != -1) {
            articleDivisionIndex = notFoundButFirstLeftArticleDivisionIndex;
        } else if (notFoundButFirstAboveArticleDivisionIndex != -1) {
            articleDivisionIndex = notFoundButFirstAboveArticleDivisionIndex;
        } else {
            articleDivisionIndex = charactersByArticle.size() - 1;
        }

        List<TextPosition> textList = charactersByArticle.get(articleDivisionIndex);

        // In the wild, some PDF encoded documents put diacritics (accents on
        // top of characters) into a separate Tj element. When displaying them
        // graphically, the two chunks get overlayed. With text output though,
        // we need to do the overlay. This code recombines the diacritic with
        // its associated character if the two are consecutive.
        if (textList.isEmpty()) {
            textList.add(text);
        } else {
            // test if we overlap the previous entry.
            // Note that we are making an assumption that we need to only look back
            // one TextPosition to find what we are overlapping.
            // This may not always be true. */
            TextPosition previousTextPosition = textList.get(textList.size() - 1);
            if (text.isDiacritic() && previousTextPosition.contains(text)) {
                previousTextPosition.mergeDiacritic(text);
            }
            // If the previous TextPosition was the diacritic, merge it into this
            // one and remove it from the list.
            else if (previousTextPosition.isDiacritic() && text.contains(previousTextPosition)) {
                text.mergeDiacritic(previousTextPosition);
                textList.remove(textList.size() - 1);
                textList.add(text);
            } else {
                textList.add(text);
            }
        }
    }
}

From source file:org.fit.pdfdom.PDFBoxTree.java

License:Open Source License

/**
 * Handles a single text position: diacritics are remembered so they can be
 * merged into the following character, whitespace-only text is skipped, and
 * every other character is appended to the current text box. A new box is
 * started when the position, the text direction or the style changes.
 *
 * @param text the text position to process
 */
@Override
protected void processTextPosition(TextPosition text) {
    // A diacritic is only remembered; it is merged into the next character.
    if (text.isDiacritic()) {
        lastDia = text;
        return;
    }
    // Whitespace-only text does not contribute to any box.
    if (text.getUnicode().trim().isEmpty()) {
        return;
    }

    if (lastDia != null) {
        if (text.contains(lastDia)) {
            text.mergeDiacritic(lastDia);
        }
        lastDia = null;
    }

    cur_x = text.getX();
    cur_y = text.getY();

    // Should we split the boxes? Always for the first text; otherwise when the
    // gap to the previous text is too large or the reading direction changes.
    boolean startNewBox = (lastText == null);
    if (!startNewBox) {
        float gapX = text.getX() - (lastText.getX() + lastText.getWidth());
        float gapY = text.getY() - lastText.getY();
        startNewBox = gapX > 1.0f || gapX < -6.0f || Math.abs(gapY) > 1.0f
                || isReversed(getTextDirectionality(text)) != isReversed(getTextDirectionality(lastText));
    }

    // A style change also forces a new box.
    updateStyle(style, text);
    if (!style.equals(curstyle)) {
        startNewBox = true;
    }

    if (startNewBox) {
        // Finish the current box (if any) before starting a new one.
        if (lastText != null) {
            finishBox();
        }
        curstyle = new BoxStyle(style);
    }

    textLine.append(text.getUnicode());
    if (textMetrics == null) {
        textMetrics = new TextMetrics(text);
    } else {
        textMetrics.append(text);
    }
    lastText = text;
}