Example usage for org.apache.pdfbox.text TextPosition getIndividualWidths

Introduction

On this page you can find example usage of org.apache.pdfbox.text.TextPosition.getIndividualWidths().

Prototype

public float[] getIndividualWidths()

Source Link

Document

Get the widths of each individual character.

Usage

From source file: com.repeatability.pdf.PDFTextStripper.java

License: Apache License

/**
 * Writes the text of the processed page to the output. Line breaks and word separators are
 * estimated from the coordinates of the individual text positions. The positions of an article
 * are sorted before being written only if sorting by position is enabled.
 *
 * @throws IOException If there is an error writing the text.
 */
protected void writePage() throws IOException {
    // Metrics of the line currently being assembled; each one is put back to its reset
    // constant when a line break is detected.
    float lineMaxY = MAX_Y_FOR_LINE_RESET_VALUE;
    float lineMaxHeight = MAX_HEIGHT_FOR_LINE_RESET_VALUE;
    // Maintained alongside the other line metrics, but not consulted by any check below.
    float lineMinYTop = MIN_Y_TOP_FOR_LINE_RESET_VALUE;

    // Values carried over from the previously processed text position.
    float prevEndX = END_OF_LAST_TEXT_X_RESET_VALUE;
    float prevSpaceWidth = LAST_WORD_SPACING_RESET_VALUE;
    PositionWrapper previous = null;
    PositionWrapper lineStart = null;

    boolean firstOnPage = true; // true until the first text position of the page is handled
    boolean articlePending; // true until the article start has been flagged on a position

    if (charactersByArticle.size() > 0) {
        writePageStart();
    }

    for (List<TextPosition> article : charactersByArticle) {
        if (getSortByPosition()) {
            TextPositionComparator byPosition = new TextPositionComparator();

            // TextPositionComparator is not transitive, while JDK7+ enforces transitivity
            // on comparators; the (slower) custom quicksort is the workaround for that.
            if (useCustomQuickSort) {
                QuickSort.sort(article, byPosition);
            } else {
                Collections.sort(article, byPosition);
            }
        }

        startArticle();
        articlePending = true;

        // A whole line is collected before it is written, so that it can be normalized
        // (e.g. converted from presentation form to logical form) as one unit.
        List<LineItem> line = new ArrayList<LineItem>();

        // PDF files do not always store spaces, so word gaps are guessed from the distance
        // between consecutive text positions. Two estimates are combined: one derived from
        // the font's space width, the other from a running average character width.

        // Running average character width; a negative value means "no previous value".
        float prevAvgCharWidth = -1;

        for (TextPosition text : article) {
            PositionWrapper wrapped = new PositionWrapper(text);
            String unicode = text.getUnicode();

            // A change of font or font size invalidates the running average.
            if (previous != null) {
                TextPosition prevText = previous.getTextPosition();
                if (text.getFont() != prevText.getFont()
                        || text.getFontSize() != prevText.getFontSize()) {
                    prevAvgCharWidth = -1;
                }
            }

            float x;
            float y;
            float width;
            float height;

            // Sorting works on the direction-adjusted coordinates, so the same coordinates
            // have to be used here whenever sorting is enabled.
            if (getSortByPosition()) {
                x = text.getXDirAdj();
                y = text.getYDirAdj();
                width = text.getWidthDirAdj();
                height = text.getHeightDir();
            } else {
                x = text.getX();
                y = text.getY();
                width = text.getWidth();
                height = text.getHeight();
            }

            // Number of individual character widths reported for this text position.
            int glyphCount = text.getIndividualWidths().length;

            // First estimate: gap threshold derived from the width of the space character,
            // averaged with the previous position's space width when one is available.
            float spaceWidth = text.getWidthOfSpace();
            float spaceThreshold;
            if (spaceWidth == 0 || Float.isNaN(spaceWidth)) {
                spaceThreshold = Float.MAX_VALUE;
            } else if (prevSpaceWidth < 0) {
                spaceThreshold = spaceWidth * getSpacingTolerance();
            } else {
                spaceThreshold = (spaceWidth + prevSpaceWidth) / 2f * getSpacingTolerance();
            }

            // Second estimate: gap threshold derived from the average character width.
            // This is an average of averages rather than a true average, by design.
            float ownAvgCharWidth = width / glyphCount;
            float avgCharWidth = prevAvgCharWidth < 0
                    ? ownAvgCharWidth
                    : (prevAvgCharWidth + ownAvgCharWidth) / 2f;
            float charThreshold = avgCharWidth * getAverageCharTolerance();

            // The space-based threshold is used only when it is strictly smaller than the
            // character-based one; otherwise the character-based threshold applies.
            float nextWordX = EXPECTED_START_OF_NEXT_WORD_X_RESET_VALUE;
            if (prevEndX != END_OF_LAST_TEXT_X_RESET_VALUE) {
                nextWordX = prevEndX + (charThreshold > spaceThreshold ? spaceThreshold : charThreshold);
            }

            if (previous != null) {
                if (articlePending) {
                    previous.setArticleStart();
                    articlePending = false;
                }

                // The text belongs to the current line only if it vertically overlaps the
                // extent recorded for that line; otherwise the collected line is flushed
                // and all per-line metrics start over.
                if (!overlap(y, height, lineMaxY, lineMaxHeight)) {
                    writeLine(normalize(line));
                    line.clear();
                    lineStart = handleLineSeparation(wrapped, previous, lineStart, lineMaxHeight);
                    nextWordX = EXPECTED_START_OF_NEXT_WORD_X_RESET_VALUE;
                    lineMaxY = MAX_Y_FOR_LINE_RESET_VALUE;
                    lineMaxHeight = MAX_HEIGHT_FOR_LINE_RESET_VALUE;
                    lineMinYTop = MIN_Y_TOP_FOR_LINE_RESET_VALUE;
                }

                // Insert a word separator when this text starts beyond the point where a
                // new word would be expected, unless the previous text already ended with
                // a space (or has no unicode value at all).
                if (nextWordX != EXPECTED_START_OF_NEXT_WORD_X_RESET_VALUE && nextWordX < x) {
                    String prevUnicode = previous.getTextPosition().getUnicode();
                    if (prevUnicode != null && !prevUnicode.endsWith(" ")) {
                        line.add(LineItem.getWordSeparator());
                    }
                }
            }

            if (y >= lineMaxY) {
                lineMaxY = y;
            }

            // Right edge of this text; the next iteration measures its gap from here.
            prevEndX = x + width;

            // Queue the text on the current line.
            if (unicode != null) {
                if (firstOnPage && previous == null) {
                    writeParagraphStart(); // not sure this is correct for RTL?
                }
                line.add(new LineItem(text));
            }

            lineMaxHeight = Math.max(lineMaxHeight, height);
            lineMinYTop = Math.min(lineMinYTop, y - height);

            previous = wrapped;
            if (firstOnPage) {
                // The first position on the page starts both a paragraph and a line.
                previous.setParagraphStart();
                previous.setLineStart();
                lineStart = previous;
                firstOnPage = false;
            }

            prevSpaceWidth = spaceWidth;
            prevAvgCharWidth = avgCharWidth;
        }

        // Flush whatever is left of the article's final line.
        if (!line.isEmpty()) {
            writeLine(normalize(line));
            writeParagraphEnd();
        }
        endArticle();
    }
    writePageEnd();
}