Example usage for org.apache.pdfbox.text TextPosition getIndividualWidths

List of usage examples for org.apache.pdfbox.text TextPosition getIndividualWidths

Introduction

On this page you can find example usage of org.apache.pdfbox.text.TextPosition.getIndividualWidths().

Prototype

public float[] getIndividualWidths() 

Source Link

Document

Get the widths of each individual character.

Usage

From source file: com/repeatability/pdf/PDFTextStripper.java

License:Apache License

/**
 * Writes out the text gathered for the current page, one article at a time.
 *
 * <p>Line breaks and word gaps are estimated from the coordinates of each {@code TextPosition}.
 * Text positions are buffered a line at a time and handed to {@code writeLine} after being passed
 * through {@code normalize}. An article's positions are sorted first only when
 * {@code getSortByPosition()} returns true.
 *
 * @throws IOException If there is an error writing the text.
 */
protected void writePage() throws IOException {
    // Per-line metrics; each one returns to its reset value when a line break is detected.
    float lineMaxY = MAX_Y_FOR_LINE_RESET_VALUE;
    float lineMaxHeight = MAX_HEIGHT_FOR_LINE_RESET_VALUE;
    float lineMinTop = MIN_Y_TOP_FOR_LINE_RESET_VALUE;

    // State carried over from the previously handled text position.
    float prevTextEndX = END_OF_LAST_TEXT_X_RESET_VALUE;
    float prevSpaceWidth = LAST_WORD_SPACING_RESET_VALUE;
    PositionWrapper previous = null;
    PositionWrapper lineStart = null;

    // True until the first text position of the page has been handled.
    boolean atPageStart = true;

    if (!charactersByArticle.isEmpty()) {
        writePageStart();
    }

    for (List<TextPosition> article : charactersByArticle) {
        if (getSortByPosition()) {
            TextPositionComparator byPosition = new TextPositionComparator();

            // TextPositionComparator is not transitive, while JDK7+ enforces transitivity
            // on comparators, hence the (slower) custom quicksort option.
            if (useCustomQuickSort) {
                QuickSort.sort(article, byPosition);
            } else {
                Collections.sort(article, byPosition);
            }
        }

        startArticle();
        boolean atArticleStart = true;

        // A whole line is queued before it is written so that it can be converted
        // from presentation form to logical form (if needed).
        List<LineItem> pending = new ArrayList<LineItem>();

        // PDF files don't always store spaces, so they have to be guessed from the distance
        // between text positions. Two estimates are combined: one derived from the font's
        // space width, one derived from a running average character width.

        // Running average character width; a negative value means "no history yet".
        float prevAvgCharWidth = -1;

        for (TextPosition text : article) {
            PositionWrapper wrapped = new PositionWrapper(text);
            String unicode = text.getUnicode();

            // A change of font or font size invalidates the running average.
            if (previous != null) {
                boolean sameFont = text.getFont() == previous.getTextPosition().getFont();
                if (!sameFont || text.getFontSize() != previous.getTextPosition().getFontSize()) {
                    prevAvgCharWidth = -1;
                }
            }

            // When sorting, the direction-adjusted coordinates must be used,
            // because those are the ones the sort was based on.
            final boolean directionAdjusted = getSortByPosition();
            final float x = directionAdjusted ? text.getXDirAdj() : text.getX();
            final float y = directionAdjusted ? text.getYDirAdj() : text.getY();
            final float width = directionAdjusted ? text.getWidthDirAdj() : text.getWidth();
            final float height = directionAdjusted ? text.getHeightDir() : text.getHeight();

            // Number of characters carried by this text position.
            final int charCount = text.getIndividualWidths().length;

            // Estimate 1: gap threshold derived from the width of the space character,
            // averaged with the previous one when available. An unusable space width
            // (0 or NaN) disables this estimate by making it as large as possible.
            final float spaceWidth = text.getWidthOfSpace();
            final float spaceGap;
            if (spaceWidth == 0 || Float.isNaN(spaceWidth)) {
                spaceGap = Float.MAX_VALUE;
            } else {
                float basis = prevSpaceWidth < 0 ? spaceWidth : (spaceWidth + prevSpaceWidth) / 2f;
                spaceGap = basis * getSpacingTolerance();
            }

            // Estimate 2: gap threshold derived from the average character width. This is
            // an average of averages rather than a true average, kept as in the original.
            final float ownAvgCharWidth = width / charCount;
            final float avgCharWidth = prevAvgCharWidth < 0
                    ? ownAvgCharWidth
                    : (prevAvgCharWidth + ownAvgCharWidth) / 2f;
            final float charGap = avgCharWidth * getAverageCharTolerance();

            // The smaller of the two thresholds decides where the next word is expected.
            float nextWordX = EXPECTED_START_OF_NEXT_WORD_X_RESET_VALUE;
            if (prevTextEndX != END_OF_LAST_TEXT_X_RESET_VALUE) {
                nextWordX = prevTextEndX + (charGap > spaceGap ? spaceGap : charGap);
            }

            if (previous != null) {
                if (atArticleStart) {
                    previous.setArticleStart();
                    atArticleStart = false;
                }

                // The text counts as being on the current line only if it overlaps the
                // line's vertical extent. (lineMinTop was meant to allow a stricter
                // full-range check, which is not applied here.)
                if (!overlap(y, height, lineMaxY, lineMaxHeight)) {
                    writeLine(normalize(pending));
                    pending.clear();
                    lineStart = handleLineSeparation(wrapped, previous, lineStart, lineMaxHeight);
                    nextWordX = EXPECTED_START_OF_NEXT_WORD_X_RESET_VALUE;
                    lineMaxY = MAX_Y_FOR_LINE_RESET_VALUE;
                    lineMaxHeight = MAX_HEIGHT_FOR_LINE_RESET_VALUE;
                    lineMinTop = MIN_Y_TOP_FOR_LINE_RESET_VALUE;
                }

                // Insert a word separator when this text starts beyond the expected start
                // of the next word, unless the previous text already ended with a space.
                boolean gapDetected = nextWordX != EXPECTED_START_OF_NEXT_WORD_X_RESET_VALUE
                        && nextWordX < x;
                if (gapDetected
                        && previous.getTextPosition().getUnicode() != null
                        && !previous.getTextPosition().getUnicode().endsWith(" ")) {
                    pending.add(LineItem.getWordSeparator());
                }
            }

            if (y >= lineMaxY) {
                lineMaxY = y;
            }

            // Right edge of this text; the next iteration measures its gap from here.
            prevTextEndX = x + width;

            // Queue the text position itself.
            if (unicode != null) {
                if (atPageStart && previous == null) {
                    writeParagraphStart(); // not sure this is correct for RTL?
                }
                pending.add(new LineItem(text));
            }

            lineMaxHeight = Math.max(lineMaxHeight, height);
            lineMinTop = Math.min(lineMinTop, y - height);
            previous = wrapped;

            if (atPageStart) {
                previous.setParagraphStart();
                previous.setLineStart();
                lineStart = previous;
                atPageStart = false;
            }

            prevSpaceWidth = spaceWidth;
            prevAvgCharWidth = avgCharWidth;
        }

        // Flush whatever is left of the last line of the article.
        if (!pending.isEmpty()) {
            writeLine(normalize(pending));
            writeParagraphEnd();
        }
        endArticle();
    }
    writePageEnd();
}