Example usage for org.apache.pdfbox.text TextPosition getFont

List of usage examples for org.apache.pdfbox.text.TextPosition.getFont()

Introduction

On this page you can find example usages of org.apache.pdfbox.text.TextPosition.getFont().

Prototype

public PDFont getFont() 

Source Link

Document

This will get the font for the text being drawn.

Usage

From source file:com.repeatability.pdf.PDFTextStripper.java

License:Apache License

/**
 * This will print the text of the processed page to "output". It will estimate, based on the coordinates of the
 * text, where newlines and word spacings should be placed. The text will be sorted only if that feature was
 * enabled.
 *
 * <p>Processing outline, per entry of {@code charactersByArticle}:
 * <ol>
 * <li>optionally sort the positions (only when {@link #getSortByPosition()} is true);</li>
 * <li>walk the positions, queuing {@code LineItem}s into a line buffer;</li>
 * <li>flush the buffer through {@code writeLine(normalize(line))} whenever a position no longer
 * overlaps the vertical extent of the current line;</li>
 * <li>insert a word separator when a position starts beyond the expected start of the next word;</li>
 * <li>flush the last line, then call {@code writeParagraphEnd()} and {@code endArticle()}.</li>
 * </ol>
 * {@code writePageStart()} is called only when there is at least one article; {@code writePageEnd()} is
 * always called.
 *
 * @throws IOException If there is an error writing the text.
 */
protected void writePage() throws IOException {
    // Per-line metrics; each is put back to its *_RESET_VALUE when a new line starts.
    float maxYForLine = MAX_Y_FOR_LINE_RESET_VALUE;
    // NOTE(review): minYTopForLine is updated below but never read for a decision in this method
    // (see the "XXX BC" remark further down).
    float minYTopForLine = MIN_Y_TOP_FOR_LINE_RESET_VALUE;
    float endOfLastTextX = END_OF_LAST_TEXT_X_RESET_VALUE;
    float lastWordSpacing = LAST_WORD_SPACING_RESET_VALUE;
    float maxHeightForLine = MAX_HEIGHT_FOR_LINE_RESET_VALUE;
    // These carry over from one article to the next; they are not reset per article.
    PositionWrapper lastPosition = null;
    PositionWrapper lastLineStartPosition = null;

    boolean startOfPage = true; // flag to indicate start of page
    boolean startOfArticle;
    if (charactersByArticle.size() > 0) {
        writePageStart();
    }

    for (List<TextPosition> textList : charactersByArticle) {
        if (getSortByPosition()) {
            TextPositionComparator comparator = new TextPositionComparator();

            // because the TextPositionComparator is not transitive, but
            // JDK7+ enforces transitivity on comparators, we need to use
            // a custom quicksort implementation (which is slower, unfortunately).
            if (useCustomQuickSort) {
                QuickSort.sort(textList, comparator);
            } else {
                Collections.sort(textList, comparator);
            }
        }

        // NOTE(review): this iterator is never advanced; it is replaced by a fresh one
        // a few lines below before the loop starts.
        Iterator<TextPosition> textIter = textList.iterator();

        startArticle();
        startOfArticle = true;

        // Now cycle through to print the text.
        // We queue up a line at a time before we print so that we can convert
        // the line from presentation form to logical form (if needed).
        List<LineItem> line = new ArrayList<LineItem>();

        textIter = textList.iterator(); // start from the beginning again
        // PDF files don't always store spaces. We will need to guess where we should add
        // spaces based on the distances between TextPositions. Historically, this was done
        // based on the size of the space character provided by the font. In general, this
        // worked but there were cases where it did not work. Calculating the average character
        // width and using that as a metric works better in some cases but fails in some cases
        // where the spacing worked. So we use both. NOTE: Adobe reader also fails on some of
        // these examples.

        // Keeps track of the previous average character width
        float previousAveCharWidth = -1;
        while (textIter.hasNext()) {
            TextPosition position = textIter.next();
            PositionWrapper current = new PositionWrapper(position);
            String characterValue = position.getUnicode();

            // Resets the average character width when we see a change in font
            // or a change in the font size.
            // Fonts are compared by reference (!=), not with equals().
            if (lastPosition != null && (position.getFont() != lastPosition.getTextPosition().getFont()
                    || position.getFontSize() != lastPosition.getTextPosition().getFontSize())) {
                previousAveCharWidth = -1;
            }

            float positionX;
            float positionY;
            float positionWidth;
            float positionHeight;

            // If we are sorting, then we need to use the text direction
            // adjusted coordinates, because they were used in the sorting.
            if (getSortByPosition()) {
                positionX = position.getXDirAdj();
                positionY = position.getYDirAdj();
                positionWidth = position.getWidthDirAdj();
                positionHeight = position.getHeightDir();
            } else {
                positionX = position.getX();
                positionY = position.getY();
                positionWidth = position.getWidth();
                positionHeight = position.getHeight();
            }

            // The current amount of characters in a word
            int wordCharCount = position.getIndividualWidths().length;

            // Estimate the expected width of the space based on the
            // space character with some margin.
            float wordSpacing = position.getWidthOfSpace();
            float deltaSpace;
            if (wordSpacing == 0 || Float.isNaN(wordSpacing)) {
                // No usable space width: make this estimate lose the comparison below.
                deltaSpace = Float.MAX_VALUE;
            } else {
                if (lastWordSpacing < 0) {
                    deltaSpace = wordSpacing * getSpacingTolerance();
                } else {
                    // Average the current and previous space widths before applying the tolerance.
                    deltaSpace = (wordSpacing + lastWordSpacing) / 2f * getSpacingTolerance();
                }
            }

            // Estimate the expected width of the space based on the average character width
            // with some margin. This calculation does not make a true average (average of
            // averages) but we found that it gave the best results after numerous experiments.
            // Based on experiments we also found that .3 worked well.
            // NOTE(review): wordCharCount is used as a float divisor without a zero check.
            float averageCharWidth;
            if (previousAveCharWidth < 0) {
                averageCharWidth = positionWidth / wordCharCount;
            } else {
                averageCharWidth = (previousAveCharWidth + positionWidth / wordCharCount) / 2f;
            }
            float deltaCharWidth = averageCharWidth * getAverageCharTolerance();

            // Compares the values obtained by the average method and the wordSpacing method
            // and picks the smaller number.
            float expectedStartOfNextWordX = EXPECTED_START_OF_NEXT_WORD_X_RESET_VALUE;
            if (endOfLastTextX != END_OF_LAST_TEXT_X_RESET_VALUE) {
                if (deltaCharWidth > deltaSpace) {
                    expectedStartOfNextWordX = endOfLastTextX + deltaSpace;
                } else {
                    expectedStartOfNextWordX = endOfLastTextX + deltaCharWidth;
                }
            }

            if (lastPosition != null) {
                if (startOfArticle) {
                    // The article-start mark is placed on the previous position, not the current one.
                    lastPosition.setArticleStart();
                    startOfArticle = false;
                }
                // RDD - Here we determine whether this text object is on the current
                // line. We use the lastBaselineFontSize to handle the superscript
                // case, and the size of the current font to handle the subscript case.
                // Text must overlap with the last rendered baseline text by at least
                // a small amount in order to be considered as being on the same line.

                // XXX BC: In theory, this check should really check if the next char is in
                // full range seen in this line. This is what I tried to do with minYTopForLine,
                // but this caused a lot of regression test failures. So, I'm leaving it be for
                // now
                if (!overlap(positionY, positionHeight, maxYForLine, maxHeightForLine)) {
                    // New line: flush what was queued and reset the per-line metrics.
                    writeLine(normalize(line));
                    line.clear();
                    lastLineStartPosition = handleLineSeparation(current, lastPosition, lastLineStartPosition,
                            maxHeightForLine);
                    expectedStartOfNextWordX = EXPECTED_START_OF_NEXT_WORD_X_RESET_VALUE;
                    maxYForLine = MAX_Y_FOR_LINE_RESET_VALUE;
                    maxHeightForLine = MAX_HEIGHT_FOR_LINE_RESET_VALUE;
                    minYTopForLine = MIN_Y_TOP_FOR_LINE_RESET_VALUE;
                }
                // test if our TextPosition starts after a new word would be expected to start
                if (expectedStartOfNextWordX != EXPECTED_START_OF_NEXT_WORD_X_RESET_VALUE
                        && expectedStartOfNextWordX < positionX &&
                        // only bother adding a space if the last character was not a space
                        lastPosition.getTextPosition().getUnicode() != null
                        && !lastPosition.getTextPosition().getUnicode().endsWith(" ")) {
                    line.add(LineItem.getWordSeparator());
                }
            }
            if (positionY >= maxYForLine) {
                maxYForLine = positionY;
            }
            // RDD - endX is what PDF considers to be the x coordinate of the
            // end position of the text. We use it in computing our metrics below.
            endOfLastTextX = positionX + positionWidth;

            // add it to the list
            if (characterValue != null) {
                if (startOfPage && lastPosition == null) {
                    writeParagraphStart();// not sure this is correct for RTL?
                }
                line.add(new LineItem(position));
            }
            maxHeightForLine = Math.max(maxHeightForLine, positionHeight);
            minYTopForLine = Math.min(minYTopForLine, positionY - positionHeight);
            lastPosition = current;
            if (startOfPage) {
                // The first position on the page starts both a paragraph and a line.
                lastPosition.setParagraphStart();
                lastPosition.setLineStart();
                lastLineStartPosition = lastPosition;
                startOfPage = false;
            }
            // Remember this position's metrics for the next iteration's estimates.
            lastWordSpacing = wordSpacing;
            previousAveCharWidth = averageCharWidth;
        }
        // print the final line
        if (line.size() > 0) {
            writeLine(normalize(line));
            writeParagraphEnd();
        }
        endArticle();
    }
    writePageEnd();
}

From source file:com.tekstosense.segmenter.data.Text.java

License:Open Source License

/**
 * Creates a {@code Text} populated from a text position and a graphics state.
 *
 * <p>Geometry, font and the unicode run are copied from {@code tp}; the stroking and
 * non-stroking colors are copied from {@code gs}.
 *
 * @param tp   the text position supplying coordinates, size, font and unicode run
 * @param gs   the graphics state supplying the stroking and non-stroking colors
 * @param text not read by this method
 * @return the newly populated {@code Text}
 */
public static Text newFor(TextPosition tp, PDGraphicsState gs, String text) {
    final Text created = new Text();

    // The run and its working copy start out as the same string.
    created.run = tp.getUnicode();
    created.tempRun = created.run;

    // Placement: direction-adjusted x and y (y is stored as the baseline).
    created.x = tp.getXDirAdj();
    created.baseline = tp.getYDirAdj();

    // Extent of the run as reported by the text position.
    created.width = tp.getWidth();
    created.height = tp.getHeight();

    // Font and its two size measures.
    created.font = tp.getFont();
    created.pointSize = tp.getFontSizeInPt();
    created.fontSize = tp.getYScale();

    // Colors come from the graphics state, not from the text position.
    created.strokeColor = gs.getStrokingColor();
    created.nonStrokeColor = gs.getNonStrokingColor();

    // Note: an earlier adjustment that, for each space in the run, subtracted
    // tp.getWidthOfSpace() from the width and added tp.getWordSpacing() is disabled.
    return created;
}

From source file:com.trollworks.gcs.pdfview.PdfRenderer.java

License:Open Source License

/**
 * Fills a highlight shape on {@code mGC} for every character of every occurrence of
 * {@code mTextToHighlight} in the given string.
 *
 * <p>The string is lowercased before searching. For each matched character, the font
 * bounding box is taken from glyph space to user space through the position's text matrix
 * and then flipped vertically before being filled. If the page rotation is non-zero, the
 * graphics context is translated and rotated for the fill and restored afterwards.
 *
 * <p>Nothing is drawn when {@code mTextToHighlight} is {@code null} or empty. A match that
 * extends past the end of {@code textPositions} is highlighted only as far as positions exist.
 *
 * @param text          the text being written
 * @param textPositions the positions of the text; indexed with the character offsets of {@code text}
 * @throws IOException if a font metric cannot be read
 */
@Override
protected void writeString(String text, List<TextPosition> textPositions) throws IOException {
    // An empty needle matches at every offset without ever advancing, so the search loop
    // below would never terminate; a null needle cannot be searched for at all.
    if (mTextToHighlight == null || mTextToHighlight.isEmpty()) {
        return;
    }
    String lowered = text.toLowerCase();
    int index = lowered.indexOf(mTextToHighlight);
    if (index == -1) {
        return;
    }

    PDPage currentPage = getCurrentPage();
    PDRectangle pageBoundingBox = currentPage.getBBox();
    // Flips the y axis about the page bounding box height.
    AffineTransform flip = new AffineTransform();
    flip.translate(0, pageBoundingBox.getHeight());
    flip.scale(1, -1);
    PDRectangle mediaBox = currentPage.getMediaBox();
    float mediaHeight = mediaBox.getHeight();
    float mediaWidth = mediaBox.getWidth();
    // Read once: the rotation belongs to the page and does not change per character.
    int rotation = currentPage.getRotation();
    int size = textPositions.size();

    while (index != -1) {
        int last = index + mTextToHighlight.length() - 1;
        // The string may hold more characters than there are positions; stop at the last
        // available position instead of indexing past the end of the list.
        int lastAvailable = Math.min(last, size - 1);
        for (int i = index; i <= lastAvailable; i++) {
            TextPosition pos = textPositions.get(i);
            PDFont font = pos.getFont();
            BoundingBox bbox = font.getBoundingBox();
            // Advance width of the first character code by the bounding box height.
            Rectangle2D.Float rect = new Rectangle2D.Float(0, bbox.getLowerLeftY(),
                    font.getWidth(pos.getCharacterCodes()[0]), bbox.getHeight());
            AffineTransform at = pos.getTextMatrix().createAffineTransform();
            if (font instanceof PDType3Font) {
                at.concatenate(font.getFontMatrix().createAffineTransform());
            } else {
                at.scale(1 / 1000f, 1 / 1000f);
            }
            Shape shape = flip.createTransformedShape(at.createTransformedShape(rect));
            // Saved so the rotation applied below can be undone after the fill.
            AffineTransform transform = mGC.getTransform();
            if (rotation != 0) {
                switch (rotation) {
                case 90:
                    mGC.translate(mediaHeight, 0);
                    break;
                case 270:
                    mGC.translate(0, mediaWidth);
                    break;
                case 180:
                    mGC.translate(mediaWidth, mediaHeight);
                    break;
                default:
                    break;
                }
                mGC.rotate(Math.toRadians(rotation));
            }
            mGC.fill(shape);
            if (rotation != 0) {
                mGC.setTransform(transform);
            }
        }
        // Continue after this match only while positions remain.
        index = last < size - 1 ? lowered.indexOf(mTextToHighlight, last + 1) : -1;
    }
}

From source file:com.yiyihealth.tools.test.DrawPrintTextLocations.java

License:Apache License

/**
 * Override the default functionality of PDFTextStripper.
 *
 * <p>For each text position this prints one line of metrics to standard output and draws two
 * outlines on {@code g2d}: a red rectangle built from the direction-adjusted coordinates and
 * the heuristic height, and a blue shape built from the font bounding box.
 *
 * @param string        the text being written; not read by this method
 * @param textPositions the positions to report and outline
 * @throws IOException if a font metric cannot be read
 */
@Override
protected void writeString(String string, List<TextPosition> textPositions) throws IOException {
    for (TextPosition glyph : textPositions) {
        float dirX = glyph.getXDirAdj();
        float dirY = glyph.getYDirAdj();
        float dirHeight = glyph.getHeightDir();
        float dirWidth = glyph.getWidthDirAdj();

        StringBuilder report = new StringBuilder();
        report.append("String[").append(dirX).append(",").append(dirY);
        report.append(" fs=").append(glyph.getFontSize());
        report.append(" xscale=").append(glyph.getXScale());
        report.append(" height=").append(dirHeight);
        report.append(" space=").append(glyph.getWidthOfSpace());
        report.append(" width=").append(dirWidth);
        report.append("]").append(glyph.getUnicode());
        System.out.println(report.toString());

        // Red outline: uses the "height" that text extraction heuristics work with. It is
        // not a real height; it is 1/2 of the bounding box height and starts at y=0.
        Rectangle2D.Float heuristicBox = new Rectangle2D.Float(dirX, (dirY - dirHeight), dirWidth, dirHeight);
        g2d.setColor(Color.red);
        g2d.draw(heuristicBox);

        // Blue outline: real vertical bounds taken from the font bounding box y values.
        // Usually this height matches what Adobe Reader shows when marking text.
        PDFont font = glyph.getFont();
        BoundingBox fontBox = font.getBoundingBox();

        // Glyph space: advance width by bounding box height.
        // todo: should iterate all chars
        float advance = font.getWidth(glyph.getCharacterCodes()[0]);
        Rectangle2D.Float glyphBox = new Rectangle2D.Float(0, fontBox.getLowerLeftY(), advance,
                fontBox.getHeight());

        // Glyph space -> user space.
        // note: getTextMatrix() is *not* the Text Matrix, it's the Text Rendering Matrix
        AffineTransform toUserSpace = glyph.getTextMatrix().createAffineTransform();
        if (font instanceof PDType3Font) {
            // bbox and font matrix are unscaled
            toUserSpace.concatenate(font.getFontMatrix().createAffineTransform());
        } else {
            // bbox and font matrix are already scaled to 1000
            toUserSpace.scale(1 / 1000f, 1 / 1000f);
        }

        // Apply the glyph transform, then the flip, then the rotation.
        Shape outline = rotateAT.createTransformedShape(
                flipAT.createTransformedShape(toUserSpace.createTransformedShape(glyphBox)));

        g2d.setColor(Color.blue);
        g2d.draw(outline);
    }
}

From source file:edu.ist.psu.sagnik.research.pdfbox2playground.javatest.DrawPrintTextLocations.java

License:Apache License

/**
 * Override the default functionality of PDFTextStripper.
 *
 * <p>Every text position is logged to standard output and outlined twice on {@code g2d}:
 * in red using the heuristic height, and in blue using the font bounding box.
 *
 * @param string        the text being written; not read by this method
 * @param textPositions the positions to log and outline
 * @throws IOException if a font metric cannot be read
 */
@Override
protected void writeString(String string, List<TextPosition> textPositions) throws IOException {
    for (TextPosition tp : textPositions) {
        // One log line per position, assembled from three fragments.
        String origin = "String[" + tp.getXDirAdj() + "," + tp.getYDirAdj();
        String scaling = " fs=" + tp.getFontSize() + " xscale=" + tp.getXScale();
        String extent = " height=" + tp.getHeightDir() + " space=" + tp.getWidthOfSpace() + " width="
                + tp.getWidthDirAdj();
        System.out.println(origin + scaling + extent + "]" + tp.getUnicode());

        // in red:
        // rectangle with the "height" used by text extraction heuristics (not a real
        // height: it is 1/2 of the bounding box height and starts at y=0)
        float top = tp.getYDirAdj() - tp.getHeightDir();
        Rectangle2D.Float rect = new Rectangle2D.Float(tp.getXDirAdj(), top, tp.getWidthDirAdj(),
                tp.getHeightDir());
        g2d.setColor(Color.red);
        g2d.draw(rect);

        // in blue:
        // rectangle with the real vertical bounds, based on the font bounding box y values;
        // usually the height is identical to what you see when marking text in Adobe Reader
        PDFont pdFont = tp.getFont();
        BoundingBox box = pdFont.getBoundingBox();

        // advance width and bbox height, both in glyph space
        // todo: should iterate all chars
        int firstCode = tp.getCharacterCodes()[0];
        float xadvance = pdFont.getWidth(firstCode);
        rect = new Rectangle2D.Float(0, box.getLowerLeftY(), xadvance, box.getHeight());

        // glyph space -> user space
        // note: tp.getTextMatrix() is *not* the Text Matrix, it's the Text Rendering Matrix
        AffineTransform glyphToUser = tp.getTextMatrix().createAffineTransform();
        boolean isType3 = pdFont instanceof PDType3Font;
        if (!isType3) {
            // bbox and font matrix are already scaled to 1000
            glyphToUser.scale(1 / 1000f, 1 / 1000f);
        } else {
            // bbox and font matrix are unscaled
            glyphToUser.concatenate(pdFont.getFontMatrix().createAffineTransform());
        }

        // glyph transform first, then flip, then rotation
        Shape transformed = glyphToUser.createTransformedShape(rect);
        transformed = flipAT.createTransformedShape(transformed);
        transformed = rotateAT.createTransformedShape(transformed);

        g2d.setColor(Color.blue);
        g2d.draw(transformed);
    }
}

From source file:org.fit.pdfdom.PDFBoxTree.java

License:Open Source License

/**
 * Updates the text style according to a new text position
 * @param bstyle the style to be updated
 * @param text the text position/*from   ww  w.j  a  va 2 s .co m*/
 */
protected void updateStyle(BoxStyle bstyle, TextPosition text) {
    String font = text.getFont().getName();
    String family = null;
    String weight = null;
    String fstyle = null;

    bstyle.setFontSize(text.getFontSizeInPt());
    bstyle.setLineHeight(text.getHeight());

    if (font != null) {
        //font style and weight
        for (int i = 0; i < pdFontType.length; i++) {
            if (font.toLowerCase().lastIndexOf(pdFontType[i]) >= 0) {
                weight = cssFontWeight[i];
                fstyle = cssFontStyle[i];
                break;
            }
        }
        if (weight != null)
            bstyle.setFontWeight(weight);
        else
            bstyle.setFontWeight(cssFontWeight[0]);
        if (fstyle != null)
            bstyle.setFontStyle(fstyle);
        else
            bstyle.setFontStyle(cssFontStyle[0]);

        //font family
        //If it's a known common font don't embed in html output to save space
        String knownFontFamily = findKnownFontFamily(font);
        if (!knownFontFamily.equals(""))
            family = knownFontFamily;
        else {
            family = fontTable.getUsedName(font);
            if (family == null)
                family = font;
        }

        if (family != null)
            bstyle.setFontFamily(family);
    }

    updateStyleForRenderingMode();
}