Example usage for org.apache.pdfbox.text TextPosition getUnicode

Introduction

In this page you can find the example usage for org.apache.pdfbox.text TextPosition getUnicode.

Prototype

public String getUnicode()

Source Link

Document

Return the string of characters stored in this object.

Usage

From source file:PDFTextExtract.java

License:Apache License

/**
 * Override the default functionality of PDFTextStripper.
 */// w  w  w .j  a v a  2  s .com
@Override

protected void writeString(String _string, List<TextPosition> textPositions) throws IOException {
    for (TextPosition text : textPositions) {
        {
            _tmp.add(new Text(text.getXDirAdj(), text.getYDirAdj(), text.getFontSize(), text.getXScale(),
                    text.getHeightDir(), text.getWidthOfSpace(), text.getWidthDirAdj(), text.getUnicode()));
        }
    }
}

From source file:com.formkiq.core.service.generator.pdfbox.TextToPDFieldMapper.java

License:Apache License

/**
 * Remove Non Printable Characters and extra spaces.
 * @param textPositions {@link List} of {@link TextPosition}
 * @return {@link List} of {@link TextPosition}
 *//*from   www  .  j  av a2 s  . com*/
private List<TextPosition> removeNonPrintableAndExtraSpaces(final List<TextPosition> textPositions) {

    List<TextPosition> list = textPositions.stream()
            .filter(s -> cleanTextContent(s.getUnicode()).equals(s.getUnicode())).collect(Collectors.toList());

    int c = 0;
    Iterator<TextPosition> itr = list.iterator();
    while (itr.hasNext()) {

        TextPosition tp = itr.next();
        if (isEmpty(tp.getUnicode().trim())) {
            c++;
            if (c > 2) {
                itr.remove();
            }
        } else {
            c = 0;
        }
    }

    return list;
}

From source file:com.repeatability.pdf.PDFTextStripper.java

License:Apache License

/**
 * This will print the text of the processed page to "output". It will estimate, based on the coordinates of the
 * text, where newlines and word spacings should be placed. The text will be sorted only if that feature was
 * enabled.//from w w w.j  av  a2s  .  c o m
 *
 * @throws IOException If there is an error writing the text.
 */
protected void writePage() throws IOException {
    float maxYForLine = MAX_Y_FOR_LINE_RESET_VALUE;
    float minYTopForLine = MIN_Y_TOP_FOR_LINE_RESET_VALUE;
    float endOfLastTextX = END_OF_LAST_TEXT_X_RESET_VALUE;
    float lastWordSpacing = LAST_WORD_SPACING_RESET_VALUE;
    float maxHeightForLine = MAX_HEIGHT_FOR_LINE_RESET_VALUE;
    PositionWrapper lastPosition = null;
    PositionWrapper lastLineStartPosition = null;

    boolean startOfPage = true; // flag to indicate start of page
    boolean startOfArticle;
    if (charactersByArticle.size() > 0) {
        writePageStart();
    }

    for (List<TextPosition> textList : charactersByArticle) {
        if (getSortByPosition()) {
            TextPositionComparator comparator = new TextPositionComparator();

            // because the TextPositionComparator is not transitive, but
            // JDK7+ enforces transitivity on comparators, we need to use
            // a custom quicksort implementation (which is slower, unfortunately).
            if (useCustomQuickSort) {
                QuickSort.sort(textList, comparator);
            } else {
                Collections.sort(textList, comparator);
            }
        }

        Iterator<TextPosition> textIter = textList.iterator();

        startArticle();
        startOfArticle = true;

        // Now cycle through to print the text.
        // We queue up a line at a time before we print so that we can convert
        // the line from presentation form to logical form (if needed).
        List<LineItem> line = new ArrayList<LineItem>();

        textIter = textList.iterator(); // start from the beginning again
        // PDF files don't always store spaces. We will need to guess where we should add
        // spaces based on the distances between TextPositions. Historically, this was done
        // based on the size of the space character provided by the font. In general, this
        // worked but there were cases where it did not work. Calculating the average character
        // width and using that as a metric works better in some cases but fails in some cases
        // where the spacing worked. So we use both. NOTE: Adobe reader also fails on some of
        // these examples.

        // Keeps track of the previous average character width
        float previousAveCharWidth = -1;
        while (textIter.hasNext()) {
            TextPosition position = textIter.next();
            PositionWrapper current = new PositionWrapper(position);
            String characterValue = position.getUnicode();

            // Resets the average character width when we see a change in font
            // or a change in the font size
            if (lastPosition != null && (position.getFont() != lastPosition.getTextPosition().getFont()
                    || position.getFontSize() != lastPosition.getTextPosition().getFontSize())) {
                previousAveCharWidth = -1;
            }

            float positionX;
            float positionY;
            float positionWidth;
            float positionHeight;

            // If we are sorting, then we need to use the text direction
            // adjusted coordinates, because they were used in the sorting.
            if (getSortByPosition()) {
                positionX = position.getXDirAdj();
                positionY = position.getYDirAdj();
                positionWidth = position.getWidthDirAdj();
                positionHeight = position.getHeightDir();
            } else {
                positionX = position.getX();
                positionY = position.getY();
                positionWidth = position.getWidth();
                positionHeight = position.getHeight();
            }

            // The current amount of characters in a word
            int wordCharCount = position.getIndividualWidths().length;

            // Estimate the expected width of the space based on the
            // space character with some margin.
            float wordSpacing = position.getWidthOfSpace();
            float deltaSpace;
            if (wordSpacing == 0 || Float.isNaN(wordSpacing)) {
                deltaSpace = Float.MAX_VALUE;
            } else {
                if (lastWordSpacing < 0) {
                    deltaSpace = wordSpacing * getSpacingTolerance();
                } else {
                    deltaSpace = (wordSpacing + lastWordSpacing) / 2f * getSpacingTolerance();
                }
            }

            // Estimate the expected width of the space based on the average character width
            // with some margin. This calculation does not make a true average (average of
            // averages) but we found that it gave the best results after numerous experiments.
            // Based on experiments we also found that .3 worked well.
            float averageCharWidth;
            if (previousAveCharWidth < 0) {
                averageCharWidth = positionWidth / wordCharCount;
            } else {
                averageCharWidth = (previousAveCharWidth + positionWidth / wordCharCount) / 2f;
            }
            float deltaCharWidth = averageCharWidth * getAverageCharTolerance();

            // Compares the values obtained by the average method and the wordSpacing method
            // and picks the smaller number.
            float expectedStartOfNextWordX = EXPECTED_START_OF_NEXT_WORD_X_RESET_VALUE;
            if (endOfLastTextX != END_OF_LAST_TEXT_X_RESET_VALUE) {
                if (deltaCharWidth > deltaSpace) {
                    expectedStartOfNextWordX = endOfLastTextX + deltaSpace;
                } else {
                    expectedStartOfNextWordX = endOfLastTextX + deltaCharWidth;
                }
            }

            if (lastPosition != null) {
                if (startOfArticle) {
                    lastPosition.setArticleStart();
                    startOfArticle = false;
                }
                // RDD - Here we determine whether this text object is on the current
                // line. We use the lastBaselineFontSize to handle the superscript
                // case, and the size of the current font to handle the subscript case.
                // Text must overlap with the last rendered baseline text by at least
                // a small amount in order to be considered as being on the same line.

                // XXX BC: In theory, this check should really check if the next char is in
                // full range seen in this line. This is what I tried to do with minYTopForLine,
                // but this caused a lot of regression test failures. So, I'm leaving it be for
                // now
                if (!overlap(positionY, positionHeight, maxYForLine, maxHeightForLine)) {
                    writeLine(normalize(line));
                    line.clear();
                    lastLineStartPosition = handleLineSeparation(current, lastPosition, lastLineStartPosition,
                            maxHeightForLine);
                    expectedStartOfNextWordX = EXPECTED_START_OF_NEXT_WORD_X_RESET_VALUE;
                    maxYForLine = MAX_Y_FOR_LINE_RESET_VALUE;
                    maxHeightForLine = MAX_HEIGHT_FOR_LINE_RESET_VALUE;
                    minYTopForLine = MIN_Y_TOP_FOR_LINE_RESET_VALUE;
                }
                // test if our TextPosition starts after a new word would be expected to start
                if (expectedStartOfNextWordX != EXPECTED_START_OF_NEXT_WORD_X_RESET_VALUE
                        && expectedStartOfNextWordX < positionX &&
                        // only bother adding a space if the last character was not a space
                        lastPosition.getTextPosition().getUnicode() != null
                        && !lastPosition.getTextPosition().getUnicode().endsWith(" ")) {
                    line.add(LineItem.getWordSeparator());
                }
            }
            if (positionY >= maxYForLine) {
                maxYForLine = positionY;
            }
            // RDD - endX is what PDF considers to be the x coordinate of the
            // end position of the text. We use it in computing our metrics below.
            endOfLastTextX = positionX + positionWidth;

            // add it to the list
            if (characterValue != null) {
                if (startOfPage && lastPosition == null) {
                    writeParagraphStart();// not sure this is correct for RTL?
                }
                line.add(new LineItem(position));
            }
            maxHeightForLine = Math.max(maxHeightForLine, positionHeight);
            minYTopForLine = Math.min(minYTopForLine, positionY - positionHeight);
            lastPosition = current;
            if (startOfPage) {
                lastPosition.setParagraphStart();
                lastPosition.setLineStart();
                lastLineStartPosition = lastPosition;
                startOfPage = false;
            }
            lastWordSpacing = wordSpacing;
            previousAveCharWidth = averageCharWidth;
        }
        // print the final line
        if (line.size() > 0) {
            writeLine(normalize(line));
            writeParagraphEnd();
        }
        endArticle();
    }
    writePageEnd();
}

From source file:com.repeatability.pdf.PDFTextStripper.java

License:Apache License

/**
 * Write the string in TextPosition to the output stream.
 *
 * @param text The text to write to the stream.
 * @throws IOException If there is an error when writing the text.
 *//* w  ww  .j av a2s .  co m*/
protected void writeCharacters(TextPosition text) throws IOException {
    output.write(text.getUnicode());
}

From source file:com.repeatability.pdf.PDFTextStripper.java

License:Apache License

/**
 * This will process a TextPosition object and add the text to the list of characters on a page. It takes care of
 * overlapping text.//  w  ww  .  j  a  v a  2 s .c  o  m
 *
 * @param text The text to process.
 */
@Override
protected void processTextPosition(TextPosition text) {
    boolean showCharacter = true;
    if (suppressDuplicateOverlappingText) {
        showCharacter = false;
        String textCharacter = text.getUnicode();
        float textX = text.getX();
        float textY = text.getY();
        TreeMap<Float, TreeSet<Float>> sameTextCharacters = characterListMapping.get(textCharacter);
        if (sameTextCharacters == null) {
            sameTextCharacters = new TreeMap<Float, TreeSet<Float>>();
            characterListMapping.put(textCharacter, sameTextCharacters);
        }
        // RDD - Here we compute the value that represents the end of the rendered
        // text. This value is used to determine whether subsequent text rendered
        // on the same line overwrites the current text.
        //
        // We subtract any positive padding to handle cases where extreme amounts
        // of padding are applied, then backed off (not sure why this is done, but there
        // are cases where the padding is on the order of 10x the character width, and
        // the TJ just backs up to compensate after each character). Also, we subtract
        // an amount to allow for kerning (a percentage of the width of the last
        // character).
        boolean suppressCharacter = false;
        float tolerance = text.getWidth() / textCharacter.length() / 3.0f;

        SortedMap<Float, TreeSet<Float>> xMatches = sameTextCharacters.subMap(textX - tolerance,
                textX + tolerance);
        for (TreeSet<Float> xMatch : xMatches.values()) {
            SortedSet<Float> yMatches = xMatch.subSet(textY - tolerance, textY + tolerance);
            if (!yMatches.isEmpty()) {
                suppressCharacter = true;
                break;
            }
        }
        if (!suppressCharacter) {
            TreeSet<Float> ySet = sameTextCharacters.get(textX);
            if (ySet == null) {
                ySet = new TreeSet<Float>();
                sameTextCharacters.put(textX, ySet);
            }
            ySet.add(textY);
            showCharacter = true;
        }
    }
    if (showCharacter) {
        // if we are showing the character then we need to determine which article it belongs to
        int foundArticleDivisionIndex = -1;
        int notFoundButFirstLeftAndAboveArticleDivisionIndex = -1;
        int notFoundButFirstLeftArticleDivisionIndex = -1;
        int notFoundButFirstAboveArticleDivisionIndex = -1;
        float x = text.getX();
        float y = text.getY();
        if (shouldSeparateByBeads) {
            for (int i = 0; i < beadRectangles.size() && foundArticleDivisionIndex == -1; i++) {
                PDRectangle rect = beadRectangles.get(i);
                if (rect != null) {
                    if (rect.contains(x, y)) {
                        foundArticleDivisionIndex = i * 2 + 1;
                    } else if ((x < rect.getLowerLeftX() || y < rect.getUpperRightY())
                            && notFoundButFirstLeftAndAboveArticleDivisionIndex == -1) {
                        notFoundButFirstLeftAndAboveArticleDivisionIndex = i * 2;
                    } else if (x < rect.getLowerLeftX() && notFoundButFirstLeftArticleDivisionIndex == -1) {
                        notFoundButFirstLeftArticleDivisionIndex = i * 2;
                    } else if (y < rect.getUpperRightY() && notFoundButFirstAboveArticleDivisionIndex == -1) {
                        notFoundButFirstAboveArticleDivisionIndex = i * 2;
                    }
                } else {
                    foundArticleDivisionIndex = 0;
                }
            }
        } else {
            foundArticleDivisionIndex = 0;
        }
        int articleDivisionIndex;
        if (foundArticleDivisionIndex != -1) {
            articleDivisionIndex = foundArticleDivisionIndex;
        } else if (notFoundButFirstLeftAndAboveArticleDivisionIndex != -1) {
            articleDivisionIndex = notFoundButFirstLeftAndAboveArticleDivisionIndex;
        } else if (notFoundButFirstLeftArticleDivisionIndex != -1) {
            articleDivisionIndex = notFoundButFirstLeftArticleDivisionIndex;
        } else if (notFoundButFirstAboveArticleDivisionIndex != -1) {
            articleDivisionIndex = notFoundButFirstAboveArticleDivisionIndex;
        } else {
            articleDivisionIndex = charactersByArticle.size() - 1;
        }

        List<TextPosition> textList = charactersByArticle.get(articleDivisionIndex);

        // In the wild, some PDF encoded documents put diacritics (accents on
        // top of characters) into a separate Tj element. When displaying them
        // graphically, the two chunks get overlayed. With text output though,
        // we need to do the overlay. This code recombines the diacritic with
        // its associated character if the two are consecutive.
        if (textList.isEmpty()) {
            textList.add(text);
        } else {
            // test if we overlap the previous entry.
            // Note that we are making an assumption that we need to only look back
            // one TextPosition to find what we are overlapping.
            // This may not always be true. */
            TextPosition previousTextPosition = textList.get(textList.size() - 1);
            if (text.isDiacritic() && previousTextPosition.contains(text)) {
                previousTextPosition.mergeDiacritic(text);
            }
            // If the previous TextPosition was the diacritic, merge it into this
            // one and remove it from the list.
            else if (previousTextPosition.isDiacritic() && text.contains(previousTextPosition)) {
                text.mergeDiacritic(previousTextPosition);
                textList.remove(textList.size() - 1);
                textList.add(text);
            } else {
                textList.add(text);
            }
        }
    }
}

From source file:com.repeatability.pdf.PDFTextStripper.java

License:Apache License

/**
 * returns the list item Pattern object that matches the text at the specified PositionWrapper or null if the text
 * does not match such a pattern. The list of Patterns tested against is given by the {@link #getListItemPatterns()}
 * method. To add to the list, simply override that method (if sub-classing) or explicitly supply your own list
 * using {@link #setListItemPatterns(List)}.
 * /*from w  w  w .ja  v a 2  s . c om*/
 * @param pw position
 * @return the matching pattern
 */
private Pattern matchListItemPattern(PositionWrapper pw) {
    TextPosition tp = pw.getTextPosition();
    String txt = tp.getUnicode();
    return matchPattern(txt, getListItemPatterns());
}

From source file:com.repeatability.pdf.PDFTextStripper.java

License:Apache License

/**
 * Used within {@link #normalize(List, boolean, boolean)} to handle a {@link TextPosition}.
 * /*from w w  w  . j  a  v a2 s. c  om*/
 * @return The StringBuilder that must be used when calling this method.
 */
// kwa
//    private StringBuilder normalizeAdd(List<WordWithTextPositions> normalized,
//            StringBuilder lineBuilder, List<TextPosition> wordPositions, LineItem item)
protected StringBuilder normalizeAdd(List<WordWithTextPositions> normalized, StringBuilder lineBuilder,
        List<TextPosition> wordPositions, LineItem item) {
    if (item.isWordSeparator()) {
        normalized.add(createWord(lineBuilder.toString(), new ArrayList<TextPosition>(wordPositions)));
        lineBuilder = new StringBuilder();
        wordPositions.clear();
    } else {
        TextPosition text = item.getTextPosition();
        if (text.getUnicode().length() != 1) {
            ; // System.out.println("Ha!");

        }
        lineBuilder.append(text.getUnicode());
        wordPositions.add(text);
    }
    return lineBuilder;
}

From source file:com.tekstosense.segmenter.data.Text.java

License:Open Source License

public static Text newFor(TextPosition tp, PDGraphicsState gs, String text) {
    Text t = new Text();
    t.x = tp.getXDirAdj();//w ww.j a  va 2s  . com
    t.baseline = tp.getYDirAdj();
    t.font = tp.getFont();
    t.strokeColor = gs.getStrokingColor();
    t.nonStrokeColor = gs.getNonStrokingColor();
    t.run = tp.getUnicode();
    t.width = tp.getWidth();
    t.height = tp.getHeight();
    t.pointSize = tp.getFontSizeInPt();
    t.fontSize = tp.getYScale();
    t.tempRun = t.run;

    // Bump the width by the word spacing for each space in tp.
    /*      for (int i=0; i<tp.getCharacter().length(); i++) {
              Character c = tp.getCharacter().charAt(i);
              if (c.equals(" ")) {
    t.width -= tp.getWidthOfSpace();
      t.width += tp.getWordSpacing();
              }
          }
    */
    return t;
}

From source file:com.yiyihealth.tools.test.DrawPrintTextLocations.java

License:Apache License

/**
 * Override the default functionality of PDFTextStripper.
 *//*from  ww  w. j a  v  a  2s.  c o m*/
@Override
protected void writeString(String string, List<TextPosition> textPositions) throws IOException {
    for (TextPosition text : textPositions) {
        System.out.println("String[" + text.getXDirAdj() + "," + text.getYDirAdj() + " fs=" + text.getFontSize()
                + " xscale=" + text.getXScale() + " height=" + text.getHeightDir() + " space="
                + text.getWidthOfSpace() + " width=" + text.getWidthDirAdj() + "]" + text.getUnicode());

        // in red:
        // show rectangles with the "height" (not a real height, but used for text extraction 
        // heuristics, it is 1/2 of the bounding box height and starts at y=0)
        Rectangle2D.Float rect = new Rectangle2D.Float(text.getXDirAdj(),
                (text.getYDirAdj() - text.getHeightDir()), text.getWidthDirAdj(), text.getHeightDir());
        g2d.setColor(Color.red);
        g2d.draw(rect);

        // in blue:
        // show rectangle with the real vertical bounds, based on the font bounding box y values
        // usually, the height is identical to what you see when marking text in Adobe Reader
        PDFont font = text.getFont();
        BoundingBox bbox = font.getBoundingBox();

        // advance width, bbox height (glyph space)
        float xadvance = font.getWidth(text.getCharacterCodes()[0]); // todo: should iterate all chars
        rect = new Rectangle2D.Float(0, bbox.getLowerLeftY(), xadvance, bbox.getHeight());

        // glyph space -> user space
        // note: text.getTextMatrix() is *not* the Text Matrix, it's the Text Rendering Matrix
        AffineTransform at = text.getTextMatrix().createAffineTransform();
        if (font instanceof PDType3Font) {
            // bbox and font matrix are unscaled
            at.concatenate(font.getFontMatrix().createAffineTransform());
        } else {
            // bbox and font matrix are already scaled to 1000
            at.scale(1 / 1000f, 1 / 1000f);
        }
        Shape s = at.createTransformedShape(rect);

        s = flipAT.createTransformedShape(s);
        s = rotateAT.createTransformedShape(s);

        g2d.setColor(Color.blue);
        g2d.draw(s);
    }
}

From source file:edu.ist.psu.sagnik.research.pdfbox2playground.javatest.DrawPrintTextLocations.java

License:Apache License

/**
 * Override the default functionality of PDFTextStripper.
 *//*w ww  .  j  a  v a2 s  . co m*/
@Override
protected void writeString(String string, List<TextPosition> textPositions) throws IOException {
    for (TextPosition text : textPositions) {
        System.out.println("String[" + text.getXDirAdj() + "," + text.getYDirAdj() + " fs=" + text.getFontSize()
                + " xscale=" + text.getXScale() + " height=" + text.getHeightDir() + " space="
                + text.getWidthOfSpace() + " width=" + text.getWidthDirAdj() + "]" + text.getUnicode());

        // in red:
        // show rectangles with the "height" (not a real height, but used for text extraction
        // heuristics, it is 1/2 of the bounding box height and starts at y=0)
        Rectangle2D.Float rect = new Rectangle2D.Float(text.getXDirAdj(),
                (text.getYDirAdj() - text.getHeightDir()), text.getWidthDirAdj(), text.getHeightDir());
        g2d.setColor(Color.red);
        g2d.draw(rect);

        // in blue:
        // show rectangle with the real vertical bounds, based on the font bounding box y values
        // usually, the height is identical to what you see when marking text in Adobe Reader
        PDFont font = text.getFont();
        BoundingBox bbox = font.getBoundingBox();

        // advance width, bbox height (glyph space)
        float xadvance = font.getWidth(text.getCharacterCodes()[0]); // todo: should iterate all chars
        rect = new Rectangle2D.Float(0, bbox.getLowerLeftY(), xadvance, bbox.getHeight());

        // glyph space -> user space
        // note: text.getTextMatrix() is *not* the Text Matrix, it's the Text Rendering Matrix
        AffineTransform at = text.getTextMatrix().createAffineTransform();
        if (font instanceof PDType3Font) {
            // bbox and font matrix are unscaled
            at.concatenate(font.getFontMatrix().createAffineTransform());
        } else {
            // bbox and font matrix are already scaled to 1000
            at.scale(1 / 1000f, 1 / 1000f);
        }
        Shape s = at.createTransformedShape(rect);

        s = flipAT.createTransformedShape(s);
        s = rotateAT.createTransformedShape(s);

        g2d.setColor(Color.blue);
        g2d.draw(s);
    }
}