List of usage examples for org.apache.pdfbox.text TextPosition getX
public float getX()
From source file:at.knowcenter.wag.egov.egiz.pdfbox2.pdf.PDFPage.java
License:EUPL
/** * A method provided as an event interface to allow a subclass to perform * some specific functionality when a character needs to be displayed. This * method is used to calculate the latest position of a text in the page. * Sorry for this missinterpretation of the method, but it is the only way * to do this (provided by PDFBox)!!!// ww w. j a v a 2 s.c om * * @param text * the character to be displayed -> calculate there y position. */ protected void showCharacter(TextPosition text) { float current_y = text.getY(); final String character = text.toString(); if (at.gv.egiz.pdfas.common.utils.StringUtils.whiteSpaceTrim(character).isEmpty()) { return; } int pageRotation = this.getCurrentPage().getRotation(); // logger_.debug("PageRotation = " + pageRotation); /*if (pageRotation == 0) { current_y = text.getY(); } if (pageRotation == 90) { current_y = text.getY(); } if (pageRotation == 180) { current_y = text.getY(); } if (pageRotation == 270) { current_y = text.getY(); } if (current_y > this.effectivePageHeight) { this.max_character_ypos = this.effectivePageHeight; return; } // store ypos of the char if it is not empty if (current_y > this.max_character_ypos) { this.max_character_ypos = current_y; }*/ if (pageRotation == 0) { current_y = text.getY(); } if (pageRotation == 90) { current_y = text.getX(); } if (pageRotation == 180) { float page_height = this.getCurrentPage().getMediaBox().getHeight(); current_y = page_height - text.getY(); } if (pageRotation == 270) { float page_height = this.getCurrentPage().getMediaBox().getHeight(); current_y = page_height - text.getX(); } if (current_y > this.effectivePageHeight) { // logger_.debug("character is below footer_line. footer_line = " + // this.footer_line + ", text.character=" + character + ", y=" + // current_y); return; } // store ypos of the char if it is not empty if (current_y > this.max_character_ypos) { this.max_character_ypos = current_y; } }
From source file:com.repeatability.pdf.PDFTextStripper.java
License:Apache License
/** * This will print the text of the processed page to "output". It will estimate, based on the coordinates of the * text, where newlines and word spacings should be placed. The text will be sorted only if that feature was * enabled.//from www . j ava2s .com * * @throws IOException If there is an error writing the text. */ protected void writePage() throws IOException { float maxYForLine = MAX_Y_FOR_LINE_RESET_VALUE; float minYTopForLine = MIN_Y_TOP_FOR_LINE_RESET_VALUE; float endOfLastTextX = END_OF_LAST_TEXT_X_RESET_VALUE; float lastWordSpacing = LAST_WORD_SPACING_RESET_VALUE; float maxHeightForLine = MAX_HEIGHT_FOR_LINE_RESET_VALUE; PositionWrapper lastPosition = null; PositionWrapper lastLineStartPosition = null; boolean startOfPage = true; // flag to indicate start of page boolean startOfArticle; if (charactersByArticle.size() > 0) { writePageStart(); } for (List<TextPosition> textList : charactersByArticle) { if (getSortByPosition()) { TextPositionComparator comparator = new TextPositionComparator(); // because the TextPositionComparator is not transitive, but // JDK7+ enforces transitivity on comparators, we need to use // a custom quicksort implementation (which is slower, unfortunately). if (useCustomQuickSort) { QuickSort.sort(textList, comparator); } else { Collections.sort(textList, comparator); } } Iterator<TextPosition> textIter = textList.iterator(); startArticle(); startOfArticle = true; // Now cycle through to print the text. // We queue up a line at a time before we print so that we can convert // the line from presentation form to logical form (if needed). List<LineItem> line = new ArrayList<LineItem>(); textIter = textList.iterator(); // start from the beginning again // PDF files don't always store spaces. We will need to guess where we should add // spaces based on the distances between TextPositions. Historically, this was done // based on the size of the space character provided by the font. In general, this // worked but there were cases where it did not work. Calculating the average character // width and using that as a metric works better in some cases but fails in some cases // where the spacing worked. So we use both. NOTE: Adobe reader also fails on some of // these examples. // Keeps track of the previous average character width float previousAveCharWidth = -1; while (textIter.hasNext()) { TextPosition position = textIter.next(); PositionWrapper current = new PositionWrapper(position); String characterValue = position.getUnicode(); // Resets the average character width when we see a change in font // or a change in the font size if (lastPosition != null && (position.getFont() != lastPosition.getTextPosition().getFont() || position.getFontSize() != lastPosition.getTextPosition().getFontSize())) { previousAveCharWidth = -1; } float positionX; float positionY; float positionWidth; float positionHeight; // If we are sorting, then we need to use the text direction // adjusted coordinates, because they were used in the sorting. if (getSortByPosition()) { positionX = position.getXDirAdj(); positionY = position.getYDirAdj(); positionWidth = position.getWidthDirAdj(); positionHeight = position.getHeightDir(); } else { positionX = position.getX(); positionY = position.getY(); positionWidth = position.getWidth(); positionHeight = position.getHeight(); } // The current amount of characters in a word int wordCharCount = position.getIndividualWidths().length; // Estimate the expected width of the space based on the // space character with some margin. float wordSpacing = position.getWidthOfSpace(); float deltaSpace; if (wordSpacing == 0 || Float.isNaN(wordSpacing)) { deltaSpace = Float.MAX_VALUE; } else { if (lastWordSpacing < 0) { deltaSpace = wordSpacing * getSpacingTolerance(); } else { deltaSpace = (wordSpacing + lastWordSpacing) / 2f * getSpacingTolerance(); } } // Estimate the expected width of the space based on the average character width // with some margin. This calculation does not make a true average (average of // averages) but we found that it gave the best results after numerous experiments. // Based on experiments we also found that .3 worked well. float averageCharWidth; if (previousAveCharWidth < 0) { averageCharWidth = positionWidth / wordCharCount; } else { averageCharWidth = (previousAveCharWidth + positionWidth / wordCharCount) / 2f; } float deltaCharWidth = averageCharWidth * getAverageCharTolerance(); // Compares the values obtained by the average method and the wordSpacing method // and picks the smaller number. float expectedStartOfNextWordX = EXPECTED_START_OF_NEXT_WORD_X_RESET_VALUE; if (endOfLastTextX != END_OF_LAST_TEXT_X_RESET_VALUE) { if (deltaCharWidth > deltaSpace) { expectedStartOfNextWordX = endOfLastTextX + deltaSpace; } else { expectedStartOfNextWordX = endOfLastTextX + deltaCharWidth; } } if (lastPosition != null) { if (startOfArticle) { lastPosition.setArticleStart(); startOfArticle = false; } // RDD - Here we determine whether this text object is on the current // line. We use the lastBaselineFontSize to handle the superscript // case, and the size of the current font to handle the subscript case. // Text must overlap with the last rendered baseline text by at least // a small amount in order to be considered as being on the same line. // XXX BC: In theory, this check should really check if the next char is in // full range seen in this line. This is what I tried to do with minYTopForLine, // but this caused a lot of regression test failures. So, I'm leaving it be for // now if (!overlap(positionY, positionHeight, maxYForLine, maxHeightForLine)) { writeLine(normalize(line)); line.clear(); lastLineStartPosition = handleLineSeparation(current, lastPosition, lastLineStartPosition, maxHeightForLine); expectedStartOfNextWordX = EXPECTED_START_OF_NEXT_WORD_X_RESET_VALUE; maxYForLine = MAX_Y_FOR_LINE_RESET_VALUE; maxHeightForLine = MAX_HEIGHT_FOR_LINE_RESET_VALUE; minYTopForLine = MIN_Y_TOP_FOR_LINE_RESET_VALUE; } // test if our TextPosition starts after a new word would be expected to start if (expectedStartOfNextWordX != EXPECTED_START_OF_NEXT_WORD_X_RESET_VALUE && expectedStartOfNextWordX < positionX && // only bother adding a space if the last character was not a space lastPosition.getTextPosition().getUnicode() != null && !lastPosition.getTextPosition().getUnicode().endsWith(" ")) { line.add(LineItem.getWordSeparator()); } } if (positionY >= maxYForLine) { maxYForLine = positionY; } // RDD - endX is what PDF considers to be the x coordinate of the // end position of the text. We use it in computing our metrics below. endOfLastTextX = positionX + positionWidth; // add it to the list if (characterValue != null) { if (startOfPage && lastPosition == null) { writeParagraphStart();// not sure this is correct for RTL? } line.add(new LineItem(position)); } maxHeightForLine = Math.max(maxHeightForLine, positionHeight); minYTopForLine = Math.min(minYTopForLine, positionY - positionHeight); lastPosition = current; if (startOfPage) { lastPosition.setParagraphStart(); lastPosition.setLineStart(); lastLineStartPosition = lastPosition; startOfPage = false; } lastWordSpacing = wordSpacing; previousAveCharWidth = averageCharWidth; } // print the final line if (line.size() > 0) { writeLine(normalize(line)); writeParagraphEnd(); } endArticle(); } writePageEnd(); }
From source file:com.repeatability.pdf.PDFTextStripper.java
License:Apache License
/** * This will process a TextPosition object and add the text to the list of characters on a page. It takes care of * overlapping text./*from ww w . j a v a 2 s.com*/ * * @param text The text to process. */ @Override protected void processTextPosition(TextPosition text) { boolean showCharacter = true; if (suppressDuplicateOverlappingText) { showCharacter = false; String textCharacter = text.getUnicode(); float textX = text.getX(); float textY = text.getY(); TreeMap<Float, TreeSet<Float>> sameTextCharacters = characterListMapping.get(textCharacter); if (sameTextCharacters == null) { sameTextCharacters = new TreeMap<Float, TreeSet<Float>>(); characterListMapping.put(textCharacter, sameTextCharacters); } // RDD - Here we compute the value that represents the end of the rendered // text. This value is used to determine whether subsequent text rendered // on the same line overwrites the current text. // // We subtract any positive padding to handle cases where extreme amounts // of padding are applied, then backed off (not sure why this is done, but there // are cases where the padding is on the order of 10x the character width, and // the TJ just backs up to compensate after each character). Also, we subtract // an amount to allow for kerning (a percentage of the width of the last // character). boolean suppressCharacter = false; float tolerance = text.getWidth() / textCharacter.length() / 3.0f; SortedMap<Float, TreeSet<Float>> xMatches = sameTextCharacters.subMap(textX - tolerance, textX + tolerance); for (TreeSet<Float> xMatch : xMatches.values()) { SortedSet<Float> yMatches = xMatch.subSet(textY - tolerance, textY + tolerance); if (!yMatches.isEmpty()) { suppressCharacter = true; break; } } if (!suppressCharacter) { TreeSet<Float> ySet = sameTextCharacters.get(textX); if (ySet == null) { ySet = new TreeSet<Float>(); sameTextCharacters.put(textX, ySet); } ySet.add(textY); showCharacter = true; } } if (showCharacter) { // if we are showing the character then we need to determine which article it belongs to int foundArticleDivisionIndex = -1; int notFoundButFirstLeftAndAboveArticleDivisionIndex = -1; int notFoundButFirstLeftArticleDivisionIndex = -1; int notFoundButFirstAboveArticleDivisionIndex = -1; float x = text.getX(); float y = text.getY(); if (shouldSeparateByBeads) { for (int i = 0; i < beadRectangles.size() && foundArticleDivisionIndex == -1; i++) { PDRectangle rect = beadRectangles.get(i); if (rect != null) { if (rect.contains(x, y)) { foundArticleDivisionIndex = i * 2 + 1; } else if ((x < rect.getLowerLeftX() || y < rect.getUpperRightY()) && notFoundButFirstLeftAndAboveArticleDivisionIndex == -1) { notFoundButFirstLeftAndAboveArticleDivisionIndex = i * 2; } else if (x < rect.getLowerLeftX() && notFoundButFirstLeftArticleDivisionIndex == -1) { notFoundButFirstLeftArticleDivisionIndex = i * 2; } else if (y < rect.getUpperRightY() && notFoundButFirstAboveArticleDivisionIndex == -1) { notFoundButFirstAboveArticleDivisionIndex = i * 2; } } else { foundArticleDivisionIndex = 0; } } } else { foundArticleDivisionIndex = 0; } int articleDivisionIndex; if (foundArticleDivisionIndex != -1) { articleDivisionIndex = foundArticleDivisionIndex; } else if (notFoundButFirstLeftAndAboveArticleDivisionIndex != -1) { articleDivisionIndex = notFoundButFirstLeftAndAboveArticleDivisionIndex; } else if (notFoundButFirstLeftArticleDivisionIndex != -1) { articleDivisionIndex = notFoundButFirstLeftArticleDivisionIndex; } else if (notFoundButFirstAboveArticleDivisionIndex != -1) { articleDivisionIndex = notFoundButFirstAboveArticleDivisionIndex; } else { articleDivisionIndex = charactersByArticle.size() - 1; } List<TextPosition> textList = charactersByArticle.get(articleDivisionIndex); // In the wild, some PDF encoded documents put diacritics (accents on // top of characters) into a separate Tj element. When displaying them // graphically, the two chunks get overlayed. With text output though, // we need to do the overlay. This code recombines the diacritic with // its associated character if the two are consecutive. if (textList.isEmpty()) { textList.add(text); } else { // test if we overlap the previous entry. // Note that we are making an assumption that we need to only look back // one TextPosition to find what we are overlapping. // This may not always be true. */ TextPosition previousTextPosition = textList.get(textList.size() - 1); if (text.isDiacritic() && previousTextPosition.contains(text)) { previousTextPosition.mergeDiacritic(text); } // If the previous TextPosition was the diacritic, merge it into this // one and remove it from the list. else if (previousTextPosition.isDiacritic() && text.contains(previousTextPosition)) { text.mergeDiacritic(previousTextPosition); textList.remove(textList.size() - 1); textList.add(text); } else { textList.add(text); } } } }
From source file:org.fit.pdfdom.PDFBoxTree.java
License:Open Source License
@Override protected void processTextPosition(TextPosition text) { if (text.isDiacritic()) { lastDia = text;//from w w w . ja v a2 s. co m } else if (!text.getUnicode().trim().isEmpty()) { if (lastDia != null) { if (text.contains(lastDia)) text.mergeDiacritic(lastDia); lastDia = null; } /*float[] c = transformPosition(text.getX(), text.getY()); cur_x = c[0]; cur_y = c[1];*/ cur_x = text.getX(); cur_y = text.getY(); /*System.out.println("Text: " + text.getCharacter()); System.out.println(" Font size: " + text.getFontSize() + " " + text.getFontSizeInPt() + "pt"); System.out.println(" Width: " + text.getWidth()); System.out.println(" Width adj: " + text.getWidthDirAdj()); System.out.println(" Height: " + text.getHeight()); System.out.println(" Height dir: " + text.getHeightDir()); System.out.println(" XScale: " + text.getXScale()); System.out.println(" YScale: " + text.getYScale());*/ float distx = 0; float disty = 0; if (lastText != null) { distx = text.getX() - (lastText.getX() + lastText.getWidth()); disty = text.getY() - lastText.getY(); } //should we split the boxes? boolean split = lastText == null || distx > 1.0f || distx < -6.0f || Math.abs(disty) > 1.0f || isReversed(getTextDirectionality(text)) != isReversed(getTextDirectionality(lastText)); //if the style changed, we should split the boxes updateStyle(style, text); if (!style.equals(curstyle)) split = true; if (split) //start of a new box { //finish current box (if any) if (lastText != null) { finishBox(); } //start a new box curstyle = new BoxStyle(style); } textLine.append(text.getUnicode()); if (textMetrics == null) textMetrics = new TextMetrics(text); else textMetrics.append(text); lastText = text; } }