List of usage examples for org.apache.pdfbox.text TextPosition getXDirAdj
public float getXDirAdj()
From source file:PDFTextExtract.java
License:Apache License
/**
 * Override the default functionality of PDFTextStripper.
 *
 * <p>Nothing is written to the output here. Each {@link TextPosition} is
 * converted into a {@code Text} record (direction-adjusted x/y, font size,
 * x scale, direction-adjusted height, width of a space, direction-adjusted
 * width and the unicode string) and appended to {@code _tmp}.
 *
 * @param _string       the text of the chunk; not read by this override
 * @param textPositions the positions making up the chunk
 * @throws IOException declared by the overridden method
 */
@Override
protected void writeString(String _string, List<TextPosition> textPositions) throws IOException {
    for (TextPosition glyph : textPositions) {
        // Read the attributes in the same order the Text constructor expects them.
        final float x = glyph.getXDirAdj();
        final float y = glyph.getYDirAdj();
        final float fontSize = glyph.getFontSize();
        final float xScale = glyph.getXScale();
        final float height = glyph.getHeightDir();
        final float spaceWidth = glyph.getWidthOfSpace();
        final float width = glyph.getWidthDirAdj();
        final String unicode = glyph.getUnicode();

        _tmp.add(new Text(x, y, fontSize, xScale, height, spaceWidth, width, unicode));
    }
}
From source file:com.plumblarrick.andrew.cityrecordtextextractor.CRTextStripper.java
@Override protected void writeString(String text, List<TextPosition> textPositions) throws IOException { TextPosition tPos = textPositions.get(0); TextPosition ePos = textPositions.get(textPositions.size() - 1); //Pattern capNoSpace = Pattern.compile("[a-z]([A-Z])"); String overRun = ""; text = text.trim();/*from w w w . j av a 2s . c o m*/ int startPoint = Math.round(tPos.getXDirAdj()); int endPoint = Math.round(ePos.getXDirAdj()); int splitPoint = 0; if (textChunksOnLine == 0) { output.write(startPoint + "\t"); if (avgFirstColStarts == 0) { temp_sumFirstColStarts = startPoint; temp_sumFirstColEnds = endPoint; } else { temp_sumFirstColStarts = sumFirstColStarts + startPoint; temp_sumFirstColEnds = sumFirstColEnds + endPoint; } } else { if (textChunksOnLine == 1) { columnLineCounter++; if (avgSecondColStarts == 0) { avgSecondColStarts = startPoint; sumSecondColStarts = startPoint; //avgSecondColEnds = endPoint; sumSecondColEnds = endPoint; sumFirstColStarts = temp_sumFirstColStarts; avgFirstColStarts = temp_sumFirstColStarts; } else { sumSecondColStarts = sumSecondColStarts + startPoint; sumSecondColEnds = sumSecondColEnds + endPoint; sumFirstColStarts = temp_sumFirstColStarts; sumFirstColEnds = temp_sumFirstColEnds; avgFirstColStarts = sumFirstColStarts / columnLineCounter; //avgFirstColEnds = sumFirstColEnds / columnLineCounter; } temp_sumFirstColStarts = 0; temp_sumFirstColEnds = 0; } output.write("|" + startPoint + "\t"); } if (startPoint > endPoint) { //catches overrun issues in columns other than the first //check for beginning of overrun float prevXPos = 0; float currXPos; int splitIndex = 0; for (int i = 0; i < text.length(); i++) { currXPos = textPositions.get(i).getXDirAdj(); if (currXPos < prevXPos) { splitIndex = i; splitPoint = Math.round(currXPos); overRun = text.substring(splitIndex); text = text.substring(0, splitIndex); //output.write("|" + startPoint + "\t"); output.write(text); output.write("\n" + splitPoint + "\t" + overRun); 
writeLineSeparator(); break; } prevXPos = currXPos; } } else { output.write(text); } textChunksOnLine++; }
From source file:com.plumblarrick.andrew.cityrecordtextextractor.CRTStripper.java
@Override protected void writeString(String text, List<TextPosition> textPositions) throws IOException { if (pageCounter > 2) { TextPosition tPos = textPositions.get(0); TextPosition ePos = textPositions.get(textPositions.size() - 1); //Pattern capNoSpace = Pattern.compile("[a-z]([A-Z])"); String overRun = ""; text = text.trim();// w ww . j av a 2 s.co m int startPoint = Math.round(tPos.getXDirAdj()); int endPoint = Math.round(ePos.getXDirAdj()); int splitPoint = 0; double prevX = 0; if (textChunksOnLine > 0) { output.write("|"); } adjustXPosOrder(textPositions, text); textChunksOnLine++; } else { output.write(text); } }
From source file:com.plumblarrick.andrew.cityrecordtextextractor.CRTStripper.java
private void adjustXPosOrder(List<TextPosition> textPositions, String text) throws IOException { TextPosition stPos = textPositions.get(0); boolean inOrder = true; double prevX = 0; double prevXPos = 0; String overRun = ""; int startPoint = Math.round(stPos.getXDirAdj()); long splitPoint; int splitIndex = 0; output.write(String.valueOf(startPoint)); output.write("\t"); for (int i = 0; i < text.length(); i++) { double currXPos = textPositions.get(i).getXDirAdj(); if (i + 1 == text.length()) { output.write(text);/*from w w w. j a v a2 s . c om*/ } if (currXPos < prevXPos || (prevXPos > 0 && currXPos > prevXPos + 9)) { //'backwards' x-axis movement (in this set of docs) //is assumed to indicate an 'overrun' or erroneous //concat //long x-coord gaps may also indicate erroneous concat //and if this over-matches 'gaps' within columns //that should come back out in page processing //b/c reported x-coord should still map to appropriate //column splitIndex = i; splitPoint = Math.round(currXPos); overRun = text.substring(splitIndex); text = text.substring(0, splitIndex); output.write(text); writeLineSeparator(); inOrder = false; adjustXPosOrder(textPositions.subList(i, textPositions.size()), overRun); } prevXPos = currXPos; } }
From source file:com.repeatability.pdf.PDFTextStripper.java
License:Apache License
/**
 * This will print the text of the processed page to "output". It will estimate, based on the coordinates of the
 * text, where newlines and word spacings should be placed. The text will be sorted only if that feature was
 * enabled.
 *
 * <p>Each list in {@code charactersByArticle} is handled as one article: it is optionally sorted, then its
 * positions are queued into a line buffer that is flushed through {@code writeLine(normalize(line))} whenever a
 * position does not vertically overlap the current line. {@code writePageStart()} is only called when there is at
 * least one article, whereas {@code writePageEnd()} is always called.
 *
 * @throws IOException If there is an error writing the text.
 */
protected void writePage() throws IOException {
    // Per-line and per-page tracking state, all starting at their "reset" sentinels.
    float maxYForLine = MAX_Y_FOR_LINE_RESET_VALUE;
    float minYTopForLine = MIN_Y_TOP_FOR_LINE_RESET_VALUE;
    float endOfLastTextX = END_OF_LAST_TEXT_X_RESET_VALUE;
    float lastWordSpacing = LAST_WORD_SPACING_RESET_VALUE;
    float maxHeightForLine = MAX_HEIGHT_FOR_LINE_RESET_VALUE;
    PositionWrapper lastPosition = null;
    PositionWrapper lastLineStartPosition = null;
    boolean startOfPage = true; // flag to indicate start of page
    boolean startOfArticle;
    if (charactersByArticle.size() > 0) {
        writePageStart();
    }
    for (List<TextPosition> textList : charactersByArticle) {
        if (getSortByPosition()) {
            TextPositionComparator comparator = new TextPositionComparator();
            // because the TextPositionComparator is not transitive, but
            // JDK7+ enforces transitivity on comparators, we need to use
            // a custom quicksort implementation (which is slower, unfortunately).
            if (useCustomQuickSort) {
                QuickSort.sort(textList, comparator);
            } else {
                Collections.sort(textList, comparator);
            }
        }
        // NOTE(review): this iterator is replaced below before it is ever used.
        Iterator<TextPosition> textIter = textList.iterator();
        startArticle();
        startOfArticle = true;
        // Now cycle through to print the text.
        // We queue up a line at a time before we print so that we can convert
        // the line from presentation form to logical form (if needed).
        List<LineItem> line = new ArrayList<LineItem>();
        textIter = textList.iterator(); // start from the beginning again
        // PDF files don't always store spaces. We will need to guess where we should add
        // spaces based on the distances between TextPositions. Historically, this was done
        // based on the size of the space character provided by the font. In general, this
        // worked but there were cases where it did not work. Calculating the average character
        // width and using that as a metric works better in some cases but fails in some cases
        // where the spacing worked. So we use both. NOTE: Adobe reader also fails on some of
        // these examples.
        // Keeps track of the previous average character width
        float previousAveCharWidth = -1;
        while (textIter.hasNext()) {
            TextPosition position = textIter.next();
            PositionWrapper current = new PositionWrapper(position);
            String characterValue = position.getUnicode();
            // Resets the average character width when we see a change in font
            // or a change in the font size
            // (the font check is a reference comparison, not equals()).
            if (lastPosition != null && (position.getFont() != lastPosition.getTextPosition().getFont() || position.getFontSize() != lastPosition.getTextPosition().getFontSize())) {
                previousAveCharWidth = -1;
            }
            float positionX;
            float positionY;
            float positionWidth;
            float positionHeight;
            // If we are sorting, then we need to use the text direction
            // adjusted coordinates, because they were used in the sorting.
            if (getSortByPosition()) {
                positionX = position.getXDirAdj();
                positionY = position.getYDirAdj();
                positionWidth = position.getWidthDirAdj();
                positionHeight = position.getHeightDir();
            } else {
                positionX = position.getX();
                positionY = position.getY();
                positionWidth = position.getWidth();
                positionHeight = position.getHeight();
            }
            // The current amount of characters in a word
            int wordCharCount = position.getIndividualWidths().length;
            // Estimate the expected width of the space based on the
            // space character with some margin.
            float wordSpacing = position.getWidthOfSpace();
            float deltaSpace;
            if (wordSpacing == 0 || Float.isNaN(wordSpacing)) {
                // No usable space width: make this metric lose the comparison below.
                deltaSpace = Float.MAX_VALUE;
            } else {
                if (lastWordSpacing < 0) {
                    deltaSpace = wordSpacing * getSpacingTolerance();
                } else {
                    deltaSpace = (wordSpacing + lastWordSpacing) / 2f * getSpacingTolerance();
                }
            }
            // Estimate the expected width of the space based on the average character width
            // with some margin. This calculation does not make a true average (average of
            // averages) but we found that it gave the best results after numerous experiments.
            // Based on experiments we also found that .3 worked well.
            float averageCharWidth;
            if (previousAveCharWidth < 0) {
                averageCharWidth = positionWidth / wordCharCount;
            } else {
                averageCharWidth = (previousAveCharWidth + positionWidth / wordCharCount) / 2f;
            }
            float deltaCharWidth = averageCharWidth * getAverageCharTolerance();
            // Compares the values obtained by the average method and the wordSpacing method
            // and picks the smaller number.
            float expectedStartOfNextWordX = EXPECTED_START_OF_NEXT_WORD_X_RESET_VALUE;
            if (endOfLastTextX != END_OF_LAST_TEXT_X_RESET_VALUE) {
                if (deltaCharWidth > deltaSpace) {
                    expectedStartOfNextWordX = endOfLastTextX + deltaSpace;
                } else {
                    expectedStartOfNextWordX = endOfLastTextX + deltaCharWidth;
                }
            }
            if (lastPosition != null) {
                if (startOfArticle) {
                    lastPosition.setArticleStart();
                    startOfArticle = false;
                }
                // RDD - Here we determine whether this text object is on the current
                // line. We use the lastBaselineFontSize to handle the superscript
                // case, and the size of the current font to handle the subscript case.
                // Text must overlap with the last rendered baseline text by at least
                // a small amount in order to be considered as being on the same line.
                // XXX BC: In theory, this check should really check if the next char is in
                // full range seen in this line. This is what I tried to do with minYTopForLine,
                // but this caused a lot of regression test failures. So, I'm leaving it be for
                // now
                if (!overlap(positionY, positionHeight, maxYForLine, maxHeightForLine)) {
                    // New line: flush the queued line and reset the per-line state.
                    writeLine(normalize(line));
                    line.clear();
                    lastLineStartPosition = handleLineSeparation(current, lastPosition, lastLineStartPosition, maxHeightForLine);
                    expectedStartOfNextWordX = EXPECTED_START_OF_NEXT_WORD_X_RESET_VALUE;
                    maxYForLine = MAX_Y_FOR_LINE_RESET_VALUE;
                    maxHeightForLine = MAX_HEIGHT_FOR_LINE_RESET_VALUE;
                    minYTopForLine = MIN_Y_TOP_FOR_LINE_RESET_VALUE;
                }
                // test if our TextPosition starts after a new word would be expected to start
                if (expectedStartOfNextWordX != EXPECTED_START_OF_NEXT_WORD_X_RESET_VALUE && expectedStartOfNextWordX < positionX &&
                        // only bother adding a space if the last character was not a space
                        lastPosition.getTextPosition().getUnicode() != null && !lastPosition.getTextPosition().getUnicode().endsWith(" ")) {
                    line.add(LineItem.getWordSeparator());
                }
            }
            if (positionY >= maxYForLine) {
                maxYForLine = positionY;
            }
            // RDD - endX is what PDF considers to be the x coordinate of the
            // end position of the text. We use it in computing our metrics below.
            endOfLastTextX = positionX + positionWidth;
            // add it to the list
            if (characterValue != null) {
                if (startOfPage && lastPosition == null) {
                    writeParagraphStart();// not sure this is correct for RTL?
                }
                line.add(new LineItem(position));
            }
            maxHeightForLine = Math.max(maxHeightForLine, positionHeight);
            // minYTopForLine is maintained here but not read anywhere in this method.
            minYTopForLine = Math.min(minYTopForLine, positionY - positionHeight);
            lastPosition = current;
            if (startOfPage) {
                // The very first position of the page starts both a paragraph and a line.
                lastPosition.setParagraphStart();
                lastPosition.setLineStart();
                lastLineStartPosition = lastPosition;
                startOfPage = false;
            }
            lastWordSpacing = wordSpacing;
            previousAveCharWidth = averageCharWidth;
        }
        // print the final line
        if (line.size() > 0) {
            writeLine(normalize(line));
            writeParagraphEnd();
        }
        endArticle();
    }
    writePageEnd();
}
From source file:com.tekstosense.segmenter.data.Text.java
License:Open Source License
public static Text newFor(TextPosition tp, PDGraphicsState gs, String text) { Text t = new Text(); t.x = tp.getXDirAdj(); t.baseline = tp.getYDirAdj();//from w w w .jav a 2 s . c o m t.font = tp.getFont(); t.strokeColor = gs.getStrokingColor(); t.nonStrokeColor = gs.getNonStrokingColor(); t.run = tp.getUnicode(); t.width = tp.getWidth(); t.height = tp.getHeight(); t.pointSize = tp.getFontSizeInPt(); t.fontSize = tp.getYScale(); t.tempRun = t.run; // Bump the width by the word spacing for each space in tp. /* for (int i=0; i<tp.getCharacter().length(); i++) { Character c = tp.getCharacter().charAt(i); if (c.equals(" ")) { t.width -= tp.getWidthOfSpace(); t.width += tp.getWordSpacing(); } } */ return t; }
From source file:com.yiyihealth.tools.test.DrawPrintTextLocations.java
License:Apache License
/** * Override the default functionality of PDFTextStripper. */// www . j a va 2s. c o m @Override protected void writeString(String string, List<TextPosition> textPositions) throws IOException { for (TextPosition text : textPositions) { System.out.println("String[" + text.getXDirAdj() + "," + text.getYDirAdj() + " fs=" + text.getFontSize() + " xscale=" + text.getXScale() + " height=" + text.getHeightDir() + " space=" + text.getWidthOfSpace() + " width=" + text.getWidthDirAdj() + "]" + text.getUnicode()); // in red: // show rectangles with the "height" (not a real height, but used for text extraction // heuristics, it is 1/2 of the bounding box height and starts at y=0) Rectangle2D.Float rect = new Rectangle2D.Float(text.getXDirAdj(), (text.getYDirAdj() - text.getHeightDir()), text.getWidthDirAdj(), text.getHeightDir()); g2d.setColor(Color.red); g2d.draw(rect); // in blue: // show rectangle with the real vertical bounds, based on the font bounding box y values // usually, the height is identical to what you see when marking text in Adobe Reader PDFont font = text.getFont(); BoundingBox bbox = font.getBoundingBox(); // advance width, bbox height (glyph space) float xadvance = font.getWidth(text.getCharacterCodes()[0]); // todo: should iterate all chars rect = new Rectangle2D.Float(0, bbox.getLowerLeftY(), xadvance, bbox.getHeight()); // glyph space -> user space // note: text.getTextMatrix() is *not* the Text Matrix, it's the Text Rendering Matrix AffineTransform at = text.getTextMatrix().createAffineTransform(); if (font instanceof PDType3Font) { // bbox and font matrix are unscaled at.concatenate(font.getFontMatrix().createAffineTransform()); } else { // bbox and font matrix are already scaled to 1000 at.scale(1 / 1000f, 1 / 1000f); } Shape s = at.createTransformedShape(rect); s = flipAT.createTransformedShape(s); s = rotateAT.createTransformedShape(s); g2d.setColor(Color.blue); g2d.draw(s); } }
From source file:edu.ist.psu.sagnik.research.pdfbox2playground.javatest.DrawPrintTextLocations.java
License:Apache License
/** * Override the default functionality of PDFTextStripper. *//*from ww w. j a v a 2s. c o m*/ @Override protected void writeString(String string, List<TextPosition> textPositions) throws IOException { for (TextPosition text : textPositions) { System.out.println("String[" + text.getXDirAdj() + "," + text.getYDirAdj() + " fs=" + text.getFontSize() + " xscale=" + text.getXScale() + " height=" + text.getHeightDir() + " space=" + text.getWidthOfSpace() + " width=" + text.getWidthDirAdj() + "]" + text.getUnicode()); // in red: // show rectangles with the "height" (not a real height, but used for text extraction // heuristics, it is 1/2 of the bounding box height and starts at y=0) Rectangle2D.Float rect = new Rectangle2D.Float(text.getXDirAdj(), (text.getYDirAdj() - text.getHeightDir()), text.getWidthDirAdj(), text.getHeightDir()); g2d.setColor(Color.red); g2d.draw(rect); // in blue: // show rectangle with the real vertical bounds, based on the font bounding box y values // usually, the height is identical to what you see when marking text in Adobe Reader PDFont font = text.getFont(); BoundingBox bbox = font.getBoundingBox(); // advance width, bbox height (glyph space) float xadvance = font.getWidth(text.getCharacterCodes()[0]); // todo: should iterate all chars rect = new Rectangle2D.Float(0, bbox.getLowerLeftY(), xadvance, bbox.getHeight()); // glyph space -> user space // note: text.getTextMatrix() is *not* the Text Matrix, it's the Text Rendering Matrix AffineTransform at = text.getTextMatrix().createAffineTransform(); if (font instanceof PDType3Font) { // bbox and font matrix are unscaled at.concatenate(font.getFontMatrix().createAffineTransform()); } else { // bbox and font matrix are already scaled to 1000 at.scale(1 / 1000f, 1 / 1000f); } Shape s = at.createTransformedShape(rect); s = flipAT.createTransformedShape(s); s = rotateAT.createTransformedShape(s); g2d.setColor(Color.blue); g2d.draw(s); } }
From source file:uk.org.openeyes.PDFFunctions.java
/** * Override the default functionality of PDFTextStripper. * @param string/* www. j av a 2 s . c o m*/ * @param textPositions * @throws java.io.IOException */ @Override protected void writeString(String string, List<TextPosition> textPositions) throws IOException { for (TextPosition text : textPositions) { System.out.println((char) 27 + "[0m String[" + text.getXDirAdj() + "," + text.getYDirAdj() + " fs=" + text.getFontSize() + " xscale=" + text.getXScale() + " height=" + text.getHeightDir() + " space=" + text.getWidthOfSpace() + " width=" + text.getWidthDirAdj() + "] " + (char) 27 + "[34;43m \t" + text.getUnicode() + " "); } }