List of usage examples for org.apache.pdfbox.pdmodel.interactive.pagenavigation PDThreadBead getRectangle
public PDRectangle getRectangle()
From source file:aplicacion.sistema.indexer.test.PDFTextStripperOrg.java
License:Apache License
/** * This will process a TextPosition object and add the * text to the list of characters on a page. It takes care of * overlapping text./*from ww w . jav a 2s . c om*/ * * @param text The text to process. */ protected void processTextPosition(TextPosition text) { boolean showCharacter = true; if (suppressDuplicateOverlappingText) { showCharacter = false; String textCharacter = text.getCharacter(); float textX = text.getX(); float textY = text.getY(); List sameTextCharacters = (List) characterListMapping.get(textCharacter); if (sameTextCharacters == null) { sameTextCharacters = new ArrayList(); characterListMapping.put(textCharacter, sameTextCharacters); } // RDD - Here we compute the value that represents the end of the rendered // text. This value is used to determine whether subsequent text rendered // on the same line overwrites the current text. // // We subtract any positive padding to handle cases where extreme amounts // of padding are applied, then backed off (not sure why this is done, but there // are cases where the padding is on the order of 10x the character width, and // the TJ just backs up to compensate after each character). Also, we subtract // an amount to allow for kerning (a percentage of the width of the last // character). // boolean suppressCharacter = false; float tolerance = (text.getWidth() / textCharacter.length()) / 3.0f; for (int i = 0; i < sameTextCharacters.size() && textCharacter != null; i++) { TextPosition character = (TextPosition) sameTextCharacters.get(i); String charCharacter = character.getCharacter(); float charX = character.getX(); float charY = character.getY(); //only want to suppress if (charCharacter != null && //charCharacter.equals( textCharacter ) && within(charX, textX, tolerance) && within(charY, textY, tolerance)) { suppressCharacter = true; } } if (!suppressCharacter) { sameTextCharacters.add(text); showCharacter = true; } } if (showCharacter) { //if we are showing the character then we need to determine which //article it belongs to. int foundArticleDivisionIndex = -1; int notFoundButFirstLeftAndAboveArticleDivisionIndex = -1; int notFoundButFirstLeftArticleDivisionIndex = -1; int notFoundButFirstAboveArticleDivisionIndex = -1; float x = text.getX(); float y = text.getY(); if (shouldSeparateByBeads) { for (int i = 0; i < pageArticles.size() && foundArticleDivisionIndex == -1; i++) { PDThreadBead bead = (PDThreadBead) pageArticles.get(i); if (bead != null) { PDRectangle rect = bead.getRectangle(); if (rect.contains(x, y)) { foundArticleDivisionIndex = i * 2 + 1; } else if ((x < rect.getLowerLeftX() || y < rect.getUpperRightY()) && notFoundButFirstLeftAndAboveArticleDivisionIndex == -1) { notFoundButFirstLeftAndAboveArticleDivisionIndex = i * 2; } else if (x < rect.getLowerLeftX() && notFoundButFirstLeftArticleDivisionIndex == -1) { notFoundButFirstLeftArticleDivisionIndex = i * 2; } else if (y < rect.getUpperRightY() && notFoundButFirstAboveArticleDivisionIndex == -1) { notFoundButFirstAboveArticleDivisionIndex = i * 2; } } else { foundArticleDivisionIndex = 0; } } } else { foundArticleDivisionIndex = 0; } int articleDivisionIndex = -1; if (foundArticleDivisionIndex != -1) { articleDivisionIndex = foundArticleDivisionIndex; } else if (notFoundButFirstLeftAndAboveArticleDivisionIndex != -1) { articleDivisionIndex = notFoundButFirstLeftAndAboveArticleDivisionIndex; } else if (notFoundButFirstLeftArticleDivisionIndex != -1) { articleDivisionIndex = notFoundButFirstLeftArticleDivisionIndex; } else if (notFoundButFirstAboveArticleDivisionIndex != -1) { articleDivisionIndex = notFoundButFirstAboveArticleDivisionIndex; } else { articleDivisionIndex = charactersByArticle.size() - 1; } List textList = (List) charactersByArticle.get(articleDivisionIndex); /* In the wild, some PDF encoded documents put diacritics (accents on * top of characters) into a separate Tj element. When displaying them * graphically, the two chunks get overlayed. With text output though, * we need to do the overlay. This code recombines the diacritic with * its associated character if the two are consecutive. */ if (textList.isEmpty()) { textList.add(text); } else { /* test if we overlap the previous entry. * Note that we are making an assumption that we need to only look back * one TextPosition to find what we are overlapping. * This may not always be true. */ TextPosition previousTextPosition = (TextPosition) textList.get(textList.size() - 1); if (text.isDiacritic() && previousTextPosition.contains(text)) { previousTextPosition.mergeDiacritic(text, normalize); } /* If the previous TextPosition was the diacritic, merge it into this * one and remove it from the list. */ else if (previousTextPosition.isDiacritic() && text.contains(previousTextPosition)) { text.mergeDiacritic(previousTextPosition, normalize); textList.remove(textList.size() - 1); textList.add(text); } else { textList.add(text); } } } }
From source file:com.repeatability.pdf.PDFTextStripper.java
License:Apache License
private void fillBeadRectangles(PDPage page) { beadRectangles = new ArrayList<PDRectangle>(); for (PDThreadBead bead : page.getThreadBeads()) { if (bead == null) { // can't skip, because of null entry handling in processTextPosition() beadRectangles.add(null);/*from w w w . j av a 2 s. c om*/ continue; } PDRectangle rect = bead.getRectangle(); // bead rectangle is in PDF coordinates (y=0 is bottom), // glyphs are in image coordinates (y=0 is top), // so we must flip PDRectangle mediaBox = page.getMediaBox(); float upperRightY = mediaBox.getUpperRightY() - rect.getLowerLeftY(); float lowerLeftY = mediaBox.getUpperRightY() - rect.getUpperRightY(); rect.setLowerLeftY(lowerLeftY); rect.setUpperRightY(upperRightY); // adjust for cropbox PDRectangle cropBox = page.getCropBox(); if (cropBox.getLowerLeftX() != 0 || cropBox.getLowerLeftY() != 0) { rect.setLowerLeftX(rect.getLowerLeftX() - cropBox.getLowerLeftX()); rect.setLowerLeftY(rect.getLowerLeftY() - cropBox.getLowerLeftY()); rect.setUpperRightX(rect.getUpperRightX() - cropBox.getLowerLeftX()); rect.setUpperRightY(rect.getUpperRightY() - cropBox.getLowerLeftY()); } beadRectangles.add(rect); } }
From source file:com.yiyihealth.tools.test.DrawPrintTextLocations.java
License:Apache License
private void stripPage(int page) throws IOException { PDFRenderer pdfRenderer = new PDFRenderer(document); image = pdfRenderer.renderImage(page, SCALE); PDPage pdPage = document.getPage(page); PDRectangle cropBox = pdPage.getCropBox(); // flip y-axis flipAT = new AffineTransform(); flipAT.translate(0, pdPage.getBBox().getHeight()); flipAT.scale(1, -1);// ww w .j a v a 2 s .c om // page may be rotated rotateAT = new AffineTransform(); int rotation = pdPage.getRotation(); if (rotation != 0) { PDRectangle mediaBox = pdPage.getMediaBox(); switch (rotation) { case 90: rotateAT.translate(mediaBox.getHeight(), 0); break; case 270: rotateAT.translate(0, mediaBox.getWidth()); break; case 180: rotateAT.translate(mediaBox.getWidth(), mediaBox.getHeight()); break; default: break; } rotateAT.rotate(Math.toRadians(rotation)); } g2d = image.createGraphics(); g2d.setStroke(new BasicStroke(0.1f)); g2d.scale(SCALE, SCALE); setStartPage(page + 1); setEndPage(page + 1); Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream()); writeText(document, dummy); // beads in green g2d.setStroke(new BasicStroke(0.4f)); List<PDThreadBead> pageArticles = pdPage.getThreadBeads(); for (PDThreadBead bead : pageArticles) { PDRectangle r = bead.getRectangle(); GeneralPath p = r .transform(Matrix.getTranslateInstance(-cropBox.getLowerLeftX(), cropBox.getLowerLeftY())); Shape s = flipAT.createTransformedShape(p); s = rotateAT.createTransformedShape(s); g2d.setColor(Color.green); g2d.draw(s); } g2d.dispose(); String imageFilename = filename; int pt = imageFilename.lastIndexOf('.'); imageFilename = imageFilename.substring(0, pt) + "-marked-" + (page + 1) + ".png"; ImageIO.write(image, "png", new File(imageFilename)); }
From source file:de.tudarmstadt.ukp.dkpro.core.io.pdf.PdfLayoutEventStripper.java
License:Apache License
/** * This will show add a character to the list of characters to be printed to the text file. * // ww w. j a va2 s.co m * @param text * The description of the character to display. */ @Override protected void processTextPosition(final TextPosition text) { boolean showCharacter = true; if (suppressDuplicateOverlappingText) { showCharacter = false; final String textCharacter = text.getCharacter(); final float textX = text.getX(); final float textY = text.getY(); List<TextPosition> sameTextCharacters = characterListMapping.get(textCharacter); if (sameTextCharacters == null) { sameTextCharacters = new ArrayList<TextPosition>(); characterListMapping.put(textCharacter, sameTextCharacters); } // RDD - Here we compute the value that represents the end of the // rendered // text. This value is used to determine whether subsequent text // rendered // on the same line overwrites the current text. // // We subtract any positive padding to handle cases where extreme // amounts // of padding are applied, then backed off (not sure why this is // done, but there // are cases where the padding is on the order of 10x the character // width, and // the TJ just backs up to compensate after each character). Also, // we subtract // an amount to allow for kerning (a percentage of the width of the // last // character). // boolean suppressCharacter = false; final float tolerance = (text.getWidth() / textCharacter.length()) / 3.0f; for (int i = 0; i < sameTextCharacters.size() && textCharacter != null; i++) { final TextPosition character = sameTextCharacters.get(i); final String charCharacter = character.getCharacter(); final float charX = character.getX(); final float charY = character.getY(); // only want to suppress if (charCharacter != null && // charCharacter.equals( textCharacter ) && within(charX, textX, tolerance) && within(charY, textY, tolerance)) { suppressCharacter = true; } } if (!suppressCharacter && (text.getCharacter() != null) && (text.getCharacter().length() > 0)) { sameTextCharacters.add(text); showCharacter = true; } } if (showCharacter) { // if we are showing the character then we need to determine which // article it belongs to. int foundArticleDivisionIndex = -1; int notFoundButFirstLeftAndAboveArticleDivisionIndex = -1; int notFoundButFirstLeftArticleDivisionIndex = -1; int notFoundButFirstAboveArticleDivisionIndex = -1; final float x = text.getX(); final float y = text.getY(); if (shouldSeparateByBeads) { for (int i = 0; i < pageArticles.size() && foundArticleDivisionIndex == -1; i++) { final PDThreadBead bead = pageArticles.get(i); if (bead != null) { final PDRectangle rect = bead.getRectangle(); if (rect.contains(x, y)) { foundArticleDivisionIndex = i * 2 + 1; } else if ((x < rect.getLowerLeftX() || y < rect.getUpperRightY()) && notFoundButFirstLeftAndAboveArticleDivisionIndex == -1) { notFoundButFirstLeftAndAboveArticleDivisionIndex = i * 2; } else if (x < rect.getLowerLeftX() && notFoundButFirstLeftArticleDivisionIndex == -1) { notFoundButFirstLeftArticleDivisionIndex = i * 2; } else if (y < rect.getUpperRightY() && notFoundButFirstAboveArticleDivisionIndex == -1) { notFoundButFirstAboveArticleDivisionIndex = i * 2; } } else { foundArticleDivisionIndex = 0; } } } else { foundArticleDivisionIndex = 0; } int articleDivisionIndex = -1; if (foundArticleDivisionIndex != -1) { articleDivisionIndex = foundArticleDivisionIndex; } else if (notFoundButFirstLeftAndAboveArticleDivisionIndex != -1) { articleDivisionIndex = notFoundButFirstLeftAndAboveArticleDivisionIndex; } else if (notFoundButFirstLeftArticleDivisionIndex != -1) { articleDivisionIndex = notFoundButFirstLeftArticleDivisionIndex; } else if (notFoundButFirstAboveArticleDivisionIndex != -1) { articleDivisionIndex = notFoundButFirstAboveArticleDivisionIndex; } else { articleDivisionIndex = charactersByArticle.size() - 1; } final List<TextPosition> textList = charactersByArticle.get(articleDivisionIndex); textList.add(text); } }
From source file:edu.isi.bmkeg.lapdf.extraction.LAPDFTextStripper.java
License:Apache License
/** * This will process a TextPosition object and add the * text to the list of characters on a page. It takes care of * overlapping text.//w ww . ja v a 2s . c om * * @param text The text to process. */ protected void processTextPosition(TextPosition text) { boolean showCharacter = true; if (suppressDuplicateOverlappingText) { showCharacter = false; String textCharacter = text.getCharacter(); float textX = text.getX(); float textY = text.getY(); TreeMap<Float, TreeSet<Float>> sameTextCharacters = characterListMapping.get(textCharacter); if (sameTextCharacters == null) { sameTextCharacters = new TreeMap<Float, TreeSet<Float>>(); characterListMapping.put(textCharacter, sameTextCharacters); } // RDD - Here we compute the value that represents the end of the rendered // text. This value is used to determine whether subsequent text rendered // on the same line overwrites the current text. // // We subtract any positive padding to handle cases where extreme amounts // of padding are applied, then backed off (not sure why this is done, but there // are cases where the padding is on the order of 10x the character width, and // the TJ just backs up to compensate after each character). Also, we subtract // an amount to allow for kerning (a percentage of the width of the last // character). // boolean suppressCharacter = false; float tolerance = (text.getWidth() / textCharacter.length()) / 3.0f; SortedMap<Float, TreeSet<Float>> xMatches = sameTextCharacters.subMap(textX - tolerance, textX + tolerance); for (TreeSet<Float> xMatch : xMatches.values()) { SortedSet<Float> yMatches = xMatch.subSet(textY - tolerance, textY + tolerance); if (!yMatches.isEmpty()) { suppressCharacter = true; break; } } if (!suppressCharacter) { TreeSet<Float> ySet = sameTextCharacters.get(textX); if (ySet == null) { ySet = new TreeSet<Float>(); sameTextCharacters.put(textX, ySet); } ySet.add(textY); showCharacter = true; } } if (showCharacter) { //if we are showing the character then we need to determine which //article it belongs to. int foundArticleDivisionIndex = -1; int notFoundButFirstLeftAndAboveArticleDivisionIndex = -1; int notFoundButFirstLeftArticleDivisionIndex = -1; int notFoundButFirstAboveArticleDivisionIndex = -1; float x = text.getX(); float y = text.getY(); if (shouldSeparateByBeads) { for (int i = 0; i < pageArticles.size() && foundArticleDivisionIndex == -1; i++) { PDThreadBead bead = (PDThreadBead) pageArticles.get(i); if (bead != null) { PDRectangle rect = bead.getRectangle(); if (rect.contains(x, y)) { foundArticleDivisionIndex = i * 2 + 1; } else if ((x < rect.getLowerLeftX() || y < rect.getUpperRightY()) && notFoundButFirstLeftAndAboveArticleDivisionIndex == -1) { notFoundButFirstLeftAndAboveArticleDivisionIndex = i * 2; } else if (x < rect.getLowerLeftX() && notFoundButFirstLeftArticleDivisionIndex == -1) { notFoundButFirstLeftArticleDivisionIndex = i * 2; } else if (y < rect.getUpperRightY() && notFoundButFirstAboveArticleDivisionIndex == -1) { notFoundButFirstAboveArticleDivisionIndex = i * 2; } } else { foundArticleDivisionIndex = 0; } } } else { foundArticleDivisionIndex = 0; } int articleDivisionIndex = -1; if (foundArticleDivisionIndex != -1) { articleDivisionIndex = foundArticleDivisionIndex; } else if (notFoundButFirstLeftAndAboveArticleDivisionIndex != -1) { articleDivisionIndex = notFoundButFirstLeftAndAboveArticleDivisionIndex; } else if (notFoundButFirstLeftArticleDivisionIndex != -1) { articleDivisionIndex = notFoundButFirstLeftArticleDivisionIndex; } else if (notFoundButFirstAboveArticleDivisionIndex != -1) { articleDivisionIndex = notFoundButFirstAboveArticleDivisionIndex; } else { articleDivisionIndex = charactersByArticle.size() - 1; } List<TextPosition> textList = (List<TextPosition>) charactersByArticle.get(articleDivisionIndex); /* In the wild, some PDF encoded documents put diacritics (accents on * top of characters) into a separate Tj element. When displaying them * graphically, the two chunks get overlayed. With text output though, * we need to do the overlay. This code recombines the diacritic with * its associated character if the two are consecutive. */ if (textList.isEmpty()) { textList.add(text); } else { /* test if we overlap the previous entry. * Note that we are making an assumption that we need to only look back * one TextPosition to find what we are overlapping. * This may not always be true. */ TextPosition previousTextPosition = (TextPosition) textList.get(textList.size() - 1); if (text.isDiacritic() && previousTextPosition.contains(text)) { previousTextPosition.mergeDiacritic(text, normalize); } /* If the previous TextPosition was the diacritic, merge it into this * one and remove it from the list. */ else if (previousTextPosition.isDiacritic() && text.contains(previousTextPosition)) { text.mergeDiacritic(previousTextPosition, normalize); textList.remove(textList.size() - 1); textList.add(text); } else { textList.add(text); } } } }
From source file:onyx.core.parser.PDFTextStripper.java
License:Apache License
/** * This will process a TextPosition object and add the * text to the list of characters on a page. It takes care of * overlapping text.//from ww w . j a v a 2 s. c o m * * @param text The text to process. */ protected void processTextPosition(TextPosition text) { boolean showCharacter = true; if (suppressDuplicateOverlappingText) { showCharacter = false; String textCharacter = text.getCharacter(); float textX = text.getX(); float textY = text.getY(); TreeMap<Float, TreeSet<Float>> sameTextCharacters = characterListMapping.get(textCharacter); if (sameTextCharacters == null) { sameTextCharacters = new TreeMap<Float, TreeSet<Float>>(); characterListMapping.put(textCharacter, sameTextCharacters); } // RDD - Here we compute the value that represents the end of the rendered // text. This value is used to determine whether subsequent text rendered // on the same line overwrites the current text. // // We subtract any positive padding to handle cases where extreme amounts // of padding are applied, then backed off (not sure why this is done, but there // are cases where the padding is on the order of 10x the character width, and // the TJ just backs up to compensate after each character). Also, we subtract // an amount to allow for kerning (a percentage of the width of the last // character). // boolean suppressCharacter = false; float tolerance = (text.getWidth() / textCharacter.length()) / 3.0f; SortedMap<Float, TreeSet<Float>> xMatches = sameTextCharacters.subMap(textX - tolerance, textX + tolerance); for (TreeSet<Float> xMatch : xMatches.values()) { SortedSet<Float> yMatches = xMatch.subSet(textY - tolerance, textY + tolerance); if (!yMatches.isEmpty()) { suppressCharacter = true; break; } } if (!suppressCharacter) { TreeSet<Float> ySet = sameTextCharacters.get(textX); if (ySet == null) { ySet = new TreeSet<Float>(); sameTextCharacters.put(textX, ySet); } ySet.add(textY); showCharacter = true; } } if (showCharacter) { //if we are showing the character then we need to determine which //article it belongs to. int foundArticleDivisionIndex = -1; int notFoundButFirstLeftAndAboveArticleDivisionIndex = -1; int notFoundButFirstLeftArticleDivisionIndex = -1; int notFoundButFirstAboveArticleDivisionIndex = -1; float x = text.getX(); float y = text.getY(); if (shouldSeparateByBeads) { for (int i = 0; i < pageArticles.size() && foundArticleDivisionIndex == -1; i++) { PDThreadBead bead = (PDThreadBead) pageArticles.get(i); if (bead != null) { PDRectangle rect = bead.getRectangle(); if (rect.contains(x, y)) { foundArticleDivisionIndex = i * 2 + 1; } else if ((x < rect.getLowerLeftX() || y < rect.getUpperRightY()) && notFoundButFirstLeftAndAboveArticleDivisionIndex == -1) { notFoundButFirstLeftAndAboveArticleDivisionIndex = i * 2; } else if (x < rect.getLowerLeftX() && notFoundButFirstLeftArticleDivisionIndex == -1) { notFoundButFirstLeftArticleDivisionIndex = i * 2; } else if (y < rect.getUpperRightY() && notFoundButFirstAboveArticleDivisionIndex == -1) { notFoundButFirstAboveArticleDivisionIndex = i * 2; } } else { foundArticleDivisionIndex = 0; } } } else { foundArticleDivisionIndex = 0; } int articleDivisionIndex = -1; if (foundArticleDivisionIndex != -1) { articleDivisionIndex = foundArticleDivisionIndex; } else if (notFoundButFirstLeftAndAboveArticleDivisionIndex != -1) { articleDivisionIndex = notFoundButFirstLeftAndAboveArticleDivisionIndex; } else if (notFoundButFirstLeftArticleDivisionIndex != -1) { articleDivisionIndex = notFoundButFirstLeftArticleDivisionIndex; } else if (notFoundButFirstAboveArticleDivisionIndex != -1) { articleDivisionIndex = notFoundButFirstAboveArticleDivisionIndex; } else { articleDivisionIndex = charactersByArticle.size() - 1; } List<TextPosition> textList = (List<TextPosition>) charactersByArticle.get(articleDivisionIndex); /* In the wild, some PDF encoded documents put diacritics (accents on * top of characters) into a separate Tj element. When displaying them * graphically, the two chunks get overlayed. With text output though, * we need to do the overlay. This code recombines the diacritic with * its associated character if the two are consecutive. */ if (textList.isEmpty()) { textList.add(text); } else { /* test if we overlap the previous entry. * Note that we are making an assumption that we need to only look back * one TextPosition to find what we are overlapping. * This may not always be true. */ TextPosition previousTextPosition = (TextPosition) textList.get(textList.size() - 1); if (text.isDiacritic() && previousTextPosition.contains(text)) { previousTextPosition.mergeDiacritic(text, normalize); } /* If the previous TextPosition was the diacritic, merge it into this * one and remove it from the list. */ else if (previousTextPosition.isDiacritic() && text.contains(previousTextPosition)) { text.mergeDiacritic(previousTextPosition, normalize); textList.remove(textList.size() - 1); textList.add(text); } else { textList.add(text); } } } }
From source file:org.polarsys.kitalpha.doc.doc2model.tikaparsing.pdf.Doc2ModelTextStripper.java
License:Apache License
/** * This will process a TextPosition object and add the * text to the list of characters on a page. It takes care of * overlapping text.// www . j a v a 2 s . co m * * @param text The text to process. */ protected void processTextPosition(TextPosition rawtext) { StylizedTextPosition text = (StylizedTextPosition) processStyle(rawtext); boolean showCharacter = true; if (suppressDuplicateOverlappingText) { showCharacter = false; String textCharacter = text.getCharacter(); float textX = text.getX(); float textY = text.getY(); List<TextPosition> sameTextCharacters = (List<TextPosition>) characterListMapping.get(textCharacter); if (sameTextCharacters == null) { sameTextCharacters = new ArrayList<TextPosition>(); characterListMapping.put(textCharacter, sameTextCharacters); } // RDD - Here we compute the value that represents the end of the rendered // text. This value is used to determine whether subsequent text rendered // on the same line overwrites the current text. // // We subtract any positive padding to handle cases where extreme amounts // of padding are applied, then backed off (not sure why this is done, but there // are cases where the padding is on the order of 10x the character width, and // the TJ just backs up to compensate after each character). Also, we subtract // an amount to allow for kerning (a percentage of the width of the last // character). // boolean suppressCharacter = false; float tolerance = (text.getWidth() / textCharacter.length()) / 3.0f; for (int i = 0; i < sameTextCharacters.size() && textCharacter != null; i++) { TextPosition character = sameTextCharacters.get(i); String charCharacter = character.getCharacter(); float charX = character.getX(); float charY = character.getY(); //only want to suppress if (charCharacter != null && //charCharacter.equals( textCharacter ) && within(charX, textX, tolerance) && within(charY, textY, tolerance)) { suppressCharacter = true; } } if (!suppressCharacter) { sameTextCharacters.add(text); showCharacter = true; } } if (showCharacter) { //if we are showing the character then we need to determine which //article it belongs to. int foundArticleDivisionIndex = -1; int notFoundButFirstLeftAndAboveArticleDivisionIndex = -1; int notFoundButFirstLeftArticleDivisionIndex = -1; int notFoundButFirstAboveArticleDivisionIndex = -1; float x = text.getX(); float y = text.getY(); if (shouldSeparateByBeads) { for (int i = 0; i < pageArticles.size() && foundArticleDivisionIndex == -1; i++) { PDThreadBead bead = (PDThreadBead) pageArticles.get(i); if (bead != null) { PDRectangle rect = bead.getRectangle(); if (rect.contains(x, y)) { foundArticleDivisionIndex = i * 2 + 1; } else if ((x < rect.getLowerLeftX() || y < rect.getUpperRightY()) && notFoundButFirstLeftAndAboveArticleDivisionIndex == -1) { notFoundButFirstLeftAndAboveArticleDivisionIndex = i * 2; } else if (x < rect.getLowerLeftX() && notFoundButFirstLeftArticleDivisionIndex == -1) { notFoundButFirstLeftArticleDivisionIndex = i * 2; } else if (y < rect.getUpperRightY() && notFoundButFirstAboveArticleDivisionIndex == -1) { notFoundButFirstAboveArticleDivisionIndex = i * 2; } } else { foundArticleDivisionIndex = 0; } } } else { foundArticleDivisionIndex = 0; } int articleDivisionIndex = -1; if (foundArticleDivisionIndex != -1) { articleDivisionIndex = foundArticleDivisionIndex; } else if (notFoundButFirstLeftAndAboveArticleDivisionIndex != -1) { articleDivisionIndex = notFoundButFirstLeftAndAboveArticleDivisionIndex; } else if (notFoundButFirstLeftArticleDivisionIndex != -1) { articleDivisionIndex = notFoundButFirstLeftArticleDivisionIndex; } else if (notFoundButFirstAboveArticleDivisionIndex != -1) { articleDivisionIndex = notFoundButFirstAboveArticleDivisionIndex; } else { articleDivisionIndex = charactersByArticle.size() - 1; } List<TextPosition> textList = (List<TextPosition>) charactersByArticle.get(articleDivisionIndex); /* In the wild, some PDF encoded documents put diacritics (accents on * top of characters) into a separate Tj element. When displaying them * graphically, the two chunks get overlayed. With text output though, * we need to do the overlay. This code recombines the diacritic with * its associated character if the two are consecutive. */ if (textList.isEmpty()) { textList.add(text); } else { /* test if we overlap the previous entry. * Note that we are making an assumption that we need to only look back * one TextPosition to find what we are overlapping. * This may not always be true. */ TextPosition previousTextPosition = (TextPosition) textList.get(textList.size() - 1); if (text.isDiacritic() && previousTextPosition.contains(text)) { previousTextPosition.mergeDiacritic(text, normalize); } /* If the previous TextPosition was the diacritic, merge it into this * one and remove it from the list. */ else if (previousTextPosition.isDiacritic() && text.contains(previousTextPosition)) { text.mergeDiacritic(previousTextPosition, normalize); textList.remove(textList.size() - 1); textList.add(text); } else { textList.add(text); } } } }