List of usage examples for org.apache.pdfbox.text PDFTextStripperByArea getTextForRegion
public String getTextForRegion(String regionName)
From source file:com.amolik.misc.ExtractTextByArea.java
License:Apache License
/** * This will print the documents text in a certain area. * * @param args The command line arguments. * * @throws IOException If there is an error parsing the document. */// ww w . ja va 2 s. com public static void main(String[] args) throws IOException { //args[0]= "E:\\Automation\\uphillit\\Fiscal_demo_data.pdf"; // if( args.length != 1 ) // { // usage(); // } // else // { PDDocument document = null; try { document = PDDocument.load(new File("E:\\Automation\\uphillit\\Fiscal_demo_data.pdf")); int numberOfPages = document.getNumberOfPages(); if (numberOfPages > 0) { PDPage page = (PDPage) document.getPages().get(0); System.out.println(page.getContents()); } PDFTextStripperByArea stripper = new PDFTextStripperByArea(); stripper.setSortByPosition(true); Rectangle rect = new Rectangle(3, 1, 600, 6000); stripper.addRegion("class1", rect); PDPage firstPage = document.getPage(0); stripper.extractRegions(firstPage); System.out.println("Text in the area:" + rect); System.out.println(stripper.getTextForRegion("class1")); } finally { if (document != null) { document.close(); } } // } }
From source file:net.bookinaction.ExtractAnnotations.java
License:Apache License
public void doJob(String job, Float[] pA) throws IOException { PDDocument document = null;//from www.j a v a2 s. co m Stamper s = new Stamper(); // utility class final String job_file = job + ".pdf"; final String dic_file = job + "-dict.txt"; final String new_job = job + "-new.pdf"; PrintWriter writer = new PrintWriter(dic_file); ImageLocationListener imageLocationsListener = new ImageLocationListener(); AnnotationMaker annotMaker = new AnnotationMaker(); try { document = PDDocument.load(new File(job_file)); int pageNum = 0; for (PDPage page : document.getPages()) { pageNum++; PDRectangle cropBox = page.getCropBox(); List<PDAnnotation> annotations = page.getAnnotations(); // extract image locations List<Rectangle2D> imageRects = new ArrayList<Rectangle2D>(); imageLocationsListener.setImageRects(imageRects); imageLocationsListener.processPage(page); int im = 0; for (Rectangle2D pdImageRect : imageRects) { s.recordImage(writer, pageNum, "[im" + im + "]", (Rectangle2D.Float) pdImageRect); annotations.add(annotMaker.squareAnnotation(Color.YELLOW, (Rectangle2D.Float) pdImageRect, "[im" + im + "]")); im++; } PDFTextStripperByArea stripper = new PDFTextStripperByArea(); int j = 0; List<PDAnnotation> viableAnnots = new ArrayList(); for (PDAnnotation annot : annotations) { if (annot instanceof PDAnnotationTextMarkup || annot instanceof PDAnnotationLink) { stripper.addRegion(Integer.toString(j++), s.getAwtRect( s.adjustedRect(annot.getRectangle(), pA[0], pA[1], pA[2], pA[3]), cropBox)); viableAnnots.add(annot); } else if (annot instanceof PDAnnotationPopup || annot instanceof PDAnnotationText) { viableAnnots.add(annot); } } stripper.extractRegions(page); List<PDRectangle> rects = new ArrayList<PDRectangle>(); List<String> comments = new ArrayList<String>(); List<String> highlightTexts = new ArrayList<String>(); j = 0; for (PDAnnotation viableAnnot : viableAnnots) { if (viableAnnot instanceof PDAnnotationTextMarkup) { String highlightText = 
stripper.getTextForRegion(Integer.toString(j++)); String withoutCR = highlightText.replace((char) 0x0A, '^'); String comment = viableAnnot.getContents(); String colorString = String.format("%06x", viableAnnot.getColor().toRGB()); PDRectangle aRect = s.adjustedRect(viableAnnot.getRectangle(), pA[4], pA[5], pA[6], pA[7]); rects.add(aRect); comments.add(comment); highlightTexts.add(highlightText); s.recordTextMarkup(writer, pageNum, comment, withoutCR, aRect, colorString); } else if (viableAnnot instanceof PDAnnotationText) { String comment = viableAnnot.getContents(); String colorString = String.format("%06x", viableAnnot.getColor().toRGB()); for (Rectangle2D pdImageRect : imageRects) { if (pdImageRect.contains(viableAnnot.getRectangle().getLowerLeftX(), viableAnnot.getRectangle().getLowerLeftY())) { s.recordTextMarkup(writer, pageNum, comment, "", (Rectangle2D.Float) pdImageRect, colorString); annotations.add(annotMaker.squareAnnotation(Color.GREEN, (Rectangle2D.Float) pdImageRect, comment)); } ; } } } PDPageContentStream canvas = new PDPageContentStream(document, page, true, true, true); int i = 0; for (PDRectangle pdRect : rects) { String comment = comments.get(i); String highlightText = highlightTexts.get(i); //annotations.add(linkAnnotation(pdRect, comment, highlightText)); //annotations.add(annotationSquareCircle(pdRect, BLUE)); s.showBox(canvas, new Rectangle2D.Float(pdRect.getLowerLeftX(), pdRect.getUpperRightY(), pdRect.getWidth(), pdRect.getHeight()), cropBox, Color.BLUE); i++; } canvas.close(); } writer.close(); document.save(new_job); } finally { if (document != null) { document.close(); } } }
From source file:org.xwiki.test.misc.PDFTest.java
License:Open Source License
/** * Code adapted from http://www.docjar.com/html/api/org/apache/pdfbox/examples/pdmodel/PrintURLs.java.html *//*from w w w .j a v a 2 s . c om*/ private Map<String, PDAction> extractLinks(PDPage page) throws Exception { Map<String, PDAction> links = new HashMap<String, PDAction>(); PDFTextStripperByArea stripper = new PDFTextStripperByArea(); List<PDAnnotation> annotations = page.getAnnotations(); // First setup the text extraction regions. for (int j = 0; j < annotations.size(); j++) { PDAnnotation annotation = annotations.get(j); if (annotation instanceof PDAnnotationLink) { PDAnnotationLink link = (PDAnnotationLink) annotation; PDRectangle rect = link.getRectangle(); // Need to reposition link rectangle to match text space. float x = rect.getLowerLeftX(); float y = rect.getUpperRightY(); float width = rect.getWidth(); float height = rect.getHeight(); int rotation = page.getRotation(); if (rotation == 0) { PDRectangle pageSize = page.getMediaBox(); y = pageSize.getHeight() - y; } else if (rotation == 90) { // Do nothing. } Rectangle2D.Float awtRect = new Rectangle2D.Float(x, y, width, height); stripper.addRegion(String.valueOf(j), awtRect); } } stripper.extractRegions(page); for (int j = 0; j < annotations.size(); j++) { PDAnnotation annotation = annotations.get(j); if (annotation instanceof PDAnnotationLink) { PDAnnotationLink link = (PDAnnotationLink) annotation; String label = stripper.getTextForRegion(String.valueOf(j)).trim(); links.put(label, link.getAction()); } } return links; }
From source file:org.xwiki.test.misc.PDFTest.java
License:Open Source License
/**
 * Returns the trimmed text found on the destination's page inside the rectangle supplied by
 * {@code getRectangleBelowDestination}.
 */
private String getDestinationText(PDPageXYZDestination destination) throws Exception {
    final String regionName = "destination";

    PDFTextStripperByArea areaStripper = new PDFTextStripperByArea();
    areaStripper.addRegion(regionName, getRectangleBelowDestination(destination));
    areaStripper.extractRegions(destination.getPage());

    String regionText = areaStripper.getTextForRegion(regionName);
    return regionText.trim();
}
From source file:pdfconverter.converter3.java
/**
 * Reads a fixed grid of table cells from a range of pages of the first PDF found in
 * {@code path}, writes them to the Excel workbook at {@code output}, and then folds rows whose
 * first column is shorter than five characters into column 4 of the row above.
 *
 * @param args unused
 * @throws IOException if no PDF is found, or the PDF or workbook cannot be read or written
 * @throws WriteException if the workbook cannot be written
 */
@SuppressWarnings("deprecation")
public static void main(String[] args) throws IOException, WriteException {
    final int firstPage = 800;
    final int pageStop = 1491;
    final int maxPages = 800;

    workbook = Workbook.createWorkbook(new File(output));
    try {
        System.out.println("File created");
        WritableSheet sheet = workbook.createSheet("Page", 0);
        ExcelStart(sheet);

        File[] pdfFiles = new File(path).listFiles(new FilenameFilter() {
            @Override
            public boolean accept(File dir, String name) {
                return name.endsWith(".pdf");
            }
        });
        // listFiles returns null when the path is not a readable directory.
        if (pdfFiles == null || pdfFiles.length == 0) {
            throw new IOException("No .pdf files found in " + path);
        }

        try (PDDocument pd = PDDocument.load(new File(pdfFiles[0].getPath()))) {
            // Never ask for a page the document does not have.
            int lastPage = Math.min(pageStop, pd.getNumberOfPages());
            int sheetRow = 1;
            int parsedPages = 0;
            for (int curpage = firstPage; curpage < lastPage && parsedPages < maxPages; curpage++) {
                System.out.println("Now parsing page " + curpage);
                sheetRow = extractPageRows(pd.getPage(curpage), sheet, sheetRow);
                parsedPages++;
            }
        }

        System.out.println("Data extracted to Excel, parsing through Excel data...");
        mergeContinuationRows(sheet);
        System.out.println("Data extraction complete");

        workbook.write();
    } finally {
        // Release the workbook even when extraction fails part-way through.
        workbook.close();
    }
}

/** Copies the 80-row by 11-column cell grid of one page into the sheet; returns the next free sheet row. */
private static int extractPageRows(PDPage page, WritableSheet sheet, int firstSheetRow)
        throws IOException, WriteException {
    // {x, width} of each table column, in the units used by the extraction rectangles.
    final float[][] columns = {
        {0, 80}, {80, 30}, {110, 40}, {150, 120}, {270, 120}, {390, 40},
        {430, 46}, {476, 82}, {558, 65}, {623, 66}, {689, 100}
    };
    final int firstRowY = 116;
    final int rowHeight = 9;
    final int rowsPerPage = 80;

    PDFTextStripperByArea areaSearch = new PDFTextStripperByArea();
    int sheetRow = firstSheetRow;
    for (int curRow = 0; curRow < rowsPerPage; curRow++) {
        int rowY = firstRowY + curRow * rowHeight;
        for (int col = 0; col < columns.length; col++) {
            String name = "cell-" + (col + 1) + "-" + curRow;
            Rectangle2D.Float cell =
                    new Rectangle2D.Float(columns[col][0], rowY, columns[col][1], rowHeight);

            // NOTE(review): the page is re-parsed for every single cell, as in the original;
            // registering all regions first and extracting once per page would be much
            // faster, but confirm the extracted text stays identical before changing it.
            areaSearch.addRegion(name, cell);
            areaSearch.extractRegions(page);
            String text = areaSearch.getTextForRegion(name);
            areaSearch.removeRegion(name);

            AddCell(sheet, text, col, sheetRow);
        }
        sheetRow++;
    }
    return sheetRow;
}

/** Appends column 4 of every row with a short (under 5 characters) first column to the row above, then removes it. */
private static void mergeContinuationRows(WritableSheet sheet) throws WriteException {
    // Row 0 has no predecessor to merge into, so scanning starts at row 1.
    int row = 1;
    while (row < sheet.getRows()) {
        Cell first = sheet.getCell(0, row);
        if (first.getContents().length() >= 5) {
            row++;
            continue;
        }

        WritableCell previous = sheet.getWritableCell(4, row - 1);
        WritableCell current = sheet.getWritableCell(4, row);
        String content = previous.getContents() + current.getContents();
        content = content.replace("\n", "").replace("\r", "");

        // NOTE(review): assumes the cell above was written as a Label; verify against AddCell.
        Label target = (Label) previous;
        target.setString(content);

        // Removing the row shifts the next one into this index, so the index is not advanced.
        sheet.removeRow(row);
    }
}
From source file:uk.ac.leeds.ccg.andyt.rdl.web.RDL_ParsePDF.java
/** * https://svn.apache.org/viewvc/pdfbox/trunk/examples/ Based on * https://svn.apache.org/viewvc/pdfbox/trunk/examples/src/main/java/org/apache/pdfbox/examples/pdmodel/PrintURLs.java?view=markup&pathrev=1703066 * * @param f/*from ww w.j a v a 2s .c o m*/ * @param filter * @param fis * @return * @throws IOException * @throws TikaException * @throws SAXException */ public static ArrayList<String[]> parseForLinks(File f, String filter, FileInputStream fis) throws IOException, TikaException, SAXException { ArrayList<String[]> result; result = new ArrayList<String[]>(); PDDocument doc = PDDocument.load(f); int pageNum = 0; for (PDPage page : doc.getPages()) { pageNum++; // if (pageNum == 11) { //Degug test hack System.out.println("Parsing page " + pageNum); PDFTextStripperByArea stripper = new PDFTextStripperByArea(); List<PDAnnotation> annotations = page.getAnnotations(); //first setup text extraction regions for (int j = 0; j < annotations.size(); j++) { PDAnnotation annot = annotations.get(j); if (annot instanceof PDAnnotationLink) { PDAnnotationLink link = (PDAnnotationLink) annot; PDRectangle rect = link.getRectangle(); //need to reposition link rectangle to match text space float x = rect.getLowerLeftX(); float y = rect.getUpperRightY(); float width = rect.getWidth(); float height = rect.getHeight(); int rotation = page.getRotation(); if (rotation == 0) { PDRectangle pageSize = page.getMediaBox(); y = pageSize.getHeight() - y; } else if (rotation == 90) { //do nothing } //Rectangle2D.Float awtRect = new Rectangle2D.Float(x, y, width, height); // Rounding here could be a problem! 
Rectangle2D.Double awtRect = new Rectangle2D.Double(x, y, width, height); stripper.addRegion("" + j, awtRect); } } stripper.extractRegions(page); for (int j = 0; j < annotations.size(); j++) { PDAnnotation annot = annotations.get(j); if (annot instanceof PDAnnotationLink) { PDAnnotationLink link = (PDAnnotationLink) annot; PDAction action = link.getAction(); if (action == null) { System.out.println(link.getContents()); System.out.println(annot.getClass().getName()); System.out.println(annot.getAnnotationName()); //System.out.println(annot.getNormalAppearanceStream().toString()); System.out.println(annot.getContents()); System.out.println(annot.getSubtype()); } else { String urlText = stripper.getTextForRegion("" + j); if (action instanceof PDActionURI) { PDActionURI uri = (PDActionURI) action; String url; url = uri.getURI(); if (url.contains(filter)) { String[] partResult; partResult = new String[3]; partResult[0] = "Page " + pageNum; partResult[1] = "urlText " + urlText; partResult[2] = "URL " + uri.getURI(); System.out.println(partResult[0]); System.out.println(partResult[1]); System.out.println(partResult[2]); System.out.println("URL " + uri.getURI()); result.add(partResult); } else { System.out.println("URL " + uri.getURI()); } } else { System.out.println(action.getType()); } } } else { System.out.println(annot.getClass().getName()); System.out.println(annot.getAnnotationName()); System.out.println(annot.getContents()); System.out.println(annot.getSubtype()); } } //} } // PDDocument doc = PDDocument.load(f); // int pageNum = 0; // for (PDPage page : doc.getPages()) { // pageNum++; // List<PDAnnotation> annotations = page.getAnnotations(); // // for (PDAnnotation annotation : annotations) { // PDAnnotation annot = annotation; // if (annot instanceof PDAnnotationLink) { // PDAnnotationLink link = (PDAnnotationLink) annot; // PDAction action = link.getAction(); // if (action instanceof PDActionURI) { // PDActionURI uri = (PDActionURI) action; // String oldURI = 
uri.getURI(); // String name = annot.getAnnotationName(); // String contents = annot.getContents(); // PDAppearanceStream a = annot.getNormalAppearanceStream(); // //String newURI = "http://pdfbox.apache.org"; // System.out.println(oldURI + " " + name + " " + contents); // //uri.setURI(newURI); // } // } // } // } // result = parseWithTika(fis); //XMPSchema schema; //schema = new XMPSchema(); //List<String> XMPBagOrSeqList; //XMPBagOrSeqList = getXMPBagOrSeqList(XMPSchema schema, String name) { // PDDocument tPDDocument; // tPDDocument = PDDocument.load(f); // COSDocument tCOSDocument; // tCOSDocument = tPDDocument.getDocument(); // String header; // header = tCOSDocument.getHeaderString(); // System.out.println(header); // PDDocumentCatalog tPDDocumentCatalog; // tPDDocumentCatalog = tPDDocument.getDocumentCatalog(); // PDDocumentNameDictionary tPDDocumentNameDictionary; // tPDDocumentNameDictionary = tPDDocumentCatalog.getNames(); // COSDictionary tCOSDictionary; // tCOSDictionary = tPDDocumentNameDictionary.getCOSDictionary(); //tCOSDictionary. // PDPageNode tPDPageNode; // tPDPageNode = tPDDocumentCatalog.getPages(); // List<COSObject> tCOSObjects; // tCOSObjects = tCOSDocument.getObjects(); // int n; // n = tCOSObjects.size(); // System.out.println(n); // COSObject aCOSObject; // String s; // for (int i = 0; i < n; i++) { // aCOSObject = tCOSObjects.get(i); // s = aCOSObject.toString(); // System.out.println(s); // } // XMPMetadata tXMPMetadata; // tXMPMetadata = getXMPMetadata(tPDDocument); // Document XMPDocument; // XMPDocument = tXMPMetadata.getXMPDocument(); // Node n; // n = XMPDocument.getFirstChild(); // parseNode(n); return result; }
From source file:uk.ac.leeds.ccg.andyt.rdl.web.RDL_ParsePDF.java
/** * Converts PDF to a String a page at a time. * * @param f/* www . j a v a 2 s.co m*/ * @return * @throws IOException */ public static String parseToString(File f) throws IOException { String result; result = ""; PDDocument doc = PDDocument.load(f); PDFTextStripperByArea stripper = new PDFTextStripperByArea(); stripper.setSortByPosition(true); //Rectangle rect = new Rectangle(10, 280, 275, 60); //PDPage firstPage = doc.getPage(0); for (PDPage page : doc.getPages()) { PDRectangle aPDRectangle; aPDRectangle = page.getBBox(); Rectangle2D.Double rect = new Rectangle2D.Double(aPDRectangle.getLowerLeftX(), aPDRectangle.getLowerLeftY(), //aPDRectangle.getUpperRightY(), aPDRectangle.getWidth(), aPDRectangle.getHeight()); stripper.addRegion("class1", rect); stripper.extractRegions(page); System.out.println("<Text in the area:" + rect + ">"); String text; text = stripper.getTextForRegion("class1"); System.out.println(text); System.out.println("</Text in the area:" + rect + ">"); result += text; } return result; }
From source file:uk.org.openeyes.PDFFunctions.java
/**
 * Extracts the text inside {@code titleArea} on the given page, with position sorting enabled.
 *
 * @param page the page to read
 * @param titleArea the rectangle whose text is wanted
 * @return the text the stripper reports for that rectangle
 * @throws IOException if the page cannot be processed
 */
private String getTextArea(PDPage page, Rectangle titleArea) throws IOException {
    final String regionName = "area";

    PDFTextStripperByArea areaStripper = new PDFTextStripperByArea();
    areaStripper.setSortByPosition(true);
    areaStripper.addRegion(regionName, titleArea);
    areaStripper.extractRegions(page);

    String extracted = areaStripper.getTextForRegion(regionName);
    return extracted;
}