List of usage examples for org.apache.pdfbox.pdmodel.PDDocument#getPages()
public PDPageTree getPages()
From source file: uk.ac.leeds.ccg.andyt.rdl.web.RDL_ParsePDF.java
/** * https://svn.apache.org/viewvc/pdfbox/trunk/examples/ Based on * https://svn.apache.org/viewvc/pdfbox/trunk/examples/src/main/java/org/apache/pdfbox/examples/pdmodel/PrintURLs.java?view=markup&pathrev=1703066 * * @param f//from w w w . j a va2 s .c o m * @param filter * @param fis * @return * @throws IOException * @throws TikaException * @throws SAXException */ public static ArrayList<String[]> parseForLinks(File f, String filter, FileInputStream fis) throws IOException, TikaException, SAXException { ArrayList<String[]> result; result = new ArrayList<String[]>(); PDDocument doc = PDDocument.load(f); int pageNum = 0; for (PDPage page : doc.getPages()) { pageNum++; // if (pageNum == 11) { //Degug test hack System.out.println("Parsing page " + pageNum); PDFTextStripperByArea stripper = new PDFTextStripperByArea(); List<PDAnnotation> annotations = page.getAnnotations(); //first setup text extraction regions for (int j = 0; j < annotations.size(); j++) { PDAnnotation annot = annotations.get(j); if (annot instanceof PDAnnotationLink) { PDAnnotationLink link = (PDAnnotationLink) annot; PDRectangle rect = link.getRectangle(); //need to reposition link rectangle to match text space float x = rect.getLowerLeftX(); float y = rect.getUpperRightY(); float width = rect.getWidth(); float height = rect.getHeight(); int rotation = page.getRotation(); if (rotation == 0) { PDRectangle pageSize = page.getMediaBox(); y = pageSize.getHeight() - y; } else if (rotation == 90) { //do nothing } //Rectangle2D.Float awtRect = new Rectangle2D.Float(x, y, width, height); // Rounding here could be a problem! 
Rectangle2D.Double awtRect = new Rectangle2D.Double(x, y, width, height); stripper.addRegion("" + j, awtRect); } } stripper.extractRegions(page); for (int j = 0; j < annotations.size(); j++) { PDAnnotation annot = annotations.get(j); if (annot instanceof PDAnnotationLink) { PDAnnotationLink link = (PDAnnotationLink) annot; PDAction action = link.getAction(); if (action == null) { System.out.println(link.getContents()); System.out.println(annot.getClass().getName()); System.out.println(annot.getAnnotationName()); //System.out.println(annot.getNormalAppearanceStream().toString()); System.out.println(annot.getContents()); System.out.println(annot.getSubtype()); } else { String urlText = stripper.getTextForRegion("" + j); if (action instanceof PDActionURI) { PDActionURI uri = (PDActionURI) action; String url; url = uri.getURI(); if (url.contains(filter)) { String[] partResult; partResult = new String[3]; partResult[0] = "Page " + pageNum; partResult[1] = "urlText " + urlText; partResult[2] = "URL " + uri.getURI(); System.out.println(partResult[0]); System.out.println(partResult[1]); System.out.println(partResult[2]); System.out.println("URL " + uri.getURI()); result.add(partResult); } else { System.out.println("URL " + uri.getURI()); } } else { System.out.println(action.getType()); } } } else { System.out.println(annot.getClass().getName()); System.out.println(annot.getAnnotationName()); System.out.println(annot.getContents()); System.out.println(annot.getSubtype()); } } //} } // PDDocument doc = PDDocument.load(f); // int pageNum = 0; // for (PDPage page : doc.getPages()) { // pageNum++; // List<PDAnnotation> annotations = page.getAnnotations(); // // for (PDAnnotation annotation : annotations) { // PDAnnotation annot = annotation; // if (annot instanceof PDAnnotationLink) { // PDAnnotationLink link = (PDAnnotationLink) annot; // PDAction action = link.getAction(); // if (action instanceof PDActionURI) { // PDActionURI uri = (PDActionURI) action; // String oldURI = 
uri.getURI(); // String name = annot.getAnnotationName(); // String contents = annot.getContents(); // PDAppearanceStream a = annot.getNormalAppearanceStream(); // //String newURI = "http://pdfbox.apache.org"; // System.out.println(oldURI + " " + name + " " + contents); // //uri.setURI(newURI); // } // } // } // } // result = parseWithTika(fis); //XMPSchema schema; //schema = new XMPSchema(); //List<String> XMPBagOrSeqList; //XMPBagOrSeqList = getXMPBagOrSeqList(XMPSchema schema, String name) { // PDDocument tPDDocument; // tPDDocument = PDDocument.load(f); // COSDocument tCOSDocument; // tCOSDocument = tPDDocument.getDocument(); // String header; // header = tCOSDocument.getHeaderString(); // System.out.println(header); // PDDocumentCatalog tPDDocumentCatalog; // tPDDocumentCatalog = tPDDocument.getDocumentCatalog(); // PDDocumentNameDictionary tPDDocumentNameDictionary; // tPDDocumentNameDictionary = tPDDocumentCatalog.getNames(); // COSDictionary tCOSDictionary; // tCOSDictionary = tPDDocumentNameDictionary.getCOSDictionary(); //tCOSDictionary. // PDPageNode tPDPageNode; // tPDPageNode = tPDDocumentCatalog.getPages(); // List<COSObject> tCOSObjects; // tCOSObjects = tCOSDocument.getObjects(); // int n; // n = tCOSObjects.size(); // System.out.println(n); // COSObject aCOSObject; // String s; // for (int i = 0; i < n; i++) { // aCOSObject = tCOSObjects.get(i); // s = aCOSObject.toString(); // System.out.println(s); // } // XMPMetadata tXMPMetadata; // tXMPMetadata = getXMPMetadata(tPDDocument); // Document XMPDocument; // XMPDocument = tXMPMetadata.getXMPDocument(); // Node n; // n = XMPDocument.getFirstChild(); // parseNode(n); return result; }
From source file: uk.ac.leeds.ccg.andyt.rdl.web.RDL_ParsePDF.java
/** * Converts PDF to a String a page at a time. * * @param f/*from w w w . j a v a 2 s . c om*/ * @return * @throws IOException */ public static String parseToString(File f) throws IOException { String result; result = ""; PDDocument doc = PDDocument.load(f); PDFTextStripperByArea stripper = new PDFTextStripperByArea(); stripper.setSortByPosition(true); //Rectangle rect = new Rectangle(10, 280, 275, 60); //PDPage firstPage = doc.getPage(0); for (PDPage page : doc.getPages()) { PDRectangle aPDRectangle; aPDRectangle = page.getBBox(); Rectangle2D.Double rect = new Rectangle2D.Double(aPDRectangle.getLowerLeftX(), aPDRectangle.getLowerLeftY(), //aPDRectangle.getUpperRightY(), aPDRectangle.getWidth(), aPDRectangle.getHeight()); stripper.addRegion("class1", rect); stripper.extractRegions(page); System.out.println("<Text in the area:" + rect + ">"); String text; text = stripper.getTextForRegion("class1"); System.out.println(text); System.out.println("</Text in the area:" + rect + ">"); result += text; } return result; }