List of usage examples for org.apache.pdfbox.text.PDFTextStripperByArea#setSortByPosition
public void setSortByPosition(boolean newSortByPosition)
From source file:com.amolik.misc.ExtractTextByArea.java
License:Apache License
/** * This will print the documents text in a certain area. * * @param args The command line arguments. * * @throws IOException If there is an error parsing the document. *///w ww. j a v a2 s . co m public static void main(String[] args) throws IOException { //args[0]= "E:\\Automation\\uphillit\\Fiscal_demo_data.pdf"; // if( args.length != 1 ) // { // usage(); // } // else // { PDDocument document = null; try { document = PDDocument.load(new File("E:\\Automation\\uphillit\\Fiscal_demo_data.pdf")); int numberOfPages = document.getNumberOfPages(); if (numberOfPages > 0) { PDPage page = (PDPage) document.getPages().get(0); System.out.println(page.getContents()); } PDFTextStripperByArea stripper = new PDFTextStripperByArea(); stripper.setSortByPosition(true); Rectangle rect = new Rectangle(3, 1, 600, 6000); stripper.addRegion("class1", rect); PDPage firstPage = document.getPage(0); stripper.extractRegions(firstPage); System.out.println("Text in the area:" + rect); System.out.println(stripper.getTextForRegion("class1")); } finally { if (document != null) { document.close(); } } // } }
From source file:com.enginkutuk.pdfboxsample.PDFBoxSample.java
/**
 * Loads the PDF at the given path and, unless the document is encrypted, extracts its text,
 * prints it to standard output and shows it in a message dialog.
 *
 * <p>Any exception raised while loading or reading the document is caught and its stack
 * trace is printed; nothing is propagated to the caller.
 *
 * @param path file system path of the PDF to read
 */
public void readPdfFile(String path) {
    // try-with-resources guarantees the document is closed, including on failure.
    try (PDDocument document = PDDocument.load(new File(path))) {
        if (!document.isEncrypted()) {
            PDFTextStripper textStripper = new PDFTextStripper();
            String text = textStripper.getText(document);
            System.out.println(text);
            JOptionPane.showMessageDialog(null, text);
        }
    } catch (Exception e) {
        // Best-effort reader: report the problem and carry on.
        e.printStackTrace();
    }
}
From source file:uk.ac.leeds.ccg.andyt.rdl.web.RDL_ParsePDF.java
/** * Converts PDF to a String a page at a time. * * @param f/*from w w w . j ava 2 s . com*/ * @return * @throws IOException */ public static String parseToString(File f) throws IOException { String result; result = ""; PDDocument doc = PDDocument.load(f); PDFTextStripperByArea stripper = new PDFTextStripperByArea(); stripper.setSortByPosition(true); //Rectangle rect = new Rectangle(10, 280, 275, 60); //PDPage firstPage = doc.getPage(0); for (PDPage page : doc.getPages()) { PDRectangle aPDRectangle; aPDRectangle = page.getBBox(); Rectangle2D.Double rect = new Rectangle2D.Double(aPDRectangle.getLowerLeftX(), aPDRectangle.getLowerLeftY(), //aPDRectangle.getUpperRightY(), aPDRectangle.getWidth(), aPDRectangle.getHeight()); stripper.addRegion("class1", rect); stripper.extractRegions(page); System.out.println("<Text in the area:" + rect + ">"); String text; text = stripper.getTextForRegion("class1"); System.out.println(text); System.out.println("</Text in the area:" + rect + ">"); result += text; } return result; }
From source file:uk.org.openeyes.PDFFunctions.java
private String getTextArea(PDPage page, Rectangle titleArea) throws IOException { PDFTextStripperByArea stripper = new PDFTextStripperByArea(); stripper.setSortByPosition(true); stripper.addRegion("area", titleArea); stripper.extractRegions(page);//w ww.ja v a 2 s . c om return stripper.getTextForRegion("area"); }