Example usage for org.apache.pdfbox.text PDFTextStripperByArea setSortByPosition

Introduction

On this page you can find example usages of org.apache.pdfbox.text PDFTextStripperByArea setSortByPosition.

Prototype

public void setSortByPosition(boolean newSortByPosition)

Source Link

Document

The order of the text tokens in a PDF file may not be the same as the order in which they appear visually on the screen.

Usage

From source file:com.amolik.misc.ExtractTextByArea.java

License:Apache License

/**
 * Prints the text found in a fixed rectangular area of the first page of a PDF document.
 *
 * <p>The document path and the extraction rectangle are hard-coded; {@code args} is not read.
 *
 * @param args The command line arguments (unused).
 *
 * @throws IOException If there is an error loading or parsing the document.
 */
public static void main(String[] args) throws IOException {
    // try-with-resources closes the document on every exit path.
    try (PDDocument document = PDDocument.load(new File("E:\\Automation\\uphillit\\Fiscal_demo_data.pdf"))) {
        if (document.getNumberOfPages() == 0) {
            // An empty document has no first page to extract from; stop before asking for page 0.
            System.err.println("Document has no pages.");
            return;
        }

        PDPage firstPage = document.getPage(0);
        System.out.println(firstPage.getContents());

        PDFTextStripperByArea stripper = new PDFTextStripperByArea();
        // Token order inside a PDF may differ from the visual order, so sort by position.
        stripper.setSortByPosition(true);
        Rectangle rect = new Rectangle(3, 1, 600, 6000);
        stripper.addRegion("class1", rect);
        stripper.extractRegions(firstPage);
        System.out.println("Text in the area:" + rect);
        System.out.println(stripper.getTextForRegion("class1"));
    }
}

From source file:com.enginkutuk.pdfboxsample.PDFBoxSample.java

/**
 * Loads the PDF at {@code path} and, unless it is encrypted, prints its full text to
 * standard output and shows the same text in a message dialog.
 *
 * <p>Any exception raised while loading, extracting or closing is caught and its stack
 * trace printed; nothing is rethrown to the caller.
 *
 * @param path file system path of the PDF to read
 */
public void readPdfFile(String path) {
    // try-with-resources closes the document on every exit path; the original never closed it.
    try (PDDocument document = PDDocument.load(new File(path))) {
        if (!document.isEncrypted()) {
            // NOTE(review): this area stripper is configured but never used for extraction;
            // the text below comes from the plain PDFTextStripper, which is left at its
            // default sort setting. Confirm whether sorting was meant to apply to it.
            PDFTextStripperByArea stripper = new PDFTextStripperByArea();
            stripper.setSortByPosition(true);

            PDFTextStripper textStripper = new PDFTextStripper();
            String text = textStripper.getText(document);
            System.out.println(text);
            JOptionPane.showMessageDialog(null, text);
        }
    } catch (Exception e) {
        // Deliberate best-effort: report the failure and return normally.
        e.printStackTrace();
    }
}

From source file:uk.ac.leeds.ccg.andyt.rdl.web.RDL_ParsePDF.java

/**
 * Converts PDF to a String a page at a time.
 *
 * @param f/*from w w  w  .  j  ava 2  s . com*/
 * @return
 * @throws IOException
 */
public static String parseToString(File f) throws IOException {
    String result;
    result = "";
    PDDocument doc = PDDocument.load(f);
    PDFTextStripperByArea stripper = new PDFTextStripperByArea();
    stripper.setSortByPosition(true);
    //Rectangle rect = new Rectangle(10, 280, 275, 60);
    //PDPage firstPage = doc.getPage(0);
    for (PDPage page : doc.getPages()) {
        PDRectangle aPDRectangle;
        aPDRectangle = page.getBBox();
        Rectangle2D.Double rect = new Rectangle2D.Double(aPDRectangle.getLowerLeftX(),
                aPDRectangle.getLowerLeftY(),
                //aPDRectangle.getUpperRightY(),
                aPDRectangle.getWidth(), aPDRectangle.getHeight());
        stripper.addRegion("class1", rect);
        stripper.extractRegions(page);
        System.out.println("<Text in the area:" + rect + ">");
        String text;
        text = stripper.getTextForRegion("class1");
        System.out.println(text);
        System.out.println("</Text in the area:" + rect + ">");
        result += text;
    }
    return result;
}

From source file:uk.org.openeyes.PDFFunctions.java

/**
 * Extracts the text located inside a rectangular area of a page.
 *
 * <p>A fresh area stripper with position sorting enabled is created for each call.
 *
 * @param page the page to extract from
 * @param titleArea the rectangle whose text is wanted
 * @return the text the stripper reports for that rectangle
 * @throws IOException if the stripper cannot be created or the page cannot be processed
 */
private String getTextArea(PDPage page, Rectangle titleArea) throws IOException {
    final String regionName = "area";

    final PDFTextStripperByArea areaStripper = new PDFTextStripperByArea();
    areaStripper.setSortByPosition(true);
    areaStripper.addRegion(regionName, titleArea);
    areaStripper.extractRegions(page);

    final String extracted = areaStripper.getTextForRegion(regionName);
    return extracted;
}