Example usage for org.apache.pdfbox.text PDFTextStripperByArea removeRegion

List of usage examples for org.apache.pdfbox.text PDFTextStripperByArea removeRegion

Introduction

In this page you can find the example usage for org.apache.pdfbox.text PDFTextStripperByArea removeRegion.

Prototype

public void removeRegion(String regionName) 

Source Link

Document

Delete a region to group text by.

Usage

From source file:pdfconverter.converter3.java

@SuppressWarnings("deprecation")

public static void main(String[] args) throws IOException, WriteException {
    workbook = Workbook.createWorkbook(new File(output));
    System.out.println("File created");
    WritableSheet sheet = workbook.createSheet("Page", 0);
    ExcelStart(sheet);/*from  w ww  . ja v  a  2s  . co  m*/

    //Scanner user_input = new Scanner( System.in );
    File dir = new File(path);
    //System.out.println(dir.getPath());
    File[] dirList = dir.listFiles(new FilenameFilter() {
        @Override
        public boolean accept(File dir, String name) {
            return name.endsWith(".pdf");
        }
    });

    int counter = 1;
    PDDocument pd;
    PDFTextStripper stripper = new PDFTextStripper();
    PDFTextStripperByArea areaSearch = new PDFTextStripperByArea();
    PDFTextStripperByArea stripper2 = new PDFTextStripperByArea();
    PDFTextStripperByArea stripper3 = new PDFTextStripperByArea();
    //PDRectangle rect = new PDRectangle(0, 0, 100, 100);
    stripper.setStartPage(1); //Start extracting from page 3
    stripper.setEndPage(1); //Extract till page 5
    File f = new File(dirList[0].getPath());

    pd = PDDocument.load(f);
    //int curHeight = 136;
    //int rowCount = 37;
    int curHeight = 116;
    int rowCount = 39;
    int rowHeight = 9;
    int sheetRowCount = 0;
    int pageStop = 1491;

    for (int curpage = 800; curpage < pageStop; curpage++) {
        if (counter > 800) {
            break;
        }
        PDPage page = pd.getPage(curpage);

        System.out.println("Now parsing page " + curpage);
        for (int curRow = 0; curRow < 80; curRow++) {
            Rectangle2D.Float cell = new Rectangle2D.Float(0, curHeight, 80, rowHeight);
            String name = "cell-1-" + curRow;
            areaSearch.addRegion(name, cell);
            areaSearch.extractRegions(page);
            String text = areaSearch.getTextForRegion(name);
            areaSearch.removeRegion(name);
            AddCell(sheet, text, 0, sheetRowCount + 1);

            cell = new Rectangle2D.Float(80, curHeight, 30, rowHeight);
            name = "cell-2-" + curRow;
            areaSearch.addRegion(name, cell);
            areaSearch.extractRegions(page);
            text = areaSearch.getTextForRegion(name);
            areaSearch.removeRegion(name);
            AddCell(sheet, text, 1, sheetRowCount + 1);

            cell = new Rectangle2D.Float(110, curHeight, 40, rowHeight);
            name = "cell-3-" + curRow;
            areaSearch.addRegion(name, cell);
            areaSearch.extractRegions(page);
            text = areaSearch.getTextForRegion(name);
            areaSearch.removeRegion(name);
            AddCell(sheet, text, 2, sheetRowCount + 1);

            cell = new Rectangle2D.Float(150, curHeight, 120, rowHeight);
            name = "cell-4-" + curRow;
            areaSearch.addRegion(name, cell);
            areaSearch.extractRegions(page);
            text = areaSearch.getTextForRegion(name);
            areaSearch.removeRegion(name);
            AddCell(sheet, text, 3, sheetRowCount + 1);

            cell = new Rectangle2D.Float(270, curHeight, 120, rowHeight);
            name = "cell-5-" + curRow;
            areaSearch.addRegion(name, cell);
            areaSearch.extractRegions(page);
            text = areaSearch.getTextForRegion(name);
            areaSearch.removeRegion(name);
            AddCell(sheet, text, 4, sheetRowCount + 1);

            cell = new Rectangle2D.Float(390, curHeight, 40, rowHeight);
            name = "cell-6-" + curRow;
            areaSearch.addRegion(name, cell);
            areaSearch.extractRegions(page);
            text = areaSearch.getTextForRegion(name);
            areaSearch.removeRegion(name);
            AddCell(sheet, text, 5, sheetRowCount + 1);

            cell = new Rectangle2D.Float(430, curHeight, 46, rowHeight);
            name = "cell-7-" + curRow;
            areaSearch.addRegion(name, cell);
            areaSearch.extractRegions(page);
            text = areaSearch.getTextForRegion(name);
            areaSearch.removeRegion(name);
            AddCell(sheet, text, 6, sheetRowCount + 1);

            cell = new Rectangle2D.Float(476, curHeight, 82, rowHeight);
            name = "cell-8-" + curRow;
            areaSearch.addRegion(name, cell);
            areaSearch.extractRegions(page);
            text = areaSearch.getTextForRegion(name);
            areaSearch.removeRegion(name);
            AddCell(sheet, text, 7, sheetRowCount + 1);

            cell = new Rectangle2D.Float(558, curHeight, 65, rowHeight);
            name = "cell-9-" + curRow;
            areaSearch.addRegion(name, cell);
            areaSearch.extractRegions(page);
            text = areaSearch.getTextForRegion(name);
            areaSearch.removeRegion(name);
            AddCell(sheet, text, 8, sheetRowCount + 1);

            cell = new Rectangle2D.Float(623, curHeight, 66, rowHeight);
            name = "cell-10-" + curRow;
            areaSearch.addRegion(name, cell);
            areaSearch.extractRegions(page);
            text = areaSearch.getTextForRegion(name);
            areaSearch.removeRegion(name);
            AddCell(sheet, text, 9, sheetRowCount + 1);

            cell = new Rectangle2D.Float(689, curHeight, 100, rowHeight);
            name = "cell-11-" + curRow;
            areaSearch.addRegion(name, cell);
            areaSearch.extractRegions(page);
            text = areaSearch.getTextForRegion(name);
            areaSearch.removeRegion(name);
            AddCell(sheet, text, 10, sheetRowCount + 1);

            sheetRowCount++;
            curHeight += rowHeight;
        }

        //Rectangle2D.Float issueDate = new Rectangle2D.Float(0, 0, 80, page.getMediaBox().getHeight());
        //stripper2.addRegion("issueDate", issueDate);
        //Rectangle2D.Float amount = new Rectangle2D.Float(80, 0, 30, page.getMediaBox().getHeight());
        //stripper2.addRegion("amount", amount);
        //Rectangle2D.Float citation = new Rectangle2D.Float(110, 0, 40, page.getMediaBox().getHeight());
        //stripper2.addRegion("citation", citation);
        //Rectangle2D.Float violation = new Rectangle2D.Float(150, 0, 120, page.getMediaBox().getHeight());
        //stripper2.addRegion("violation", violation);
        //Rectangle2D.Float comment = new Rectangle2D.Float(270, 0, 120, page.getMediaBox().getHeight());
        //stripper2.addRegion("comment", comment);
        //Rectangle2D.Float warning = new Rectangle2D.Float(390, 0, 40, page.getMediaBox().getHeight());
        //stripper2.addRegion("warning", warning);
        //Rectangle2D.Float license = new Rectangle2D.Float(430, 0, 46, page.getMediaBox().getHeight());
        //stripper2.addRegion("license", license);
        //Rectangle2D.Float lot = new Rectangle2D.Float(476, 0, 82, page.getMediaBox().getHeight());
        //stripper2.addRegion("lot", lot);
        //Rectangle2D.Float make = new Rectangle2D.Float(558, 0, 65, page.getMediaBox().getHeight());
        //stripper2.addRegion("make", make);
        //Rectangle2D.Float officer = new Rectangle2D.Float(623, 0, 66, page.getMediaBox().getHeight());
        //stripper2.addRegion("officer", officer);
        //Rectangle2D.Float state = new Rectangle2D.Float(689, 0, 100, page.getMediaBox().getHeight());
        //stripper2.addRegion("state", state);

        //stripper2.extractRegions(page);
        //String text = stripper2.getTextForRegion("license");

        //Rectangle2D.Float row = new Rectangle2D.Float(0, 156, 80, 10);
        //stripper3.addRegion("row", row);
        //stripper3.extractRegions(page);
        //String text = stripper3.getTextForRegion("row");
        //System.out.println(text);
        counter++;
        curHeight = 116;
        rowCount = 39;
    }
    //AddRow(sheet, text, counter);
    //counter++;
    pd.close();

    System.out.println("Data extracted to Excel, parsing through Excel data...");

    boolean multiline = true;
    while (multiline) {
        multiline = false;
        for (int row = 0; row < sheet.getRows(); row++) {
            Cell cell = sheet.getCell(0, row);
            if (cell.getContents().length() < 5) {
                multiline = true;
                WritableCell cell2 = sheet.getWritableCell(4, row - 1);
                WritableCell cell3 = sheet.getWritableCell(4, row);
                String content = cell2.getContents() + cell3.getContents();
                content = content.replace("\n", "").replace("\r", "");
                Label l = (Label) cell2;
                l.setString(content);
                sheet.removeRow(row);
            }
        }
    }

    System.out.println("Data extraction complete");
    workbook.write();
    workbook.close();
}