com.plumblarrick.andrew.cityrecordtextextractor.CRTextStripper.java Source code

Java tutorial

Introduction

Here is the source code for com.plumblarrick.andrew.cityrecordtextextractor.CRTextStripper.java

Source

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package com.plumblarrick.andrew.cityrecordtextextractor;

import java.io.IOException;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;

/**
 * A {@link PDFTextStripper} that annotates the extracted text with x-axis
 * positions so that column layout can be reconstructed downstream.
 *
 * <p>Output conventions produced by this class:
 * <ul>
 *   <li>each page starts with a {@code [Start Page n ]} divider;</li>
 *   <li>the first chunk on a line is prefixed with {@code <x>\t};</li>
 *   <li>every further chunk on the same line is prefixed with
 *       {@code |<x>\t};</li>
 *   <li>a chunk whose x positions jump backwards (an "overrun") is split at
 *       the jump and the remainder is written on its own line, prefixed with
 *       the x position at which it resumes.</li>
 * </ul>
 *
 * <p>While writing, the class keeps per-page running sums and averages of the
 * start/end positions of the first two chunks of every line that has at least
 * two chunks. These statistics live in package-visible fields.
 *
 * <p>NOTE: the original author marked this class as an OLD VERSION, to be
 * replaced with a {@code CRTStripper} class (not visible in this file).
 *
 * @author calarrick
 */
public class CRTextStripper extends PDFTextStripper {

    /** Number of pages started so far; printed in the page divider. */
    private int pageCounter = 0;

    /** Number of chunks already written on the current output line. */
    private int textChunksOnLine = 0;

    /**
     * Number of lines with at least two chunks seen on the current page.
     * Used as the divisor for the per-page averages, because the sums below
     * are reset on every page while {@link #columnLineCounter} is not.
     */
    private int pageColumnLineCounter = 0;

    // Per-page running totals and averages; all reset in startPage().
    // Only the start positions are averaged; end positions are summed only.
    int sumFirstColStarts = 0;
    int sumFirstColEnds = 0;
    int sumSecondColStarts = 0;
    int sumSecondColEnds = 0;
    int avgFirstColStarts = 0;
    int avgSecondColStarts = 0;

    // Counts lines with at least two columns. Never reset, so it is
    // cumulative over every page processed by this instance.
    int columnLineCounter = 0;

    // First-chunk totals staged for the current line. They are committed to
    // the sums above only if a second chunk follows on the same line.
    int temp_sumFirstColStarts = 0;
    int temp_sumFirstColEnds = 0;

    /**
     * Creates the stripper. The no-arg superclass constructor is invoked
     * implicitly, so no explicit call to {@code super} is needed.
     *
     * @throws IOException declared to match the superclass constructor
     */
    public CRTextStripper() throws IOException {
    }

    /**
     * Writes a human-readable page divider and resets all per-page state.
     * The base-class implementation does nothing.
     *
     * @param page the page about to be processed (not inspected here)
     * @throws IOException if writing to the output fails
     */
    @Override
    protected void startPage(PDPage page) throws IOException {

        pageCounter++;
        output.write("\n\n[Start Page " + pageCounter + " ]\n");

        textChunksOnLine = 0;
        pageColumnLineCounter = 0;
        sumFirstColStarts = 0;
        sumFirstColEnds = 0;
        sumSecondColStarts = 0;
        sumSecondColEnds = 0;
        avgFirstColStarts = 0;
        avgSecondColStarts = 0;
    }

    /**
     * Writes one chunk of text, prefixed with the rounded x position of its
     * first character, and updates the column statistics.
     *
     * <p>The first chunk on a line is written as {@code <x>\t<text>}; later
     * chunks as {@code |<x>\t<text>}. If the chunk starts to the right of
     * where it ends, it is treated as an overrun: the text is split at the
     * first character whose x position is smaller than its predecessor's, and
     * the remainder is written on a new line with its own position prefix,
     * followed by a line separator.
     *
     * <p>The text is trimmed before it is written. If {@code textPositions}
     * is null or empty there is no position to report, so the trimmed text is
     * written without a prefix and without touching any counters.
     *
     * @param text the chunk of text assembled by PDFBox
     * @param textPositions the positions of the characters in {@code text}
     * @throws IOException if writing to the output fails
     */
    @Override
    protected void writeString(String text, List<TextPosition> textPositions) throws IOException {

        if (textPositions == null || textPositions.isEmpty()) {
            // Nothing to measure: emit the text rather than fail on get(0).
            output.write(text.trim());
            return;
        }

        // trim() drops leading characters too, which shifts the mapping
        // between character indices and textPositions; remember by how much.
        final int leadingTrimmed = countLeadingTrimmed(text);
        final String trimmed = text.trim();

        final int startPoint = Math.round(textPositions.get(0).getXDirAdj());
        final int endPoint = Math.round(
                textPositions.get(textPositions.size() - 1).getXDirAdj());

        if (textChunksOnLine == 0) {
            output.write(startPoint + "\t");
            stageFirstColumn(startPoint, endPoint);
        } else {
            if (textChunksOnLine == 1) {
                commitTwoColumnLine(startPoint, endPoint);
            }
            output.write("|" + startPoint + "\t");
        }

        // A chunk that starts to the right of its end has wrapped around.
        final int splitIndex = (startPoint > endPoint)
                ? findOverrunIndex(trimmed.length(), leadingTrimmed, textPositions)
                : -1;

        if (splitIndex < 0) {
            // Normal chunk, or no backward jump found inside the trimmed
            // text: write it whole so that no text is ever dropped.
            output.write(trimmed);
        } else {
            final int splitPoint = Math.round(
                    textPositions.get(splitIndex + leadingTrimmed).getXDirAdj());
            final String overRun = trimmed.substring(splitIndex);

            output.write(trimmed.substring(0, splitIndex));
            output.write("\n" + splitPoint + "\t" + overRun);
            // Also resets textChunksOnLine; the increment below then counts
            // the overrun as the first chunk of the line just started.
            writeLineSeparator();
        }

        textChunksOnLine++;
    }

    /**
     * Writes the line separator and resets the per-line chunk counter.
     *
     * @throws IOException if writing to the output fails
     */
    @Override
    protected void writeLineSeparator() throws IOException {
        output.write(getLineSeparator());
        textChunksOnLine = 0;
    }

    /** Counts the leading characters that {@link String#trim()} removes. */
    private static int countLeadingTrimmed(String text) {
        int count = 0;
        while (count < text.length() && text.charAt(count) <= ' ') {
            count++;
        }
        return count;
    }

    /**
     * Finds the index, within the trimmed text, of the first character whose
     * x position is smaller than that of the character before it.
     *
     * <p>Assumes one text position per character, which is what the indexing
     * of the original implementation implied; the scan is bounded by the
     * number of available positions so a mismatch cannot throw.
     *
     * @param trimmedLength length of the trimmed text
     * @param offset number of leading characters removed by trimming
     * @param textPositions positions of the characters of the untrimmed text
     * @return the split index, or -1 if no backward jump was found
     */
    private static int findOverrunIndex(int trimmedLength, int offset,
            List<TextPosition> textPositions) {

        final int limit = Math.min(trimmedLength, textPositions.size() - offset);
        float prevXPos = 0;
        for (int i = 0; i < limit; i++) {
            final float currXPos = textPositions.get(i + offset).getXDirAdj();
            if (currXPos < prevXPos) {
                return i;
            }
            prevXPos = currXPos;
        }
        return -1;
    }

    /**
     * Stages the first chunk's start and end positions on top of the page
     * totals. The staged values are committed only if a second chunk follows.
     */
    private void stageFirstColumn(int startPoint, int endPoint) {
        // The sums are zero at the start of every page, so plain
        // accumulation also covers the first line of a page.
        temp_sumFirstColStarts = sumFirstColStarts + startPoint;
        temp_sumFirstColEnds = sumFirstColEnds + endPoint;
    }

    /**
     * Records a line that has at least two chunks: commits the staged
     * first-chunk totals, adds the second chunk's positions and refreshes
     * both start-position averages for the current page.
     */
    private void commitTwoColumnLine(int secondStart, int secondEnd) {
        columnLineCounter++;
        pageColumnLineCounter++;

        sumFirstColStarts = temp_sumFirstColStarts;
        sumFirstColEnds = temp_sumFirstColEnds;
        sumSecondColStarts += secondStart;
        sumSecondColEnds += secondEnd;

        // Divide by the per-page count: the sums are per-page as well.
        avgFirstColStarts = sumFirstColStarts / pageColumnLineCounter;
        avgSecondColStarts = sumSecondColStarts / pageColumnLineCounter;

        temp_sumFirstColStarts = 0;
        temp_sumFirstColEnds = 0;
    }

}