com.iana.dver.pdf.scrapper.DVERScrapperTask.java Source code

Java tutorial

Introduction

Here is the source code for com.iana.dver.pdf.scrapper.DVERScrapperTask.java

Source

/**
 * 
 */
package com.iana.dver.pdf.scrapper;

import java.awt.Rectangle;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.Collections;
import java.util.List;

import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.comparator.LastModifiedFileComparator;
import org.apache.log4j.Logger;
import org.jdom2.Document;
import org.jdom2.Element;
import org.jdom2.output.Format;
import org.jdom2.output.XMLOutputter;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.pdmodel.PDPage;
import org.pdfbox.util.PDFTextStripperByArea;
import org.springframework.integration.file.DirectoryScanner;
import org.springframework.integration.file.HeadDirectoryScanner;

import com.iana.dver.pdf.scrapper.exception.WrongConfigurationException;

/**
 * Scans a directory for DVER PDF reports and converts each one to an XML file.
 * <p>
 * For every regular file returned by the directory scanner for {@code scanDir}
 * (sorted by last-modified time) the first PDF page is read through a set of
 * fixed rectangular regions, the text of each region is split into fields by
 * position, and the result is written to {@code xmlDir} as
 * {@code <base name>.xml}.
 * <p>
 * Field extraction is purely positional. When a region contains fewer lines or
 * tokens than expected, the affected fields are written as empty text instead
 * of aborting the conversion.
 *
 * @author tgbaxi
 */
public class DVERScrapperTask {

    private static final Logger log = Logger.getLogger(DVERScrapperTask.class);

    /** Element names for the whitespace-separated tokens of the vehicle identification row, in order. */
    private static final String[] VEHICLE_FIELDS = { "UNIT", "TYPE", "MAKE", "YEAR", "STATE", "LICENSE",
            "EQUIPMENT_ID", "UNIT_VIN", "GVWR", "ISSUED_DECAL", "OOS_STKR" };

    /** Element names for the whitespace-separated tokens of one "other chassis violation" row, in order. */
    private static final String[] VIOLATION_FIELDS = { "VIO_CODE", "SECTION", "UNIT", "OOS", "NUMBER", "VERIFY",
            "CRASH", "VIO_DESC" };

    /** Directory that is scanned for DVER PDF files. */
    private String scanDir;

    /** Archive directory setting; stored but not read anywhere in this class. */
    private String archiveDir;

    /** Maximum number of files handed to the directory scanner, as a decimal string. */
    private String maxFiles;

    /** Directory the generated XML files are written to. */
    private String xmlDir;

    /**
     * Converts the DVER PDFs currently present in {@code scanDir} to XML files
     * in {@code xmlDir}.
     *
     * @throws WrongConfigurationException
     *             if {@code scanDir}, {@code maxFiles} or {@code xmlDir} is not
     *             set, or if {@code scanDir} is not a directory
     * @throws IOException
     *             if a PDF cannot be read or an XML file cannot be written
     * @throws NumberFormatException
     *             if {@code maxFiles} is not a valid integer
     */
    public void readAndParseDVER() throws WrongConfigurationException, IOException {
        // Every setting dereferenced below must be present. archiveDir is not
        // read by this class, so it is deliberately not required here.
        if (scanDir == null || maxFiles == null || xmlDir == null) {
            log.info("Wrong configuration used");
            throw new WrongConfigurationException("Some configuration is missing in job");
        }

        // The scanner is limited to the configured maximum number of files.
        DirectoryScanner scanner = new HeadDirectoryScanner(Integer.parseInt(maxFiles.trim()));
        File dverScanDirectory = new File(scanDir);
        if (!dverScanDirectory.isDirectory()) {
            throw new WrongConfigurationException("The Path to look for DVER is not a directory");
        }

        List<File> files = scanner.listFiles(dverScanDirectory);
        // Process the files in last-modified order.
        Collections.sort(files, LastModifiedFileComparator.LASTMODIFIED_COMPARATOR);
        for (File file : files) {
            if (file.isFile()) {
                PDFTextStripperByArea textStripper = readDVER(file);
                generateDverXML(FilenameUtils.getBaseName(file.getName()), textStripper);
            }
        }
    }

    /**
     * Step - 1 : Read the PDF and extract the text of the fixed DVER regions
     * from its first page.
     * <p>
     * The PDF document is closed before this method returns; the returned
     * stripper is only used afterwards to look up the already extracted text
     * via {@code getTextForRegion}.
     *
     * @param file
     *            the DVER PDF to read
     * @return a text stripper holding the extracted text of every region
     * @throws IOException
     *             if the PDF cannot be loaded or parsed
     */
    private PDFTextStripperByArea readDVER(final File file) throws IOException {
        PDDocument document = PDDocument.load(file);
        try {
            PDFTextStripperByArea textStripper = new PDFTextStripperByArea();

            // Region rectangles are (x, y, width, height) in page coordinates.
            textStripper.addRegion("ADDRESS", new Rectangle(10, 50, 200, 50));
            textStripper.addRegion("REPORT_INFO", new Rectangle(300, 50, 300, 50));
            textStripper.addRegion("IEP_INFO", new Rectangle(10, 100, 630, 40));
            textStripper.addRegion("MC_INFO", new Rectangle(10, 140, 630, 80));
            textStripper.addRegion("DRIVER_CHOICE", new Rectangle(10, 220, 630, 20));
            textStripper.addRegion("VEHICLE_ID", new Rectangle(10, 240, 630, 30));
            textStripper.addRegion("BREAK_ADJ", new Rectangle(10, 270, 630, 20));
            textStripper.addRegion("CHASSIS_VIOLATION", new Rectangle(10, 290, 630, 30));
            textStripper.addRegion("OTHER_CHASSIS_VIOLATION", new Rectangle(10, 320, 630, 50));
            textStripper.addRegion("DRIVER_NOTES", new Rectangle(10, 370, 630, 30));
            textStripper.addRegion("IEP_NOTES", new Rectangle(10, 400, 630, 60));
            textStripper.addRegion("CREATION_NOTES", new Rectangle(10, 720, 630, 60));

            // getAllPages() returns a raw List in this PDFBox version.
            @SuppressWarnings("unchecked")
            List<PDPage> allPages = document.getDocumentCatalog().getAllPages();
            textStripper.extractRegions(allPages.get(0));

            return textStripper;
        } finally {
            // Release the PDF's underlying resources even when extraction fails.
            document.close();
        }
    }

    /**
     * Step - 2 : Generate the XML file from the extracted region text.
     * <p>
     * The whole document is built in memory first, so a failure while parsing
     * never leaves an empty or half-written file behind.
     *
     * @param fileName
     *            base name (without extension) of the XML file to create in
     *            {@code xmlDir}
     * @param stripper
     *            stripper whose regions have already been extracted
     * @throws IOException
     *             if the XML file cannot be written
     */
    private void generateDverXML(String fileName, PDFTextStripperByArea stripper) throws IOException {
        Element dver = new Element("DVER");
        Document doc = new Document(dver);

        dver.addContent(buildAddress(stripper));
        dver.addContent(buildReportInfo(stripper));
        dver.addContent(buildIepInfo(stripper));
        dver.addContent(buildMotorCarrierInfo(stripper));
        dver.addContent(buildVehicleIdentification(stripper));
        dver.addContent(buildBrakeAdjustment(stripper));
        dver.addContent(buildOtherViolations(stripper));
        dver.addContent(textElement("NOTES_TO_DRIVER", stripper.getTextForRegion("DRIVER_NOTES")));
        dver.addContent(textElement("NOTES_TO_IEP", stripper.getTextForRegion("IEP_NOTES")));
        // The creator is taken from the second line of the creation-notes region.
        dver.addContent(
                textElement("CREATED_BY", part(splitLines(stripper.getTextForRegion("CREATION_NOTES")), 1)));

        // Resolve against xmlDir so a missing trailing separator does not glue
        // the directory name onto the file name. An empty xmlDir keeps meaning
        // "relative to the working directory".
        String xmlName = fileName + ".xml";
        File outputFile = xmlDir.length() == 0 ? new File(xmlName) : new File(xmlDir, xmlName);

        OutputStream fos = new FileOutputStream(outputFile);
        try {
            XMLOutputter xmlOutput = new XMLOutputter();
            xmlOutput.setFormat(Format.getPrettyFormat());
            xmlOutput.output(doc, fos);
            fos.flush();
        } finally {
            fos.close();
        }
    }

    /** Builds the ADDRESS node: two address elements plus PHONE and FAX. */
    private static Element buildAddress(PDFTextStripperByArea stripper) {
        String[] lines = splitLines(stripper.getTextForRegion("ADDRESS"));
        Element address = new Element("ADDRESS");
        // NOTE(review): "\\n" is a literal backslash followed by 'n', not a line
        // break. It looks unintended, but it is kept as-is because consumers of
        // the XML may rely on the current output - confirm before changing.
        address.addContent(textElement("ADDRESS_1", part(lines, 0) + "\\n" + part(lines, 1)));
        address.addContent(textElement("ADDRESS_2", part(lines, 2) + "\\n" + part(lines, 3)));
        // The optional fifth line carries phone (token 1) and fax (token 3).
        String[] contact = lines.length > 4 ? lines[4].split(" ") : new String[0];
        address.addContent(textElement("PHONE", part(contact, 1)));
        address.addContent(textElement("FAX", part(contact, 3)));
        return address;
    }

    /** Builds the REPORT_INFO node; each element is added only if its source line exists. */
    private static Element buildReportInfo(PDFTextStripperByArea stripper) {
        String[] lines = splitLines(stripper.getTextForRegion("REPORT_INFO"));
        Element reportInfo = new Element("REPORT_INFO");
        if (lines.length > 0) {
            reportInfo.addContent(textElement("REPORT_NUMBER", part(lines[0].split(":"), 1)));
        }
        if (lines.length > 1) {
            // Both dates share one line; the second label ends up inside the
            // first value after splitting on ':' and is stripped from it.
            String[] dates = lines[1].split(":");
            reportInfo.addContent(
                    textElement("INSPECTION_DATE", part(dates, 1).replace("Certification Date", "")));
            reportInfo.addContent(textElement("CERTIFICATION_DATE", part(dates, 2)));
        }
        if (lines.length > 2) {
            String[] times = lines[2].replace("Time Started:", "").replace("Time Ended:", "").split(" ");
            reportInfo.addContent(textElement("START_TIME", part(times, 0)));
            reportInfo.addContent(textElement("END_TIME", part(times, 1)));
        }
        if (lines.length > 3) {
            reportInfo.addContent(textElement("INSPECTION_LEVEL", part(lines[3].split(":"), 1)));
        }
        if (lines.length > 4) {
            reportInfo.addContent(textElement("INSPECTION_TYPE", part(lines[4].split(":"), 1)));
        }
        return reportInfo;
    }

    /** Builds the IEP_INFO (intermodal equipment provider) node; the first region line is not used. */
    private static Element buildIepInfo(PDFTextStripperByArea stripper) {
        String[] lines = splitLines(stripper.getTextForRegion("IEP_INFO"));
        Element iepInfo = new Element("IEP_INFO");
        if (lines.length > 1) {
            iepInfo.addContent(textElement("IEP_NAME", lines[1]));
        }
        if (lines.length > 2) {
            String[] tokens = lines[2].split(" ");
            iepInfo.addContent(textElement("US_DOT", part(tokens, 3)));
            iepInfo.addContent(textElement("DATA_SOURCE", part(tokens, 6)));
        }
        return iepInfo;
    }

    /** Builds the MC_INFO (motor carrier) node; the first region line is not used. */
    private static Element buildMotorCarrierInfo(PDFTextStripperByArea stripper) {
        String[] lines = splitLines(stripper.getTextForRegion("MC_INFO"));
        Element mcInfo = new Element("MC_INFO");
        if (lines.length > 1) {
            String[] nameAndDriver = lines[1].replace("Driver:", "").split(" ");
            mcInfo.addContent(textElement("MC_NAME", part(nameAndDriver, 0)));
            mcInfo.addContent(textElement("DRIVER", part(nameAndDriver, 1)));
        }
        if (lines.length > 2) {
            mcInfo.addContent(textElement("MC_ADD_1", lines[2]));
        }
        if (lines.length > 3) {
            mcInfo.addContent(textElement("MC_ADD_2", lines[3]));
        }
        if (lines.length > 4) {
            String[] tokens = lines[4].replace("USDOT #:", "").replace("Phone #:", "").trim().split(" ");
            mcInfo.addContent(textElement("US_DOT", part(tokens, 0)));
            // The second value is read from token 2, not token 1.
            mcInfo.addContent(textElement("PHONE", part(tokens, 2)));
        }
        if (lines.length > 5) {
            String[] tokens = lines[5].replace("MC/MX #:", "").replace("Fax #:", "").trim().split(" ");
            mcInfo.addContent(textElement("MC_MX", part(tokens, 0)));
            // Same layout as the phone line: the fax number is token 2.
            mcInfo.addContent(textElement("FAX", part(tokens, 2)));
        }
        if (lines.length > 6) {
            mcInfo.addContent(textElement("STATE", lines[6].replace("State #:", "")));
        }
        if (lines.length > 7) {
            mcInfo.addContent(textElement("ORIGIN_DESTINATION",
                    lines[7].replace("Origin:", "").replace("Destination:", "")));
        }
        return mcInfo;
    }

    /** Builds the VEHICLE_IDENTIFICATION node from the third line of the vehicle region. */
    private static Element buildVehicleIdentification(PDFTextStripperByArea stripper) {
        String[] lines = splitLines(stripper.getTextForRegion("VEHICLE_ID"));
        Element vehicle = new Element("VEHICLE_IDENTIFICATION");
        if (lines.length > 2) {
            addPositionalFields(vehicle, VEHICLE_FIELDS, lines[2].trim().split(" "));
        }
        return vehicle;
    }

    /** Builds the BREAK_ADJUSTMENT node from the text following the first '-' of the region. */
    private static Element buildBrakeAdjustment(PDFTextStripperByArea stripper) {
        String[] segments = stripper.getTextForRegion("BREAK_ADJ").split("-");
        Element brakeAdjustment = new Element("BREAK_ADJUSTMENT");
        if (segments.length > 1) {
            brakeAdjustment.setText(segments[1]);
        }
        return brakeAdjustment;
    }

    /**
     * Builds the OTHER_CHASSIS_VIOLATION node with one OTHER_VIOLATIONS child
     * per data row. The first two lines and the last line of the region are
     * not treated as data rows.
     */
    private static Element buildOtherViolations(PDFTextStripperByArea stripper) {
        String[] lines = splitLines(stripper.getTextForRegion("OTHER_CHASSIS_VIOLATION"));
        Element otherViolations = new Element("OTHER_CHASSIS_VIOLATION");
        for (int row = 2; row < lines.length - 1; row++) {
            Element violation = new Element("OTHER_VIOLATIONS");
            addPositionalFields(violation, VIOLATION_FIELDS, lines[row].split(" "));
            otherViolations.addContent(violation);
        }
        return otherViolations;
    }

    /** Adds one child per available value, named after the field at the same position. */
    private static void addPositionalFields(Element parent, String[] names, String[] values) {
        int count = Math.min(names.length, values.length);
        for (int i = 0; i < count; i++) {
            parent.addContent(textElement(names[i], values[i]));
        }
    }

    /** Creates an element with the given name and text content. */
    private static Element textElement(String name, String text) {
        return new Element(name).setText(text);
    }

    /** Splits region text into its lines. */
    private static String[] splitLines(String regionText) {
        return regionText.split("\\n");
    }

    /** Returns {@code parts[index]}, or an empty string when that position does not exist. */
    private static String part(String[] parts, int index) {
        return index >= 0 && index < parts.length ? parts[index] : "";
    }

    /** @return the directory scanned for DVER PDF files */
    public String getScanDir() {
        return scanDir;
    }

    /** @param scanDir the directory to scan for DVER PDF files */
    public void setScanDir(String scanDir) {
        this.scanDir = scanDir;
    }

    /** @return the configured archive directory */
    public String getArchiveDir() {
        return archiveDir;
    }

    /** @param archiveDir the archive directory */
    public void setArchiveDir(String archiveDir) {
        this.archiveDir = archiveDir;
    }

    /** @return the maximum number of files per scan, as configured */
    public String getMaxFiles() {
        return maxFiles;
    }

    /** @param maxFiles the maximum number of files per scan, as a decimal string */
    public void setMaxFiles(String maxFiles) {
        this.maxFiles = maxFiles;
    }

    /** @return the directory generated XML files are written to */
    public String getXmlDir() {
        return xmlDir;
    }

    /** @param xmlDir the directory generated XML files are written to */
    public void setXmlDir(String xmlDir) {
        this.xmlDir = xmlDir;
    }
}