com.ntc.utils.PdfTextExtractor.java Source code

Java tutorial

Introduction

Here is the source code for com.ntc.utils.PdfTextExtractor.java

Source

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */

package com.ntc.utils;

import java.io.File;
import java.io.IOException;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;

/**
 * Extracts the text of a PDF file one page at a time using PDFBox.
 *
 * <p>Each call to {@link #getNextString()} returns the text of the next page,
 * starting with page 1. Once all pages have been consumed the underlying
 * document is closed and {@code null} is returned.
 *
 * <p>If the file cannot be opened, the failure is logged and the extractor
 * behaves as if the document had no pages: {@link #getNextString()} returns
 * {@code null}.
 *
 * @author ugr
 */
public class PdfTextExtractor extends TextExtractor {
    private static final Logger LOGGER = Logger.getLogger(PdfTextExtractor.class.getName());

    /** Page count of the loaded document; 0 when loading failed. */
    private int numberOfPages;
    /** Stripper reused for every page; {@code null} when initialisation failed. */
    private PDFTextStripper stripper;
    /** The open document; {@code null} when loading failed or after it has been closed. */
    private PDDocument document;
    /** Number of the page most recently requested; 0 before the first call. */
    private int currentPage;

    /**
     * Opens the given PDF file and prepares page-by-page extraction.
     *
     * <p>An {@link IOException} raised while loading the document or creating
     * the text stripper is logged rather than propagated. In that case any
     * partially opened document is closed and the extractor yields no pages.
     *
     * @param file the PDF file to read
     */
    public PdfTextExtractor(File file) {
        try {
            document = PDDocument.load(file);
            numberOfPages = document.getNumberOfPages();
            stripper = new PDFTextStripper();
        } catch (IOException ex) {
            LOGGER.log(Level.SEVERE, null, ex);
            // Leave the instance in a consistent "exhausted" state so that
            // getNextString() returns null instead of dereferencing null fields,
            // and release the document if it was opened before the failure.
            numberOfPages = 0;
            stripper = null;
            closeDocument();
        }
    }

    /**
     * Returns the text of the next page of the document.
     *
     * @return a new {@link StringBuilder} holding the page text, or
     *         {@code null} when there are no more pages, when the document
     *         could not be opened, or when reading the page failed (the
     *         failure is logged; the document stays open so later pages can
     *         still be requested)
     */
    @Override
    public StringBuilder getNextString() {
        if (document == null || stripper == null) {
            // Loading failed, or the document was already exhausted and closed.
            return null;
        }
        if (currentPage >= numberOfPages) {
            // All pages consumed: release the document exactly once.
            closeDocument();
            return null;
        }

        currentPage++;
        try {
            // Restrict the stripper to a single page so getText returns only that page.
            stripper.setStartPage(currentPage);
            stripper.setEndPage(currentPage);
            String pageText = stripper.getText(document);
            if (pageText == null) {
                return null;
            }
            return new StringBuilder(pageText);
        } catch (IOException ex) {
            LOGGER.log(Level.SEVERE, null, ex);
            return null;
        }
    }

    /** Closes the document if it is open, logging any failure, and forgets the reference. */
    private void closeDocument() {
        if (document == null) {
            return;
        }
        try {
            document.close();
        } catch (IOException ex) {
            LOGGER.log(Level.SEVERE, null, ex);
        } finally {
            // Drop the reference even if close() failed so it is never closed twice.
            document = null;
        }
    }
}