org.haplo.component.pdfbox.TextExtractPDF.java Source code

Introduction

Here is the source code for org.haplo.component.pdfbox.TextExtractPDF.java

Source

/* Haplo Platform                                     http://haplo.org
 * (c) Haplo Services Ltd 2006 - 2016    http://www.haplo-services.com
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.         */

package org.haplo.component.pdfbox;

import java.io.File;
import java.io.IOException;
import java.io.StringWriter;

import org.apache.log4j.Logger;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;

import org.haplo.text.TextExtractOp;

/**
 * Text extraction operation for PDF files, implemented with Apache PDFBox.
 *
 * <p>The pathname handed to the constructor is forwarded unchanged to
 * {@link TextExtractOp}; {@link #extract()} later reads it back through
 * {@code getInputPathname()}.
 */
public class TextExtractPDF extends TextExtractOp {
    /**
     * Creates an extraction operation for a single file.
     *
     * @param inputPathname pathname passed on to the {@code TextExtractOp}
     *                      constructor; presumably the PDF file to read
     */
    public TextExtractPDF(String inputPathname) {
        super(inputPathname);
    }

    /**
     * Loads the file named by {@code getInputPathname()} as a PDF document
     * and renders its text with a {@link PDFTextStripper} that is left in
     * its freshly constructed configuration.
     *
     * @return everything the stripper wrote for the document, as one string
     * @throws IOException if loading the document, stripping its text, or
     *                     closing the document fails
     */
    protected String extract() throws IOException {
        final File source = new File(getInputPathname());
        // try-with-resources closes the document even when stripping throws.
        try (PDDocument document = PDDocument.load(source)) {
            final PDFTextStripper textStripper = new PDFTextStripper();
            final StringWriter sink = new StringWriter();
            textStripper.writeText(document, sink);
            // The result string is built before the document is closed.
            return sink.toString();
        }
    }
}