javadocofflinesearch.htmlprocessing.PdfAttempter.java Source code

Java tutorial

Introduction

Here is the source code for javadocofflinesearch.htmlprocessing.PdfAttempter.java

Source

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package javadocofflinesearch.htmlprocessing;

import java.io.IOException;
import java.io.InputStream;
import javadocofflinesearch.extensions.Vocabulary;
import org.apache.pdfbox.cos.COSDocument;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;

/**
 * Extracts the plain text of a PDF document and, on request, passes that text
 * to a {@link Vocabulary}.
 *
 * @author jvanek
 */
public class PdfAttempter {

    /** Vocabulary that receives the extracted text when statistics are requested. */
    private final Vocabulary vc;

    /**
     * Creates an attempter bound to the given vocabulary.
     *
     * @param vocabualry vocabulary handed the extracted text by
     *                   {@link #pdftoText(InputStream, boolean)} when its
     *                   {@code stats} flag is set; stored as-is, without a null check
     */
    public PdfAttempter(Vocabulary vocabualry) {
        this.vc = vocabualry;
    }

    /**
     * Parses the PDF read from {@code is} and returns its text content.
     *
     * <p>Both PDFBox document objects created here are closed before the
     * method returns, whether it completes normally or with an exception.
     * This method itself never calls {@code close()} on {@code is}.
     *
     * @param is    stream positioned at the start of the PDF data
     * @param stats when {@code true}, the extracted text is also passed to the
     *              vocabulary via {@code addAll}
     * @return the text produced by {@link PDFTextStripper#getText(PDDocument)}
     * @throws IOException if parsing, text extraction, or closing the
     *                     documents fails
     */
    public String pdftoText(InputStream is, boolean stats) throws IOException {
        PDDocument pdDoc = null;
        COSDocument cosDoc = null;
        try {
            PDFParser parser = new PDFParser(is);
            parser.parse();
            cosDoc = parser.getDocument();
            PDFTextStripper pdfStripper = new PDFTextStripper();
            pdDoc = new PDDocument(cosDoc);
            String text = pdfStripper.getText(pdDoc);
            if (stats) {
                vc.addAll(text);
            }
            return text;
        } finally {
            // Nested try/finally: a failure while closing the COSDocument must
            // not prevent the PDDocument from being closed as well.
            try {
                if (cosDoc != null) {
                    cosDoc.close();
                }
            } finally {
                if (pdDoc != null) {
                    pdDoc.close();
                }
            }
        }
    }

}