it.polito.tellmefirst.parsing.PDFparser.java Source code

Java tutorial

Introduction

Here is the source code for it.polito.tellmefirst.parsing.PDFparser.java

Source

/**
 * TellMeFirst - A Knowledge Discovery Application
 *
 * Copyright (C) 2012 Federico Cairo, Giuseppe Futia, Federico Benedetto
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

package it.polito.tellmefirst.parsing;

import static it.polito.tellmefirst.util.TMFUtils.unchecked;

import java.io.File;
import java.io.FileInputStream;
import it.polito.tellmefirst.exception.TMFVisibleException;
import it.polito.tellmefirst.util.Ret;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.pdfbox.cos.COSDocument;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;

/**
 * Created by IntelliJ IDEA.
 * User: Federico Cairo
 */
public class PDFparser extends TMFTextParser {

    static Log LOG = LogFactory.getLog(PDFparser.class);

    public String pdfToText(File file) throws TMFVisibleException {
        LOG.debug("[pdfToText] - BEGIN");
        String result;
        if (!file.isFile()) {
            throw new TMFVisibleException("File in input is actually not a file.");
        }
        try {
            PDFParser parser = new PDFParser(new FileInputStream(file));
            parser.parse();
            COSDocument cosDoc = parser.getDocument();
            PDFTextStripper pdfStripper = new PDFTextStripper();
            PDDocument pdDoc = new PDDocument(cosDoc);
            //pdfStripper.setStartPage(1);
            //pdfStripper.setEndPage(5);
            // remove syllabification
            String parsedTextWithWrap = pdfStripper.getText(pdDoc);
            result = parsedTextWithWrap.replace("-\n", "");
            if (cosDoc != null)
                cosDoc.close();
            if (pdDoc != null)
                pdDoc.close();
        } catch (Exception e) {
            LOG.error("[pdfToText] - EXCEPTION: ", e);
            throw new TMFVisibleException("Problem parsing file: the PDF document you uploaded seems malformed.");
        }
        LOG.debug("[pdfToText] - END");
        return result;
    }

    // just for testing
    // XXX LOL
    public static void main(String args[]) throws TMFVisibleException {
        PDFparser pdfp = new PDFparser();
        System.out.println(pdfp
                .pdfToText(new File("/home/federico/Scrivania/Progetti/TellMeFirst/files/2011-iscc-paper.pdf")));
    }

    @Override
    public String parse(final File file) {
        return unchecked(new Ret<String>() {
            public String ret() throws Exception {
                return pdfToText(file);
            }
        }, "pdf not parsed");
    }

}