net.yacy.cider.parser.idiom.pdfIdiom.java Source code

Introduction

Here is the source code for net.yacy.cider.parser.idiom.pdfIdiom.java
Source

/**
 *  pdfIdiom.java
 *  Copyright 2010 by Michael Peter Christen
 *  First released 27.4.2010 at http://yacy.net
 *  
 *  This file is part of YaCy Content Integration
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Lesser General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program in the file COPYING.LESSER.
 *  If not, see <http://www.gnu.org/licenses/>.
 */

package net.yacy.cider.parser.idiom;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.io.Writer;
import java.util.HashSet;
import java.util.Set;

import org.apache.pdfbox.exceptions.CryptographyException;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
import org.apache.pdfbox.pdmodel.encryption.BadSecurityHandlerException;
import org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial;
import org.apache.pdfbox.util.PDFTextStripper;

import com.hp.hpl.jena.rdf.model.Model;
import com.hp.hpl.jena.rdf.model.ModelFactory;
import com.hp.hpl.jena.rdf.model.Resource;
import com.hp.hpl.jena.vocabulary.DC;
import com.hp.hpl.jena.vocabulary.VCARD;

import net.yacy.cider.document.DataSource;
import net.yacy.cider.document.Extension;
import net.yacy.cider.document.MimeType;
import net.yacy.cider.parser.AbstractIdiom;
import net.yacy.cider.parser.Idiom;
import net.yacy.cider.parser.ParserException;
import net.yacy.cider.vocabulary.CIDER;

/**
 * Idiom that parses PDF documents with PDFBox and stores the result in a Jena {@link Model}.
 * <p>
 * The document information entries (author, subject, title, keywords) are mapped to
 * {@link DC} and {@link VCARD} properties; the extracted plain text is stored as
 * {@link CIDER#data_content_text}.
 */
public class pdfIdiom extends AbstractIdiom implements Idiom {

    private static final Set<MimeType> SUPPORTED_MIME_TYPES = new HashSet<MimeType>();
    private static final Set<Extension> SUPPORTED_EXTENSIONS = new HashSet<Extension>();
    private static final Set<String> USED_VOCABULARIES = new HashSet<String>();

    static {
        SUPPORTED_EXTENSIONS.add(Extension.PDF);
        SUPPORTED_MIME_TYPES.add(MimeType.APPLICATION_PDF);
        SUPPORTED_MIME_TYPES.add(MimeType.APPLICATION_XPDF);
        SUPPORTED_MIME_TYPES.add(MimeType.APPLICATION_ACROBAT);
        SUPPORTED_MIME_TYPES.add(MimeType.APPLICATION_VNDPDF);
        SUPPORTED_MIME_TYPES.add(MimeType.TEXT_PDF);
        SUPPORTED_MIME_TYPES.add(MimeType.TEXT_XPDF);
        USED_VOCABULARIES.add(DC.getURI());
        USED_VOCABULARIES.add(VCARD.getURI());
        USED_VOCABULARIES.add(CIDER.getDataURI());
    }

    /** Creates the idiom and registers its display name with the super class. */
    public pdfIdiom() {
        super("Acrobat Portable Document Parser");
    }

    /**
     * @return the mime types this idiom accepts; this is the shared static set, not a copy
     */
    @Override
    public Set<MimeType> supportedMimeTypes() {
        return SUPPORTED_MIME_TYPES;
    }

    /**
     * @return the file extensions this idiom accepts; this is the shared static set, not a copy
     */
    @Override
    public Set<Extension> supportedExtensions() {
        return SUPPORTED_EXTENSIONS;
    }

    /**
     * @return the URIs of the vocabularies used by {@link #parse(DataSource)};
     *         this is the shared static set, not a copy
     */
    public Set<String> usedVocabularies() {
        return USED_VOCABULARIES;
    }

    /**
     * Parses a PDF document and returns its metadata and text content as a model.
     * <p>
     * Encrypted documents are opened with an empty password; if that does not grant the
     * permission to extract content the parse fails. The PDF document is closed on every
     * exit path, including failures.
     *
     * @param source the data source providing the PDF stream and, optionally, its URI
     * @return a new model holding one resource that describes the document
     * @throws ParserException if the document cannot be read, decrypted or stripped
     */
    @Override
    public Model parse(DataSource source) throws ParserException {
        // create an empty Model
        final Model model = ModelFactory.createDefaultModel();
        final Resource resource = source.hasURI()
                ? model.createResource(source.getURI().toNormalform(true, true))
                : model.createResource();

        final PDDocument theDocument = openDocument(source);
        // set before the regular close attempt so that the finally block never closes twice
        boolean closeAttempted = false;
        try {
            decryptIfEncrypted(theDocument, source);
            addMetadata(resource, theDocument.getDocumentInformation());

            final String content;
            try {
                content = new PDFTextStripper().getText(theDocument);
                // a failing close on the success path is still reported to the caller
                closeAttempted = true;
                theDocument.close();
            } catch (IOException e) {
                throw new ParserException("PDF content reader", source.getURI(), e);
            }
            if (content.length() > 0) {
                resource.addProperty(CIDER.data_content_text, content);
            }
        } finally {
            // error paths: release the document without masking the original exception
            if (!closeAttempted) {
                closeQuietly(theDocument);
            }
        }

        return model;
    }

    /**
     * Reads the stream of the source into a PDF document.
     *
     * @param source the data source to read from
     * @return the parsed document; the caller is responsible for closing it
     * @throws ParserException if reading or parsing the stream fails
     */
    private PDDocument openDocument(DataSource source) throws ParserException {
        try {
            final PDFParser parser = new PDFParser(source.getStream());
            parser.parse();
            return parser.getPDDocument();
        } catch (IOException e) {
            log.error(e.getMessage(), e);
            // keep the cause so that the stack trace of the I/O problem is not lost
            throw new ParserException(e.getMessage(), source.getURI(), e);
        }
    }

    /**
     * Tries to open an encrypted document with an empty password and verifies that
     * content extraction is permitted. Does nothing for unencrypted documents.
     *
     * @param document the document to decrypt
     * @param source   the data source, used for error reporting only
     * @throws ParserException if decryption fails or content extraction is not permitted
     */
    private void decryptIfEncrypted(PDDocument document, DataSource source) throws ParserException {
        if (!document.isEncrypted()) {
            return;
        }
        try {
            document.openProtection(new StandardDecryptionMaterial(""));
        } catch (BadSecurityHandlerException e) {
            throw new ParserException("PDF Encrypted (BadSecurityHandlerException): " + e.getMessage(),
                    source.getURI(), e);
        } catch (IOException e) {
            throw new ParserException("PDF Encrypted (IOException): " + e.getMessage(), source.getURI(), e);
        } catch (CryptographyException e) {
            throw new ParserException("PDF Encrypted (CryptographyException): " + e.getMessage(),
                    source.getURI(), e);
        }
        final AccessPermission perm = document.getCurrentAccessPermission();
        if (perm == null || !perm.canExtractContent()) {
            throw new ParserException("PDF cannot be decrypted", source.getURI());
        }
    }

    /**
     * Copies the non-empty document information entries to the resource.
     *
     * @param resource the resource that receives the properties
     * @param info     the document information, may be <code>null</code>
     */
    private void addMetadata(Resource resource, PDDocumentInformation info) {
        if (info == null) {
            return;
        }
        final String docAuthor = info.getAuthor();
        if (docAuthor != null && docAuthor.length() > 0) {
            resource.addProperty(VCARD.FN, docAuthor);
            resource.addProperty(DC.creator, docAuthor);
        }
        final String docSubject = info.getSubject();
        if (docSubject != null && docSubject.length() > 0) {
            resource.addProperty(DC.subject, docSubject);
        }
        final String docTitle = info.getTitle();
        if (docTitle != null && docTitle.length() > 0) {
            resource.addProperty(DC.title, docTitle);
        }
        final String docKeywordStr = info.getKeywords();
        if (docKeywordStr != null && docKeywordStr.length() > 0) {
            // keywords are separated by single blanks or commas
            // NOTE(review): ", " yields empty tokens; presumably concat() copes with them - confirm
            resource.addProperty(DC.coverage, concat(docKeywordStr.split(" |,")));
        }
    }

    /**
     * Closes the document and only logs a failure, so that an exception which is
     * already propagating is not replaced by the close failure.
     *
     * @param document the document to close
     */
    private void closeQuietly(PDDocument document) {
        try {
            document.close();
        } catch (IOException e) {
            log.error(e.getMessage(), e);
        }
    }

}