Java tutorial
/** * pdfIdiom.java * Copyright 2010 by Michael Peter Christen * First released 27.4.2010 at http://yacy.net * * This file is part of YaCy Content Integration * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program in the file COPYING.LESSER. * If not, see <http://www.gnu.org/licenses/>. */ package net.yacy.cider.parser.idiom; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.OutputStreamWriter; import java.io.UnsupportedEncodingException; import java.io.Writer; import java.util.HashSet; import java.util.Set; import org.apache.pdfbox.exceptions.CryptographyException; import org.apache.pdfbox.pdfparser.PDFParser; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDDocumentInformation; import org.apache.pdfbox.pdmodel.encryption.AccessPermission; import org.apache.pdfbox.pdmodel.encryption.BadSecurityHandlerException; import org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial; import org.apache.pdfbox.util.PDFTextStripper; import com.hp.hpl.jena.rdf.model.Model; import com.hp.hpl.jena.rdf.model.ModelFactory; import com.hp.hpl.jena.rdf.model.Resource; import com.hp.hpl.jena.vocabulary.DC; import com.hp.hpl.jena.vocabulary.VCARD; import net.yacy.cider.document.DataSource; import net.yacy.cider.document.Extension; import net.yacy.cider.document.MimeType; import net.yacy.cider.parser.AbstractIdiom; import net.yacy.cider.parser.Idiom; import net.yacy.cider.parser.ParserException; import net.yacy.cider.vocabulary.CIDER; public class pdfIdiom extends AbstractIdiom implements Idiom { private static final Set<MimeType> SUPPORTED_MIME_TYPES = new HashSet<MimeType>(); private static final Set<Extension> SUPPORTED_EXTENSIONS = new HashSet<Extension>(); private static final Set<String> USED_VOCABULARIES = new HashSet<String>(); static { SUPPORTED_EXTENSIONS.add(Extension.PDF); SUPPORTED_MIME_TYPES.add(MimeType.APPLICATION_PDF); SUPPORTED_MIME_TYPES.add(MimeType.APPLICATION_XPDF); SUPPORTED_MIME_TYPES.add(MimeType.APPLICATION_ACROBAT); SUPPORTED_MIME_TYPES.add(MimeType.APPLICATION_VNDPDF); SUPPORTED_MIME_TYPES.add(MimeType.TEXT_PDF); SUPPORTED_MIME_TYPES.add(MimeType.TEXT_XPDF); USED_VOCABULARIES.add(DC.getURI()); USED_VOCABULARIES.add(VCARD.getURI()); USED_VOCABULARIES.add(CIDER.getDataURI()); } public pdfIdiom() { super("Acrobat Portable Document Parser"); } @Override public Set<MimeType> supportedMimeTypes() { return SUPPORTED_MIME_TYPES; } @Override public Set<Extension> supportedExtensions() { return SUPPORTED_EXTENSIONS; } public Set<String> usedVocabularies() { return USED_VOCABULARIES; } @Override public Model parse(DataSource source) throws ParserException { // create an empty Model Model model = ModelFactory.createDefaultModel(); Resource resource = source.hasURI() ? model.createResource(source.getURI().toNormalform(true, true)) : model.createResource(); // open pdf document final PDDocument theDocument; final PDFParser parser; try { parser = new PDFParser(source.getStream()); parser.parse(); theDocument = parser.getPDDocument(); } catch (IOException e) { log.error(e.getMessage(), e); throw new ParserException(e.getMessage(), source.getURI()); } if (theDocument.isEncrypted()) { try { theDocument.openProtection(new StandardDecryptionMaterial("")); } catch (BadSecurityHandlerException e) { throw new ParserException("PDF Encrypted (BadSecurityHandlerException): " + e.getMessage(), source.getURI(), e); } catch (IOException e) { throw new ParserException("PDF Encrypted (IOException): " + e.getMessage(), source.getURI(), e); } catch (CryptographyException e) { throw new ParserException("PDF Encrypted (CryptographyException): " + e.getMessage(), source.getURI(), e); } final AccessPermission perm = theDocument.getCurrentAccessPermission(); if (perm == null || !perm.canExtractContent()) throw new ParserException("PDF cannot be decrypted", source.getURI()); } // get metadata final PDDocumentInformation theDocInfo = theDocument.getDocumentInformation(); String docTitle = null, docSubject = null, docAuthor = null, docKeywordStr = null; if (theDocInfo != null) { docTitle = theDocInfo.getTitle(); docSubject = theDocInfo.getSubject(); docAuthor = theDocInfo.getAuthor(); docKeywordStr = theDocInfo.getKeywords(); } if (docAuthor != null && docAuthor.length() > 0) { resource.addProperty(VCARD.FN, docAuthor); resource.addProperty(DC.creator, docAuthor); } if (docSubject != null && docSubject.length() > 0) { resource.addProperty(DC.subject, docSubject); } if (docTitle != null && docTitle.length() > 0) { resource.addProperty(DC.title, docTitle); } String[] docKeywords = null; if (docKeywordStr != null && docKeywordStr.length() > 0) { docKeywords = docKeywordStr.split(" |,"); resource.addProperty(DC.coverage, concat(docKeywords)); } // get the content ByteArrayOutputStream baos = new ByteArrayOutputStream(); Writer writer; try { writer = new OutputStreamWriter(baos, "UTF-8"); } catch (UnsupportedEncodingException e1) { writer = new OutputStreamWriter(baos); } try { final PDFTextStripper stripper = new PDFTextStripper(); stripper.writeText(theDocument, writer); theDocument.close(); writer.close(); } catch (IOException e) { if (writer != null) try { writer.close(); } catch (final Exception ex) { } throw new ParserException("PDF content reader", source.getURI(), e); } String content; try { content = new String(baos.toByteArray(), "UTF-8"); } catch (UnsupportedEncodingException e) { content = new String(baos.toByteArray()); } if (content != null && content.length() > 0) { resource.addProperty(CIDER.data_content_text, content); } return model; } }