Java tutorial
/******************************************************************************* * Copyright (c) 2011 Tran Nam Quang. * All rights reserved. This program and the accompanying materials * are made available under the terms of the Eclipse Public License v1.0 * which accompanies this distribution, and is available at * http://www.eclipse.org/legal/epl-v10.html * * Contributors: * Tran Nam Quang - initial API and implementation *******************************************************************************/ package net.sourceforge.vaticanfetcher.model.parse; import java.io.IOException; import java.io.InputStream; import java.io.StringWriter; import java.util.Collection; import java.util.Collections; import net.sourceforge.vaticanfetcher.enums.Msg; import net.sourceforge.vaticanfetcher.util.annotations.NotNull; import net.sourceforge.vaticanfetcher.util.annotations.Nullable; import org.apache.pdfbox.exceptions.CryptographyException; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDDocumentInformation; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.util.PDFTextStripper; public final class PdfParser extends StreamParser { private static final Collection<String> extensions = Collections.singleton("pdf"); private static final Collection<String> types = MediaType.Col.application("pdf"); PdfParser() { } @Override protected ParseResult parse(@NotNull InputStream in, @NotNull final ParseContext context) throws ParseException { PDDocument pdfDoc = null; try { /* TODO post-release-1.1: check if 'force' argument in PDDocument/Stripper increases number of parsed PDF files */ pdfDoc = PDDocument.load(in, true); PDDocumentInformation pdInfo; final int pageCount; try { pdInfo = pdfDoc.getDocumentInformation(); pageCount = pdfDoc.getNumberOfPages(); } catch (ClassCastException e) { // Bug #3529070 and #3528345 throw new ParseException(e); } StringWriter writer = new StringWriter(); /* * If the PDF file is encrypted, the PDF stripper will automatically try an empty password. * * In contrast to the paging PDF parser that is used for the preview, we do not need to call * setSortByPosition(true) here because the extracted text will be digested by Lucene anyway. */ PDFTextStripper stripper = new PDFTextStripper() { protected void startPage(PDPage page) throws IOException { context.getReporter().subInfo(getCurrentPageNo(), pageCount); } protected void endPage(PDPage page) throws IOException { if (context.getCancelable().isCanceled()) setEndPage(0); } }; stripper.setForceParsing(true); try { stripper.writeText(pdfDoc, writer); } catch (RuntimeException e) { /* PDFTextStripper.writeText can throw various RuntimeExceptions, see bugs #3446010, #3448272, #3444887. */ throw new ParseException(e); } return new ParseResult(writer.getBuffer()).setTitle(pdInfo.getTitle()).addAuthor(pdInfo.getAuthor()) .addMiscMetadata(pdInfo.getSubject()).addMiscMetadata(pdInfo.getKeywords()); } catch (IOException e) { if (e.getCause() instanceof CryptographyException) throw new ParseException(Msg.doc_pw_protected.get()); throw new ParseException(e); } finally { close(pdfDoc); } } static void close(@Nullable PDDocument doc) { if (doc != null) { try { doc.close(); } catch (IOException e) { } } } protected Collection<String> getExtensions() { return extensions; } protected Collection<String> getTypes() { return types; } public String getTypeLabel() { return Msg.filetype_pdf.get(); } }