de.oio.jpdfunit.document.pdflibimpl.PdfBoxAnalyser.java Source code

Introduction

Here is the source code for de.oio.jpdfunit.document.pdflibimpl.PdfBoxAnalyser.java
Source

/*
 * JPdfUnit- make your pdf green
 * Copyright (C) 2005 Orientation in Objects GmbH
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the Apache License Version 2.0.
 * There is a copy of this license along with this library.
 * Otherwise see terms of license at apache.org
 *
 * Feel free to contact us:
 *
 * jpdfunit-users@lists.sourceforge.net
 *
 * $Id: PdfBoxAnalyser.java,v 1.2 2011/12/09 14:03:49 s_schaefer Exp $
 */

package de.oio.jpdfunit.document.pdflibimpl;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

import org.apache.pdfbox.cos.COSDocument;
import org.apache.pdfbox.exceptions.CryptographyException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.encryption.PDEncryptionDictionary;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
import org.apache.pdfbox.util.PDFTextStripper;

import de.oio.jpdfunit.document.util.ITextSearcher;
import de.oio.jpdfunit.document.util.PdfImplUtilResourceBundle;
import de.oio.jpdfunit.document.util.TextSearchType;

/**
 * Adapter between JPdfUnit and the PDFBox library. The class extends PDFBox's
 * {@link PDFTextStripper} to extract text and wraps a {@link PDDocument} to
 * answer questions about the document's meta data, encryption, fonts and
 * bookmarks.
 * <p>
 * Page numbers accepted and returned by the text related methods are 1-based.
 * The instance keeps mutable state (the stripper's page range and cached
 * results) and performs no synchronisation.
 *
 * @author bbratkus
 */
class PdfBoxAnalyser extends PDFTextStripper {
    private static final String PARAMETER = PdfImplUtilResourceBundle.getString("PdfBoxAnalyser.inital");

    private static final String CANTDECRYPT = PdfImplUtilResourceBundle.getString("PdfBoxAnalyser.decrypt");

    private static final String NOCONTENT = PdfImplUtilResourceBundle.getString("PdfBoxAnalyser.content");

    private static final String NODOCINIT = PdfImplUtilResourceBundle.getString("PdfBoxAnalyser.docInital");

    /** The PDFBox document every query of this class is answered from. */
    private final transient PDDocument pdDocument;

    /** Text of the whole document; stays null while the document is encrypted. */
    private transient StringBuffer textbuffer = null;

    /** Fonts collected by the most recent run of {@link #getDocumentFonts()}. */
    private transient LinkedList<PdfBoxFontAdapter> fonts;

    /** Bookmark titles collected by the most recent run of {@link #getBookmarks()}. */
    private transient ArrayList<String> bookMarkList;

    /**
     * Creates an analyser for the pdf file at the given path.
     *
     * @param file
     *        The path and the file as String. I.e. "/home/bbratkus/test.pdf".
     * @throws IllegalArgumentException
     *         if the path is null or empty, if the document cannot be loaded,
     *         or if its text cannot be extracted.
     * @throws IOException
     *         declared because the superclass constructor declares it.
     */
    public PdfBoxAnalyser(final String file) throws IOException {
        // Test for null first; the reverse order dereferenced a null argument.
        if ((file == null) || file.equals("")) { //$NON-NLS-1$
            throw new IllegalArgumentException(PARAMETER);
        }
        try {
            pdDocument = PDDocument.load(file);
        } catch (final IOException ioe) {
            throw new IllegalArgumentException(NODOCINIT, ioe);
        }
        initialiseCaches();
    }

    /**
     * Creates an analyser for the pdf document read from the given stream.
     *
     * @param pdfStream
     *        The stream which the pdf file is within.
     * @throws IllegalArgumentException
     *         if the stream is null, if the document cannot be loaded, or if
     *         its text cannot be extracted.
     * @throws IOException
     *         declared because the superclass constructor declares it.
     */
    public PdfBoxAnalyser(final InputStream pdfStream) throws IOException {
        if (pdfStream == null) {
            throw new IllegalArgumentException(PARAMETER);
        }
        try {
            pdDocument = PDDocument.load(pdfStream);
        } catch (final IOException ioe) {
            throw new IllegalArgumentException(NODOCINIT, ioe);
        }
        initialiseCaches();
    }

    /**
     * Fills the text and font caches right after the document was loaded. If
     * that fails the document is closed before the failure is passed on, so a
     * half constructed analyser does not keep the document open.
     */
    private void initialiseCaches() {
        try {
            setContentAsStringBuffer();
            getDocumentFonts();
        } catch (final RuntimeException failure) {
            try {
                pdDocument.close();
            } catch (final IOException ignored) {
                // The extraction failure is the one worth reporting.
            }
            throw failure;
        }
    }

    /**
     * Extracts the text of the whole document into the internal buffer. Nothing
     * happens while the document is encrypted.
     *
     * @throws IllegalArgumentException
     *         if the text cannot be extracted from the PDDocument.
     */
    private void setContentAsStringBuffer() {
        if (pdDocument.isEncrypted()) {
            return;
        }
        // getContentOnPage narrows the stripper's page range to a single page.
        // Widen it again, otherwise the "whole document" text would silently
        // consist of the page that was queried last.
        setStartPage(1);
        setEndPage(Integer.MAX_VALUE);
        try {
            this.textbuffer = new StringBuffer(getText(pdDocument));
        } catch (final IOException e) {
            throw new IllegalArgumentException(NOCONTENT, e);
        }
    }

    /**
     * Closes the document.
     *
     * @throws IllegalStateException
     *         if the document cannot be closed.
     */
    public void closeDocument() {
        try {
            pdDocument.close();
        } catch (final IOException ioe) {
            throw new IllegalStateException(PdfImplUtilResourceBundle.getString("PdfBoxAnalyser.close"), ioe);
        }
    }

    /**
     * Returns the number of pages of the PDDocument.
     *
     * @return The page count of the document.
     */
    public int countPages() {
        return pdDocument.getNumberOfPages();
    }

    /**
     * Tries to decrypt the document with the given password.
     *
     * @param passwd
     *        The supposed owner or user password of the document.
     * @throws IllegalArgumentException
     *         if the password is null.
     * @throws IllegalStateException
     *         if decrypting fails or the document still reports itself as
     *         encrypted afterwards.
     */
    public void decryptDocument(final String passwd) {
        if (passwd == null) {
            throw new IllegalArgumentException(PARAMETER);
        }

        try {
            pdDocument.decrypt(passwd);
        } catch (final CryptographyException e) {
            throw new IllegalStateException(CANTDECRYPT, e);
        } catch (final IOException e) {
            throw new IllegalStateException(CANTDECRYPT, e);
        }
        if (pdDocument.isEncrypted()) {
            throw new IllegalStateException(PdfImplUtilResourceBundle.getString("PdfBoxAnalyser.encrypted"));
        }
    }

    /**
     * Returns the author stored in the document information.
     *
     * @return The author's name as set in the document information.
     */
    public String getAuthor() {
        return pdDocument.getDocumentInformation().getAuthor();
    }

    /**
     * Returns the text of the whole document. The text is extracted again on
     * every call.
     *
     * @return The whole content of the pdf document.
     * @throws IllegalStateException
     *         if no text is available because the document is still encrypted.
     * @throws IllegalArgumentException
     *         if the text cannot be extracted.
     */
    public String getContent() {
        setContentAsStringBuffer();
        if (textbuffer == null) {
            // Only reachable while the document is encrypted: the buffer is
            // never filled in that state.
            throw new IllegalStateException(NOCONTENT);
        }
        return textbuffer.toString();
    }

    /**
     * Returns the text of a single page.
     *
     * @param page
     *        The page number of the page which content should be returned. The
     *        page count is 1-based.
     * @return The content of the selected page.
     * @throws IllegalArgumentException
     *         if the page number is smaller than or equal to 0, or if the text
     *         cannot be extracted.
     */
    public String getContentOnPage(final int page) {
        if (page <= 0) {
            throw new IllegalArgumentException(PARAMETER);
        }
        // Restrict the inherited stripper to exactly the requested page.
        setStartPage(page);
        setEndPage(page);
        try {
            return getText(pdDocument);
        } catch (final IOException ioe) {
            throw new IllegalArgumentException(NOCONTENT, ioe);
        }
    }

    /**
     * Returns the creator stored in the document information.
     *
     * @return The creator of the document.
     */
    public String getCreator() {
        return pdDocument.getDocumentInformation().getCreator();
    }

    /**
     * Returns the creation date stored in the document information.
     *
     * @return The creation date as set in the document information.
     * @throws IllegalStateException
     *         if the date cannot be read.
     */
    public Calendar getCreationDate() {
        try {
            return pdDocument.getDocumentInformation().getCreationDate();
        } catch (final IOException e) {
            throw new IllegalStateException(e);
        }
    }

    /**
     * Returns the key length stored in the document's encryption dictionary.
     *
     * @return The length reported by the encryption dictionary, or 0 if the
     *         document is not encrypted or the dictionary cannot be read.
     */
    public int getEncryptionLength() {
        if (!pdDocument.isEncrypted()) {
            return 0;
        }
        try {
            final PDEncryptionDictionary dictionary = pdDocument.getEncryptionDictionary();
            return dictionary.getLength();
        } catch (final IOException ignored) {
            // Kept as best effort: an unreadable dictionary is reported as
            // "no length" instead of failing the caller.
            return 0;
        }
    }

    /**
     * Returns the first page on which the text appears.
     *
     * @param text
     *        The string or regular expression to search for in the document.
     *        Be careful, regular expressions are greedy.
     * @param type
     *        The kind of search.
     * @return The 1-based number of the first page the text was found on, or -1
     *         if no page contains the text.
     * @throws IllegalArgumentException
     *         if text or type is null.
     */
    public int getFirstPageForContent(final String text, final TextSearchType type) {
        checkSearchArguments(text, type);

        final ITextSearcher textsearcher = type.getSearcher();
        final int pageCount = pdDocument.getNumberOfPages();
        for (int page = 1; page <= pageCount; page++) {
            if (textsearcher.isTextContent(text, this.getContentOnPage(page))) {
                // Stop at the first hit; continuing would report the last
                // matching page instead of the first one.
                return page;
            }
        }
        return -1;
    }

    /**
     * Returns the keywords stored in the document information.
     *
     * @return The keywords as set in the document information.
     */
    public String getKeywords() {
        return pdDocument.getDocumentInformation().getKeywords();
    }

    /**
     * Returns the pages on which the text appears as an array.
     *
     * @param text
     *        The string or regular expression to search for in the document.
     *        Be careful, regular expressions are greedy.
     * @param type
     *        The kind of search. The type is related to the pages of the
     *        document.
     * @return An empty array if no page contains the text. Otherwise an array
     *         with one slot per page of the document: the slot at index
     *         <code>p - 1</code> holds <code>p</code> if page <code>p</code>
     *         contains the text and 0 if it does not.
     * @throws IllegalArgumentException
     *         if text or type is null.
     * @see de.oio.jpdfunit.document.IContent
     * @deprecated As of version 0.93, replaced by
     *             <code>Content.getListOfPagesForContent(String text, TextSearchType type)</code>
     *             .
     */
    @Deprecated
    public int[] getPagesForContent(final String text, final TextSearchType type) {
        checkSearchArguments(text, type);

        final ITextSearcher textsearcher = type.getSearcher();
        final int pageCount = pdDocument.getNumberOfPages();
        final int[] result = new int[pageCount];
        boolean positive = false;
        for (int page = 1; page <= pageCount; page++) {
            if (textsearcher.isTextContent(text, this.getContentOnPage(page))) {
                // Pages are 1-based, array slots 0-based. Indexing with the
                // page number itself overflowed the array on the last page.
                result[page - 1] = page;
                positive = true;
            }
        }
        return positive ? result : new int[0];
    }

    /**
     * Returns the pages on which the text appears as a list.
     *
     * @param text
     *        The string or regular expression to search for in the document.
     *        Be careful, regular expressions are greedy.
     * @param type
     *        The kind of search. The type is related to the pages of the
     *        document.
     * @return The 1-based numbers of all pages containing the text in ascending
     *         order; an empty list if no page contains it.
     * @throws IllegalArgumentException
     *         if text or type is null.
     */
    public List<Integer> getListOfPagesForContent(final String text, final TextSearchType type) {
        checkSearchArguments(text, type);

        final ITextSearcher textsearcher = type.getSearcher();
        final int pageCount = pdDocument.getNumberOfPages();
        final List<Integer> pages = new ArrayList<Integer>();
        for (int page = 1; page <= pageCount; page++) {
            if (textsearcher.isTextContent(text, this.getContentOnPage(page))) {
                pages.add(Integer.valueOf(page));
            }
        }
        return pages;
    }

    /**
     * Returns the producer stored in the document information.
     *
     * @return The producer as set in the document information.
     */
    public String getProducer() {
        return pdDocument.getDocumentInformation().getProducer();
    }

    /**
     * Returns the subject stored in the document information.
     *
     * @return The subject as set in the document information.
     */
    public String getSubject() {
        return pdDocument.getDocumentInformation().getSubject();
    }

    /**
     * Returns the title stored in the document information.
     *
     * @return The title as set in the document information.
     */
    public String getTitle() {
        return pdDocument.getDocumentInformation().getTitle();
    }

    /**
     * Returns the version reported by the COSDocument of the PDDocument.
     *
     * @return The version of the tested pdf file.
     */
    public float getVersion() {
        final COSDocument cosDoc = pdDocument.getDocument();
        return cosDoc.getVersion();
    }

    /**
     * Checks whether the text appears on any page of the document.
     *
     * @param text
     *        The expected text which should be content of the document; a
     *        regular expression can be used as well.
     * @param type
     *        The kind of search. The type is related to the pages of the
     *        document.
     * @return true if at least one page contains the text.
     * @throws IllegalArgumentException
     *         if text or type is null.
     */
    public boolean isTextContent(final String text, final TextSearchType type) {
        // The first-match search stops at the first hit, so pages after it are
        // not extracted needlessly.
        return getFirstPageForContent(text, type) != -1;
    }

    /**
     * Checks whether the text appears on a certain page.
     *
     * @param text
     *        The expected text or regular expression which should be content of
     *        the document.
     * @param type
     *        The kind of search. The type is related to the pages of the
     *        document.
     * @param page
     *        The page of the document which should be searched for the text. The
     *        page count is 1-based.
     * @return true if the text is found on the given page.
     * @throws IllegalArgumentException
     *         if text or type is null or the page number is smaller than or
     *         equal to 0.
     */
    public boolean isTextContentOnPage(final String text, final TextSearchType type, final int page) {
        if ((text == null) || (type == null) || (page <= 0)) {
            throw new IllegalArgumentException(PARAMETER);
        }
        final ITextSearcher textsearcher = type.getSearcher();
        return textsearcher.isTextContent(text, this.getContentOnPage(page));
    }

    /**
     * Checks whether the PDDocument is encrypted.
     *
     * @return true if the document is encrypted.
     */
    public boolean isDocumentEncrypted() {
        return pdDocument.isEncrypted();
    }

    /**
     * Checks whether the given password is the owner password of the
     * PDDocument.
     *
     * @param expected
     *        The expected owner password. The owner password is required for
     *        changing the attributes of the document.
     * @return true if the document is encrypted and the expected password
     *         matches its owner password; false otherwise.
     * @throws IllegalArgumentException
     *         if the expected password is null.
     * @throws IllegalStateException
     *         if the password check itself fails.
     */
    public boolean isOwnerPasswd(final String expected) {
        if (expected == null) {
            throw new IllegalArgumentException(PARAMETER);
        }
        if (!pdDocument.isEncrypted()) {
            return false;
        }
        try {
            return pdDocument.isOwnerPassword(expected);
        } catch (final IOException e) {
            throw new IllegalStateException(CANTDECRYPT, e);
        } catch (final CryptographyException e) {
            throw new IllegalStateException(CANTDECRYPT, e);
        }
    }

    /**
     * Checks whether the given password is the user password of the
     * PDDocument.
     *
     * @param expected
     *        The expected user password. The user password is required for i.e.
     *        reading the document.
     * @return true if the document is encrypted and the expected password
     *         matches its user password; false otherwise.
     * @throws IllegalArgumentException
     *         if the expected password is null.
     * @throws IllegalStateException
     *         if the password check itself fails.
     */
    public boolean isUserPasswd(final String expected) {
        if (expected == null) {
            throw new IllegalArgumentException(PARAMETER);
        }
        if (!pdDocument.isEncrypted()) {
            return false;
        }
        try {
            return pdDocument.isUserPassword(expected);
        } catch (final IOException e) {
            throw new IllegalStateException(CANTDECRYPT, e);
        } catch (final CryptographyException e) {
            throw new IllegalStateException(CANTDECRYPT, e);
        }
    }

    /**
     * Collects the fonts of the document again and returns them.
     *
     * @return A LinkedList with one entry per font and page, holding the base
     *         font name, the font sub type and the page index. The list is
     *         empty while the document is encrypted.
     */
    public LinkedList<PdfBoxFontAdapter> getAllFontsInDocument() {
        getDocumentFonts();
        return fonts;
    }

    /**
     * Rebuilds the font list from the resources of every page. Encrypted
     * documents yield an empty list; pages without resources or fonts are
     * skipped.
     */
    private void getDocumentFonts() {
        fonts = new LinkedList<PdfBoxFontAdapter>();
        if (pdDocument.isEncrypted()) {
            return;
        }
        // Resolve the page list once instead of once per page.
        final List<?> pages = pdDocument.getDocumentCatalog().getAllPages();
        for (int page = 0; page < pages.size(); page++) {
            final PDResources resources = ((PDPage) pages.get(page)).findResources();
            final Map<?, ?> pageFonts = (resources == null) ? null : resources.getFonts();
            if (pageFonts == null) {
                continue;
            }
            for (final Object entry : pageFonts.values()) {
                final PDFont font = (PDFont) entry;
                // NOTE(review): the adapter receives the 0-based page index
                // while the text methods of this class use 1-based page
                // numbers - confirm against PdfBoxFontAdapter.
                fonts.add(new PdfBoxFontAdapter(font.getBaseFont(), font.getSubType(), page));
            }
        }
    }

    /**
     * Collects the bookmarks of the pdf document again and returns them.
     *
     * @return The list containing the bookmark titles; empty if the document
     *         has no outline.
     */
    public List<String> getAllBookmarks() {
        getBookmarks();
        return bookMarkList;
    }

    /**
     * Rebuilds the bookmark list by walking the document outline, if there is
     * one.
     */
    private void getBookmarks() {
        bookMarkList = new ArrayList<String>();
        final PDDocumentOutline root = pdDocument.getDocumentCatalog().getDocumentOutline();
        if (root != null) {
            rekursionBookmarks(root.getFirstChild());
        }
    }

    /**
     * Adds the title of the given outline item, of all its descendants and of
     * all its following siblings to the bookmark list. Each item is followed
     * directly by its own descendants.
     *
     * @param item
     *        The first outline item of a level; may be null.
     */
    private void rekursionBookmarks(final PDOutlineItem item) {
        for (PDOutlineItem current = item; current != null; current = current.getNextSibling()) {
            bookMarkList.add(current.getTitle());
            rekursionBookmarks(current.getFirstChild());
        }
    }

    /**
     * Rejects null search arguments.
     *
     * @throws IllegalArgumentException
     *         if text or type is null.
     */
    private static void checkSearchArguments(final String text, final TextSearchType type) {
        if ((text == null) || (type == null)) {
            throw new IllegalArgumentException(PARAMETER);
        }
    }
}