org.archive.modules.extractor.PDFParser.java Source code

Introduction

Here is the source code for org.archive.modules.extractor.PDFParser.java
Source

/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.archive.modules.extractor;

import com.lowagie.text.pdf.PdfReader;
import com.lowagie.text.pdf.PdfName;
import com.lowagie.text.pdf.PdfObject;
import com.lowagie.text.pdf.PdfDictionary;
import com.lowagie.text.pdf.PRIndirectReference;
import com.lowagie.text.pdf.PdfArray;

import java.io.*;
import java.util.*;

/** Supports PDF parsing operations.  For now this primarily means
 *  extracting URIs, but the logic in extractURIs() could easily be adopted/extended
 * for a variety of PDF processing tasks.
 *
 * @author Parker Thompson
 *
 */
//TODO make this more efficient; it currently has to read the whole file into memory
// before processing can begin, and appears to take much longer than it "should"
// to parse small, but admittedly complex, documents.
public class PDFParser {

    /** URIs collected so far by {@link #extractURIs()}. */
    protected ArrayList<String> foundURIs;

    /**
     * Object numbers already visited, indexed by generation number:
     * {@code encounteredReferences.get(generation)} holds the ids seen
     * for that generation.
     */
    protected ArrayList<ArrayList<Integer>> encounteredReferences;

    /** Reader over {@link #document}; created by {@link #initialize()}. */
    protected PdfReader documentReader;

    /** Raw bytes of the PDF being parsed. */
    protected byte[] document;

    /** Root dictionary of the document, obtained from the reader. */
    protected PdfDictionary catalog;

    /**
     * Create a parser for the PDF file at the given path.
     * @param doc path of the PDF file to read
     * @throws IOException if the file cannot be read or opened as a PDF
     */
    public PDFParser(String doc) throws IOException {
        resetState();
        getInFromFile(doc);
        initialize();
    }

    /**
     * Create a parser for a PDF held in memory.
     * @param doc bytes of the PDF document
     * @throws IOException if the bytes cannot be opened as a PDF
     */
    public PDFParser(byte[] doc) throws IOException {
        resetState();
        document = doc;
        initialize();
    }

    /** Reinitialize the object as though a new one were created.
     */
    protected void resetState() {
        foundURIs = new ArrayList<String>();
        // per-generation id lists are added lazily as generations are encountered
        encounteredReferences = new ArrayList<ArrayList<Integer>>();
        documentReader = null;
        document = null;
        catalog = null;
    }

    /**
     * Reset the object and initialize it with a new byte array (the document).
     * @param doc bytes of the PDF document
     * @throws IOException if the bytes cannot be opened as a PDF
     */
    public void resetState(byte[] doc) throws IOException {
        resetState();
        document = doc;
        initialize();
    }

    /** Reinitialize the object as though a new one were created, complete
     * with a valid pointer to a document that can be read
     * @param doc path of the PDF file to read
     * @throws IOException if the file cannot be read or opened as a PDF
     */
    public void resetState(String doc) throws IOException {
        resetState();
        getInFromFile(doc);
        initialize();
    }

    /**
     * Read a file named 'doc' and store its bytes for later processing.
     * @param doc path of the file to read
     * @throws IOException if the file is missing, cannot be read completely,
     * or is too large to hold in a single byte array
     */
    protected void getInFromFile(String doc) throws IOException {
        File documentOnDisk = new File(doc);

        long length = documentOnDisk.length();
        if (length > Integer.MAX_VALUE) {
            // a plain (int) cast would silently wrap to a bogus buffer size
            throw new IOException("File too large to read into memory: " + doc);
        }
        byte[] buffer = new byte[(int) length];

        DataInputStream inStream =
                new DataInputStream(new FileInputStream(documentOnDisk));
        try {
            // a single read() may return fewer bytes than requested;
            // readFully() loops until the buffer is filled or EOF is hit
            inStream.readFully(buffer);
        } finally {
            inStream.close();
        }

        document = buffer;
    }

    /**
     * Make sure a (possibly empty) id list exists for the given generation,
     * growing the outer list as needed.
     * @param generation
     */
    private void ensureGeneration(int generation) {
        for (int i = encounteredReferences.size(); i <= generation; i++) {
            encounteredReferences.add(new ArrayList<Integer>());
        }
    }

    /**
     * Indicates, based on a PDFObject's generation/id pair whether
     * the parser has already encountered this object (or a reference to it)
     * so we don't infinitely loop on circuits within the PDF.
     * @param generation
     * @param id
     * @return True if already seen.
     */
    protected boolean haveSeen(int generation, int id) {

        // if we can't store this generation grow our list until we can
        if (generation >= encounteredReferences.size()) {
            ensureGeneration(generation);

            // clearly we haven't seen it
            return false;
        }

        // Integer.valueOf keeps this on contains(Object) with value equality
        return encounteredReferences.get(generation).contains(Integer.valueOf(id));
    }

    /**
     * Note that an object (id/generation pair) has been seen by this parser
     * so that it can be handled differently when it is encountered again.
     * @param generation
     * @param id
     */
    protected void markAsSeen(int generation, int id) {
        // do not rely on haveSeen() having been called first to grow the list
        ensureGeneration(generation);
        encounteredReferences.get(generation).add(id);
    }

    /**
     * Get a list of URIs retrieved from the Pdf during the
     * extractURIs operation.
     * @return A list of URIs retrieved from the Pdf during the
     * extractURIs operation.
     */
    public ArrayList<String> getURIs() {
        return foundURIs;
    }

    /**
     * Initialize opens the document for reading.  This is done implicitly
     * by the constructor.  This should only need to be called directly following
     * a reset.
     * @throws IOException if the document cannot be opened, or if there is
     * neither a document nor an existing reader to work with
     */
    protected void initialize() throws IOException {
        if (document != null) {
            documentReader = new PdfReader(document);
        }

        if (documentReader == null) {
            // previously surfaced as a NullPointerException on the next line
            throw new IOException("No PDF document available to initialize from");
        }

        catalog = documentReader.getCatalog();
    }

    /**
     * Extract URIs from all objects found in a Pdf document's catalog.
     * Returns an array list representing all URIs found in the document catalog tree.
     * @return URIs from all objects found in a Pdf document's catalog.
     */
    public ArrayList<String> extractURIs() {
        extractURIs(catalog);
        return getURIs();
    }

    /**
     * Parse a PdfObject, looking for URIs recursively and adding
     * them to foundURIs.  Dictionaries and arrays are descended into;
     * indirect references are dereferenced once each (tracked via
     * haveSeen/markAsSeen); any other object type is ignored.
     * @param entity object to examine; null is ignored
     */
    @SuppressWarnings("unchecked")
    protected void extractURIs(PdfObject entity) {

        // a missing catalog or an unresolvable reference yields null
        if (entity == null) {
            return;
        }

        // deal with dictionaries
        if (entity.isDictionary()) {

            PdfDictionary dictionary = (PdfDictionary) entity;

            Set<PdfName> allkeys = dictionary.getKeys();
            for (PdfName key : allkeys) {
                PdfObject value = dictionary.get(key);
                if (value == null) {
                    continue;
                }

                // see if the key is a UR[I,L]
                String keyName = key.toString();
                if (keyName.equals("/URI") || keyName.equals("/URL")) {
                    foundURIs.add(value.toString());

                } else {
                    this.extractURIs(value);
                }

            }

            // deal with arrays
        } else if (entity.isArray()) {

            PdfArray array = (PdfArray) entity;
            for (PdfObject pdfObject : (Iterable<PdfObject>) array.getArrayList()) {
                this.extractURIs(pdfObject);
            }

            // deal with indirect references
        } else if (entity instanceof PRIndirectReference) {

            PRIndirectReference indirect = (PRIndirectReference) entity;
            int generation = indirect.getGeneration();
            int number = indirect.getNumber();

            // if we've already seen a reference to this object, stop here
            // so that cycles in the object graph terminate
            if (haveSeen(generation, number)) {
                return;
            }

            // note that we've seen it now that we know it's new
            markAsSeen(generation, number);

            // dereference the "pointer" and process the object
            indirect.getReader(); // FIXME: examine side-effects
            PdfObject direct = PdfReader.getPdfObject(indirect);

            this.extractURIs(direct);
        }
    }

    /**
     * Command-line driver: prints every URI found in a PDF file.
     * @param argv optional; argv[0] is the path of the PDF to parse.  When
     * omitted, the original hard-coded sample path is used.
     */
    public static void main(String[] argv) {

        String path = (argv != null && argv.length > 0)
                ? argv[0]
                : "/home/parkert/files/pdfspec.pdf";

        try {
            PDFParser parser = new PDFParser(path);
            for (String uri : parser.extractURIs()) {
                System.out.println("got uri: " + uri);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}