de.uni_koeln.ub.drc.reader.PdfContentExtractor.java Source code

Introduction

Here is the source code for de.uni_koeln.ub.drc.reader.PdfContentExtractor.java
Source

/**************************************************************************************************
 * Copyright (c) 2010 Mihail Atanassov and others. All rights reserved. This program and the
 * accompanying materials are made available under the terms of the Eclipse Public License v1.0
 * which accompanies this distribution, and is available at
 * http://www.eclipse.org/legal/epl-v10.html
 * <p/>
 * Contributors: <br/>
 * Mihail Atanassov - initial API and implementation <br/>
 * Fabian Steeg - Refactored for PdfBox
 *************************************************************************************************/
package de.uni_koeln.ub.drc.reader;

import java.awt.print.PageFormat;
import java.io.File;
import java.io.IOException;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.List;
import java.util.Vector;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.TextPosition;

import de.uni_koeln.ub.drc.reader.temp.PDFTextStripper2;
import de.uni_koeln.ub.drc.reader.temp.PositionWrapper;

/**
 * The utility class {@code PdfContentExtractor} is optimized for parsing PDF
 * documents generated from an OCR result of ABBYY FineReader 9.0 Professional
 * Edition. Subclasses a PdfBox PDFTextStripper to get the text, paragraph and
 * and position information.
 * 
 * @author Mihail Atanssov <saeko.bjagai@gmail.com> (original version) <br/>
 *         Fabian Steeg <fsteeg@gmail.com> (Refactored for PdfBox)
 */
public class PdfContentExtractor extends PDFTextStripper2 {

    private static String location;
    private List<TextPosition> paragraphs = new ArrayList<TextPosition>();

    /**
     * @throws IOException
     *             From superclass
     */
    public PdfContentExtractor() throws IOException {
        super();
    }

    @Override
    protected void isParagraphSeparation(final PositionWrapper position, final PositionWrapper lastPosition,
            final PositionWrapper lastLineStartPosition) {
        /*
         * TODO we can get lines from here if we need to, we will be called here
         * for every line
         */
        super.isParagraphSeparation(position, lastPosition, lastLineStartPosition);
        if (position.isParagraphStart()) {
            paragraphs.add(position.getTextPosition());
        }
    }

    /**
     * @param pdfName
     *            The full path to the PDF file to extract content from
     * @return The PageInfo object for the PDF
     */
    public static PageInfo extractContentFromPdf(String pdfName) {
        try {
            location = pdfName;
            PDDocument document = PDDocument.load(new File(pdfName));
            PdfContentExtractor x = initExtractor(document);
            PageInfo result = x.toPageInfo();
            document.close();
            return result;
        } catch (IOException e) {
            e.printStackTrace();
        }
        return null;
    }

    private static PdfContentExtractor initExtractor(final PDDocument document) throws IOException {
        StringWriter writer = new StringWriter();
        PdfContentExtractor x = new PdfContentExtractor();
        x.setDropThreshold(3.75f);
        // x.setIndentThreshold(1f); // for tweaking paragraph detection
        try {
            x.writeText(document, writer);
        } catch (NullPointerException e) {
            System.err.println("Could not process: " + location); //$NON-NLS-1$
            e.printStackTrace();
        }
        return x;
    }

    private PageInfo toPageInfo() {
        Vector<List<TextPosition>> positions = charactersByArticle;
        List<ExtractedWord> words = new ArrayList<ExtractedWord>();
        if (positions.size() == 0 || positions.get(0).size() == 0) {
            System.err.println("No content found for: " + location); //$NON-NLS-1$
            return new PageInfo(words);
        }
        TextPosition currentWordStart = positions.get(0).get(0);
        StringBuilder currentWordText = new StringBuilder();
        for (List<TextPosition> list : positions) {
            for (TextPosition pos : list) {
                if (currentWordStart == null) {
                    currentWordStart = pos; // remember start for new words
                }
                currentWordText.append(pos.getCharacter());
                if (pos.getCharacter().equals(" ") //$NON-NLS-1$
                        || pos.getCharacter().equals("-")) { //$NON-NLS-1$
                    ExtractedWord w = word(currentWordStart, currentWordText, pos);
                    if (currentWordText.toString().trim().length() > 0) {
                        words.add(w);
                    }
                    currentWordText = new StringBuilder();
                    currentWordStart = null; // forget current word start
                }
            }
        }
        return new PageInfo(words);
    }

    private ExtractedWord word(final TextPosition currentWordStart, final StringBuilder currentWord,
            final TextPosition endPosition) {
        String wordText = currentWord.toString();
        PageFormat format = document.getPageFormat(0);
        double width = format.getWidth();
        double height = format.getHeight();
        Point start = new Point(currentWordStart.getX(), height - currentWordStart.getY());
        Point end = new Point(endPosition.getX(), height - endPosition.getY());
        ExtractedWord w = new ExtractedWord(wordText, start, end, paragraphs.contains(currentWordStart),
                endPosition.getFontSize(), width, height);
        return w;
    }
}