Java tutorial
/************************************************************************************************** * Copyright (c) 2010 Mihail Atanassov and others. All rights reserved. This program and the * accompanying materials are made available under the terms of the Eclipse Public License v1.0 * which accompanies this distribution, and is available at * http://www.eclipse.org/legal/epl-v10.html * <p/> * Contributors: <br/> * Mihail Atanassov - initial API and implementation <br/> * Fabian Steeg - Refactored for PdfBox *************************************************************************************************/ package de.uni_koeln.ub.drc.reader; import java.awt.print.PageFormat; import java.io.File; import java.io.IOException; import java.io.StringWriter; import java.util.ArrayList; import java.util.List; import java.util.Vector; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.util.TextPosition; import de.uni_koeln.ub.drc.reader.temp.PDFTextStripper2; import de.uni_koeln.ub.drc.reader.temp.PositionWrapper; /** * The utility class {@code PdfContentExtractor} is optimized for parsing PDF * documents generated from an OCR result of ABBYY FineReader 9.0 Professional * Edition. Subclasses a PdfBox PDFTextStripper to get the text, paragraph and * and position information. * * @author Mihail Atanssov <saeko.bjagai@gmail.com> (original version) <br/> * Fabian Steeg <fsteeg@gmail.com> (Refactored for PdfBox) */ public class PdfContentExtractor extends PDFTextStripper2 { private static String location; private List<TextPosition> paragraphs = new ArrayList<TextPosition>(); /** * @throws IOException * From superclass */ public PdfContentExtractor() throws IOException { super(); } @Override protected void isParagraphSeparation(final PositionWrapper position, final PositionWrapper lastPosition, final PositionWrapper lastLineStartPosition) { /* * TODO we can get lines from here if we need to, we will be called here * for every line */ super.isParagraphSeparation(position, lastPosition, lastLineStartPosition); if (position.isParagraphStart()) { paragraphs.add(position.getTextPosition()); } } /** * @param pdfName * The full path to the PDF file to extract content from * @return The PageInfo object for the PDF */ public static PageInfo extractContentFromPdf(String pdfName) { try { location = pdfName; PDDocument document = PDDocument.load(new File(pdfName)); PdfContentExtractor x = initExtractor(document); PageInfo result = x.toPageInfo(); document.close(); return result; } catch (IOException e) { e.printStackTrace(); } return null; } private static PdfContentExtractor initExtractor(final PDDocument document) throws IOException { StringWriter writer = new StringWriter(); PdfContentExtractor x = new PdfContentExtractor(); x.setDropThreshold(3.75f); // x.setIndentThreshold(1f); // for tweaking paragraph detection try { x.writeText(document, writer); } catch (NullPointerException e) { System.err.println("Could not process: " + location); //$NON-NLS-1$ e.printStackTrace(); } return x; } private PageInfo toPageInfo() { Vector<List<TextPosition>> positions = charactersByArticle; List<ExtractedWord> words = new ArrayList<ExtractedWord>(); if (positions.size() == 0 || positions.get(0).size() == 0) { System.err.println("No content found for: " + location); //$NON-NLS-1$ return new PageInfo(words); } TextPosition currentWordStart = positions.get(0).get(0); StringBuilder currentWordText = new StringBuilder(); for (List<TextPosition> list : positions) { for (TextPosition pos : list) { if (currentWordStart == null) { currentWordStart = pos; // remember start for new words } currentWordText.append(pos.getCharacter()); if (pos.getCharacter().equals(" ") //$NON-NLS-1$ || pos.getCharacter().equals("-")) { //$NON-NLS-1$ ExtractedWord w = word(currentWordStart, currentWordText, pos); if (currentWordText.toString().trim().length() > 0) { words.add(w); } currentWordText = new StringBuilder(); currentWordStart = null; // forget current word start } } } return new PageInfo(words); } private ExtractedWord word(final TextPosition currentWordStart, final StringBuilder currentWord, final TextPosition endPosition) { String wordText = currentWord.toString(); PageFormat format = document.getPageFormat(0); double width = format.getWidth(); double height = format.getHeight(); Point start = new Point(currentWordStart.getX(), height - currentWordStart.getY()); Point end = new Point(endPosition.getX(), height - endPosition.getY()); ExtractedWord w = new ExtractedWord(wordText, start, end, paragraphs.contains(currentWordStart), endPosition.getFontSize(), width, height); return w; } }