ch.uzh.ifi.pdeboer.pdfpreprocessing.pdf.TextHighlight.java Source code

Introduction

Here is the source code for ch.uzh.ifi.pdeboer.pdfpreprocessing.pdf.TextHighlight.java
Source

package ch.uzh.ifi.pdeboer.pdfpreprocessing.pdf;

/*
 * Copyright 2014 J. Kuiper
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.pdfbox.cos.COSDocument;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.edit.PDPageContentStream;
import org.apache.pdfbox.pdmodel.graphics.PDExtendedGraphicsState;
import org.apache.pdfbox.util.Matrix;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.pdfbox.util.TextPosition;

import java.awt.*;
import java.io.IOException;
import java.io.Writer;
import java.util.*;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Highlights text in a PDF by painting semi-transparent filled rectangles over text that matches a Pattern.
 * <p>
 * The class extends PDFTextStripper and overrides the methods that would normally write to the output so that
 * they append to a {@link TextCache} instead. The cache keeps, per page, the extracted text together with the
 * TextPosition of every character; from those positions bounding boxes are derived and painted.
 * <p>
 * Typical usage: construct an instance, call {@link #initialize(PDDocument)} once to fill the cache, then call
 * {@link #highlight(Pattern, Pattern, Color, int)} for every highlight that should be added.
 *
 * @author J. Kuiper <me@joelkuiper.eu>
 */
public class TextHighlight extends PDFTextStripper {

    /** Resource name under which the semi-transparent graphics state is registered on a page. */
    private static final String HIGHLIGHT_GRAPHICS_STATE = "highlights";
    /** Non-stroking alpha used for the highlight rectangles. */
    private static final float HIGHLIGHT_ALPHA = 0.5f;
    /** Minimum width of a painted highlight rectangle. */
    private static final float MIN_HIGHLIGHT_WIDTH = 10;
    /** Fixed height of a painted highlight rectangle. */
    private static final float HIGHLIGHT_HEIGHT = 10;

    /** Cache filled by {@link #initialize(PDDocument)}; {@code null} until then and after {@link #resetEngine()}. */
    public TextCache textCache;
    private final float verticalTolerance = 5;
    private final float heightModifier = (float) 1.250;
    private boolean inParagraph;

    /**
     * Instantiate a new object by delegating to the PDFTextStripper constructor taking an encoding.
     *
     * @param encoding The encoding that the output will be written in.
     * @throws IOException If the superclass constructor fails.
     */
    public TextHighlight(final String encoding) throws IOException {
        super(encoding);
    }

    /**
     * Computes a series of bounding boxes (PDRectangle) from a list of TextPositions. Consecutive positions whose
     * Y coordinate stays within the vertical tolerance of the first position of the current line are merged into
     * one box; a new box is started as soon as the tolerance is exceeded. {@code null} entries (separators) are
     * skipped.
     *
     * @param positions the text positions to cover; may contain {@code null} entries
     * @return one bounding box per detected line, in encounter order; empty if there is nothing to cover
     */
    public List<PDRectangle> getTextBoundingBoxes(final List<TextPosition> positions) {
        final List<PDRectangle> boundingBoxes = new ArrayList<PDRectangle>();
        if (positions == null) {
            return boundingBoxes;
        }

        float lowerLeftX = 0, lowerLeftY = 0, upperRightX = 0, upperRightY = 0;
        // Explicit flag instead of a sentinel coordinate value, so that any coordinate is a legal box corner.
        boolean lineOpen = false;
        for (final TextPosition position : positions) {
            if (position == null) {
                continue;
            }
            final Matrix textPos = position.getTextPos();
            final float x = textPos.getXPosition();
            final float y = textPos.getYPosition();
            final float right = x + position.getWidth();
            final float top = y + position.getHeight() * getHeightModifier();

            if (lineOpen && Math.abs(y - lowerLeftY) <= getVerticalTolerance()) {
                // Still on the same line: grow the box so it encloses this glyph as well, whatever the order in
                // which the glyphs arrive. lowerLeftY stays the Y of the first glyph, it anchors the line.
                lowerLeftX = Math.min(lowerLeftX, x);
                upperRightX = Math.max(upperRightX, right);
                upperRightY = Math.max(upperRightY, top);
                continue;
            }

            if (lineOpen) {
                // Tolerance exceeded: finish the previous line before starting a new one.
                boundingBoxes.add(boundingBox(lowerLeftX, lowerLeftY, upperRightX, upperRightY));
            }
            lowerLeftX = x;
            lowerLeftY = y;
            upperRightX = right;
            upperRightY = top;
            lineOpen = true;
        }
        if (lineOpen) {
            boundingBoxes.add(boundingBox(lowerLeftX, lowerLeftY, upperRightX, upperRightY));
        }
        return boundingBoxes;
    }

    /** Creates a PDRectangle with the given lower-left and upper-right corners. */
    private PDRectangle boundingBox(final float lowerLeftX, final float lowerLeftY, final float upperRightX,
            final float upperRightY) {
        final PDRectangle boundingBox = new PDRectangle();
        boundingBox.setLowerLeftX(lowerLeftX);
        boundingBox.setLowerLeftY(lowerLeftY);
        boundingBox.setUpperRightX(upperRightX);
        boundingBox.setUpperRightY(upperRightY);
        return boundingBox;
    }

    /**
     * Paints highlights on one page. The page text is searched for {@code searchText}; inside each hit,
     * {@code markingPattern} is searched and every occurrence is painted. Processing stops after the first
     * {@code searchText} hit in which at least one occurrence was painted.
     * <p>
     * Exceptions raised while painting are printed and not propagated; Errors are printed and rethrown. The
     * content stream opened on the page is closed in all cases.
     *
     * @param searchText     pattern selecting the region(s) of the page text to look into
     * @param markingPattern pattern selecting the text to paint inside a region
     * @param color          fill color of the highlight
     * @param pageNr         1-based page number
     * @throws IllegalArgumentException if {@link #initialize(PDDocument)} has not been called
     */
    public void highlight(final Pattern searchText, final Pattern markingPattern, Color color, int pageNr) {
        if (textCache == null || document == null) {
            throw new IllegalArgumentException("TextCache was not initialized");
        }

        // The page list is assigned unchecked here, exactly as before; elements are expected to be PDPage.
        @SuppressWarnings("unchecked")
        final List<PDPage> pages = document.getDocumentCatalog().getAllPages();

        PDPageContentStream contentStream = null;
        try {
            final PDPage page = pages.get(pageNr - 1);
            // Open the stream before touching the resources, same order as before.
            contentStream = new PDPageContentStream(document, page, true, true);
            registerHighlightGraphicsState(page);

            for (Match searchMatch : textCache.match(pageNr, searchText)) {
                final List<Match> markingMatches = textCache.match(searchMatch.positions, markingPattern);
                if (markingMatches.isEmpty()) {
                    System.out
                            .println("Cannot highlight: " + markingPattern.pattern() + " on page " + (pageNr - 1));
                    continue;
                }

                boolean found = false;
                for (Match markingMatch : markingMatches) {
                    if (markupMatch(color, contentStream, markingMatch)) {
                        found = true;
                    }
                }
                if (found) {
                    break;
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        } catch (Error e1) {
            e1.printStackTrace();
            throw e1;
        } finally {
            closeContentStream(contentStream);
        }
    }

    /**
     * Registers the semi-transparent graphics state used by the highlight rectangles in the resources of the
     * given page, creating the resources and the graphics state dictionary when they are missing.
     */
    private void registerHighlightGraphicsState(final PDPage page) throws IOException {
        final PDExtendedGraphicsState graphicsState = new PDExtendedGraphicsState();
        graphicsState.setNonStrokingAlphaConstant(HIGHLIGHT_ALPHA);

        PDResources resources = page.findResources();
        if (resources == null) {
            resources = new PDResources();
            page.setResources(resources);
        }
        Map<String, PDExtendedGraphicsState> graphicsStates = resources.getGraphicsStates();
        if (graphicsStates == null) {
            // There is no graphics state dictionary in the resources dictionary, create one.
            graphicsStates = new TreeMap<String, PDExtendedGraphicsState>();
        }
        graphicsStates.put(HIGHLIGHT_GRAPHICS_STATE, graphicsState);
        resources.setGraphicsStates(graphicsStates);
    }

    /** Closes the content stream if it was opened; a failure to close is printed, not propagated. */
    private void closeContentStream(final PDPageContentStream contentStream) {
        if (contentStream == null) {
            return;
        }
        try {
            contentStream.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Paints one filled rectangle per bounding box of the given match.
     *
     * @return {@code true} if at least one rectangle was painted, {@code false} if the match has no bounding box
     */
    private boolean markupMatch(Color color, PDPageContentStream contentStream, Match markingMatch)
            throws IOException {
        final List<PDRectangle> textBoundingBoxes = getTextBoundingBoxes(markingMatch.positions);
        if (textBoundingBoxes.isEmpty()) {
            return false;
        }

        // Activate the semi-transparent graphics state registered on the page resources.
        contentStream.appendRawCommands("/" + HIGHLIGHT_GRAPHICS_STATE + " gs\n");
        contentStream.setNonStrokingColor(color);
        for (PDRectangle textBoundingBox : textBoundingBoxes) {
            final float boxWidth = Math.abs(textBoundingBox.getUpperRightX() - textBoundingBox.getLowerLeftX());
            // Only the lower-left corner and the width of the box are used; the painted height is fixed.
            contentStream.fillRect(textBoundingBox.getLowerLeftX(), textBoundingBox.getLowerLeftY(),
                    Math.max(boxWidth, MIN_HIGHLIGHT_WIDTH), HIGHLIGHT_HEIGHT);
        }
        return true;
    }

    /**
     * The vertical tolerance determines whether a character is still on the same line
     */
    public float getVerticalTolerance() {
        return verticalTolerance;
    }

    /**
     * The height modifier is applied to the font height, it allows the annotations to be changed by a certain factor
     */
    public float getHeightModifier() {
        return heightModifier;
    }

    /**
     * Processes all pages of the given document and fills a fresh {@link TextCache} with their text and text
     * positions. Must be called before {@link #highlight(Pattern, Pattern, Color, int)}.
     * <p>
     * This is best effort: any Exception or Error raised while processing is printed and not propagated, so the
     * cache may be incomplete afterwards.
     *
     * @param pdf the document to process
     * @throws IOException declared for callers; failures during processing are printed instead of thrown
     */
    public void initialize(final PDDocument pdf) throws IOException {
        try {
            resetEngine();
            document = pdf;
            textCache = new TextCache();

            if (getAddMoreFormatting()) {
                setParagraphEnd(getLineSeparator());
                setPageStart(getLineSeparator());
                setArticleStart(getLineSeparator());
                setArticleEnd(getLineSeparator());
            }
            startDocument(pdf);
            processPages(pdf.getDocumentCatalog().getAllPages());
            endDocument(pdf);
        } catch (Exception e) {
            e.printStackTrace();
        } catch (Error e1) {
            e1.printStackTrace();
        }
    }

    /*
     * The following methods are overridden from PDFTextStripper
     */

    /**
     * {@inheritDoc}
     * <p>
     * Additionally discards the text cache and the paragraph state of this instance.
     */
    @Override
    public void resetEngine() {
        super.resetEngine();
        textCache = null;
        // A run that ended inside a paragraph must not leak its state into the next run.
        inParagraph = false;
    }

    /**
     * Start a new article, which is typically defined as a column on a single page (also referred to as a bead).
     * Appends the article start string to the text cache.
     *
     * @param isltr true if primary direction of text is left to right.
     * @throws IOException declared by the overridden method.
     */
    @Override
    protected void startArticle(final boolean isltr) throws IOException {
        textCache.append(getArticleStart(), null);
    }

    /**
     * End an article. Appends the article end string to the text cache.
     *
     * @throws IOException declared by the overridden method.
     */
    @Override
    protected void endArticle() throws IOException {
        textCache.append(getArticleEnd(), null);
    }

    /**
     * Start a new page. Does nothing.
     *
     * @param page The page we are about to process.
     * @throws IOException declared by the overridden method.
     */
    @Override
    protected void startPage(final PDPage page) throws IOException {
        // nothing to do
    }

    /**
     * End a page. Does nothing.
     *
     * @param page The page that was processed.
     * @throws IOException declared by the overridden method.
     */
    @Override
    protected void endPage(final PDPage page) throws IOException {
        // nothing to do
    }

    /**
     * Write the page separator value to the text cache.
     */
    @Override
    protected void writePageSeperator() {
        textCache.append(getPageSeparator(), null);
    }

    /**
     * Write the line separator value to the text cache.
     */
    @Override
    protected void writeLineSeparator() {
        textCache.append(getLineSeparator(), null);
    }

    /**
     * Write the word separator value to the text cache.
     */
    @Override
    protected void writeWordSeparator() {
        textCache.append(getWordSeparator(), null);
    }

    /**
     * Write the string in TextPosition to the text cache, associated with that TextPosition.
     *
     * @param text The text position whose characters are cached.
     */
    @Override
    protected void writeCharacters(final TextPosition text) {
        textCache.append(text.getCharacter(), text);
    }

    /**
     * Write a string to the text cache. This implementation ignores <code>text</code> and calls
     * {@link #writeCharacters(TextPosition)} for every text position.
     *
     * @param text          The text to write (ignored).
     * @param textPositions The TextPositions belonging to the text.
     */
    @Override
    protected void writeString(final String text, final List<TextPosition> textPositions) {
        for (final TextPosition textPosition : textPositions) {
            writeCharacters(textPosition);
        }
    }

    /**
     * Writes the paragraph separator to the text cache: a paragraph end followed by a paragraph start.
     */
    @Override
    protected void writeParagraphSeparator() {
        writeParagraphEnd();
        writeParagraphStart();
    }

    /**
     * Write the paragraph start string to the text cache, closing a still open paragraph first.
     */
    @Override
    protected void writeParagraphStart() {
        if (inParagraph) {
            // writeParagraphEnd() also clears the inParagraph flag.
            writeParagraphEnd();
        }

        textCache.append(getParagraphStart(), null);
        inParagraph = true;
    }

    /**
     * Write the paragraph end string to the text cache.
     */
    @Override
    protected void writeParagraphEnd() {
        textCache.append(getParagraphEnd(), null);
        inParagraph = false;
    }

    /**
     * Write the page start string to the text cache.
     */
    @Override
    protected void writePageStart() {
        textCache.append(getPageStart(), null);
    }

    /**
     * Write the page end string to the text cache.
     */
    @Override
    protected void writePageEnd() {
        textCache.append(getPageEnd(), null);
    }

    /**
     * Not supported by this class.
     *
     * @throws IllegalArgumentException always
     */
    @Override
    public String getText(final PDDocument doc) throws IOException {
        throw new IllegalArgumentException("Not applicable for TextHighlight");
    }

    /**
     * Not supported by this class.
     *
     * @throws IllegalArgumentException always
     */
    @Override
    public String getText(final COSDocument doc) throws IOException {
        throw new IllegalArgumentException("Not applicable for TextHighlight");
    }

    /**
     * Not supported by this class.
     *
     * @throws IllegalArgumentException always
     */
    @Override
    public void writeText(final COSDocument doc, final Writer outputStream) throws IOException {
        throw new IllegalArgumentException("Not applicable for TextHighlight");
    }

    /**
     * Not supported by this class.
     *
     * @throws IllegalArgumentException always
     */
    @Override
    public void writeText(final PDDocument doc, final Writer outputStream) throws IOException {
        throw new IllegalArgumentException("Not applicable for TextHighlight");
    }

    /**
     * Internal utility class that keeps a mapping from the text contents to their TextPositions. This is needed to
     * compute bounding boxes. The data is stored on a per-page basis, keyed on the page number reported by
     * getCurrentPageNo() at the time of appending.
     * <p>
     * Invariant: for every page, the cached text and the cached position list have the same length; the entry at
     * index i of the position list belongs to the character at index i of the text and is {@code null} for
     * characters that have no TextPosition (separators).
     */
    public class TextCache {
        private final Map<Integer, StringBuilder> texts = new HashMap<Integer, StringBuilder>();
        private final Map<Integer, List<TextPosition>> positions = new HashMap<Integer, List<TextPosition>>();

        /** Returns the text buffer of the page, creating it on first access. */
        private StringBuilder obtainStringBuilder(final Integer pageNo) {
            StringBuilder sb = texts.get(pageNo);
            if (sb == null) {
                sb = new StringBuilder();
                texts.put(pageNo, sb);
            }
            return sb;
        }

        /** Returns the position list of the page, creating it on first access. */
        private List<TextPosition> obtainTextPositions(final Integer pageNo) {
            List<TextPosition> textPositions = positions.get(pageNo);
            if (textPositions == null) {
                textPositions = new ArrayList<TextPosition>();
                positions.put(pageNo, textPositions);
            }
            return textPositions;
        }

        /**
         * @param pageNo page key
         * @return the text cached for the page; empty if nothing was cached
         */
        public String getText(final Integer pageNo) {
            return obtainStringBuilder(pageNo).toString();
        }

        /**
         * @param pageNo page key
         * @return the live list of positions cached for the page, one entry per character of the page text
         */
        public List<TextPosition> getTextPositions(final Integer pageNo) {
            return obtainTextPositions(pageNo);
        }

        /**
         * Appends a string to the cache of the current page. The given position is stored once for every
         * character of the string so that text and positions stay aligned.
         *
         * @param str the string to append; {@code null} is ignored
         * @param pos the position the string originates from, or {@code null} for separators
         */
        public void append(final String str, final TextPosition pos) {
            if (str == null) {
                return;
            }
            final int currentPage = getCurrentPageNo();
            final List<TextPosition> pagePositions = obtainTextPositions(currentPage);
            final StringBuilder sb = obtainStringBuilder(currentPage);

            for (int i = 0; i < str.length(); i++) {
                sb.append(str.charAt(i));
                pagePositions.add(pos);
            }
        }

        /**
         * Given a page and a pattern it will return a list of matches for that pattern. A Match is a tuple of <String,
         * List<TextPositions>>
         *
         * @param pageNo  page key
         * @param pattern the pattern to search for in the page text
         * @return list of matches
         */
        public List<Match> match(final Integer pageNo, final Pattern pattern) {
            return match(getTextPositions(pageNo), this.getText(pageNo), pattern);
        }

        /**
         * Searches the pattern in the text spelled by the given positions. The text is rebuilt from the non-null
         * positions only, so separators that have no TextPosition are not part of it. A TextPosition that occurs
         * several times in a row (the cache stores a multi-character position once per character) contributes its
         * characters once.
         *
         * @param textPositions positions to search in; must not be {@code null}, may contain {@code null} entries
         * @param pattern       the pattern to search for
         * @return list of matches; the positions of a match are exactly those that spell the matched text
         */
        public List<Match> match(List<TextPosition> textPositions, final Pattern pattern) {
            final StringBuilder sb = new StringBuilder(textPositions.size() * 2);
            // Built in lock-step with the text so that match offsets index the right positions.
            final List<TextPosition> aligned = new ArrayList<TextPosition>(textPositions.size());

            TextPosition previous = null;
            for (TextPosition textPosition : textPositions) {
                final boolean repeated = textPosition == previous;
                previous = textPosition;
                if (textPosition == null || repeated) {
                    continue;
                }
                final String character = textPosition.getCharacter();
                if (character == null) {
                    continue;
                }
                for (int i = 0; i < character.length(); i++) {
                    sb.append(character.charAt(i));
                    aligned.add(textPosition);
                }
            }
            return match(aligned, sb.toString(), pattern);
        }

        /**
         * Searches the pattern in the given text and maps every hit back to the corresponding positions.
         * Failures (Exception or Error) are reported on standard output and yield an empty result.
         *
         * @param textPositions positions aligned with {@code text}: entry i belongs to character i
         * @param text          the text to search in
         * @param pattern       the pattern to search for
         * @return list of matches, each holding its own copy of the matched positions; empty on failure
         */
        public List<Match> match(List<TextPosition> textPositions, String text, final Pattern pattern) {
            try {
                final Matcher matcher = pattern.matcher(text);
                final List<Match> matches = new ArrayList<Match>();

                while (matcher.find()) {
                    // Copy the range: a subList view would break once the backing list is modified.
                    final List<TextPosition> elements = new ArrayList<TextPosition>(
                            textPositions.subList(matcher.start(), matcher.end()));
                    matches.add(new Match(matcher.group(), elements));
                }
                return matches;
            } catch (Error e) {
                // String concatenation instead of pattern.toString(): must not fail when pattern is null.
                System.out.println("An error occurred while searching for: " + pattern);
                e.printStackTrace();
                return new ArrayList<Match>();
            } catch (Exception e1) {
                System.out.println("An exception occurred while seraching for: " + pattern);
                e1.printStackTrace();
                return new ArrayList<Match>();
            }
        }
    }

}