helper.pdfpreprocessing.pdf.TextHighlight.java Source code

Introduction

Here is the source code for helper.pdfpreprocessing.pdf.TextHighlight.java
Source

package helper.pdfpreprocessing.pdf;

/*
 * Copyright 2016 J. Kuiper and M. Roesch
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.font.PDType1Font;
import org.apache.pdfbox.pdmodel.graphics.color.PDColor;
import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB;
import org.apache.pdfbox.pdmodel.graphics.state.PDExtendedGraphicsState;
import org.apache.pdfbox.pdmodel.interactive.annotation.*;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
import org.apache.pdfbox.util.Matrix;

import java.awt.*;
import java.io.IOException;
import java.io.Writer;
import java.util.*;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * This class implements the {@code highlight} methods, which add a highlight to the PDF based on a Pattern or on
 * character indices. The idea is to extend the PDFTextStripper and override the methods that write to the output to
 * instead write to a TextCache that keeps data on the position of the TextPositions. From this information we can then
 * derive bounding boxes (and quads) that can be used to write the annotations. Call {@code initialize} with the
 * document before calling any of the {@code highlight} methods.
 *
 * @author J. Kuiper <me@joelkuiper.eu>
 * @author manuel (modifications)
 */
public class TextHighlight extends PDFTextStripper {

    /** Per-page text and text positions; {@code null} until {@code initialize} has been called. */
    public TextCache textCache;
    /** Largest vertical distance (text-matrix Y translation) at which two characters still share a line. */
    private float verticalTolerance = 5;
    /** Factor applied to each character's height when bounding boxes are computed. */
    private float heightModifier = (float) 1.250;
    /** True while a paragraph start has been written that has not yet been closed by a paragraph end. */
    private boolean inParagraph;

    /**
     * Creates a new highlighter. Only the no-argument superclass constructor runs; the {@code encoding} argument is
     * currently not used by this class.
     *
     * @param encoding ignored by this implementation
     * @throws IOException if the superclass constructor fails
     */
    public TextHighlight(final String encoding) throws IOException {
        super();
    }

    /**
     * Computes a series of bounding boxes (PDRectangle) from a list of TextPositions. Consecutive positions extend the
     * current box as long as their vertical offset from the start of that box stays within
     * {@link #getVerticalTolerance()}; otherwise the current box is closed and a new one is started. {@code null}
     * entries in the list are skipped.
     *
     * @param positions the text positions to enclose; may be {@code null} or empty
     * @return the bounding boxes in the order they were started; empty if there is no non-null position
     */
    public List<PDRectangle> getTextBoundingBoxes(final List<TextPosition> positions) {
        final List<PDRectangle> boundingBoxes = new ArrayList<>();
        if (positions == null) {
            return boundingBoxes;
        }

        float lowerLeftX = 0, lowerLeftY = 0, upperRightX = 0, upperRightY = 0;
        // An explicit flag replaces the former -1 sentinel: -1 is a legal coordinate and must not suppress a box.
        boolean boxOpen = false;
        for (final TextPosition position : positions) {
            if (position == null) {
                continue;
            }
            final Matrix textPos = position.getTextMatrix();
            final float x = textPos.getTranslateX();
            final float y = textPos.getTranslateY();
            final float height = position.getHeight() * getHeightModifier();

            if (boxOpen && Math.abs(y - lowerLeftY) <= getVerticalTolerance()) {
                // still on the same line: move the upper-right corner to this character
                upperRightX = x + position.getWidth();
                upperRightY = y + height;
                continue;
            }

            if (boxOpen) {
                // the vertical tolerance was exceeded: close the box of the previous line
                boundingBoxes.add(boundingBox(lowerLeftX, lowerLeftY, upperRightX, upperRightY));
            }

            // this character starts a new box
            lowerLeftX = x;
            upperRightX = x + position.getWidth();
            lowerLeftY = y;
            upperRightY = y + height;
            boxOpen = true;
        }
        if (boxOpen) {
            boundingBoxes.add(boundingBox(lowerLeftX, lowerLeftY, upperRightX, upperRightY));
        }
        return boundingBoxes;
    }

    /**
     * Builds a PDRectangle from the coordinates of its lower-left and upper-right corners.
     */
    private PDRectangle boundingBox(final float lowerLeftX, final float lowerLeftY, final float upperRightX,
            final float upperRightY) {
        final PDRectangle rect = new PDRectangle();
        rect.setUpperRightY(upperRightY);
        rect.setUpperRightX(upperRightX);
        rect.setLowerLeftY(lowerLeftY);
        rect.setLowerLeftX(lowerLeftX);
        return rect;
    }

    /**
     * Highlights text on one page that is located with two patterns: {@code searchText} is matched against the
     * cached text of the page, and {@code markingPattern} is then matched inside each of those matches to select the
     * characters to mark. Processing stops after the first search match in which something was marked.
     * <p>
     * Exceptions raised while marking are printed and swallowed; Errors are printed and rethrown.
     *
     * @param searchText     pattern that locates candidate regions in the page text
     * @param markingPattern pattern that selects, inside a candidate region, the text to mark
     * @param color          color used for the markup
     * @param pageNr         1-based page number
     * @param withId         whether the matched string is additionally written next to each marked box
     * @param comment        comment for the markup; passed through to the marking routine
     * @throws IllegalArgumentException if the text cache or the document has not been initialized
     */
    public void highlight(final Pattern searchText, final Pattern markingPattern, Color color, int pageNr,
            boolean withId, String comment) {
        if (textCache == null || document == null) {
            throw new IllegalArgumentException("TextCache was not initialized");
        }

        try {
            final PDPage page = document.getPages().get(pageNr - 1);
            // try-with-resources: the content stream is closed even when marking fails part-way through
            try (PDPageContentStream contentStream = new PDPageContentStream(document, page,
                    PDPageContentStream.AppendMode.APPEND, true)) {
                final PDExtendedGraphicsState graphicsState = new PDExtendedGraphicsState();
                graphicsState.setNonStrokingAlphaConstant(0.5f);
                contentStream.setGraphicsStateParameters(graphicsState);

                boolean found = false;
                for (Match searchMatch : textCache.match(pageNr, searchText)) {
                    // evaluate the marking pattern once per search match instead of twice
                    final List<Match> markingMatches = textCache.match(searchMatch.positions, markingPattern);
                    if (markingMatches.isEmpty()) {
                        System.out
                                .println("Cannot highlight: " + markingPattern.pattern() + " on page " + (pageNr - 1));
                    }
                    for (Match markingMatch : markingMatches) {
                        if (markupMatch(color, contentStream, markingMatch, 10, withId, page, comment, false)) {
                            found = true;
                        }
                    }
                    if (found) {
                        break;
                    }
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        } catch (Error e1) {
            e1.printStackTrace();
            throw e1;
        }
    }

    /**
     * Highlights the characters of one page that lie between two indices into the cached text positions of that page.
     * Both indices are clamped to the number of cached positions.
     * <p>
     * Exceptions raised while marking are printed and swallowed; Errors are printed and rethrown.
     *
     * @param startIndex    index of the first position to mark (inclusive)
     * @param stopIndex     index after the last position to mark (exclusive)
     * @param color         color used for the markup
     * @param pageNr        1-based page number
     * @param boxHeight     height of the filled rectangle drawn for each marked box
     * @param hasLineOffset when true, both indices are shifted by the number of {@code \n}-separated segments found
     *                      in the page text before {@code stopIndex}, minus one
     * @param withId        whether the match id ({@code pageNr-startIndex}) is written next to each marked box
     * @param comment       comment for the markup; passed through to the marking routine
     * @param commentOnly   passed through to the marking routine together with {@code comment}
     * @throws IllegalArgumentException if the text cache or the document has not been initialized
     */
    public void highlight(int startIndex, int stopIndex, Color color, int pageNr, int boxHeight,
            boolean hasLineOffset, boolean withId, String comment, boolean commentOnly) {
        if (textCache == null || document == null) {
            throw new IllegalArgumentException("TextCache was not initialized");
        }
        try {
            final PDPage page = document.getPages().get(pageNr - 1);
            // try-with-resources: the content stream is closed even when marking fails part-way through
            try (PDPageContentStream contentStream = new PDPageContentStream(document, page,
                    PDPageContentStream.AppendMode.APPEND, true)) {
                final PDExtendedGraphicsState graphicsState = new PDExtendedGraphicsState();
                graphicsState.setNonStrokingAlphaConstant(0.5f);
                contentStream.setGraphicsStateParameters(graphicsState);

                final List<TextPosition> pagePositions = textCache.getTextPositions(pageNr);
                int numberOfLines = 0;
                if (hasLineOffset) {
                    final String pageText = textCache.getText(pageNr);
                    // clamp like the sub-list bounds below, so a stopIndex past the page end does not abort the call
                    final int prefixEnd = Math.min(stopIndex, pageText.length());
                    numberOfLines = pageText.substring(0, prefixEnd).split("\\n").length - 1;
                }
                final int from = Math.min(numberOfLines + startIndex, pagePositions.size());
                final int to = Math.min(numberOfLines + stopIndex, pagePositions.size());

                final Match m = new Match(pageNr + "-" + startIndex, pagePositions.subList(from, to));
                markupMatch(color, contentStream, m, boxHeight, withId, page, comment, commentOnly);
            }
        } catch (Exception e) {
            e.printStackTrace();
        } catch (Error e1) {
            e1.printStackTrace();
            throw e1;
        }
    }

    /**
     * Marks up one match on a page. For every bounding box of the match:
     * <ul>
     * <li>if the comment is empty, a filled rectangle of the given height is drawn into the content stream;</li>
     * <li>if {@code withId} is set, the matched string is written at the upper-right corner of the box;</li>
     * <li>if the comment is not empty and {@code commentOnly} is false, a highlight annotation carrying the comment
     * is added to the page;</li>
     * <li>if the comment is not empty and {@code commentOnly} is true, existing annotations of this match are
     * updated instead.</li>
     * </ul>
     * Boxes are given a minimum width of 10.
     *
     * @param color         fill and annotation color
     * @param contentStream stream of the page the rectangle and id text are written to
     * @param markingMatch  the match to mark; its string doubles as the id of the match
     * @param height        height of the filled rectangle
     * @param withId        whether to write the match id next to each box
     * @param page          page that receives the annotations
     * @param comment       annotation comment; {@code null} is treated like an empty comment
     * @param commentOnly   whether to update existing annotations instead of adding a new one
     * @return true if the match produced at least one bounding box, false otherwise
     * @throws IOException if writing to the content stream or accessing the page annotations fails
     */
    private boolean markupMatch(Color color, PDPageContentStream contentStream, Match markingMatch, int height,
            boolean withId, PDPage page, String comment, boolean commentOnly) throws IOException {
        final List<PDRectangle> textBoundingBoxes = getTextBoundingBoxes(markingMatch.positions);
        if (textBoundingBoxes.isEmpty()) {
            return false;
        }
        // a missing comment is handled like an empty one instead of failing with a NullPointerException
        final String note = comment == null ? "" : comment;

        contentStream.setNonStrokingColor(color);
        for (PDRectangle box : textBoundingBoxes) {
            final float width = Math.max(Math.abs(box.getUpperRightX() - box.getLowerLeftX()), 10);
            if (note.isEmpty()) {
                contentStream.addRect(box.getLowerLeftX(), box.getLowerLeftY(), width, height);
                contentStream.fill();
            }
            if (withId) {
                writeMatchId(contentStream, box, markingMatch.str);
            }
            if (!note.isEmpty()) {
                if (commentOnly) {
                    appendCommentToAnnotations(page, markingMatch.str, note);
                } else {
                    page.getAnnotations().add(createHighlightAnnotation(box, width, color, markingMatch.str, note));
                }
            }
        }
        return true;
    }

    /** Writes the id of a match in a small font at the upper-right corner of its box. */
    private void writeMatchId(PDPageContentStream contentStream, PDRectangle box, String id) throws IOException {
        contentStream.beginText();
        contentStream.setFont(PDType1Font.HELVETICA, 5);
        contentStream.newLineAtOffset(box.getUpperRightX(), box.getUpperRightY());
        contentStream.showText(id);
        contentStream.endText();
    }

    /** Creates the highlight annotation (rectangle, quad points, color, contents) for one box of a match. */
    private PDAnnotationTextMarkup createHighlightAnnotation(PDRectangle box, float width, Color color, String id,
            String comment) throws IOException {
        final PDRectangle position = new PDRectangle();
        position.setLowerLeftX(box.getLowerLeftX());
        position.setLowerLeftY(box.getLowerLeftY());
        position.setUpperRightX(box.getLowerLeftX() + width);
        position.setUpperRightY(box.getLowerLeftY() + 10);

        // quad points: (x1, y1) (x2, y2) form the top edge, (x3, y3) (x4, y4) the bottom edge, both shifted down by 2
        final float top = position.getUpperRightY() - 2;
        final float bottom = position.getLowerLeftY() - 2;
        final float[] quads = new float[] {
                position.getLowerLeftX(), top,
                position.getUpperRightX(), top,
                position.getLowerLeftX(), bottom,
                position.getUpperRightX(), bottom };

        final PDAnnotationTextMarkup txtMark = new PDAnnotationTextMarkup(PDAnnotationTextMarkup.SUB_TYPE_HIGHLIGHT);
        txtMark.setRectangle(position);
        txtMark.setQuadPoints(quads);
        txtMark.setConstantOpacity((float) 0.5);
        txtMark.setContents("Missing Assumption/s (" + id + "):\n" + comment);
        txtMark.setColor(new PDColor(color.getColorComponents(new float[3]), PDDeviceRGB.INSTANCE));
        txtMark.setCreationDate(Calendar.getInstance());
        txtMark.setTitlePopup("Assumption Error");
        return txtMark;
    }

    /** Appends a comment to the annotations of the page whose contents carry the given match id in parentheses. */
    private void appendCommentToAnnotations(PDPage page, String id, String comment) throws IOException {
        for (PDAnnotation annotation : page.getAnnotations()) {
            final String existing = annotation.getContents();
            if (existing == null) {
                continue;
            }
            final int idStart = existing.indexOf("(") + 1;
            final int idEnd = existing.indexOf(")");
            if (idEnd < idStart) {
                // no "(id)" part, e.g. an annotation not written by this class: skip it instead of throwing
                continue;
            }
            final String commentId = existing.substring(idStart, idEnd);
            // NOTE(review): this appends only when the annotation already contains the comment, which duplicates
            // it; the condition may have been meant to be negated - confirm against the callers before changing.
            if (id.equals(commentId) && existing.contains(comment)) {
                annotation.setContents(existing + "\n" + comment);
            }
        }
    }

    /**
     * Returns the vertical tolerance, i.e. the largest vertical distance at which a character is still considered to
     * be on the same line.
     *
     * @return the vertical tolerance
     */
    public float getVerticalTolerance() {
        return this.verticalTolerance;
    }

    /**
     * Returns the height modifier, the factor that is applied to the font height when the size of an annotation is
     * computed.
     *
     * @return the height modifier
     */
    public float getHeightModifier() {
        return this.heightModifier;
    }

    /*
     * The following methods are overridden from the PDFTextStripper
     */
    /**
     * Processes all pages of the given document and fills a fresh {@link TextCache} with their text and text
     * positions. Must be called before any of the highlight methods.
     * <p>
     * Any Exception or Error raised during processing is printed via {@code printStackTrace()} and not propagated.
     *
     * @param pdf the document to process
     * @throws IOException declared for callers; failures inside this method are caught and printed
     */
    public void initialize(final PDDocument pdf) throws IOException {
        try {
            resetEngine();
            textCache = new TextCache();
            document = pdf;

            if (getAddMoreFormatting()) {
                // with extended formatting, every structural boundary is written as a line separator
                setArticleStart(getLineSeparator());
                setArticleEnd(getLineSeparator());
                setPageStart(getLineSeparator());
                setParagraphEnd(getLineSeparator());
            }

            startDocument(pdf);
            processPages(pdf.getPages());
            endDocument(pdf);
        } catch (Exception | Error failure) {
            failure.printStackTrace();
        }
    }

    /**
     * Discards the current text cache. Called at the beginning of {@link #initialize(PDDocument)}, which then creates
     * a new cache.
     */
    public void resetEngine() {
        this.textCache = null;
    }

    /**
     * Start a new article: appends the article-start string to the text cache, without an associated text position.
     *
     * @param isltr true if primary direction of text is left to right; not used by this implementation.
     * @throws IOException declared by the overridden method.
     */
    @Override
    protected void startArticle(final boolean isltr) throws IOException {
        textCache.append(getArticleStart(), null);
    }

    /**
     * End an article: appends the article-end string to the text cache, without an associated text position.
     *
     * @throws IOException declared by the overridden method.
     */
    @Override
    protected void endArticle() throws IOException {
        textCache.append(getArticleEnd(), null);
    }

    /**
     * Start a new page. This implementation intentionally does nothing.
     *
     * @param page The page we are about to process.
     * @throws IOException declared by the overridden method; never thrown here.
     */
    @Override
    protected void startPage(final PDPage page) throws IOException {
        // default is to do nothing.
    }

    /**
     * End a page. This implementation intentionally does nothing.
     *
     * @param page The page that has just been processed.
     * @throws IOException declared by the overridden method; never thrown here.
     */
    @Override
    protected void endPage(final PDPage page) throws IOException {
        // default is to do nothing
    }

    /**
     * Appends the line separator to the text cache, without an associated text position.
     *
     * @throws IOException declared by the overridden method.
     */
    @Override
    protected void writeLineSeparator() throws IOException {
        textCache.append(getLineSeparator(), null);
    }

    /**
     * Appends the word separator to the text cache, without an associated text position.
     *
     * @throws IOException declared by the overridden method.
     */
    @Override
    protected void writeWordSeparator() throws IOException {
        textCache.append(getWordSeparator(), null);
    }

    /**
     * Appends the unicode string of a TextPosition to the text cache and records the TextPosition for it.
     *
     * @param text The text position whose characters are written to the cache.
     */
    @Override
    protected void writeCharacters(final TextPosition text) {
        textCache.append(text.getUnicode(), text);
    }

    /**
     * Write a string to the text cache. The <code>text</code> argument is ignored; instead
     * {@link #writeCharacters(TextPosition)} is called for every element of <code>textPositions</code>, in order.
     *
     * @param text          The text to write; not used.
     * @param textPositions The TextPositions belonging to the text.
     */
    @Override
    protected void writeString(final String text, final List<TextPosition> textPositions) {
        final Iterator<TextPosition> remaining = textPositions.iterator();
        while (remaining.hasNext()) {
            writeCharacters(remaining.next());
        }
    }

    /**
     * Writes a paragraph separator to the text cache by ending the current paragraph and starting a new one.
     *
     * @throws IOException declared by the overridden method.
     */
    @Override
    protected void writeParagraphSeparator() throws IOException {
        this.writeParagraphEnd();
        this.writeParagraphStart();
    }

    /**
     * Appends the paragraph-start string to the text cache. If a paragraph is still open, it is ended first.
     *
     * @throws IOException declared by the overridden method.
     */
    @Override
    protected void writeParagraphStart() throws IOException {
        if (inParagraph) {
            // close the paragraph that is still open before opening the next one
            writeParagraphEnd();
            inParagraph = false;
        }

        textCache.append(getParagraphStart(), null);
        inParagraph = true;
    }

    /**
     * Appends the paragraph-end string to the text cache and marks the paragraph as closed.
     *
     * @throws IOException declared by the overridden method.
     */
    @Override
    protected void writeParagraphEnd() throws IOException {
        textCache.append(getParagraphEnd(), null);
        inParagraph = false;
    }

    /**
     * Appends the page-start string to the text cache, without an associated text position.
     *
     * @throws IOException declared by the overridden method.
     */
    @Override
    protected void writePageStart() throws IOException {
        textCache.append(getPageStart(), null);
    }

    /**
     * Appends the page-end string to the text cache, without an associated text position.
     *
     * @throws IOException declared by the overridden method.
     */
    @Override
    protected void writePageEnd() throws IOException {
        textCache.append(getPageEnd(), null);
    }

    /**
     * Not supported by this class; always throws.
     *
     * @throws IllegalArgumentException always
     */
    @Override
    public String getText(final PDDocument doc) throws IOException {
        final String reason = "Not applicable for TextHighlight";
        throw new IllegalArgumentException(reason);
    }

    /**
     * Not supported by this class; always throws.
     *
     * @throws IllegalArgumentException always
     */
    @Override
    public void writeText(final PDDocument doc, final Writer outputStream) throws IOException {
        final String reason = "Not applicable for TextHighlight";
        throw new IllegalArgumentException(reason);
    }

    /**
     * Internal utility class that keeps a mapping from the text contents to their TextPositions. This is needed to
     * compute bounding boxes. The data is stored on a per-page basis (keyed on the 1-based pageNo). For every page the
     * text and the position list have the same length: element {@code i} of the list is the TextPosition that
     * produced character {@code i} of the text, or {@code null} for characters without a position (separators).
     */
    public class TextCache {
        private final Map<Integer, StringBuilder> texts = new HashMap<>();
        private final Map<Integer, List<TextPosition>> positions = new HashMap<>();

        /** Returns the text buffer of a page, creating it on first access. */
        private StringBuilder obtainStringBuilder(final Integer pageNo) {
            StringBuilder sb = texts.get(pageNo);
            if (sb == null) {
                sb = new StringBuilder();
                texts.put(pageNo, sb);
            }
            return sb;
        }

        /** Returns the position list of a page, creating it on first access. */
        private List<TextPosition> obtainTextPositions(final Integer pageNo) {
            List<TextPosition> textPositions = positions.get(pageNo);
            if (textPositions == null) {
                textPositions = new ArrayList<>();
                positions.put(pageNo, textPositions);
            }
            return textPositions;
        }

        /**
         * @param pageNo 1-based page number
         * @return the text cached for the page; empty if nothing was cached for it
         */
        public String getText(final Integer pageNo) {
            return obtainStringBuilder(pageNo).toString();
        }

        /**
         * @param pageNo 1-based page number
         * @return the live list of positions cached for the page, one element per character of the page text
         */
        public List<TextPosition> getTextPositions(final Integer pageNo) {
            return obtainTextPositions(pageNo);
        }

        /**
         * Appends a string to the text of the page currently being processed and records {@code pos} once for each
         * of its characters, so that text and position list stay index-aligned.
         *
         * @param str the string to append; {@code null} is ignored
         * @param pos the position that produced the string, or {@code null} for separators
         */
        public void append(final String str, final TextPosition pos) {
            if (str == null) {
                return;
            }
            final int currentPage = getCurrentPageNo();
            final List<TextPosition> pagePositions = obtainTextPositions(currentPage);
            obtainStringBuilder(currentPage).append(str);
            pagePositions.addAll(Collections.nCopies(str.length(), pos));
        }

        /**
         * Given a page and a pattern it will return a list of matches for that pattern. A Match is a tuple of the
         * matched string and the list of TextPositions it covers.
         *
         * @param pageNo  1-based page number
         * @param pattern pattern to search for in the page text
         * @return list of matches
         */
        public List<Match> match(final Integer pageNo, final Pattern pattern) {
            return match(getTextPositions(pageNo), this.getText(pageNo), pattern);
        }

        /**
         * Searches a pattern in the text formed by a list of TextPositions. The searched text is the concatenation
         * of the unicode strings of all non-null elements; {@code null} elements contribute no characters. Match
         * offsets are translated back to element indices, so each returned Match holds the sub-list of
         * {@code textPositions} that spans the matched characters.
         *
         * @param textPositions the positions to search in; may contain {@code null} elements
         * @param pattern       pattern to search for
         * @return list of matches; empty if the search failed
         */
        public List<Match> match(List<TextPosition> textPositions, final Pattern pattern) {
            final StringBuilder sb = new StringBuilder(textPositions.size() * 2);
            // elementOfChar.get(k) is the index in textPositions of the element that produced character k of sb
            final List<Integer> elementOfChar = new ArrayList<>(textPositions.size() * 2);
            int elementIndex = 0;
            for (final TextPosition textPosition : textPositions) {
                final String unicode = textPosition == null ? null : textPosition.getUnicode();
                if (unicode != null) {
                    sb.append(unicode);
                    elementOfChar.addAll(Collections.nCopies(unicode.length(), elementIndex));
                }
                elementIndex++;
            }
            return findMatches(textPositions, sb.toString(), pattern, elementOfChar);
        }

        /**
         * Searches a pattern in {@code text} and returns, for each match, the sub-list of {@code textPositions} with
         * the same start and end offsets. The caller must make sure that character {@code i} of {@code text}
         * corresponds to element {@code i} of {@code textPositions}.
         *
         * @param textPositions positions aligned index by index with {@code text}
         * @param text          the text to search in
         * @param pattern       pattern to search for
         * @return list of matches; empty if the search failed
         */
        public List<Match> match(List<TextPosition> textPositions, String text, final Pattern pattern) {
            return findMatches(textPositions, text, pattern, null);
        }

        /**
         * Runs the pattern over the text and builds the matches. When {@code elementOfChar} is {@code null}, text
         * offsets are used as list indices directly; otherwise they are translated through it. Failures are printed
         * and result in an empty list.
         */
        private List<Match> findMatches(final List<TextPosition> textPositions, final String text,
                final Pattern pattern, final List<Integer> elementOfChar) {
            try {
                final Matcher matcher = pattern.matcher(text);
                final List<Match> matches = new ArrayList<>();

                while (matcher.find()) {
                    int from = matcher.start();
                    int to = matcher.end();
                    if (elementOfChar != null) {
                        final int startChar = from;
                        final int endChar = to;
                        from = startChar < elementOfChar.size() ? elementOfChar.get(startChar) : textPositions.size();
                        // an empty match maps to an empty sub-list
                        to = endChar > startChar ? elementOfChar.get(endChar - 1) + 1 : from;
                    }
                    matches.add(new Match(matcher.group(), textPositions.subList(from, to)));
                }
                return matches;
            } catch (Error e) {
                System.out.println("An error occurred while searching for: " + pattern);
                e.printStackTrace();
                return new ArrayList<Match>();
            } catch (Exception e) {
                System.out.println("An exception occurred while seraching for: " + pattern);
                e.printStackTrace();
                return new ArrayList<Match>();
            }
        }
    }

}