Pdf2TxtUtils.CustomLocationTextExtractionStrategy.java Source code

Introduction

Here is the source code for Pdf2TxtUtils.CustomLocationTextExtractionStrategy.java
Source

/*
 * $Id: LocationTextExtractionStrategy.java 5498 2012-10-25 20:59:53Z trumpetinc $
 *
 * This file is part of the iText (R) project.
 * Copyright (c) 1998-2012 1T3XT BVBA
 * Authors: Kevin Day, Bruno Lowagie, Paulo Soares, et al.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License version 3
 * as published by the Free Software Foundation with the addition of the
 * following permission added to Section 15 as permitted in Section 7(a):
 * FOR ANY PART OF THE COVERED WORK IN WHICH THE COPYRIGHT IS OWNED BY 1T3XT,
 * 1T3XT DISCLAIMS THE WARRANTY OF NON INFRINGEMENT OF THIRD PARTY RIGHTS.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE.
 * See the GNU Affero General Public License for more details.
 * You should have received a copy of the GNU Affero General Public License
 * along with this program; if not, see http://www.gnu.org/licenses or write to
 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
 * Boston, MA, 02110-1301 USA, or download the license from the following URL:
 * http://itextpdf.com/terms-of-use/
 *
 * The interactive user interfaces in modified source and object code versions
 * of this program must display Appropriate Legal Notices, as required under
 * Section 5 of the GNU Affero General Public License.
 *
 * In accordance with Section 7(b) of the GNU Affero General Public License,
 * a covered work must retain the producer line in every PDF that is created
 * or manipulated using iText.
 *
 * You can be released from the requirements of the license by purchasing
 * a commercial license. Buying such a license is mandatory as soon as you
 * develop commercial activities involving the iText software without
 * disclosing the source code of your own applications.
 * These activities include: offering paid services to customers as an ASP,
 * serving PDFs on the fly in a web application, shipping iText with a closed
 * source product.
 *
 * For more information, please contact iText Software Corp. at this
 * address: sales@itextpdf.com
 */
package Pdf2TxtUtils;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;

import com.itextpdf.text.pdf.DocumentFont;
import com.itextpdf.text.pdf.parser.FilteredRenderListener;
import com.itextpdf.text.pdf.parser.ImageRenderInfo;
import com.itextpdf.text.pdf.parser.LineSegment;
import com.itextpdf.text.pdf.parser.Matrix;
import com.itextpdf.text.pdf.parser.TextExtractionStrategy;
import com.itextpdf.text.pdf.parser.TextRenderInfo;
import com.itextpdf.text.pdf.parser.Vector;

/**
 * <b>Development preview</b> - this class (and all of the parser classes) are still experiencing heavy development, and are
 * subject to change both behavior and interface. <br>
 * A text extraction renderer that keeps track of relative position of text on page The resultant text will be relatively
 * consistent with the physical layout that most PDF files have on screen. <br>
 * This renderer keeps track of the orientation and distance (both perpendicular and parallel) to the unit vector of the
 * orientation. Text is ordered by orientation, then perpendicular, then parallel distance. Text with the same perpendicular
 * distance, but different parallel distance is treated as being on the same line. <br>
 * This renderer also uses a simple strategy based on the font metrics to determine if a blank space should be inserted into the
 * output.
 * 
 * @since 5.0.2
 */
public class CustomLocationTextExtractionStrategy implements TextExtractionStrategy {

    /** set to true for debugging */
    static boolean DUMP_STATE = false;

    /** a summary of all found text */
    private final List<TextChunk> locationalResult = new ArrayList<TextChunk>();

    /**
     * Creates a new text extraction renderer.
     */
    public CustomLocationTextExtractionStrategy() {
    }

    /**
     * @see com.itextpdf.text.pdf.parser.RenderListener#beginTextBlock()
     */
    public void beginTextBlock() {
    }

    /**
     * @see com.itextpdf.text.pdf.parser.RenderListener#endTextBlock()
     */
    public void endTextBlock() {
    }

    /**
     * @param str
     * @return true if the string starts with a space character, false if the string is empty or starts with a non-space character
     */
    private boolean startsWithSpace(String str) {
        if (str.length() == 0)
            return false;
        return str.charAt(0) == ' ';
    }

    /**
     * @param str
     * @return true if the string ends with a space character, false if the string is empty or ends with a non-space character
     */
    private boolean endsWithSpace(String str) {
        if (str.length() == 0)
            return false;
        return str.charAt(str.length() - 1) == ' ';
    }

    /**
     * Filters the provided list with the provided filter
     * 
     * @param textChunks
     *            a list of all TextChunks that this strategy found during processing
     * @param filter
     *            the filter to apply. If null, filtering will be skipped.
     * @return the filtered list
     * @since 5.3.3
     */
    private List<TextChunk> filterTextChunks(List<TextChunk> textChunks, TextChunkFilter filter) {
        if (filter == null)
            return textChunks;

        List<TextChunk> filtered = new ArrayList<TextChunk>();
        for (TextChunk textChunk : textChunks) {
            if (filter.accept(textChunk))
                filtered.add(textChunk);
        }
        return filtered;
    }

    /**
     * Gets text that meets the specified filter If multiple text extractions will be performed for the same page (i.e. for
     * different physical regions of the page), filtering at this level is more efficient than filtering using
     * {@link FilteredRenderListener} - but not nearly as powerful because most of the RenderInfo state is not captured in
     * {@link TextChunk}
     * 
     * @param chunkFilter
     *            the filter to to apply
     * @return the text results so far, filtered using the specified filter
     */
    public String getResultantText(TextChunkFilter chunkFilter) {
        if (DUMP_STATE)
            dumpState();

        List<TextChunk> filteredTextChunks = filterTextChunks(locationalResult, chunkFilter);
        Collections.sort(filteredTextChunks);

        StringBuffer sb = new StringBuffer();
        TextChunk lastChunk = null;
        boolean forceSameLine = false;
        for (TextChunk chunk : filteredTextChunks) {

            if (chunk.text.length() == 1) {
                char o = chunk.text.charAt(0);
                if (o == 'o' || o == '-')
                    continue;
            }

            if (lastChunk == null) {
                sb.append(chunk.text);
            } else {
                if (chunk.sameLine(lastChunk)) {
                    float dist = chunk.distanceFromEndOf(lastChunk);

                    if (dist < -chunk.charSpaceWidth)
                        sb.append(' ');
                    // we only insert a blank space if the trailing character of
                    // the previous string wasn't a space,
                    // and the leading character of the current string isn't a
                    // space
                    else if (dist > chunk.charSpaceWidth / 2.0f && !startsWithSpace(chunk.text)
                            && !endsWithSpace(lastChunk.text))
                        sb.append(' ');

                    // sb.append(chunk.text);
                    String chunktext = chunk.text;

                    // fora o proximo trecho para a mesma linha se terminou em -, para tratar separaao silbica
                    if ((chunktext.trim().length() > 0)
                            && (chunktext.trim().charAt(chunktext.trim().length() - 1) == '-')) {
                        chunktext = chunk.text.trim().substring(0, chunk.text.trim().lastIndexOf('-'));
                        forceSameLine = true;
                    }

                    sb.append(chunktext);

                } else {

                    if (!forceSameLine) {
                        if (chunk.specialLineBreak(lastChunk, chunk.NORMAL_SPACE_BW_LINES)) {
                            sb.append('\n');
                            sb.append("@@DOUBLE_NW@@");

                        }

                        sb.append('\n');
                    } else {
                        forceSameLine = false;
                    }
                    sb.append(chunk.text);

                }
            }
            lastChunk = chunk;
        }

        return sb.toString();
    }

    /**
     * Returns the result so far.
     * 
     * @return a String with the resulting text.
     */
    public String getResultantText() {

        return getResultantText(null);

    }

    /** Used for debugging only */
    private void dumpState() {
        for (Iterator<TextChunk> iterator = locationalResult.iterator(); iterator.hasNext();) {
            TextChunk location = (TextChunk) iterator.next();

            location.printDiagnostics();

            System.out.println();
        }

    }

    /**
     * 
     * @see com.itextpdf.text.pdf.parser.RenderListener#renderText(com.itextpdf.text.pdf.parser.TextRenderInfo)
     */
    public void renderText(TextRenderInfo renderInfo) {
        LineSegment segment = renderInfo.getBaseline();
        if (renderInfo.getRise() != 0) { // remove the rise from the baseline -
                                         // we do this because the text from
                                         // a
                                         // super/subscript render operations
                                         // should probably be considered as
                                         // part of
                                         // the baseline of the text the
                                         // super/sub is relative to
            Matrix riseOffsetTransform = new Matrix(0, -renderInfo.getRise());
            segment = segment.transformBy(riseOffsetTransform);
        }

        TipoTexto tipo = TipoTexto.NORMAL;

        if (renderInfo.getFont().getFontDescriptor(DocumentFont.CAPHEIGHT, 1000) > 681) {
            /*
             * System.out.println("OPA!!!:-- " + renderInfo.getFont().getFontDescriptor( DocumentFont.CAPHEIGHT, 1000) + " x " +
             * renderInfo.getText());
             */
            tipo = TipoTexto.AUTONOMOS;
        }

        TextChunk location = new TextChunk(renderInfo.getText(), segment.getStartPoint(), segment.getEndPoint(),
                renderInfo.getSingleSpaceWidth(), tipo);
        locationalResult.add(location);
    }

    /**
     * Represents a chunk of text, it's orientation, and location relative to the orientation vector
     */
    public static class TextChunk implements Comparable<TextChunk> {
        /** expected space bewteen lines **/
        private final int NORMAL_SPACE_BW_LINES = 11;

        /** the text of the chunk */
        private final String text;
        /** the starting location of the chunk */
        private final Vector startLocation;
        /** the ending location of the chunk */
        private final Vector endLocation;
        /** unit vector in the orientation of the chunk */
        private final Vector orientationVector;
        /** the orientation as a scalar for quick sorting */
        private final int orientationMagnitude;
        /**
         * perpendicular distance to the orientation unit vector (i.e. the Y position in an unrotated coordinate system) we round
         * to the nearest integer to handle the fuzziness of comparing floats
         */
        private final int distPerpendicular;
        /**
         * distance of the start of the chunk parallel to the orientation unit vector (i.e. the X position in an unrotated
         * coordinate system)
         */
        private final float distParallelStart;
        /**
         * distance of the end of the chunk parallel to the orientation unit vector (i.e. the X position in an unrotated
         * coordinate system)
         */
        private final float distParallelEnd;
        /** the width of a single space character in the font of the chunk */
        private final float charSpaceWidth;

        public TipoTexto tipo;

        public TextChunk(String string, Vector startLocation, Vector endLocation, float charSpaceWidth,
                TipoTexto _tipo) {
            this.text = string;
            this.startLocation = startLocation;
            this.endLocation = endLocation;
            this.charSpaceWidth = charSpaceWidth;
            this.tipo = _tipo;

            Vector oVector = endLocation.subtract(startLocation);
            if (oVector.length() == 0) {
                oVector = new Vector(1, 0, 0);
            }
            orientationVector = oVector.normalize();
            orientationMagnitude = (int) (Math.atan2(orientationVector.get(Vector.I2),
                    orientationVector.get(Vector.I1)) * 1000);

            // see
            // http://mathworld.wolfram.com/Point-LineDistance2-Dimensional.html
            // the two vectors we are crossing are in the same plane, so the
            // result will be purely
            // in the z-axis (out of plane) direction, so we just take the I3
            // component of the result
            Vector origin = new Vector(0, 0, 1);
            distPerpendicular = (int) (startLocation.subtract(origin)).cross(orientationVector).get(Vector.I3);

            distParallelStart = orientationVector.dot(startLocation);
            distParallelEnd = orientationVector.dot(endLocation);
        }

        /**
         * @return the start location of the text
         */
        public Vector getStartLocation() {
            return startLocation;
        }

        /**
         * @return the end location of the text
         */
        public Vector getEndLocation() {
            return endLocation;
        }

        private void printDiagnostics() {
            System.out.println("Text (@" + startLocation + " -> " + endLocation + "): " + text);
            System.out.println("orientationMagnitude: " + orientationMagnitude);
            System.out.println("distPerpendicular: " + distPerpendicular);
            System.out.println("distParallel: " + distParallelStart);
        }

        /**
         * @param as
         *            the location to compare to
         * @return true is this location is on the the same line as the other
         */
        public boolean sameLine(TextChunk as) {
            if (orientationMagnitude != as.orientationMagnitude)
                return false;

            if (distPerpendicular != as.distPerpendicular)
                return false;

            return true;
        }

        /**
         * @param as
         *            the location to compare to
         * @return true se for um espao grande entre as linhas (candidato a titulo de portaria)
         */
        public boolean specialLineBreak(TextChunk as, int normalSpaceBetweenLines) {

            return (distPerpendicular - as.distPerpendicular) > normalSpaceBetweenLines;
        }

        /**
         * Computes the distance between the end of 'other' and the beginning of this chunk in the direction of this chunk's
         * orientation vector. Note that it's a bad idea to call this for chunks that aren't on the same line and orientation, but
         * we don't explicitly check for that condition for performance reasons.
         * 
         * @param other
         * @return the number of spaces between the end of 'other' and the beginning of this chunk
         */
        public float distanceFromEndOf(TextChunk other) {
            float distance = distParallelStart - other.distParallelEnd;
            return distance;
        }

        /**
         * Compares based on orientation, perpendicular distance, then parallel distance
         * 
         * @see java.lang.Comparable#compareTo(java.lang.Object)
         */
        public int compareTo(TextChunk rhs) {
            if (this == rhs)
                return 0; // not really needed, but just in case

            int rslt;
            rslt = compareInts(orientationMagnitude, rhs.orientationMagnitude);
            if (rslt != 0)
                return rslt;

            rslt = compareInts(distPerpendicular, rhs.distPerpendicular);
            if (rslt != 0)
                return rslt;

            return Float.compare(distParallelStart, rhs.distParallelStart);
        }

        /**
         * 
         * @param int1
         * @param int2
         * @return comparison of the two integers
         */
        private static int compareInts(int int1, int int2) {
            return int1 == int2 ? 0 : int1 < int2 ? -1 : 1;
        }

    }

    /**
     * no-op method - this renderer isn't interested in image events
     * 
     * @see com.itextpdf.text.pdf.parser.RenderListener#renderImage(com.itextpdf.text.pdf.parser.ImageRenderInfo)
     * @since 5.0.1
     */
    public void renderImage(ImageRenderInfo renderInfo) {
        // do nothing
    }

    /**
     * Specifies a filter for filtering {@link TextChunk} objects during text extraction
     * 
     * @see CustomLocationTextExtractionStrategy#getResultantText(TextChunkFilter)
     * @since 5.3.3
     */
    public static interface TextChunkFilter {
        /**
         * @param textChunk
         *            the chunk to check
         * @return true if the chunk should be allowed
         */
        public boolean accept(TextChunk textChunk);
    }
}