com.formkiq.core.service.generator.pdfbox.TextToPDFieldMapper.java Source code

Java tutorial

Introduction

Here is the source code for TextToPDFieldMapper.java, from the package com.formkiq.core.service.generator.pdfbox.

Source

/*
 * Copyright (C) 2017 FormKiQ Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.formkiq.core.service.generator.pdfbox;

import static com.formkiq.core.service.generator.pdfbox.PDRectangleUtil.calculateTextPosition;
import static org.apache.commons.lang3.StringUtils.isEmpty;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.interactive.form.PDField;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;

import com.formkiq.core.util.Strings;

/**
 * {@link PDFTextStripper} that records where text appears on each page so it can later be
 * mapped to a {@link PDField}.
 *
 * <p>Every run of text handed to {@link #writeString(String, List)} is cleaned, split into
 * horizontally contiguous groups and stored as a {@link PdfTextField} (text, bounding rectangle,
 * font name and font size) under the zero-based index of the page being processed. The collected
 * fields are available through {@link #getTextLocations()}.</p>
 */
public class TextToPDFieldMapper extends PDFTextStripper {

    /** Logger. */
    private static final Logger LOG = Logger.getLogger(TextToPDFieldMapper.class.getName());

    /** A group of text is only recorded when it contains at least one of these characters. */
    private static final Pattern FIELD_TEXT = Pattern.compile("[a-zA-Z0-9/]");

    /** Multi-byte text whose first character is above this value is treated as non printable. */
    private static final int MAX_UNICODE_CHARACTER = 8300;

    /** Number of consecutive blank characters that are kept; any further ones are dropped. */
    private static final int MAX_CONSECUTIVE_BLANKS = 2;

    /** Extra horizontal distance tolerated between two characters before they are split apart. */
    private static final int FUDGE_WIDTH = 5;

    /** {@link List} of {@link PdfTextField} keyed by zero-based page index. */
    private final Map<Integer, List<PdfTextField>> textLocations;

    /**
     * constructor.
     * @throws IOException if the {@link PDFTextStripper} constructor fails
     */
    public TextToPDFieldMapper() throws IOException {
        super();

        this.textLocations = new HashMap<>();
    }

    /**
     * Override the default functionality of PDFTextStripper: instead of writing the text out,
     * record each contiguous group of characters as a {@link PdfTextField} for the current page.
     *
     * @param o the text as assembled by the stripper (unused; the text is rebuilt from
     *        {@code textPositions})
     * @param textPositions {@link List} of {@link TextPosition} making up the text
     * @throws IOException IOException
     */
    @Override
    protected void writeString(final String o, final List<TextPosition> textPositions) throws IOException {

        // getCurrentPageNo() is shifted by one so that the map is keyed by a zero-based index.
        Integer page = Integer.valueOf(getCurrentPageNo() - 1);

        // The page entry is created even when no text ends up being recorded for it.
        List<PdfTextField> fields = this.textLocations.computeIfAbsent(page, k -> new ArrayList<>());

        // TODO replace with CollectionUtil.groupBy
        List<List<TextPosition>> splits = split(removeNonPrintableAndExtraSpaces(textPositions));

        for (List<TextPosition> tps : splits) {

            String text = tps.stream().map(TextPosition::getUnicode).collect(Collectors.joining());

            // find() (rather than matches() with ".*") keeps text that contains a line terminator
            // and avoids recompiling the expression for every group.
            if (!FIELD_TEXT.matcher(text).find()) {
                continue;
            }

            PDRectangle rect = calculateTextPosition(tps);

            // A non-empty text guarantees tps has at least one element.
            PDFont font = tps.get(0).getFont();
            float fontSize = tps.stream().map(s -> Float.valueOf(s.getFontSizeInPt())).max(Float::compare)
                    .orElse(Float.valueOf(0)).floatValue();

            PdfTextField tf = new PdfTextField();
            tf.setText(text.replace('\t', ' '));
            tf.setRectangle(rect);
            tf.setFontSize(fontSize);
            tf.setFontName(font.getName());

            // Supplier overload: the message is only built when FINE logging is enabled.
            LOG.log(Level.FINE, () -> "page=" + page + ",rect=" + rect + ",fontsize=" + fontSize + ",font=" + font
                    + ",text=" + text);
            fields.add(tf);
        }
    }

    /**
     * Remove Non Printable Characters and extra spaces.
     *
     * <p>At most {@value #MAX_CONSECUTIVE_BLANKS} consecutive blank characters are kept; the
     * input list itself is never modified.</p>
     *
     * @param textPositions {@link List} of {@link TextPosition}
     * @return new, mutable {@link List} of {@link TextPosition}
     */
    private List<TextPosition> removeNonPrintableAndExtraSpaces(final List<TextPosition> textPositions) {

        // Collectors.toList() gives no mutability guarantee, and elements are removed below,
        // so collect explicitly into an ArrayList.
        List<TextPosition> list = textPositions.stream()
                .filter(s -> cleanTextContent(s.getUnicode()).equals(s.getUnicode()))
                .collect(Collectors.toCollection(ArrayList::new));

        int blanks = 0;
        Iterator<TextPosition> itr = list.iterator();
        while (itr.hasNext()) {

            TextPosition tp = itr.next();
            if (isEmpty(tp.getUnicode().trim())) {
                blanks++;
                if (blanks > MAX_CONSECUTIVE_BLANKS) {
                    itr.remove();
                }
            } else {
                blanks = 0;
            }
        }

        return list;
    }

    /**
     * Remove Non Printable Characters.
     *
     * <p>Text that encodes to more than one byte and whose first character is above
     * {@value #MAX_UNICODE_CHARACTER} is replaced by an empty string; anything else is returned
     * unchanged.</p>
     *
     * @param s {@link String}
     * @return {@link String}
     */
    private String cleanTextContent(final String s) {

        // char is unsigned, so it can be compared to the limit directly.
        if (Strings.getBytes(s).length > 1 && s.charAt(0) > MAX_UNICODE_CHARACTER) {
            return "";
        }

        return s;
    }

    /**
     * Split the Text Position if characters are too far apart.
     *
     * <p>A new group starts whenever a character's X translation lies more than
     * {@value #FUDGE_WIDTH} beyond the end (X translation plus width) of the previous character.
     * The result always contains at least one group, which is empty for an empty input.</p>
     *
     * @param textPositions {@link List} of {@link TextPosition}
     * @return {@link List} of {@link List} of {@link TextPosition}
     */
    private List<List<TextPosition>> split(final List<TextPosition> textPositions) {

        List<List<TextPosition>> groups = new ArrayList<>();

        int start = 0;
        int size = textPositions.size();

        for (int i = 1; i < size; i++) {

            TextPosition previous = textPositions.get(i - 1);
            TextPosition current = textPositions.get(i);

            float currentStart = current.getTextMatrix().getTranslateX();
            float previousEnd = previous.getTextMatrix().getTranslateX() + previous.getWidth() + FUDGE_WIDTH;

            if (currentStart > previousEnd) {
                // Copy the slice so every group is an independent, mutable list.
                groups.add(new ArrayList<>(textPositions.subList(start, i)));
                start = i;
            }
        }

        // Whatever is left after the last gap forms the final group.
        groups.add(new ArrayList<>(textPositions.subList(start, size)));

        return groups;
    }

    /**
     * Returns the text fields collected so far, keyed by zero-based page index.
     *
     * <p>This is the internal map itself, not a copy.</p>
     *
     * @return {@link Map}
     */
    public Map<Integer, List<PdfTextField>> getTextLocations() {
        return this.textLocations;
    }
}