Java tutorial
/* * Copyright (C) 2017 FormKiQ Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.formkiq.core.service.generator.pdfbox; import static com.formkiq.core.service.generator.pdfbox.PDRectangleUtil.calculateTextPosition; import static org.apache.commons.lang3.StringUtils.isEmpty; import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.logging.Level; import java.util.logging.Logger; import java.util.stream.Collectors; import org.apache.pdfbox.pdmodel.common.PDRectangle; import org.apache.pdfbox.pdmodel.font.PDFont; import org.apache.pdfbox.pdmodel.interactive.form.PDField; import org.apache.pdfbox.text.PDFTextStripper; import org.apache.pdfbox.text.TextPosition; import com.formkiq.core.util.Strings; /** * Maps PDF Text to {@link PDField}. * */ public class TextToPDFieldMapper extends PDFTextStripper { /** Logger. */ private static final Logger LOG = Logger.getLogger(TextToPDFieldMapper.class.getName()); /** {@link List} of {@link PdfTextField}. */ private Map<Integer, List<PdfTextField>> textLocations; /** * constructor. * @throws IOException IOException */ public TextToPDFieldMapper() throws IOException { super(); this.textLocations = new HashMap<>(); } /** * Override the default functionality of PDFTextStripper. */ @Override protected void writeString(final String o, final List<TextPosition> textPositions) throws IOException { Integer page = Integer.valueOf(getCurrentPageNo() - 1); if (!this.textLocations.containsKey(page)) { this.textLocations.put(page, new ArrayList<>()); } // TODO replace with CollectionUtil.groupBy List<List<TextPosition>> splits = split(removeNonPrintableAndExtraSpaces(textPositions)); for (List<TextPosition> tps : splits) { String text = tps.stream().map(s -> s.getUnicode()).collect(Collectors.joining()); if (text.matches(".*[a-zA-Z0-9/]+.*")) { PDRectangle rect = calculateTextPosition(tps); PDFont font = tps.get(0).getFont(); float fontSize = tps.stream().map(s -> Float.valueOf(s.getFontSizeInPt())).max(Float::compare) .orElse(Float.valueOf(0)).floatValue(); PdfTextField tf = new PdfTextField(); tf.setText(text.replaceAll("\t", " ")); tf.setRectangle(rect); tf.setFontSize(fontSize); tf.setFontName(font.getName()); LOG.log(Level.FINE, "page=" + page + ",rect=" + rect + ",fontsize=" + fontSize + ",font=" + font + ",text=" + text); this.textLocations.get(page).add(tf); } } } /** * Remove Non Printable Characters and extra spaces. * @param textPositions {@link List} of {@link TextPosition} * @return {@link List} of {@link TextPosition} */ private List<TextPosition> removeNonPrintableAndExtraSpaces(final List<TextPosition> textPositions) { List<TextPosition> list = textPositions.stream() .filter(s -> cleanTextContent(s.getUnicode()).equals(s.getUnicode())).collect(Collectors.toList()); int c = 0; Iterator<TextPosition> itr = list.iterator(); while (itr.hasNext()) { TextPosition tp = itr.next(); if (isEmpty(tp.getUnicode().trim())) { c++; if (c > 2) { itr.remove(); } } else { c = 0; } } return list; } /** * Remove Non Prinable Characters. * * @param s {@link String} * @return {@link String} */ private String cleanTextContent(final String s) { if (Strings.getBytes(s).length > 1) { final int maxUnicodeCharacter = 8300; char c = s.charAt(0); if (Integer.toUnsignedLong(c) > maxUnicodeCharacter) { return ""; } } return s; } /** * Split the Text Position if characters are too far apart. * @param textPositions {@link List} of {@link TextPosition} * @return {@link List} of {@link List} of {@link TextPosition} */ private List<List<TextPosition>> split(final List<TextPosition> textPositions) { final int fudgewidth = 5; List<List<TextPosition>> list = new ArrayList<>(); List<TextPosition> remainder = new ArrayList<>(textPositions); int s = 0; int size = textPositions.size(); for (int i = 1; i < size; i++) { TextPosition p = textPositions.get(i - 1); TextPosition c = textPositions.get(i); float cpos = c.getTextMatrix().getTranslateX(); float ppos = p.getTextMatrix().getTranslateX() + p.getWidth() + fudgewidth; if (cpos > ppos) { List<TextPosition> tp = new ArrayList<>(); for (int j = s; j < i; j++) { tp.add(textPositions.get(j)); remainder.remove(textPositions.get(j)); } list.add(tp); s = i; } } list.add(remainder); return list; } /** * @return {@link Map} */ public Map<Integer, List<PdfTextField>> getTextLocations() { return this.textLocations; } }