// Java tutorial
/*
 This file is part of pdf2html.

 pdf2html is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.

 pdf2html is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 GNU General Public License for more details.

 You should have received a copy of the GNU General Public License
 along with pdf2html. If not, see <http://www.gnu.org/licenses/>.

 Copyright by timendum.
 Based on org.apache.pdfbox.util.PDFTextStripper by Apache Software Foundation
 */
package net.timendum.pdf;

import java.io.IOException;
import java.util.List;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.font.PDFontDescriptor;
import org.apache.pdfbox.util.Matrix;
import org.apache.pdfbox.util.TextPosition;

import com.google.common.collect.HashMultiset;
import com.google.common.collect.Multiset;

/**
 * Text stripper that emits no text and instead gathers layout statistics
 * about a document: page and line counts, margins, font sizes, font weights,
 * line spacing and the vertical position of the last line of each page.
 *
 * <p>Apart from {@link #getAverangeLines()}, which is a real arithmetic mean
 * (lines per page), every "averange" (sic) value exposed by this class is the
 * <em>most frequent</em> value that was observed, not a mean. The summary
 * values are only computed in {@link #endDocument(PDDocument)}.
 */
public class StatisticParser extends LocalPDFTextStripper {

	// Font descriptor flag bit masks. Only FLAG_ITALIC and FLAG_FORCE_BOLD are
	// read in this class; the others are kept for reference.
	private static final int FLAG_FIXED_PITCH = 1;
	private static final int FLAG_SERIF = 2;
	private static final int FLAG_SYMBOLIC = 4;
	private static final int FLAG_SCRIPT = 8;
	private static final int FLAG_NON_SYMBOLIC = 32;
	private static final int FLAG_ITALIC = 64;
	private static final int FLAG_ALL_CAP = 65536;
	private static final int FLAG_SMALL_CAP = 131072;
	private static final int FLAG_FORCE_BOLD = 262144;

	// Counters are floats because the public getters expose them as floats.
	private float pages = 0;
	private float lines = 0;

	// Raw observations, one entry per occurrence.
	private Multiset<Float> leftMargin = HashMultiset.create();
	private Multiset<Float> rightMargin = HashMultiset.create();
	private Multiset<Float> linesFontSize = HashMultiset.create();
	private Multiset<Float> lineSpacing = HashMultiset.create();
	private Multiset<Float> lastLine = HashMultiset.create();
	private Multiset<Float> fontWeight = HashMultiset.create();

	// Summary values, filled in by endDocument().
	private float averangeLine = 0;
	private float averangeLeftMargin;
	private float averangeRightMargin;
	private float averangeLineSpacing = 0;
	private float averangeLastLine = 0;
	private float averangeFontSize = 0;
	private float averangeFontWeight = 0;

	/** Y coordinate of the previous non-empty line on the current page, or -1 if none yet. */
	private float prevLineY = -1f;

	/**
	 * Creates an empty statistics collector.
	 *
	 * @throws IOException declared for the superclass constructor
	 */
	public StatisticParser() throws IOException {
	}

	/**
	 * Counts the page and forgets the previous line position, so that line
	 * spacing is never measured across a page boundary.
	 */
	@Override
	protected void startPage(PDPage page) throws IOException {
		pages++;
		prevLineY = -1f;
	}

	/**
	 * Records the statistics of one line: distance from the previous line,
	 * left and right extent of the trimmed line, and the font weight and font
	 * size of every text position in it. Empty lines are ignored.
	 *
	 * @param line the text positions making up the line
	 */
	@Override
	protected void writeLineStart(List<TextPosition> line) {
		if (isLineEmpty(line)) {
			return;
		}
		lines++;

		TextPosition first = getFirstTrimmed(line);
		float lineY = first.getY();
		if (prevLineY >= 0f) {
			incrementOrAdd(lineSpacing, lineY - prevLineY);
		}
		prevLineY = lineY;

		incrementOrAdd(leftMargin, first.getX());

		TextPosition last = getLastTrimmed(line);
		incrementOrAdd(rightMargin, last.getX() + last.getWidth());

		for (TextPosition t : line) {
			PDFont font = t.getFont();
			if (font != null) {
				PDFontDescriptor fontDescriptor = font.getFontDescriptor();
				if (fontDescriptor != null) {
					incrementOrAdd(fontWeight, fontDescriptor.getFontWeight());
				}
			}
			float fontSize = t.getFontSizeInPt();
			if (fontSize > 0) {
				incrementOrAdd(linesFontSize, fontSize);
			}
		}
	}

	/**
	 * Records the Y coordinate of the last non-empty line of the page, if the
	 * page had any.
	 */
	@Override
	protected void endPage(PDPage page) throws IOException {
		if (prevLineY >= 0f) {
			incrementOrAdd(lastLine, prevLineY);
		}
	}

	/** Adds one occurrence of {@code key} to {@code multiset}. */
	private void incrementOrAdd(Multiset<Float> multiset, float key) {
		multiset.add(key);
	}

	/**
	 * Computes the summary values from the collected observations.
	 *
	 * @param pdf the document that was processed (not read here)
	 */
	@Override
	public void endDocument(PDDocument pdf) throws IOException {
		// Guard against 0/0 = NaN when the document had no pages.
		averangeLine = pages > 0 ? lines / pages : 0f;
		averangeLeftMargin = findMax(leftMargin);
		averangeRightMargin = findMax(rightMargin);
		averangeFontSize = findMax(linesFontSize);
		averangeLineSpacing = findMax(lineSpacing);
		averangeLastLine = findMax(lastLine);
		averangeFontWeight = findMax(fontWeight);
	}

	/**
	 * Returns the element with the highest occurrence count.
	 *
	 * @param multiset the observations to inspect
	 * @return the most frequent element, the first one encountered on ties,
	 *         or {@code 0} if the multiset is empty
	 */
	private float findMax(Multiset<Float> multiset) {
		float mostFrequent = 0f;
		int highestCount = -1;
		// entrySet() visits each distinct element once instead of once per occurrence.
		for (Multiset.Entry<Float> entry : multiset.entrySet()) {
			int count = entry.getCount();
			if (count > highestCount) {
				highestCount = count;
				mostFrequent = entry.getElement();
			}
		}
		return mostFrequent;
	}

	/** @return the number of pages seen */
	public float getPages() {
		return pages;
	}

	/** @return the number of non-empty lines seen */
	public float getLines() {
		return lines;
	}

	/** @return the mean number of non-empty lines per page, or 0 if there were no pages */
	public float getAverangeLines() {
		return averangeLine;
	}

	/** @return the most frequent X coordinate at which a line starts */
	public float getAverangeLeftMargin() {
		return averangeLeftMargin;
	}

	/** @return the most frequent X coordinate at which a line ends */
	public float getAverangeRightMargin() {
		return averangeRightMargin;
	}

	/** @return the most frequent font size, in points */
	public float getAverangeFontSize() {
		return averangeFontSize;
	}

	/** @return the most frequent Y distance between consecutive lines of a page */
	public float getAverangeLineSpacing() {
		return averangeLineSpacing;
	}

	/** @return the most frequent Y coordinate of the last line of a page */
	public float getAverangeLastLine() {
		return averangeLastLine;
	}

	/** @return the most frequent font weight */
	public float getAverangeFontWeight() {
		return averangeFontWeight;
	}

	// The hooks below are intentionally empty: this class only gathers
	// statistics and writes no output of its own.

	@Override
	protected void startArticle() throws IOException {
	}

	@Override
	protected void startArticle(boolean isltr) throws IOException {
	}

	@Override
	protected void endArticle() throws IOException {
	}

	@Override
	protected void writeCharacters(TextPosition text) throws IOException {
	}

	@Override
	protected void writeHeader() throws IOException {
	}

	@Override
	protected void writeLineSeparator() throws IOException {
	}

	@Override
	protected void writePageEnd() throws IOException {
	}

	@Override
	protected void writePageSeperator() throws IOException {
	}

	@Override
	protected void writePageStart() throws IOException {
	}

	@Override
	protected void writeParagraphEnd() throws IOException {
	}

	@Override
	protected void writeParagraphSeparator() throws IOException {
	}

	@Override
	protected void writeParagraphStart() throws IOException {
	}

	@Override
	protected void writeString(String chars) throws IOException {
	}

	@Override
	protected void writeWordSeparator() throws IOException {
	}

	/**
	 * Tells whether a font descriptor describes an italic font: non-zero
	 * italic angle, italic flag set, or a font name containing "Italic".
	 *
	 * @param descriptor the font descriptor, may be {@code null}
	 * @return {@code true} if the font looks italic; {@code false} otherwise
	 *         or if {@code descriptor} is {@code null}
	 */
	public boolean isItalic(PDFontDescriptor descriptor) {
		if (descriptor == null) {
			return false;
		}
		if (descriptor.getItalicAngle() != 0f) {
			return true;
		}
		if ((descriptor.getFlags() & FLAG_ITALIC) == FLAG_ITALIC) {
			return true;
		}
		String fontName = descriptor.getFontName();
		return fontName != null && fontName.indexOf("Italic") > -1;
	}

	/**
	 * Tells whether a font descriptor describes a bold font: weight above the
	 * document's most frequent weight, force-bold flag set, or a font name
	 * containing "Bold". The weight comparison is only meaningful after
	 * {@link #endDocument(PDDocument)} has run.
	 *
	 * @param descriptor the font descriptor, may be {@code null}
	 * @return {@code true} if the font looks bold; {@code false} otherwise
	 *         or if {@code descriptor} is {@code null}
	 */
	public boolean isBold(PDFontDescriptor descriptor) {
		if (descriptor == null) {
			return false;
		}
		if (descriptor.getFontWeight() > averangeFontWeight) {
			return true;
		}
		if ((descriptor.getFlags() & FLAG_FORCE_BOLD) == FLAG_FORCE_BOLD) {
			return true;
		}
		String fontName = descriptor.getFontName();
		return fontName != null && fontName.indexOf("Bold") > -1;
	}

	/**
	 * Tells whether a text position is rendered in italics, either because
	 * its font is italic or because its text matrix scales X and Y
	 * differently.
	 *
	 * @param text the text position to inspect
	 * @return {@code true} if the text looks italic
	 */
	public boolean isItalic(TextPosition text) {
		// Font and descriptor may be missing (writeLineStart guards for the same).
		PDFont font = text.getFont();
		if (font != null && isItalic(font.getFontDescriptor())) {
			return true;
		}
		Matrix textPos = text.getTextPos();
		if (textPos == null) {
			return false;
		}
		// NOTE(review): any non-uniform scaling is reported as italic - confirm
		// this heuristic is intended. Written as two comparisons (not !=) so
		// that NaN scales keep yielding false.
		float xScale = textPos.getXScale();
		float yScale = textPos.getYScale();
		return xScale < yScale || xScale > yScale;
	}

	/**
	 * Returns a multi-line, human-readable dump of the collected statistics.
	 * For each multiset-backed value the total number of observations and the
	 * count of the most frequent one are appended as {@code <size>x<count>}.
	 */
	@Override
	public String toString() {
		StringBuilder builder = new StringBuilder();
		builder.append("Pages=").append(pages);
		builder.append("\nlines=").append(lines);
		//@formatter:off
		builder.append("\naverangeLineSpacing=").append(averangeLineSpacing).append(" #lineSpacing=")
			.append(lineSpacing.size()).append('x').append(lineSpacing.count(averangeLineSpacing));
		builder.append("\naverangeLastLine=").append(averangeLastLine).append(" #lastLine=").append(lastLine.size())
			.append('x').append(lastLine.count(averangeLastLine));
		builder.append("\naverangeLine=").append(averangeLine);
		builder.append("\naverangeLeftMargin=").append(averangeLeftMargin).append(", #leftMargin=")
			.append(leftMargin.size()).append('x').append(leftMargin.count(averangeLeftMargin));
		builder.append("\naverangeRightMargin=").append(averangeRightMargin).append(" #rightMargin=")
			.append(rightMargin.size()).append('x').append(rightMargin.count(averangeRightMargin));
		builder.append("\naverangeFontSize=").append(averangeFontSize).append(" #linesFontSize=")
			.append(linesFontSize.size()).append('x').append(linesFontSize.count(averangeFontSize));
		//@formatter:on
		return builder.toString();
	}
}