// Java tutorial: PDF text extraction with Apache PDFBox
/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.ByteArrayOutputStream; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStream; import java.io.OutputStreamWriter; import java.io.Writer; import java.util.ArrayList; import java.util.Comparator; import java.util.List; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.text.PDFTextStripper; import org.apache.pdfbox.text.TextPosition; public class PDFTextExtract extends PDFTextStripper { private static ArrayList<Text> _tmp = new ArrayList<Text>(); public String PDFFilePath; public String OutputFilepath; /** * Instantiate a new PDFTextStripper object. * * @throws IOException If there is an error loading the properties. */ public PDFTextExtract() throws IOException { } public PDFTextExtract(String PDffilepath, String OutputFilepath) throws IOException { this.PDFFilePath = PDffilepath; this.OutputFilepath = OutputFilepath; } /** * This will parse the documents data. * * @throws IOException If there is an error parsing the document. */ public void process() throws IOException { PDDocument document = null; String res = null; OutputStream os = null; try { // Target PDF file. 
document = PDDocument.load(new File(this.PDFFilePath)); // Extract Text from PDF ordered by page number. for (int i = 1; i <= document.getNumberOfPages(); i++) { System.out.println("processing page " + i + "..."); _tmp.clear(); PDFTextExtract stripper = new PDFTextExtract(); // Tell PDFBox to sort the text positions. stripper.setSortByPosition(true); // Extract only one page. stripper.setStartPage(i); stripper.setEndPage(i); // Convert class `textPositions` into `Text`. // Save conversion result in `_tmp`. Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream()); stripper.writeText(document, dummy); // Skip if nothing converted. if (_tmp.isEmpty()) continue; // Concate result into string. for (Text now : _tmp) { res += now.unicode; } } // Write result to file. os = new FileOutputStream(OutputFilepath); os.write(res.getBytes()); System.out.println("processing completed."); } finally { if (document != null) { document.close(); } if (os != null) { os.close(); } } } /** * Override the default functionality of PDFTextStripper. */ @Override protected void writeString(String _string, List<TextPosition> textPositions) throws IOException { for (TextPosition text : textPositions) { { _tmp.add(new Text(text.getXDirAdj(), text.getYDirAdj(), text.getFontSize(), text.getXScale(), text.getHeightDir(), text.getWidthOfSpace(), text.getWidthDirAdj(), text.getUnicode())); } } } public class TextPositionComparator implements Comparator<TextPosition> { @Override public int compare(TextPosition pos1, TextPosition pos2) { int cmp1 = Float.compare(pos1.getDir(), pos2.getDir()); if (cmp1 != 0) { return cmp1; } float x1 = pos1.getXDirAdj(); float x2 = pos2.getXDirAdj(); float pos1YBottom = pos1.getYDirAdj(); float pos2YBottom = pos2.getYDirAdj(); float pos1YTop = pos1YBottom - pos1.getHeightDir(); float pos2YTop = pos2YBottom - pos2.getHeightDir(); float yDifference = Math.abs(pos1YBottom - pos2YBottom); if (yDifference < 1. 
|| pos2YBottom >= pos1YTop && pos2YBottom <= pos1YBottom || pos1YBottom >= pos2YTop && pos1YBottom <= pos2YBottom) { return Float.compare(x1, x2); } else if (pos1YBottom < pos2YBottom) { return -1; } else { return 1; } } } }