Usage examples for org.apache.pdfbox.pdmodel.PDDocument.load
public static PDDocument load(byte[] input) throws IOException
From source file:SanerePdf.java
License:Apache License
public static String txtFromPdf(String filename) throws IOException { PDDocument doc = PDDocument.load(filename); PDFTextStripper stripper = new PDFTextStripper(); stripper.setPageSeparator("\f"); // form feed //stripper.setSortByPosition(true); //stripper.setShouldSeparateByBeads(false); String text = stripper.getText(doc); doc.close();//w ww.j a va 2 s .c om return text; }
From source file:PDFTextExtract.java
License:Apache License
/** * This will parse the documents data./*from ww w. j a v a2 s .com*/ * * @throws IOException If there is an error parsing the document. */ public void process() throws IOException { PDDocument document = null; String res = null; OutputStream os = null; try { // Target PDF file. document = PDDocument.load(new File(this.PDFFilePath)); // Extract Text from PDF ordered by page number. for (int i = 1; i <= document.getNumberOfPages(); i++) { System.out.println("processing page " + i + "..."); _tmp.clear(); PDFTextExtract stripper = new PDFTextExtract(); // Tell PDFBox to sort the text positions. stripper.setSortByPosition(true); // Extract only one page. stripper.setStartPage(i); stripper.setEndPage(i); // Convert class `textPositions` into `Text`. // Save conversion result in `_tmp`. Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream()); stripper.writeText(document, dummy); // Skip if nothing converted. if (_tmp.isEmpty()) continue; // Concate result into string. for (Text now : _tmp) { res += now.unicode; } } // Write result to file. os = new FileOutputStream(OutputFilepath); os.write(res.getBytes()); System.out.println("processing completed."); } finally { if (document != null) { document.close(); } if (os != null) { os.close(); } } }
From source file:FormFiller.java
private static void fillPdf(HashMap dealerTrackData, String inputFileName, String outputDir, String outputFormType) {// w ww. ja v a2s .co m try { PDDocument pdfTemplate = PDDocument.load(new File(inputFileName)); PDDocumentCatalog docCatalog = pdfTemplate.getDocumentCatalog(); PDAcroForm acroForm = docCatalog.getAcroForm(); List<PDField> fieldList = acroForm.getFields(); String[] fieldArray = new String[fieldList.size()]; int i = 0; for (PDField sField : fieldList) { fieldArray[i] = sField.getFullyQualifiedName(); i++; } for (String f : fieldArray) { PDField field = acroForm.getField(f); String value = (String) dealerTrackData.get(f); if (value != null) { try { field.setValue(value); } catch (IllegalArgumentException e) { System.err.println("Could not insert: " + f + "."); } } } pdfTemplate.save(outputDir + "/" + dealerTrackData.get("fullName") + " " + outputFormType + ".pdf"); // printing - need to look into the long form stuff! if (print && !inputFileName.contains("Title Guarantee")) printPdf(pdfTemplate, dealerTrackData, inputFileName, inputFileName.contains("Purchase Contract") ? 2 : 1); pdfTemplate.close(); } catch (Exception e) { e.printStackTrace(); } }
From source file:FormFiller.java
private static void getChaseCreditInfo(JavascriptExecutor jse, HashMap dealerTrackData, WebDriver driver) throws InterruptedException, IOException { // Home Phone String homePhone0 = (String) jse.executeScript("return document.getElementById('iFrm')." + "contentWindow.document.body.childNodes[2].contentDocument.getElementById('app_home_phone1').value"); String homePhone1 = (String) jse.executeScript("return document.getElementById('iFrm')." + "contentWindow.document.body.childNodes[2].contentDocument.getElementById('app_home_phone2').value"); String homePhone2 = (String) jse.executeScript("return document.getElementById('iFrm')." + "contentWindow.document.body.childNodes[2].contentDocument.getElementById('app_home_phone3').value"); dealerTrackData.put("Home (or business) Phone Number", "(" + homePhone0 + ")" + " " + homePhone1 + "-" + homePhone2); // Business Phone String workPhone0 = (String) jse.executeScript("return document.getElementById('iFrm')." + "contentWindow.document.body.childNodes[2].contentDocument.getElementById('app_bus_phone1').value"); String workPhone1 = (String) jse.executeScript("return document.getElementById('iFrm')." + "contentWindow.document.body.childNodes[2].contentDocument.getElementById('app_bus_phone2').value"); String workPhone2 = (String) jse.executeScript("return document.getElementById('iFrm')." + "contentWindow.document.body.childNodes[2].contentDocument.getElementById('app_bus_phone3').value"); dealerTrackData.put("workPhone", "(" + workPhone0 + ")" + " " + workPhone1 + "-" + workPhone2); jse.executeScript(/*from w w w.j av a2 s . co m*/ "document.getElementById('iFrm').contentWindow.document.body.childNodes[2].contentDocument." + "getElementsByName('cmdPrint')[0].click()"); Thread.sleep(4000); File chaseCreditApplication; String downloadDir = System.getProperty("os.name").toLowerCase().contains("win") ? 
"C:/Users/" + System.getProperty("user.name") + "/Downloads/" : "/Users/" + System.getProperty("user.name") + "/Downloads/"; do { chaseCreditApplication = getLatestFileFromDir(downloadDir); Thread.sleep(1000); } while (chaseCreditApplication == null || System.currentTimeMillis() > chaseCreditApplication.lastModified() + 4000); PDDocument creditAppTemplate = PDDocument.load(chaseCreditApplication); outputDir = makeOutputDir(dealerTrackData); creditAppTemplate.save(outputDir + "/" + dealerTrackData.get("fullName") + " " + "Chase Credit App.pdf"); if (print) printPdf(creditAppTemplate, dealerTrackData, "Chase Credit Application", 1); creditAppTemplate.close(); }
From source file:PrintImageLocations.java
License:Apache License
/** * This will print the documents data./* w w w.j av a 2s.c o m*/ * * @param args The command line arguments. * * @throws Exception If there is an error parsing the document. */ public static void main2() throws Exception { if (flag) { usage(); } else { PDDocument document = null; try { document = PDDocument.load(PrintTextLocations.INPUTFILE); if (document.isEncrypted()) { try { document.decrypt(""); } catch (InvalidPasswordException e) { System.err.println("Error: Document is encrypted with a password."); System.exit(1); } } PrintImageLocations printer = new PrintImageLocations(); List allPages = document.getDocumentCatalog().getAllPages(); for (int i = 0; i < allPages.size(); i++) { PDPage page = (PDPage) allPages.get(i); System.out.println("Processing page: " + i); printer.processStream(page, page.findResources(), page.getContents().getStream()); } } finally { if (document != null) { document.close(); } } } }
From source file:PrintUtil.java
License:Apache License
/**
 * Loads the PDF at {@code path} and sends it to {@code job}, scaled to fit,
 * requesting duplex output on ISO A4 media.
 *
 * <p>Load and print failures are reported via their stack trace; the method
 * itself never throws a checked exception. Nothing is printed when the
 * document cannot be loaded.
 */
void print() {
    PDDocument doc;
    try {
        doc = PDDocument.load(new File(path));
    } catch (IOException e) {
        e.printStackTrace();
        // Without a document there is nothing to print; continuing would hand
        // a null document to PDFPrintable.
        return;
    }

    try {
        job.setPrintable(new PDFPrintable(doc, Scaling.SCALE_TO_FIT));

        PrintRequestAttributeSet attr = new HashPrintRequestAttributeSet();
        attr.add(Sides.DUPLEX);
        attr.add(MediaSizeName.ISO_A4);

        System.out.println("Printing =>" + path);
        job.print(attr);
    } catch (PrinterException e) {
        e.printStackTrace();
    } finally {
        try {
            doc.close();
        } catch (IOException ignored) {
            // Best effort: a failure to close must not mask the print outcome.
        }
    }
}
From source file:PDFConverter.java
License:Apache License
/** * Implementation is informed by PDFBox authors. * * @param doc/*from w w w.ja v a 2 s. c om*/ * @return * @throws IOException */ @Override public synchronized ConvertedDocument convert(java.io.File doc) throws IOException { /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** * Adapted from LucenePDFDocument.java from PDFBox lucene project * * This class is used to create a document for the lucene search engine. * This should easily plug into the IndexHTML or IndexFiles that comes * with the lucene project. This class will populate the following * fields. 
* <table> <tr> <th>Lucene Field Name</th> <th>Description</th> </tr> * <tr> * <td>path</td> <td>File system path if loaded from a file</td> </tr> * <tr> * <td>url</td> <td>URL to PDF document</td> </tr> <tr> * <td>contents</td> * <td>Entire contents of PDF document, indexed but not stored</td> * </tr> * <tr> <td>summary</td> <td>First 500 characters of content</td> </tr> * <tr> * <td>modified</td> <td>The modified date/time according to the url or * path</td> </tr> <tr> <td>uid</td> <td>A unique identifier for the * Lucene document.</td> </tr> <tr> <td>CreationDate</td> <td>From PDF * meta-data if available</td> </tr> <tr> <td>Creator</td> <td>From PDF * meta-data if available</td> </tr> <tr> <td>Keywords</td> <td>From PDF * meta-data if available</td> </tr> <tr> <td>ModificationDate</td> * <td>From PDF meta-data if available</td> </tr> <tr> <td>Producer</td> * <td>From PDF meta-data if available</td> </tr> <tr> <td>Subject</td> * <td>From PDF meta-data if available</td> </tr> <tr> <td>Trapped</td> * <td>From PDF meta-data if available</td> </tr> <tr> * <td>Encrypted</td> <td>From PDF meta-data if available</td> </tr> * </table> * * @author <a href="mailto:ben@benlitchfield.com">Ben Litchfield</a> * @version $Revision: 1.23 $ * * @throws IOException If there is an error parsing the document. */ PDDocument pdfDocument = null; ConvertedDocument textdoc = new ConvertedDocument(doc); try { pdfDocument = PDDocument.load(doc); if (pdfDocument.isEncrypted()) { //Just try using the default password and move on // Even if the doc is encrypted, apparently you can try. Throw exception if it fails. textdoc.addProperty("encrypted", "YES"); } //create a writer where to append the text content. 
StringWriter writer = new StringWriter(); stripper.resetEngine(); stripper.writeText(pdfDocument, writer); PDDocumentInformation info = pdfDocument.getDocumentInformation(); if (info != null) { textdoc.addAuthor(info.getAuthor()); try { textdoc.addCreateDate(info.getCreationDate()); } catch (IOException io) { //ignore, bad date but continue with indexing } textdoc.addProperty("creator_tool", info.getCreator()); textdoc.addProperty("keywords", info.getKeywords()); /* try { metadata.add("ModificationDate", info.getModificationDate()); } catch (IOException io) { //ignore, bad date but continue with indexing } */ //metadata.add("Producer", info.getProducer()); textdoc.addProperty("subject", info.getSubject()); String ttl = info.getTitle(); if (ttl == null || "untitled".equalsIgnoreCase(ttl)) { ttl = textdoc.filename; } textdoc.addTitle(ttl); // metadata.add("Trapped", info.getTrapped()); // TODO: Character set is what? textdoc.setEncoding("UTF-8"); } // Note: the buffer to string operation is costless; // the char array value of the writer buffer and the content string // is shared as long as the buffer content is not modified, which will // not occur here. textdoc.setText(writer.getBuffer().toString()); return textdoc; } finally { if (pdfDocument != null) { pdfDocument.close(); } } }
From source file:PDFExtractMetadata.java
License:Apache License
/** * This is the main method.//from w w w . j a va 2 s .c o m * * @param args The command line arguments. * * @throws IOException If there is an error parsing the document. * @throws XmpParsingException */ public static void main(String[] args) throws IOException, XmpParsingException { if (args.length != 1) { usage(); System.exit(1); } else { PDDocument document = null; try { document = PDDocument.load(new File(args[0])); PDDocumentCatalog catalog = document.getDocumentCatalog(); PDMetadata meta = catalog.getMetadata(); if (meta != null) { DomXmpParser xmpParser = new DomXmpParser(); try { XMPMetadata metadata = xmpParser.parse(meta.createInputStream()); DublinCoreSchema dc = metadata.getDublinCoreSchema(); if (dc != null) { display("Title:", dc.getTitle()); display("Description:", dc.getDescription()); listString("Creators: ", dc.getCreators()); listCalendar("Dates:", dc.getDates()); listString("Subjects:", dc.getSubjects()); } AdobePDFSchema pdf = metadata.getAdobePDFSchema(); if (pdf != null) { display("Keywords:", pdf.getKeywords()); display("PDF Version:", pdf.getPDFVersion()); display("PDF Producer:", pdf.getProducer()); } XMPBasicSchema basic = metadata.getXMPBasicSchema(); if (basic != null) { display("Create Date:", basic.getCreateDate()); display("Modify Date:", basic.getModifyDate()); display("Creator Tool:", basic.getCreatorTool()); } } catch (XmpParsingException e) { System.err.println("An error ouccred when parsing the meta data: " + e.getMessage()); } } else { // The pdf doesn't contain any metadata, try to use the // document information instead PDDocumentInformation information = document.getDocumentInformation(); if (information != null) { showDocumentInformation(information); } } } finally { if (document != null) { document.close(); } } } }
From source file:ExtractTextFromPdf.java
public static void main(String[] args) { PDFParser parser = null;//from w w w . j av a2s . c o m PDDocument pdDoc = null; COSDocument cosDoc = null; PDFTextStripper pdfStripper; String parsedText; String fileName = "C:/Users/Kavya Gupta/Desktop/Texas_Title.pdf"; File file = new File(fileName); try { byte data[] = new byte[1024]; ((RandomAccessRead) file).read(data, 0, 1024); pdDoc = PDDocument.load(new File(fileName)); pdfStripper = new PDFTextStripper(); parsedText = pdfStripper.getText(pdDoc); System.out.println(parsedText.replaceAll("[^A-Za-z0-9. ]+", "")); } catch (Exception e) { e.printStackTrace(); try { if (cosDoc != null) cosDoc.close(); if (pdDoc != null) pdDoc.close(); } catch (Exception e1) { e.printStackTrace(); } } }
From source file:DecodePlate.java
License:Open Source License
public static void ProcessPlateWork() throws Exception { // Filter out fixes more than 50nm away from airport, no chart goes that far. // This helps us from trying to decode spurious strings as fix names and // helps us avoid duplicate name problems. nearDBFixes.clear();//from ww w . j a v a 2 s .c o m for (DBFix dbfix : allDBFixes) { if (Lib.LatLonDist(dbfix.lat, dbfix.lon, airport.lat, airport.lon) <= maxFixDistNM) { dbfix.mentioned = false; nearDBFixes.put(dbfix.name, dbfix); } } // Also add in runways as fixes cuz some plates use them for fixes. for (Runway rwy : airport.runways.values()) { nearDBFixes.put(rwy.name, rwy); } // Open PDF and scan it. PDDocument pddoc = PDDocument.load(pdfName); PDDocumentCatalog doccat = pddoc.getDocumentCatalog(); PDPageNode pages = doccat.getPages(); List kids = new LinkedList(); pages.getAllKids(kids); if (kids.size() != 1) throw new Exception("pdf not a single Page"); Object kid = kids.get(0); PDPage page = (PDPage) kid; int imgWidth = (int) (page.getMediaBox().getWidth() / pdfDpi * csvDpi + 0.5F); int imgHeight = (int) (page.getMediaBox().getHeight() / pdfDpi * csvDpi + 0.5F); BufferedImage bi = new BufferedImage(imgWidth, imgHeight, BufferedImage.TYPE_INT_ARGB); Graphics2D g2d = bi.createGraphics(); PagePanel pagepanel = new PagePanel(page); pagepanel.paintComponent(g2d); pagepanel.resolveFixes(g2d); if (markedpngname != null) { if (!ImageIO.write(bi, "png", new File(markedpngname))) { throw new IOException("ImageIO.write(" + markedpngname + ") failed"); } } pddoc.close(); }