Example usage for org.apache.pdfbox.pdmodel PDDocument load

Introduction

On this page you can find example usages of the load method of org.apache.pdfbox.pdmodel.PDDocument.

Prototype

public static PDDocument load(byte[] input) throws IOException

Source Link

Document

Parses a PDF.

Usage

From source file:SanerePdf.java

License:Apache License

/**
 * Extracts the text of a PDF file, separating pages with a form feed.
 *
 * @param filename path of the PDF file to load
 * @return the text returned by {@code PDFTextStripper.getText} for the document
 * @throws IOException if the file cannot be loaded or its text cannot be extracted
 */
public static String txtFromPdf(String filename) throws IOException {
    PDDocument doc = PDDocument.load(filename);
    try {
        PDFTextStripper stripper = new PDFTextStripper();
        stripper.setPageSeparator("\f"); // form feed
        return stripper.getText(doc);
    } finally {
        // Close even when extraction throws, so the document is never leaked.
        doc.close();
    }
}

From source file:PDFTextExtract.java

License:Apache License

/**
 * This will parse the documents data./*from ww w.  j  a v  a2 s  .com*/
 * 
 * @throws IOException If there is an error parsing the document.
 */
public void process() throws IOException {
    PDDocument document = null;
    String res = null;
    OutputStream os = null;

    try {
        // Target PDF file.
        document = PDDocument.load(new File(this.PDFFilePath));

        // Extract Text from PDF ordered by page number.
        for (int i = 1; i <= document.getNumberOfPages(); i++) {
            System.out.println("processing page " + i + "...");
            _tmp.clear();
            PDFTextExtract stripper = new PDFTextExtract();

            // Tell PDFBox to sort the text positions.
            stripper.setSortByPosition(true);

            // Extract only one page.
            stripper.setStartPage(i);
            stripper.setEndPage(i);

            // Convert class `textPositions` into `Text`.
            // Save conversion result in `_tmp`.
            Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream());
            stripper.writeText(document, dummy);

            // Skip if nothing converted.
            if (_tmp.isEmpty())
                continue;

            // Concate result into string.
            for (Text now : _tmp) {
                res += now.unicode;
            }
        }

        // Write result to file.
        os = new FileOutputStream(OutputFilepath);
        os.write(res.getBytes());
        System.out.println("processing completed.");
    } finally {
        if (document != null) {
            document.close();
        }
        if (os != null) {
            os.close();
        }
    }

}

From source file:FormFiller.java

/**
 * Fills the AcroForm fields of a PDF template from {@code dealerTrackData},
 * saves the filled form into {@code outputDir} and optionally prints it.
 *
 * Any failure is reported with a stack trace and not propagated to the caller.
 *
 * @param dealerTrackData values keyed by fully qualified field name; the
 *                        "fullName" entry is also used in the output file name
 * @param inputFileName   path of the PDF template to load
 * @param outputDir       directory the filled PDF is saved into
 * @param outputFormType  form type label appended to the output file name
 */
private static void fillPdf(HashMap dealerTrackData, String inputFileName, String outputDir,
        String outputFormType) {
    try {
        PDDocument pdfTemplate = PDDocument.load(new File(inputFileName));
        try {
            PDDocumentCatalog docCatalog = pdfTemplate.getDocumentCatalog();
            PDAcroForm acroForm = docCatalog.getAcroForm();

            List<PDField> fieldList = acroForm.getFields();

            // Collect every field name before any value is set.
            String[] fieldArray = new String[fieldList.size()];
            int i = 0;
            for (PDField sField : fieldList) {
                fieldArray[i] = sField.getFullyQualifiedName();
                i++;
            }

            for (String f : fieldArray) {
                PDField field = acroForm.getField(f);
                String value = (String) dealerTrackData.get(f);
                if (value != null) {
                    try {
                        field.setValue(value);
                    } catch (IllegalArgumentException e) {
                        // A rejected value only skips this field; the rest are still filled.
                        System.err.println("Could not insert: " + f + ".");
                    }
                }
            }

            pdfTemplate.save(outputDir + "/" + dealerTrackData.get("fullName") + " " + outputFormType + ".pdf");

            // printing - need to look into the long form stuff!
            if (print && !inputFileName.contains("Title Guarantee")) {
                printPdf(pdfTemplate, dealerTrackData, inputFileName,
                        inputFileName.contains("Purchase Contract") ? 2 : 1);
            }
        } finally {
            // Release the template even when filling, saving or printing fails.
            pdfTemplate.close();
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}

From source file:FormFiller.java

/**
 * Reads the home and business phone numbers from the embedded credit application
 * frame, triggers its print command, waits for the resulting file to appear in the
 * user's Downloads directory, then saves (and optionally prints) it as the
 * "Chase Credit App" PDF.
 *
 * @param jse             executor used to run scripts in the browser page
 * @param dealerTrackData map that receives the phone numbers and supplies "fullName"
 * @param driver          web driver (not used by this method)
 * @throws InterruptedException if interrupted while waiting for the download
 * @throws IOException          if the downloaded PDF cannot be loaded or saved
 */
private static void getChaseCreditInfo(JavascriptExecutor jse, HashMap dealerTrackData, WebDriver driver)
        throws InterruptedException, IOException {

    // Script path to the document of the nested frame holding the application form.
    final String frameDoc = "document.getElementById('iFrm')."
            + "contentWindow.document.body.childNodes[2].contentDocument.";

    // Home Phone
    String homePhone0 = (String) jse
            .executeScript("return " + frameDoc + "getElementById('app_home_phone1').value");
    String homePhone1 = (String) jse
            .executeScript("return " + frameDoc + "getElementById('app_home_phone2').value");
    String homePhone2 = (String) jse
            .executeScript("return " + frameDoc + "getElementById('app_home_phone3').value");

    dealerTrackData.put("Home (or business) Phone Number",
            "(" + homePhone0 + ")" + " " + homePhone1 + "-" + homePhone2);

    // Business Phone
    String workPhone0 = (String) jse
            .executeScript("return " + frameDoc + "getElementById('app_bus_phone1').value");
    String workPhone1 = (String) jse
            .executeScript("return " + frameDoc + "getElementById('app_bus_phone2').value");
    String workPhone2 = (String) jse
            .executeScript("return " + frameDoc + "getElementById('app_bus_phone3').value");

    dealerTrackData.put("workPhone", "(" + workPhone0 + ")" + " " + workPhone1 + "-" + workPhone2);

    // Click the form's print command, which is expected to produce the download.
    jse.executeScript(frameDoc + "getElementsByName('cmdPrint')[0].click()");

    Thread.sleep(4000);
    File chaseCreditApplication;
    String downloadDir = System.getProperty("os.name").toLowerCase().contains("win")
            ? "C:/Users/" + System.getProperty("user.name") + "/Downloads/"
            : "/Users/" + System.getProperty("user.name") + "/Downloads/";
    // Poll until the newest file in the download directory was modified within the
    // last four seconds.
    // NOTE(review): this loop has no upper bound; it never ends if no download arrives.
    do {
        chaseCreditApplication = getLatestFileFromDir(downloadDir);
        Thread.sleep(1000);
    } while (chaseCreditApplication == null
            || System.currentTimeMillis() > chaseCreditApplication.lastModified() + 4000);

    PDDocument creditAppTemplate = PDDocument.load(chaseCreditApplication);
    try {
        outputDir = makeOutputDir(dealerTrackData);
        creditAppTemplate.save(outputDir + "/" + dealerTrackData.get("fullName") + " " + "Chase Credit App.pdf");

        if (print) {
            printPdf(creditAppTemplate, dealerTrackData, "Chase Credit Application", 1);
        }
    } finally {
        // Release the document even when saving or printing fails.
        creditAppTemplate.close();
    }

}

From source file:PrintImageLocations.java

License:Apache License

/**
 * This will print the documents data./* w  w  w.j  av  a  2s.c o m*/
 *
 * @param args The command line arguments.
 *
 * @throws Exception If there is an error parsing the document.
 */
public static void main2() throws Exception {
    if (flag) {
        usage();
    } else {
        PDDocument document = null;
        try {
            document = PDDocument.load(PrintTextLocations.INPUTFILE);
            if (document.isEncrypted()) {
                try {
                    document.decrypt("");
                } catch (InvalidPasswordException e) {
                    System.err.println("Error: Document is encrypted with a password.");
                    System.exit(1);
                }
            }
            PrintImageLocations printer = new PrintImageLocations();
            List allPages = document.getDocumentCatalog().getAllPages();
            for (int i = 0; i < allPages.size(); i++) {
                PDPage page = (PDPage) allPages.get(i);
                System.out.println("Processing page: " + i);
                printer.processStream(page, page.findResources(), page.getContents().getStream());
            }
        } finally {
            if (document != null) {
                document.close();
            }
        }
    }
}

From source file:PrintUtil.java

License:Apache License

/**
 * Loads the PDF at {@code path} and sends it to {@code job}, scaled to fit,
 * requesting duplex output on ISO A4 media.
 *
 * Load, print and close failures are reported with a stack trace and not
 * propagated to the caller.
 */
void print() {
    PDDocument doc;
    try {
        doc = PDDocument.load(new File(path));
    } catch (IOException e) {
        e.printStackTrace();
        // Nothing to print: do not hand a null document to the printable.
        return;
    }

    try {
        job.setPrintable(new PDFPrintable(doc, Scaling.SCALE_TO_FIT));
        PrintRequestAttributeSet attr = new HashPrintRequestAttributeSet();
        attr.add(Sides.DUPLEX);
        attr.add(MediaSizeName.ISO_A4);
        System.out.println("Printing =>" + path);
        job.print(attr);
    } catch (PrinterException e) {
        e.printStackTrace();
    } finally {
        try {
            doc.close();
        } catch (IOException e) {
            // The job has already been submitted; report the close failure and carry on.
            e.printStackTrace();
        }
    }
}

From source file:PDFConverter.java

License:Apache License

/**
 * Implementation is informed by PDFBox authors.
 *
 * @param doc/*from w  w  w.ja  v  a 2  s. c om*/
 * @return
 * @throws IOException
 */
@Override
public synchronized ConvertedDocument convert(java.io.File doc) throws IOException {

    /*
     * Licensed to the Apache Software Foundation (ASF) under one or more
     * contributor license agreements.  See the NOTICE file distributed with
     * this work for additional information regarding copyright ownership.
     * The ASF licenses this file to You under the Apache License, Version 2.0
     * (the "License"); you may not use this file except in compliance with
     * the License.  You may obtain a copy of the License at
     *
     *      http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */
    /**
     * Adapted from LucenePDFDocument.java from PDFBox lucene project
     *
     * This class is used to create a document for the lucene search engine.
     * This should easily plug into the IndexHTML or IndexFiles that comes
     * with the lucene project. This class will populate the following
     * fields.
     * <table> <tr> <th>Lucene Field Name</th> <th>Description</th> </tr>
     * <tr>
     * <td>path</td> <td>File system path if loaded from a file</td> </tr>
     * <tr>
     * <td>url</td> <td>URL to PDF document</td> </tr> <tr>
     * <td>contents</td>
     * <td>Entire contents of PDF document, indexed but not stored</td>
     * </tr>
     * <tr> <td>summary</td> <td>First 500 characters of content</td> </tr>
     * <tr>
     * <td>modified</td> <td>The modified date/time according to the url or
     * path</td> </tr> <tr> <td>uid</td> <td>A unique identifier for the
     * Lucene document.</td> </tr> <tr> <td>CreationDate</td> <td>From PDF
     * meta-data if available</td> </tr> <tr> <td>Creator</td> <td>From PDF
     * meta-data if available</td> </tr> <tr> <td>Keywords</td> <td>From PDF
     * meta-data if available</td> </tr> <tr> <td>ModificationDate</td>
     * <td>From PDF meta-data if available</td> </tr> <tr> <td>Producer</td>
     * <td>From PDF meta-data if available</td> </tr> <tr> <td>Subject</td>
     * <td>From PDF meta-data if available</td> </tr> <tr> <td>Trapped</td>
     * <td>From PDF meta-data if available</td> </tr> <tr>
     * <td>Encrypted</td> <td>From PDF meta-data if available</td> </tr>
     * </table>
     *
     * @author <a href="mailto:ben@benlitchfield.com">Ben Litchfield</a>
     * @version $Revision: 1.23 $
     *
     * @throws IOException If there is an error parsing the document.
     */
    PDDocument pdfDocument = null;
    ConvertedDocument textdoc = new ConvertedDocument(doc);

    try {
        pdfDocument = PDDocument.load(doc);

        if (pdfDocument.isEncrypted()) {
            //Just try using the default password and move on
            // Even if the doc is encrypted, apparently you can try. Throw exception if it fails.
            textdoc.addProperty("encrypted", "YES");
        }

        //create a writer where to append the text content.
        StringWriter writer = new StringWriter();
        stripper.resetEngine();
        stripper.writeText(pdfDocument, writer);

        PDDocumentInformation info = pdfDocument.getDocumentInformation();
        if (info != null) {
            textdoc.addAuthor(info.getAuthor());
            try {
                textdoc.addCreateDate(info.getCreationDate());
            } catch (IOException io) {
                //ignore, bad date but continue with indexing
            }
            textdoc.addProperty("creator_tool", info.getCreator());
            textdoc.addProperty("keywords", info.getKeywords());
            /* try {
             metadata.add("ModificationDate", info.getModificationDate());
             } catch (IOException io) {
             //ignore, bad date but continue with indexing
             } */
            //metadata.add("Producer", info.getProducer());
            textdoc.addProperty("subject", info.getSubject());
            String ttl = info.getTitle();
            if (ttl == null || "untitled".equalsIgnoreCase(ttl)) {
                ttl = textdoc.filename;
            }
            textdoc.addTitle(ttl);
            // metadata.add("Trapped", info.getTrapped());

            // TODO: Character set is what?
            textdoc.setEncoding("UTF-8");
        }

        // Note: the buffer to string operation is costless;
        // the char array value of the writer buffer and the content string
        // is shared as long as the buffer content is not modified, which will
        // not occur here.
        textdoc.setText(writer.getBuffer().toString());

        return textdoc;

    } finally {
        if (pdfDocument != null) {
            pdfDocument.close();
        }
    }
}

From source file:PDFExtractMetadata.java

License:Apache License

/**
 * Prints the XMP metadata (Dublin Core, Adobe PDF and XMP Basic schemas) of the
 * PDF named by the single command line argument. When the document has no XMP
 * metadata, its document information is shown instead.
 *
 * @param args The command line arguments; exactly one is expected, the PDF path.
 *
 * @throws IOException If there is an error parsing the document.
 * @throws XmpParsingException If the XMP parser cannot be created.
 */
public static void main(String[] args) throws IOException, XmpParsingException {
    if (args.length != 1) {
        usage();
        System.exit(1);
    } else {
        PDDocument document = null;
        try {
            document = PDDocument.load(new File(args[0]));
            PDDocumentCatalog catalog = document.getDocumentCatalog();
            PDMetadata meta = catalog.getMetadata();
            if (meta != null) {
                DomXmpParser xmpParser = new DomXmpParser();
                // Keep a handle on the stream so it can be closed after parsing.
                java.io.InputStream xmpStream = meta.createInputStream();
                try {
                    XMPMetadata metadata = xmpParser.parse(xmpStream);

                    DublinCoreSchema dc = metadata.getDublinCoreSchema();
                    if (dc != null) {
                        display("Title:", dc.getTitle());
                        display("Description:", dc.getDescription());
                        listString("Creators: ", dc.getCreators());
                        listCalendar("Dates:", dc.getDates());
                        listString("Subjects:", dc.getSubjects());
                    }

                    AdobePDFSchema pdf = metadata.getAdobePDFSchema();
                    if (pdf != null) {
                        display("Keywords:", pdf.getKeywords());
                        display("PDF Version:", pdf.getPDFVersion());
                        display("PDF Producer:", pdf.getProducer());
                    }

                    XMPBasicSchema basic = metadata.getXMPBasicSchema();
                    if (basic != null) {
                        display("Create Date:", basic.getCreateDate());
                        display("Modify Date:", basic.getModifyDate());
                        display("Creator Tool:", basic.getCreatorTool());
                    }
                } catch (XmpParsingException e) {
                    // Malformed XMP is reported but does not abort the program.
                    System.err.println("An error ouccred when parsing the meta data: " + e.getMessage());
                } finally {
                    xmpStream.close();
                }
            } else {
                // The pdf doesn't contain any metadata, try to use the
                // document information instead
                PDDocumentInformation information = document.getDocumentInformation();
                if (information != null) {
                    showDocumentInformation(information);
                }
            }

        } finally {
            if (document != null) {
                document.close();
            }
        }
    }

}

From source file:ExtractTextFromPdf.java

/**
 * Extracts the text of a fixed PDF file and prints it with every character
 * other than ASCII letters, digits, '.' and space removed.
 *
 * Failures are reported with a stack trace.
 *
 * @param args The command line arguments (unused).
 */
public static void main(String[] args) {

    String fileName = "C:/Users/Kavya Gupta/Desktop/Texas_Title.pdf";
    PDDocument pdDoc = null;

    try {
        // A java.io.File is not a RandomAccessRead, so the former cast-and-read
        // always failed with ClassCastException before the PDF was ever loaded.
        pdDoc = PDDocument.load(new File(fileName));
        PDFTextStripper pdfStripper = new PDFTextStripper();
        String parsedText = pdfStripper.getText(pdDoc);
        System.out.println(parsedText.replaceAll("[^A-Za-z0-9. ]+", ""));
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // Close on success as well as on failure.
        if (pdDoc != null) {
            try {
                pdDoc.close();
            } catch (Exception closeError) {
                closeError.printStackTrace();
            }
        }
    }
}

From source file:DecodePlate.java

License:Open Source License

/**
 * Rebuilds the set of nearby fixes for the current airport, then renders the
 * single-page PDF plate into an image, resolves fixes on it and, when
 * {@code markedpngname} is set, writes the image as a PNG.
 *
 * @throws Exception if the PDF does not contain exactly one page, or if loading,
 *                   rendering or writing the image fails
 */
public static void ProcessPlateWork() throws Exception {
    // Filter out fixes more than 50nm away from airport, no chart goes that far.
    // This helps us from trying to decode spurious strings as fix names and
    // helps us avoid duplicate name problems.

    nearDBFixes.clear();
    for (DBFix dbfix : allDBFixes) {
        if (Lib.LatLonDist(dbfix.lat, dbfix.lon, airport.lat, airport.lon) <= maxFixDistNM) {
            dbfix.mentioned = false;
            nearDBFixes.put(dbfix.name, dbfix);
        }
    }

    // Also add in runways as fixes cuz some plates use them for fixes.

    for (Runway rwy : airport.runways.values()) {
        nearDBFixes.put(rwy.name, rwy);
    }

    // Open PDF and scan it.

    PDDocument pddoc = PDDocument.load(pdfName);
    try {
        PDDocumentCatalog doccat = pddoc.getDocumentCatalog();
        PDPageNode pages = doccat.getPages();
        List kids = new LinkedList();
        pages.getAllKids(kids);
        if (kids.size() != 1)
            throw new Exception("pdf not a single Page");
        Object kid = kids.get(0);
        PDPage page = (PDPage) kid;

        // Scale the page's media box from PDF resolution to the output resolution,
        // rounding to the nearest pixel.
        int imgWidth = (int) (page.getMediaBox().getWidth() / pdfDpi * csvDpi + 0.5F);
        int imgHeight = (int) (page.getMediaBox().getHeight() / pdfDpi * csvDpi + 0.5F);
        BufferedImage bi = new BufferedImage(imgWidth, imgHeight, BufferedImage.TYPE_INT_ARGB);
        Graphics2D g2d = bi.createGraphics();
        PagePanel pagepanel = new PagePanel(page);
        pagepanel.paintComponent(g2d);
        pagepanel.resolveFixes(g2d);
        if (markedpngname != null) {
            if (!ImageIO.write(bi, "png", new File(markedpngname))) {
                throw new IOException("ImageIO.write(" + markedpngname + ") failed");
            }
        }
    } finally {
        // Close the document on every path, including the exceptions thrown above.
        pddoc.close();
    }
}