Example usage for org.apache.pdfbox.pdmodel PDDocument load

List of usage examples for org.apache.pdfbox.pdmodel PDDocument load

Introduction

On this page you can find example usages of org.apache.pdfbox.pdmodel PDDocument load.

Prototype

public static PDDocument load(byte[] input) throws IOException 

Source Link

Document

Parses a PDF.

Usage

From source file:SanerePdf.java

License:Apache License

/**
 * Extracts the text of a PDF file, using a form feed as the page separator.
 *
 * @param filename path of the PDF file to load
 * @return the text returned by the stripper for the whole document
 * @throws IOException if the document cannot be loaded, stripped, or closed
 */
public static String txtFromPdf(String filename) throws IOException {
    PDDocument doc = PDDocument.load(filename);
    try {
        PDFTextStripper stripper = new PDFTextStripper();
        stripper.setPageSeparator("\f"); // form feed
        return stripper.getText(doc);
    } finally {
        // Close on every path: previously a failure while stripping leaked the open document.
        doc.close();
    }
}

From source file:PDFTextExtract.java

License:Apache License

/**
 * This will parse the documents data./*from ww w.  j  a v  a2 s  .com*/
 * 
 * @throws IOException If there is an error parsing the document.
 */
public void process() throws IOException {
    PDDocument document = null;
    String res = null;
    OutputStream os = null;

    try {
        // Target PDF file.
        document = PDDocument.load(new File(this.PDFFilePath));

        // Extract Text from PDF ordered by page number.
        for (int i = 1; i <= document.getNumberOfPages(); i++) {
            System.out.println("processing page " + i + "...");
            _tmp.clear();
            PDFTextExtract stripper = new PDFTextExtract();

            // Tell PDFBox to sort the text positions.
            stripper.setSortByPosition(true);

            // Extract only one page.
            stripper.setStartPage(i);
            stripper.setEndPage(i);

            // Convert class `textPositions` into `Text`.
            // Save conversion result in `_tmp`.
            Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream());
            stripper.writeText(document, dummy);

            // Skip if nothing converted.
            if (_tmp.isEmpty())
                continue;

            // Concate result into string.
            for (Text now : _tmp) {
                res += now.unicode;
            }
        }

        // Write result to file.
        os = new FileOutputStream(OutputFilepath);
        os.write(res.getBytes());
        System.out.println("processing completed.");
    } finally {
        if (document != null) {
            document.close();
        }
        if (os != null) {
            os.close();
        }
    }

}

From source file:FormFiller.java

/**
 * Fills the AcroForm fields of a PDF template from {@code dealerTrackData}, saves the result
 * and optionally prints it.
 *
 * <p>Each form field is looked up in {@code dealerTrackData} by its fully qualified name; fields
 * with no entry are left untouched. The filled document is saved as
 * {@code outputDir/<fullName> <outputFormType>.pdf}. Failures are reported on standard error
 * rather than propagated.
 *
 * @param dealerTrackData map from field name to value; also supplies {@code "fullName"}
 * @param inputFileName   path of the PDF template to load
 * @param outputDir       directory the filled PDF is saved into
 * @param outputFormType  suffix used in the saved file name
 */
private static void fillPdf(HashMap dealerTrackData, String inputFileName, String outputDir,
        String outputFormType) {
    PDDocument pdfTemplate = null;
    try {
        pdfTemplate = PDDocument.load(new File(inputFileName));

        PDAcroForm acroForm = pdfTemplate.getDocumentCatalog().getAcroForm();
        if (acroForm == null) {
            // A template without a form previously surfaced as a bare NullPointerException.
            System.err.println("Could not fill: " + inputFileName + " has no AcroForm.");
            return;
        }

        for (PDField field : acroForm.getFields()) {
            String name = field.getFullyQualifiedName();
            String value = (String) dealerTrackData.get(name);
            if (value == null) {
                continue;
            }
            try {
                field.setValue(value);
            } catch (IllegalArgumentException e) {
                // Best effort: a value the field rejects must not abort the remaining fields.
                System.err.println("Could not insert: " + name + ".");
            }
        }

        pdfTemplate.save(outputDir + "/" + dealerTrackData.get("fullName") + " " + outputFormType + ".pdf");

        // printing - need to look into the long form stuff!
        if (print && !inputFileName.contains("Title Guarantee")) {
            printPdf(pdfTemplate, dealerTrackData, inputFileName,
                    inputFileName.contains("Purchase Contract") ? 2 : 1);
        }
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // Close on every path; previously any exception above leaked the open document.
        if (pdfTemplate != null) {
            try {
                pdfTemplate.close();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }
}

From source file:FormFiller.java

/**
 * Reads the applicant's phone numbers from the Chase credit application frame, triggers the
 * page's print command, waits for the resulting file to appear in the user's download
 * directory, then saves (and optionally prints) it as the "Chase Credit App" PDF.
 *
 * @param jse             executor used to run JavaScript in the browser
 * @param dealerTrackData map that receives the phone numbers and supplies {@code "fullName"}
 * @param driver          not used by this method
 * @throws InterruptedException if the thread is interrupted while waiting for the download
 * @throws IOException          if the downloaded PDF cannot be loaded, saved, or closed
 */
private static void getChaseCreditInfo(JavascriptExecutor jse, HashMap dealerTrackData, WebDriver driver)
        throws InterruptedException, IOException {

    // Home Phone
    String homePhone0 = readChaseFrameValue(jse, "app_home_phone1");
    String homePhone1 = readChaseFrameValue(jse, "app_home_phone2");
    String homePhone2 = readChaseFrameValue(jse, "app_home_phone3");

    dealerTrackData.put("Home (or business) Phone Number",
            "(" + homePhone0 + ")" + " " + homePhone1 + "-" + homePhone2);

    // Business Phone
    String workPhone0 = readChaseFrameValue(jse, "app_bus_phone1");
    String workPhone1 = readChaseFrameValue(jse, "app_bus_phone2");
    String workPhone2 = readChaseFrameValue(jse, "app_bus_phone3");

    dealerTrackData.put("workPhone", "(" + workPhone0 + ")" + " " + workPhone1 + "-" + workPhone2);

    jse.executeScript(CHASE_FRAME_DOCUMENT + "getElementsByName('cmdPrint')[0].click()");

    Thread.sleep(4000);
    File chaseCreditApplication;
    // NOTE(review): the download directory is derived from the user name rather than from
    // "user.home", so it is only right for default Windows/macOS profile locations — confirm.
    String downloadDir = System.getProperty("os.name").toLowerCase().contains("win")
            ? "C:/Users/" + System.getProperty("user.name") + "/Downloads/"
            : "/Users/" + System.getProperty("user.name") + "/Downloads/";
    // Poll until the newest file in the directory was modified within the last four seconds.
    // NOTE(review): there is no upper bound on this wait.
    do {
        chaseCreditApplication = getLatestFileFromDir(downloadDir);
        Thread.sleep(1000);
    } while (chaseCreditApplication == null
            || System.currentTimeMillis() > chaseCreditApplication.lastModified() + 4000);

    PDDocument creditAppTemplate = PDDocument.load(chaseCreditApplication);
    try {
        outputDir = makeOutputDir(dealerTrackData);
        creditAppTemplate.save(outputDir + "/" + dealerTrackData.get("fullName") + " " + "Chase Credit App.pdf");

        if (print) {
            printPdf(creditAppTemplate, dealerTrackData, "Chase Credit Application", 1);
        }
    } finally {
        // Close on every path; previously a failure while saving or printing leaked the document.
        creditAppTemplate.close();
    }

}

/** JavaScript expression prefix that addresses the document inside the Chase application frame. */
private static final String CHASE_FRAME_DOCUMENT =
        "document.getElementById('iFrm').contentWindow.document.body.childNodes[2].contentDocument.";

/** Returns the {@code value} of the element with the given id inside the Chase application frame. */
private static String readChaseFrameValue(JavascriptExecutor jse, String elementId) {
    return (String) jse.executeScript(
            "return " + CHASE_FRAME_DOCUMENT + "getElementById('" + elementId + "').value");
}

From source file:PrintImageLocations.java

License:Apache License

/**
 * This will print the documents data./* w  w  w.j  av  a  2s.c o m*/
 *
 * @param args The command line arguments.
 *
 * @throws Exception If there is an error parsing the document.
 */
public static void main2() throws Exception {
    if (flag) {
        usage();
    } else {
        PDDocument document = null;
        try {
            document = PDDocument.load(PrintTextLocations.INPUTFILE);
            if (document.isEncrypted()) {
                try {
                    document.decrypt("");
                } catch (InvalidPasswordException e) {
                    System.err.println("Error: Document is encrypted with a password.");
                    System.exit(1);
                }
            }
            PrintImageLocations printer = new PrintImageLocations();
            List allPages = document.getDocumentCatalog().getAllPages();
            for (int i = 0; i < allPages.size(); i++) {
                PDPage page = (PDPage) allPages.get(i);
                System.out.println("Processing page: " + i);
                printer.processStream(page, page.findResources(), page.getContents().getStream());
            }
        } finally {
            if (document != null) {
                document.close();
            }
        }
    }
}

From source file:PrintUtil.java

License:Apache License

/**
 * Loads the PDF at {@code path} and prints it through {@code job}, requesting duplex output
 * on ISO A4 with the pages scaled to fit.
 *
 * <p>Load and print failures are reported via their stack trace and not propagated. If the
 * document cannot be loaded, nothing is printed.
 */
void print() {
    PDDocument doc;
    try {
        doc = PDDocument.load(new File(path));
    } catch (IOException e) {
        e.printStackTrace();
        // Without a document there is nothing to print; previously execution continued and
        // handed a null document to PDFPrintable.
        return;
    }

    try {
        job.setPrintable(new PDFPrintable(doc, Scaling.SCALE_TO_FIT));
        PrintRequestAttributeSet attr = new HashPrintRequestAttributeSet();
        attr.add(Sides.DUPLEX);
        attr.add(MediaSizeName.ISO_A4);
        System.out.println("Printing =>" + path);
        job.print(attr);
    } catch (PrinterException e) {
        e.printStackTrace();
    } finally {
        try {
            doc.close();
        } catch (IOException ignored) {
            // Best effort: the print attempt is already over, so a failed close is not reported.
        }
    }
}

From source file:PDFConverter.java

License:Apache License

/**
 * Implementation is informed by PDFBox authors.
 *
 * @param doc/*from w  w  w.ja  v  a 2  s. c om*/
 * @return
 * @throws IOException
 */
@Override
public synchronized ConvertedDocument convert(java.io.File doc) throws IOException {

    /*
     * Licensed to the Apache Software Foundation (ASF) under one or more
     * contributor license agreements.  See the NOTICE file distributed with
     * this work for additional information regarding copyright ownership.
     * The ASF licenses this file to You under the Apache License, Version 2.0
     * (the "License"); you may not use this file except in compliance with
     * the License.  You may obtain a copy of the License at
     *
     *      http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */
    /**
     * Adapted from LucenePDFDocument.java from PDFBox lucene project
     *
     * This class is used to create a document for the lucene search engine.
     * This should easily plug into the IndexHTML or IndexFiles that comes
     * with the lucene project. This class will populate the following
     * fields.
     * <table> <tr> <th>Lucene Field Name</th> <th>Description</th> </tr>
     * <tr>
     * <td>path</td> <td>File system path if loaded from a file</td> </tr>
     * <tr>
     * <td>url</td> <td>URL to PDF document</td> </tr> <tr>
     * <td>contents</td>
     * <td>Entire contents of PDF document, indexed but not stored</td>
     * </tr>
     * <tr> <td>summary</td> <td>First 500 characters of content</td> </tr>
     * <tr>
     * <td>modified</td> <td>The modified date/time according to the url or
     * path</td> </tr> <tr> <td>uid</td> <td>A unique identifier for the
     * Lucene document.</td> </tr> <tr> <td>CreationDate</td> <td>From PDF
     * meta-data if available</td> </tr> <tr> <td>Creator</td> <td>From PDF
     * meta-data if available</td> </tr> <tr> <td>Keywords</td> <td>From PDF
     * meta-data if available</td> </tr> <tr> <td>ModificationDate</td>
     * <td>From PDF meta-data if available</td> </tr> <tr> <td>Producer</td>
     * <td>From PDF meta-data if available</td> </tr> <tr> <td>Subject</td>
     * <td>From PDF meta-data if available</td> </tr> <tr> <td>Trapped</td>
     * <td>From PDF meta-data if available</td> </tr> <tr>
     * <td>Encrypted</td> <td>From PDF meta-data if available</td> </tr>
     * </table>
     *
     * @author <a href="mailto:ben@benlitchfield.com">Ben Litchfield</a>
     * @version $Revision: 1.23 $
     *
     * @throws IOException If there is an error parsing the document.
     */
    PDDocument pdfDocument = null;
    ConvertedDocument textdoc = new ConvertedDocument(doc);

    try {
        pdfDocument = PDDocument.load(doc);

        if (pdfDocument.isEncrypted()) {
            //Just try using the default password and move on
            // Even if the doc is encrypted, apparently you can try. Throw exception if it fails.
            textdoc.addProperty("encrypted", "YES");
        }

        //create a writer where to append the text content.
        StringWriter writer = new StringWriter();
        stripper.resetEngine();
        stripper.writeText(pdfDocument, writer);

        PDDocumentInformation info = pdfDocument.getDocumentInformation();
        if (info != null) {
            textdoc.addAuthor(info.getAuthor());
            try {
                textdoc.addCreateDate(info.getCreationDate());
            } catch (IOException io) {
                //ignore, bad date but continue with indexing
            }
            textdoc.addProperty("creator_tool", info.getCreator());
            textdoc.addProperty("keywords", info.getKeywords());
            /* try {
             metadata.add("ModificationDate", info.getModificationDate());
             } catch (IOException io) {
             //ignore, bad date but continue with indexing
             } */
            //metadata.add("Producer", info.getProducer());
            textdoc.addProperty("subject", info.getSubject());
            String ttl = info.getTitle();
            if (ttl == null || "untitled".equalsIgnoreCase(ttl)) {
                ttl = textdoc.filename;
            }
            textdoc.addTitle(ttl);
            // metadata.add("Trapped", info.getTrapped());

            // TODO: Character set is what?
            textdoc.setEncoding("UTF-8");
        }

        // Note: the buffer to string operation is costless;
        // the char array value of the writer buffer and the content string
        // is shared as long as the buffer content is not modified, which will
        // not occur here.
        textdoc.setText(writer.getBuffer().toString());

        return textdoc;

    } finally {
        if (pdfDocument != null) {
            pdfDocument.close();
        }
    }
}

From source file:PDFExtractMetadata.java

License:Apache License

/**
 * This is the main method.
 *
 * <p>Expects exactly one argument, the path of a PDF. If the document catalog carries XMP
 * metadata, values from its Dublin Core, Adobe PDF and XMP Basic schemas are passed to the
 * display helpers; otherwise the document information, when present, is shown instead.
 *
 * @param args The command line arguments.
 *
 * @throws IOException If there is an error parsing the document.
 * @throws XmpParsingException
 */
public static void main(String[] args) throws IOException, XmpParsingException {
    if (args.length != 1) {
        usage();
        System.exit(1);
        return;
    }

    PDDocument loaded = null;
    try {
        loaded = PDDocument.load(new File(args[0]));
        PDMetadata xmpStream = loaded.getDocumentCatalog().getMetadata();

        if (xmpStream == null) {
            // The pdf doesn't contain any metadata, try to use the
            // document information instead
            PDDocumentInformation docInfo = loaded.getDocumentInformation();
            if (docInfo != null) {
                showDocumentInformation(docInfo);
            }
            return;
        }

        DomXmpParser parser = new DomXmpParser();
        try {
            XMPMetadata xmp = parser.parse(xmpStream.createInputStream());

            DublinCoreSchema dublinCore = xmp.getDublinCoreSchema();
            if (dublinCore != null) {
                display("Title:", dublinCore.getTitle());
                display("Description:", dublinCore.getDescription());
                listString("Creators: ", dublinCore.getCreators());
                listCalendar("Dates:", dublinCore.getDates());
                listString("Subjects:", dublinCore.getSubjects());
            }

            AdobePDFSchema adobe = xmp.getAdobePDFSchema();
            if (adobe != null) {
                display("Keywords:", adobe.getKeywords());
                display("PDF Version:", adobe.getPDFVersion());
                display("PDF Producer:", adobe.getProducer());
            }

            XMPBasicSchema xmpBasic = xmp.getXMPBasicSchema();
            if (xmpBasic != null) {
                display("Create Date:", xmpBasic.getCreateDate());
                display("Modify Date:", xmpBasic.getModifyDate());
                display("Creator Tool:", xmpBasic.getCreatorTool());
            }
        } catch (XmpParsingException parseFailure) {
            System.err.println("An error ouccred when parsing the meta data: " + parseFailure.getMessage());
        }
    } finally {
        if (loaded != null) {
            loaded.close();
        }
    }

}

From source file:ExtractTextFromPdf.java

/**
 * Extracts the text of a hard-coded PDF file and prints it with every character other than
 * ASCII letters, digits, '.' and space removed.
 *
 * <p>Failures are reported via their stack trace; the document is closed on every path.
 *
 * @param args not used
 */
public static void main(String[] args) {
    String fileName = "C:/Users/Kavya Gupta/Desktop/Texas_Title.pdf";
    PDDocument pdDoc = null;

    try {
        // The earlier cast of the File to RandomAccessRead always threw ClassCastException,
        // so the text extraction below was never reached; it has been removed.
        pdDoc = PDDocument.load(new File(fileName));
        PDFTextStripper pdfStripper = new PDFTextStripper();
        String parsedText = pdfStripper.getText(pdDoc);
        System.out.println(parsedText.replaceAll("[^A-Za-z0-9. ]+", ""));
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // Close on success as well as on failure (it used to happen only in the catch path).
        if (pdDoc != null) {
            try {
                pdDoc.close();
            } catch (Exception closeFailure) {
                closeFailure.printStackTrace();
            }
        }
    }
}

From source file:DecodePlate.java

License:Open Source License

/**
 * Rebuilds the set of nearby fixes for the current airport, then renders the single page of
 * the plate PDF into an image, resolves fixes on it, and optionally writes the marked image
 * as a PNG.
 *
 * @throws Exception if the PDF does not contain exactly one page, if the PNG cannot be
 *                   written, or if loading or rendering fails
 */
public static void ProcessPlateWork() throws Exception {
    // Filter out fixes more than 50nm away from airport, no chart goes that far.
    // This helps us from trying to decode spurious strings as fix names and
    // helps us avoid duplicate name problems.

    nearDBFixes.clear();
    for (DBFix dbfix : allDBFixes) {
        if (Lib.LatLonDist(dbfix.lat, dbfix.lon, airport.lat, airport.lon) <= maxFixDistNM) {
            dbfix.mentioned = false;
            nearDBFixes.put(dbfix.name, dbfix);
        }
    }

    // Also add in runways as fixes cuz some plates use them for fixes.

    for (Runway rwy : airport.runways.values()) {
        nearDBFixes.put(rwy.name, rwy);
    }

    // Open PDF and scan it.

    PDDocument pddoc = PDDocument.load(pdfName);
    try {
        PDDocumentCatalog doccat = pddoc.getDocumentCatalog();
        PDPageNode pages = doccat.getPages();
        List kids = new LinkedList();
        pages.getAllKids(kids);
        if (kids.size() != 1)
            throw new Exception("pdf not a single Page");
        PDPage page = (PDPage) kids.get(0);

        // Scale the page's media box from PDF resolution to the target resolution, rounding.
        int imgWidth = (int) (page.getMediaBox().getWidth() / pdfDpi * csvDpi + 0.5F);
        int imgHeight = (int) (page.getMediaBox().getHeight() / pdfDpi * csvDpi + 0.5F);
        BufferedImage bi = new BufferedImage(imgWidth, imgHeight, BufferedImage.TYPE_INT_ARGB);
        Graphics2D g2d = bi.createGraphics();
        try {
            PagePanel pagepanel = new PagePanel(page);
            pagepanel.paintComponent(g2d);
            pagepanel.resolveFixes(g2d);
            if (markedpngname != null) {
                if (!ImageIO.write(bi, "png", new File(markedpngname))) {
                    throw new IOException("ImageIO.write(" + markedpngname + ") failed");
                }
            }
        } finally {
            g2d.dispose();
        }
    } finally {
        // Close on every path; previously the "not a single Page" and write failures leaked
        // the open document.
        pddoc.close();
    }
}