Example usage for org.apache.pdfbox.pdmodel PDDocument load

Introduction

On this page you can find example usages of org.apache.pdfbox.pdmodel.PDDocument.load.

Prototype

public static PDDocument load(byte[] input) throws IOException

Source Link

Document

Parses a PDF and returns it as a PDDocument. The caller is responsible for closing the returned document.

Usage

From source file:org.mabb.fontverter.opentype.TtfInstructions.TestFullTtfPrograms.java

License:Open Source License

/**
 * Converts the embedded HelveticaNeue Type0 font of the test PDF to OpenType and runs the
 * instruction program of its second non-empty glyph through the TrueType virtual machine.
 * The test passes when no exception is thrown.
 */
@Test
public void executeSecondGlyphIn_BrokenHelveticaNeueTtf() throws Exception {
    PDDocument doc = PDDocument.load(TestUtils.readTestFile("pdf/HorariosMadrid_Segovia.pdf"));
    try {
        PDFont rawType0Font = extractFont(doc, "TCQDAA+HelveticaNeue-Light-Identity-H");
        OpenTypeFont font = (OpenTypeFont) PdfFontExtractor.convertType0FontToOpenType((PDType0Font) rawType0Font);

        List<TtfGlyph> glyphs = font.getGlyfTable().getNonEmptyGlyphs();
        // Index 1 is the second non-empty glyph, as the test name states.
        TtfGlyph glyph = glyphs.get(1);
        List<TtfInstruction> instructions = glyph.getInstructions();

        TtfVirtualMachine vm = new TtfVirtualMachine(font);
        vm.execute(instructions);
    } finally {
        // Release the document even when conversion or execution fails.
        doc.close();
    }
}

From source file:org.mabb.fontverter.pdf.PdfFontExtractor.java

License:Open Source License

/**
 * Extracts every font of a PDF file into a directory, converting to the requested format.
 *
 * @param extractPath directory that receives the extracted fonts; created (including missing
 *                    parent directories) when it does not exist yet
 * @param pdfFile     the PDF file to read
 * @param format      the font format to set on the extractor before extracting
 * @throws IOException if the output directory cannot be created, or if loading the PDF or
 *                     extracting its fonts fails
 */
private static void extractPdfFonts(String extractPath, File pdfFile, FontFormat format) throws IOException {
    File fontExtractDir = new File(extractPath);
    // mkdirs() returns false when the directory already exists (e.g. created concurrently),
    // so re-check isDirectory() before treating the result as a failure.
    if (!fontExtractDir.exists() && !fontExtractDir.mkdirs() && !fontExtractDir.isDirectory())
        throw new IOException("Could not create font extract directory: " + extractPath);

    PDDocument pdf = PDDocument.load(pdfFile);
    try {
        PdfFontExtractor fontExtractor = new PdfFontExtractor();
        fontExtractor.setExtractFormat(format);
        fontExtractor.extractFontsToDir(pdf, extractPath);
    } finally {
        // Close the document even when extraction throws.
        pdf.close();
    }
}

From source file:org.mabb.fontverter.pdf.PdfFontExtractor.java

License:Open Source License

/**
 * Loads a PDF from a file and extracts its fonts into the given directory.
 *
 * @param pdf  the PDF file to read
 * @param path directory path handed to the document-based overload
 * @throws IOException if loading the PDF or extracting its fonts fails
 */
public void extractFontsToDir(File pdf, String path) throws IOException {
    PDDocument doc = PDDocument.load(pdf);
    try {
        extractFontsToDir(doc, path);
    } finally {
        // Close the document even when extraction throws.
        doc.close();
    }
}

From source file:org.mabb.fontverter.pdf.PdfFontExtractor.java

License:Open Source License

/**
 * Loads a PDF from an in-memory byte array and extracts its fonts into the given directory.
 *
 * @param pdf  the raw bytes of the PDF
 * @param path directory path handed to the document-based overload
 * @throws IOException if parsing the PDF or extracting its fonts fails
 */
public void extractFontsToDir(byte[] pdf, String path) throws IOException {
    PDDocument doc = PDDocument.load(pdf);
    try {
        extractFontsToDir(doc, path);
    } finally {
        // Close the document even when extraction throws.
        doc.close();
    }
}

From source file:org.mabb.fontverter.pdf.TestPdfFontExtractor.java

License:Open Source License

/**
 * Extracts the fonts of the brno30 test PDF as FVFont objects and checks how many were found.
 */
// NOTE(review): the method name says "2 fonts" but the assertion expects 3 — confirm which is intended.
@Test
public void givenPdfWith2Fonts_extractFontsToFVFontList_thenListHasSameNumberOfFonts() throws IOException {
    PDDocument doc = PDDocument.load(TestUtils.readTestFile("pdf/brno30.pdf"));
    try {
        PdfFontExtractor extractor = new PdfFontExtractor();

        List<FVFont> fonts = extractor.extractToFVFonts(doc);

        Assert.assertEquals(3, fonts.size());
    } finally {
        // Close the document even when extraction or the assertion fails.
        doc.close();
    }
}

From source file:org.mabb.fontverter.pdf.TestPdfFontExtractor.java

License:Open Source License

/**
 * Extracts the fonts of the brno30 test PDF into the temporary folder and checks that
 * exactly three files were written, all with the "ttf" extension.
 */
@Test
public void givenPdfWith2Fonts_extractFontsToDir_thenDirectoryHasThreeTtfFiles() throws IOException {
    PDDocument doc = PDDocument.load(TestUtils.readTestFile("pdf/brno30.pdf"));
    try {
        PdfFontExtractor extractor = new PdfFontExtractor();

        File extractDir = folder.getRoot();
        extractor.extractFontsToDir(doc, extractDir);
        File[] fontFiles = extractDir.listFiles();

        // listFiles() returns null on I/O error or when the path is not a directory;
        // fail with a clear assertion instead of a NullPointerException.
        Assert.assertNotNull(fontFiles);
        Assert.assertEquals(3, fontFiles.length);
        for (File fileOn : fontFiles)
            Assert.assertEquals("ttf", FilenameUtils.getExtension(fileOn.getPath()));
    } finally {
        // Close the document even when extraction or an assertion fails.
        doc.close();
    }
}

From source file:org.mabb.fontverter.pdf.TestPdfFontExtractor.java

License:Open Source License

/**
 * Extracts the fonts of the brno30 test PDF into the temporary folder with the WOFF1 format
 * selected and checks that exactly three files were written, all with the "woff" extension.
 */
@Test
public void givenPdfWith2Fonts_extractFontsToDirWithWoff1FormatSet_thenDirectoryHasThreeWoffFiles()
        throws IOException {
    File extractDir = folder.getRoot();
    PDDocument doc = PDDocument.load(TestUtils.readTestFile("pdf/brno30.pdf"));
    try {
        PdfFontExtractor extractor = new PdfFontExtractor();
        extractor.setExtractFormat(FontVerter.FontFormat.WOFF1);

        extractor.extractFontsToDir(doc, extractDir);
        File[] fontFiles = extractDir.listFiles();

        // listFiles() returns null on I/O error or when the path is not a directory;
        // fail with a clear assertion instead of a NullPointerException.
        Assert.assertNotNull(fontFiles);
        Assert.assertEquals(3, fontFiles.length);
        for (File fileOn : fontFiles)
            Assert.assertEquals("woff", FilenameUtils.getExtension(fileOn.getPath()));
    } finally {
        // Close the document even when extraction or an assertion fails.
        doc.close();
    }
}

From source file:org.mabb.fontverter.pdf.TestType0ToOpenTypeConverter.java

License:Open Source License

/**
 * Loads the HorariosMadrid_Segovia test PDF and stores it in {@code doc}.
 *
 * @throws IOException if the test file cannot be read or parsed
 */
public TestType0ToOpenTypeConverter() throws IOException {
    PDDocument loaded = PDDocument.load(TestUtils.readTestFile("pdf/HorariosMadrid_Segovia.pdf"));
    this.doc = loaded;
}

From source file:org.MagicBeans.latexFileType.PDFGenerator.java

/**
 * Returns one page of the temporary PDF file.
 *
 * The path is refreshed into {@code PDF_PATH} on every call, and the loaded document is kept
 * in {@code inputPDF}; it is not closed here.
 *
 * @param number 1-based page number
 * @return the requested page, or {@code null} when the file does not exist, the number is
 *         out of range, or loading fails with an {@link IOException}
 */
public PDPage getPDFPage(int number) {
    PDF_PATH = ApplicationUtils.getTempPDFFile(workingDir);

    File pdfFile = new File(PDF_PATH);
    if (!pdfFile.exists()) {
        return null;
    }

    try {
        inputPDF = PDDocument.load(pdfFile);
        List<PDPage> pages = inputPDF.getDocumentCatalog().getAllPages();
        boolean inRange = pages != null && !pages.isEmpty() && number > 0 && number <= pages.size();
        // Callers use 1-based page numbers; the list is 0-based.
        return inRange ? pages.get(number - 1) : null;
    } catch (IOException ex) {
        return null;
    }
}

From source file:org.mitre.xtext.converters.PDFConverter.java

License:Apache License

/**
 * Extracts the text and basic metadata of a PDF file into a {@link ConvertedDocument}.
 * Implementation is informed by PDFBox authors.
 *
 * Encrypted documents are not decrypted: they are only flagged with the property
 * {@code encrypted=YES} and returned without payload.
 *
 * @param doc the PDF file to convert
 * @return the converted document carrying the extracted text and metadata
 * @throws IOException If there is an error parsing the document.
 */
@Override
public synchronized ConvertedDocument convert(java.io.File doc) throws IOException {

    /*
     * Licensed to the Apache Software Foundation (ASF) under one or more
     * contributor license agreements.  See the NOTICE file distributed with
     * this work for additional information regarding copyright ownership.
     * The ASF licenses this file to You under the Apache License, Version 2.0
     * (the "License"); you may not use this file except in compliance with
     * the License.  You may obtain a copy of the License at
     *
     *      http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */

    /*
     * Adapted from LucenePDFDocument.java from the PDFBox lucene project
     * (original author: Ben Litchfield, revision 1.23).
     */
    ConvertedDocument converted = new ConvertedDocument(doc);
    PDDocument pdf = null;

    try {
        pdf = PDDocument.load(doc);

        if (pdf.isEncrypted()) {
            // Decrypting with the default (empty) password is intentionally disabled:
            // it needs the BouncyCastle crypto JARs, and without them
            // PDDocument.openProtection fails with NoClassDefFoundError.
            converted.addProperty("encrypted", "YES");
            return converted;
        }

        // Collect the text content of the whole document.
        StringWriter textBuffer = new StringWriter();
        stripper.resetEngine();
        stripper.writeText(pdf, textBuffer);

        PDDocumentInformation meta = pdf.getDocumentInformation();
        if (meta != null) {
            converted.addAuthor(meta.getAuthor());
            try {
                converted.addCreateDate(meta.getCreationDate());
            } catch (IOException ignored) {
                // Bad creation date: skip it and continue with the remaining metadata.
            }
            converted.addProperty("creator_tool", meta.getCreator());
            converted.addProperty("keywords", meta.getKeywords());
            converted.addProperty("subject", meta.getSubject());
            // ModificationDate, Producer and Trapped are deliberately not recorded.

            // Fall back to the file name when the PDF has no usable title.
            String title = meta.getTitle();
            boolean missingTitle = title == null || "untitled".equalsIgnoreCase(title);
            converted.addTitle(missingTitle ? converted.filename : title);

            // TODO: Character set is what?
            converted.setEncoding("UTF-8");
        }

        converted.setPayload(textBuffer.getBuffer().toString());
        return converted;

    } finally {
        // Runs on every exit path, including the early return for encrypted documents.
        if (pdf != null) {
            pdf.close();
        }
    }
}