Example usage for org.apache.pdfbox.pdmodel PDDocument load

List of usage examples for org.apache.pdfbox.pdmodel PDDocument load

Introduction

On this page you can find example usages of org.apache.pdfbox.pdmodel.PDDocument.load.

Prototype

public static PDDocument load(byte[] input) throws IOException 

Source Link

Document

Parses a PDF.

Usage

From source file:com.openkm.util.metadata.MetadataExtractor.java

License:Open Source License

/**
 * Extracts the page count and the document-information fields from a PDF.
 *
 * <p>The parsed {@link PDDocument} is always closed before this method returns, so the
 * resources held by the parser are released even when reading the metadata fails. This
 * method does not itself close {@code is}.
 *
 * @param is stream positioned at the start of the PDF content
 * @return the populated metadata holder
 * @throws IOException if the document cannot be parsed or closed
 */
public static PdfMetadata pdfExtractor(InputStream is) throws IOException {
    PDDocument doc = PDDocument.load(is);
    try {
        PDDocumentInformation info = doc.getDocumentInformation();
        PdfMetadata md = new PdfMetadata();

        md.setNumberOfPages(doc.getNumberOfPages());
        md.setTitle(info.getTitle());
        md.setAuthor(info.getAuthor());
        md.setSubject(info.getSubject());
        md.setKeywords(info.getKeywords());
        md.setCreator(info.getCreator());
        md.setProducer(info.getProducer());
        md.setTrapped(info.getTrapped());
        md.setCreationDate(info.getCreationDate());
        md.setModificationDate(info.getModificationDate());

        log.info("pdfExtractor: {}", md);
        return md;
    } finally {
        // Previously the document was never closed, leaking the parser's resources.
        doc.close();
    }
}

From source file:com.plumblarrick.andrew.cityrecordtextextractor.IssueExtractorPositional.java

/**
 * Extracts the text of a PDF file and writes it, preceded by a "Source file" header line,
 * to a UTF-8 encoded output file.
 *
 * <p>The {@code inFileName}, {@code outFileName}, {@code document} and {@code fileOut}
 * fields are updated as a side effect, as before. Both the writer and the document are
 * closed independently of each other, whichever step failed.
 *
 * @param inFileName  path of the PDF to read
 * @param outFileName path of the text file to create
 * @throws IOException if the PDF cannot be loaded or the output cannot be written
 */
public void extractToFile(String inFileName, String outFileName) throws IOException {

    this.inFileName = inFileName;
    this.outFileName = outFileName;

    // Track this call's resources in locals so the cleanup below never touches a stale
    // (or null) field left over from an earlier call or a partially failed setup.
    PDDocument loaded = null;
    BufferedWriter writer = null;
    try {
        loaded = PDDocument.load(new File(inFileName));
        document = loaded;

        PDFTextStripper stripper = new CRTStripper();
        //stripper.setSortByPosition(true);
        // NOTE(review): PDFTextStripper page numbers are documented as 1-based; 0 is kept
        // here to preserve the existing behavior - confirm whether 1 was intended.
        stripper.setStartPage(0);
        stripper.setEndPage(loaded.getNumberOfPages());

        writer = new BufferedWriter(new PrintWriter(outFileName, "UTF-8"));
        fileOut = writer;

        writer.write("Source file: " + inFileName + "\n");
        stripper.writeText(loaded, writer);

    } finally {
        try {
            // close() flushes first; the null check covers a failure before the writer
            // was opened (the old code dereferenced fileOut unconditionally here).
            if (writer != null) {
                writer.close();
            }
        } finally {
            if (loaded != null) {
                loaded.close();
            }
        }
    }
}

From source file:com.poscoict.license.service.CertificateService.java

/**
 * Renders every page of a PDF to image files.
 *
 * <p>Pages 1 through the last page are written with {@link PDFImageWriter} using the format
 * {@code Consts.IMG_FORMAT} and the output prefix {@code Consts.IMG_PATH + fileName}.
 * I/O failures are logged and reported through the return value rather than thrown.
 *
 * @param sourceFile path of the PDF to render
 * @param fileName   output file name prefix, appended to {@code Consts.IMG_PATH}
 * @param resolution resolution passed through to the image writer
 * @param password   password passed through to the image writer
 * @return the result of {@code writeImage}, or {@code false} if loading or writing failed
 */
public boolean extractPagesAsImage(String sourceFile, String fileName, int resolution, String password) {

    boolean result = false;
    String imageFormat = Consts.IMG_FORMAT;
    PDDocument pdfDoc = null;

    try {
        // Load the document and determine how many pages have to be rendered.
        pdfDoc = PDDocument.load(sourceFile);
        int pdfPageCn = pdfDoc.getNumberOfPages();

        // Only reached when loading succeeded; previously a failed load fell through and
        // handed a null document to the writer.
        PDFImageWriter imageWriter = new PDFImageWriter();
        result = imageWriter.writeImage(pdfDoc, imageFormat, password, 1, // first page
                pdfPageCn, // last page
                Consts.IMG_PATH + fileName, // output path prefix
                BufferedImage.TYPE_INT_RGB, resolution);
    } catch (IOException ioe) {
        logger.error("PDF ?  : ", ioe);
    } finally {
        // Close in finally so the document is released even when rendering fails.
        if (pdfDoc != null) {
            try {
                pdfDoc.close();
            } catch (IOException ioe) {
                logger.error("PDF ?  : ", ioe);
            }
        }
    }
    return result;
}

From source file:com.proquest.demo.allinone.PDFLBase.java

/**
 * Extract Text using PDF BOX, instead APDFL
 *
 * @param fileNamePath/* w  w  w.j a va  2 s  . co m*/
 * @return
 * @throws Exception
 */
public byte[] extractTextPDFBox(final String fileNamePath) throws Exception {
    final String BLANK_SPACE = " ";
    final String UTF_8 = "UTF-8";
    final PropertyReaderLib libPropertyReaderLib = new PropertyReaderLib(PropertyFileNames.PDFLIBRARY);
    final String regex = libPropertyReaderLib
            .getPropertyValue(PdfLibraryKeys.REGEX_TO_REMOVE_FROM_EXTRACTEDTEXT.toString());

    byte[] bytesToReturn = null;
    try {
        final File file = new File(fileNamePath);
        final PDDocument pdfDoc = PDDocument.load(file);
        final PDFTextStripper pdfStripper = new PDFTextStripper();
        final String textFromPDF = pdfStripper.getText(pdfDoc);
        pdfDoc.close();

        bytesToReturn = textFromPDF.getBytes(UTF_8);
        final String textStr = new String(bytesToReturn).replaceAll(regex, BLANK_SPACE);
        bytesToReturn = textStr.getBytes();
    } catch (IOException e) {
        throw new Exception(e.getMessage());
    }
    return bytesToReturn;
}

From source file:com.przemo.pdfmanipulate.PDFBuilder.java

/**
 * Loads the PDF at {@code sourcePath} and renders {@code form} into it.
 *
 * <p>On success the open document is returned and the caller is responsible for closing
 * it. If rendering fails, the document is closed here before the failure propagates, so a
 * failed build no longer leaks the loaded document.
 *
 * @param sourcePath path of the PDF to load
 * @param form       form data handed to {@code renderForm}
 * @return the loaded document after {@code renderForm} has been applied
 * @throws IOException if the file cannot be loaded
 */
public static PDDocument build(final String sourcePath, final Formularz form) throws IOException {
    final PDDocument doc = PDDocument.load(new File(sourcePath));
    boolean rendered = false;
    try {
        renderForm(form, doc);
        rendered = true;
        return doc;
    } finally {
        if (!rendered) {
            try {
                doc.close();
            } catch (IOException ignored) {
                // The rendering failure that is already propagating is the one the
                // caller needs to see; a secondary close failure must not replace it.
            }
        }
    }
}

From source file:com.quanticate.opensource.pdftkbox.Bookmarks.java

License:Apache License

/**
 * Loads the given PDF file and keeps the parsed document in {@code document}.
 *
 * @param pdf the PDF file to load
 * @throws IOException if the file cannot be parsed
 */
public Bookmarks(File pdf) throws IOException {
    this.document = PDDocument.load(pdf);
}

From source file:com.santaanna.friendlyreader.pdfstod.GUI.PDFReader.java

License:Apache License

/**
 * This will parse a document./*from  w  w w.j a v  a2 s  .co m*/
 *
 * @param input The input stream for the document.
 *
 * @return The document.
 *
 * @throws IOException If there is an error parsing the document.
 */
private static PDDocument parseDocument(InputStream input) throws IOException {
    PDDocument document = PDDocument.load(input);
    if (document.isEncrypted()) {
        try {
            document.decrypt("");
        } catch (org.apache.pdfbox.exceptions.InvalidPasswordException e) {
            System.err.println("Error: The document is encrypted.");
        } catch (org.apache.pdfbox.exceptions.CryptographyException e) {
            e.printStackTrace();
        }
    }

    return document;
}

From source file:com.santaanna.friendlyreader.pdfstod.pdfstod3.ReplaceStringStreamEngine.java

License:Apache License

/**
 * Summarizes the PDF document (original description, translated from Swedish).
 *
 * <p>The method loads {@code inputFile} into the {@code doc1} field and then walks all of
 * its pages several times, in this order: collect the page text into {@code helaTexten},
 * split it into sentences ({@code Hittameningarna}), then run {@code splitMeningar},
 * {@code grayInsert}, {@code byggStrukturer} and {@code highlight} over every page's
 * content stream, and finally replace each page's content stream with the tokens held in
 * {@code pageTokens}. {@code saveAndClose(outputFile, doc1)} is always called from the
 * {@code finally} block, also when one of the passes throws.
 *
 * @param inputFile The PDF to open.
 * @param outputFile1 The PDF to write to; stored in the {@code outputFile} field.
 * @param DoHighlight When {@code true} and the collected text is non-empty,
 *        {@code sammanfatta} is called; the flag is also passed on to {@code highlight}.
 * @param sumslidval Passed through to {@code sammanfatta}.
 * @param valdmening Passed through to {@code highlight}.
 *
 * @return The {@code meningsvektor} field, i.e. the result of {@code Hittameningarna}.
 *
 * @throws IOException If there is an error writing the data.
 * @throws COSVisitorException If there is an error writing the PDF.
 */
public Collection<SEmening> doIt(String inputFile, String outputFile1, boolean DoHighlight, int sumslidval,
        int valdmening) throws IOException, COSVisitorException {
    // NOTE(review): gop, cfloat5, cfloat1, gray1 and filesaved (and sidtext's initial value)
    // are only referenced by code that has since been commented out of this method.
    PDFOperator gop = PDFOperator.getOperator("g");
    COSFloat cfloat5 = new COSFloat("0.25");
    COSFloat cfloat1 = new COSFloat("0.75");
    Boolean gray1 = true;
    outputFile = outputFile1;
    String meningsrest = "";
    String sidtext = "";
    Boolean filesaved = false;

    try {
        helaTexten = "";
        SkrivUt(3, "Fre DoIt doc1 load");
        doc1 = PDDocument.load(inputFile); // The input document.
        SkrivUt(3, "Efter DoIt doc1 load");
        List pages = doc1.getDocumentCatalog().getAllPages();
        // Pass 1: the first phase collects the whole text (and was meant to possibly
        // convert relative positions to absolute ones; that part is not implemented).
        for (int i = 0; i < pages.size(); i++) {
            SkrivUt(4, "Ny sida helaTexten: " + i);
            PDPage page = (PDPage) pages.get(i);
            PDStream contents = page.getContents();
            // AH: code from PageDrawer.
            if (contents != null) {
                // NOTE(review): 'resources' is never read in this method; only the
                // findResources() call itself remains.
                PDResources resources = page.findResources();
                SkrivUt(4, "Fre getHelaTexten.");
                setSumcharAlla(0); // Reset the character counter for strings.
                sidtext = getHelaTexten(page.getContents().getStream());
                // Accumulated across all pages; deliberately not reset per page.
                helaTexten += sidtext;
                SkrivUt(4, "Efter getHelaTexten.");
                // (A disabled second processStream/Hittameningarna pass used to follow here.)
            }
        }
        // Extract the sentences from the whole text.
        meningsvektor = Hittameningarna(helaTexten); // Split the text into sentences.
        SkrivUt(1, "Meningsvektor.Size: " + meningsvektor.size());
        SEmening semen = null;
        // Log every sentence that was found.
        for (int n = 0; n < meningsvektor.size(); n++) {
            SkrivUt(1, "Mening: " + meningsvektor.get(n).helameningen);
        }

        // Pass 2: strings and arrays are to be split at sentence boundaries.
        // NOTE(review): get(mind) with mind == 0 presumably fails when no sentence was
        // found (empty meningsvektor) - confirm how Hittameningarna behaves on empty text.
        mind = 0;
        meningsrest = meningsvektor.get(mind).helameningen; // Remaining text on the current page
        for (int i = 0; i < pages.size(); i++) {
            SkrivUt(4, "Ny sida Split: " + i);
            // page1 is not declared in this method, so it is presumably a field.
            page1 = (PDPage) pages.get(i);
            PDStream contents = page1.getContents();
            // AH: code from PageDrawer.
            SkrivUt(4, "Innan contents test.");
            if (contents != null) {
                PDResources resources = page1.findResources();
                SkrivUt(3, "Fre splitMeningar 1.");
                setSumcharAlla(0); // Reset the character counter for strings.
                SkrivUt(4, "Fre splitMeningar 2.");
                // The unconsumed remainder is carried over to the next page.
                meningsrest = splitMeningar(meningsrest, page1.getContents().getStream());
                SkrivUt(4, "*** meningsrest: " + meningsrest);
                SkrivUt(4, "3, Efter splitMeningar.");
            }
        }

        // (A disabled save/reload of doc1 used to sit here; the author marked it "check this code".)

        // Pass 3: g operators are to be added for every TJ and Tj.
        mind = 0; // Original author's question: is this needed here?
        meningsrest = meningsvektor.get(mind).helameningen; // Remaining text on the current page.

        for (int i = 0; i < pages.size(); i++) {
            SkrivUt(4, "Ny sida Gray: " + i);
            page1 = (PDPage) pages.get(i);
            PDStream contents = page1.getContents();
            // AH: code from PageDrawer.
            SkrivUt(4, "Innan contents test.");
            if (contents != null) {
                PDResources resources = page1.findResources();
                SkrivUt(3, "Fre GrayInsert 1.");
                setSumcharAlla(0); // Reset the character counter for strings.
                meningsrest = grayInsert(meningsrest, page1.getContents().getStream(), i);
                SkrivUt(3, "Efter grayInsert av sida.");
            }
        }
        SkrivUt(3, "Efter hela grayInsert.");

        // Pass 4: build the TB (text block) and sentence structures.
        mind = 0;
        meningsrest = meningsvektor.get(mind).helameningen; // Remaining text on the current page.

        for (int i = 0; i < pages.size(); i++) {
            TBIndex = 0; // Index into the text-block structure.
            // The data structures are used and built here.
            TBVector = new Vector<SETextBlock>(); // TB vector for this page.
            PageVector.add(i, TBVector); // Overall structure.
            tbpagenr = i;
            // NOTE(review): the log text below says "Split" although this is the
            // byggStrukturer pass - looks like a copy/paste leftover.
            SkrivUt(4, "Ny sida Split: " + i);
            page1 = (PDPage) pages.get(i);
            PDStream contents = page1.getContents();
            // AH: code from PageDrawer.
            SkrivUt(4, "Innan contents test.");
            if (contents != null) {
                PDResources resources = page1.findResources();
                SkrivUt(3, "Fre byggStrukturer 1.");
                setSumcharAlla(0); // Reset the character counter for strings.
                SkrivUt(4, "Fre byggStrukturer 2.");
                meningsrest = byggStrukturer(meningsrest, page1.getContents().getStream(), i);
                SkrivUt(4, "*** meningsrest: " + meningsrest);
                SkrivUt(3, "Efter byggStrukturer.");
            }
        }

        // Print the contents of the TB structure:
        listTextBlocks();

        // Print the sentences:
        listMeningar();
        // The last pass was meant to collect TP font metrics and store them in the TB
        // structures.

        SkrivUt(4, "Fr lngt.");
        mind = 0; // Original author's question: is this needed here?

        // Pass 5: with the processStream call disabled, this loop only records the page
        // number, calls findResources(), resets the counter and logs.
        for (int i = 0; i < pages.size(); i++) {
            tempsidnr = i;
            SkrivUt(4, "Ny sida A: " + i);
            PDPage page = (PDPage) pages.get(i);
            PDStream contents = page.getContents();
            // AH: code from PageDrawer.
            if (contents != null) {
                PDResources resources = page.findResources();
                SkrivUt(4, "Fre processStream.");
                setSumcharAlla(0); // Reset the character counter for strings.
                // (The processStream call that handled the page is commented out.)
                SkrivUt(4, "Efter processStream. fre nya");
            }
        }
        SkrivUt(3, "Efter processStream.");

        // Here the text is handed to EasyReader and the list of sentences to be
        // highlighted is returned (original comment, translated).
        if ((DoHighlight) && !(helaTexten.equals(""))) {
            SkrivUt(2, "Fre sammanfatta. helaTexten = \"\"");
            menisammanfattningen = sammanfatta(helaTexten, sumslidval);
            System.out.println(menisammanfattningen);
        } else
            menisammanfattningen = null;
        // Pass 6: the arguments of the g operators are modified for the sentences that
        // are to be part of the summary.
        mind = 0; // Original author's question: is this needed here?
        meningsrest = meningsvektor.get(mind).helameningen; // Remaining text on the current page.
        cosenr = 0; // Index of the COSString or COSArray.
        mennr = 0; // Index of the current sentence.
        mendelnr = 0;
        mendelantal = 0; // Number of parts the sentence consists of.
        valdsida = -1; // The selected page is not known yet.
        for (int i = 0; i < pages.size(); i++) {
            SkrivUt(4, "Ny sida highlight: " + i);
            page1 = (PDPage) pages.get(i);
            PDStream contents = page1.getContents();
            // AH: code from PageDrawer.
            SkrivUt(4, "Innan contents test.");
            if (contents != null) {
                PDResources resources = page1.findResources();
                SkrivUt(1, "Fre highlight 1. Sida: " + i);
                setSumcharAlla(0); // Reset the character counter for strings.
                // highlight is called regardless of DoHighlight; the flag is passed along.
                meningsrest = highlight(meningsrest, page1.getContents().getStream(), i, DoHighlight,
                        valdmening);
                SkrivUt(1, "Efter highlight av sida:" + i);
            }
        }
        SkrivUt(3, "Efter hela highlight.");

        // Pass 7: originally "time to fetch font metrics and save to file"; the author
        // noted that this is not to be done at present. What remains writes the tokens in
        // pageTokens back into each page.
        for (int i = 0; i < pages.size(); i++) {
            TBVector = new Vector<SETextBlock>(); // TB vector for this page.
            // NOTE(review): pass 4 already added an entry at index i for every page, so
            // this second add(i, ...) presumably inserts additional (empty) vectors into
            // PageVector - confirm this is intended.
            PageVector.add(i, TBVector); // Add the TB vector for this page.
            SkrivUt(4, "Ny sida X: " + i);
            // 'page' is not declared in this scope (the earlier 'PDPage page' locals are
            // loop-body scoped), so this assigns a variable declared outside the method.
            page = (PDPage) pages.get(i);
            PDStream contents = page.getContents();
            // AH: code from PageDrawer.
            if (contents != null) {
                PDResources resources = page.findResources();
                SkrivUt(4, "Fre processStream.");
                setSumcharAlla(0); // Reset the character counter for strings.
                // (The processStream call that extracted font metrics is commented out.)
                SkrivUt(4, "Efter andra processStream.");
            }
            SkrivUt(3, "Efter hela andra processStream.");
            // (A large disabled block used to follow here: it walked getTokenList(),
            // replaced strToFind with message in the operands of Tj/TJ operators and
            // inserted alternating gray-level "g" operators using gop/cfloat1/cfloat5.)

            // Now that the tokens are updated, replace the page content stream, i.e.
            // write the updated data back into the document.
            SkrivUt(3, ">>> Fre spara tokens i DoIt.");
            PDStream updatedStream = new PDStream(doc1);
            SkrivUt(3, ">>> Efter updated stream i DoIt.");
            OutputStream out = updatedStream.createOutputStream();
            ContentStreamWriter tokenWriter = new ContentStreamWriter(out);
            tokenWriter.writeTokens(pageTokens.get(i));
            page.setContents(updatedStream);
            SkrivUt(3, ">>> Efter spara tokens i DoIt.");
        }
        // (A disabled inline save/close of doc1 used to sit here; see finally below.)
    } finally {
        // Always save and close, including when a pass above threw.
        // NOTE(review): doc1 may still hold a value from an earlier call (or be null) if
        // PDDocument.load failed - confirm saveAndClose copes with that.
        saveAndClose(outputFile, doc1);
    }

    return meningsvektor;
}

From source file:com.shmsoft.dmass.ocr.PDFImageExtractor.java

License:Apache License

/**
 * Extracts the images embedded in the PDF {@code file} into the configured extraction
 * directory.
 *
 * <p>At most {@code Project.getProject().getOcrMaxImagesPerPDF()} images are written.
 * Pages that have no resource dictionary or no images are skipped. The document is always
 * closed before returning, including on the early return taken when the image limit is
 * reached. I/O failures are printed and the images extracted so far are returned.
 *
 * @return the paths (including suffix) of the image files that were written
 */
@SuppressWarnings("rawtypes")
@Override
public List<String> extractImages() {
    File extractionDir = new File(conf.getPdfImageExtractionDir());
    extractionDir.mkdirs();

    List<String> result = new ArrayList<String>();

    PDDocument document = null;
    try {
        document = PDDocument.load(file);

        List pages = document.getDocumentCatalog().getAllPages();
        int i = 1; // 1-based count of the image about to be written
        int maxNumberOfImages = Project.getProject().getOcrMaxImagesPerPDF();

        for (Object pageObject : pages) {
            PDPage page = (PDPage) pageObject;
            PDResources resources = page.getResources();
            if (resources == null) {
                // A page without its own resources has no images to offer here;
                // previously this case dereferenced null.
                continue;
            }
            Map pageImages = resources.getImages();
            if (pageImages == null) {
                continue;
            }
            for (Object keyObject : pageImages.keySet()) {
                if (i > maxNumberOfImages) {
                    return result;
                }

                String key = (String) keyObject;
                PDXObjectImage image = (PDXObjectImage) pageImages.get(key);

                String fileName = conf.getPdfImageExtractionDir() + OCRUtil.createUniqueFileName("image");
                image.write2file(fileName);

                result.add(fileName + "." + image.getSuffix());

                i++;
            }
        }
    } catch (IOException ex) {
        ex.printStackTrace();
    } finally {
        // The document used to be left open on every path out of this method.
        if (document != null) {
            try {
                document.close();
            } catch (IOException ex) {
                ex.printStackTrace();
            }
        }
    }

    return result;
}

From source file:com.sinefine.util.pdf.Pdfs.java

License:Apache License

/**
 * Returns {@code true} if the <em>images</em> of the two PDF byte arrays are the same,
 * {@code false} otherwise./*from  ww w .  java2  s . c o  m*/
 *
 * <p>
 * Any differences between the documents that do not change the rendered image are ignored.</p>
 *
 * @param actual the actual byte array.
 * @param expected the expected byte array.
 * @return {@code true} if the two PDF <em>images</em> are the same, {@code false} otherwise.
 * @throws IOException if an error occurs whilst processing the byte arrays.
 */
public static boolean areImagesSame(final byte[] actual, final byte[] expected) throws IOException {
    PDDocument actualPdfDocument = null;
    PDDocument expectedPdfDocument = null;
    try (final InputStream actualInputStream = new ByteArrayInputStream(actual);
            final InputStream expectedInputStream = new ByteArrayInputStream(expected)) {
        actualPdfDocument = PDDocument.load(actualInputStream);
        expectedPdfDocument = PDDocument.load(expectedInputStream);
        final List<byte[]> actualPages = toPageImages(actualPdfDocument);
        final List<byte[]> expectedPages = toPageImages(expectedPdfDocument);
        return areByteListsEqual(actualPages, expectedPages);
    } finally {
        closeQuietly(actualPdfDocument);
        closeQuietly(expectedPdfDocument);
    }
}