Example usage for org.apache.pdfbox.pdmodel PDDocument load

List of usage examples for org.apache.pdfbox.pdmodel PDDocument load

Introduction

On this page you can find example usages of the load method of org.apache.pdfbox.pdmodel.PDDocument.

Prototype

public static PDDocument load(byte[] input) throws IOException 

Source Link

Document

Parses a PDF.

Usage

From source file:net.sourceforge.docfetcher.model.parse.PagingPdfParser.java

License:Open Source License

/**
 * Loads the PDF referenced by {@code file} and writes its text to {@code writer}
 * (both come from the enclosing scope) using a position-sorted, force-parsing
 * {@code PagingStripper}.
 *
 * @throws ParseException if any exception is raised while loading or stripping
 * @throws CheckedOutOfMemoryError if the JVM runs out of memory during parsing
 */
public final void run() throws ParseException, CheckedOutOfMemoryError {
    PDDocument pdf = null;
    try {
        pdf = PDDocument.load(file);

        final PagingStripper pagingStripper = new PagingStripper();
        pagingStripper.setForceParsing(true);
        pagingStripper.setSortByPosition(true);
        pagingStripper.writeText(pdf, writer);
    } catch (Exception failure) {
        // Every regular failure is reported as a parse problem.
        throw new ParseException(failure);
    } catch (OutOfMemoryError oom) {
        // Convert the error into its checked counterpart so callers must handle it.
        throw new CheckedOutOfMemoryError(oom);
    } finally {
        // The document may still be null here if loading failed.
        PdfParser.close(pdf);
    }
}

From source file:net.sourceforge.docfetcher.model.parse.TestParseFromZip.java

License:Open Source License

/**
 * Verifies that a multi-page PDF read from a zip entry stream yields the
 * expected page texts.
 *
 * <p>The loaded {@code PDDocument} is closed in a {@code finally} block so the
 * test does not leak the document when extraction or the assertion fails.
 */
@Test
public void testZippedPdf() throws Exception {
    new ZipAndRun(TestFiles.multi_page_pdf) {
        protected void handleInputStream(InputStream in) throws Exception {
            PDDocument pdfDoc = PDDocument.load(in);
            try {
                PDFTextStripper stripper = new PDFTextStripper();
                StringWriter writer = new StringWriter();
                stripper.setForceParsing(true);
                stripper.setSortByPosition(true);
                stripper.writeText(pdfDoc, writer); // Will handle encryption with empty password
                PDDocumentInformation pdInfo = pdfDoc.getDocumentInformation();
                ParseResult result = new ParseResult(writer.getBuffer()).setTitle(pdInfo.getTitle())
                        .addAuthor(pdInfo.getAuthor()).addMiscMetadata(pdInfo.getSubject())
                        .addMiscMetadata(pdInfo.getKeywords());
                String expectedContents = Util.join(Util.LS, "page 1", "page 2", "page 3");
                String actualContents = result.getContent().toString().trim();
                assertEquals(expectedContents, actualContents);
            } finally {
                // Release the document even when stripping or the assertion throws.
                pdfDoc.close();
            }
        }
    };
}

From source file:net.sourceforge.docfetcher.parse.PDFParser.java

License:Open Source License

/**
 * Extracts the text of the given PDF file.
 *
 * <p>If the document is encrypted, decryption with an empty password is
 * attempted first.
 *
 * @param file the PDF file to read
 * @return the text written by the {@code PDFTextStripper}
 * @throws ParseException if the document cannot be decrypted with the empty
 *         password, or if reading the file fails with an {@code IOException}
 */
public String renderText(File file) throws ParseException {
    PDDocument document = null;
    try {
        document = PDDocument.load(file);

        if (document.isEncrypted()) {
            try {
                document.openProtection(new StandardDecryptionMaterial(""));
            } catch (Exception decryptFailure) {
                throw new ParseException(file, Msg.no_extraction_permission.value());
            }
        }

        final StringWriter extracted = new StringWriter();
        new PDFTextStripper().writeText(document, extracted);
        return extracted.toString();
    } catch (IOException readFailure) {
        throw new ParseException(file, Msg.file_not_readable.value());
    } finally {
        if (document != null) {
            try {
                document.close();
            } catch (IOException closeFailure) {
                // A failed close must not hide the result or the original exception.
                closeFailure.printStackTrace();
            }
        }
    }
}

From source file:net.sourceforge.docfetcher.parse.PDFParser.java

License:Open Source License

/**
 * Parses the given PDF file into a {@code Document}.
 *
 * <p>The document contents are the stripped text followed by every non-null
 * metadata field (title, author, subject, keywords), each preceded by a space.
 * The title and author are additionally set on the returned document.
 *
 * @param file the PDF file to parse
 * @return the parsed document
 * @throws ParseException if the document cannot be decrypted with the empty
 *         password, or if reading the file fails with an {@code IOException}
 */
public Document parse(File file) throws ParseException {
    PDDocument document = null;
    try {
        document = PDDocument.load(file);

        // Encrypted documents are opened with an empty password.
        if (document.isEncrypted()) {
            try {
                document.openProtection(new StandardDecryptionMaterial(""));
            } catch (Exception decryptFailure) {
                throw new ParseException(file, Msg.no_extraction_permission.value());
            }
        }

        // Text body
        final StringWriter text = new StringWriter();
        new PDFTextStripper().writeText(document, text);
        DocFetcher.getInstance().setExceptionHandlerEnabled(true);

        // Metadata, appended to the body in title/author/subject/keywords order
        final PDDocumentInformation info = document.getDocumentInformation();
        final String title = info.getTitle();
        final String author = info.getAuthor();
        final String[] fields = { title, author, info.getSubject(), info.getKeywords() };
        for (final String value : fields) {
            if (value != null) {
                text.append(" ").append(value); //$NON-NLS-1$
            }
        }
        return new Document(file, title, text.getBuffer()).addAuthor(author);
    } catch (IOException readFailure) {
        throw new ParseException(file, Msg.file_not_readable.value());
    } finally {
        if (document != null) {
            try {
                document.close();
            } catch (IOException closeFailure) {
                closeFailure.printStackTrace();
            }
        }
    }
}

From source file:net.yacy.cora.util.Html2Image.java

License:Open Source License

/**
 * convert a pdf (first page) to an image. proper values are i.e. width = 1024, height = 1024, density = 300, quality = 75
 * using internal pdf library or external command line tool on linux or mac
 * @param pdf input pdf file/*w  ww .j a va  2  s  . c  o  m*/
 * @param image output jpg file
 * @param width
 * @param height
 * @param density (dpi)
 * @param quality
 * @return
 */
public static boolean pdf2image(File pdf, File image, int width, int height, int density, int quality) {
    final File convert = convertMac1.exists() ? convertMac1
            : convertMac2.exists() ? convertMac2 : convertDebian;

    // convert pdf to jpg using internal pdfbox capability
    if (OS.isWindows || !convert.exists()) {
        try {
            PDDocument pdoc = PDDocument.load(pdf);
            BufferedImage bi = new PDFRenderer(pdoc).renderImageWithDPI(0, density, ImageType.RGB);

            return ImageIO.write(bi, "jpg", image);

        } catch (IOException ex) {
        }
    }

    // convert on mac or linux using external command line utility
    try {
        // i.e. convert -density 300 -trim yacy.pdf[0] -trim -resize 1024x -crop x1024+0+0 -quality 75% yacy-convert-300.jpg
        // note: both -trim are necessary, otherwise it is trimmed only on one side. The [0] selects the first page of the pdf
        String command = convert.getAbsolutePath() + " -density " + density + " -trim " + pdf.getAbsolutePath()
                + "[0] -trim -resize " + width + "x -crop x" + height + "+0+0 -quality " + quality + "% "
                + image.getAbsolutePath();
        List<String> message = OS.execSynchronous(command);
        if (image.exists())
            return true;
        ConcurrentLog.warn("Html2Image", "failed to create image with command: " + command);
        for (String m : message)
            ConcurrentLog.warn("Html2Image", ">> " + m);

        // another try for mac: use Image Events using AppleScript in osacript commands...
        // the following command overwrites a pdf with an png, so we must make a copy first
        if (!OS.isMacArchitecture)
            return false;
        File pngFile = new File(pdf.getAbsolutePath() + ".tmp.pdf");
        org.apache.commons.io.FileUtils.copyFile(pdf, pngFile);
        String[] commandx = { "osascript", "-e", "set ImgFile to \"" + pngFile.getAbsolutePath() + "\"", "-e",
                "tell application \"Image Events\"", "-e", "set Img to open file ImgFile", "-e",
                "save Img as PNG", "-e", "end tell" };
        //ConcurrentLog.warn("Html2Image", "failed to create image with command: " + commandx);
        message = OS.execSynchronous(commandx);
        for (String m : message)
            ConcurrentLog.warn("Html2Image", ">> " + m);
        // now we must read and convert this file to a jpg with the target size 1024x1024
        try {
            File newPngFile = new File(pngFile.getAbsolutePath() + ".png");
            pngFile.renameTo(newPngFile);
            Image img = ImageParser.parse(pngFile.getAbsolutePath(), FileUtils.read(newPngFile));
            final Image scaled = img.getScaledInstance(width, height, Image.SCALE_AREA_AVERAGING);
            final MediaTracker mediaTracker = new MediaTracker(new Container());
            mediaTracker.addImage(scaled, 0);
            try {
                mediaTracker.waitForID(0);
            } catch (final InterruptedException e) {
            }
            // finally write the image
            final BufferedImage bi = new BufferedImage(width, height, BufferedImage.TYPE_INT_RGB);
            bi.createGraphics().drawImage(scaled, 0, 0, width, height, null);
            ImageIO.write(bi, "jpg", image);
            newPngFile.delete();
            return image.exists();
        } catch (IOException e) {
            ConcurrentLog.logException(e);
            return false;
        }
    } catch (IOException e) {
        e.printStackTrace();
        return false;
    }
}

From source file:net.yacy.document.parser.pdfParser.java

License:Open Source License

/**
 * Parses a PDF read from {@code source} into one or more index documents.
 *
 * <p>Steps, in order: request memory for the parser, load the PDF, try to open
 * encrypted documents with an empty password, read the document metadata,
 * extract links and text, close the PDF and finally trigger a font-cache clean-up.
 * When {@code individualPages} is set, one document per page is produced;
 * otherwise a single document containing the combined text is produced.
 *
 * <p>NOTE(review): any {@code Throwable} raised during link/text extraction is
 * swallowed by the catch block near the end, in which case {@code result} may
 * still be {@code null} when it is returned — callers should be checked for
 * null handling.
 *
 * @param location URL of the PDF; used for failure messages, the fallback title
 *                 and the URLs of the resulting documents
 * @param mimeType mime type passed through to the created documents
 * @param charset not read by this method
 * @param scraper not read by this method
 * @param timezoneOffset not read by this method
 * @param source stream the PDF is loaded from
 * @return the created documents (see the note above regarding {@code null})
 * @throws Parser.Failure if memory cannot be reserved, the PDF cannot be loaded,
 *         or it is encrypted and cannot be opened for content extraction
 * @throws InterruptedException declared by the signature
 */
@Override
public Document[] parse(final AnchorURL location, final String mimeType, final String charset,
        final VocabularyScraper scraper, final int timezoneOffset, final InputStream source)
        throws Parser.Failure, InterruptedException {

    // check memory for parser
    if (!MemoryControl.request(200 * 1024 * 1024, false))
        throw new Parser.Failure("Not enough Memory available for pdf parser: " + MemoryControl.available(),
                location);

    // create a pdf parser
    PDDocument pdfDoc;
    //final PDFParser pdfParser;
    try {
        // loading runs at minimum thread priority; the finally block restores NORM_PRIORITY
        // (not the priority the thread had before)
        Thread.currentThread().setPriority(Thread.MIN_PRIORITY); // the pdfparser is a big pain
        pdfDoc = PDDocument.load(source);
        //PDFParser pdfParser = new PDFParser(source);
        //pdfParser.parse();
        //pdfDoc = pdfParser.getPDDocument();
    } catch (final IOException e) {
        throw new Parser.Failure(e.getMessage(), location);
    } finally {
        Thread.currentThread().setPriority(Thread.NORM_PRIORITY);
    }

    // encrypted documents: try the empty password; on any failure close the document and give up
    if (pdfDoc.isEncrypted()) {
        try {
            pdfDoc.openProtection(new StandardDecryptionMaterial(""));
        } catch (final BadSecurityHandlerException e) {
            try {
                pdfDoc.close();
            } catch (final IOException ee) {
            }
            throw new Parser.Failure("Document is encrypted (1): " + e.getMessage(), location);
        } catch (final IOException e) {
            try {
                pdfDoc.close();
            } catch (final IOException ee) {
            }
            throw new Parser.Failure("Document is encrypted (2): " + e.getMessage(), location);
        } catch (final CryptographyException e) {
            try {
                pdfDoc.close();
            } catch (final IOException ee) {
            }
            throw new Parser.Failure("Document is encrypted (3): " + e.getMessage(), location);
        }
        // even after decryption, content extraction must be permitted
        final AccessPermission perm = pdfDoc.getCurrentAccessPermission();
        if (perm == null || !perm.canExtractContent()) {
            try {
                pdfDoc.close();
            } catch (final IOException ee) {
            }
            throw new Parser.Failure("Document is encrypted and cannot be decrypted", location);
        }
    }

    // extracting some metadata
    PDDocumentInformation info = pdfDoc.getDocumentInformation();
    String docTitle = null, docSubject = null, docAuthor = null, docPublisher = null, docKeywordStr = null;
    Date docDate = new Date(); // defaults to "now" when no modification date can be read
    if (info != null) {
        docTitle = info.getTitle();
        docSubject = info.getSubject();
        docAuthor = info.getAuthor();
        docPublisher = info.getProducer();
        if (docPublisher == null || docPublisher.isEmpty())
            docPublisher = info.getCreator();
        docKeywordStr = info.getKeywords();
        try {
            if (info.getModificationDate() != null)
                docDate = info.getModificationDate().getTime();
        } catch (IOException e) {
        }
        // unused:
        // info.getTrapped());
    }
    info = null;

    // title fallback chain: metadata title, then the unescaped file name, then the subject
    if (docTitle == null || docTitle.isEmpty()) {
        docTitle = MultiProtocolURL.unescape(location.getFileName());
    }
    if (docTitle == null) {
        docTitle = docSubject;
    }
    String[] docKeywords = null;
    if (docKeywordStr != null) {
        // keywords are separated by a single space or a single comma
        docKeywords = docKeywordStr.split(" |,");
    }

    Collection<AnchorURL>[] pdflinks = null;
    Document[] result = null;
    try {
        // get the links
        pdflinks = extractPdfLinks(pdfDoc);

        // get the fulltext (either per document or for each page)
        final PDFTextStripper stripper = new PDFTextStripper("UTF-8");

        if (individualPages) {
            // this is a hack which stores individual pages of the source pdf into individual index documents
            // the new documents will get a virtual link with a post argument page=X appended to the original url

            // collect text
            int pagecount = pdfDoc.getNumberOfPages();
            String[] pages = new String[pagecount];
            for (int page = 1; page <= pagecount; page++) {
                stripper.setStartPage(page);
                stripper.setEndPage(page);
                pages[page - 1] = stripper.getText(pdfDoc);
                //System.out.println("PAGE " + page + ": " + pages[page - 1]);
            }

            // create individual documents for each page
            assert pages.length == pdflinks.length : "pages.length = " + pages.length + ", pdflinks.length = "
                    + pdflinks.length;
            result = new Document[Math.min(pages.length, pdflinks.length)];
            String loc = location.toNormalform(true);
            for (int page = 0; page < result.length; page++) {
                // NOTE(review): the guard "page > pages.length" below looks like it was meant to be ">=";
                // it cannot trigger here because page < result.length <= pages.length
                result[page] = new Document(
                        new AnchorURL(loc + (loc.indexOf('?') > 0 ? '&' : '?') + individualPagePropertyname
                                + '=' + (page + 1)), // these are virtual new pages; we cannot combine them with '#' as that would be removed when computing the urlhash
                        mimeType, "UTF-8", this, null, docKeywords, singleList(docTitle), docAuthor,
                        docPublisher, null, null, 0.0f, 0.0f,
                        pages == null || page > pages.length ? new byte[0] : UTF8.getBytes(pages[page]),
                        pdflinks == null || page >= pdflinks.length ? null : pdflinks[page], null, null, false,
                        docDate);
            }
        } else {
            // collect the whole text at once
            final CharBuffer writer = new CharBuffer(odtParser.MAX_DOCSIZE);
            byte[] contentBytes = new byte[0];
            stripper.setEndPage(3); // get first 3 pages (always)
            writer.append(stripper.getText(pdfDoc));
            contentBytes = writer.getBytes(); // remember text in case of interrupting thread

            if (pdfDoc.getNumberOfPages() > 3) { // spare creating/starting thread if all pages read
                stripper.setStartPage(4); // continue with page 4 (terminated, resulting in no text)
                stripper.setEndPage(Integer.MAX_VALUE); // set to default
                // we start the pdf parsing in a separate thread to ensure that it can be terminated
                final PDDocument pdfDocC = pdfDoc;
                final Thread t = new Thread() {
                    @Override
                    public void run() {
                        Thread.currentThread().setName("pdfParser.getText:" + location);
                        try {
                            writer.append(stripper.getText(pdfDocC));
                        } catch (final Throwable e) {
                        }
                    }
                };
                t.start();
                // wait at most 3 seconds for the remaining pages, then interrupt the worker
                // NOTE(review): an InterruptedException from join() is caught by the catch (Throwable)
                // below and therefore never reaches the caller — confirm this is intended
                t.join(3000); // pdfbox likes to forget to terminate ... (quite often)
                if (t.isAlive())
                    t.interrupt();
            }
            contentBytes = writer.getBytes(); // get final text before closing writer

            // merge the per-page link collections into one set for the single document
            Collection<AnchorURL> pdflinksCombined = new HashSet<AnchorURL>();
            for (Collection<AnchorURL> pdflinksx : pdflinks)
                if (pdflinksx != null)
                    pdflinksCombined.addAll(pdflinksx);
            result = new Document[] { new Document(location, mimeType, "UTF-8", this, null, docKeywords,
                    singleList(docTitle), docAuthor, docPublisher, null, null, 0.0f, 0.0f, contentBytes,
                    pdflinksCombined, null, null, false, docDate) };
        }
    } catch (final Throwable e) {
        // failures are swallowed on purpose here; result keeps whatever was assigned so far (possibly null)
        //close the writer (in finally)
        //throw new Parser.Failure(e.getMessage(), location);
    } finally {
        try {
            pdfDoc.close();
        } catch (final Throwable e) {
        }
    }

    // clear resources in pdfbox. they say that is resolved but it's not. see:
    // https://issues.apache.org/jira/browse/PDFBOX-313
    // https://issues.apache.org/jira/browse/PDFBOX-351
    // https://issues.apache.org/jira/browse/PDFBOX-441
    // pdfbox still generates an enormous number of object allocations and does not delete them
    // the following objects are statically stored and never flushed:
    // COSFloat, COSArray, COSInteger, COSObjectKey, COSObject, COSDictionary,
    // COSStream, COSString, COSName, COSDocument, COSInteger[], COSNull
    // the great number of these objects can easily be seen in Java Visual VM
    // we try to get them out of memory here by forced clear calls and hope for the best
    pdfDoc = null;
    clean_up_idiotic_PDFParser_font_cache_which_eats_up_tons_of_megabytes();

    return result;
}

From source file:neuralclassification.Classificator.java

/**
 * Reads the complete text of the PDF located at {@code filepath + "/" + name}.
 *
 * @param filepath directory containing the PDF
 * @param name file name of the PDF inside {@code filepath}
 * @return the text extracted by a {@code PDFTextStripper}
 * @throws RuntimeException wrapping any {@code IOException} raised while
 *         loading, extracting or closing the document
 */
String readText(String filepath, String name) {
    final File source = new File(filepath + "/" + name);
    PDDocument document = null;
    try {
        document = PDDocument.load(source);
        return new PDFTextStripper().getText(document);
    } catch (IOException readFailure) {
        throw new RuntimeException(readFailure);
    } finally {
        if (document != null) {
            try {
                document.close();
            } catch (IOException closeFailure) {
                throw new RuntimeException(closeFailure);
            }
        }
    }
}

From source file:neuralclassification.Trainer.java

/**
 * Reads the complete text of the PDF called {@code name} inside the directory
 * given by {@code filepath} (taken from the enclosing scope).
 *
 * @param name file name of the PDF inside {@code filepath}
 * @return the text extracted by a {@code PDFTextStripper}
 * @throws RuntimeException wrapping any {@code IOException} raised while
 *         loading, extracting or closing the document
 */
String readText(String name) {
    final File source = new File(filepath + "/" + name);
    PDDocument document = null;
    try {
        document = PDDocument.load(source);
        return new PDFTextStripper().getText(document);
    } catch (IOException readFailure) {
        throw new RuntimeException(readFailure);
    } finally {
        if (document != null) {
            try {
                document.close();
            } catch (IOException closeFailure) {
                throw new RuntimeException(closeFailure);
            }
        }
    }
}

From source file:no.digipost.print.validate.PdfValidator.java

License:Apache License

/**
 * @param pdfStream the input stream for reading the PDF. It will be closed before returning from
 *                  this method/*from  w  w w.  jav  a 2  s.c  o  m*/
 * @param readStrategy decides if PDF is completely read into memory or not
 */
private PdfValidationResult validerForPrint(InputStream pdfStream,
        PdfValidationSettings printValideringsinnstillinger, PdfValidateStrategy readStrategy) {
    int antallSider = -1;
    try {
        List<PdfValidationError> errors;
        try {
            if (readStrategy == NON_SEQUENTIALLY) {
                try (EnhancedNonSequentialPDFParser dpostNonSequentialPDFParser = new EnhancedNonSequentialPDFParser(
                        pdfStream)) {
                    antallSider = dpostNonSequentialPDFParser.getNumberOfPages();
                    errors = validerStreamForPrint(dpostNonSequentialPDFParser, printValideringsinnstillinger);
                }
            } else if (readStrategy == FULLY_IN_MEMORY) {
                try (PDDocument pdDoc = PDDocument.load(pdfStream)) {
                    antallSider = pdDoc.getNumberOfPages();
                    errors = validerDokumentForPrint(pdDoc, printValideringsinnstillinger);
                }
            } else {
                throw new IllegalArgumentException(
                        "Unknown " + PdfValidateStrategy.class.getSimpleName() + ": " + readStrategy);
            }
        } catch (Exception e) {
            errors = asList(PdfValidationError.PDF_PARSE_ERROR);
            LOG.info("PDF-en kunne ikke parses. (" + e.getMessage() + ")");
            LOG.debug(e.getMessage(), e);
        }

        return new PdfValidationResult(errors, antallSider);
    } finally {
        IOUtils.closeQuietly(pdfStream);
    }
}

From source file:nominas.sei.form.Principal.java

/**
 * Reads the PDF at {@code rutaEntrada}, sorts its pages by the name extracted
 * from each page's text, and saves the reordered pages to {@code rutaSalida}.
 *
 * <p>For every page the text of a rectangular region is extracted; the name is
 * taken from the second text line, from character 28 up to 10 characters
 * before the end of that line. Pages are ordered by the natural ordering of
 * {@code PaginaNomina}.
 *
 * <p>Both documents are closed in {@code finally} blocks, so they are released
 * even when extraction, sorting or saving fails. Any exception is caught and
 * only its message is printed to standard output.
 *
 * @param rutaEntrada path of the input PDF
 * @param rutaSalida path the reordered PDF is written to
 */
private void ordenaNominas(String rutaEntrada, String rutaSalida) {
    List<PaginaNomina> paginasNomina = new ArrayList<PaginaNomina>();

    try {
        PDDocument pd = PDDocument.load(rutaEntrada); // load the input PDF
        try {
            List<?> pages = pd.getDocumentCatalog().getAllPages(); // all pages of the input file
            for (int i = 0; i < pages.size(); i++) {
                PDPage page = (PDPage) pages.get(i);
                PageFormat pageFormat = pd.getPageFormat(0); // page format of the first page
                // NOTE(review): the region width is taken from the page HEIGHT, as in the
                // original code — confirm whether getWidth() was intended
                int regionWidth = (int) pageFormat.getHeight();
                int regionHeight = 1024;

                PDFTextStripperByArea stripper = new PDFTextStripperByArea(); // gives access to the text
                Rectangle rect = new Rectangle(0, 0, regionWidth, regionHeight); // area searched for text
                stripper.addRegion("area1", rect); // register the region under a name
                stripper.extractRegions(page); // extract the text of the region

                String contenido = stripper.getTextForRegion("area1");
                String[] lines = contenido.split("[\\r\\n]+");
                String nombre = lines[1].substring(28, lines[1].length() - 10); // cut out the name
                paginasNomina.add(new PaginaNomina(page, nombre));
            }
            Collections.sort(paginasNomina);

            // Create a new empty document and add the pages in sorted order
            PDDocument document = new PDDocument();
            try {
                for (PaginaNomina nomina : paginasNomina) {
                    System.out.println(nomina.getNombre());
                    document.addPage(nomina.getPagina());
                }
                // Save the newly created document
                document.save(rutaSalida);
            } finally {
                // the output document is closed first; its pages still belong to the input document
                document.close();
            }
        } finally {
            pd.close(); // close the input document
        }
    } catch (Exception e) {
        System.out.println(e.getMessage());
    }
}