Example usage for org.apache.pdfbox.pdmodel PDDocument load

Introduction

On this page you can find example usages of org.apache.pdfbox.pdmodel.PDDocument.load.

Prototype

public static PDDocument load(byte[] input) throws IOException

Source Link

Document

Parses a PDF.

Usage

From source file:net.sourceforge.docfetcher.model.parse.PagingPdfParser.java

License:Open Source License

/**
 * Loads the PDF referenced by {@code file} and writes its text to {@code writer}
 * using a {@link PagingStripper} with forced parsing and position sorting enabled.
 * Both {@code file} and {@code writer} are defined outside this method.
 *
 * @throws ParseException wrapping any {@link Exception} raised while loading or extracting
 * @throws CheckedOutOfMemoryError wrapping an {@link OutOfMemoryError} raised while parsing
 */
public final void run() throws ParseException, CheckedOutOfMemoryError {
    PDDocument pdf = null;
    try {
        pdf = PDDocument.load(file);

        final PagingStripper pagingStripper = new PagingStripper();
        pagingStripper.setForceParsing(true);
        pagingStripper.setSortByPosition(true);
        pagingStripper.writeText(pdf, writer);
    } catch (OutOfMemoryError outOfMemory) {
        // Errors and Exceptions are disjoint, so the order of these two handlers is irrelevant
        throw new CheckedOutOfMemoryError(outOfMemory);
    } catch (Exception cause) {
        throw new ParseException(cause);
    } finally {
        // Delegates closing (including the null case) to the shared helper
        PdfParser.close(pdf);
    }
}

From source file:net.sourceforge.docfetcher.model.parse.TestParseFromZip.java

License:Open Source License

/**
 * Checks that the text of a multi-page PDF can be extracted when the PDF is
 * handed over as an input stream by {@code ZipAndRun}.
 */
@Test
public void testZippedPdf() throws Exception {
    new ZipAndRun(TestFiles.multi_page_pdf) {
        protected void handleInputStream(InputStream in) throws Exception {
            PDDocument pdfDoc = PDDocument.load(in);
            try {
                PDFTextStripper stripper = new PDFTextStripper();
                StringWriter writer = new StringWriter();
                stripper.setForceParsing(true);
                stripper.setSortByPosition(true);
                stripper.writeText(pdfDoc, writer); // Will handle encryption with empty password
                PDDocumentInformation pdInfo = pdfDoc.getDocumentInformation();
                ParseResult result = new ParseResult(writer.getBuffer()).setTitle(pdInfo.getTitle())
                        .addAuthor(pdInfo.getAuthor()).addMiscMetadata(pdInfo.getSubject())
                        .addMiscMetadata(pdInfo.getKeywords());
                String expectedContents = Util.join(Util.LS, "page 1", "page 2", "page 3");
                String actualContents = result.getContent().toString().trim();
                assertEquals(expectedContents, actualContents);
            } finally {
                // Release the document even when extraction or the assertion fails
                pdfDoc.close();
            }
        }
    };
}

From source file:net.sourceforge.docfetcher.parse.PDFParser.java

License:Open Source License

/**
 * Returns the text of the given PDF file as extracted by {@link PDFTextStripper}.
 * Encrypted documents are opened with an empty password first.
 *
 * @param file the PDF file to read
 * @return the extracted text
 * @throws ParseException if the document cannot be decrypted with the empty password,
 *         or if reading the file fails with an {@link IOException}
 */
public String renderText(File file) throws ParseException {
    PDDocument document = null;
    try {
        document = PDDocument.load(file);

        if (document.isEncrypted()) {
            try {
                document.openProtection(new StandardDecryptionMaterial(""));
            } catch (Exception decryptFailure) {
                throw new ParseException(file, Msg.no_extraction_permission.value());
            }
        }

        final PDFTextStripper textStripper = new PDFTextStripper();
        final StringWriter text = new StringWriter();
        textStripper.writeText(document, text);
        return text.toString();
    } catch (IOException unreadable) {
        throw new ParseException(file, Msg.file_not_readable.value());
    } finally {
        if (document != null) {
            try {
                document.close();
            } catch (IOException closeFailure) {
                // A failed close must not replace the result or the primary exception
                closeFailure.printStackTrace();
            }
        }
    }
}

From source file:net.sourceforge.docfetcher.parse.PDFParser.java

License:Open Source License

/**
 * Parses a PDF file into a {@link Document} holding its text, title and author.
 * The title, author, subject and keywords from the document information are
 * additionally appended (space-separated) to the text.
 *
 * @param file the PDF file to parse
 * @return the document built from the file, its title, the collected text and its author
 * @throws ParseException if the document cannot be decrypted with the empty password,
 *         or if reading the file fails with an {@link IOException}
 */
public Document parse(File file) throws ParseException {
    PDDocument document = null;
    try {
        document = PDDocument.load(file);

        // Encrypted files are opened with an empty password
        if (document.isEncrypted()) {
            try {
                document.openProtection(new StandardDecryptionMaterial(""));
            } catch (Exception decryptFailure) {
                throw new ParseException(file, Msg.no_extraction_permission.value());
            }
        }

        // Extract the body text
        final PDFTextStripper textStripper = new PDFTextStripper();
        final StringWriter text = new StringWriter();
        textStripper.writeText(document, text);
        DocFetcher.getInstance().setExceptionHandlerEnabled(true);

        // Read the metadata and append every present field to the text
        final PDDocumentInformation info = document.getDocumentInformation();
        final String title = info.getTitle();
        final String author = info.getAuthor();
        final String[] fields = { title, author, info.getSubject(), info.getKeywords() };
        for (int i = 0; i < fields.length; i++) {
            if (fields[i] != null) {
                text.append(" ").append(fields[i]); //$NON-NLS-1$
            }
        }
        return new Document(file, title, text.getBuffer()).addAuthor(author);
    } catch (IOException unreadable) {
        throw new ParseException(file, Msg.file_not_readable.value());
    } finally {
        if (document != null) {
            try {
                document.close();
            } catch (IOException closeFailure) {
                closeFailure.printStackTrace();
            }
        }
    }
}

From source file:net.yacy.cora.util.Html2Image.java

License:Open Source License

/**
 * convert a pdf (first page) to an image. proper values are i.e. width = 1024, height = 1024, density = 300, quality = 75
 * using internal pdf library or external command line tool on linux or mac
 * @param pdf input pdf file/*w  ww .j a va  2  s  . c  o  m*/
 * @param image output jpg file
 * @param width
 * @param height
 * @param density (dpi)
 * @param quality
 * @return
 */
public static boolean pdf2image(File pdf, File image, int width, int height, int density, int quality) {
    final File convert = convertMac1.exists() ? convertMac1
            : convertMac2.exists() ? convertMac2 : convertDebian;

    // convert pdf to jpg using internal pdfbox capability
    if (OS.isWindows || !convert.exists()) {
        try {
            PDDocument pdoc = PDDocument.load(pdf);
            BufferedImage bi = new PDFRenderer(pdoc).renderImageWithDPI(0, density, ImageType.RGB);

            return ImageIO.write(bi, "jpg", image);

        } catch (IOException ex) {
        }
    }

    // convert on mac or linux using external command line utility
    try {
        // i.e. convert -density 300 -trim yacy.pdf[0] -trim -resize 1024x -crop x1024+0+0 -quality 75% yacy-convert-300.jpg
        // note: both -trim are necessary, otherwise it is trimmed only on one side. The [0] selects the first page of the pdf
        String command = convert.getAbsolutePath() + " -density " + density + " -trim " + pdf.getAbsolutePath()
                + "[0] -trim -resize " + width + "x -crop x" + height + "+0+0 -quality " + quality + "% "
                + image.getAbsolutePath();
        List<String> message = OS.execSynchronous(command);
        if (image.exists())
            return true;
        ConcurrentLog.warn("Html2Image", "failed to create image with command: " + command);
        for (String m : message)
            ConcurrentLog.warn("Html2Image", ">> " + m);

        // another try for mac: use Image Events using AppleScript in osacript commands...
        // the following command overwrites a pdf with an png, so we must make a copy first
        if (!OS.isMacArchitecture)
            return false;
        File pngFile = new File(pdf.getAbsolutePath() + ".tmp.pdf");
        org.apache.commons.io.FileUtils.copyFile(pdf, pngFile);
        String[] commandx = { "osascript", "-e", "set ImgFile to \"" + pngFile.getAbsolutePath() + "\"", "-e",
                "tell application \"Image Events\"", "-e", "set Img to open file ImgFile", "-e",
                "save Img as PNG", "-e", "end tell" };
        //ConcurrentLog.warn("Html2Image", "failed to create image with command: " + commandx);
        message = OS.execSynchronous(commandx);
        for (String m : message)
            ConcurrentLog.warn("Html2Image", ">> " + m);
        // now we must read and convert this file to a jpg with the target size 1024x1024
        try {
            File newPngFile = new File(pngFile.getAbsolutePath() + ".png");
            pngFile.renameTo(newPngFile);
            Image img = ImageParser.parse(pngFile.getAbsolutePath(), FileUtils.read(newPngFile));
            final Image scaled = img.getScaledInstance(width, height, Image.SCALE_AREA_AVERAGING);
            final MediaTracker mediaTracker = new MediaTracker(new Container());
            mediaTracker.addImage(scaled, 0);
            try {
                mediaTracker.waitForID(0);
            } catch (final InterruptedException e) {
            }
            // finally write the image
            final BufferedImage bi = new BufferedImage(width, height, BufferedImage.TYPE_INT_RGB);
            bi.createGraphics().drawImage(scaled, 0, 0, width, height, null);
            ImageIO.write(bi, "jpg", image);
            newPngFile.delete();
            return image.exists();
        } catch (IOException e) {
            ConcurrentLog.logException(e);
            return false;
        }
    } catch (IOException e) {
        e.printStackTrace();
        return false;
    }
}

From source file:net.yacy.document.parser.pdfParser.java

License:Open Source License

/**
 * Parses a pdf read from {@code source} into one or more {@link Document}s.
 * <p>
 * Title, subject, author, publisher, keywords and modification date are read from the document
 * information. If {@code individualPages} is set, one document per page is produced, each with a
 * virtual url carrying the page number as a query argument; otherwise a single document for the
 * whole pdf is produced, where text extraction beyond page 3 is limited to about three seconds.
 * <p>
 * NOTE(review): every {@link Throwable} raised while extracting links and text is swallowed
 * (including an {@link InterruptedException} from the join), in which case the returned array
 * may be {@code null}.
 *
 * @param location url of the pdf; used for the result documents and as title fallback
 * @param mimeType mime type handed to the created documents
 * @param charset not used by this implementation
 * @param scraper not used by this implementation
 * @param timezoneOffset not used by this implementation
 * @param source stream the pdf is loaded from
 * @return the parsed documents
 * @throws Parser.Failure if there is not enough memory, the pdf cannot be loaded, or it is
 *         encrypted and cannot be opened for content extraction
 * @throws InterruptedException declared by the signature
 */
@Override
public Document[] parse(final AnchorURL location, final String mimeType, final String charset,
        final VocabularyScraper scraper, final int timezoneOffset, final InputStream source)
        throws Parser.Failure, InterruptedException {

    // check memory for parser
    if (!MemoryControl.request(200 * 1024 * 1024, false))
        throw new Parser.Failure("Not enough Memory available for pdf parser: " + MemoryControl.available(),
                location);

    // create a pdf parser
    PDDocument pdfDoc;
    try {
        Thread.currentThread().setPriority(Thread.MIN_PRIORITY); // the pdfparser is a big pain
        pdfDoc = PDDocument.load(source);
    } catch (final IOException e) {
        throw new Parser.Failure(e.getMessage(), location);
    } finally {
        Thread.currentThread().setPriority(Thread.NORM_PRIORITY);
    }

    // encrypted documents are opened with an empty password; on any failure the document is
    // closed (best effort) before the failure is reported
    if (pdfDoc.isEncrypted()) {
        try {
            pdfDoc.openProtection(new StandardDecryptionMaterial(""));
        } catch (final BadSecurityHandlerException e) {
            try {
                pdfDoc.close();
            } catch (final IOException ee) {
            }
            throw new Parser.Failure("Document is encrypted (1): " + e.getMessage(), location);
        } catch (final IOException e) {
            try {
                pdfDoc.close();
            } catch (final IOException ee) {
            }
            throw new Parser.Failure("Document is encrypted (2): " + e.getMessage(), location);
        } catch (final CryptographyException e) {
            try {
                pdfDoc.close();
            } catch (final IOException ee) {
            }
            throw new Parser.Failure("Document is encrypted (3): " + e.getMessage(), location);
        }
        final AccessPermission perm = pdfDoc.getCurrentAccessPermission();
        if (perm == null || !perm.canExtractContent()) {
            try {
                pdfDoc.close();
            } catch (final IOException ee) {
            }
            throw new Parser.Failure("Document is encrypted and cannot be decrypted", location);
        }
    }

    // extracting some metadata
    PDDocumentInformation info = pdfDoc.getDocumentInformation();
    String docTitle = null, docSubject = null, docAuthor = null, docPublisher = null, docKeywordStr = null;
    Date docDate = new Date();
    if (info != null) {
        docTitle = info.getTitle();
        docSubject = info.getSubject();
        docAuthor = info.getAuthor();
        docPublisher = info.getProducer();
        if (docPublisher == null || docPublisher.isEmpty())
            docPublisher = info.getCreator();
        docKeywordStr = info.getKeywords();
        try {
            if (info.getModificationDate() != null)
                docDate = info.getModificationDate().getTime();
        } catch (IOException e) {
            // an unreadable modification date keeps the current date as default
        }
    }
    info = null;

    // title fallbacks: file name from the url first, then the subject
    if (docTitle == null || docTitle.isEmpty()) {
        docTitle = MultiProtocolURL.unescape(location.getFileName());
    }
    if (docTitle == null) {
        docTitle = docSubject;
    }
    String[] docKeywords = null;
    if (docKeywordStr != null) {
        docKeywords = docKeywordStr.split(" |,");
    }

    Collection<AnchorURL>[] pdflinks = null;
    Document[] result = null;
    try {
        // get the links
        pdflinks = extractPdfLinks(pdfDoc);

        // get the fulltext (either per document or for each page)
        final PDFTextStripper stripper = new PDFTextStripper("UTF-8");

        if (individualPages) {
            // this is a hack which stores individual pages of the source pdf into individual index documents
            // the new documents will get a virtual link with a post argument page=X appended to the original url

            // collect text
            int pagecount = pdfDoc.getNumberOfPages();
            String[] pages = new String[pagecount];
            for (int page = 1; page <= pagecount; page++) {
                stripper.setStartPage(page);
                stripper.setEndPage(page);
                pages[page - 1] = stripper.getText(pdfDoc);
            }

            // create individual documents for each page
            assert pages.length == pdflinks.length : "pages.length = " + pages.length + ", pdflinks.length = "
                    + pdflinks.length;
            result = new Document[Math.min(pages.length, pdflinks.length)];
            String loc = location.toNormalform(true);
            for (int page = 0; page < result.length; page++) {
                result[page] = new Document(
                        new AnchorURL(loc + (loc.indexOf('?') > 0 ? '&' : '?') + individualPagePropertyname
                                + '=' + (page + 1)), // these are virtual new pages; we cannot combine them with '#' as that would be removed when computing the urlhash
                        mimeType, "UTF-8", this, null, docKeywords, singleList(docTitle), docAuthor,
                        docPublisher, null, null, 0.0f, 0.0f,
                        // bounds check uses >= so that an index equal to the length is rejected as well
                        pages == null || page >= pages.length ? new byte[0] : UTF8.getBytes(pages[page]),
                        pdflinks == null || page >= pdflinks.length ? null : pdflinks[page], null, null, false,
                        docDate);
            }
        } else {
            // collect the whole text at once
            final CharBuffer writer = new CharBuffer(odtParser.MAX_DOCSIZE);
            byte[] contentBytes = new byte[0];
            stripper.setEndPage(3); // get first 3 pages (always)
            writer.append(stripper.getText(pdfDoc));
            contentBytes = writer.getBytes(); // remember text in case of interrupting thread

            if (pdfDoc.getNumberOfPages() > 3) { // spare creating/starting thread if all pages read
                stripper.setStartPage(4); // continue with page 4 (terminated, resulting in no text)
                stripper.setEndPage(Integer.MAX_VALUE); // set to default
                // we start the pdf parsing in a separate thread to ensure that it can be terminated
                final PDDocument pdfDocC = pdfDoc;
                final Thread t = new Thread() {
                    @Override
                    public void run() {
                        Thread.currentThread().setName("pdfParser.getText:" + location);
                        try {
                            writer.append(stripper.getText(pdfDocC));
                        } catch (final Throwable e) {
                            // best effort: text of the remaining pages is simply missing
                        }
                    }
                };
                t.start();
                t.join(3000); // pdfbox likes to forget to terminate ... (quite often)
                if (t.isAlive())
                    t.interrupt();
            }
            contentBytes = writer.getBytes(); // get final text before closing writer

            Collection<AnchorURL> pdflinksCombined = new HashSet<AnchorURL>();
            for (Collection<AnchorURL> pdflinksx : pdflinks)
                if (pdflinksx != null)
                    pdflinksCombined.addAll(pdflinksx);
            result = new Document[] { new Document(location, mimeType, "UTF-8", this, null, docKeywords,
                    singleList(docTitle), docAuthor, docPublisher, null, null, 0.0f, 0.0f, contentBytes,
                    pdflinksCombined, null, null, false, docDate) };
        }
    } catch (final Throwable e) {
        // NOTE(review): failures are deliberately not reported here; result stays as it was
        // (possibly null). This also swallows an InterruptedException thrown by t.join above.
    } finally {
        try {
            pdfDoc.close();
        } catch (final Throwable e) {
        }
    }

    // clear resources in pdfbox. they say that is resolved but it's not. see:
    // https://issues.apache.org/jira/browse/PDFBOX-313
    // https://issues.apache.org/jira/browse/PDFBOX-351
    // https://issues.apache.org/jira/browse/PDFBOX-441
    // pdfbox still generates an enormous number of object allocations and does not delete them
    // the following objects are statically stored and never flushed:
    // COSFloat, COSArray, COSInteger, COSObjectKey, COSObject, COSDictionary,
    // COSStream, COSString, COSName, COSDocument, COSInteger[], COSNull
    // the great number of these objects can easily be seen in Java Visual VM
    // we try to get them out of memory here by forced clear calls
    pdfDoc = null;
    clean_up_idiotic_PDFParser_font_cache_which_eats_up_tons_of_megabytes();

    return result;
}

From source file:neuralclassification.Classificator.java

/**
 * Reads the text of the PDF {@code name} located under {@code filepath}.
 *
 * @param filepath directory part of the path; joined to {@code name} with "/"
 * @param name file name of the PDF
 * @return the text returned by {@link PDFTextStripper#getText}
 * @throws RuntimeException wrapping any {@link IOException} raised while loading,
 *         extracting or closing the document
 */
String readText(String filepath, String name) {
    final File source = new File(filepath + "/" + name);
    PDDocument document = null;
    try {
        document = PDDocument.load(source);
        final PDFTextStripper textStripper = new PDFTextStripper();
        return textStripper.getText(document);
    } catch (IOException readFailure) {
        throw new RuntimeException(readFailure);
    } finally {
        if (document != null) {
            try {
                document.close();
            } catch (IOException closeFailure) {
                throw new RuntimeException(closeFailure);
            }
        }
    }
}

From source file:neuralclassification.Trainer.java

/**
 * Reads the text of the PDF {@code name} located under {@code filepath}, where
 * {@code filepath} is defined outside this method.
 *
 * @param name file name of the PDF; joined to {@code filepath} with "/"
 * @return the text returned by {@link PDFTextStripper#getText}
 * @throws RuntimeException wrapping any {@link IOException} raised while loading,
 *         extracting or closing the document
 */
String readText(String name) {
    PDDocument document = null;
    try {
        final File source = new File(filepath + "/" + name);
        document = PDDocument.load(source);
        return new PDFTextStripper().getText(document);
    } catch (IOException readFailure) {
        throw new RuntimeException(readFailure);
    } finally {
        if (document != null) {
            try {
                document.close();
            } catch (IOException closeFailure) {
                throw new RuntimeException(closeFailure);
            }
        }
    }
}

From source file:no.digipost.print.validate.PdfValidator.java

License:Apache License

/**
 * @param pdfStream the input stream for reading the PDF. It will be closed before returning from
 *                  this method/*from  w  w w.  jav  a 2  s.c  o  m*/
 * @param readStrategy decides if PDF is completely read into memory or not
 */
private PdfValidationResult validerForPrint(InputStream pdfStream,
        PdfValidationSettings printValideringsinnstillinger, PdfValidateStrategy readStrategy) {
    int antallSider = -1;
    try {
        List<PdfValidationError> errors;
        try {
            if (readStrategy == NON_SEQUENTIALLY) {
                try (EnhancedNonSequentialPDFParser dpostNonSequentialPDFParser = new EnhancedNonSequentialPDFParser(
                        pdfStream)) {
                    antallSider = dpostNonSequentialPDFParser.getNumberOfPages();
                    errors = validerStreamForPrint(dpostNonSequentialPDFParser, printValideringsinnstillinger);
                }
            } else if (readStrategy == FULLY_IN_MEMORY) {
                try (PDDocument pdDoc = PDDocument.load(pdfStream)) {
                    antallSider = pdDoc.getNumberOfPages();
                    errors = validerDokumentForPrint(pdDoc, printValideringsinnstillinger);
                }
            } else {
                throw new IllegalArgumentException(
                        "Unknown " + PdfValidateStrategy.class.getSimpleName() + ": " + readStrategy);
            }
        } catch (Exception e) {
            errors = asList(PdfValidationError.PDF_PARSE_ERROR);
            LOG.info("PDF-en kunne ikke parses. (" + e.getMessage() + ")");
            LOG.debug(e.getMessage(), e);
        }

        return new PdfValidationResult(errors, antallSider);
    } finally {
        IOUtils.closeQuietly(pdfStream);
    }
}

From source file:nominas.sei.form.Principal.java

/**
 * Reads the PDF at {@code rutaEntrada}, extracts a name from the text of every page, sorts the
 * pages with {@link Collections#sort} (the natural ordering of {@code PaginaNomina}) and saves
 * them as a new PDF at {@code rutaSalida}.
 * <p>
 * Any exception is reported by printing its message to standard output; both documents are
 * closed on every exit path.
 *
 * @param rutaEntrada path of the PDF to read
 * @param rutaSalida path the reordered PDF is saved to
 */
private void ordenaNominas(String rutaEntrada, String rutaSalida) {
    List<PaginaNomina> paginasNomina = new ArrayList<PaginaNomina>();

    PDDocument pd = null;
    PDDocument document = null;
    try {
        pd = PDDocument.load(rutaEntrada); // load the source PDF
        List<?> pages = pd.getDocumentCatalog().getAllPages(); // all pages of the file
        for (Object element : pages) {
            PDPage page = (PDPage) element;
            PageFormat pageFormat = pd.getPageFormat(0); // format of the first page
            // NOTE(review): the region width is taken from the page HEIGHT - confirm this is intended
            int width = (int) pageFormat.getHeight();
            int height = 1024;

            PDFTextStripperByArea stripper = new PDFTextStripperByArea(); // gives access to the text
            Rectangle rect = new Rectangle(0, 0, width, height); // area in which text is searched
            stripper.addRegion("area1", rect); // register the region under a name
            stripper.extractRegions(page); // extract the text of the region

            String contenido = stripper.getTextForRegion("area1");
            String[] lines = contenido.split("[\\r\\n]+");
            // the name is cut out of the second line at fixed offsets
            String nombre = lines[1].substring(28, lines[1].length() - 10);
            paginasNomina.add(new PaginaNomina(page, nombre));
        }
        Collections.sort(paginasNomina);

        // Create a new empty document and add the pages in sorted order
        document = new PDDocument();
        for (PaginaNomina nomina : paginasNomina) {
            System.out.println(nomina.getNombre());
            document.addPage(nomina.getPagina());
        }
        // Save the newly created document
        document.save(rutaSalida);
    } catch (Exception e) {
        System.out.println(e.getMessage());
    } finally {
        // Close both documents even when loading, extraction or saving failed
        if (document != null) {
            try {
                document.close();
            } catch (Exception e) {
                System.out.println(e.getMessage());
            }
        }
        if (pd != null) {
            try {
                pd.close();
            } catch (Exception e) {
                System.out.println(e.getMessage());
            }
        }
    }
}