Example usage for org.apache.pdfbox.pdmodel PDDocument getNumberOfPages

Introduction

On this page you can find example usages of org.apache.pdfbox.pdmodel PDDocument getNumberOfPages.

Prototype

public int getNumberOfPages()

Source Link

Document

This will return the total page count of the PDF document.

Usage

From source file:mj.ocraptor.extraction.tika.parser.pdf.PDFParser.java

License:Apache License

private void extractMetadata(PDDocument document, Metadata metadata) throws TikaException {
    PDDocumentInformation info = document.getDocumentInformation();
    metadata.set(PagedText.N_PAGES, document.getNumberOfPages());
    addMetadata(metadata, TikaCoreProperties.TITLE, info.getTitle());
    addMetadata(metadata, TikaCoreProperties.CREATOR, info.getAuthor());
    addMetadata(metadata, TikaCoreProperties.CREATOR_TOOL, info.getCreator());
    addMetadata(metadata, TikaCoreProperties.KEYWORDS, info.getKeywords());
    addMetadata(metadata, "producer", info.getProducer());
    // TODO: Move to description in Tika 2.0
    addMetadata(metadata, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT, info.getSubject());
    addMetadata(metadata, "trapped", info.getTrapped());
    try {//from   w  ww.  ja  v  a2s. c o m
        // TODO Remove these in Tika 2.0
        addMetadata(metadata, "created", info.getCreationDate());
        addMetadata(metadata, TikaCoreProperties.CREATED, info.getCreationDate());
    } catch (IOException e) {
        // Invalid date format, just ignore
    }
    try {
        Calendar modified = info.getModificationDate();
        addMetadata(metadata, Metadata.LAST_MODIFIED, modified);
        addMetadata(metadata, TikaCoreProperties.MODIFIED, modified);
    } catch (IOException e) {
        // Invalid date format, just ignore
    }

    // All remaining metadata is custom
    // Copy this over as-is
    List<String> handledMetadata = Arrays.asList(new String[] { "Author", "Creator", "CreationDate", "ModDate",
            "Keywords", "Producer", "Subject", "Title", "Trapped" });
    for (COSName key : info.getDictionary().keySet()) {
        String name = key.getName();
        if (!handledMetadata.contains(name)) {
            addMetadata(metadata, name, info.getDictionary().getDictionaryObject(key));
        }
    }
}

From source file:model.util.pdf.PDFUtils.java

License:Apache License

/**
 * Infamous main method./*  w  w  w  .ja v  a2  s  .  c o m*/
 * Adapted by Julius Huelsmann.
 * 
 * @author Ben Litchfield
 *
 * @param args Command line arguments, should be one and a reference to a file.
 *
 * @throws IOException If there is an error parsing the document.
 */
public static BufferedImage pdf2image(final String[] _parameters) throws IOException {

    // suppress the Dock icon on OS X if called from outside
    // the paint - project.
    System.setProperty("apple.awt.UIElement", "true");

    String password = "";
    String pdfFile = null;
    String outputPrefix = null;
    String imageFormat = "jpg";
    int startPage = 1;
    int endPage = Integer.MAX_VALUE;
    String color = "rgb";
    int dpi;
    float cropBoxLowerLeftX = 0;
    float cropBoxLowerLeftY = 0;
    float cropBoxUpperRightX = 0;
    float cropBoxUpperRightY = 0;
    boolean showTime = false;
    try {
        dpi = Toolkit.getDefaultToolkit().getScreenResolution();
    } catch (HeadlessException e) {
        dpi = 96;
    }
    for (int i = 0; i < _parameters.length; i++) {
        if (_parameters[i].equals(PASSWORD)) {
            i++;
            if (i >= _parameters.length) {
                usage();
            }
            password = _parameters[i];
        } else if (_parameters[i].equals(START_PAGE)) {
            i++;
            if (i >= _parameters.length) {
                usage();
            }
            startPage = Integer.parseInt(_parameters[i]);
        } else if (_parameters[i].equals(END_PAGE)) {
            i++;
            if (i >= _parameters.length) {
                usage();
            }
            endPage = Integer.parseInt(_parameters[i]);
        } else if (_parameters[i].equals(PAGE)) {
            i++;
            if (i >= _parameters.length) {
                usage();
            }
            startPage = Integer.parseInt(_parameters[i]);
            endPage = Integer.parseInt(_parameters[i]);
        } else if (_parameters[i].equals(IMAGE_TYPE) || _parameters[i].equals(FORMAT)) {
            i++;
            imageFormat = _parameters[i];
        } else if (_parameters[i].equals(OUTPUT_PREFIX) || _parameters[i].equals(PREFIX)) {
            i++;
            outputPrefix = _parameters[i];
        } else if (_parameters[i].equals(COLOR)) {
            i++;
            color = _parameters[i];
        } else if (_parameters[i].equals(RESOLUTION) || _parameters[i].equals(DPI)) {
            i++;
            dpi = Integer.parseInt(_parameters[i]);
        } else if (_parameters[i].equals(CROPBOX)) {
            i++;
            cropBoxLowerLeftX = Float.valueOf(_parameters[i]);
            i++;
            cropBoxLowerLeftY = Float.valueOf(_parameters[i]);
            i++;
            cropBoxUpperRightX = Float.valueOf(_parameters[i]);
            i++;
            cropBoxUpperRightY = Float.valueOf(_parameters[i]);
        } else if (_parameters[i].equals(TIME)) {
            showTime = true;
        } else {
            if (pdfFile == null) {
                pdfFile = _parameters[i];
            }
        }
    }
    if (pdfFile == null) {
        usage();
    } else {
        if (outputPrefix == null) {
            outputPrefix = pdfFile.substring(0, pdfFile.lastIndexOf('.'));
        }

        PDDocument document = null;
        try {
            document = PDDocument.load(new File(pdfFile), password);

            ImageType imageType = null;
            if ("bilevel".equalsIgnoreCase(color)) {
                imageType = ImageType.BINARY;
            } else if ("gray".equalsIgnoreCase(color)) {
                imageType = ImageType.GRAY;
            } else if ("rgb".equalsIgnoreCase(color)) {
                imageType = ImageType.RGB;
            } else if ("rgba".equalsIgnoreCase(color)) {
                imageType = ImageType.ARGB;
            }

            if (imageType == null) {
                System.err.println("Error: Invalid color.");
                System.exit(2);
            }

            //if a CropBox has been specified, update the CropBox:
            //changeCropBoxes(PDDocument document,float a, float b, float c,float d)
            if (cropBoxLowerLeftX != 0 || cropBoxLowerLeftY != 0 || cropBoxUpperRightX != 0
                    || cropBoxUpperRightY != 0) {
                changeCropBox(document, cropBoxLowerLeftX, cropBoxLowerLeftY, cropBoxUpperRightX,
                        cropBoxUpperRightY);
            }

            long startTime = System.nanoTime();

            // render the pages
            boolean success = true;
            endPage = Math.min(endPage, document.getNumberOfPages());
            PDFRenderer renderer = new PDFRenderer(document);
            for (int i = startPage - 1; i < endPage;) {
                BufferedImage image = renderer.renderImageWithDPI(i, dpi, imageType);
                String fileName = outputPrefix + (i + 1) + "." + imageFormat;
                success &= ImageIOUtil.writeImage(image, fileName, dpi);
                return image;
            }

            // performance stats
            long endTime = System.nanoTime();
            long duration = endTime - startTime;
            int count = 1 + endPage - startPage;
            if (showTime) {
                System.err.printf("Rendered %d page%s in %dms\n", count, count == 1 ? "" : "s",
                        duration / 1000000);
            }

            if (!success) {
                System.err.println("Error: no writer found for image format '" + imageFormat + "'");
                System.exit(1);
            }
        } finally {
            if (document != null) {
                document.close();
            }
        }
        return null;
    }
    return null;
}

From source file:net.bookinaction.TextInfoExtractor.java

License:Apache License

public void doTextPosition(String source, String coord_text, StripperParam stripperParam) throws IOException {

    String source_pdf = source;// www  .ja  v a 2 s.  c o  m
    String new_file = source.split("\\.")[0] + "-new.pdf";

    PDDocument document = PDDocument.load(new File(source_pdf));

    PrintWriter writer = new PrintWriter(new File(coord_text));

    //s.recordHeader(writer, source_pdf, document.getNumberOfPages(), sParam);

    for (int i = 0; i < document.getNumberOfPages(); i++) {
        getTextPositionFromPage(document, stripperParam, i + 1, writer, true);

    }

    if (document != null) {
        document.save(new_file);
        document.close();
    }

    if (writer != null)
        writer.close();

}

From source file:net.dstserbak.dataindexer.tokenizer.PDFTokenizer.java

/**
 * Reads all pages of the PDF file and splits text to words, which are
 * returned as TokensMap object.//from ww w. jav  a 2s.c o  m
 * @param pd PDDocument object that is created from stream
 * @return Map that contains tokens, which are belong to PDF document
 * @throws IOException If an I/O error occurs
 */
private static TokensMap tokenizeInput(PDDocument pd) throws IOException {
    int numberOfPages = pd.getNumberOfPages();
    if (pd.isEncrypted()) {
        log.log(Level.SEVERE, "PDF is ecrypted");
        return null;
    } else if (numberOfPages < 1) {
        log.log(Level.SEVERE, "PDF number of pages is less than 1");
        return null;
    }

    PDFTextStripper stripper = new PDFTextStripper();
    stripper.setStartPage(1);
    stripper.setEndPage(numberOfPages);
    StringTokenizer st = new StringTokenizer(stripper.getText(pd));
    TokensMap tokensMap = new TokensMap();
    TokenizerUtils.addTokensToMap(tokensMap, st);
    return tokensMap;
}

From source file:net.homeip.donaldm.doxmentor4j.indexers.PDFIndexer.java

License:Open Source License

@Override
public Reader getText(URI uri, int page, StringBuilder title)
        throws FileNotFoundException, MalformedURLException, IOException
//-----------------------------------------------------------------------------------------
{
    FileWriter writer = null;//www . j  ava 2  s  .com
    PDDocument pdf = null;
    PDFTextStripper stripper = null;
    java.io.File tmpPdf = null;
    try {
        tmpPdf = Utils.uri2File(uri);
        if (tmpPdf != null)
            pdf = PDDocument.load(tmpPdf.getAbsolutePath(), true);
        else
            pdf = PDDocument.load(uri.toURL(), true);
        PDDocumentInformation pdfInfo = pdf.getDocumentInformation();
        String s = pdfInfo.getTitle();
        if ((s == null) || (s.length() == 0))
            s = uri.getPath();
        if (title != null)
            title.append(s);
        stripper = new PDFTextStripper();
        if (page >= 0) {
            stripper.setStartPage(page);
            stripper.setEndPage(page);
        } else {
            stripper.setStartPage(1);
            stripper.setEndPage(pdf.getNumberOfPages());
        }
        java.io.File f = java.io.File.createTempFile("pdf", ".tmp");
        writer = new FileWriter(f);
        stripper.writeText(pdf, writer);
        try {
            writer.close();
            writer = null;
        } catch (Exception _e) {
        }
        stripper.resetEngine();
        return new FileReader(f);
    } finally {
        if (stripper != null)
            try {
                stripper.resetEngine();
            } catch (Exception _e) {
            }
        if (pdf != null)
            try {
                pdf.close();
            } catch (Exception _e) {
            }
        if (writer != null)
            try {
                writer.close();
            } catch (Exception _e) {
            }
        if ((tmpPdf != null)
                && (tmpPdf.getAbsolutePath().toLowerCase().indexOf(System.getProperty("java.io.tmpdir")) >= 0))
            tmpPdf.delete();
    }
}

From source file:net.homeip.donaldm.doxmentor4j.indexers.PDFIndexer.java

License:Open Source License

@Override
public long index(String href, URI uri, boolean followLinks, Object... extraParams) throws IOException
//-----------------------------------------------------------------------------------------------------
{
    if (m_indexWriter == null) {
        logger.error("PDFIndexer: index writer is null");
        return -1;
    }// w w  w . j  a v a 2  s  . c o  m
    PDDocument pdf = null;
    PDFTextStripper stripper = null;
    Reader reader = null;
    Writer writer = null;
    java.io.File tmpPdf = null;
    try {
        tmpPdf = Utils.uri2File(uri);
        if (tmpPdf != null)
            pdf = PDDocument.load(tmpPdf.getAbsolutePath(), true);
        else
            pdf = PDDocument.load(uri.toURL(), true);
        PDDocumentInformation pdfInfo = pdf.getDocumentInformation();
        String title = pdfInfo.getTitle();
        if ((title == null) || (title.isEmpty()))
            title = uri.getPath();
        stripper = new PDFTextStripper();
        int noPages = pdf.getNumberOfPages();
        stripper.setSuppressDuplicateOverlappingText(false);
        if (noPages != PDDocument.UNKNOWN_NUMBER_OF_PAGES) {
            for (int page = 1; page <= noPages; page++) {
                stripper.setStartPage(page);
                stripper.setEndPage(page);
                writer = new StringWriter();
                stripper.writeText(pdf, writer);
                reader = new StringReader(writer.toString());
                Document doc = new Document();
                doc.add(new Field("path", href, Field.Store.YES, Field.Index.NO));
                doc.add(new Field("title", title.toString(), Field.Store.YES, Field.Index.ANALYZED));
                doc.add(new Field("contents", reader, Field.TermVector.WITH_POSITIONS_OFFSETS));
                doc.add(new Field("page", Integer.toString(page), Field.Store.YES, Field.Index.NO));
                if (addDocument(doc))
                    AjaxIndexer.incrementCount();
                try {
                    writer.close();
                    writer = null;
                } catch (Exception _e) {
                }
                try {
                    reader.close();
                    reader = null;
                } catch (Exception _e) {
                }
                if ((page % 50) == 0) {
                    try {
                        System.runFinalization();
                        System.gc();
                    } catch (Exception _e) {
                    }
                }
            }
        } else {
            java.io.File f = java.io.File.createTempFile("pdf", ".tmp");
            writer = new FileWriter(f);
            stripper.writeText(pdf, writer);
            try {
                writer.close();
                writer = null;
            } catch (Exception _e) {
            }
            reader = new FileReader(f);
            Document doc = new Document();
            doc.add(new Field("path", href, Field.Store.YES, Field.Index.NO));
            doc.add(new Field("title", title.toString(), Field.Store.YES, Field.Index.ANALYZED));
            doc.add(new Field("contents", reader, Field.TermVector.WITH_POSITIONS_OFFSETS));
            doc.add(new Field("page", "-1", Field.Store.YES, Field.Index.NO));
            if (addDocument(doc))
                AjaxIndexer.incrementCount();
            try {
                reader.close();
                reader = null;
            } catch (Exception _e) {
            }
            try {
                System.runFinalization();
                System.gc();
            } catch (Exception _e) {
            }
        }
        return 1;
    } catch (Exception e) {
        logger.error("Error indexing PDF text from " + uri.toString(), e);
        return -1;
    } finally {
        if (stripper != null)
            try {
                stripper.resetEngine();
            } catch (Exception _e) {
            }
        if (pdf != null)
            try {
                pdf.close();
            } catch (Exception _e) {
            }
        if (writer != null)
            try {
                writer.close();
            } catch (Exception _e) {
            }
        if (reader != null)
            try {
                reader.close();
            } catch (Exception _e) {
            }
        if ((tmpPdf != null)
                && (tmpPdf.getAbsolutePath().toLowerCase().indexOf(System.getProperty("java.io.tmpdir")) >= 0))
            tmpPdf.delete();
    }
}

From source file:net.sourceforge.docfetcher.model.parse.PdfParser.java

License:Open Source License

@Override
protected ParseResult parse(@NotNull InputStream in, @NotNull final ParseContext context)
        throws ParseException {
    PDDocument pdfDoc = null;
    try {//from w ww . j  a  va  2s . co m
        /*
         * TODO post-release-1.1: check if 'force' argument in PDDocument/Stripper increases
         * number of parsed PDF files
         */
        pdfDoc = PDDocument.load(in, true);
        PDDocumentInformation pdInfo;
        final int pageCount;
        try {
            pdInfo = pdfDoc.getDocumentInformation();
            pageCount = pdfDoc.getNumberOfPages();
        } catch (ClassCastException e) {
            // Bug #3529070 and #3528345
            throw new ParseException(e);
        }
        StringWriter writer = new StringWriter();

        /*
         * If the PDF file is encrypted, the PDF stripper will automatically
         * try an empty password.
         * 
         * In contrast to the paging PDF parser that is used for the
         * preview, we do not need to call setSortByPosition(true) here
         * because the extracted text will be digested by Lucene anyway.
         */
        PDFTextStripper stripper = new PDFTextStripper() {
            protected void startPage(PDPage page) throws IOException {
                context.getReporter().subInfo(getCurrentPageNo(), pageCount);
            }

            protected void endPage(PDPage page) throws IOException {
                if (context.getCancelable().isCanceled())
                    setEndPage(0);
            }
        };
        stripper.setForceParsing(true);

        try {
            stripper.writeText(pdfDoc, writer);
        } catch (RuntimeException e) {
            /*
             * PDFTextStripper.writeText can throw various
             * RuntimeExceptions, see bugs #3446010, #3448272, #3444887.
             */
            throw new ParseException(e);
        }

        return new ParseResult(writer.getBuffer()).setTitle(pdInfo.getTitle()).addAuthor(pdInfo.getAuthor())
                .addMiscMetadata(pdInfo.getSubject()).addMiscMetadata(pdInfo.getKeywords());
    } catch (IOException e) {
        if (e.getCause() instanceof CryptographyException)
            throw new ParseException(Msg.doc_pw_protected.get());
        throw new ParseException(e);
    } finally {
        close(pdfDoc);
    }
}

From source file:net.sourceforge.vaticanfetcher.model.parse.PdfParser.java

License:Open Source License

@Override
protected ParseResult parse(@NotNull InputStream in, @NotNull final ParseContext context)
        throws ParseException {
    PDDocument pdfDoc = null;
    try {//from   w  w w . j ava 2s.com
        /* TODO post-release-1.1: check if 'force' argument in PDDocument/Stripper increases number of parsed PDF files */
        pdfDoc = PDDocument.load(in, true);
        PDDocumentInformation pdInfo;
        final int pageCount;
        try {
            pdInfo = pdfDoc.getDocumentInformation();
            pageCount = pdfDoc.getNumberOfPages();
        } catch (ClassCastException e) {
            // Bug #3529070 and #3528345
            throw new ParseException(e);
        }
        StringWriter writer = new StringWriter();

        /*
         * If the PDF file is encrypted, the PDF stripper will automatically try an empty password.
         * 
         * In contrast to the paging PDF parser that is used for the preview, we do not need to call 
         * setSortByPosition(true) here because the extracted text will be digested by Lucene anyway.
         */
        PDFTextStripper stripper = new PDFTextStripper() {
            protected void startPage(PDPage page) throws IOException {
                context.getReporter().subInfo(getCurrentPageNo(), pageCount);
            }

            protected void endPage(PDPage page) throws IOException {
                if (context.getCancelable().isCanceled())
                    setEndPage(0);
            }
        };
        stripper.setForceParsing(true);

        try {
            stripper.writeText(pdfDoc, writer);
        } catch (RuntimeException e) {
            /* PDFTextStripper.writeText can throw various RuntimeExceptions, see bugs #3446010, #3448272, #3444887. */
            throw new ParseException(e);
        }

        return new ParseResult(writer.getBuffer()).setTitle(pdInfo.getTitle()).addAuthor(pdInfo.getAuthor())
                .addMiscMetadata(pdInfo.getSubject()).addMiscMetadata(pdInfo.getKeywords());
    } catch (IOException e) {
        if (e.getCause() instanceof CryptographyException)
            throw new ParseException(Msg.doc_pw_protected.get());
        throw new ParseException(e);
    } finally {
        close(pdfDoc);
    }
}

From source file:net.yacy.document.parser.pdfParser.java

License:Open Source License

@Override
public Document[] parse(final AnchorURL location, final String mimeType, final String charset,
        final VocabularyScraper scraper, final int timezoneOffset, final InputStream source)
        throws Parser.Failure, InterruptedException {

    // check memory for parser
    if (!MemoryControl.request(200 * 1024 * 1024, false))
        throw new Parser.Failure("Not enough Memory available for pdf parser: " + MemoryControl.available(),
                location);/*ww  w.j a  v  a 2  s  . c o  m*/

    // create a pdf parser
    PDDocument pdfDoc;
    //final PDFParser pdfParser;
    try {
        Thread.currentThread().setPriority(Thread.MIN_PRIORITY); // the pdfparser is a big pain
        pdfDoc = PDDocument.load(source);
        //PDFParser pdfParser = new PDFParser(source);
        //pdfParser.parse();
        //pdfDoc = pdfParser.getPDDocument();
    } catch (final IOException e) {
        throw new Parser.Failure(e.getMessage(), location);
    } finally {
        Thread.currentThread().setPriority(Thread.NORM_PRIORITY);
    }

    if (pdfDoc.isEncrypted()) {
        try {
            pdfDoc.openProtection(new StandardDecryptionMaterial(""));
        } catch (final BadSecurityHandlerException e) {
            try {
                pdfDoc.close();
            } catch (final IOException ee) {
            }
            throw new Parser.Failure("Document is encrypted (1): " + e.getMessage(), location);
        } catch (final IOException e) {
            try {
                pdfDoc.close();
            } catch (final IOException ee) {
            }
            throw new Parser.Failure("Document is encrypted (2): " + e.getMessage(), location);
        } catch (final CryptographyException e) {
            try {
                pdfDoc.close();
            } catch (final IOException ee) {
            }
            throw new Parser.Failure("Document is encrypted (3): " + e.getMessage(), location);
        }
        final AccessPermission perm = pdfDoc.getCurrentAccessPermission();
        if (perm == null || !perm.canExtractContent()) {
            try {
                pdfDoc.close();
            } catch (final IOException ee) {
            }
            throw new Parser.Failure("Document is encrypted and cannot be decrypted", location);
        }
    }

    // extracting some metadata
    PDDocumentInformation info = pdfDoc.getDocumentInformation();
    String docTitle = null, docSubject = null, docAuthor = null, docPublisher = null, docKeywordStr = null;
    Date docDate = new Date();
    if (info != null) {
        docTitle = info.getTitle();
        docSubject = info.getSubject();
        docAuthor = info.getAuthor();
        docPublisher = info.getProducer();
        if (docPublisher == null || docPublisher.isEmpty())
            docPublisher = info.getCreator();
        docKeywordStr = info.getKeywords();
        try {
            if (info.getModificationDate() != null)
                docDate = info.getModificationDate().getTime();
        } catch (IOException e) {
        }
        // unused:
        // info.getTrapped());
    }
    info = null;

    if (docTitle == null || docTitle.isEmpty()) {
        docTitle = MultiProtocolURL.unescape(location.getFileName());
    }
    if (docTitle == null) {
        docTitle = docSubject;
    }
    String[] docKeywords = null;
    if (docKeywordStr != null) {
        docKeywords = docKeywordStr.split(" |,");
    }

    Collection<AnchorURL>[] pdflinks = null;
    Document[] result = null;
    try {
        // get the links
        pdflinks = extractPdfLinks(pdfDoc);

        // get the fulltext (either per document or for each page)
        final PDFTextStripper stripper = new PDFTextStripper("UTF-8");

        if (individualPages) {
            // this is a hack which stores individual pages of the source pdf into individual index documents
            // the new documents will get a virtual link with a post argument page=X appended to the original url

            // collect text
            int pagecount = pdfDoc.getNumberOfPages();
            String[] pages = new String[pagecount];
            for (int page = 1; page <= pagecount; page++) {
                stripper.setStartPage(page);
                stripper.setEndPage(page);
                pages[page - 1] = stripper.getText(pdfDoc);
                //System.out.println("PAGE " + page + ": " + pages[page - 1]);
            }

            // create individual documents for each page
            assert pages.length == pdflinks.length : "pages.length = " + pages.length + ", pdflinks.length = "
                    + pdflinks.length;
            result = new Document[Math.min(pages.length, pdflinks.length)];
            String loc = location.toNormalform(true);
            for (int page = 0; page < result.length; page++) {
                result[page] = new Document(
                        new AnchorURL(loc + (loc.indexOf('?') > 0 ? '&' : '?') + individualPagePropertyname
                                + '=' + (page + 1)), // these are virtual new pages; we cannot combine them with '#' as that would be removed when computing the urlhash
                        mimeType, "UTF-8", this, null, docKeywords, singleList(docTitle), docAuthor,
                        docPublisher, null, null, 0.0f, 0.0f,
                        pages == null || page > pages.length ? new byte[0] : UTF8.getBytes(pages[page]),
                        pdflinks == null || page >= pdflinks.length ? null : pdflinks[page], null, null, false,
                        docDate);
            }
        } else {
            // collect the whole text at once
            final CharBuffer writer = new CharBuffer(odtParser.MAX_DOCSIZE);
            byte[] contentBytes = new byte[0];
            stripper.setEndPage(3); // get first 3 pages (always)
            writer.append(stripper.getText(pdfDoc));
            contentBytes = writer.getBytes(); // remember text in case of interrupting thread

            if (pdfDoc.getNumberOfPages() > 3) { // spare creating/starting thread if all pages read
                stripper.setStartPage(4); // continue with page 4 (terminated, resulting in no text)
                stripper.setEndPage(Integer.MAX_VALUE); // set to default
                // we start the pdf parsing in a separate thread to ensure that it can be terminated
                final PDDocument pdfDocC = pdfDoc;
                final Thread t = new Thread() {
                    @Override
                    public void run() {
                        Thread.currentThread().setName("pdfParser.getText:" + location);
                        try {
                            writer.append(stripper.getText(pdfDocC));
                        } catch (final Throwable e) {
                        }
                    }
                };
                t.start();
                t.join(3000); // pdfbox likes to forget to terminate ... (quite often)
                if (t.isAlive())
                    t.interrupt();
            }
            contentBytes = writer.getBytes(); // get final text before closing writer

            Collection<AnchorURL> pdflinksCombined = new HashSet<AnchorURL>();
            for (Collection<AnchorURL> pdflinksx : pdflinks)
                if (pdflinksx != null)
                    pdflinksCombined.addAll(pdflinksx);
            result = new Document[] { new Document(location, mimeType, "UTF-8", this, null, docKeywords,
                    singleList(docTitle), docAuthor, docPublisher, null, null, 0.0f, 0.0f, contentBytes,
                    pdflinksCombined, null, null, false, docDate) };
        }
    } catch (final Throwable e) {
        //close the writer (in finally)
        //throw new Parser.Failure(e.getMessage(), location);
    } finally {
        try {
            pdfDoc.close();
        } catch (final Throwable e) {
        }
    }

    // clear resources in pdfbox. they say that is resolved but it's not. see:
    // https://issues.apache.org/jira/browse/PDFBOX-313
    // https://issues.apache.org/jira/browse/PDFBOX-351
    // https://issues.apache.org/jira/browse/PDFBOX-441
    // the pdfbox still generates enormeous number of object allocations and don't delete these
    // the following Object are statically stored and never flushed:
    // COSFloat, COSArray, COSInteger, COSObjectKey, COSObject, COSDictionary,
    // COSStream, COSString, COSName, COSDocument, COSInteger[], COSNull
    // the great number of these objects can easily be seen in Java Visual VM
    // we try to get this shit out of the memory here by forced clear calls, hope the best the rubbish gets out.
    pdfDoc = null;
    clean_up_idiotic_PDFParser_font_cache_which_eats_up_tons_of_megabytes();

    return result;
}

From source file:no.digipost.print.validate.PdfValidator.java

License:Apache License

/**
 * Leser hele dokumentet inn i minnet//from w w  w  . ja  v  a 2 s .c om
 */
private List<PdfValidationError> validerDokumentForPrint(final PDDocument pdDoc,
        final PdfValidationSettings innstillinger) throws IOException {
    List<PdfValidationError> errors = new ArrayList<>();

    if (pdDoc.isEncrypted()) {
        return failValidationIfEncrypted(errors);
    }

    if (innstillinger.validerSideantall) {
        validerSideantall(pdDoc.getNumberOfPages(), errors);
    }

    if (innstillinger.validerPDFversjon) {
        validerPdfVersjon(pdDoc.getDocument().getVersion(), errors);
    }

    boolean dokumentHarUgyldigeDimensjoner = false;
    for (PDPage page : getAllPagesFrom(pdDoc)) {
        if (harUgyldigeDimensjoner(page)) {
            dokumentHarUgyldigeDimensjoner = true;
            break;
        }
    }

    leggTilValideringsfeil(dokumentHarUgyldigeDimensjoner, UNSUPPORTED_DIMENSIONS, errors);

    boolean harTekstIStrekkodeomraade = false;
    boolean dokumentHarSiderHvisMarginIkkeLarSegVerifisereForPrint = false;
    if (innstillinger.validerVenstremarg) {
        for (PDPage page : getAllPagesFrom(pdDoc)) {
            try {
                if (harTekstIStrekkodeomraade(page)) {
                    harTekstIStrekkodeomraade = true;
                    break;
                }
            } catch (NullPointerException npe) {
                dokumentHarSiderHvisMarginIkkeLarSegVerifisereForPrint = true;
                LOG.info("Klarte ikke  verifiserere margen p en side");
            }
        }
    }

    leggTilValideringsfeil(dokumentHarSiderHvisMarginIkkeLarSegVerifisereForPrint,
            UNABLE_TO_VERIFY_SUITABLE_MARGIN_FOR_PRINT, errors);
    leggTilValideringsfeil(harTekstIStrekkodeomraade, INSUFFICIENT_MARGIN_FOR_PRINT, errors);

    if (innstillinger.validerFonter) {
        for (PDPage page : getAllPagesFrom(pdDoc)) {
            validerFonter(fontValidator.getPageFonts(page), errors);
        }
    }

    return errors;
}