List of usage examples for org.apache.pdfbox.pdmodel PDDocument getDocumentInformation
public PDDocumentInformation getDocumentInformation()
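Before the project-specific examples below, here is a minimal, self-contained sketch of the call itself. It assumes PDFBox 2.x and an illustrative file name "example.pdf"; neither is taken from the examples that follow.

import java.io.File;
import java.io.IOException;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;

public class DocumentInformationDemo {
    public static void main(String[] args) throws IOException {
        // Load a PDF and read its Document Information dictionary (title, author, ...).
        // PDDocument.load(File) is the PDFBox 2.x entry point; the older 1.x code in the
        // examples below loads from an InputStream or via PDFParser instead.
        try (PDDocument document = PDDocument.load(new File("example.pdf"))) {
            PDDocumentInformation info = document.getDocumentInformation();
            System.out.println("Title:    " + info.getTitle());
            System.out.println("Author:   " + info.getAuthor());
            System.out.println("Subject:  " + info.getSubject());
            System.out.println("Keywords: " + info.getKeywords());
        }
    }
}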
From source file:net.sf.jabref.logic.xmp.XMPUtil.java
License:Open Source License
/**
 * Try to write the given BibEntry in the Document Information (the
 * properties of the PDF).
 *
 * Existing field values are overridden if the BibTeX entry has the
 * corresponding value set.
 *
 * @param document
 *            The PDF document to write to.
 * @param entry
 *            The BibTeX entry that is written into the PDF properties.
 * @param database
 *            maybenull An optional database which the given BibTeX entries
 *            belong to, which will be used to resolve strings. If the
 *            database is null the strings will not be resolved.
 */
private static void writeDocumentInformation(PDDocument document, BibEntry entry, BibDatabase database) {
    PDDocumentInformation di = document.getDocumentInformation();

    BibEntry resolvedEntry;
    if (database == null) {
        resolvedEntry = entry;
    } else {
        resolvedEntry = database.resolveForStrings(entry, false);
    }

    // Query privacy filter settings
    JabRefPreferences prefs = JabRefPreferences.getInstance();
    boolean useXmpPrivacyFilter = prefs.getBoolean(JabRefPreferences.USE_XMP_PRIVACY_FILTER);
    // Fields for which not to write XMP data later on:
    Set<String> filters = new TreeSet<>(prefs.getStringList(JabRefPreferences.XMP_PRIVACY_FILTERS));

    // Set all the values including key and entryType
    Set<String> fields = resolvedEntry.getFieldNames();

    for (String field : fields) {
        if (useXmpPrivacyFilter && filters.contains(field)) {
            // erase field instead of adding it
            if ("author".equals(field)) {
                di.setAuthor(null);
            } else if ("title".equals(field)) {
                di.setTitle(null);
            } else if ("keywords".equals(field)) {
                di.setKeywords(null);
            } else if ("abstract".equals(field)) {
                di.setSubject(null);
            } else {
                di.setCustomMetadataValue("bibtex/" + field, null);
            }
            continue;
        }

        if ("author".equals(field)) {
            di.setAuthor(resolvedEntry.getField("author"));
        } else if ("title".equals(field)) {
            di.setTitle(resolvedEntry.getField("title"));
        } else if ("keywords".equals(field)) {
            di.setKeywords(resolvedEntry.getField("keywords"));
        } else if ("abstract".equals(field)) {
            di.setSubject(resolvedEntry.getField("abstract"));
        } else {
            di.setCustomMetadataValue("bibtex/" + field, resolvedEntry.getField(field));
        }
    }
    di.setCustomMetadataValue("bibtex/entrytype", EntryUtil.capitalizeFirst(resolvedEntry.getType()));
}
From source file:net.sf.jabref.util.XMPUtil.java
License:Open Source License
/**
 * Try to read the given BibtexEntry from the XMP stream of the given
 * input stream containing a PDF file.
 *
 * @param inputStream
 *            The input stream to read from.
 *
 * @throws IOException
 *             Throws an IOException if the file cannot be read, so the user
 *             can then remove a lock or cancel the operation.
 */
@SuppressWarnings("unchecked")
public static List<BibtexEntry> readXMP(InputStream inputStream) throws IOException {

    List<BibtexEntry> result = new LinkedList<BibtexEntry>();

    PDDocument document = null;

    try {
        document = PDDocument.load(inputStream);
        if (document.isEncrypted()) {
            throw new EncryptionNotSupportedException("Error: Cannot read metadata from encrypted document.");
        }

        XMPMetadata meta = XMPUtil.getXMPMetadata(document);

        // If we did not find any XMP metadata, search for non XMP metadata
        if (meta != null) {

            List<XMPSchema> schemas = meta.getSchemasByNamespaceURI(XMPSchemaBibtex.NAMESPACE);

            for (XMPSchema schema : schemas) {
                XMPSchemaBibtex bib = (XMPSchemaBibtex) schema;
                result.add(bib.getBibtexEntry());
            }

            // If we did not find anything have a look if a Dublin Core exists
            if (result.isEmpty()) {
                schemas = meta.getSchemasByNamespaceURI(XMPSchemaDublinCore.NAMESPACE);
                for (XMPSchema schema : schemas) {
                    XMPSchemaDublinCore dc = (XMPSchemaDublinCore) schema;

                    BibtexEntry entry = XMPUtil.getBibtexEntryFromDublinCore(dc);

                    if (entry != null) {
                        result.add(entry);
                    }
                }
            }
        }

        if (result.isEmpty()) {
            BibtexEntry entry = XMPUtil
                    .getBibtexEntryFromDocumentInformation(document.getDocumentInformation());

            if (entry != null) {
                result.add(entry);
            }
        }
    } finally {
        if (document != null) {
            document.close();
        }
    }

    // return null, if no metadata was found
    if (result.isEmpty()) {
        return null;
    }

    return result;
}
From source file:net.sf.jabref.util.XMPUtil.java
License:Open Source License
/**
 * Try to write the given BibtexEntry in the Document Information (the
 * properties of the PDF).
 *
 * Existing field values are overridden if the BibTeX entry has the
 * corresponding value set.
 *
 * @param document
 *            The PDF document to write to.
 * @param entry
 *            The BibTeX entry that is written into the PDF properties.
 * @param database
 *            maybenull An optional database which the given BibTeX entries
 *            belong to, which will be used to resolve strings. If the
 *            database is null the strings will not be resolved.
 */
private static void writeDocumentInformation(PDDocument document, BibtexEntry entry, BibtexDatabase database) {
    PDDocumentInformation di = document.getDocumentInformation();

    if (database != null) {
        entry = database.resolveForStrings(entry, false);
    }

    // Query privacy filter settings
    JabRefPreferences prefs = JabRefPreferences.getInstance();
    boolean useXmpPrivacyFilter = prefs.getBoolean(JabRefPreferences.USE_XMP_PRIVACY_FILTER);
    // Fields for which not to write XMP data later on:
    TreeSet<String> filters = new TreeSet<String>(
            Arrays.asList(prefs.getStringArray(JabRefPreferences.XMP_PRIVACY_FILTERS)));

    // Set all the values including key and entryType
    Set<String> fields = entry.getAllFields();

    for (String field : fields) {
        if (useXmpPrivacyFilter && filters.contains(field)) {
            // erase field instead of adding it
            if (field.equals("author")) {
                di.setAuthor(null);
            } else if (field.equals("title")) {
                di.setTitle(null);
            } else if (field.equals("keywords")) {
                di.setKeywords(null);
            } else if (field.equals("abstract")) {
                di.setSubject(null);
            } else {
                di.setCustomMetadataValue("bibtex/" + field, null);
            }
            continue;
        }

        if (field.equals("author")) {
            di.setAuthor(entry.getField("author"));
        } else if (field.equals("title")) {
            di.setTitle(entry.getField("title"));
        } else if (field.equals("keywords")) {
            di.setKeywords(entry.getField("keywords"));
        } else if (field.equals("abstract")) {
            di.setSubject(entry.getField("abstract"));
        } else {
            di.setCustomMetadataValue("bibtex/" + field, entry.getField(field));
        }
    }
    di.setCustomMetadataValue("bibtex/entrytype", entry.getType().getName());
}
From source file:net.sf.mmm.content.parser.impl.pdf.ContentParserPdf.java
License:Apache License
/**
 * {@inheritDoc}
 */
@Override
public void parse(InputStream inputStream, long filesize, ContentParserOptions options,
        MutableGenericContext context) throws Exception {

    PDFParser parser = new PDFParser(inputStream);
    parser.parse();
    PDDocument pdfDoc = parser.getPDDocument();
    try {
        if (pdfDoc.isEncrypted()) {
            // pdfDoc.decrypt("password");
            return;
        }
        PDDocumentInformation info = pdfDoc.getDocumentInformation();
        String title = info.getTitle();
        if (title != null) {
            context.setVariable(VARIABLE_NAME_TITLE, title);
        }
        String keywords = info.getKeywords();
        if (keywords != null) {
            context.setVariable(VARIABLE_NAME_KEYWORDS, keywords);
        }
        String author = info.getAuthor();
        if (author != null) {
            context.setVariable(VARIABLE_NAME_CREATOR, author);
        }
        if (filesize < options.getMaximumBufferSize()) {
            PDFTextStripper stripper = new PDFTextStripper();
            context.setVariable(VARIABLE_NAME_TEXT, stripper.getText(pdfDoc));
        }
    } finally {
        pdfDoc.close();
    }
}
From source file:net.sourceforge.docfetcher.model.parse.PdfParser.java
License:Open Source License
@Override
protected ParseResult parse(@NotNull InputStream in, @NotNull final ParseContext context)
        throws ParseException {
    PDDocument pdfDoc = null;
    try {
        /*
         * TODO post-release-1.1: check if 'force' argument in PDDocument/Stripper increases
         * number of parsed PDF files
         */
        pdfDoc = PDDocument.load(in, true);
        PDDocumentInformation pdInfo;
        final int pageCount;
        try {
            pdInfo = pdfDoc.getDocumentInformation();
            pageCount = pdfDoc.getNumberOfPages();
        } catch (ClassCastException e) {
            // Bug #3529070 and #3528345
            throw new ParseException(e);
        }
        StringWriter writer = new StringWriter();

        /*
         * If the PDF file is encrypted, the PDF stripper will automatically
         * try an empty password.
         *
         * In contrast to the paging PDF parser that is used for the
         * preview, we do not need to call setSortByPosition(true) here
         * because the extracted text will be digested by Lucene anyway.
         */
        PDFTextStripper stripper = new PDFTextStripper() {
            protected void startPage(PDPage page) throws IOException {
                context.getReporter().subInfo(getCurrentPageNo(), pageCount);
            }

            protected void endPage(PDPage page) throws IOException {
                if (context.getCancelable().isCanceled())
                    setEndPage(0);
            }
        };
        stripper.setForceParsing(true);
        try {
            stripper.writeText(pdfDoc, writer);
        } catch (RuntimeException e) {
            /*
             * PDFTextStripper.writeText can throw various
             * RuntimeExceptions, see bugs #3446010, #3448272, #3444887.
             */
            throw new ParseException(e);
        }
        return new ParseResult(writer.getBuffer()).setTitle(pdInfo.getTitle()).addAuthor(pdInfo.getAuthor())
                .addMiscMetadata(pdInfo.getSubject()).addMiscMetadata(pdInfo.getKeywords());
    } catch (IOException e) {
        if (e.getCause() instanceof CryptographyException)
            throw new ParseException(Msg.doc_pw_protected.get());
        throw new ParseException(e);
    } finally {
        close(pdfDoc);
    }
}
From source file:net.sourceforge.docfetcher.model.parse.TestParseFromZip.java
License:Open Source License
@Test
public void testZippedPdf() throws Exception {
    new ZipAndRun(TestFiles.multi_page_pdf) {
        protected void handleInputStream(InputStream in) throws Exception {
            PDDocument pdfDoc = PDDocument.load(in);
            PDFTextStripper stripper = new PDFTextStripper();
            StringWriter writer = new StringWriter();
            stripper.setForceParsing(true);
            stripper.setSortByPosition(true);
            stripper.writeText(pdfDoc, writer); // Will handle encryption with empty password
            PDDocumentInformation pdInfo = pdfDoc.getDocumentInformation();
            ParseResult result = new ParseResult(writer.getBuffer()).setTitle(pdInfo.getTitle())
                    .addAuthor(pdInfo.getAuthor()).addMiscMetadata(pdInfo.getSubject())
                    .addMiscMetadata(pdInfo.getKeywords());
            String expectedContents = Util.join(Util.LS, "page 1", "page 2", "page 3");
            String actualContents = result.getContent().toString().trim();
            assertEquals(expectedContents, actualContents);
        }
    };
}
From source file:net.sourceforge.docfetcher.parse.PDFParser.java
License:Open Source License
public Document parse(File file) throws ParseException {
    PDDocument pdfDoc = null;
    try {
        // Check if PDF file is encrypted
        pdfDoc = PDDocument.load(file);
        if (pdfDoc.isEncrypted()) {
            try {
                pdfDoc.openProtection(new StandardDecryptionMaterial(""));
            } catch (Exception e) {
                throw new ParseException(file, Msg.no_extraction_permission.value());
            }
        }

        // Get tags and contents
        PDFTextStripper stripper = new PDFTextStripper();
        StringWriter writer = new StringWriter();
        stripper.writeText(pdfDoc, writer);
        DocFetcher.getInstance().setExceptionHandlerEnabled(true);
        PDDocumentInformation pdInfo = pdfDoc.getDocumentInformation();
        String[] metaData = new String[] { pdInfo.getTitle(), pdInfo.getAuthor(), pdInfo.getSubject(),
                pdInfo.getKeywords(), };
        for (String field : metaData)
            if (field != null)
                writer.append(" ").append(field); //$NON-NLS-1$
        return new Document(file, metaData[0], writer.getBuffer()).addAuthor(metaData[1]);
    } catch (IOException e) {
        throw new ParseException(file, Msg.file_not_readable.value());
    } finally {
        if (pdfDoc != null) {
            try {
                pdfDoc.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
From source file:net.sourceforge.vaticanfetcher.model.parse.PdfParser.java
License:Open Source License
@Override
protected ParseResult parse(@NotNull InputStream in, @NotNull final ParseContext context)
        throws ParseException {
    PDDocument pdfDoc = null;
    try {
        /* TODO post-release-1.1: check if 'force' argument in PDDocument/Stripper increases
           number of parsed PDF files */
        pdfDoc = PDDocument.load(in, true);
        PDDocumentInformation pdInfo;
        final int pageCount;
        try {
            pdInfo = pdfDoc.getDocumentInformation();
            pageCount = pdfDoc.getNumberOfPages();
        } catch (ClassCastException e) {
            // Bug #3529070 and #3528345
            throw new ParseException(e);
        }
        StringWriter writer = new StringWriter();

        /*
         * If the PDF file is encrypted, the PDF stripper will automatically try an empty password.
         *
         * In contrast to the paging PDF parser that is used for the preview, we do not need to call
         * setSortByPosition(true) here because the extracted text will be digested by Lucene anyway.
         */
        PDFTextStripper stripper = new PDFTextStripper() {
            protected void startPage(PDPage page) throws IOException {
                context.getReporter().subInfo(getCurrentPageNo(), pageCount);
            }

            protected void endPage(PDPage page) throws IOException {
                if (context.getCancelable().isCanceled())
                    setEndPage(0);
            }
        };
        stripper.setForceParsing(true);
        try {
            stripper.writeText(pdfDoc, writer);
        } catch (RuntimeException e) {
            /* PDFTextStripper.writeText can throw various RuntimeExceptions,
               see bugs #3446010, #3448272, #3444887. */
            throw new ParseException(e);
        }
        return new ParseResult(writer.getBuffer()).setTitle(pdInfo.getTitle()).addAuthor(pdInfo.getAuthor())
                .addMiscMetadata(pdInfo.getSubject()).addMiscMetadata(pdInfo.getKeywords());
    } catch (IOException e) {
        if (e.getCause() instanceof CryptographyException)
            throw new ParseException(Msg.doc_pw_protected.get());
        throw new ParseException(e);
    } finally {
        close(pdfDoc);
    }
}
From source file:net.yacy.cider.parser.idiom.pdfIdiom.java
License:Open Source License
@Override
public Model parse(DataSource source) throws ParserException {

    // create an empty Model
    Model model = ModelFactory.createDefaultModel();
    Resource resource = source.hasURI() ? model.createResource(source.getURI().toNormalform(true, true))
            : model.createResource();

    // open pdf document
    final PDDocument theDocument;
    final PDFParser parser;
    try {
        parser = new PDFParser(source.getStream());
        parser.parse();
        theDocument = parser.getPDDocument();
    } catch (IOException e) {
        log.error(e.getMessage(), e);
        throw new ParserException(e.getMessage(), source.getURI());
    }

    if (theDocument.isEncrypted()) {
        try {
            theDocument.openProtection(new StandardDecryptionMaterial(""));
        } catch (BadSecurityHandlerException e) {
            throw new ParserException("PDF Encrypted (BadSecurityHandlerException): " + e.getMessage(),
                    source.getURI(), e);
        } catch (IOException e) {
            throw new ParserException("PDF Encrypted (IOException): " + e.getMessage(), source.getURI(), e);
        } catch (CryptographyException e) {
            throw new ParserException("PDF Encrypted (CryptographyException): " + e.getMessage(),
                    source.getURI(), e);
        }
        final AccessPermission perm = theDocument.getCurrentAccessPermission();
        if (perm == null || !perm.canExtractContent())
            throw new ParserException("PDF cannot be decrypted", source.getURI());
    }

    // get metadata
    final PDDocumentInformation theDocInfo = theDocument.getDocumentInformation();
    String docTitle = null, docSubject = null, docAuthor = null, docKeywordStr = null;
    if (theDocInfo != null) {
        docTitle = theDocInfo.getTitle();
        docSubject = theDocInfo.getSubject();
        docAuthor = theDocInfo.getAuthor();
        docKeywordStr = theDocInfo.getKeywords();
    }

    if (docAuthor != null && docAuthor.length() > 0) {
        resource.addProperty(VCARD.FN, docAuthor);
        resource.addProperty(DC.creator, docAuthor);
    }
    if (docSubject != null && docSubject.length() > 0) {
        resource.addProperty(DC.subject, docSubject);
    }
    if (docTitle != null && docTitle.length() > 0) {
        resource.addProperty(DC.title, docTitle);
    }
    String[] docKeywords = null;
    if (docKeywordStr != null && docKeywordStr.length() > 0) {
        docKeywords = docKeywordStr.split(" |,");
        resource.addProperty(DC.coverage, concat(docKeywords));
    }

    // get the content
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    Writer writer;
    try {
        writer = new OutputStreamWriter(baos, "UTF-8");
    } catch (UnsupportedEncodingException e1) {
        writer = new OutputStreamWriter(baos);
    }
    try {
        final PDFTextStripper stripper = new PDFTextStripper();
        stripper.writeText(theDocument, writer);
        theDocument.close();
        writer.close();
    } catch (IOException e) {
        if (writer != null)
            try {
                writer.close();
            } catch (final Exception ex) {
            }
        throw new ParserException("PDF content reader", source.getURI(), e);
    }

    String content;
    try {
        content = new String(baos.toByteArray(), "UTF-8");
    } catch (UnsupportedEncodingException e) {
        content = new String(baos.toByteArray());
    }
    if (content != null && content.length() > 0) {
        resource.addProperty(CIDER.data_content_text, content);
    }

    return model;
}
From source file:net.yacy.document.parser.pdfParser.java
License:Open Source License
@Override
public Document[] parse(final AnchorURL location, final String mimeType, final String charset,
        final VocabularyScraper scraper, final int timezoneOffset, final InputStream source)
        throws Parser.Failure, InterruptedException {

    // check memory for parser
    if (!MemoryControl.request(200 * 1024 * 1024, false))
        throw new Parser.Failure("Not enough Memory available for pdf parser: " + MemoryControl.available(),
                location);

    // create a pdf parser
    PDDocument pdfDoc;
    //final PDFParser pdfParser;
    try {
        Thread.currentThread().setPriority(Thread.MIN_PRIORITY); // the pdfparser is a big pain
        pdfDoc = PDDocument.load(source);
        //PDFParser pdfParser = new PDFParser(source);
        //pdfParser.parse();
        //pdfDoc = pdfParser.getPDDocument();
    } catch (final IOException e) {
        throw new Parser.Failure(e.getMessage(), location);
    } finally {
        Thread.currentThread().setPriority(Thread.NORM_PRIORITY);
    }

    if (pdfDoc.isEncrypted()) {
        try {
            pdfDoc.openProtection(new StandardDecryptionMaterial(""));
        } catch (final BadSecurityHandlerException e) {
            try {
                pdfDoc.close();
            } catch (final IOException ee) {
            }
            throw new Parser.Failure("Document is encrypted (1): " + e.getMessage(), location);
        } catch (final IOException e) {
            try {
                pdfDoc.close();
            } catch (final IOException ee) {
            }
            throw new Parser.Failure("Document is encrypted (2): " + e.getMessage(), location);
        } catch (final CryptographyException e) {
            try {
                pdfDoc.close();
            } catch (final IOException ee) {
            }
            throw new Parser.Failure("Document is encrypted (3): " + e.getMessage(), location);
        }
        final AccessPermission perm = pdfDoc.getCurrentAccessPermission();
        if (perm == null || !perm.canExtractContent()) {
            try {
                pdfDoc.close();
            } catch (final IOException ee) {
            }
            throw new Parser.Failure("Document is encrypted and cannot be decrypted", location);
        }
    }

    // extracting some metadata
    PDDocumentInformation info = pdfDoc.getDocumentInformation();
    String docTitle = null, docSubject = null, docAuthor = null, docPublisher = null, docKeywordStr = null;
    Date docDate = new Date();
    if (info != null) {
        docTitle = info.getTitle();
        docSubject = info.getSubject();
        docAuthor = info.getAuthor();
        docPublisher = info.getProducer();
        if (docPublisher == null || docPublisher.isEmpty())
            docPublisher = info.getCreator();
        docKeywordStr = info.getKeywords();
        try {
            if (info.getModificationDate() != null)
                docDate = info.getModificationDate().getTime();
        } catch (IOException e) {
        }
        // unused:
        // info.getTrapped());
    }
    info = null;

    if (docTitle == null || docTitle.isEmpty()) {
        docTitle = MultiProtocolURL.unescape(location.getFileName());
    }
    if (docTitle == null) {
        docTitle = docSubject;
    }
    String[] docKeywords = null;
    if (docKeywordStr != null) {
        docKeywords = docKeywordStr.split(" |,");
    }

    Collection<AnchorURL>[] pdflinks = null;
    Document[] result = null;
    try {
        // get the links
        pdflinks = extractPdfLinks(pdfDoc);

        // get the fulltext (either per document or for each page)
        final PDFTextStripper stripper = new PDFTextStripper("UTF-8");

        if (individualPages) {
            // this is a hack which stores individual pages of the source pdf into individual index documents
            // the new documents will get a virtual link with a post argument page=X appended to the original url

            // collect text
            int pagecount = pdfDoc.getNumberOfPages();
            String[] pages = new String[pagecount];
            for (int page = 1; page <= pagecount; page++) {
                stripper.setStartPage(page);
                stripper.setEndPage(page);
                pages[page - 1] = stripper.getText(pdfDoc);
                //System.out.println("PAGE " + page + ": " + pages[page - 1]);
            }

            // create individual documents for each page
            assert pages.length == pdflinks.length : "pages.length = " + pages.length + ", pdflinks.length = "
                    + pdflinks.length;
            result = new Document[Math.min(pages.length, pdflinks.length)];
            String loc = location.toNormalform(true);
            for (int page = 0; page < result.length; page++) {
                result[page] = new Document(
                        new AnchorURL(loc + (loc.indexOf('?') > 0 ? '&' : '?') + individualPagePropertyname
                                + '=' + (page + 1)), // these are virtual new pages; we cannot combine them with '#' as that would be removed when computing the urlhash
                        mimeType, "UTF-8", this, null, docKeywords, singleList(docTitle), docAuthor,
                        docPublisher, null, null, 0.0f, 0.0f,
                        pages == null || page > pages.length ? new byte[0] : UTF8.getBytes(pages[page]),
                        pdflinks == null || page >= pdflinks.length ? null : pdflinks[page], null, null, false,
                        docDate);
            }
        } else {
            // collect the whole text at once
            final CharBuffer writer = new CharBuffer(odtParser.MAX_DOCSIZE);
            byte[] contentBytes = new byte[0];
            stripper.setEndPage(3); // get first 3 pages (always)
            writer.append(stripper.getText(pdfDoc));
            contentBytes = writer.getBytes(); // remember text in case of interrupting thread

            if (pdfDoc.getNumberOfPages() > 3) { // spare creating/starting thread if all pages read
                stripper.setStartPage(4); // continue with page 4 (terminated, resulting in no text)
                stripper.setEndPage(Integer.MAX_VALUE); // set to default

                // we start the pdf parsing in a separate thread to ensure that it can be terminated
                final PDDocument pdfDocC = pdfDoc;
                final Thread t = new Thread() {
                    @Override
                    public void run() {
                        Thread.currentThread().setName("pdfParser.getText:" + location);
                        try {
                            writer.append(stripper.getText(pdfDocC));
                        } catch (final Throwable e) {
                        }
                    }
                };
                t.start();
                t.join(3000); // pdfbox likes to forget to terminate ... (quite often)
                if (t.isAlive())
                    t.interrupt();
            }
            contentBytes = writer.getBytes(); // get final text before closing writer

            Collection<AnchorURL> pdflinksCombined = new HashSet<AnchorURL>();
            for (Collection<AnchorURL> pdflinksx : pdflinks)
                if (pdflinksx != null)
                    pdflinksCombined.addAll(pdflinksx);
            result = new Document[] { new Document(location, mimeType, "UTF-8", this, null, docKeywords,
                    singleList(docTitle), docAuthor, docPublisher, null, null, 0.0f, 0.0f, contentBytes,
                    pdflinksCombined, null, null, false, docDate) };
        }
    } catch (final Throwable e) {
        //close the writer (in finally)
        //throw new Parser.Failure(e.getMessage(), location);
    } finally {
        try {
            pdfDoc.close();
        } catch (final Throwable e) {
        }
    }

    // clear resources in pdfbox. they say that is resolved but it's not. see:
    // https://issues.apache.org/jira/browse/PDFBOX-313
    // https://issues.apache.org/jira/browse/PDFBOX-351
    // https://issues.apache.org/jira/browse/PDFBOX-441
    // pdfbox still generates an enormous number of object allocations and does not delete them
    // the following Objects are statically stored and never flushed:
    // COSFloat, COSArray, COSInteger, COSObjectKey, COSObject, COSDictionary,
    // COSStream, COSString, COSName, COSDocument, COSInteger[], COSNull
    // the great number of these objects can easily be seen in Java Visual VM
    // we try to get this shit out of the memory here by forced clear calls, hope the best the rubbish gets out.
    pdfDoc = null;
    clean_up_idiotic_PDFParser_font_cache_which_eats_up_tons_of_megabytes();

    return result;
}