List of usage examples for org.apache.pdfbox.pdmodel PDDocument getDocumentInformation
public PDDocumentInformation getDocumentInformation()
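Before the project-specific examples below, here is a minimal, self-contained sketch of the call itself. It assumes PDFBox 2.x and an illustrative file name "example.pdf"; neither is taken from the examples that follow.

import java.io.File;
import java.io.IOException;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;

public class DocumentInformationDemo {
    public static void main(String[] args) throws IOException {
        // Load a PDF and read its Document Information dictionary (title, author, ...).
        // PDDocument.load(File) is the PDFBox 2.x entry point; the older 1.x code in the
        // examples below loads from an InputStream or via PDFParser instead.
        try (PDDocument document = PDDocument.load(new File("example.pdf"))) {
            PDDocumentInformation info = document.getDocumentInformation();
            System.out.println("Title:    " + info.getTitle());
            System.out.println("Author:   " + info.getAuthor());
            System.out.println("Subject:  " + info.getSubject());
            System.out.println("Keywords: " + info.getKeywords());
        }
    }
}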
From source file:net.sf.jabref.logic.xmp.XMPUtil.java
License:Open Source License
/**
 * Try to write the given BibEntry in the Document Information (the
 * properties of the PDF).
 *
 * Existing field values are overridden if the BibTeX entry has the
 * corresponding value set.
 *
 * @param document
 *            The PDF document to write to.
 * @param entry
 *            The BibTeX entry that is written into the PDF properties.
 * @param database
 *            maybenull An optional database which the given BibTeX entries
 *            belong to, which will be used to resolve strings. If the
 *            database is null the strings will not be resolved.
 */
private static void writeDocumentInformation(PDDocument document, BibEntry entry, BibDatabase database) {
    PDDocumentInformation di = document.getDocumentInformation();

    BibEntry resolvedEntry;
    if (database == null) {
        resolvedEntry = entry;
    } else {
        resolvedEntry = database.resolveForStrings(entry, false);
    }

    // Query privacy filter settings
    JabRefPreferences prefs = JabRefPreferences.getInstance();
    boolean useXmpPrivacyFilter = prefs.getBoolean(JabRefPreferences.USE_XMP_PRIVACY_FILTER);
    // Fields for which not to write XMP data later on:
    Set<String> filters = new TreeSet<>(prefs.getStringList(JabRefPreferences.XMP_PRIVACY_FILTERS));

    // Set all the values including key and entryType
    Set<String> fields = resolvedEntry.getFieldNames();

    for (String field : fields) {
        if (useXmpPrivacyFilter && filters.contains(field)) {
            // erase field instead of adding it
            if ("author".equals(field)) {
                di.setAuthor(null);
            } else if ("title".equals(field)) {
                di.setTitle(null);
            } else if ("keywords".equals(field)) {
                di.setKeywords(null);
            } else if ("abstract".equals(field)) {
                di.setSubject(null);
            } else {
                di.setCustomMetadataValue("bibtex/" + field, null);
            }
            continue;
        }

        if ("author".equals(field)) {
            di.setAuthor(resolvedEntry.getField("author"));
        } else if ("title".equals(field)) {
            di.setTitle(resolvedEntry.getField("title"));
        } else if ("keywords".equals(field)) {
            di.setKeywords(resolvedEntry.getField("keywords"));
        } else if ("abstract".equals(field)) {
            di.setSubject(resolvedEntry.getField("abstract"));
        } else {
            di.setCustomMetadataValue("bibtex/" + field, resolvedEntry.getField(field));
        }
    }
    di.setCustomMetadataValue("bibtex/entrytype", EntryUtil.capitalizeFirst(resolvedEntry.getType()));
}
From source file:net.sf.jabref.util.XMPUtil.java
License:Open Source License
/**
 * Try to read the given BibtexEntry from the XMP stream of the given
 * input stream containing a PDF file.
 *
 * @param inputStream
 *            The input stream to read from.
 *
 * @throws IOException
 *             Throws an IOException if the file cannot be read, so the user
 *             can then remove a lock or cancel the operation.
 */
@SuppressWarnings("unchecked")
public static List<BibtexEntry> readXMP(InputStream inputStream) throws IOException {

    List<BibtexEntry> result = new LinkedList<BibtexEntry>();

    PDDocument document = null;

    try {
        document = PDDocument.load(inputStream);
        if (document.isEncrypted()) {
            throw new EncryptionNotSupportedException("Error: Cannot read metadata from encrypted document.");
        }

        XMPMetadata meta = XMPUtil.getXMPMetadata(document);

        // If we did not find any XMP metadata, search for non XMP metadata
        if (meta != null) {

            List<XMPSchema> schemas = meta.getSchemasByNamespaceURI(XMPSchemaBibtex.NAMESPACE);

            for (XMPSchema schema : schemas) {
                XMPSchemaBibtex bib = (XMPSchemaBibtex) schema;
                result.add(bib.getBibtexEntry());
            }

            // If we did not find anything have a look if a Dublin Core exists
            if (result.isEmpty()) {
                schemas = meta.getSchemasByNamespaceURI(XMPSchemaDublinCore.NAMESPACE);
                for (XMPSchema schema : schemas) {
                    XMPSchemaDublinCore dc = (XMPSchemaDublinCore) schema;

                    BibtexEntry entry = XMPUtil.getBibtexEntryFromDublinCore(dc);

                    if (entry != null) {
                        result.add(entry);
                    }
                }
            }
        }

        if (result.isEmpty()) {
            BibtexEntry entry = XMPUtil
                    .getBibtexEntryFromDocumentInformation(document.getDocumentInformation());

            if (entry != null) {
                result.add(entry);
            }
        }
    } finally {
        if (document != null) {
            document.close();
        }
    }

    // return null, if no metadata was found
    if (result.isEmpty()) {
        return null;
    }

    return result;
}
From source file:net.sf.jabref.util.XMPUtil.java
License:Open Source License
/**
 * Try to write the given BibtexEntry in the Document Information (the
 * properties of the PDF).
 *
 * Existing field values are overridden if the BibTeX entry has the
 * corresponding value set.
 *
 * @param document
 *            The PDF document to write to.
 * @param entry
 *            The BibTeX entry that is written into the PDF properties.
 * @param database
 *            maybenull An optional database which the given BibTeX entries
 *            belong to, which will be used to resolve strings. If the
 *            database is null the strings will not be resolved.
 */
private static void writeDocumentInformation(PDDocument document, BibtexEntry entry, BibtexDatabase database) {
    PDDocumentInformation di = document.getDocumentInformation();

    if (database != null) {
        entry = database.resolveForStrings(entry, false);
    }

    // Query privacy filter settings
    JabRefPreferences prefs = JabRefPreferences.getInstance();
    boolean useXmpPrivacyFilter = prefs.getBoolean(JabRefPreferences.USE_XMP_PRIVACY_FILTER);
    // Fields for which not to write XMP data later on:
    TreeSet<String> filters = new TreeSet<String>(
            Arrays.asList(prefs.getStringArray(JabRefPreferences.XMP_PRIVACY_FILTERS)));

    // Set all the values including key and entryType
    Set<String> fields = entry.getAllFields();

    for (String field : fields) {
        if (useXmpPrivacyFilter && filters.contains(field)) {
            // erase field instead of adding it
            if (field.equals("author")) {
                di.setAuthor(null);
            } else if (field.equals("title")) {
                di.setTitle(null);
            } else if (field.equals("keywords")) {
                di.setKeywords(null);
            } else if (field.equals("abstract")) {
                di.setSubject(null);
            } else {
                di.setCustomMetadataValue("bibtex/" + field, null);
            }
            continue;
        }

        if (field.equals("author")) {
            di.setAuthor(entry.getField("author"));
        } else if (field.equals("title")) {
            di.setTitle(entry.getField("title"));
        } else if (field.equals("keywords")) {
            di.setKeywords(entry.getField("keywords"));
        } else if (field.equals("abstract")) {
            di.setSubject(entry.getField("abstract"));
        } else {
            di.setCustomMetadataValue("bibtex/" + field, entry.getField(field));
        }
    }
    di.setCustomMetadataValue("bibtex/entrytype", entry.getType().getName());
}
From source file:net.sf.mmm.content.parser.impl.pdf.ContentParserPdf.java
License:Apache License
/**
 * {@inheritDoc}
 */
@Override
public void parse(InputStream inputStream, long filesize, ContentParserOptions options,
        MutableGenericContext context) throws Exception {

    PDFParser parser = new PDFParser(inputStream);
    parser.parse();
    PDDocument pdfDoc = parser.getPDDocument();
    try {
        if (pdfDoc.isEncrypted()) {
            // pdfDoc.decrypt("password");
            return;
        }
        PDDocumentInformation info = pdfDoc.getDocumentInformation();
        String title = info.getTitle();
        if (title != null) {
            context.setVariable(VARIABLE_NAME_TITLE, title);
        }
        String keywords = info.getKeywords();
        if (keywords != null) {
            context.setVariable(VARIABLE_NAME_KEYWORDS, keywords);
        }
        String author = info.getAuthor();
        if (author != null) {
            context.setVariable(VARIABLE_NAME_CREATOR, author);
        }
        if (filesize < options.getMaximumBufferSize()) {
            PDFTextStripper stripper = new PDFTextStripper();
            context.setVariable(VARIABLE_NAME_TEXT, stripper.getText(pdfDoc));
        }
    } finally {
        pdfDoc.close();
    }
}
From source file:net.sourceforge.docfetcher.model.parse.PdfParser.java
License:Open Source License
@Override
protected ParseResult parse(@NotNull InputStream in, @NotNull final ParseContext context)
        throws ParseException {
    PDDocument pdfDoc = null;
    try {
        /*
         * TODO post-release-1.1: check if 'force' argument in PDDocument/Stripper increases
         * number of parsed PDF files
         */
        pdfDoc = PDDocument.load(in, true);
        PDDocumentInformation pdInfo;
        final int pageCount;
        try {
            pdInfo = pdfDoc.getDocumentInformation();
            pageCount = pdfDoc.getNumberOfPages();
        } catch (ClassCastException e) {
            // Bug #3529070 and #3528345
            throw new ParseException(e);
        }
        StringWriter writer = new StringWriter();

        /*
         * If the PDF file is encrypted, the PDF stripper will automatically
         * try an empty password.
         *
         * In contrast to the paging PDF parser that is used for the
         * preview, we do not need to call setSortByPosition(true) here
         * because the extracted text will be digested by Lucene anyway.
         */
        PDFTextStripper stripper = new PDFTextStripper() {
            protected void startPage(PDPage page) throws IOException {
                context.getReporter().subInfo(getCurrentPageNo(), pageCount);
            }

            protected void endPage(PDPage page) throws IOException {
                if (context.getCancelable().isCanceled())
                    setEndPage(0);
            }
        };
        stripper.setForceParsing(true);
        try {
            stripper.writeText(pdfDoc, writer);
        } catch (RuntimeException e) {
            /*
             * PDFTextStripper.writeText can throw various
             * RuntimeExceptions, see bugs #3446010, #3448272, #3444887.
             */
            throw new ParseException(e);
        }
        return new ParseResult(writer.getBuffer()).setTitle(pdInfo.getTitle()).addAuthor(pdInfo.getAuthor())
                .addMiscMetadata(pdInfo.getSubject()).addMiscMetadata(pdInfo.getKeywords());
    } catch (IOException e) {
        if (e.getCause() instanceof CryptographyException)
            throw new ParseException(Msg.doc_pw_protected.get());
        throw new ParseException(e);
    } finally {
        close(pdfDoc);
    }
}
From source file:net.sourceforge.docfetcher.model.parse.TestParseFromZip.java
License:Open Source License
@Test
public void testZippedPdf() throws Exception {
    new ZipAndRun(TestFiles.multi_page_pdf) {
        protected void handleInputStream(InputStream in) throws Exception {
            PDDocument pdfDoc = PDDocument.load(in);
            PDFTextStripper stripper = new PDFTextStripper();
            StringWriter writer = new StringWriter();
            stripper.setForceParsing(true);
            stripper.setSortByPosition(true);
            stripper.writeText(pdfDoc, writer); // Will handle encryption with empty password
            PDDocumentInformation pdInfo = pdfDoc.getDocumentInformation();
            ParseResult result = new ParseResult(writer.getBuffer()).setTitle(pdInfo.getTitle())
                    .addAuthor(pdInfo.getAuthor()).addMiscMetadata(pdInfo.getSubject())
                    .addMiscMetadata(pdInfo.getKeywords());
            String expectedContents = Util.join(Util.LS, "page 1", "page 2", "page 3");
            String actualContents = result.getContent().toString().trim();
            assertEquals(expectedContents, actualContents);
        }
    };
}
From source file:net.sourceforge.docfetcher.parse.PDFParser.java
License:Open Source License
public Document parse(File file) throws ParseException {
    PDDocument pdfDoc = null;
    try {
        // Check if PDF file is encrypted
        pdfDoc = PDDocument.load(file);
        if (pdfDoc.isEncrypted()) {
            try {
                pdfDoc.openProtection(new StandardDecryptionMaterial(""));
            } catch (Exception e) {
                throw new ParseException(file, Msg.no_extraction_permission.value());
            }
        }

        // Get tags and contents
        PDFTextStripper stripper = new PDFTextStripper();
        StringWriter writer = new StringWriter();
        stripper.writeText(pdfDoc, writer);
        DocFetcher.getInstance().setExceptionHandlerEnabled(true);
        PDDocumentInformation pdInfo = pdfDoc.getDocumentInformation();
        String[] metaData = new String[] { pdInfo.getTitle(), pdInfo.getAuthor(), pdInfo.getSubject(),
                pdInfo.getKeywords(), };
        for (String field : metaData)
            if (field != null)
                writer.append(" ").append(field); //$NON-NLS-1$
        return new Document(file, metaData[0], writer.getBuffer()).addAuthor(metaData[1]);
    } catch (IOException e) {
        throw new ParseException(file, Msg.file_not_readable.value());
    } finally {
        if (pdfDoc != null) {
            try {
                pdfDoc.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
From source file:net.sourceforge.vaticanfetcher.model.parse.PdfParser.java
License:Open Source License
@Override
protected ParseResult parse(@NotNull InputStream in, @NotNull final ParseContext context)
        throws ParseException {
    PDDocument pdfDoc = null;
    try {
        /* TODO post-release-1.1: check if 'force' argument in PDDocument/Stripper increases
           number of parsed PDF files */
        pdfDoc = PDDocument.load(in, true);
        PDDocumentInformation pdInfo;
        final int pageCount;
        try {
            pdInfo = pdfDoc.getDocumentInformation();
            pageCount = pdfDoc.getNumberOfPages();
        } catch (ClassCastException e) {
            // Bug #3529070 and #3528345
            throw new ParseException(e);
        }
        StringWriter writer = new StringWriter();

        /*
         * If the PDF file is encrypted, the PDF stripper will automatically try an empty password.
         *
         * In contrast to the paging PDF parser that is used for the preview, we do not need to call
         * setSortByPosition(true) here because the extracted text will be digested by Lucene anyway.
         */
        PDFTextStripper stripper = new PDFTextStripper() {
            protected void startPage(PDPage page) throws IOException {
                context.getReporter().subInfo(getCurrentPageNo(), pageCount);
            }

            protected void endPage(PDPage page) throws IOException {
                if (context.getCancelable().isCanceled())
                    setEndPage(0);
            }
        };
        stripper.setForceParsing(true);
        try {
            stripper.writeText(pdfDoc, writer);
        } catch (RuntimeException e) {
            /* PDFTextStripper.writeText can throw various RuntimeExceptions,
               see bugs #3446010, #3448272, #3444887. */
            throw new ParseException(e);
        }
        return new ParseResult(writer.getBuffer()).setTitle(pdInfo.getTitle()).addAuthor(pdInfo.getAuthor())
                .addMiscMetadata(pdInfo.getSubject()).addMiscMetadata(pdInfo.getKeywords());
    } catch (IOException e) {
        if (e.getCause() instanceof CryptographyException)
            throw new ParseException(Msg.doc_pw_protected.get());
        throw new ParseException(e);
    } finally {
        close(pdfDoc);
    }
}
From source file:net.yacy.cider.parser.idiom.pdfIdiom.java
License:Open Source License
@Override
public Model parse(DataSource source) throws ParserException {

    // create an empty Model
    Model model = ModelFactory.createDefaultModel();
    Resource resource = source.hasURI() ? model.createResource(source.getURI().toNormalform(true, true))
            : model.createResource();

    // open pdf document
    final PDDocument theDocument;
    final PDFParser parser;
    try {
        parser = new PDFParser(source.getStream());
        parser.parse();
        theDocument = parser.getPDDocument();
    } catch (IOException e) {
        log.error(e.getMessage(), e);
        throw new ParserException(e.getMessage(), source.getURI());
    }

    if (theDocument.isEncrypted()) {
        try {
            theDocument.openProtection(new StandardDecryptionMaterial(""));
        } catch (BadSecurityHandlerException e) {
            throw new ParserException("PDF Encrypted (BadSecurityHandlerException): " + e.getMessage(),
                    source.getURI(), e);
        } catch (IOException e) {
            throw new ParserException("PDF Encrypted (IOException): " + e.getMessage(), source.getURI(), e);
        } catch (CryptographyException e) {
            throw new ParserException("PDF Encrypted (CryptographyException): " + e.getMessage(),
                    source.getURI(), e);
        }
        final AccessPermission perm = theDocument.getCurrentAccessPermission();
        if (perm == null || !perm.canExtractContent())
            throw new ParserException("PDF cannot be decrypted", source.getURI());
    }

    // get metadata
    final PDDocumentInformation theDocInfo = theDocument.getDocumentInformation();
    String docTitle = null, docSubject = null, docAuthor = null, docKeywordStr = null;
    if (theDocInfo != null) {
        docTitle = theDocInfo.getTitle();
        docSubject = theDocInfo.getSubject();
        docAuthor = theDocInfo.getAuthor();
        docKeywordStr = theDocInfo.getKeywords();
    }

    if (docAuthor != null && docAuthor.length() > 0) {
        resource.addProperty(VCARD.FN, docAuthor);
        resource.addProperty(DC.creator, docAuthor);
    }
    if (docSubject != null && docSubject.length() > 0) {
        resource.addProperty(DC.subject, docSubject);
    }
    if (docTitle != null && docTitle.length() > 0) {
        resource.addProperty(DC.title, docTitle);
    }
    String[] docKeywords = null;
    if (docKeywordStr != null && docKeywordStr.length() > 0) {
        docKeywords = docKeywordStr.split(" |,");
        resource.addProperty(DC.coverage, concat(docKeywords));
    }

    // get the content
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    Writer writer;
    try {
        writer = new OutputStreamWriter(baos, "UTF-8");
    } catch (UnsupportedEncodingException e1) {
        writer = new OutputStreamWriter(baos);
    }
    try {
        final PDFTextStripper stripper = new PDFTextStripper();
        stripper.writeText(theDocument, writer);
        theDocument.close();
        writer.close();
    } catch (IOException e) {
        if (writer != null)
            try {
                writer.close();
            } catch (final Exception ex) {
            }
        throw new ParserException("PDF content reader", source.getURI(), e);
    }

    String content;
    try {
        content = new String(baos.toByteArray(), "UTF-8");
    } catch (UnsupportedEncodingException e) {
        content = new String(baos.toByteArray());
    }
    if (content != null && content.length() > 0) {
        resource.addProperty(CIDER.data_content_text, content);
    }

    return model;
}
From source file:net.yacy.document.parser.pdfParser.java
License:Open Source License
@Override
public Document[] parse(final AnchorURL location, final String mimeType, final String charset,
        final VocabularyScraper scraper, final int timezoneOffset, final InputStream source)
        throws Parser.Failure, InterruptedException {

    // check memory for parser
    if (!MemoryControl.request(200 * 1024 * 1024, false))
        throw new Parser.Failure("Not enough Memory available for pdf parser: " + MemoryControl.available(),
                location);

    // create a pdf parser
    PDDocument pdfDoc;
    //final PDFParser pdfParser;
    try {
        Thread.currentThread().setPriority(Thread.MIN_PRIORITY); // the pdfparser is a big pain
        pdfDoc = PDDocument.load(source);
        //PDFParser pdfParser = new PDFParser(source);
        //pdfParser.parse();
        //pdfDoc = pdfParser.getPDDocument();
    } catch (final IOException e) {
        throw new Parser.Failure(e.getMessage(), location);
    } finally {
        Thread.currentThread().setPriority(Thread.NORM_PRIORITY);
    }

    if (pdfDoc.isEncrypted()) {
        try {
            pdfDoc.openProtection(new StandardDecryptionMaterial(""));
        } catch (final BadSecurityHandlerException e) {
            try {
                pdfDoc.close();
            } catch (final IOException ee) {
            }
            throw new Parser.Failure("Document is encrypted (1): " + e.getMessage(), location);
        } catch (final IOException e) {
            try {
                pdfDoc.close();
            } catch (final IOException ee) {
            }
            throw new Parser.Failure("Document is encrypted (2): " + e.getMessage(), location);
        } catch (final CryptographyException e) {
            try {
                pdfDoc.close();
            } catch (final IOException ee) {
            }
            throw new Parser.Failure("Document is encrypted (3): " + e.getMessage(), location);
        }
        final AccessPermission perm = pdfDoc.getCurrentAccessPermission();
        if (perm == null || !perm.canExtractContent()) {
            try {
                pdfDoc.close();
            } catch (final IOException ee) {
            }
            throw new Parser.Failure("Document is encrypted and cannot be decrypted", location);
        }
    }

    // extracting some metadata
    PDDocumentInformation info = pdfDoc.getDocumentInformation();
    String docTitle = null, docSubject = null, docAuthor = null, docPublisher = null, docKeywordStr = null;
    Date docDate = new Date();
    if (info != null) {
        docTitle = info.getTitle();
        docSubject = info.getSubject();
        docAuthor = info.getAuthor();
        docPublisher = info.getProducer();
        if (docPublisher == null || docPublisher.isEmpty())
            docPublisher = info.getCreator();
        docKeywordStr = info.getKeywords();
        try {
            if (info.getModificationDate() != null)
                docDate = info.getModificationDate().getTime();
        } catch (IOException e) {
        }
        // unused:
        // info.getTrapped());
    }
    info = null;

    if (docTitle == null || docTitle.isEmpty()) {
        docTitle = MultiProtocolURL.unescape(location.getFileName());
    }
    if (docTitle == null) {
        docTitle = docSubject;
    }
    String[] docKeywords = null;
    if (docKeywordStr != null) {
        docKeywords = docKeywordStr.split(" |,");
    }

    Collection<AnchorURL>[] pdflinks = null;
    Document[] result = null;
    try {
        // get the links
        pdflinks = extractPdfLinks(pdfDoc);

        // get the fulltext (either per document or for each page)
        final PDFTextStripper stripper = new PDFTextStripper("UTF-8");

        if (individualPages) {
            // this is a hack which stores individual pages of the source pdf into individual index documents
            // the new documents will get a virtual link with a post argument page=X appended to the original url

            // collect text
            int pagecount = pdfDoc.getNumberOfPages();
            String[] pages = new String[pagecount];
            for (int page = 1; page <= pagecount; page++) {
                stripper.setStartPage(page);
                stripper.setEndPage(page);
                pages[page - 1] = stripper.getText(pdfDoc);
                //System.out.println("PAGE " + page + ": " + pages[page - 1]);
            }

            // create individual documents for each page
            assert pages.length == pdflinks.length : "pages.length = " + pages.length + ", pdflinks.length = "
                    + pdflinks.length;
            result = new Document[Math.min(pages.length, pdflinks.length)];
            String loc = location.toNormalform(true);
            for (int page = 0; page < result.length; page++) {
                result[page] = new Document(
                        new AnchorURL(loc + (loc.indexOf('?') > 0 ? '&' : '?') + individualPagePropertyname
                                + '=' + (page + 1)), // these are virtual new pages; we cannot combine them with '#' as that would be removed when computing the urlhash
                        mimeType, "UTF-8", this, null, docKeywords, singleList(docTitle), docAuthor,
                        docPublisher, null, null, 0.0f, 0.0f,
                        pages == null || page > pages.length ? new byte[0] : UTF8.getBytes(pages[page]),
                        pdflinks == null || page >= pdflinks.length ? null : pdflinks[page], null, null, false,
                        docDate);
            }
        } else {
            // collect the whole text at once
            final CharBuffer writer = new CharBuffer(odtParser.MAX_DOCSIZE);
            byte[] contentBytes = new byte[0];
            stripper.setEndPage(3); // get first 3 pages (always)
            writer.append(stripper.getText(pdfDoc));
            contentBytes = writer.getBytes(); // remember text in case of interrupting thread

            if (pdfDoc.getNumberOfPages() > 3) { // spare creating/starting thread if all pages read
                stripper.setStartPage(4); // continue with page 4 (terminated, resulting in no text)
                stripper.setEndPage(Integer.MAX_VALUE); // set to default

                // we start the pdf parsing in a separate thread to ensure that it can be terminated
                final PDDocument pdfDocC = pdfDoc;
                final Thread t = new Thread() {
                    @Override
                    public void run() {
                        Thread.currentThread().setName("pdfParser.getText:" + location);
                        try {
                            writer.append(stripper.getText(pdfDocC));
                        } catch (final Throwable e) {
                        }
                    }
                };
                t.start();
                t.join(3000); // pdfbox likes to forget to terminate ... (quite often)
                if (t.isAlive())
                    t.interrupt();
            }
            contentBytes = writer.getBytes(); // get final text before closing writer

            Collection<AnchorURL> pdflinksCombined = new HashSet<AnchorURL>();
            for (Collection<AnchorURL> pdflinksx : pdflinks)
                if (pdflinksx != null)
                    pdflinksCombined.addAll(pdflinksx);
            result = new Document[] { new Document(location, mimeType, "UTF-8", this, null, docKeywords,
                    singleList(docTitle), docAuthor, docPublisher, null, null, 0.0f, 0.0f, contentBytes,
                    pdflinksCombined, null, null, false, docDate) };
        }
    } catch (final Throwable e) {
        //close the writer (in finally)
        //throw new Parser.Failure(e.getMessage(), location);
    } finally {
        try {
            pdfDoc.close();
        } catch (final Throwable e) {
        }
    }

    // clear resources in pdfbox. they say that is resolved but it's not. see:
    // https://issues.apache.org/jira/browse/PDFBOX-313
    // https://issues.apache.org/jira/browse/PDFBOX-351
    // https://issues.apache.org/jira/browse/PDFBOX-441
    // pdfbox still generates an enormous number of object allocations and does not delete them
    // the following Objects are statically stored and never flushed:
    // COSFloat, COSArray, COSInteger, COSObjectKey, COSObject, COSDictionary,
    // COSStream, COSString, COSName, COSDocument, COSInteger[], COSNull
    // the great number of these objects can easily be seen in Java Visual VM
    // we try to get this shit out of the memory here by forced clear calls, hope the best the rubbish gets out.
    pdfDoc = null;
    clean_up_idiotic_PDFParser_font_cache_which_eats_up_tons_of_megabytes();

    return result;
}