List of usage examples for the method org.apache.pdfbox.pdmodel.PDDocument.getDocumentInformation()
public PDDocumentInformation getDocumentInformation()
From source file:PDFConverter.java
License:Apache License
/** * Implementation is informed by PDFBox authors. * * @param doc/*from www . j a va2 s . c o m*/ * @return * @throws IOException */ @Override public synchronized ConvertedDocument convert(java.io.File doc) throws IOException { /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** * Adapted from LucenePDFDocument.java from PDFBox lucene project * * This class is used to create a document for the lucene search engine. * This should easily plug into the IndexHTML or IndexFiles that comes * with the lucene project. This class will populate the following * fields. 
* <table> <tr> <th>Lucene Field Name</th> <th>Description</th> </tr> * <tr> * <td>path</td> <td>File system path if loaded from a file</td> </tr> * <tr> * <td>url</td> <td>URL to PDF document</td> </tr> <tr> * <td>contents</td> * <td>Entire contents of PDF document, indexed but not stored</td> * </tr> * <tr> <td>summary</td> <td>First 500 characters of content</td> </tr> * <tr> * <td>modified</td> <td>The modified date/time according to the url or * path</td> </tr> <tr> <td>uid</td> <td>A unique identifier for the * Lucene document.</td> </tr> <tr> <td>CreationDate</td> <td>From PDF * meta-data if available</td> </tr> <tr> <td>Creator</td> <td>From PDF * meta-data if available</td> </tr> <tr> <td>Keywords</td> <td>From PDF * meta-data if available</td> </tr> <tr> <td>ModificationDate</td> * <td>From PDF meta-data if available</td> </tr> <tr> <td>Producer</td> * <td>From PDF meta-data if available</td> </tr> <tr> <td>Subject</td> * <td>From PDF meta-data if available</td> </tr> <tr> <td>Trapped</td> * <td>From PDF meta-data if available</td> </tr> <tr> * <td>Encrypted</td> <td>From PDF meta-data if available</td> </tr> * </table> * * @author <a href="mailto:ben@benlitchfield.com">Ben Litchfield</a> * @version $Revision: 1.23 $ * * @throws IOException If there is an error parsing the document. */ PDDocument pdfDocument = null; ConvertedDocument textdoc = new ConvertedDocument(doc); try { pdfDocument = PDDocument.load(doc); if (pdfDocument.isEncrypted()) { //Just try using the default password and move on // Even if the doc is encrypted, apparently you can try. Throw exception if it fails. textdoc.addProperty("encrypted", "YES"); } //create a writer where to append the text content. 
StringWriter writer = new StringWriter(); stripper.resetEngine(); stripper.writeText(pdfDocument, writer); PDDocumentInformation info = pdfDocument.getDocumentInformation(); if (info != null) { textdoc.addAuthor(info.getAuthor()); try { textdoc.addCreateDate(info.getCreationDate()); } catch (IOException io) { //ignore, bad date but continue with indexing } textdoc.addProperty("creator_tool", info.getCreator()); textdoc.addProperty("keywords", info.getKeywords()); /* try { metadata.add("ModificationDate", info.getModificationDate()); } catch (IOException io) { //ignore, bad date but continue with indexing } */ //metadata.add("Producer", info.getProducer()); textdoc.addProperty("subject", info.getSubject()); String ttl = info.getTitle(); if (ttl == null || "untitled".equalsIgnoreCase(ttl)) { ttl = textdoc.filename; } textdoc.addTitle(ttl); // metadata.add("Trapped", info.getTrapped()); // TODO: Character set is what? textdoc.setEncoding("UTF-8"); } // Note: the buffer to string operation is costless; // the char array value of the writer buffer and the content string // is shared as long as the buffer content is not modified, which will // not occur here. textdoc.setText(writer.getBuffer().toString()); return textdoc; } finally { if (pdfDocument != null) { pdfDocument.close(); } } }
From source file:PDFExtractMetadata.java
License:Apache License
/** * This is the main method./*from w ww. j a v a 2s .c o m*/ * * @param args The command line arguments. * * @throws IOException If there is an error parsing the document. * @throws XmpParsingException */ public static void main(String[] args) throws IOException, XmpParsingException { if (args.length != 1) { usage(); System.exit(1); } else { PDDocument document = null; try { document = PDDocument.load(new File(args[0])); PDDocumentCatalog catalog = document.getDocumentCatalog(); PDMetadata meta = catalog.getMetadata(); if (meta != null) { DomXmpParser xmpParser = new DomXmpParser(); try { XMPMetadata metadata = xmpParser.parse(meta.createInputStream()); DublinCoreSchema dc = metadata.getDublinCoreSchema(); if (dc != null) { display("Title:", dc.getTitle()); display("Description:", dc.getDescription()); listString("Creators: ", dc.getCreators()); listCalendar("Dates:", dc.getDates()); listString("Subjects:", dc.getSubjects()); } AdobePDFSchema pdf = metadata.getAdobePDFSchema(); if (pdf != null) { display("Keywords:", pdf.getKeywords()); display("PDF Version:", pdf.getPDFVersion()); display("PDF Producer:", pdf.getProducer()); } XMPBasicSchema basic = metadata.getXMPBasicSchema(); if (basic != null) { display("Create Date:", basic.getCreateDate()); display("Modify Date:", basic.getModifyDate()); display("Creator Tool:", basic.getCreatorTool()); } } catch (XmpParsingException e) { System.err.println("An error ouccred when parsing the meta data: " + e.getMessage()); } } else { // The pdf doesn't contain any metadata, try to use the // document information instead PDDocumentInformation information = document.getDocumentInformation(); if (information != null) { showDocumentInformation(information); } } } finally { if (document != null) { document.close(); } } } }
From source file:adams.flow.transformer.PDFMetaData.java
License:Open Source License
/** * Executes the flow item./*from w w w. j a v a 2 s . co m*/ * * @return null if everything is fine, otherwise error message */ @Override protected String doExecute() { String result; File file; SpreadSheet sheet; PDDocument document; PDDocumentInformation info; Row row; Set<String> keys; result = null; // get file if (m_InputToken.getPayload() instanceof File) file = (File) m_InputToken.getPayload(); else file = new PlaceholderFile((String) m_InputToken.getPayload()); sheet = new DefaultSpreadSheet(); sheet.setDataRowClass(SparseDataRow.class); sheet.setName("Meta-Data: " + file.getAbsolutePath()); try { row = sheet.addRow(); document = PDDocument.load(file.getAbsoluteFile()); info = document.getDocumentInformation(); addCell(row, "Title", info.getTitle()); addCell(row, "Subject", info.getSubject()); addCell(row, "Author", info.getAuthor()); addCell(row, "Keywords", info.getKeywords()); addCell(row, "Producer", info.getProducer()); addCell(row, "Creation Date", info.getCreationDate()); addCell(row, "Modification Date", info.getModificationDate()); addCell(row, "Creator", info.getCreator()); addCell(row, "Trapped", info.getTrapped()); keys = info.getMetadataKeys(); for (String key : keys) addCell(row, "Meta-" + key, info.getCustomMetadataValue(key)); } catch (Exception e) { result = handleException("Failed to extract meta-data: ", e); } if (result == null) m_OutputToken = new Token(sheet); return result; }
From source file:au.org.alfred.icu.pdf.services.factories.tests.PDFBoxPrintingFactoryTest.java
/**
 * Verifies that {@code ICUDischargeSummaryFactory.addDocumentInformation} stores
 * the supplied author in the document information of the PDF.
 *
 * @throws IOException if the document cannot be created or closed
 */
@Test
public void canSetDocumentInformation() throws IOException {
    PDDocument pdf = new PDDocument();
    try {
        String title = "PMIID";
        String subject = "VISIT_NUMBER";
        String keywords = "VISIT_NUMBER";
        String creator = "USER_NAME";
        String author = "USER_NAME";
        ICUDischargeSummaryFactory.addDocumentInformation(pdf, title, subject, keywords, creator, author);
        // Compare from the known non-null side so a missing author is reported as a
        // failed assertion instead of a NullPointerException.
        assertTrue(author.equals(pdf.getDocumentInformation().getAuthor()));
    } finally {
        // The document was previously left open.
        pdf.close();
    }
}
From source file:com.fangxin365.core.utils.PDFMerger.java
License:Apache License
/** * append all pages from source to destination. * // w w w . ja va 2 s .c om * @param destination * the document to receive the pages * @param source * the document originating the new pages * * @throws IOException * If there is an error accessing data from either document. */ public void appendDocument(PDDocument destination, PDDocument source) throws IOException { if (destination.isEncrypted()) { System.out.println("Error: destination PDF is encrypted, can't append encrypted PDF documents."); } if (source.isEncrypted()) { System.out.println("Error: source PDF is encrypted, can't append encrypted PDF documents."); } PDDocumentInformation destInfo = destination.getDocumentInformation(); PDDocumentInformation srcInfo = source.getDocumentInformation(); destInfo.getDictionary().mergeInto(srcInfo.getDictionary()); PDDocumentCatalog destCatalog = destination.getDocumentCatalog(); PDDocumentCatalog srcCatalog = source.getDocumentCatalog(); // use the highest version number for the resulting pdf float destVersion = destination.getDocument().getVersion(); float srcVersion = source.getDocument().getVersion(); if (destVersion < srcVersion) { destination.getDocument().setVersion(srcVersion); } if (destCatalog.getOpenAction() == null) { destCatalog.setOpenAction(srcCatalog.getOpenAction()); } // maybe there are some shared resources for all pages COSDictionary srcPages = (COSDictionary) srcCatalog.getCOSDictionary().getDictionaryObject(COSName.PAGES); COSDictionary srcResources = (COSDictionary) srcPages.getDictionaryObject(COSName.RESOURCES); COSDictionary destPages = (COSDictionary) destCatalog.getCOSDictionary().getDictionaryObject(COSName.PAGES); COSDictionary destResources = (COSDictionary) destPages.getDictionaryObject(COSName.RESOURCES); if (srcResources != null) { if (destResources != null) { destResources.mergeInto(srcResources); } else { destPages.setItem(COSName.RESOURCES, srcResources); } } PDFCloneUtility cloner = new PDFCloneUtility(destination); try { 
PDAcroForm destAcroForm = destCatalog.getAcroForm(); PDAcroForm srcAcroForm = srcCatalog.getAcroForm(); if (destAcroForm == null) { cloner.cloneForNewDocument(srcAcroForm); destCatalog.setAcroForm(srcAcroForm); } else { if (srcAcroForm != null) { mergeAcroForm(cloner, destAcroForm, srcAcroForm); } } } catch (Exception e) { // if we are not ignoring exceptions, we'll re-throw this if (!ignoreAcroFormErrors) { throw (IOException) e; } } COSArray destThreads = (COSArray) destCatalog.getCOSDictionary().getDictionaryObject(COSName.THREADS); COSArray srcThreads = (COSArray) cloner .cloneForNewDocument(destCatalog.getCOSDictionary().getDictionaryObject(COSName.THREADS)); if (destThreads == null) { destCatalog.getCOSDictionary().setItem(COSName.THREADS, srcThreads); } else { destThreads.addAll(srcThreads); } PDDocumentNameDictionary destNames = destCatalog.getNames(); PDDocumentNameDictionary srcNames = srcCatalog.getNames(); if (srcNames != null) { if (destNames == null) { destCatalog.getCOSDictionary().setItem(COSName.NAMES, cloner.cloneForNewDocument(srcNames)); } else { cloner.cloneMerge(srcNames, destNames); } } PDDocumentOutline destOutline = destCatalog.getDocumentOutline(); PDDocumentOutline srcOutline = srcCatalog.getDocumentOutline(); if (srcOutline != null) { if (destOutline == null) { PDDocumentOutline cloned = new PDDocumentOutline( (COSDictionary) cloner.cloneForNewDocument(srcOutline)); destCatalog.setDocumentOutline(cloned); } else { PDOutlineItem first = srcOutline.getFirstChild(); if (first != null) { PDOutlineItem clonedFirst = new PDOutlineItem( (COSDictionary) cloner.cloneForNewDocument(first)); destOutline.appendChild(clonedFirst); } } } String destPageMode = destCatalog.getPageMode(); String srcPageMode = srcCatalog.getPageMode(); if (destPageMode == null) { destCatalog.setPageMode(srcPageMode); } COSDictionary destLabels = (COSDictionary) destCatalog.getCOSDictionary() .getDictionaryObject(COSName.PAGE_LABELS); COSDictionary srcLabels = 
(COSDictionary) srcCatalog.getCOSDictionary() .getDictionaryObject(COSName.PAGE_LABELS); if (srcLabels != null) { int destPageCount = destination.getNumberOfPages(); COSArray destNums = null; if (destLabels == null) { destLabels = new COSDictionary(); destNums = new COSArray(); destLabels.setItem(COSName.NUMS, destNums); destCatalog.getCOSDictionary().setItem(COSName.PAGE_LABELS, destLabels); } else { destNums = (COSArray) destLabels.getDictionaryObject(COSName.NUMS); } COSArray srcNums = (COSArray) srcLabels.getDictionaryObject(COSName.NUMS); if (srcNums != null) { for (int i = 0; i < srcNums.size(); i += 2) { COSNumber labelIndex = (COSNumber) srcNums.getObject(i); long labelIndexValue = labelIndex.intValue(); destNums.add(COSInteger.get(labelIndexValue + destPageCount)); destNums.add(cloner.cloneForNewDocument(srcNums.getObject(i + 1))); } } } COSStream destMetadata = (COSStream) destCatalog.getCOSDictionary().getDictionaryObject(COSName.METADATA); COSStream srcMetadata = (COSStream) srcCatalog.getCOSDictionary().getDictionaryObject(COSName.METADATA); if (destMetadata == null && srcMetadata != null) { PDStream newStream = new PDStream(destination, srcMetadata.getUnfilteredStream(), false); newStream.getStream().mergeInto(srcMetadata); newStream.addCompression(); destCatalog.getCOSDictionary().setItem(COSName.METADATA, newStream); } // finally append the pages @SuppressWarnings("unchecked") List<PDPage> pages = srcCatalog.getAllPages(); Iterator<PDPage> pageIter = pages.iterator(); while (pageIter.hasNext()) { PDPage page = pageIter.next(); PDPage newPage = new PDPage((COSDictionary) cloner.cloneForNewDocument(page.getCOSDictionary())); newPage.setCropBox(page.findCropBox()); newPage.setMediaBox(page.findMediaBox()); newPage.setRotation(page.findRotation()); destination.addPage(newPage); } }
From source file:com.formkiq.core.service.generator.pdfbox.PdfEditorServiceImpl.java
License:Apache License
/**
 * Build {@link FormJSON} from {@link PDDocument}.
 *
 * The form title is the PDF's document-information title. When that is empty,
 * the texts whose (integer-truncated) font size equals the largest font size
 * are concatenated instead. When neither yields a title, "Untitled" is used.
 *
 * @param doc {@link PDDocument}
 * @param texts {@link List} of {@link PdfTextField}
 * @return {@link FormJSON}
 */
private FormJSON buildFormJSON(final PDDocument doc, final List<PdfTextField> texts) {

    String title = doc.getDocumentInformation().getTitle();

    // Only derive a title from the texts when there are any: max() on an empty
    // stream yields an empty Optional and get() would throw NoSuchElementException.
    if (isEmpty(title) && texts != null && !texts.isEmpty()) {

        float maxFont = texts.stream().map(s -> Float.valueOf(s.getFontSize())).max(Float::compare).get()
                .floatValue();

        StringBuilder sb = new StringBuilder();

        for (PdfTextField text : texts) {
            if ((int) text.getFontSize() == (int) maxFont) {
                sb.append(text.getText()).append(" ");
            }
        }

        title = sb.toString();
    }

    title = isEmpty(title) ? "Untitled" : title.trim();

    return ObjectBuilder.buildFormJSON(title);
}
From source file:com.github.joemcintyre.pdffinish.PDFFinish.java
License:Open Source License
/** * Show metadata from PDF document./*from w ww. j a v a 2s .com*/ * * @param document Loaded PDF document. */ private static void showMetadata(PDDocument document) throws IOException { PDDocumentInformation info = document.getDocumentInformation(); System.out.println("Title: " + info.getTitle()); System.out.println("Author: " + info.getAuthor()); System.out.println("Subject: " + info.getSubject()); System.out.println("Keywords: " + info.getKeywords()); System.out.println("Creator: " + info.getCreator()); System.out.println("Producer: " + info.getProducer()); System.out.println("Creation Date: " + info.getCreationDate()); System.out.println("Modification Date: " + info.getModificationDate()); }
From source file:com.github.joemcintyre.pdffinish.PDFFinish.java
License:Open Source License
/**
 * Copies the configured title, author, subject and keywords into the document
 * information of the PDF. A value that is {@code null} leaves the corresponding
 * entry untouched.
 *
 * @param document Loaded PDF document.
 * @throws IOException declared by the signature
 */
private void updateMetadata(PDDocument document) throws IOException {
    final PDDocumentInformation docInfo = document.getDocumentInformation();

    if (null != title) {
        docInfo.setTitle(title);
    }

    if (null != author) {
        docInfo.setAuthor(author);
    }

    if (null != subject) {
        docInfo.setSubject(subject);
    }

    if (null != keywords) {
        docInfo.setKeywords(keywords);
    }
}
From source file:com.jaeksoft.searchlib.parser.PdfParser.java
License:Open Source License
/**
 * Adds the PDF's metadata to the parser result: the document-information
 * entries (when present), the page count and the catalog language.
 *
 * @param result the result item receiving the fields
 * @param pdf the loaded PDF document
 * @throws IOException declared by the signature
 */
private void extractMetaData(ParserResultItem result, PDDocument pdf) throws IOException {
    final PDDocumentInformation docInfo = pdf.getDocumentInformation();
    if (docInfo != null) {
        result.addField(ParserFieldEnum.title, docInfo.getTitle());
        result.addField(ParserFieldEnum.subject, docInfo.getSubject());
        result.addField(ParserFieldEnum.author, docInfo.getAuthor());
        result.addField(ParserFieldEnum.producer, docInfo.getProducer());
        result.addField(ParserFieldEnum.keywords, docInfo.getKeywords());

        // Dates are added only when they could be rendered.
        final String created = getDate(getCreationDate(docInfo));
        if (created != null) {
            result.addField(ParserFieldEnum.creation_date, created);
        }

        final String modified = getDate(getModificationDate(docInfo));
        if (modified != null) {
            result.addField(ParserFieldEnum.modification_date, modified);
        }
    }

    final int pageCount = pdf.getNumberOfPages();
    result.addField(ParserFieldEnum.number_of_pages, pageCount);

    final PDDocumentCatalog docCatalog = pdf.getDocumentCatalog();
    if (docCatalog != null) {
        result.addField(ParserFieldEnum.language, docCatalog.getLanguage());
    }
}
From source file:com.openkm.util.metadata.MetadataExtractor.java
License:Open Source License
/**
 * Extract metadata from PDF.
 *
 * Loads the document from the stream, copies the page count and the
 * document-information entries into a {@link PdfMetadata} and closes the loaded
 * document again. The input stream itself is not closed here.
 *
 * @param is stream providing the PDF content
 * @return the extracted metadata
 * @throws IOException if the PDF cannot be loaded, read or closed
 */
public static PdfMetadata pdfExtractor(InputStream is) throws IOException {
    PDDocument doc = PDDocument.load(is);
    try {
        PDDocumentInformation info = doc.getDocumentInformation();
        PdfMetadata md = new PdfMetadata();

        md.setNumberOfPages(doc.getNumberOfPages());
        md.setTitle(info.getTitle());
        md.setAuthor(info.getAuthor());
        md.setSubject(info.getSubject());
        md.setKeywords(info.getKeywords());
        md.setCreator(info.getCreator());
        md.setProducer(info.getProducer());
        md.setTrapped(info.getTrapped());
        md.setCreationDate(info.getCreationDate());
        md.setModificationDate(info.getModificationDate());

        log.info("pdfExtractor: {}", md);
        return md;
    } finally {
        // The loaded document was previously never closed.
        doc.close();
    }
}