List of usage examples for org.apache.pdfbox.pdmodel.PDDocumentInformation#getCustomMetadataValue
public String getCustomMetadataValue(String fieldName)
From source file:adams.flow.transformer.PDFMetaData.java
License:Open Source License
/** * Executes the flow item./*from ww w .j a v a 2 s . c om*/ * * @return null if everything is fine, otherwise error message */ @Override protected String doExecute() { String result; File file; SpreadSheet sheet; PDDocument document; PDDocumentInformation info; Row row; Set<String> keys; result = null; // get file if (m_InputToken.getPayload() instanceof File) file = (File) m_InputToken.getPayload(); else file = new PlaceholderFile((String) m_InputToken.getPayload()); sheet = new DefaultSpreadSheet(); sheet.setDataRowClass(SparseDataRow.class); sheet.setName("Meta-Data: " + file.getAbsolutePath()); try { row = sheet.addRow(); document = PDDocument.load(file.getAbsoluteFile()); info = document.getDocumentInformation(); addCell(row, "Title", info.getTitle()); addCell(row, "Subject", info.getSubject()); addCell(row, "Author", info.getAuthor()); addCell(row, "Keywords", info.getKeywords()); addCell(row, "Producer", info.getProducer()); addCell(row, "Creation Date", info.getCreationDate()); addCell(row, "Modification Date", info.getModificationDate()); addCell(row, "Creator", info.getCreator()); addCell(row, "Trapped", info.getTrapped()); keys = info.getMetadataKeys(); for (String key : keys) addCell(row, "Meta-" + key, info.getCustomMetadataValue(key)); } catch (Exception e) { result = handleException("Failed to extract meta-data: ", e); } if (result == null) m_OutputToken = new Token(sheet); return result; }
From source file:com.sustainalytics.crawlerfilter.PDFTitleGeneration.java
License:Apache License
/**
 * Extracts the creation date of a PDF file, falling back to the custom
 * {@code "customdate"} metadata entry (expected as {@code MM/dd/yyyy}) when
 * the document has no usable creation date.
 * <p>
 * Encrypted documents are decrypted with an empty password first; a failed
 * decryption is logged and extraction is still attempted. Problems while
 * reading or closing the file are logged, not thrown.
 *
 * @param file the PDF file to inspect
 * @return the date as {@code year-month-day} (month is 1-based, no zero
 *         padding), or an empty string if no date could be determined
 */
public static String extractDate(File file) {
    PDDocument document = null;
    String creationDateMetaData = "";
    try {
        document = PDDocument.load(file.toString());

        /* Work around for encrypted PDFs: try the empty password. */
        if (document.isEncrypted()) {
            logger.info("File " + file.getName() + "is encrypted. Trying to decrypt...");
            try {
                document.decrypt("");
                document.setAllSecurityToBeRemoved(true);
                logger.info("File " + file.getName() + "successfully decrypted!");
            } catch (CryptographyException e) {
                logger.info("Error decrypting file " + file.getName());
            }
        }

        /* We are only interested in the date data. */
        PDDocumentInformation info = document.getDocumentInformation();
        creationDateMetaData = formatCalendarDate(info.getCreationDate());

        /* No creation date found, but sometimes there is a custom date. */
        if (creationDateMetaData.isEmpty()) {
            creationDateMetaData =
                    formatCalendarDate(parseCustomDate(info.getCustomMetadataValue("customdate")));
        }
    } catch (IOException e) {
        /* The PDF could not be read, it may be damaged. */
        logger.info("Error processing file " + file.getName());
    } finally {
        /* Close whatever was opened, even if decryption or extraction failed. */
        if (document != null) {
            try {
                document.close();
                logger.info("File " + file.getName() + " is closed successfully!");
            } catch (IOException e) {
                logger.info("Error closing file " + file.getName());
            }
        }
    }
    return creationDateMetaData;
}

/**
 * Formats a calendar as {@code year-month-day} without zero padding.
 *
 * @param calendar the date to format, may be null
 * @return the formatted date, or an empty string if the calendar is null
 *         or reports year 0
 */
private static String formatCalendarDate(Calendar calendar) {
    if (calendar == null) {
        return "";
    }
    int year = calendar.get(Calendar.YEAR);
    if (year == 0) {
        return "";
    }
    // Calendar.MONTH is 0-based
    int month = calendar.get(Calendar.MONTH) + 1;
    int day = calendar.get(Calendar.DATE);
    return year + "-" + month + "-" + day;
}

/**
 * Parses a custom date value in {@code MM/dd/yyyy} format.
 *
 * @param customDateValue the raw metadata value, may be null
 * @return the parsed date as a calendar, or null if the value is missing
 *         or cannot be parsed
 */
private static Calendar parseCustomDate(String customDateValue) {
    if (customDateValue == null) {
        return null;
    }
    try {
        Date customDate = new SimpleDateFormat("MM/dd/yyyy").parse(customDateValue);
        Calendar calendar = Calendar.getInstance();
        calendar.setTime(customDate);
        return calendar;
    } catch (ParseException e) {
        logger.info("Error parsing date from custom date");
        return null;
    }
}
From source file:org.codelibs.fess.crawler.extractor.impl.PdfExtractor.java
License:Apache License
/**
 * Passes every entry of the document information object to
 * {@code addMetadata}, keyed by its metadata key.
 * <p>
 * Does nothing when the document has no information object.
 *
 * @param document    the PDF document to read the information from
 * @param extractData the target handed to {@code addMetadata} for each entry
 */
private void extractMetadata(final PDDocument document, final ExtractData extractData) {
    final PDDocumentInformation documentInfo = document.getDocumentInformation();
    if (documentInfo != null) {
        for (final String metadataKey : documentInfo.getMetadataKeys()) {
            addMetadata(extractData, metadataKey, documentInfo.getCustomMetadataValue(metadataKey));
        }
    }
}
From source file:org.lockss.plugin.georgthiemeverlag.GeorgThiemeVerlagPdfFilterFactory.java
License:Open Source License
@Override public void transform(ArchivalUnit au, PdfDocument pdfDocument) throws PdfException { pdfDocument.unsetModificationDate(); PdfUtil.normalizeTrailerId(pdfDocument); pdfDocument.unsetMetadata();/*www . ja v a 2s .c o m*/ PDDocumentInformation pdDocInfo = ((GtvPdfBoxDocument) pdfDocument).getPdDocumentInformation(); if (pdDocInfo.getCustomMetadataValue(GtvPdfBoxDocument.PDFDATE) != null) { pdDocInfo.setCustomMetadataValue(GtvPdfBoxDocument.PDFDATE, null); } if (pdDocInfo.getCustomMetadataValue(GtvPdfBoxDocument.PDFUSER) != null) { pdDocInfo.setCustomMetadataValue(GtvPdfBoxDocument.PDFUSER, null); } PdfStateMachineWorker worker = new PdfStateMachineWorker(); boolean anyXform = false; for (PdfPage pdfPage : pdfDocument.getPages()) { PdfTokenStream pdfTokenStream = pdfPage.getPageTokenStream(); worker.process(pdfTokenStream); if (worker.getResult()) { anyXform = true; List<PdfToken> tokens = pdfTokenStream.getTokens(); // clear tokens including text markers tokens.subList(worker.getBegin(), worker.getEnd() + 1).clear(); pdfTokenStream.setTokens(tokens); } } if (log.isDebug2()) { log.debug2("Transform: " + anyXform); } }
From source file:org.pdfmetamodifier.MetadataHelper.java
License:Apache License
/** * Convert Metadata object to list of lines. * /* w ww . ja v a2s . c o m*/ * @param metadata * Source Metadata object. * @return list of lines with Metadata representation. */ public static List<String> metadataToLineList(final PDDocumentInformation documentInformation) { final List<String> lineList = new ArrayList<>(); if (documentInformation != null) { final List<String> matadataKeys = new ArrayList<>(documentInformation.getMetadataKeys()); Collections.sort(matadataKeys); for (String key : matadataKeys) { final String value = documentInformation.getCustomMetadataValue(key); if (value != null) { lineList.add(String.format(METADATA_LINE_TEMPLATE, key, value)); } } } return lineList; }