List of usage examples for org.apache.pdfbox.pdmodel.PDDocumentInformation#getCreationDate()
public Calendar getCreationDate()
From source file:com.qwazr.extractor.parser.PdfBox.java
License:Apache License
private Calendar getModificationDate(PDDocumentInformation pdfInfo) { try {// w w w . j a v a 2 s . com return pdfInfo.getCreationDate(); } catch (IOException e) { logger.warn(e.getMessage()); return null; } }
From source file:com.qwazr.library.pdfbox.PdfBoxParser.java
License:Apache License
/**
 * Copies document-level metadata of a PDF into the given field builder.
 *
 * The MIME type is always set. The descriptive entries (title, subject, author,
 * producer, keywords, creation and modification dates) are added only when the
 * document exposes an information dictionary; the language only when it exposes
 * a catalog. The page count is always added.
 *
 * @param pdf   the PDF document to read from
 * @param metas the builder receiving the metadata fields
 */
private void extractMetaData(final PDDocument pdf, final ParserFieldsBuilder metas) {
    metas.set(MIME_TYPE, DEFAULT_MIMETYPES[0]);

    final PDDocumentInformation docInfo = pdf.getDocumentInformation();
    if (docInfo != null) {
        metas.add(TITLE, docInfo.getTitle());
        metas.add(SUBJECT, docInfo.getSubject());
        metas.add(AUTHOR, docInfo.getAuthor());
        metas.add(PRODUCER, docInfo.getProducer());
        metas.add(KEYWORDS, docInfo.getKeywords());
        metas.add(CREATION_DATE, docInfo.getCreationDate());
        metas.add(MODIFICATION_DATE, docInfo.getModificationDate());
    }

    final int pageCount = pdf.getNumberOfPages();
    metas.add(NUMBER_OF_PAGES, pageCount);

    final PDDocumentCatalog docCatalog = pdf.getDocumentCatalog();
    if (docCatalog != null) {
        metas.add(LANGUAGE, docCatalog.getLanguage());
    }
}
From source file:com.sustainalytics.crawlerfilter.PDFTitleGeneration.java
License:Apache License
/**
 * Extracts the creation date of a PDF file, falling back to the custom
 * {@code "customdate"} metadata entry (expected as {@code MM/dd/yyyy}) when the
 * document carries no creation date.
 *
 * Encrypted documents are decrypted with an empty password on a best-effort
 * basis; a decryption failure is logged and metadata extraction is still attempted.
 * The document is always closed once it has been opened.
 *
 * @param file the PDF file to inspect
 * @return the date as {@code year-month-day} without zero padding, or an empty
 *         string when no date is available or the file could not be processed
 */
public static String extractDate(File file) {
    PDDocument document = null;
    String creationDateMetaData = "";
    try {
        document = PDDocument.load(file.toString());

        if (document.isEncrypted()) {
            logger.info("File " + file.getName() + "is encrypted. Trying to decrypt...");
            try {
                document.decrypt("");
                document.setAllSecurityToBeRemoved(true);
                logger.info("File " + file.getName() + "successfully decrypted!");
            } catch (CryptographyException e) {
                logger.info("Error decrypting file " + file.getName());
            }
        }

        PDDocumentInformation info = document.getDocumentInformation();
        Calendar calendar = info.getCreationDate();

        if (calendar == null) {
            // No creation date: try the custom entry. Both a missing entry and an
            // unparseable value leave the calendar null instead of raising an NPE.
            String customDateValue = info.getCustomMetadataValue("customdate");
            if (customDateValue != null) {
                try {
                    Date customDate = new SimpleDateFormat("MM/dd/yyyy").parse(customDateValue);
                    calendar = Calendar.getInstance();
                    calendar.setTime(customDate);
                } catch (ParseException e) {
                    logger.info("Error parsing date from custom date");
                }
            }
        }

        if (calendar != null) {
            // Calendar.MONTH is zero-based, hence the +1.
            int creationYear = calendar.get(Calendar.YEAR);
            int creationMonth = calendar.get(Calendar.MONTH) + 1;
            int creationDate = calendar.get(Calendar.DATE);
            creationDateMetaData = creationYear + "-" + creationMonth + "-" + creationDate;
        }
    } catch (IOException e) {
        logger.info("Error processing file " + file.getName());
    } finally {
        // Close whenever the document was opened, including after a failed
        // decryption or a metadata read error, so the file handle is not leaked.
        if (document != null) {
            try {
                document.close();
                logger.info("File " + file.getName() + " is closed successfully!");
            } catch (IOException e) {
                logger.info("Error closing file " + file.getName());
            }
        }
    }
    return creationDateMetaData;
}
From source file:com.wintindustries.pdffilter.pdfcore.PDFTester.java
/**
 * Prints the document information entries, the page count and, when present,
 * the catalog metadata stream of a PDF to standard output, one
 * {@code key=value} pair per line.
 *
 * @param document the PDF document to describe
 * @throws IOException if reading the dates or the metadata stream fails
 */
static public void printMetadata(PDDocument document) throws IOException {
    PDDocumentInformation docInfo = document.getDocumentInformation();
    PDDocumentCatalog catalog = document.getDocumentCatalog();
    PDMetadata xmpStream = catalog.getMetadata();

    System.out.println("Page Count=" + document.getNumberOfPages());
    System.out.println("Title=" + docInfo.getTitle());
    System.out.println("Author=" + docInfo.getAuthor());
    System.out.println("Subject=" + docInfo.getSubject());
    System.out.println("Keywords=" + docInfo.getKeywords());
    System.out.println("Creator=" + docInfo.getCreator());
    System.out.println("Producer=" + docInfo.getProducer());
    System.out.println("Creation Date=" + formatDate(docInfo.getCreationDate()));
    System.out.println("Modification Date=" + formatDate(docInfo.getModificationDate()));
    System.out.println("Trapped=" + docInfo.getTrapped());

    // The metadata stream is optional; print it only when the catalog has one.
    if (xmpStream == null) {
        return;
    }
    System.out.println("Metadata=" + xmpStream.getInputStreamAsString());
}
From source file:ddf.catalog.transformer.input.pdf.PdfInputTransformer.java
License:Open Source License
/**
 * Transfers entries of the PDF information dictionary onto the metacard:
 * creation and modification dates, author, subject and keywords, plus the
 * title when {@code usePdfTitleAsTitle} is enabled.
 *
 * @param pdfDocument PDF document
 * @param metacard    A mutable metacard to add the extracted data to
 */
private void extractPdfMetadata(PDDocument pdfDocument, MetacardImpl metacard) {
    PDDocumentInformation info = pdfDocument.getDocumentInformation();

    // Dates
    setDateIfNotNull(info.getCreationDate(), metacard, Metacard.CREATED);
    setDateIfNotNull(info.getModificationDate(), metacard, Metacard.MODIFIED);

    // Title is opt-in
    if (usePdfTitleAsTitle) {
        setIfNotBlank(info.getTitle(), metacard, Metacard.TITLE);
    }

    // Remaining textual attributes
    setIfNotBlank(info.getAuthor(), metacard, Contact.CREATOR_NAME);
    setIfNotBlank(info.getSubject(), metacard, Metacard.DESCRIPTION);
    setIfNotBlank(info.getKeywords(), metacard, Topic.KEYWORD);
}
From source file:de.offis.health.icardea.cied.pdf.extractor.PDFApachePDFBoxExtractor.java
License:Apache License
/** * This method will return the key and value pairs stored in the PDF * information. It's the basic information like title, subject, author, * creator, keywords, producer (meaning application) as well as creation * and modification date. The method is provided for debugging purposes. * /* w ww.j a v a 2 s. c om*/ * @return Returns <code>key=value</code> pair line by line (using system * dependent newline). */ @SuppressWarnings("unused") private String getPdfInfo() { StringBuffer stringBuffer = new StringBuffer(); if (pdfDocument != null) { PDDocumentInformation pdfInfo = pdfDocument.getDocumentInformation(); // Title if (pdfInfo.getTitle() != null) { stringBuffer.append("Title"); stringBuffer.append("="); stringBuffer.append(pdfInfo.getTitle()); stringBuffer.append(GlobalTools.LINESEPARATOR); } // end if // Subject if (pdfInfo.getSubject() != null) { stringBuffer.append("Subject"); stringBuffer.append("="); stringBuffer.append(pdfInfo.getSubject()); stringBuffer.append(GlobalTools.LINESEPARATOR); } // end if // Keywords if (pdfInfo.getKeywords() != null) { stringBuffer.append("Keywords"); stringBuffer.append("="); stringBuffer.append(pdfInfo.getKeywords()); stringBuffer.append(GlobalTools.LINESEPARATOR); } // end if // Author if (pdfInfo.getAuthor() != null) { stringBuffer.append("Author"); stringBuffer.append("="); stringBuffer.append(pdfInfo.getAuthor()); stringBuffer.append(GlobalTools.LINESEPARATOR); } // end if // Producer if (pdfInfo.getProducer() != null) { stringBuffer.append("Producer"); stringBuffer.append("="); stringBuffer.append(pdfInfo.getProducer()); stringBuffer.append(GlobalTools.LINESEPARATOR); } // end if // Creator if (pdfInfo.getCreator() != null) { stringBuffer.append("Creator"); stringBuffer.append("="); stringBuffer.append(pdfInfo.getCreator()); stringBuffer.append(GlobalTools.LINESEPARATOR); } // end if // CreationDate try { if (pdfInfo.getCreationDate() != null) { stringBuffer.append("CreationDate"); stringBuffer.append("="); 
stringBuffer.append(GlobalTools.calendar2String(pdfInfo.getCreationDate(), GlobalTools.DATE_FORMAT_STRING_ISO8601)); stringBuffer.append(GlobalTools.LINESEPARATOR); } // end if } catch (IOException ex) { } // end try..catch // ModDate try { if (pdfInfo.getModificationDate() != null) { stringBuffer.append("ModDate"); stringBuffer.append("="); stringBuffer.append(GlobalTools.calendar2String(pdfInfo.getModificationDate(), GlobalTools.DATE_FORMAT_STRING_ISO8601)); stringBuffer.append(GlobalTools.LINESEPARATOR); } // end if } catch (IOException ex) { } // end try..catch } // end if return stringBuffer.toString(); }
From source file:fr.univ_tours.etu.pdf.LucenePDFDocument.java
License:Apache License
/** * This will add the contents to the lucene document. * * @param document The document to add the contents to. * @param is The stream to get the contents from. * @param documentLocation The location of the document, used just for debug messages. * * @throws IOException If there is an error parsing the document. *///from ww w . j av a2 s . co m private void addContent(Document document, InputStream is, String documentLocation) throws IOException { PDDocument pdfDocument = null; try { pdfDocument = PDDocument.load(is); // create a writer where to append the text content. StringWriter writer = new StringWriter(); if (stripper == null) { stripper = new PDFTextStripper(); } stripper.writeText(pdfDocument, writer); String contentsDirty = writer.getBuffer().toString(); //System.out.println(contentsDirty.substring(0,100)); String contents = contentsDirty.replaceAll("\\p{Sm}|\\p{Sk}|\\p{So}", " "); //System.out.println(contents); // addTextField(document, DocFields.CONTENTS, reader); TextField ne = this.getNamedEntities(contents); String lemmas = nlpNeTokenizer.getLemmaString(); //StringReader reader = new StringReader(contents); StringReader reader = new StringReader(lemmas); // Add the tag-stripped contents as a Reader-valued Text field so it will // get tokenized and indexed. 
FieldType type = new FieldType(); type.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); type.setStored(false); type.setTokenized(true); document.add(new Field(DocFields.CONTENTS, reader, type)); PDDocumentInformation info = pdfDocument.getDocumentInformation(); if (info != null) { document.add(ne);//adding named entities addTextField(document, DocFields.AUTHOR, info.getAuthor()); try {//to avoid issues with CreationDate addUnstoredDate(document, DocFields.CREATION_DATE, info.getCreationDate().getTime()); } catch (Exception e) { System.out.println("Warning: some issue with CreationDate attribute!"); } addTextField(document, DocFields.CREATOR, info.getCreator()); addTextField(document, DocFields.KEYWORDS, info.getKeywords()); addTextField(document, DocFields.SUBJECT, info.getSubject()); addTextField(document, DocFields.TITLE, info.getTitle()); //addTextField(document, "Title", info.getTitle()); //addTextField(document, "ModificationDate", info.getModificationDate()); //addTextField(document, "Producer", info.getProducer()); //addTextField(document, "Trapped", info.getTrapped()); } int summarySize = Math.min(contents.length(), 1500); String summary = contents.substring(0, summarySize); // Add the summary as an UnIndexed field, so that it is stored and returned // with hit documents for display. addUnindexedField(document, DocFields.SUMMARY, summary); } finally { if (pdfDocument != null) { pdfDocument.close(); } } }
From source file:fr.univ_tours.etu.searcher.LucenePDFDocument.java
License:Apache License
/**
 * Extracts the text and information dictionary of a PDF stream and adds them
 * to the lucene document, together with a short stored summary.
 *
 * @param document The document to add the contents to.
 * @param is The stream to get the contents from.
 * @param documentLocation The location of the document, used just for debug messages.
 *
 * @throws IOException If there is an error parsing the document.
 */
private void addContent(Document document, InputStream is, String documentLocation) throws IOException {
    PDDocument pdf = null;
    try {
        pdf = PDDocument.load(is);

        // Lazily create the shared stripper, then dump the text into memory.
        if (stripper == null) {
            stripper = new PDFTextStripper();
        }
        StringWriter extracted = new StringWriter();
        stripper.writeText(pdf, extracted);
        String contents = extracted.getBuffer().toString();

        // Reader-valued text field for the stripped contents.
        StringReader contentsReader = new StringReader(contents);
        addTextField(document, "contents", contentsReader);

        PDDocumentInformation docInfo = pdf.getDocumentInformation();
        if (docInfo != null) {
            addTextField(document, "Author", docInfo.getAuthor());
            addTextField(document, "CreationDate", docInfo.getCreationDate());
            addTextField(document, "Creator", docInfo.getCreator());
            addTextField(document, "Keywords", docInfo.getKeywords());
            addTextField(document, "ModificationDate", docInfo.getModificationDate());
            addTextField(document, "Producer", docInfo.getProducer());
            addTextField(document, "Subject", docInfo.getSubject());
            addTextField(document, "Title", docInfo.getTitle());
            addTextField(document, "Trapped", docInfo.getTrapped());
        }

        // Summary for display with hits: at most the first 1500 characters.
        int summaryEnd = Math.min(contents.length(), 1500);
        addUnindexedField(document, "summary", contents.substring(0, summaryEnd));
    } finally {
        if (pdf != null) {
            pdf.close();
        }
    }
}
From source file:mj.ocraptor.extraction.tika.parser.pdf.PDFParser.java
License:Apache License
private void extractMetadata(PDDocument document, Metadata metadata) throws TikaException { PDDocumentInformation info = document.getDocumentInformation(); metadata.set(PagedText.N_PAGES, document.getNumberOfPages()); addMetadata(metadata, TikaCoreProperties.TITLE, info.getTitle()); addMetadata(metadata, TikaCoreProperties.CREATOR, info.getAuthor()); addMetadata(metadata, TikaCoreProperties.CREATOR_TOOL, info.getCreator()); addMetadata(metadata, TikaCoreProperties.KEYWORDS, info.getKeywords()); addMetadata(metadata, "producer", info.getProducer()); // TODO: Move to description in Tika 2.0 addMetadata(metadata, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT, info.getSubject()); addMetadata(metadata, "trapped", info.getTrapped()); try {/* w w w. ja va2 s . co m*/ // TODO Remove these in Tika 2.0 addMetadata(metadata, "created", info.getCreationDate()); addMetadata(metadata, TikaCoreProperties.CREATED, info.getCreationDate()); } catch (IOException e) { // Invalid date format, just ignore } try { Calendar modified = info.getModificationDate(); addMetadata(metadata, Metadata.LAST_MODIFIED, modified); addMetadata(metadata, TikaCoreProperties.MODIFIED, modified); } catch (IOException e) { // Invalid date format, just ignore } // All remaining metadata is custom // Copy this over as-is List<String> handledMetadata = Arrays.asList(new String[] { "Author", "Creator", "CreationDate", "ModDate", "Keywords", "Producer", "Subject", "Title", "Trapped" }); for (COSName key : info.getDictionary().keySet()) { String name = key.getName(); if (!handledMetadata.contains(name)) { addMetadata(metadata, name, info.getDictionary().getDictionaryObject(key)); } } }
From source file:net.padaf.preflight.xmp.SynchronizedMetaDataValidation.java
License:Apache License
/** * Analyze if the CreationDate embedded in Document Information dictionary and * in XMP properties are synchronized/* www. ja v a 2 s . com*/ * * @param dico * Document Information Dictionary * @param xmp * XMP Basic Schema * @param ve * The list of validation errors * @throws ValidationException */ protected void analyzeCreationDateProperty(PDDocumentInformation dico, XMPBasicSchema xmp, List<ValidationError> ve) throws ValidationException { Calendar creationDate; try { creationDate = dico.getCreationDate(); } catch (IOException e) { // If there is an error while converting this property to a date throw formatAccessException("Document Information", "CreationDate", e); } if (creationDate != null) { if (xmp != null) { Calendar xmpCreationDate = xmp.getCreateDateValue(); if (xmpCreationDate == null) { ve.add(AbsentXMPPropertyError("CreationDate", "Property is not defined")); } else { if (!DateConverter.toISO8601(xmpCreationDate).equals(DateConverter.toISO8601(creationDate))) { ve.add(unsynchronizedMetaDataError("CreationDate")); } } } else { ve.add(AbsentSchemaMetaDataError("CreationDate", "Basic XMP")); } } }