Example usage for org.apache.pdfbox.pdmodel PDDocumentInformation getCreationDate

List of usage examples for org.apache.pdfbox.pdmodel PDDocumentInformation getCreationDate

Introduction

On this page you can find example usages of org.apache.pdfbox.pdmodel.PDDocumentInformation.getCreationDate().

Prototype

public Calendar getCreationDate() 

Source Link

Document

This will get the creation date of the document.

Usage

From source file:com.qwazr.extractor.parser.PdfBox.java

License:Apache License

/**
 * Reads the modification date from the PDF document information.
 *
 * @param pdfInfo the document information to read from
 * @return the modification date, or {@code null} when reading it fails with
 *         an {@link IOException}
 */
private Calendar getModificationDate(PDDocumentInformation pdfInfo) {
    try {
        // The previous implementation returned getCreationDate() here, which
        // contradicted the method name.
        return pdfInfo.getModificationDate();
    } catch (IOException e) {
        logger.warn(e.getMessage());
        return null;
    }
}

From source file:com.qwazr.library.pdfbox.PdfBoxParser.java

License:Apache License

/**
 * Copies the document-information entries, the page count and the catalog
 * language of the given PDF into the supplied fields builder.
 *
 * @param pdf   the PDF document to read from
 * @param metas the builder receiving the extracted values
 */
private void extractMetaData(final PDDocument pdf, final ParserFieldsBuilder metas) {
    metas.set(MIME_TYPE, DEFAULT_MIMETYPES[0]);

    // Document information entries are only copied when the dictionary exists.
    final PDDocumentInformation docInfo = pdf.getDocumentInformation();
    if (docInfo != null) {
        metas.add(TITLE, docInfo.getTitle());
        metas.add(SUBJECT, docInfo.getSubject());
        metas.add(AUTHOR, docInfo.getAuthor());
        metas.add(PRODUCER, docInfo.getProducer());
        metas.add(KEYWORDS, docInfo.getKeywords());
        metas.add(CREATION_DATE, docInfo.getCreationDate());
        metas.add(MODIFICATION_DATE, docInfo.getModificationDate());
    }

    metas.add(NUMBER_OF_PAGES, pdf.getNumberOfPages());

    final PDDocumentCatalog docCatalog = pdf.getDocumentCatalog();
    if (docCatalog != null) {
        metas.add(LANGUAGE, docCatalog.getLanguage());
    }
}

From source file:com.sustainalytics.crawlerfilter.PDFTitleGeneration.java

License:Apache License

/**
 * Extracts the creation date of a PDF file, falling back to the custom
 * "customdate" metadata entry (format {@code MM/dd/yyyy}) when the document
 * carries no creation date.
 *
 * @param file the PDF file to inspect
 * @return the date as {@code year-month-day} (not zero-padded), or an empty
 *         string when the file cannot be read or decrypted or no usable date
 *         is present
 */
public static String extractDate(File file) {
    PDDocument document = null;
    try {
        document = PDDocument.load(file.toString());

        if (document.isEncrypted()) {
            logger.info("File " + file.getName() + " is encrypted. Trying to decrypt...");
            try {
                // Many PDFs are "encrypted" with an empty user password.
                document.decrypt("");
                document.setAllSecurityToBeRemoved(true);
                logger.info("File " + file.getName() + " successfully decrypted!");
            } catch (CryptographyException e) {
                logger.info("Error decrypting file " + file.getName());
                // Metadata of an undecrypted document is not usable; the
                // finally block below still closes the document.
                return "";
            }
        }

        PDDocumentInformation info = document.getDocumentInformation();
        if (info == null) {
            return "";
        }

        Calendar calendar = info.getCreationDate();
        if (calendar == null) {
            // No creation date: some files carry a custom date instead.
            calendar = parseCustomDate(info.getCustomMetadataValue("customdate"));
        }
        return calendar == null ? "" : formatYearMonthDay(calendar);
    } catch (IOException e) {
        logger.info("Error processing file " + file.getName());
        return "";
    } finally {
        // Close whenever the document was opened, including after a failed
        // decryption or a read error, so the underlying file is released.
        if (document != null) {
            try {
                document.close();
                logger.info("File " + file.getName() + " is closed successfully!");
            } catch (IOException e) {
                logger.info("Error closing file " + file.getName());
            }
        }
    }
}

/**
 * Parses a custom date given as {@code MM/dd/yyyy}.
 *
 * @param customDate the raw metadata value, may be {@code null}
 * @return the parsed date, or {@code null} when the value is missing or malformed
 */
private static Calendar parseCustomDate(String customDate) {
    if (customDate == null) {
        return null;
    }
    try {
        Date parsed = new SimpleDateFormat("MM/dd/yyyy").parse(customDate);
        Calendar calendar = Calendar.getInstance();
        calendar.setTime(parsed);
        return calendar;
    } catch (ParseException e) {
        logger.info("Error parsing date from custom date");
        return null;
    }
}

/** Formats a calendar as {@code year-month-day} without zero padding. */
private static String formatYearMonthDay(Calendar calendar) {
    // Calendar.MONTH is zero-based, hence the + 1.
    return calendar.get(Calendar.YEAR) + "-" + (calendar.get(Calendar.MONTH) + 1) + "-"
            + calendar.get(Calendar.DATE);
}

From source file:com.wintindustries.pdffilter.pdfcore.PDFTester.java

/**
 * Prints the page count, the document-information entries and, when present,
 * the catalog metadata stream of the given document to standard output.
 *
 * @param document the PDF document to describe
 * @throws IOException if reading one of the values fails
 */
static public void printMetadata(PDDocument document) throws IOException {
    PDDocumentInformation docInfo = document.getDocumentInformation();
    PDDocumentCatalog catalog = document.getDocumentCatalog();
    PDMetadata xmpMetadata = catalog.getMetadata();

    System.out.println("Page Count=" + document.getNumberOfPages());
    System.out.println("Title=" + docInfo.getTitle());
    System.out.println("Author=" + docInfo.getAuthor());
    System.out.println("Subject=" + docInfo.getSubject());
    System.out.println("Keywords=" + docInfo.getKeywords());
    System.out.println("Creator=" + docInfo.getCreator());
    System.out.println("Producer=" + docInfo.getProducer());
    System.out.println("Creation Date=" + formatDate(docInfo.getCreationDate()));
    System.out.println("Modification Date=" + formatDate(docInfo.getModificationDate()));
    System.out.println("Trapped=" + docInfo.getTrapped());

    // The metadata stream is optional.
    if (xmpMetadata != null) {
        System.out.println("Metadata=" + xmpMetadata.getInputStreamAsString());
    }
}

From source file:ddf.catalog.transformer.input.pdf.PdfInputTransformer.java

License:Open Source License

/**
 * Transfers the creation/modification dates, title, author, subject and
 * keywords of the PDF document information onto the metacard.
 *
 * @param pdfDocument PDF document
 * @param metacard    A mutable metacard to add the extracted data to
 */
private void extractPdfMetadata(PDDocument pdfDocument, MetacardImpl metacard) {
    PDDocumentInformation info = pdfDocument.getDocumentInformation();

    // Dates
    setDateIfNotNull(info.getCreationDate(), metacard, Metacard.CREATED);
    setDateIfNotNull(info.getModificationDate(), metacard, Metacard.MODIFIED);

    // The PDF title is only used when usePdfTitleAsTitle is set.
    if (usePdfTitleAsTitle) {
        setIfNotBlank(info.getTitle(), metacard, Metacard.TITLE);
    }

    // Remaining textual attributes
    setIfNotBlank(info.getAuthor(), metacard, Contact.CREATOR_NAME);
    setIfNotBlank(info.getSubject(), metacard, Metacard.DESCRIPTION);
    setIfNotBlank(info.getKeywords(), metacard, Topic.KEYWORD);
}

From source file:de.offis.health.icardea.cied.pdf.extractor.PDFApachePDFBoxExtractor.java

License:Apache License

/**
 * This method will return the key and value pairs stored in the PDF
 * information. It's the basic information like title, subject, author,
 * creator, keywords, producer (meaning application) as well as creation
 * and modification date. The method is provided for debugging purposes.
 * /* w ww.j a v a 2  s.  c om*/
 * @return Returns <code>key=value</code> pair line by line (using system
 * dependent newline).
 */
@SuppressWarnings("unused")
private String getPdfInfo() {
    StringBuffer stringBuffer = new StringBuffer();
    if (pdfDocument != null) {
        PDDocumentInformation pdfInfo = pdfDocument.getDocumentInformation();

        // Title
        if (pdfInfo.getTitle() != null) {
            stringBuffer.append("Title");
            stringBuffer.append("=");
            stringBuffer.append(pdfInfo.getTitle());
            stringBuffer.append(GlobalTools.LINESEPARATOR);
        } // end if

        // Subject
        if (pdfInfo.getSubject() != null) {
            stringBuffer.append("Subject");
            stringBuffer.append("=");
            stringBuffer.append(pdfInfo.getSubject());
            stringBuffer.append(GlobalTools.LINESEPARATOR);
        } // end if

        // Keywords
        if (pdfInfo.getKeywords() != null) {
            stringBuffer.append("Keywords");
            stringBuffer.append("=");
            stringBuffer.append(pdfInfo.getKeywords());
            stringBuffer.append(GlobalTools.LINESEPARATOR);
        } // end if

        // Author
        if (pdfInfo.getAuthor() != null) {
            stringBuffer.append("Author");
            stringBuffer.append("=");
            stringBuffer.append(pdfInfo.getAuthor());
            stringBuffer.append(GlobalTools.LINESEPARATOR);
        } // end if

        // Producer
        if (pdfInfo.getProducer() != null) {
            stringBuffer.append("Producer");
            stringBuffer.append("=");
            stringBuffer.append(pdfInfo.getProducer());
            stringBuffer.append(GlobalTools.LINESEPARATOR);
        } // end if

        // Creator
        if (pdfInfo.getCreator() != null) {
            stringBuffer.append("Creator");
            stringBuffer.append("=");
            stringBuffer.append(pdfInfo.getCreator());
            stringBuffer.append(GlobalTools.LINESEPARATOR);
        } // end if

        // CreationDate
        try {
            if (pdfInfo.getCreationDate() != null) {
                stringBuffer.append("CreationDate");
                stringBuffer.append("=");
                stringBuffer.append(GlobalTools.calendar2String(pdfInfo.getCreationDate(),
                        GlobalTools.DATE_FORMAT_STRING_ISO8601));
                stringBuffer.append(GlobalTools.LINESEPARATOR);
            } // end if
        } catch (IOException ex) {
        } // end try..catch

        // ModDate
        try {
            if (pdfInfo.getModificationDate() != null) {
                stringBuffer.append("ModDate");
                stringBuffer.append("=");
                stringBuffer.append(GlobalTools.calendar2String(pdfInfo.getModificationDate(),
                        GlobalTools.DATE_FORMAT_STRING_ISO8601));
                stringBuffer.append(GlobalTools.LINESEPARATOR);
            } // end if
        } catch (IOException ex) {
        } // end try..catch
    } // end if

    return stringBuffer.toString();
}

From source file:fr.univ_tours.etu.pdf.LucenePDFDocument.java

License:Apache License

/**
 * This will add the contents to the lucene document.
 *
 * The lemma string obtained from <code>nlpNeTokenizer</code> is indexed as the
 * contents field; the symbol-stripped text is used for named-entity
 * detection and for the stored summary (at most 1500 characters).
 *
 * @param document The document to add the contents to.
 * @param is The stream to get the contents from.
 * @param documentLocation The location of the document, used just for debug messages.
 *
 * @throws IOException If there is an error parsing the document.
 */
private void addContent(Document document, InputStream is, String documentLocation) throws IOException {
    PDDocument pdfDocument = null;
    try {
        pdfDocument = PDDocument.load(is);

        // create a writer where to append the text content.
        StringWriter writer = new StringWriter();
        if (stripper == null) {
            stripper = new PDFTextStripper();
        }
        stripper.writeText(pdfDocument, writer);

        // Replace math, modifier and other symbols (Unicode Sm/Sk/So) by spaces.
        String contents = writer.getBuffer().toString().replaceAll("\\p{Sm}|\\p{Sk}|\\p{So}", " ");

        TextField ne = this.getNamedEntities(contents);

        // NOTE(review): presumably getNamedEntities() fills nlpNeTokenizer, so it
        // has to run before the lemma string is read - confirm.
        String lemmas = nlpNeTokenizer.getLemmaString();
        StringReader reader = new StringReader(lemmas);

        // Add the lemmatized contents as a Reader-valued field so it will
        // get tokenized and indexed (not stored).
        FieldType type = new FieldType();
        type.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
        type.setStored(false);
        type.setTokenized(true);
        document.add(new Field(DocFields.CONTENTS, reader, type));

        PDDocumentInformation info = pdfDocument.getDocumentInformation();
        if (info != null) {
            document.add(ne); // adding named entities
            addTextField(document, DocFields.AUTHOR, info.getAuthor());

            try {
                // A missing creation date is normal and silently skipped; the
                // catch only reports genuine failures while reading or adding it.
                java.util.Calendar creationDate = info.getCreationDate();
                if (creationDate != null) {
                    addUnstoredDate(document, DocFields.CREATION_DATE, creationDate.getTime());
                }
            } catch (Exception e) {
                System.out.println("Warning: some issue with CreationDate attribute!");
            }

            addTextField(document, DocFields.CREATOR, info.getCreator());
            addTextField(document, DocFields.KEYWORDS, info.getKeywords());

            addTextField(document, DocFields.SUBJECT, info.getSubject());
            addTextField(document, DocFields.TITLE, info.getTitle());
        }

        int summarySize = Math.min(contents.length(), 1500);
        String summary = contents.substring(0, summarySize);
        // Add the summary as an UnIndexed field, so that it is stored and returned
        // with hit documents for display.
        addUnindexedField(document, DocFields.SUMMARY, summary);
    } finally {
        if (pdfDocument != null) {
            pdfDocument.close();
        }
    }
}

From source file:fr.univ_tours.etu.searcher.LucenePDFDocument.java

License:Apache License

/**
 * Extracts the text and document information of a PDF read from the given
 * stream and adds them as fields to the lucene document.
 *
 * @param document The document to add the contents to.
 * @param is The stream to get the contents from.
 * @param documentLocation The location of the document, used just for debug messages.
 *
 * @throws IOException If there is an error parsing the document.
 */
private void addContent(Document document, InputStream is, String documentLocation) throws IOException {
    PDDocument pdf = null;
    try {
        pdf = PDDocument.load(is);

        // The text stripper is created lazily on first use.
        if (stripper == null) {
            stripper = new PDFTextStripper();
        }
        StringWriter extracted = new StringWriter();
        stripper.writeText(pdf, extracted);
        String contents = extracted.getBuffer().toString();

        // The stripped text goes in as a Reader-valued field so that it is
        // tokenized and indexed.
        addTextField(document, "contents", new StringReader(contents));

        PDDocumentInformation docInfo = pdf.getDocumentInformation();
        if (docInfo != null) {
            addTextField(document, "Author", docInfo.getAuthor());
            addTextField(document, "CreationDate", docInfo.getCreationDate());
            addTextField(document, "Creator", docInfo.getCreator());
            addTextField(document, "Keywords", docInfo.getKeywords());
            addTextField(document, "ModificationDate", docInfo.getModificationDate());
            addTextField(document, "Producer", docInfo.getProducer());
            addTextField(document, "Subject", docInfo.getSubject());
            addTextField(document, "Title", docInfo.getTitle());
            addTextField(document, "Trapped", docInfo.getTrapped());
        }

        // The first 1500 characters (or fewer) serve as a stored, unindexed
        // summary returned with hit documents for display.
        String summary = contents.substring(0, Math.min(contents.length(), 1500));
        addUnindexedField(document, "summary", summary);
    } finally {
        if (pdf != null) {
            pdf.close();
        }
    }
}

From source file:mj.ocraptor.extraction.tika.parser.pdf.PDFParser.java

License:Apache License

/**
 * Copies the page count, the standard document-information entries and any
 * remaining custom entries of the PDF into the given metadata object.
 *
 * @param document the PDF document to read from
 * @param metadata the metadata object to populate
 * @throws TikaException declared by the signature; presumably raised by a helper - confirm
 */
private void extractMetadata(PDDocument document, Metadata metadata) throws TikaException {
    PDDocumentInformation info = document.getDocumentInformation();
    metadata.set(PagedText.N_PAGES, document.getNumberOfPages());

    addMetadata(metadata, TikaCoreProperties.TITLE, info.getTitle());
    addMetadata(metadata, TikaCoreProperties.CREATOR, info.getAuthor());
    addMetadata(metadata, TikaCoreProperties.CREATOR_TOOL, info.getCreator());
    addMetadata(metadata, TikaCoreProperties.KEYWORDS, info.getKeywords());
    addMetadata(metadata, "producer", info.getProducer());
    // TODO: Move to description in Tika 2.0
    addMetadata(metadata, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT, info.getSubject());
    addMetadata(metadata, "trapped", info.getTrapped());

    try {
        Calendar created = info.getCreationDate();
        // TODO Remove these in Tika 2.0
        addMetadata(metadata, "created", created);
        addMetadata(metadata, TikaCoreProperties.CREATED, created);
    } catch (IOException e) {
        // Invalid date format, just ignore
    }

    try {
        Calendar modified = info.getModificationDate();
        addMetadata(metadata, Metadata.LAST_MODIFIED, modified);
        addMetadata(metadata, TikaCoreProperties.MODIFIED, modified);
    } catch (IOException e) {
        // Invalid date format, just ignore
    }

    // Everything not handled above is custom metadata and is copied as-is.
    List<String> handledMetadata = Arrays.asList("Author", "Creator", "CreationDate", "ModDate",
            "Keywords", "Producer", "Subject", "Title", "Trapped");
    for (COSName key : info.getDictionary().keySet()) {
        String name = key.getName();
        if (handledMetadata.contains(name)) {
            continue;
        }
        addMetadata(metadata, name, info.getDictionary().getDictionaryObject(key));
    }
}

From source file:net.padaf.preflight.xmp.SynchronizedMetaDataValidation.java

License:Apache License

/**
 * Analyze if the CreationDate embedded in Document Information dictionary and
 * in XMP properties are synchronized/* www.  ja v a 2 s .  com*/
 * 
 * @param dico
 *          Document Information Dictionary
 * @param xmp
 *          XMP Basic Schema
 * @param ve
 *          The list of validation errors
 * @throws ValidationException
 */
protected void analyzeCreationDateProperty(PDDocumentInformation dico, XMPBasicSchema xmp,
        List<ValidationError> ve) throws ValidationException {
    Calendar creationDate;
    try {
        creationDate = dico.getCreationDate();
    } catch (IOException e) {
        // If there is an error while converting this property to a date
        throw formatAccessException("Document Information", "CreationDate", e);
    }
    if (creationDate != null) {
        if (xmp != null) {
            Calendar xmpCreationDate = xmp.getCreateDateValue();

            if (xmpCreationDate == null) {
                ve.add(AbsentXMPPropertyError("CreationDate", "Property is not defined"));
            } else {
                if (!DateConverter.toISO8601(xmpCreationDate).equals(DateConverter.toISO8601(creationDate))) {
                    ve.add(unsynchronizedMetaDataError("CreationDate"));
                }
            }

        } else {
            ve.add(AbsentSchemaMetaDataError("CreationDate", "Basic XMP"));
        }
    }
}