Example usage for org.apache.pdfbox.pdmodel PDDocumentInformation getModificationDate

List of usage examples for org.apache.pdfbox.pdmodel PDDocumentInformation getModificationDate

Introduction

On this page you can find example usages of org.apache.pdfbox.pdmodel PDDocumentInformation getModificationDate.

Prototype

public Calendar getModificationDate() 

Source Link

Document

This will get the modification date of the document.

Usage

From source file:adams.flow.transformer.PDFMetaData.java

License:Open Source License

/**
 * Executes the flow item./*  w  w w.  j a va  2 s  .c  o  m*/
 *
 * @return      null if everything is fine, otherwise error message
 */
@Override
protected String doExecute() {
    String result;
    File file;
    SpreadSheet sheet;
    PDDocument document;
    PDDocumentInformation info;
    Row row;
    Set<String> keys;

    result = null;

    // get file
    if (m_InputToken.getPayload() instanceof File)
        file = (File) m_InputToken.getPayload();
    else
        file = new PlaceholderFile((String) m_InputToken.getPayload());

    sheet = new DefaultSpreadSheet();
    sheet.setDataRowClass(SparseDataRow.class);
    sheet.setName("Meta-Data: " + file.getAbsolutePath());

    try {
        row = sheet.addRow();
        document = PDDocument.load(file.getAbsoluteFile());
        info = document.getDocumentInformation();

        addCell(row, "Title", info.getTitle());
        addCell(row, "Subject", info.getSubject());
        addCell(row, "Author", info.getAuthor());
        addCell(row, "Keywords", info.getKeywords());
        addCell(row, "Producer", info.getProducer());
        addCell(row, "Creation Date", info.getCreationDate());
        addCell(row, "Modification Date", info.getModificationDate());
        addCell(row, "Creator", info.getCreator());
        addCell(row, "Trapped", info.getTrapped());
        keys = info.getMetadataKeys();
        for (String key : keys)
            addCell(row, "Meta-" + key, info.getCustomMetadataValue(key));
    } catch (Exception e) {
        result = handleException("Failed to extract meta-data: ", e);
    }

    if (result == null)
        m_OutputToken = new Token(sheet);

    return result;
}

From source file:com.esri.geoportal.commons.pdf.PdfUtils.java

License:Apache License

/**
 * Reads metadata values from a PDF file.
 * <p>
 * The title, subject and modification date are taken from the document
 * information. A bounding box is added when the document contains a
 * "Measure" object with a "GPTS" array or when its first page carries an
 * "LGIDict" entry.
 *
 * @param rawBytes the PDF to read
 * @param defaultTitle title to be used if the PDF metadata doesn't have one
 * @param geometryServiceUrl url of a <a href="https://developers.arcgis.com/rest/services-reference/geometry-service.htm">geometry service</a> for reprojecting coordinates.
 *
 * @return metadata properties or null if the PDF cannot be read.
 *
 * @throws IOException on parsing error
 */
public static Properties readMetadata(byte[] rawBytes, String defaultTitle, String geometryServiceUrl)
        throws IOException {
    Properties ret = new Properties();

    // Attempt to read in the PDF file
    try (PDDocument document = PDDocument.load(rawBytes)) {

        // See if we can read the PDF
        if (!document.isEncrypted()) {
            // Get document metadata
            PDDocumentInformation info = document.getDocumentInformation();

            if (info != null) {

                if (info.getTitle() != null) {
                    ret.put(PROP_TITLE, info.getTitle());
                } else {
                    ret.put(PROP_TITLE, defaultTitle);
                }

                if (info.getSubject() != null) {
                    ret.put(PROP_SUBJECT, info.getSubject());
                } else {
                    // No subject: build a substitute from the other descriptive entries
                    StringBuilder psudoSubject = new StringBuilder("");
                    psudoSubject.append("\nAuthor: " + info.getAuthor());
                    psudoSubject.append("\nCreator: " + info.getCreator());
                    psudoSubject.append("\nProducer: " + info.getProducer());

                    ret.put(PROP_SUBJECT, psudoSubject.toString());
                }

                // Prefer the modification date and fall back to the creation date.
                // Either may be absent; the property is omitted when both are,
                // instead of dereferencing a null date.
                if (info.getModificationDate() != null) {
                    ret.put(PROP_MODIFICATION_DATE, info.getModificationDate().getTime());
                } else if (info.getCreationDate() != null) {
                    ret.put(PROP_MODIFICATION_DATE, info.getCreationDate().getTime());
                }
            } else {
                LOG.warn("Got null metadata for PDF file");
                return null;
            }

            // Attempt to read in geospatial PDF data
            COSObject measure = document.getDocument().getObjectByType(COSName.getPDFName("Measure"));
            String bBox = null;
            if (measure != null) {
                // This is a Geospatial PDF (i.e. Adobe's standard)
                COSDictionary dictionary = (COSDictionary) measure.getObject();

                // Only use the "GPTS" entry when it is present and really an array
                if (dictionary.getItem("GPTS") instanceof COSArray) {
                    float[] coords = ((COSArray) dictionary.getItem("GPTS")).toFloatArray();

                    bBox = generateBbox(coords);
                }
            } else if (document.getNumberOfPages() > 0) {
                // Only look at the first page when the document has one
                PDPage page = document.getPage(0);
                if (page.getCOSObject().containsKey(COSName.getPDFName("LGIDict"))) {
                    // This is a GeoPDF (i.e. TerraGo's standard)
                    bBox = extractGeoPDFProps(page, geometryServiceUrl);
                }
            }

            if (bBox != null) {
                ret.put(PROP_BBOX, bBox);
            }

        } else {
            LOG.warn("Cannot read encrypted PDF file");
            return null;
        }

    } catch (IOException ex) {
        LOG.error("Exception reading PDF", ex);
        throw ex;
    }

    return ret;
}

From source file:com.github.joemcintyre.pdffinish.PDFFinish.java

License:Open Source License

/**
 * Prints the document information entries of a PDF document to standard
 * output, one labelled line per entry.
 *
 * @param document Loaded PDF document.
 */
private static void showMetadata(PDDocument document) throws IOException {
    final PDDocumentInformation docInfo = document.getDocumentInformation();

    // descriptive entries
    System.out.println("Title: " + docInfo.getTitle());
    System.out.println("Author: " + docInfo.getAuthor());
    System.out.println("Subject: " + docInfo.getSubject());
    System.out.println("Keywords: " + docInfo.getKeywords());

    // producing software
    System.out.println("Creator: " + docInfo.getCreator());
    System.out.println("Producer: " + docInfo.getProducer());

    // time stamps
    System.out.println("Creation Date: " + docInfo.getCreationDate());
    System.out.println("Modification Date: " + docInfo.getModificationDate());
}

From source file:com.openkm.util.metadata.MetadataExtractor.java

License:Open Source License

/**
 * Extract metadata from PDF/*ww  w. j  a  v a 2 s.  c om*/
 */
public static PdfMetadata pdfExtractor(InputStream is) throws IOException {
    PDDocument doc = PDDocument.load(is);
    PDDocumentInformation info = doc.getDocumentInformation();
    PdfMetadata md = new PdfMetadata();

    md.setNumberOfPages(doc.getNumberOfPages());
    md.setTitle(info.getTitle());
    md.setAuthor(info.getAuthor());
    md.setSubject(info.getSubject());
    md.setKeywords(info.getKeywords());
    md.setCreator(info.getCreator());
    md.setProducer(info.getProducer());
    md.setTrapped(info.getTrapped());
    md.setCreationDate(info.getCreationDate());
    md.setModificationDate(info.getModificationDate());

    log.info("pdfExtractor: {}", md);
    return md;
}

From source file:com.qwazr.library.pdfbox.PdfBoxParser.java

License:Apache License

/**
 * Copies the MIME type, the document information entries, the page count
 * and the catalog language of a PDF into the given fields builder.
 *
 * @param pdf   the PDF document to read from
 * @param metas the builder receiving the extracted values
 */
private void extractMetaData(final PDDocument pdf, final ParserFieldsBuilder metas) {
    metas.set(MIME_TYPE, DEFAULT_MIMETYPES[0]);

    // document information entries, when the document provides them
    final PDDocumentInformation docInfo = pdf.getDocumentInformation();
    if (docInfo != null) {
        metas.add(TITLE, docInfo.getTitle());
        metas.add(SUBJECT, docInfo.getSubject());
        metas.add(AUTHOR, docInfo.getAuthor());
        metas.add(PRODUCER, docInfo.getProducer());
        metas.add(KEYWORDS, docInfo.getKeywords());
        metas.add(CREATION_DATE, docInfo.getCreationDate());
        metas.add(MODIFICATION_DATE, docInfo.getModificationDate());
    }

    metas.add(NUMBER_OF_PAGES, pdf.getNumberOfPages());

    // language as stored in the document catalog, when a catalog exists
    final PDDocumentCatalog docCatalog = pdf.getDocumentCatalog();
    if (docCatalog != null) {
        metas.add(LANGUAGE, docCatalog.getLanguage());
    }
}

From source file:com.wintindustries.pdffilter.pdfcore.PDFTester.java

/**
 * Prints the page count, the document information entries and, when
 * present, the catalog metadata of a PDF document to standard output.
 *
 * @param document the PDF document to describe
 * @throws IOException if reading from the document fails
 */
static public void printMetadata(PDDocument document) throws IOException {
    PDDocumentInformation docInfo = document.getDocumentInformation();
    PDMetadata catalogMetadata = document.getDocumentCatalog().getMetadata();

    System.out.println("Page Count=" + document.getNumberOfPages());

    // document information entries
    System.out.println("Title=" + docInfo.getTitle());
    System.out.println("Author=" + docInfo.getAuthor());
    System.out.println("Subject=" + docInfo.getSubject());
    System.out.println("Keywords=" + docInfo.getKeywords());
    System.out.println("Creator=" + docInfo.getCreator());
    System.out.println("Producer=" + docInfo.getProducer());
    System.out.println("Creation Date=" + formatDate(docInfo.getCreationDate()));
    System.out.println("Modification Date=" + formatDate(docInfo.getModificationDate()));
    System.out.println("Trapped=" + docInfo.getTrapped());

    // the catalog metadata is optional
    if (catalogMetadata != null) {
        System.out.println("Metadata=" + catalogMetadata.getInputStreamAsString());
    }
}

From source file:ddf.catalog.transformer.input.pdf.PdfInputTransformer.java

License:Open Source License

/**
 * @param pdfDocument PDF document/*from w  w w  .  j  a  va2s  .co  m*/
 * @param metacard    A mutable metacard to add the extracted data to
 */
private void extractPdfMetadata(PDDocument pdfDocument, MetacardImpl metacard) {

    PDDocumentInformation documentInformation = pdfDocument.getDocumentInformation();

    setDateIfNotNull(documentInformation.getCreationDate(), metacard, Metacard.CREATED);

    setDateIfNotNull(documentInformation.getModificationDate(), metacard, Metacard.MODIFIED);

    if (usePdfTitleAsTitle) {
        setIfNotBlank(documentInformation.getTitle(), metacard, Metacard.TITLE);
    }

    setIfNotBlank(documentInformation.getAuthor(), metacard, Contact.CREATOR_NAME);

    setIfNotBlank(documentInformation.getSubject(), metacard, Metacard.DESCRIPTION);

    setIfNotBlank(documentInformation.getKeywords(), metacard, Topic.KEYWORD);

}

From source file:de.offis.health.icardea.cied.pdf.extractor.PDFApachePDFBoxExtractor.java

License:Apache License

/**
 * This method will return the key and value pairs stored in the PDF
 * information. It's the basic information like title, subject, author,
 * creator, keywords, producer (meaning application) as well as creation
 * and modification date. The method is provided for debugging purposes.
 * /*from   www . j ava  2 s .  c om*/
 * @return Returns <code>key=value</code> pair line by line (using system
 * dependent newline).
 */
@SuppressWarnings("unused")
private String getPdfInfo() {
    StringBuffer stringBuffer = new StringBuffer();
    if (pdfDocument != null) {
        PDDocumentInformation pdfInfo = pdfDocument.getDocumentInformation();

        // Title
        if (pdfInfo.getTitle() != null) {
            stringBuffer.append("Title");
            stringBuffer.append("=");
            stringBuffer.append(pdfInfo.getTitle());
            stringBuffer.append(GlobalTools.LINESEPARATOR);
        } // end if

        // Subject
        if (pdfInfo.getSubject() != null) {
            stringBuffer.append("Subject");
            stringBuffer.append("=");
            stringBuffer.append(pdfInfo.getSubject());
            stringBuffer.append(GlobalTools.LINESEPARATOR);
        } // end if

        // Keywords
        if (pdfInfo.getKeywords() != null) {
            stringBuffer.append("Keywords");
            stringBuffer.append("=");
            stringBuffer.append(pdfInfo.getKeywords());
            stringBuffer.append(GlobalTools.LINESEPARATOR);
        } // end if

        // Author
        if (pdfInfo.getAuthor() != null) {
            stringBuffer.append("Author");
            stringBuffer.append("=");
            stringBuffer.append(pdfInfo.getAuthor());
            stringBuffer.append(GlobalTools.LINESEPARATOR);
        } // end if

        // Producer
        if (pdfInfo.getProducer() != null) {
            stringBuffer.append("Producer");
            stringBuffer.append("=");
            stringBuffer.append(pdfInfo.getProducer());
            stringBuffer.append(GlobalTools.LINESEPARATOR);
        } // end if

        // Creator
        if (pdfInfo.getCreator() != null) {
            stringBuffer.append("Creator");
            stringBuffer.append("=");
            stringBuffer.append(pdfInfo.getCreator());
            stringBuffer.append(GlobalTools.LINESEPARATOR);
        } // end if

        // CreationDate
        try {
            if (pdfInfo.getCreationDate() != null) {
                stringBuffer.append("CreationDate");
                stringBuffer.append("=");
                stringBuffer.append(GlobalTools.calendar2String(pdfInfo.getCreationDate(),
                        GlobalTools.DATE_FORMAT_STRING_ISO8601));
                stringBuffer.append(GlobalTools.LINESEPARATOR);
            } // end if
        } catch (IOException ex) {
        } // end try..catch

        // ModDate
        try {
            if (pdfInfo.getModificationDate() != null) {
                stringBuffer.append("ModDate");
                stringBuffer.append("=");
                stringBuffer.append(GlobalTools.calendar2String(pdfInfo.getModificationDate(),
                        GlobalTools.DATE_FORMAT_STRING_ISO8601));
                stringBuffer.append(GlobalTools.LINESEPARATOR);
            } // end if
        } catch (IOException ex) {
        } // end try..catch
    } // end if

    return stringBuffer.toString();
}

From source file:fr.univ_tours.etu.searcher.LucenePDFDocument.java

License:Apache License

/**
 * Extracts the text and the document information of a PDF and adds them as
 * fields to the given lucene document.
 *
 * @param document The document to add the contents to.
 * @param is The stream to get the contents from.
 * @param documentLocation The location of the document, used just for debug messages.
 *
 * @throws IOException If there is an error parsing the document.
 */
private void addContent(Document document, InputStream is, String documentLocation) throws IOException {
    PDDocument pdf = null;
    try {
        pdf = PDDocument.load(is);

        // The text stripper is created on first use and kept for later calls.
        if (stripper == null) {
            stripper = new PDFTextStripper();
        }

        // Collect the extracted text in memory.
        StringWriter textSink = new StringWriter();
        stripper.writeText(pdf, textSink);
        String contents = textSink.toString();

        // Add the tag-stripped contents as a Reader-valued Text field so it will
        // get tokenized and indexed.
        addTextField(document, "contents", new StringReader(contents));

        PDDocumentInformation docInfo = pdf.getDocumentInformation();
        if (docInfo != null) {
            addTextField(document, "Author", docInfo.getAuthor());
            addTextField(document, "CreationDate", docInfo.getCreationDate());
            addTextField(document, "Creator", docInfo.getCreator());
            addTextField(document, "Keywords", docInfo.getKeywords());
            addTextField(document, "ModificationDate", docInfo.getModificationDate());
            addTextField(document, "Producer", docInfo.getProducer());
            addTextField(document, "Subject", docInfo.getSubject());
            addTextField(document, "Title", docInfo.getTitle());
            addTextField(document, "Trapped", docInfo.getTrapped());
        }

        // The summary is the first 1500 characters of the text (or all of it
        // when shorter), added as an UnIndexed field so that it is stored and
        // returned with hit documents for display.
        String summary = contents.substring(0, Math.min(contents.length(), 1500));
        addUnindexedField(document, "summary", summary);
    } finally {
        if (pdf != null) {
            pdf.close();
        }
    }
}

From source file:mj.ocraptor.extraction.tika.parser.pdf.PDFParser.java

License:Apache License

/**
 * Transfers the page count, the standard document information entries and
 * all remaining (custom) information entries of a PDF into the metadata.
 *
 * @param document the PDF document to read from
 * @param metadata the metadata object to fill
 * @throws TikaException declared by the signature; not thrown directly in this method
 */
private void extractMetadata(PDDocument document, Metadata metadata) throws TikaException {
    PDDocumentInformation docInfo = document.getDocumentInformation();

    metadata.set(PagedText.N_PAGES, document.getNumberOfPages());
    addMetadata(metadata, TikaCoreProperties.TITLE, docInfo.getTitle());
    addMetadata(metadata, TikaCoreProperties.CREATOR, docInfo.getAuthor());
    addMetadata(metadata, TikaCoreProperties.CREATOR_TOOL, docInfo.getCreator());
    addMetadata(metadata, TikaCoreProperties.KEYWORDS, docInfo.getKeywords());
    addMetadata(metadata, "producer", docInfo.getProducer());
    // TODO: Move to description in Tika 2.0
    addMetadata(metadata, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT, docInfo.getSubject());
    addMetadata(metadata, "trapped", docInfo.getTrapped());

    try {
        Calendar created = docInfo.getCreationDate();
        // TODO Remove these in Tika 2.0
        addMetadata(metadata, "created", created);
        addMetadata(metadata, TikaCoreProperties.CREATED, created);
    } catch (IOException e) {
        // Invalid date format, just ignore
    }

    try {
        Calendar lastModified = docInfo.getModificationDate();
        addMetadata(metadata, Metadata.LAST_MODIFIED, lastModified);
        addMetadata(metadata, TikaCoreProperties.MODIFIED, lastModified);
    } catch (IOException e) {
        // Invalid date format, just ignore
    }

    // Everything not handled above is custom metadata and is copied over as-is
    List<String> alreadyHandled = Arrays.asList("Author", "Creator", "CreationDate", "ModDate",
            "Keywords", "Producer", "Subject", "Title", "Trapped");
    for (COSName entryKey : docInfo.getDictionary().keySet()) {
        String entryName = entryKey.getName();
        if (alreadyHandled.contains(entryName)) {
            continue;
        }
        addMetadata(metadata, entryName, docInfo.getDictionary().getDictionaryObject(entryKey));
    }
}