Example usage for org.apache.pdfbox.pdmodel PDDocument getDocumentInformation

Introduction

This page collects example usages of the method org.apache.pdfbox.pdmodel.PDDocument.getDocumentInformation(), taken from open-source projects.

Prototype

public PDDocumentInformation getDocumentInformation()

Source Link

Document

This will get the document info dictionary.

Usage

From source file:com.opensearchserver.extractor.parser.PdfBox.java

License:Apache License

/**
 * Records the PDF's descriptive metadata in {@code metas}: the info-dictionary
 * entries (when an info dictionary is available), the page count and the
 * catalog language (when a catalog is available).
 *
 * @param pdf the PDF document to read the metadata from
 * @throws IOException declared by the signature; presumably raised by the
 *         date helpers — confirm against their declarations
 */
private void extractMetaData(PDDocument pdf) throws IOException {
    PDDocumentInformation docInfo = pdf.getDocumentInformation();
    if (docInfo != null) {
        metas.add(TITLE, docInfo.getTitle());
        metas.add(SUBJECT, docInfo.getSubject());
        metas.add(AUTHOR, docInfo.getAuthor());
        metas.add(PRODUCER, docInfo.getProducer());
        metas.add(KEYWORDS, docInfo.getKeywords());
        // The two dates go through helpers defined elsewhere in the class.
        metas.add(CREATION_DATE, getDate(getCreationDate(docInfo)));
        metas.add(MODIFICATION_DATE, getModificationDate(docInfo));
    }

    metas.add(NUMBER_OF_PAGES, pdf.getNumberOfPages());

    PDDocumentCatalog docCatalog = pdf.getDocumentCatalog();
    if (docCatalog == null) {
        return;
    }
    metas.add(LANGUAGE, docCatalog.getLanguage());
}

From source file:com.qwazr.library.pdfbox.PdfBoxParser.java

License:Apache License

/**
 * Fills {@code metas} with the MIME type, the info-dictionary entries (when an
 * info dictionary is available), the page count and the catalog language
 * (when a catalog is available).
 *
 * @param pdf   the PDF document to read the metadata from
 * @param metas the builder that receives the extracted fields
 */
private void extractMetaData(final PDDocument pdf, final ParserFieldsBuilder metas) {
    metas.set(MIME_TYPE, DEFAULT_MIMETYPES[0]);

    final PDDocumentInformation docInfo = pdf.getDocumentInformation();
    if (docInfo != null) {
        metas.add(TITLE, docInfo.getTitle());
        metas.add(SUBJECT, docInfo.getSubject());
        metas.add(AUTHOR, docInfo.getAuthor());
        metas.add(PRODUCER, docInfo.getProducer());
        metas.add(KEYWORDS, docInfo.getKeywords());
        metas.add(CREATION_DATE, docInfo.getCreationDate());
        metas.add(MODIFICATION_DATE, docInfo.getModificationDate());
    }

    metas.add(NUMBER_OF_PAGES, pdf.getNumberOfPages());

    final PDDocumentCatalog docCatalog = pdf.getDocumentCatalog();
    if (docCatalog == null) {
        return;
    }
    metas.add(LANGUAGE, docCatalog.getLanguage());
}

From source file:com.sustainalytics.crawlerfilter.PDFTitleGeneration.java

License:Apache License

/**
 * This method extracts creation date/ custom date of a PDF file
 * @param file is a File object/*  w  w w  .j  av  a2  s.c  om*/
 * @return String that contains the creation date/ custom date of the PDF
 */
public static String extractDate(File file) {
    PDDocument document = null;
    boolean isDamaged = false; //to deal with damaged pdf
    String creationDateMetaData = "";
    try {
        document = PDDocument.load(file.toString());
        /*If the PDF file is not damanged --->*/
        if (!isDamaged) {
            /*...but the file is encrypted --->*/
            if (document.isEncrypted()) {
                logger.info("File " + file.getName() + "is encrypted. Trying to decrypt...");
                try {
                    /*...then decryptt it --->*/
                    document.decrypt("");
                    document.setAllSecurityToBeRemoved(true);
                    logger.info("File " + file.getName() + "successfully decrypted!");
                } catch (CryptographyException e) {
                    logger.info("Error decrypting file " + file.getName());
                    isDamaged = true;
                }

            } /*<--work around to decrypt an encrypted pdf ends here*/

            /*Metadata extraction --->*/
            PDDocumentInformation info = document.getDocumentInformation();

            /*We are only interested in date data--->*/
            Calendar calendar = info.getCreationDate();
            int creationYear = 0, creationMonth = 0, creationDate = 0;
            if (calendar != null) {
                creationYear = calendar.get(Calendar.YEAR);
                creationMonth = calendar.get(Calendar.MONTH) + 1;
                creationDate = calendar.get(Calendar.DATE);

            } /*<---Date data extraction complete*/

            /*If creation date is not empty --->*/
            if (creationYear != 0) {
                creationDateMetaData = creationYear + "-" + creationMonth + "-" + creationDate;
            } //<--- creation date found and the date part of the title is generated
            /*No creation date is found --->*/
            else {
                SimpleDateFormat dateFormatter = new SimpleDateFormat("MM/dd/yyyy");
                Date customDate = null;
                /*But we have custom date some times --->*/
                try {
                    customDate = dateFormatter.parse(info.getCustomMetadataValue("customdate"));
                } catch (ParseException e) {
                    logger.info("Error parsing date from custom date");
                }
                calendar = Calendar.getInstance();
                calendar.setTime(customDate);
                if (calendar != null) {
                    creationYear = calendar.get(Calendar.YEAR);
                    creationMonth = calendar.get(Calendar.MONTH) + 1;
                    creationDate = calendar.get(Calendar.DATE);

                } /*<---Date data extraction complete from customdate*/
                if (creationYear != 0) {
                    creationDateMetaData = creationYear + "-" + creationMonth + "-" + creationDate;
                }
            } //<--- work around if no creation date is found

        } /*<--- Good to know that the PDF was not damaged*/
    } catch (IOException e) { /*If the PDF was not read by the system --->*/
        logger.info("Error processing file " + file.getName());
        /*... then maybe it is damaged*/
        isDamaged = true;
    } finally {
        try {
            /*If the file was good, not damaged, then please close it --->*/
            if (!isDamaged) {
                document.close();
                logger.info("File " + file.getName() + " is closed successfully!");
            }
        } catch (IOException e) {
            logger.info("Error closing file " + file.getName());
        }
    } /*<--- PDF closing done!*/
    return creationDateMetaData;
}

From source file:com.synopsys.integration.blackduck.report.pdf.RiskReportPdfWriter.java

License:Apache License

/**
 * Writes the risk report for {@code report} as a PDF file in
 * {@code outputDirectory}. The file name is built from the URI-escaped project
 * name and version; a file already present under that name is deleted first.
 *
 * @param outputDirectory the directory the PDF is created in
 * @param report          the report data to render
 * @return the PDF file that was written
 * @throws RiskReportException if an {@code IOException} or
 *         {@code URISyntaxException} occurs while creating the report
 */
public File createPDFReportFile(final File outputDirectory, final ReportData report)
        throws RiskReportException {
    final IntegrationEscapeUtil uriEscaper = new IntegrationEscapeUtil();
    final String projectPart = uriEscaper.escapeForUri(report.getProjectName());
    final String versionPart = uriEscaper.escapeForUri(report.getProjectVersion());
    final String reportFileName = projectPart + "_" + versionPart + "_BlackDuck_RiskReport.pdf";

    final File pdfFile = new File(outputDirectory, reportFileName);
    if (pdfFile.exists()) {
        pdfFile.delete();
    }

    final PDDocument document = new PDDocument();
    document.getDocumentInformation().setAuthor("Black Duck Software");
    document.getDocumentInformation().setCreator("Integrations");
    document.getDocumentInformation().setSubject("Hub Risk Report");

    try (PDFBoxManager manager = new PDFBoxManager(pdfFile, document)) {
        // The write* helpers are invoked without the manager as an argument;
        // it is published through the field before they run.
        this.pdfManager = manager;

        final PDRectangle mediaBox = manager.currentPage.getMediaBox();
        final float pageWidth = mediaBox.getWidth();
        final float pageHeight = mediaBox.getHeight();

        // Each section starts at the lower edge of the rectangle returned by the previous one.
        final float headerBottomY = writeHeader(pageWidth, pageHeight).getLowerLeftY();
        final float projectInfoBottomY = writeProjectInformation(pageWidth, headerBottomY, report)
                .getLowerLeftY();
        final float summaryBottomY = writeSummaryTables(pageWidth, projectInfoBottomY, report)
                .getLowerLeftY();
        writeComponentTable(pageWidth, summaryBottomY, report);

        return pdfFile;
    } catch (final IOException | URISyntaxException e) {
        final String message = "Couldn't create the report: " + e.getMessage();
        logger.trace(message, e);
        throw new RiskReportException(message, e);
    }
}

From source file:com.synopsys.integration.blackduck.service.model.pdf.RiskReportPdfWriter.java

License:Apache License

/**
 * Writes the risk report for {@code report} as a PDF file in
 * {@code outputDirectory}. The file name is built from the URI-escaped project
 * name and version; a file already present under that name is deleted first.
 *
 * @param outputDirectory the directory the PDF is created in
 * @param report          the report data to render
 * @return the PDF file that was written
 * @throws RiskReportException if an {@code IOException} or
 *         {@code URISyntaxException} occurs while creating the report
 */
public File createPDFReportFile(final File outputDirectory, final ReportData report)
        throws RiskReportException {
    final IntegrationEscapeUtil uriEscaper = new IntegrationEscapeUtil();
    final String projectPart = uriEscaper.escapeForUri(report.getProjectName());
    final String versionPart = uriEscaper.escapeForUri(report.getProjectVersion());
    final String reportFileName = projectPart + "_" + versionPart + "_BlackDuck_RiskReport.pdf";

    final File pdfFile = new File(outputDirectory, reportFileName);
    if (pdfFile.exists()) {
        pdfFile.delete();
    }

    final PDDocument document = new PDDocument();
    document.getDocumentInformation().setAuthor("Black Duck Software");
    document.getDocumentInformation().setCreator("Integrations");
    document.getDocumentInformation().setSubject("Black Duck Risk Report");

    try (PDFBoxManager manager = new PDFBoxManager(pdfFile, document)) {
        // The write* helpers are invoked without the manager as an argument;
        // it is published through the field before they run.
        this.pdfManager = manager;

        final PDRectangle mediaBox = manager.currentPage.getMediaBox();
        final float pageWidth = mediaBox.getWidth();
        final float pageHeight = mediaBox.getHeight();

        // Each section starts at the lower edge of the rectangle returned by the previous one.
        final float headerBottomY = writeHeader(pageWidth, pageHeight).getLowerLeftY();
        final float projectInfoBottomY = writeProjectInformation(pageWidth, headerBottomY, report)
                .getLowerLeftY();
        final float summaryBottomY = writeSummaryTables(pageWidth, projectInfoBottomY, report)
                .getLowerLeftY();
        writeComponentTable(pageWidth, summaryBottomY, report);

        return pdfFile;
    } catch (final IOException | URISyntaxException e) {
        final String message = "Couldn't create the report: " + e.getMessage();
        logger.trace(message, e);
        throw new RiskReportException(message, e);
    }
}

From source file:com.wintindustries.pdffilter.pdfcore.PDFTester.java

/**
 * Prints the page count, the info-dictionary entries and, when present, the
 * catalog metadata stream of a PDF document to standard output, one
 * {@code Name=value} line per entry.
 *
 * @param document the PDF document to describe
 * @throws IOException declared by the signature; presumably raised while
 *         reading the dates or the metadata stream
 */
public static void printMetadata(PDDocument document) throws IOException {
    PDDocumentInformation docInfo = document.getDocumentInformation();
    // Fetched before anything is printed, exactly as the values are needed below.
    PDMetadata xmpMetadata = document.getDocumentCatalog().getMetadata();

    System.out.println("Page Count=" + document.getNumberOfPages());
    System.out.println("Title=" + docInfo.getTitle());
    System.out.println("Author=" + docInfo.getAuthor());
    System.out.println("Subject=" + docInfo.getSubject());
    System.out.println("Keywords=" + docInfo.getKeywords());
    System.out.println("Creator=" + docInfo.getCreator());
    System.out.println("Producer=" + docInfo.getProducer());
    System.out.println("Creation Date=" + formatDate(docInfo.getCreationDate()));
    System.out.println("Modification Date=" + formatDate(docInfo.getModificationDate()));
    System.out.println("Trapped=" + docInfo.getTrapped());

    if (xmpMetadata == null) {
        return;
    }
    System.out.println("Metadata=" + xmpMetadata.getInputStreamAsString());
}

From source file:ddf.catalog.transformer.input.pdf.PdfInputTransformer.java

License:Open Source License

/**
 * @param pdfDocument PDF document/*  www  . j  a  v  a  2  s.  c o m*/
 * @param metacard    A mutable metacard to add the extracted data to
 */
private void extractPdfMetadata(PDDocument pdfDocument, MetacardImpl metacard) {

    PDDocumentInformation documentInformation = pdfDocument.getDocumentInformation();

    setDateIfNotNull(documentInformation.getCreationDate(), metacard, Metacard.CREATED);

    setDateIfNotNull(documentInformation.getModificationDate(), metacard, Metacard.MODIFIED);

    if (usePdfTitleAsTitle) {
        setIfNotBlank(documentInformation.getTitle(), metacard, Metacard.TITLE);
    }

    setIfNotBlank(documentInformation.getAuthor(), metacard, Contact.CREATOR_NAME);

    setIfNotBlank(documentInformation.getSubject(), metacard, Metacard.DESCRIPTION);

    setIfNotBlank(documentInformation.getKeywords(), metacard, Topic.KEYWORD);

}

From source file:es.ucm.pdfmeta.Main.java

License:Open Source License

/**
 * Writes the author, title and BibTeX properties of the model into the
 * document information of {@code doc}; BibTeX is stored as a custom metadata
 * value under {@code BIBTEX_PROPERTY_NAME}.
 *
 * @param doc the PDF document whose information is updated
 * @param m   the model supplying the property values
 */
private static void modifyDocFromModel(PDDocument doc, MetadataModel<String> m) {
    // Each value is read and applied before the next one is looked up.
    String authorValue = m.getProperty(AUTHOR_PROPERTY_NAME).getValue();
    doc.getDocumentInformation().setAuthor(authorValue);

    String titleValue = m.getProperty(TITLE_PROPERTY_NAME).getValue();
    doc.getDocumentInformation().setTitle(titleValue);

    String bibtexValue = m.getProperty(BIBTEX_PROPERTY_NAME).getValue();
    doc.getDocumentInformation().setCustomMetadataValue(BIBTEX_PROPERTY_NAME, bibtexValue);
}

From source file:es.ucm.pdfmeta.Main.java

License:Open Source License

/**
 * Builds a metadata model holding the author, title and BibTeX values read
 * from the document information of {@code doc}; BibTeX is read from the
 * custom metadata value stored under {@code BIBTEX_PROPERTY_NAME}.
 *
 * @param doc the PDF document to read from
 * @return a new model with the three properties set
 */
private static MetadataModel<String> buildModelFromDocument(PDDocument doc) {
    MetadataModel<String> model = new MetadataModel<>();

    String authorValue = doc.getDocumentInformation().getAuthor();
    model.setProperty(AUTHOR_PROPERTY_NAME, new MetadataProperty<>(AUTHOR_PROPERTY_NAME, authorValue));

    String titleValue = doc.getDocumentInformation().getTitle();
    model.setProperty(TITLE_PROPERTY_NAME, new MetadataProperty<>(TITLE_PROPERTY_NAME, titleValue));

    String bibtexValue = doc.getDocumentInformation().getCustomMetadataValue(BIBTEX_PROPERTY_NAME);
    model.setProperty(BIBTEX_PROPERTY_NAME, new MetadataProperty<>(BIBTEX_PROPERTY_NAME, bibtexValue));

    return model;
}

From source file:fr.univ_tours.etu.pdf.LucenePDFDocument.java

License:Apache License

/**
 * Loads a PDF from the stream and adds its content, named entities, selected
 * info-dictionary entries and a summary (at most the first 1500 characters of
 * the cleaned text) to the Lucene document. The PDF is closed before returning.
 *
 * @param document The document to add the contents to.
 * @param is The stream to get the contents from.
 * @param documentLocation The location of the document; not referenced in this method body.
 *
 * @throws IOException If there is an error parsing the document.
 */
private void addContent(Document document, InputStream is, String documentLocation) throws IOException {
    PDDocument pdf = null;
    try {
        pdf = PDDocument.load(is);

        // The stripper is created on first use and kept in the field afterwards.
        if (stripper == null) {
            stripper = new PDFTextStripper();
        }
        StringWriter textSink = new StringWriter();
        stripper.writeText(pdf, textSink);

        // Replace every character in the Unicode categories Sm, Sk and So with a space.
        String contents = textSink.getBuffer().toString().replaceAll("\\p{Sm}|\\p{Sk}|\\p{So}", " ");

        // NOTE(review): getLemmaString() presumably reports on the text just passed to
        // getNamedEntities() — keep this call order.
        TextField namedEntities = this.getNamedEntities(contents);
        StringReader lemmaReader = new StringReader(nlpNeTokenizer.getLemmaString());

        // The lemma text is indexed as an unstored, tokenized field with positions and offsets.
        FieldType contentsType = new FieldType();
        contentsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
        contentsType.setStored(false);
        contentsType.setTokenized(true);
        document.add(new Field(DocFields.CONTENTS, lemmaReader, contentsType));

        PDDocumentInformation docInfo = pdf.getDocumentInformation();
        if (docInfo != null) {
            document.add(namedEntities);
            addTextField(document, DocFields.AUTHOR, docInfo.getAuthor());

            // Any failure while reading or adding the creation date is reported and skipped.
            try {
                addUnstoredDate(document, DocFields.CREATION_DATE, docInfo.getCreationDate().getTime());
            } catch (Exception e) {
                System.out.println("Warning: some issue with CreationDate attribute!");
            }

            addTextField(document, DocFields.CREATOR, docInfo.getCreator());
            addTextField(document, DocFields.KEYWORDS, docInfo.getKeywords());
            addTextField(document, DocFields.SUBJECT, docInfo.getSubject());
            addTextField(document, DocFields.TITLE, docInfo.getTitle());
        }

        // The summary is the leading part of the cleaned text, capped at 1500 characters.
        String summary = contents.length() > 1500 ? contents.substring(0, 1500) : contents;
        addUnindexedField(document, DocFields.SUMMARY, summary);
    } finally {
        if (pdf != null) {
            pdf.close();
        }
    }
}