Example usage for org.apache.pdfbox.pdmodel PDDocumentInformation getAuthor

List of usage examples for org.apache.pdfbox.pdmodel PDDocumentInformation getAuthor

Introduction

On this page you can find example usage for org.apache.pdfbox.pdmodel PDDocumentInformation getAuthor.

Prototype

public String getAuthor() 

Document

This will get the author of the document.
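
Before the usage examples below, here is a minimal, self-contained sketch (not taken from any of the listed source files) showing how getAuthor() might be called on a loaded document; the file name "sample.pdf" is a placeholder.

import java.io.File;
import java.io.IOException;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;

public class GetAuthorExample {
    public static void main(String[] args) throws IOException {
        // "sample.pdf" is a placeholder path, not a file referenced on this page.
        PDDocument document = PDDocument.load(new File("sample.pdf"));
        try {
            PDDocumentInformation info = document.getDocumentInformation();
            // getAuthor() returns null when the document information has no Author entry.
            String author = info.getAuthor();
            System.out.println(author != null ? author : "<no author set>");
        } finally {
            document.close();
        }
    }
}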

Usage

From source file: ddf.catalog.transformer.input.pdf.PdfInputTransformer.java

License: Open Source License

/**
 * @param pdfDocument PDF document
 * @param metacard    A mutable metacard to add the extracted data to
 */
private void extractPdfMetadata(PDDocument pdfDocument, MetacardImpl metacard) {

    PDDocumentInformation documentInformation = pdfDocument.getDocumentInformation();

    setDateIfNotNull(documentInformation.getCreationDate(), metacard, Metacard.CREATED);

    setDateIfNotNull(documentInformation.getModificationDate(), metacard, Metacard.MODIFIED);

    if (usePdfTitleAsTitle) {
        setIfNotBlank(documentInformation.getTitle(), metacard, Metacard.TITLE);
    }

    setIfNotBlank(documentInformation.getAuthor(), metacard, Contact.CREATOR_NAME);

    setIfNotBlank(documentInformation.getSubject(), metacard, Metacard.DESCRIPTION);

    setIfNotBlank(documentInformation.getKeywords(), metacard, Topic.KEYWORD);

}
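
The setIfNotBlank and setDateIfNotNull helpers are defined elsewhere in PdfInputTransformer and are not shown in this excerpt. A hypothetical sketch of what they could look like, assuming they simply guard against blank or null metadata values before setting a metacard attribute (the real DDF implementation may differ), is:

// Hypothetical reconstruction for illustration only; signatures and behavior
// may differ from the actual helpers in PdfInputTransformer.
private void setIfNotBlank(String value, MetacardImpl metacard, String attributeName) {
    // StringUtils here is assumed to be org.apache.commons.lang3.StringUtils
    if (StringUtils.isNotBlank(value)) {
        metacard.setAttribute(attributeName, value);
    }
}

private void setDateIfNotNull(Calendar date, MetacardImpl metacard, String attributeName) {
    if (date != null) {
        metacard.setAttribute(attributeName, date.getTime());
    }
}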

From source file: de.offis.health.icardea.cied.pdf.extractor.PDFApachePDFBoxExtractor.java

License: Apache License

/**
 * This method will return the key and value pairs stored in the PDF
 * information. It's the basic information like title, subject, author,
 * creator, keywords, producer (meaning application) as well as creation
 * and modification date. The method is provided for debugging purposes.
 *
 * @return Returns <code>key=value</code> pair line by line (using system
 * dependent newline).
 */
@SuppressWarnings("unused")
private String getPdfInfo() {
    StringBuffer stringBuffer = new StringBuffer();
    if (pdfDocument != null) {
        PDDocumentInformation pdfInfo = pdfDocument.getDocumentInformation();

        // Title
        if (pdfInfo.getTitle() != null) {
            stringBuffer.append("Title");
            stringBuffer.append("=");
            stringBuffer.append(pdfInfo.getTitle());
            stringBuffer.append(GlobalTools.LINESEPARATOR);
        } // end if

        // Subject
        if (pdfInfo.getSubject() != null) {
            stringBuffer.append("Subject");
            stringBuffer.append("=");
            stringBuffer.append(pdfInfo.getSubject());
            stringBuffer.append(GlobalTools.LINESEPARATOR);
        } // end if

        // Keywords
        if (pdfInfo.getKeywords() != null) {
            stringBuffer.append("Keywords");
            stringBuffer.append("=");
            stringBuffer.append(pdfInfo.getKeywords());
            stringBuffer.append(GlobalTools.LINESEPARATOR);
        } // end if

        // Author
        if (pdfInfo.getAuthor() != null) {
            stringBuffer.append("Author");
            stringBuffer.append("=");
            stringBuffer.append(pdfInfo.getAuthor());
            stringBuffer.append(GlobalTools.LINESEPARATOR);
        } // end if

        // Producer
        if (pdfInfo.getProducer() != null) {
            stringBuffer.append("Producer");
            stringBuffer.append("=");
            stringBuffer.append(pdfInfo.getProducer());
            stringBuffer.append(GlobalTools.LINESEPARATOR);
        } // end if

        // Creator
        if (pdfInfo.getCreator() != null) {
            stringBuffer.append("Creator");
            stringBuffer.append("=");
            stringBuffer.append(pdfInfo.getCreator());
            stringBuffer.append(GlobalTools.LINESEPARATOR);
        } // end if

        // CreationDate
        try {
            if (pdfInfo.getCreationDate() != null) {
                stringBuffer.append("CreationDate");
                stringBuffer.append("=");
                stringBuffer.append(GlobalTools.calendar2String(pdfInfo.getCreationDate(),
                        GlobalTools.DATE_FORMAT_STRING_ISO8601));
                stringBuffer.append(GlobalTools.LINESEPARATOR);
            } // end if
        } catch (IOException ex) {
        } // end try..catch

        // ModDate
        try {
            if (pdfInfo.getModificationDate() != null) {
                stringBuffer.append("ModDate");
                stringBuffer.append("=");
                stringBuffer.append(GlobalTools.calendar2String(pdfInfo.getModificationDate(),
                        GlobalTools.DATE_FORMAT_STRING_ISO8601));
                stringBuffer.append(GlobalTools.LINESEPARATOR);
            } // end if
        } catch (IOException ex) {
        } // end try..catch
    } // end if

    return stringBuffer.toString();
}

From source file: fr.univ_tours.etu.pdf.LucenePDFDocument.java

License: Apache License

/**
 * This will add the contents to the lucene document.
 *
 * @param document The document to add the contents to.
 * @param is The stream to get the contents from.
 * @param documentLocation The location of the document, used just for debug messages.
 *
 * @throws IOException If there is an error parsing the document.
 */
private void addContent(Document document, InputStream is, String documentLocation) throws IOException {
    PDDocument pdfDocument = null;
    try {
        pdfDocument = PDDocument.load(is);

        // create a writer where to append the text content.
        StringWriter writer = new StringWriter();
        if (stripper == null) {
            stripper = new PDFTextStripper();
        }
        stripper.writeText(pdfDocument, writer);

        String contentsDirty = writer.getBuffer().toString();
        //System.out.println(contentsDirty.substring(0,100));
        String contents = contentsDirty.replaceAll("\\p{Sm}|\\p{Sk}|\\p{So}", " ");
        //System.out.println(contents);

        // addTextField(document, DocFields.CONTENTS, reader);
        TextField ne = this.getNamedEntities(contents);

        String lemmas = nlpNeTokenizer.getLemmaString();

        //StringReader reader = new StringReader(contents);
        StringReader reader = new StringReader(lemmas);

        // Add the tag-stripped contents as a Reader-valued Text field so it will
        // get tokenized and indexed.

        FieldType type = new FieldType();
        type.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
        type.setStored(false);
        type.setTokenized(true);
        document.add(new Field(DocFields.CONTENTS, reader, type));

        PDDocumentInformation info = pdfDocument.getDocumentInformation();
        if (info != null) {
            document.add(ne);//adding named entities
            addTextField(document, DocFields.AUTHOR, info.getAuthor());

            try {//to avoid issues with CreationDate
                addUnstoredDate(document, DocFields.CREATION_DATE, info.getCreationDate().getTime());
            } catch (Exception e) {
                System.out.println("Warning: some issue with CreationDate attribute!");
            }

            addTextField(document, DocFields.CREATOR, info.getCreator());
            addTextField(document, DocFields.KEYWORDS, info.getKeywords());

            addTextField(document, DocFields.SUBJECT, info.getSubject());
            addTextField(document, DocFields.TITLE, info.getTitle());

            //addTextField(document, "Title", info.getTitle());
            //addTextField(document, "ModificationDate", info.getModificationDate());
            //addTextField(document, "Producer", info.getProducer());
            //addTextField(document, "Trapped", info.getTrapped());

        }

        int summarySize = Math.min(contents.length(), 1500);
        String summary = contents.substring(0, summarySize);
        // Add the summary as an UnIndexed field, so that it is stored and returned
        // with hit documents for display.
        addUnindexedField(document, DocFields.SUMMARY, summary);
    } finally {
        if (pdfDocument != null) {
            pdfDocument.close();
        }
    }
}

From source file: fr.univ_tours.etu.searcher.LucenePDFDocument.java

License: Apache License

/**
 * This will add the contents to the lucene document.
 *
 * @param document The document to add the contents to.
 * @param is The stream to get the contents from.
 * @param documentLocation The location of the document, used just for debug messages.
 *
 * @throws IOException If there is an error parsing the document.
 */
private void addContent(Document document, InputStream is, String documentLocation) throws IOException {
    PDDocument pdfDocument = null;
    try {
        pdfDocument = PDDocument.load(is);

        // create a writer where to append the text content.
        StringWriter writer = new StringWriter();
        if (stripper == null) {
            stripper = new PDFTextStripper();
        }
        stripper.writeText(pdfDocument, writer);

        // Note: the buffer to string operation is costless;
        // the char array value of the writer buffer and the content string
        // is shared as long as the buffer content is not modified, which will
        // not occur here.
        String contents = writer.getBuffer().toString();

        StringReader reader = new StringReader(contents);

        // Add the tag-stripped contents as a Reader-valued Text field so it will
        // get tokenized and indexed.
        addTextField(document, "contents", reader);

        PDDocumentInformation info = pdfDocument.getDocumentInformation();
        if (info != null) {
            addTextField(document, "Author", info.getAuthor());
            addTextField(document, "CreationDate", info.getCreationDate());
            addTextField(document, "Creator", info.getCreator());
            addTextField(document, "Keywords", info.getKeywords());
            addTextField(document, "ModificationDate", info.getModificationDate());
            addTextField(document, "Producer", info.getProducer());
            addTextField(document, "Subject", info.getSubject());
            addTextField(document, "Title", info.getTitle());
            addTextField(document, "Trapped", info.getTrapped());
        }
        int summarySize = Math.min(contents.length(), 1500);
        String summary = contents.substring(0, summarySize);
        // Add the summary as an UnIndexed field, so that it is stored and returned
        // with hit documents for display.
        addUnindexedField(document, "summary", summary);
    } finally {
        if (pdfDocument != null) {
            pdfDocument.close();
        }
    }
}

From source file: mj.ocraptor.extraction.tika.parser.pdf.PDFParser.java

License: Apache License

private void extractMetadata(PDDocument document, Metadata metadata) throws TikaException {
    PDDocumentInformation info = document.getDocumentInformation();
    metadata.set(PagedText.N_PAGES, document.getNumberOfPages());
    addMetadata(metadata, TikaCoreProperties.TITLE, info.getTitle());
    addMetadata(metadata, TikaCoreProperties.CREATOR, info.getAuthor());
    addMetadata(metadata, TikaCoreProperties.CREATOR_TOOL, info.getCreator());
    addMetadata(metadata, TikaCoreProperties.KEYWORDS, info.getKeywords());
    addMetadata(metadata, "producer", info.getProducer());
    // TODO: Move to description in Tika 2.0
    addMetadata(metadata, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT, info.getSubject());
    addMetadata(metadata, "trapped", info.getTrapped());
    try {
        // TODO Remove these in Tika 2.0
        addMetadata(metadata, "created", info.getCreationDate());
        addMetadata(metadata, TikaCoreProperties.CREATED, info.getCreationDate());
    } catch (IOException e) {
        // Invalid date format, just ignore
    }
    try {
        Calendar modified = info.getModificationDate();
        addMetadata(metadata, Metadata.LAST_MODIFIED, modified);
        addMetadata(metadata, TikaCoreProperties.MODIFIED, modified);
    } catch (IOException e) {
        // Invalid date format, just ignore
    }

    // All remaining metadata is custom
    // Copy this over as-is
    List<String> handledMetadata = Arrays.asList(new String[] { "Author", "Creator", "CreationDate", "ModDate",
            "Keywords", "Producer", "Subject", "Title", "Trapped" });
    for (COSName key : info.getDictionary().keySet()) {
        String name = key.getName();
        if (!handledMetadata.contains(name)) {
            addMetadata(metadata, name, info.getDictionary().getDictionaryObject(key));
        }
    }
}

From source file: net.padaf.preflight.xmp.SynchronizedMetaDataValidation.java

License: Apache License

/**
 * Analyze if Author(s) embedded in Document Information dictionary and in XMP
 * properties are synchronized
 * 
 * @param dico
 *          Document Information Dictionary
 * @param dc
 *          Dublin Core Schema
 * @param ve
 *          The list of validation errors
 */
protected void analyzeAuthorProperty(PDDocumentInformation dico, DublinCoreSchema dc,
        List<ValidationError> ve) {
    String author = dico.getAuthor();
    if (author != null) {
        if (dc != null) {
            if (dc.getCreator() != null) {
                if (dc.getCreatorValue().size() != 1) {
                    ve.add(AbsentXMPPropertyError("Author",
                            "In XMP metadata, Author(s) must be represented by a single entry in a text array (dc:creator) "));
                } else {
                    if (dc.getCreatorValue().get(0) == null) {
                        ve.add(AbsentXMPPropertyError("Author", "Property is defined as null"));
                    } else {
                        if (!dc.getCreatorValue().get(0).equals(author)) {
                            ve.add(unsynchronizedMetaDataError("Author"));
                        }
                    }
                }
            } else {
                ve.add(AbsentXMPPropertyError("Author", "Property is not defined in XMP Metadata"));
            }
        } else {
            ve.add(AbsentSchemaMetaDataError("Author", "Dublin Core"));
        }
    }
}

From source file: net.sf.jabref.logic.xmp.XMPUtil.java

License: Open Source License

/**
 * Helper function for retrieving a BibEntry from the
 * PDDocumentInformation in a PDF file.
 *
 * To understand how to get hold of a PDDocumentInformation have a look in
 * the test cases for XMPUtil.
 *
 * The BibEntry is build by mapping individual fields in the document
 * information (like author, title, keywords) to fields in a bibtex entry.
 *
 * @param di
 *            The document information from which to build a BibEntry.
 *
 * @return The bibtex entry found in the document information.
 */
public static Optional<BibEntry> getBibtexEntryFromDocumentInformation(PDDocumentInformation di) {

    BibEntry entry = new BibEntry();
    entry.setType("misc");

    String s = di.getAuthor();
    if (s != null) {
        entry.setField("author", s);
    }

    s = di.getTitle();
    if (s != null) {
        entry.setField("title", s);
    }

    s = di.getKeywords();
    if (s != null) {
        entry.setField("keywords", s);
    }

    s = di.getSubject();
    if (s != null) {
        entry.setField("abstract", s);
    }

    COSDictionary dict = di.getDictionary();
    for (Map.Entry<COSName, COSBase> o : dict.entrySet()) {
        String key = o.getKey().getName();
        if (key.startsWith("bibtex/")) {
            String value = dict.getString(key);
            key = key.substring("bibtex/".length());
            if ("entrytype".equals(key)) {
                entry.setType(value);
            } else {
                entry.setField(key, value);
            }
        }
    }

    // Return empty Optional if no values were found
    return entry.getFieldNames().isEmpty() ? Optional.empty() : Optional.of(entry);
}

From source file: net.sf.jabref.util.XMPUtil.java

License: Open Source License

/**
 * Helper function for retrieving a BibtexEntry from the
 * PDDocumentInformation in a PDF file.
 * 
 * To understand how to get hold of a PDDocumentInformation have a look in
 * the test cases for XMPUtil.
 * 
 * The BibtexEntry is build by mapping individual fields in the document
 * information (like author, title, keywords) to fields in a bibtex entry.
 * 
 * @param di
 *            The document information from which to build a BibtexEntry.
 * 
 * @return The bibtex entry found in the document information.
 */
@SuppressWarnings("unchecked")
public static BibtexEntry getBibtexEntryFromDocumentInformation(PDDocumentInformation di) {

    BibtexEntry entry = new BibtexEntry();

    String s = di.getAuthor();
    if (s != null) {
        entry.setField("author", s);
    }

    s = di.getTitle();
    if (s != null) {
        entry.setField("title", s);
    }

    s = di.getKeywords();
    if (s != null) {
        entry.setField("keywords", s);
    }

    s = di.getSubject();
    if (s != null) {
        entry.setField("abstract", s);
    }

    COSDictionary dict = di.getDictionary();
    for (Map.Entry<COSName, COSBase> o : dict.entrySet()) {
        String key = o.getKey().getName();
        if (key.startsWith("bibtex/")) {
            String value = dict.getString(key);
            key = key.substring("bibtex/".length());
            if (key.equals("entrytype")) {
                BibtexEntryType type = BibtexEntryType.getStandardType(value);
                if (type != null) {
                    entry.setType(type);
                }
            } else {
                entry.setField(key, value);
            }
        }
    }

    // Return null if no values were found
    return (!entry.getAllFields().isEmpty() ? entry : null);
}

From source file: net.sf.mmm.content.parser.impl.pdf.ContentParserPdf.java

License: Apache License

/**
 * {@inheritDoc}
 */
@Override
public void parse(InputStream inputStream, long filesize, ContentParserOptions options,
        MutableGenericContext context) throws Exception {

    PDFParser parser = new PDFParser(inputStream);
    parser.parse();
    PDDocument pdfDoc = parser.getPDDocument();
    try {
        if (pdfDoc.isEncrypted()) {
            // pdfDoc.decrypt("password");
            return;
        }
        PDDocumentInformation info = pdfDoc.getDocumentInformation();
        String title = info.getTitle();
        if (title != null) {
            context.setVariable(VARIABLE_NAME_TITLE, title);
        }
        String keywords = info.getKeywords();
        if (keywords != null) {
            context.setVariable(VARIABLE_NAME_KEYWORDS, keywords);
        }
        String author = info.getAuthor();
        if (author != null) {
            context.setVariable(VARIABLE_NAME_CREATOR, author);
        }

        if (filesize < options.getMaximumBufferSize()) {
            PDFTextStripper stripper = new PDFTextStripper();
            context.setVariable(VARIABLE_NAME_TEXT, stripper.getText(pdfDoc));
        }
    } finally {
        pdfDoc.close();
    }
}

From source file: net.sourceforge.docfetcher.model.parse.PdfParser.java

License: Open Source License

@Override
protected ParseResult parse(@NotNull InputStream in, @NotNull final ParseContext context)
        throws ParseException {
    PDDocument pdfDoc = null;
    try {
        /*
         * TODO post-release-1.1: check if 'force' argument in PDDocument/Stripper increases
         * number of parsed PDF files
         */
        pdfDoc = PDDocument.load(in, true);
        PDDocumentInformation pdInfo;
        final int pageCount;
        try {
            pdInfo = pdfDoc.getDocumentInformation();
            pageCount = pdfDoc.getNumberOfPages();
        } catch (ClassCastException e) {
            // Bug #3529070 and #3528345
            throw new ParseException(e);
        }
        StringWriter writer = new StringWriter();

        /*
         * If the PDF file is encrypted, the PDF stripper will automatically
         * try an empty password.
         * 
         * In contrast to the paging PDF parser that is used for the
         * preview, we do not need to call setSortByPosition(true) here
         * because the extracted text will be digested by Lucene anyway.
         */
        PDFTextStripper stripper = new PDFTextStripper() {
            protected void startPage(PDPage page) throws IOException {
                context.getReporter().subInfo(getCurrentPageNo(), pageCount);
            }

            protected void endPage(PDPage page) throws IOException {
                if (context.getCancelable().isCanceled())
                    setEndPage(0);
            }
        };
        stripper.setForceParsing(true);

        try {
            stripper.writeText(pdfDoc, writer);
        } catch (RuntimeException e) {
            /*
             * PDFTextStripper.writeText can throw various
             * RuntimeExceptions, see bugs #3446010, #3448272, #3444887.
             */
            throw new ParseException(e);
        }

        return new ParseResult(writer.getBuffer()).setTitle(pdInfo.getTitle()).addAuthor(pdInfo.getAuthor())
                .addMiscMetadata(pdInfo.getSubject()).addMiscMetadata(pdInfo.getKeywords());
    } catch (IOException e) {
        if (e.getCause() instanceof CryptographyException)
            throw new ParseException(Msg.doc_pw_protected.get());
        throw new ParseException(e);
    } finally {
        close(pdfDoc);
    }
}