List of usage examples for org.apache.pdfbox.pdmodel PDDocumentInformation getAuthor
public String getAuthor()
From source file:ddf.catalog.transformer.input.pdf.PdfInputTransformer.java
License:Open Source License
/** * @param pdfDocument PDF document/* ww w . ja v a 2 s . c o m*/ * @param metacard A mutable metacard to add the extracted data to */ private void extractPdfMetadata(PDDocument pdfDocument, MetacardImpl metacard) { PDDocumentInformation documentInformation = pdfDocument.getDocumentInformation(); setDateIfNotNull(documentInformation.getCreationDate(), metacard, Metacard.CREATED); setDateIfNotNull(documentInformation.getModificationDate(), metacard, Metacard.MODIFIED); if (usePdfTitleAsTitle) { setIfNotBlank(documentInformation.getTitle(), metacard, Metacard.TITLE); } setIfNotBlank(documentInformation.getAuthor(), metacard, Contact.CREATOR_NAME); setIfNotBlank(documentInformation.getSubject(), metacard, Metacard.DESCRIPTION); setIfNotBlank(documentInformation.getKeywords(), metacard, Topic.KEYWORD); }
From source file:de.offis.health.icardea.cied.pdf.extractor.PDFApachePDFBoxExtractor.java
License:Apache License
/** * This method will return the key and value pairs stored in the PDF * information. It's the basic information like title, subject, author, * creator, keywords, producer (meaning application) as well as creation * and modification date. The method is provided for debugging purposes. * //from www . j av a 2 s. c om * @return Returns <code>key=value</code> pair line by line (using system * dependent newline). */ @SuppressWarnings("unused") private String getPdfInfo() { StringBuffer stringBuffer = new StringBuffer(); if (pdfDocument != null) { PDDocumentInformation pdfInfo = pdfDocument.getDocumentInformation(); // Title if (pdfInfo.getTitle() != null) { stringBuffer.append("Title"); stringBuffer.append("="); stringBuffer.append(pdfInfo.getTitle()); stringBuffer.append(GlobalTools.LINESEPARATOR); } // end if // Subject if (pdfInfo.getSubject() != null) { stringBuffer.append("Subject"); stringBuffer.append("="); stringBuffer.append(pdfInfo.getSubject()); stringBuffer.append(GlobalTools.LINESEPARATOR); } // end if // Keywords if (pdfInfo.getKeywords() != null) { stringBuffer.append("Keywords"); stringBuffer.append("="); stringBuffer.append(pdfInfo.getKeywords()); stringBuffer.append(GlobalTools.LINESEPARATOR); } // end if // Author if (pdfInfo.getAuthor() != null) { stringBuffer.append("Author"); stringBuffer.append("="); stringBuffer.append(pdfInfo.getAuthor()); stringBuffer.append(GlobalTools.LINESEPARATOR); } // end if // Producer if (pdfInfo.getProducer() != null) { stringBuffer.append("Producer"); stringBuffer.append("="); stringBuffer.append(pdfInfo.getProducer()); stringBuffer.append(GlobalTools.LINESEPARATOR); } // end if // Creator if (pdfInfo.getCreator() != null) { stringBuffer.append("Creator"); stringBuffer.append("="); stringBuffer.append(pdfInfo.getCreator()); stringBuffer.append(GlobalTools.LINESEPARATOR); } // end if // CreationDate try { if (pdfInfo.getCreationDate() != null) { stringBuffer.append("CreationDate"); stringBuffer.append("="); 
stringBuffer.append(GlobalTools.calendar2String(pdfInfo.getCreationDate(), GlobalTools.DATE_FORMAT_STRING_ISO8601)); stringBuffer.append(GlobalTools.LINESEPARATOR); } // end if } catch (IOException ex) { } // end try..catch // ModDate try { if (pdfInfo.getModificationDate() != null) { stringBuffer.append("ModDate"); stringBuffer.append("="); stringBuffer.append(GlobalTools.calendar2String(pdfInfo.getModificationDate(), GlobalTools.DATE_FORMAT_STRING_ISO8601)); stringBuffer.append(GlobalTools.LINESEPARATOR); } // end if } catch (IOException ex) { } // end try..catch } // end if return stringBuffer.toString(); }
From source file:fr.univ_tours.etu.pdf.LucenePDFDocument.java
License:Apache License
/** * This will add the contents to the lucene document. * * @param document The document to add the contents to. * @param is The stream to get the contents from. * @param documentLocation The location of the document, used just for debug messages. * * @throws IOException If there is an error parsing the document. *///from www. j ava 2s. com private void addContent(Document document, InputStream is, String documentLocation) throws IOException { PDDocument pdfDocument = null; try { pdfDocument = PDDocument.load(is); // create a writer where to append the text content. StringWriter writer = new StringWriter(); if (stripper == null) { stripper = new PDFTextStripper(); } stripper.writeText(pdfDocument, writer); String contentsDirty = writer.getBuffer().toString(); //System.out.println(contentsDirty.substring(0,100)); String contents = contentsDirty.replaceAll("\\p{Sm}|\\p{Sk}|\\p{So}", " "); //System.out.println(contents); // addTextField(document, DocFields.CONTENTS, reader); TextField ne = this.getNamedEntities(contents); String lemmas = nlpNeTokenizer.getLemmaString(); //StringReader reader = new StringReader(contents); StringReader reader = new StringReader(lemmas); // Add the tag-stripped contents as a Reader-valued Text field so it will // get tokenized and indexed. 
FieldType type = new FieldType(); type.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); type.setStored(false); type.setTokenized(true); document.add(new Field(DocFields.CONTENTS, reader, type)); PDDocumentInformation info = pdfDocument.getDocumentInformation(); if (info != null) { document.add(ne);//adding named entities addTextField(document, DocFields.AUTHOR, info.getAuthor()); try {//to avoid issues with CreationDate addUnstoredDate(document, DocFields.CREATION_DATE, info.getCreationDate().getTime()); } catch (Exception e) { System.out.println("Warning: some issue with CreationDate attribute!"); } addTextField(document, DocFields.CREATOR, info.getCreator()); addTextField(document, DocFields.KEYWORDS, info.getKeywords()); addTextField(document, DocFields.SUBJECT, info.getSubject()); addTextField(document, DocFields.TITLE, info.getTitle()); //addTextField(document, "Title", info.getTitle()); //addTextField(document, "ModificationDate", info.getModificationDate()); //addTextField(document, "Producer", info.getProducer()); //addTextField(document, "Trapped", info.getTrapped()); } int summarySize = Math.min(contents.length(), 1500); String summary = contents.substring(0, summarySize); // Add the summary as an UnIndexed field, so that it is stored and returned // with hit documents for display. addUnindexedField(document, DocFields.SUMMARY, summary); } finally { if (pdfDocument != null) { pdfDocument.close(); } } }
From source file:fr.univ_tours.etu.searcher.LucenePDFDocument.java
License:Apache License
/**
 * Extracts the text of a PDF stream and adds content, metadata and summary fields to the
 * lucene document.
 *
 * <p>The full text goes into the {@code "contents"} field, every document-information
 * entry into a field of its own, and the first 1500 characters (at most) of the text into
 * the {@code "summary"} field.
 *
 * @param document The document to add the contents to.
 * @param is The stream to get the contents from.
 * @param documentLocation The location of the document, used just for debug messages.
 *
 * @throws IOException If there is an error parsing the document.
 */
private void addContent(Document document, InputStream is, String documentLocation) throws IOException {
    PDDocument pdf = null;
    try {
        pdf = PDDocument.load(is);

        // Collect the stripped text in memory; the stripper is created lazily and reused.
        StringWriter extracted = new StringWriter();
        if (stripper == null) {
            stripper = new PDFTextStripper();
        }
        stripper.writeText(pdf, extracted);
        String contents = extracted.getBuffer().toString();

        // Reader-valued text field, so the contents get tokenized and indexed.
        addTextField(document, "contents", new StringReader(contents));

        PDDocumentInformation info = pdf.getDocumentInformation();
        if (info != null) {
            addTextField(document, "Author", info.getAuthor());
            addTextField(document, "CreationDate", info.getCreationDate());
            addTextField(document, "Creator", info.getCreator());
            addTextField(document, "Keywords", info.getKeywords());
            addTextField(document, "ModificationDate", info.getModificationDate());
            addTextField(document, "Producer", info.getProducer());
            addTextField(document, "Subject", info.getSubject());
            addTextField(document, "Title", info.getTitle());
            addTextField(document, "Trapped", info.getTrapped());
        }

        // The summary goes through addUnindexedField so it can be shown with hit documents.
        String summary = contents.substring(0, Math.min(contents.length(), 1500));
        addUnindexedField(document, "summary", summary);
    } finally {
        if (pdf != null) {
            pdf.close();
        }
    }
}
From source file:mj.ocraptor.extraction.tika.parser.pdf.PDFParser.java
License:Apache License
private void extractMetadata(PDDocument document, Metadata metadata) throws TikaException { PDDocumentInformation info = document.getDocumentInformation(); metadata.set(PagedText.N_PAGES, document.getNumberOfPages()); addMetadata(metadata, TikaCoreProperties.TITLE, info.getTitle()); addMetadata(metadata, TikaCoreProperties.CREATOR, info.getAuthor()); addMetadata(metadata, TikaCoreProperties.CREATOR_TOOL, info.getCreator()); addMetadata(metadata, TikaCoreProperties.KEYWORDS, info.getKeywords()); addMetadata(metadata, "producer", info.getProducer()); // TODO: Move to description in Tika 2.0 addMetadata(metadata, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT, info.getSubject()); addMetadata(metadata, "trapped", info.getTrapped()); try {//from w w w . ja v a 2 s . c o m // TODO Remove these in Tika 2.0 addMetadata(metadata, "created", info.getCreationDate()); addMetadata(metadata, TikaCoreProperties.CREATED, info.getCreationDate()); } catch (IOException e) { // Invalid date format, just ignore } try { Calendar modified = info.getModificationDate(); addMetadata(metadata, Metadata.LAST_MODIFIED, modified); addMetadata(metadata, TikaCoreProperties.MODIFIED, modified); } catch (IOException e) { // Invalid date format, just ignore } // All remaining metadata is custom // Copy this over as-is List<String> handledMetadata = Arrays.asList(new String[] { "Author", "Creator", "CreationDate", "ModDate", "Keywords", "Producer", "Subject", "Title", "Trapped" }); for (COSName key : info.getDictionary().keySet()) { String name = key.getName(); if (!handledMetadata.contains(name)) { addMetadata(metadata, name, info.getDictionary().getDictionaryObject(key)); } } }
From source file:net.padaf.preflight.xmp.SynchronizedMetaDataValidation.java
License:Apache License
/** * Analyze if Author(s) embedded in Document Information dictionary and in XMP * properties are synchronized/*from w w w. j a va 2 s . c om*/ * * @param dico * Document Information Dictionary * @param dc * Dublin Core Schema * @param ve * The list of validation errors */ protected void analyzeAuthorProperty(PDDocumentInformation dico, DublinCoreSchema dc, List<ValidationError> ve) { String author = dico.getAuthor(); if (author != null) { if (dc != null) { if (dc.getCreator() != null) { if (dc.getCreatorValue().size() != 1) { ve.add(AbsentXMPPropertyError("Author", "In XMP metadata, Author(s) must be represented by a single entry in a text array (dc:creator) ")); } else { if (dc.getCreatorValue().get(0) == null) { ve.add(AbsentXMPPropertyError("Author", "Property is defined as null")); } else { if (!dc.getCreatorValue().get(0).equals(author)) { ve.add(unsynchronizedMetaDataError("Author")); } } } } else { ve.add(AbsentXMPPropertyError("Author", "Property is not defined in XMP Metadata")); } } else { ve.add(AbsentSchemaMetaDataError("Author", "Dublin Core")); } } }
From source file:net.sf.jabref.logic.xmp.XMPUtil.java
License:Open Source License
/** * Helper function for retrieving a BibEntry from the * PDDocumentInformation in a PDF file./*w w w .ja v a2 s. co m*/ * * To understand how to get hold of a PDDocumentInformation have a look in * the test cases for XMPUtil. * * The BibEntry is build by mapping individual fields in the document * information (like author, title, keywords) to fields in a bibtex entry. * * @param di * The document information from which to build a BibEntry. * * @return The bibtex entry found in the document information. */ public static Optional<BibEntry> getBibtexEntryFromDocumentInformation(PDDocumentInformation di) { BibEntry entry = new BibEntry(); entry.setType("misc"); String s = di.getAuthor(); if (s != null) { entry.setField("author", s); } s = di.getTitle(); if (s != null) { entry.setField("title", s); } s = di.getKeywords(); if (s != null) { entry.setField("keywords", s); } s = di.getSubject(); if (s != null) { entry.setField("abstract", s); } COSDictionary dict = di.getDictionary(); for (Map.Entry<COSName, COSBase> o : dict.entrySet()) { String key = o.getKey().getName(); if (key.startsWith("bibtex/")) { String value = dict.getString(key); key = key.substring("bibtex/".length()); if ("entrytype".equals(key)) { entry.setType(value); } else { entry.setField(key, value); } } } // Return empty Optional if no values were found return entry.getFieldNames().isEmpty() ? Optional.empty() : Optional.of(entry); }
From source file:net.sf.jabref.util.XMPUtil.java
License:Open Source License
/** * Helper function for retrieving a BibtexEntry from the * PDDocumentInformation in a PDF file.// w w w.ja va2 s . c o m * * To understand how to get hold of a PDDocumentInformation have a look in * the test cases for XMPUtil. * * The BibtexEntry is build by mapping individual fields in the document * information (like author, title, keywords) to fields in a bibtex entry. * * @param di * The document information from which to build a BibtexEntry. * * @return The bibtex entry found in the document information. */ @SuppressWarnings("unchecked") public static BibtexEntry getBibtexEntryFromDocumentInformation(PDDocumentInformation di) { BibtexEntry entry = new BibtexEntry(); String s = di.getAuthor(); if (s != null) { entry.setField("author", s); } s = di.getTitle(); if (s != null) { entry.setField("title", s); } s = di.getKeywords(); if (s != null) { entry.setField("keywords", s); } s = di.getSubject(); if (s != null) { entry.setField("abstract", s); } COSDictionary dict = di.getDictionary(); for (Map.Entry<COSName, COSBase> o : dict.entrySet()) { String key = o.getKey().getName(); if (key.startsWith("bibtex/")) { String value = dict.getString(key); key = key.substring("bibtex/".length()); if (key.equals("entrytype")) { BibtexEntryType type = BibtexEntryType.getStandardType(value); if (type != null) { entry.setType(type); } } else { entry.setField(key, value); } } } // Return null if no values were found return (!entry.getAllFields().isEmpty() ? entry : null); }
From source file:net.sf.mmm.content.parser.impl.pdf.ContentParserPdf.java
License:Apache License
/** * {@inheritDoc}/*from w w w.ja v a 2s . c o m*/ */ @Override public void parse(InputStream inputStream, long filesize, ContentParserOptions options, MutableGenericContext context) throws Exception { PDFParser parser = new PDFParser(inputStream); parser.parse(); PDDocument pdfDoc = parser.getPDDocument(); try { if (pdfDoc.isEncrypted()) { // pdfDoc.decrypt("password"); return; } PDDocumentInformation info = pdfDoc.getDocumentInformation(); String title = info.getTitle(); if (title != null) { context.setVariable(VARIABLE_NAME_TITLE, title); } String keywords = info.getKeywords(); if (keywords != null) { context.setVariable(VARIABLE_NAME_KEYWORDS, keywords); } String author = info.getAuthor(); if (author != null) { context.setVariable(VARIABLE_NAME_CREATOR, author); } if (filesize < options.getMaximumBufferSize()) { PDFTextStripper stripper = new PDFTextStripper(); context.setVariable(VARIABLE_NAME_TEXT, stripper.getText(pdfDoc)); } } finally { pdfDoc.close(); } }
From source file:net.sourceforge.docfetcher.model.parse.PdfParser.java
License:Open Source License
@Override protected ParseResult parse(@NotNull InputStream in, @NotNull final ParseContext context) throws ParseException { PDDocument pdfDoc = null;//from w ww . j av a2 s. com try { /* * TODO post-release-1.1: check if 'force' argument in PDDocument/Stripper increases * number of parsed PDF files */ pdfDoc = PDDocument.load(in, true); PDDocumentInformation pdInfo; final int pageCount; try { pdInfo = pdfDoc.getDocumentInformation(); pageCount = pdfDoc.getNumberOfPages(); } catch (ClassCastException e) { // Bug #3529070 and #3528345 throw new ParseException(e); } StringWriter writer = new StringWriter(); /* * If the PDF file is encrypted, the PDF stripper will automatically * try an empty password. * * In contrast to the paging PDF parser that is used for the * preview, we do not need to call setSortByPosition(true) here * because the extracted text will be digested by Lucene anyway. */ PDFTextStripper stripper = new PDFTextStripper() { protected void startPage(PDPage page) throws IOException { context.getReporter().subInfo(getCurrentPageNo(), pageCount); } protected void endPage(PDPage page) throws IOException { if (context.getCancelable().isCanceled()) setEndPage(0); } }; stripper.setForceParsing(true); try { stripper.writeText(pdfDoc, writer); } catch (RuntimeException e) { /* * PDFTextStripper.writeText can throw various * RuntimeExceptions, see bugs #3446010, #3448272, #3444887. */ throw new ParseException(e); } return new ParseResult(writer.getBuffer()).setTitle(pdInfo.getTitle()).addAuthor(pdInfo.getAuthor()) .addMiscMetadata(pdInfo.getSubject()).addMiscMetadata(pdInfo.getKeywords()); } catch (IOException e) { if (e.getCause() instanceof CryptographyException) throw new ParseException(Msg.doc_pw_protected.get()); throw new ParseException(e); } finally { close(pdfDoc); } }