List of usage examples for org.apache.pdfbox.pdmodel.PDDocument#getDocumentInformation()
public PDDocumentInformation getDocumentInformation()
From source file:com.opensearchserver.extractor.parser.PdfBox.java
License:Apache License
/**
 * Records the metadata of a PDF document through {@code metas.add(...)}.
 *
 * <p>When the document exposes an information dictionary, its title, subject,
 * author, producer, keywords, creation date and modification date are added.
 * The page count is always added, and the catalog language is added when a
 * catalog is available.
 *
 * @param pdf the PDF document to read the metadata from
 * @throws IOException declared by the original signature
 */
private void extractMetaData(PDDocument pdf) throws IOException {
    final PDDocumentInformation docInfo = pdf.getDocumentInformation();
    if (docInfo != null) {
        metas.add(TITLE, docInfo.getTitle());
        metas.add(SUBJECT, docInfo.getSubject());
        metas.add(AUTHOR, docInfo.getAuthor());
        metas.add(PRODUCER, docInfo.getProducer());
        metas.add(KEYWORDS, docInfo.getKeywords());
        // NOTE(review): the creation date is passed through getDate(...) while the
        // modification date is not; both helpers live outside this block - confirm
        // that the asymmetry is intended.
        metas.add(CREATION_DATE, getDate(getCreationDate(docInfo)));
        metas.add(MODIFICATION_DATE, getModificationDate(docInfo));
    }

    metas.add(NUMBER_OF_PAGES, pdf.getNumberOfPages());

    final PDDocumentCatalog docCatalog = pdf.getDocumentCatalog();
    if (docCatalog != null) {
        metas.add(LANGUAGE, docCatalog.getLanguage());
    }
}
From source file:com.qwazr.library.pdfbox.PdfBoxParser.java
License:Apache License
/**
 * Fills {@code metas} with the metadata of a PDF document.
 *
 * <p>The MIME type is set first from {@code DEFAULT_MIMETYPES[0]}. When the
 * document exposes an information dictionary, its title, subject, author,
 * producer, keywords, creation date and modification date are added. The page
 * count is always added, and the catalog language is added when a catalog is
 * available.
 *
 * @param pdf   the PDF document to read the metadata from
 * @param metas the builder receiving the extracted fields
 */
private void extractMetaData(final PDDocument pdf, final ParserFieldsBuilder metas) {
    metas.set(MIME_TYPE, DEFAULT_MIMETYPES[0]);

    final PDDocumentInformation docInfo = pdf.getDocumentInformation();
    if (docInfo != null) {
        metas.add(TITLE, docInfo.getTitle());
        metas.add(SUBJECT, docInfo.getSubject());
        metas.add(AUTHOR, docInfo.getAuthor());
        metas.add(PRODUCER, docInfo.getProducer());
        metas.add(KEYWORDS, docInfo.getKeywords());
        metas.add(CREATION_DATE, docInfo.getCreationDate());
        metas.add(MODIFICATION_DATE, docInfo.getModificationDate());
    }

    metas.add(NUMBER_OF_PAGES, pdf.getNumberOfPages());

    final PDDocumentCatalog docCatalog = pdf.getDocumentCatalog();
    if (docCatalog != null) {
        metas.add(LANGUAGE, docCatalog.getLanguage());
    }
}
From source file:com.sustainalytics.crawlerfilter.PDFTitleGeneration.java
License:Apache License
/** * This method extracts creation date/ custom date of a PDF file * @param file is a File object/* w w w .j av a2 s.c om*/ * @return String that contains the creation date/ custom date of the PDF */ public static String extractDate(File file) { PDDocument document = null; boolean isDamaged = false; //to deal with damaged pdf String creationDateMetaData = ""; try { document = PDDocument.load(file.toString()); /*If the PDF file is not damanged --->*/ if (!isDamaged) { /*...but the file is encrypted --->*/ if (document.isEncrypted()) { logger.info("File " + file.getName() + "is encrypted. Trying to decrypt..."); try { /*...then decryptt it --->*/ document.decrypt(""); document.setAllSecurityToBeRemoved(true); logger.info("File " + file.getName() + "successfully decrypted!"); } catch (CryptographyException e) { logger.info("Error decrypting file " + file.getName()); isDamaged = true; } } /*<--work around to decrypt an encrypted pdf ends here*/ /*Metadata extraction --->*/ PDDocumentInformation info = document.getDocumentInformation(); /*We are only interested in date data--->*/ Calendar calendar = info.getCreationDate(); int creationYear = 0, creationMonth = 0, creationDate = 0; if (calendar != null) { creationYear = calendar.get(Calendar.YEAR); creationMonth = calendar.get(Calendar.MONTH) + 1; creationDate = calendar.get(Calendar.DATE); } /*<---Date data extraction complete*/ /*If creation date is not empty --->*/ if (creationYear != 0) { creationDateMetaData = creationYear + "-" + creationMonth + "-" + creationDate; } //<--- creation date found and the date part of the title is generated /*No creation date is found --->*/ else { SimpleDateFormat dateFormatter = new SimpleDateFormat("MM/dd/yyyy"); Date customDate = null; /*But we have custom date some times --->*/ try { customDate = dateFormatter.parse(info.getCustomMetadataValue("customdate")); } catch (ParseException e) { logger.info("Error parsing date from custom date"); } calendar = Calendar.getInstance(); 
calendar.setTime(customDate); if (calendar != null) { creationYear = calendar.get(Calendar.YEAR); creationMonth = calendar.get(Calendar.MONTH) + 1; creationDate = calendar.get(Calendar.DATE); } /*<---Date data extraction complete from customdate*/ if (creationYear != 0) { creationDateMetaData = creationYear + "-" + creationMonth + "-" + creationDate; } } //<--- work around if no creation date is found } /*<--- Good to know that the PDF was not damaged*/ } catch (IOException e) { /*If the PDF was not read by the system --->*/ logger.info("Error processing file " + file.getName()); /*... then maybe it is damaged*/ isDamaged = true; } finally { try { /*If the file was good, not damaged, then please close it --->*/ if (!isDamaged) { document.close(); logger.info("File " + file.getName() + " is closed successfully!"); } } catch (IOException e) { logger.info("Error closing file " + file.getName()); } } /*<--- PDF closing done!*/ return creationDateMetaData; }
From source file:com.synopsys.integration.blackduck.report.pdf.RiskReportPdfWriter.java
License:Apache License
/**
 * Writes the risk report for {@code report} as a PDF file inside
 * {@code outputDirectory}.
 *
 * <p>The file name is built from the URI-escaped project name and project
 * version followed by {@code _BlackDuck_RiskReport.pdf}. A file already present
 * at that location is deleted first. The header, project information, summary
 * tables and component table are then written top to bottom, each section
 * starting at the lower edge returned by the previous one.
 *
 * @param outputDirectory the directory that receives the report file
 * @param report          the data to render
 * @return the report file
 * @throws RiskReportException when an {@link IOException} or
 *                             {@link URISyntaxException} occurs while writing
 */
public File createPDFReportFile(final File outputDirectory, final ReportData report) throws RiskReportException {
    final IntegrationEscapeUtil uriEscaper = new IntegrationEscapeUtil();
    final String projectPart = uriEscaper.escapeForUri(report.getProjectName());
    final String versionPart = uriEscaper.escapeForUri(report.getProjectVersion());
    final String reportFileName = projectPart + "_" + versionPart + "_BlackDuck_RiskReport.pdf";

    final File pdfFile = new File(outputDirectory, reportFileName);
    if (pdfFile.exists()) {
        pdfFile.delete();
    }

    final PDDocument document = new PDDocument();
    document.getDocumentInformation().setAuthor("Black Duck Software");
    document.getDocumentInformation().setCreator("Integrations");
    document.getDocumentInformation().setSubject("Hub Risk Report");

    try (PDFBoxManager manager = new PDFBoxManager(pdfFile, document)) {
        this.pdfManager = manager;

        final PDRectangle mediaBox = manager.currentPage.getMediaBox();
        final float pageWidth = mediaBox.getWidth();
        final float pageHeight = mediaBox.getHeight();

        // Each section begins at the lower edge of the section above it.
        final float headerBottomY = writeHeader(pageWidth, pageHeight).getLowerLeftY();
        final float projectInfoBottomY = writeProjectInformation(pageWidth, headerBottomY, report).getLowerLeftY();
        final float summaryBottomY = writeSummaryTables(pageWidth, projectInfoBottomY, report).getLowerLeftY();
        writeComponentTable(pageWidth, summaryBottomY, report);

        return pdfFile;
    } catch (final IOException | URISyntaxException e) {
        final String message = "Couldn't create the report: " + e.getMessage();
        logger.trace(message, e);
        throw new RiskReportException(message, e);
    }
}
From source file:com.synopsys.integration.blackduck.service.model.pdf.RiskReportPdfWriter.java
License:Apache License
public File createPDFReportFile(final File outputDirectory, final ReportData report) throws RiskReportException { final IntegrationEscapeUtil escapeUtil = new IntegrationEscapeUtil(); final String escapedProjectName = escapeUtil.escapeForUri(report.getProjectName()); final String escapedProjectVersionName = escapeUtil.escapeForUri(report.getProjectVersion()); final File pdfFile = new File(outputDirectory, escapedProjectName + "_" + escapedProjectVersionName + "_BlackDuck_RiskReport.pdf"); if (pdfFile.exists()) { pdfFile.delete();//from w w w.j a v a 2s. c o m } final PDDocument document = new PDDocument(); document.getDocumentInformation().setAuthor("Black Duck Software"); document.getDocumentInformation().setCreator("Integrations"); document.getDocumentInformation().setSubject("Black Duck Risk Report"); try (PDFBoxManager pdfManager = new PDFBoxManager(pdfFile, document)) { this.pdfManager = pdfManager; final PDRectangle pageBox = pdfManager.currentPage.getMediaBox(); final float pageWidth = pageBox.getWidth(); final float pageHeight = pageBox.getHeight(); final PDRectangle headerRectangle = writeHeader(pageWidth, pageHeight); final PDRectangle bottomOfProjectInfoRectangle = writeProjectInformation(pageWidth, headerRectangle.getLowerLeftY(), report); final PDRectangle bottomOfSummaryTableRectangle = writeSummaryTables(pageWidth, bottomOfProjectInfoRectangle.getLowerLeftY(), report); final PDRectangle bottomOfComponentTableRectangle = writeComponentTable(pageWidth, bottomOfSummaryTableRectangle.getLowerLeftY(), report); return pdfFile; } catch (final IOException | URISyntaxException e) { final String errorString = "Couldn't create the report: "; logger.trace(errorString + e.getMessage(), e); throw new RiskReportException(errorString + e.getMessage(), e); } }
From source file:com.wintindustries.pdffilter.pdfcore.PDFTester.java
static public void printMetadata(PDDocument document) throws IOException { PDDocumentInformation info = document.getDocumentInformation(); PDDocumentCatalog cat = document.getDocumentCatalog(); PDMetadata metadata = cat.getMetadata(); System.out.println("Page Count=" + document.getNumberOfPages()); System.out.println("Title=" + info.getTitle()); System.out.println("Author=" + info.getAuthor()); System.out.println("Subject=" + info.getSubject()); System.out.println("Keywords=" + info.getKeywords()); System.out.println("Creator=" + info.getCreator()); System.out.println("Producer=" + info.getProducer()); System.out.println("Creation Date=" + formatDate(info.getCreationDate())); System.out.println("Modification Date=" + formatDate(info.getModificationDate())); System.out.println("Trapped=" + info.getTrapped()); if (metadata != null) { System.out.println("Metadata=" + metadata.getInputStreamAsString()); }//from w w w.j a v a 2 s .c o m }
From source file:ddf.catalog.transformer.input.pdf.PdfInputTransformer.java
License:Open Source License
/** * @param pdfDocument PDF document/* www . j a v a 2 s. c o m*/ * @param metacard A mutable metacard to add the extracted data to */ private void extractPdfMetadata(PDDocument pdfDocument, MetacardImpl metacard) { PDDocumentInformation documentInformation = pdfDocument.getDocumentInformation(); setDateIfNotNull(documentInformation.getCreationDate(), metacard, Metacard.CREATED); setDateIfNotNull(documentInformation.getModificationDate(), metacard, Metacard.MODIFIED); if (usePdfTitleAsTitle) { setIfNotBlank(documentInformation.getTitle(), metacard, Metacard.TITLE); } setIfNotBlank(documentInformation.getAuthor(), metacard, Contact.CREATOR_NAME); setIfNotBlank(documentInformation.getSubject(), metacard, Metacard.DESCRIPTION); setIfNotBlank(documentInformation.getKeywords(), metacard, Topic.KEYWORD); }
From source file:es.ucm.pdfmeta.Main.java
License:Open Source License
/**
 * Writes the author, title and BibTeX values held by the model into the
 * information dictionary of the document. The BibTeX value is stored as a
 * custom metadata entry keyed by {@code BIBTEX_PROPERTY_NAME}.
 *
 * @param doc the PDF document to update
 * @param m   the model supplying the property values
 */
private static void modifyDocFromModel(PDDocument doc, MetadataModel<String> m) {
    doc.getDocumentInformation()
            .setAuthor(m.getProperty(AUTHOR_PROPERTY_NAME).getValue());

    doc.getDocumentInformation()
            .setTitle(m.getProperty(TITLE_PROPERTY_NAME).getValue());

    doc.getDocumentInformation()
            .setCustomMetadataValue(
                    BIBTEX_PROPERTY_NAME,
                    m.getProperty(BIBTEX_PROPERTY_NAME).getValue());
}
From source file:es.ucm.pdfmeta.Main.java
License:Open Source License
private static MetadataModel<String> buildModelFromDocument(PDDocument doc) { MetadataModel<String> m = new MetadataModel<>(); m.setProperty(AUTHOR_PROPERTY_NAME,// w w w .j a va 2s . co m new MetadataProperty<>(AUTHOR_PROPERTY_NAME, doc.getDocumentInformation().getAuthor())); m.setProperty(TITLE_PROPERTY_NAME, new MetadataProperty<>(TITLE_PROPERTY_NAME, doc.getDocumentInformation().getTitle())); m.setProperty(BIBTEX_PROPERTY_NAME, new MetadataProperty<>(BIBTEX_PROPERTY_NAME, doc.getDocumentInformation().getCustomMetadataValue(BIBTEX_PROPERTY_NAME))); return m; }
From source file:fr.univ_tours.etu.pdf.LucenePDFDocument.java
License:Apache License
/** * This will add the contents to the lucene document. * * @param document The document to add the contents to. * @param is The stream to get the contents from. * @param documentLocation The location of the document, used just for debug messages. * * @throws IOException If there is an error parsing the document. *///from ww w. ja v a 2 s . c om private void addContent(Document document, InputStream is, String documentLocation) throws IOException { PDDocument pdfDocument = null; try { pdfDocument = PDDocument.load(is); // create a writer where to append the text content. StringWriter writer = new StringWriter(); if (stripper == null) { stripper = new PDFTextStripper(); } stripper.writeText(pdfDocument, writer); String contentsDirty = writer.getBuffer().toString(); //System.out.println(contentsDirty.substring(0,100)); String contents = contentsDirty.replaceAll("\\p{Sm}|\\p{Sk}|\\p{So}", " "); //System.out.println(contents); // addTextField(document, DocFields.CONTENTS, reader); TextField ne = this.getNamedEntities(contents); String lemmas = nlpNeTokenizer.getLemmaString(); //StringReader reader = new StringReader(contents); StringReader reader = new StringReader(lemmas); // Add the tag-stripped contents as a Reader-valued Text field so it will // get tokenized and indexed. 
FieldType type = new FieldType(); type.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); type.setStored(false); type.setTokenized(true); document.add(new Field(DocFields.CONTENTS, reader, type)); PDDocumentInformation info = pdfDocument.getDocumentInformation(); if (info != null) { document.add(ne);//adding named entities addTextField(document, DocFields.AUTHOR, info.getAuthor()); try {//to avoid issues with CreationDate addUnstoredDate(document, DocFields.CREATION_DATE, info.getCreationDate().getTime()); } catch (Exception e) { System.out.println("Warning: some issue with CreationDate attribute!"); } addTextField(document, DocFields.CREATOR, info.getCreator()); addTextField(document, DocFields.KEYWORDS, info.getKeywords()); addTextField(document, DocFields.SUBJECT, info.getSubject()); addTextField(document, DocFields.TITLE, info.getTitle()); //addTextField(document, "Title", info.getTitle()); //addTextField(document, "ModificationDate", info.getModificationDate()); //addTextField(document, "Producer", info.getProducer()); //addTextField(document, "Trapped", info.getTrapped()); } int summarySize = Math.min(contents.length(), 1500); String summary = contents.substring(0, summarySize); // Add the summary as an UnIndexed field, so that it is stored and returned // with hit documents for display. addUnindexedField(document, DocFields.SUMMARY, summary); } finally { if (pdfDocument != null) { pdfDocument.close(); } } }