Usage examples for org.apache.pdfbox.pdmodel.PDDocument.getDocumentInformation()
public PDDocumentInformation getDocumentInformation()
From source file:org.paxle.parser.pdf.impl.PdfParser.java
License:Open Source License
/** * A function to extract metadata from the PDF-document. *//*from w ww . ja v a 2 s . com*/ protected void extractMetaData(IParserDocument parserDoc, PDDocument pddDoc) throws IOException { // extract metadata final PDDocumentInformation metadata = pddDoc.getDocumentInformation(); if (metadata == null) return; // document title final String title = metadata.getTitle(); if (title != null && title.length() > 0) parserDoc.setTitle(title); // document author(s) final String author = metadata.getAuthor(); if (author != null && author.length() > 0) parserDoc.setAuthor(author); ; // subject final String summary = metadata.getSubject(); if (summary != null && summary.length() > 0) parserDoc.setSummary(summary); // keywords final String keywords = metadata.getKeywords(); if (keywords != null && keywords.length() > 0) { String[] keywordArray = keywords.split("[,;\\s]"); if (keywordArray != null && keywordArray.length > 0) { parserDoc.setKeywords(Arrays.asList(keywordArray)); } } // last modification date final Calendar lastMod = metadata.getModificationDate(); if (lastMod != null) { parserDoc.setLastChanged(lastMod.getTime()); } }
From source file:org.pdfmetamodifier.IOHelper.java
License:Apache License
/** * Save Metadata./*from w w w .ja v a 2s .com*/ * * @param pdfFile * Source PDF file. * @param metadataFile * File with Metadata in user-frendly format. * @throws IOException */ /* * See: * https://svn.apache.org/viewvc/pdfbox/trunk/examples/src/main/java/org/apache/pdfbox/examples/pdmodel/ExtractMetadata.java?view=markup */ public static void saveMetadata(final File pdfFile, final File metadataFile) throws IOException { PDDocument document = null; try { // Read PDF file. document = PDDocument.load(pdfFile); if (document.isEncrypted()) { throw new IOException("Document is encrypted."); } // Get data from PDF file. final PDDocumentInformation information = document.getDocumentInformation(); // Convert. final List<String> lines = MetadataHelper.metadataToLineList(information); // Write line list into the text file. Files.write(metadataFile.toPath(), lines); } finally { if (document != null) { document.close(); } } }
From source file:org.pdfsam.pdf.DefaultPDFBoxLoader.java
License:Open Source License
/**
 * Fills the descriptor with data read from the loaded PDF document: page
 * count, PDF version, the standard document-information entries and, when a
 * creation date is present, its formatted representation.
 *
 * @param document the loaded PDF document to read from
 * @param descriptor the descriptor to populate
 */
public void accept(PDDocument document, PdfDocumentDescriptor descriptor) {
    // Structural properties first.
    descriptor.pages(document.getNumberOfPages());
    final String versionText = Float.toString(document.getVersion());
    descriptor.setVersion(getVersion(versionText));

    // Document-information entries, stored under their metadata keys.
    final PDDocumentInformation docInfo = document.getDocumentInformation();
    descriptor.putInformation(PdfMetadataKey.TITLE.getKey(), docInfo.getTitle());
    descriptor.putInformation(PdfMetadataKey.AUTHOR.getKey(), docInfo.getAuthor());
    descriptor.putInformation(PdfMetadataKey.CREATOR.getKey(), docInfo.getCreator());
    descriptor.putInformation(PdfMetadataKey.SUBJECT.getKey(), docInfo.getSubject());
    descriptor.putInformation(PdfMetadataKey.KEYWORDS.getKey(), docInfo.getKeywords());
    descriptor.putInformation("Producer", docInfo.getProducer());

    // The formatted creation date is only stored when a creation date exists.
    Optional.ofNullable(docInfo.getCreationDate())
            .map(FORMATTER::format)
            .ifPresent(formattedDate -> descriptor.putInformation("FormattedCreationDate", formattedDate));
}
From source file:org.terrier.indexing.PDFDocument.java
License:Mozilla Public License
/** * Returns the reader of text, which is suitable for parsing terms out of, * and which is created by converting the file represented by * parameter docStream. This method involves running the stream * through the PDFParser etc provided in the org.pdfbox library. * On error, it returns null, and sets EOD to true, so no terms * can be read from this document.//w ww . j a v a 2s .c o m * @param is the input stream that represents the document's file. * @return Reader a reader that is fed to an indexer. */ protected Reader getReader(InputStream is) { if ((Files.length(filename) / 1048576) > 300) { logger.info("Skipping document " + filename + " because it's size exceeds 300Mb"); return new StringReader(""); } PDDocument pdfDocument = null; Reader rtr = null; try { pdfDocument = PDDocument.load(is); if (pdfDocument.isEncrypted()) { //Just try using the default password and move on pdfDocument.decrypt(""); } //create a writer where to append the text content. StringWriter writer = new StringWriter(); PDFTextStripper stripper = new PDFTextStripper(); stripper.writeText(pdfDocument, writer); String contents = writer.getBuffer().toString(); int spaceCount = StringUtils.countMatches(contents, " "); for (char badChar : new char[] { '\u00A0', '\u2029', '#' }) { final int count = StringUtils.countMatches(contents, "" + badChar); if (count > spaceCount / 2) { contents = contents.replace(badChar, ' '); spaceCount += count; } } rtr = new StringReader(contents); PDDocumentInformation info = pdfDocument.getDocumentInformation(); if (info != null && USE_PDF_TITLE) { setProperty("title", info.getTitle()); } else { setProperty("title", new java.io.File(super.filename).getName()); } } catch (CryptographyException e) { throw new RuntimeException("Error decrypting PDF document: " + e); } catch (InvalidPasswordException e) { //they didn't suppply a password and the default of "" was wrong. 
throw new RuntimeException("Error: The PDF document is encrypted and will not be indexed."); } catch (Exception e) { throw new RuntimeException("Error extracting PDF document", e); } finally { if (pdfDocument != null) { try { pdfDocument.close(); } catch (IOException ioe) { } } } return rtr; }
From source file:org.wandora.application.tools.extractors.files.SimpleDocumentExtractor.java
License:Open Source License
public void _extractTopicsFromStream(String locator, InputStream inputStream, TopicMap topicMap, Topic topic) { try {/*from ww w . j ava 2 s. c om*/ String name = locator; if (name.indexOf("/") != -1) { name = name.substring(name.lastIndexOf("/") + 1); } else if (name.indexOf("\\") != -1) { name = name.substring(name.lastIndexOf("\\") + 1); } String lowerCaseLocator = locator.toLowerCase(); // --- HANDLE PDF ENRICHMENT TEXT --- if (lowerCaseLocator.endsWith("pdf")) { PDDocument doc = PDDocument.load(locator); PDDocumentInformation info = doc.getDocumentInformation(); DateFormat dateFormatter = new SimpleDateFormat(DEFAULT_DATE_FORMAT); // --- PDF PRODUCER --- String producer = info.getProducer(); if (producer != null && producer.length() > 0) { Topic producerType = createTopic(topicMap, "pdf-producer"); setData(topic, producerType, defaultLang, producer.trim()); } // --- PDF MODIFICATION DATE --- Calendar mCal = info.getModificationDate(); if (mCal != null) { String mdate = dateFormatter.format(mCal.getTime()); if (mdate != null && mdate.length() > 0) { Topic modificationDateType = createTopic(topicMap, "pdf-modification-date"); setData(topic, modificationDateType, defaultLang, mdate.trim()); } } // --- PDF CREATOR --- String creator = info.getCreator(); if (creator != null && creator.length() > 0) { Topic creatorType = createTopic(topicMap, "pdf-creator"); setData(topic, creatorType, defaultLang, creator.trim()); } // --- PDF CREATION DATE --- Calendar cCal = info.getCreationDate(); if (cCal != null) { String cdate = dateFormatter.format(cCal.getTime()); if (cdate != null && cdate.length() > 0) { Topic creationDateType = createTopic(topicMap, "pdf-creation-date"); setData(topic, creationDateType, defaultLang, cdate.trim()); } } // --- PDF AUTHOR --- String author = info.getAuthor(); if (author != null && author.length() > 0) { Topic authorType = createTopic(topicMap, "pdf-author"); setData(topic, authorType, defaultLang, author.trim()); } // --- PDF SUBJECT --- 
String subject = info.getSubject(); if (subject != null && subject.length() > 0) { Topic subjectType = createTopic(topicMap, "pdf-subject"); setData(topic, subjectType, defaultLang, subject.trim()); } // --- PDF TITLE --- String title = info.getSubject(); if (title != null && title.length() > 0) { Topic titleType = createTopic(topicMap, "pdf-title"); setData(topic, titleType, defaultLang, title.trim()); } // --- PDF KEYWORDS (SEPARATED WITH SEMICOLON) --- String keywords = info.getKeywords(); if (keywords != null && keywords.length() > 0) { Topic keywordType = createTopic(topicMap, "pdf-keyword"); String[] keywordArray = keywords.split(";"); String keyword = null; for (int i = 0; i < keywordArray.length; i++) { keyword = Textbox.trimExtraSpaces(keywordArray[i]); if (keyword != null && keyword.length() > 0) { Topic keywordTopic = createTopic(topicMap, keyword, keywordType); createAssociation(topicMap, keywordType, new Topic[] { topic, keywordTopic }); } } } // --- PDF TEXT CONTENT --- PDFTextStripper stripper = new PDFTextStripper(); String content = stripper.getText(doc); doc.close(); setTextEnrichment(topic, topicMap, content, name); } // --- HANDLE RTF DOCUMENTS --- else if (lowerCaseLocator.endsWith("rtf")) { String content = Textbox.RTF2PlainText(inputStream); setTextEnrichment(topic, topicMap, content, name); } // --- HANDLE OFFICE DOCUMENTS --- else if (lowerCaseLocator.endsWith("doc") || lowerCaseLocator.endsWith("docx") || lowerCaseLocator.endsWith("ppt") || lowerCaseLocator.endsWith("xls") || lowerCaseLocator.endsWith("vsd")) { String content = MSOfficeBox.getText(inputStream); if (content != null) { setTextEnrichment(topic, topicMap, content, name); } } else if (lowerCaseLocator.endsWith("odt") || lowerCaseLocator.endsWith("odp") || lowerCaseLocator.endsWith("odg") || lowerCaseLocator.endsWith("ods")) { org.odftoolkit.simple.Document oodocument = org.odftoolkit.simple.Document .loadDocument(inputStream); String content = OpenOfficeBox.getText(oodocument); 
setTextEnrichment(topic, topicMap, content, name); org.odftoolkit.simple.meta.Meta meta = oodocument.getOfficeMetadata(); // --- OO KEYWORDS --- List<String> keywords = meta.getKeywords(); if (keywords != null && !keywords.isEmpty()) { Topic keywordType = createTopic(topicMap, "oo-keyword"); for (String keyword : keywords) { keyword = keyword.trim(); if (keyword != null && keyword.length() > 0) { Topic keywordTopic = createTopic(topicMap, keyword, keywordType); createAssociation(topicMap, keywordType, new Topic[] { topic, keywordTopic }); } } } // --- OO TITLE --- String title = meta.getTitle(); if (title != null && title.length() > 0) { Topic titleType = createTopic(topicMap, "oo-title"); setData(topic, titleType, defaultLang, title.trim()); } // --- OO SUBJECT --- String subject = meta.getSubject(); if (subject != null && subject.length() > 0) { Topic subjectType = createTopic(topicMap, "oo-subject"); setData(topic, subjectType, defaultLang, subject.trim()); } // --- OO CREATOR --- String author = meta.getCreator(); if (author != null && author.length() > 0) { Topic authorType = createTopic(topicMap, "oo-author"); setData(topic, authorType, defaultLang, author.trim()); } // --- OO CREATION DATE --- Calendar cCal = meta.getCreationDate(); if (cCal != null) { DateFormat dateFormatter = new SimpleDateFormat(DEFAULT_DATE_FORMAT); String cdate = dateFormatter.format(cCal.getTime()); if (cdate != null && cdate.length() > 0) { Topic creationDateType = createTopic(topicMap, "oo-creation-date"); setData(topic, creationDateType, defaultLang, cdate.trim()); } } // --- OO DESCRIPTION --- String description = meta.getDescription(); if (description != null && description.length() > 0) { Topic descriptionType = createTopic(topicMap, "oo-description"); setData(topic, descriptionType, defaultLang, description.trim()); } // --- OO GENERATOR --- String generator = meta.getGenerator(); if (generator != null && generator.length() > 0) { Topic generatorType = createTopic(topicMap, 
"oo-generator"); setData(topic, generatorType, defaultLang, generator.trim()); } } else if (lowerCaseLocator.endsWith("html") || lowerCaseLocator.endsWith("htm")) { String content = IObox.loadFile(new InputStreamReader(inputStream)); setTextEnrichment(topic, topicMap, content, name); } else if (lowerCaseLocator.endsWith("txt") || lowerCaseLocator.endsWith("text")) { String content = IObox.loadFile(new InputStreamReader(inputStream)); setTextEnrichment(topic, topicMap, content, name); } // --- HANDLE ANY OTHER DOCUMENTS --- else { byte[] content = IObox.loadBFile(inputStream); String mimeType = ""; MimeUtil.registerMimeDetector("eu.medsea.mimeutil.detector.MagicMimeMimeDetector"); Collection<MimeType> mimeTypes = new ArrayList(); if (locator != null) { if (MimeTypes.getMimeType(locator) != null) { mimeTypes.add(new MimeType(MimeTypes.getMimeType(locator))); } mimeTypes.addAll(MimeUtil.getMimeTypes(locator)); } mimeTypes.addAll(MimeUtil.getMimeTypes(content)); boolean isText = false; for (MimeType mime : mimeTypes) { if (MimeUtil.isTextMimeType(mime)) { isText = true; break; } } if (isText) { setTextEnrichment(topic, topicMap, new String(content), name); } else { if (!mimeTypes.isEmpty()) { MimeType mime = mimeTypes.iterator().next(); mimeType = mime.toString(); } setBinaryEnrichment(topic, topicMap, content, mimeType); } } } catch (Exception e) { log(e); } }
From source file:org.wandora.application.tools.extractors.files.SimplePDFExtractor.java
License:Open Source License
public void _extractTopicsFromStream(String locator, InputStream inputStream, TopicMap topicMap, Topic pdfTopic) {/*from ww w .j a v a 2 s .c o m*/ PDDocument doc = null; try { if (locator.startsWith("http://")) { doc = PDDocument.load(new URL(locator)); } else { doc = PDDocument.load(new File(locator)); } PDDocumentInformation info = doc.getDocumentInformation(); DateFormat dateFormatter = new SimpleDateFormat(DEFAULT_DATE_FORMAT); // --- PDF PRODUCER --- String producer = info.getProducer(); if (producer != null && producer.length() > 0) { Topic producerType = createTopic(topicMap, "pdf-producer"); setData(pdfTopic, producerType, defaultLang, producer.trim()); } // --- PDF MODIFICATION DATE --- Calendar mCal = info.getModificationDate(); if (mCal != null) { String mdate = dateFormatter.format(mCal.getTime()); if (mdate != null && mdate.length() > 0) { Topic modificationDateType = createTopic(topicMap, "pdf-modification-date"); setData(pdfTopic, modificationDateType, defaultLang, mdate.trim()); } } // --- PDF CREATOR --- String creator = info.getCreator(); if (creator != null && creator.length() > 0) { Topic creatorType = createTopic(topicMap, "pdf-creator"); setData(pdfTopic, creatorType, defaultLang, creator.trim()); } // --- PDF CREATION DATE --- Calendar cCal = info.getCreationDate(); if (cCal != null) { String cdate = dateFormatter.format(cCal.getTime()); if (cdate != null && cdate.length() > 0) { Topic creationDateType = createTopic(topicMap, "pdf-creation-date"); setData(pdfTopic, creationDateType, defaultLang, cdate.trim()); } } // --- PDF AUTHOR --- String author = info.getAuthor(); if (author != null && author.length() > 0) { Topic authorType = createTopic(topicMap, "pdf-author"); setData(pdfTopic, authorType, defaultLang, author.trim()); } // --- PDF SUBJECT --- String subject = info.getSubject(); if (subject != null && subject.length() > 0) { Topic subjectType = createTopic(topicMap, "pdf-subject"); setData(pdfTopic, subjectType, defaultLang, 
subject.trim()); } // --- PDF TITLE --- String title = info.getSubject(); if (title != null && title.length() > 0) { if (makeVariantFromTitle) { pdfTopic.setDisplayName(defaultLang, title); } else { Topic titleType = createTopic(topicMap, "pdf-title"); setData(pdfTopic, titleType, defaultLang, title.trim()); } } // --- PDF KEYWORDS (SEPARATED WITH SEMICOLON) --- String keywords = info.getKeywords(); if (keywords != null && keywords.length() > 0) { Topic keywordType = createTopic(topicMap, "pdf-keyword"); String[] keywordArray = keywords.split(";"); String keyword = null; for (int i = 0; i < keywordArray.length; i++) { keyword = Textbox.trimExtraSpaces(keywordArray[i]); if (keyword != null && keyword.length() > 0) { Topic keywordTopic = createTopic(topicMap, keyword, keywordType); createAssociation(topicMap, keywordType, new Topic[] { pdfTopic, keywordTopic }); } } } // --- PDF TEXT CONTENT --- PDFTextStripper stripper = new PDFTextStripper(); String content = new String(); if (makePageTopics) { int pages = doc.getNumberOfPages(); String pageContent = null; for (int i = 0; i < pages; i++) { stripper.setStartPage(i); stripper.setEndPage(i); pageContent = stripper.getText(doc); Topic pageType = createTopic(topicMap, "pdf-page"); Topic pageTopic = createTopic(topicMap, pdfTopic.getBaseName() + " (page " + i + ")", pageType); Topic orderType = createTopic(topicMap, "order"); Topic orderTopic = createTopic(topicMap, i + ".", orderType); Topic contentType = createTopic(topicMap, "pdf-text"); setData(pageTopic, contentType, defaultLang, pageContent.trim()); createAssociation(topicMap, pageType, new Topic[] { pdfTopic, pageTopic, orderTopic }); } } else { content = stripper.getText(doc); } if (!makePageTopics && content != null && content.length() > 0) { Topic contentType = createTopic(topicMap, "pdf-text"); setData(pdfTopic, contentType, defaultLang, content.trim()); } doc.close(); } catch (Exception e) { e.printStackTrace(); try { if (doc != null) doc.close(); } catch 
(Exception ix) { e.printStackTrace(); } } }
From source file:org.wandora.application.tools.extractors.fng.ExtractFNGTextEnrichment.java
License:Open Source License
public void _extractTopicsFromStream(String locator, InputStream inputStream, TopicMap topicMap, Topic textTopic) {//from w w w . j av a 2 s.c om try { String lowerCaseLocator = locator.toLowerCase(); // --- HANDLE PDF ENRICHMENT TEXT --- if (lowerCaseLocator.endsWith("pdf")) { PDDocument doc = PDDocument.load(new URL(locator)); PDDocumentInformation info = doc.getDocumentInformation(); // --- PDF SUBJECT --- String subject = info.getSubject(); if (subject != null && subject.length() > 0) { Topic subjectType = createTopic(topicMap, "subject"); setData(textTopic, subjectType, defaultLang, subject.trim()); } // --- PDF TITLE --- String title = info.getTitle(); if (title != null && title.length() > 0) { Topic titleType = createTopic(topicMap, "title"); setData(textTopic, titleType, defaultLang, title.trim()); } // --- PDF KEYWORDS --- String keywords = info.getKeywords(); if (keywords != null && keywords.length() > 0) { Topic keywordType = createTopic(topicMap, "keywords"); setData(textTopic, keywordType, defaultLang, keywords.trim()); } // --- PDF TEXT CONTENT --- PDFTextStripper stripper = new PDFTextStripper(); String content = stripper.getText(doc); setTextEnrichment(textTopic, topicMap, content); doc.close(); } // --- HANDLE RTF DOCUMENTS --- else if (lowerCaseLocator.endsWith("rtf")) { String content = Textbox.RTF2PlainText(inputStream); setTextEnrichment(textTopic, topicMap, content); } // --- HANDLE OFFICE DOCUMENTS --- else if (lowerCaseLocator.endsWith("doc") || lowerCaseLocator.endsWith("docx") || lowerCaseLocator.endsWith("ppt") || lowerCaseLocator.endsWith("xsl") || lowerCaseLocator.endsWith("vsd")) { String content = MSOfficeBox.getText(inputStream); if (content != null) { setTextEnrichment(textTopic, topicMap, content); } } // --- HANDLE TXT DOCUMENTS --- else { String content = IObox.loadFile(new InputStreamReader(inputStream)); setTextEnrichment(textTopic, topicMap, content); } } catch (Exception e) { log(e); } }
From source file:org.wandora.piccolo.utils.crawler.handlers.PDFHandler.java
License:Open Source License
public void handle(CrawlerAccess crawler, InputStream in, int depth, URL page) { try {//from ww w.ja v a 2 s . c om Document d = new Document(); PDDocument doc = PDDocument.load(page); PDDocumentInformation info = doc.getDocumentInformation(); PDFTextStripper stripper = new PDFTextStripper(); String content = stripper.getText(doc); doc.close(); d.add(LuceneCrawler.subject(info.getSubject())); d.add(LuceneCrawler.title(info.getTitle())); d.add(LuceneCrawler.keywords(info.getKeywords())); d.add(LuceneCrawler.content(content)); d.add(LuceneCrawler.location(page.toString())); crawler.addObject(d); } catch (IOException e) { e.printStackTrace(); } catch (Exception e) { e.printStackTrace(); } }
From source file:org.wso2.carbon.apimgt.impl.reportgen.ReportGenerator.java
License:Open Source License
/** * Generate PDF file for API microgateway request summary * * @param table object containing table headers and row data * @return InputStream pdf as a stream/*from w w w . j a v a 2s . c om*/ * @throws IOException * @throws COSVisitorException */ public InputStream generateMGRequestSummeryPDF(TableData table) throws IOException, COSVisitorException { String[] columnHeaders = table.getColumnHeaders(); PDDocument document = new PDDocument(); PDPage page = new PDPage(); page.setMediaBox(PDPage.PAGE_SIZE_A4); page.setRotation(0); document.addPage(page); PDPageContentStream contentStream = new PDPageContentStream(document, page, false, false); // add logo InputStream in = APIManagerComponent.class.getResourceAsStream("/report/wso2-logo.jpg"); PDJpeg img = new PDJpeg(document, in); contentStream.drawImage(img, 375, 755); // Add topic contentStream.setFont(PDType1Font.HELVETICA_BOLD, 16); writeContent(contentStream, CELL_MARGIN, 770, "API Microgateway request summary"); // Add generated time contentStream.setFont(PDType1Font.HELVETICA_BOLD, FONT_SIZE); writeContent(contentStream, CELL_MARGIN, 730, "Report generated on: " + new Date().toString()); contentStream.setFont(TEXT_FONT, FONT_SIZE); // add table with data drowTableGrid(contentStream, table.getRows().size()); writeRowsContent(contentStream, columnHeaders, table.getRows()); // Add meta data // Whenever the summary report structure is updated this should be changed String requestCount = table.getRows().get(0).getEntries().get(2); document.getDocumentInformation().setCustomMetadataValue(MGW_META, getMetaCount(requestCount)); contentStream.close(); ByteArrayOutputStream out = new ByteArrayOutputStream(); document.save(out); document.close(); return new ByteArrayInputStream(out.toByteArray()); }
From source file:org.xstudiosys.pdfxmp.AddMetadataFromDocInfo.java
License:Apache License
/** * This will print the documents data./*from w ww.j a va2s . c om*/ * * @param args The command line arguments. * * @throws Exception If there is an error parsing the document. */ public static void main(String[] args) throws Exception { if (args.length != 2) { usage(); } else { PDDocument document = null; try { document = PDDocument.load(args[0]); if (document.isEncrypted()) { System.err.println("Error: Cannot add metadata to encrypted document."); System.exit(1); } PDDocumentCatalog catalog = document.getDocumentCatalog(); PDDocumentInformation info = document.getDocumentInformation(); XMPMetadata metadata = new XMPMetadata(); XMPSchemaPDF pdfSchema = metadata.addPDFSchema(); pdfSchema.setKeywords(info.getKeywords()); pdfSchema.setProducer(info.getProducer()); XMPSchemaBasic basicSchema = metadata.addBasicSchema(); basicSchema.setModifyDate(info.getModificationDate()); basicSchema.setCreateDate(info.getCreationDate()); basicSchema.setCreatorTool(info.getCreator()); basicSchema.setMetadataDate(new GregorianCalendar()); XMPSchemaDublinCore dcSchema = metadata.addDublinCoreSchema(); dcSchema.setTitle(info.getTitle()); dcSchema.addCreator("PDFBox"); dcSchema.setDescription(info.getSubject()); PDMetadata metadataStream = new PDMetadata(document); metadataStream.importXMPMetadata(metadata); catalog.setMetadata(metadataStream); document.save(args[1]); } finally { if (document != null) { document.close(); } } } }