List of usage examples for org.apache.pdfbox.pdmodel.PDDocumentInformation.getTitle()
public String getTitle()
From source file:org.nuxeo.pdf.PDFInfo.java
License:Open Source License
/** * After building the object with the correct constructor, and after * possibly having set some parsing property (<code>setParseWithXMP()</code> * for example), this method will extract the information from the PDF. * <p>/*ww w .j a v a 2 s .c o m*/ * After extraction, caller get the info: Either all of them ( * <code>toHashMap()</code> or <code>toString()</code>) or individual info * (see all getters) * * @throws ClientException * * @since 5.9.5 */ public void run() throws ClientException { // In case the caller calls several time the run() method if (!alreadyParsed) { fileName = pdfBlob.getFilename(); // Getting the file size os ok only if the blob is already backed by // a // File. If it is pure Stream, we give up File pdfFile = BlobHelper.getFileFromBlob(pdfBlob); if (pdfFile == null) { fileSize = -1; } else { fileSize = pdfFile.length(); } try { pdfDoc = PDDocument.load(pdfBlob.getStream()); isEncrypted = pdfDoc.isEncrypted(); if (isEncrypted) { pdfDoc.openProtection(new StandardDecryptionMaterial(password)); } numberOfPages = pdfDoc.getNumberOfPages(); PDDocumentCatalog docCatalog = pdfDoc.getDocumentCatalog(); pageLayout = checkNotNull(docCatalog.getPageLayout()); pdfVersion = "" + pdfDoc.getDocument().getVersion(); PDDocumentInformation docInfo = pdfDoc.getDocumentInformation(); author = checkNotNull(docInfo.getAuthor()); contentCreator = checkNotNull(docInfo.getCreator()); keywords = checkNotNull(docInfo.getKeywords()); creationDate = docInfo.getCreationDate(); modificationDate = docInfo.getModificationDate(); producer = checkNotNull(docInfo.getProducer()); subject = checkNotNull(docInfo.getSubject()); title = checkNotNull(docInfo.getTitle()); // Getting dimension is a bit tricky mediaBoxWidthInPoints = -1; mediaBoxHeightInPoints = -1; cropBoxWidthInPoints = -1; cropBoxHeightInPoints = -1; List<PDPage> allPages = docCatalog.getAllPages(); boolean gotMediaBox = false; boolean gotCropBox = false; for (PDPage page : allPages) { if (page != null) { 
PDRectangle r = page.findMediaBox(); if (r != null) { mediaBoxWidthInPoints = r.getWidth(); mediaBoxHeightInPoints = r.getHeight(); gotMediaBox = true; } r = page.findCropBox(); if (r != null) { cropBoxWidthInPoints = r.getWidth(); cropBoxHeightInPoints = r.getHeight(); gotCropBox = true; } } if (gotMediaBox && gotCropBox) { break; } } if (doXMP) { xmp = null; PDMetadata metadata = docCatalog.getMetadata(); if (metadata != null) { xmp = ""; InputStream xmlInputStream = metadata.createInputStream(); InputStreamReader isr = new InputStreamReader(xmlInputStream); BufferedReader reader = new BufferedReader(isr); String line; do { line = reader.readLine(); if (line != null) { xmp += line + "\n"; } } while (line != null); reader.close(); } } } catch (IOException | BadSecurityHandlerException | CryptographyException e) { throw new ClientException(/* * "Cannot get PDF info: " + * e.getMessage(), */e); } finally { if (pdfDoc != null) { try { pdfDoc.close(); } catch (IOException e) { // Ignore } pdfDoc = null; } alreadyParsed = true; } } }
From source file:org.nuxeo.pdf.test.PDFPageExtractorTest.java
License:Open Source License
@Test public void testExtractPages_WithSetInfo() throws Exception { Blob extracted;// w w w. j ava2s .c om String originalName = pdfFileBlob.getFilename().replace(".pdf", ""); PDFPageExtractor pe = new PDFPageExtractor(pdfFileBlob); extracted = pe.extract(5, 9, null, "One Upon a Time", "Fairyland", "Cool Author"); assertTrue(extracted instanceof FileBlob); assertEquals(originalName + "-5-9.pdf", extracted.getFilename()); PDDocument doc = PDDocument.load(extracted.getStream()); utils.track(doc); PDDocumentInformation docInfo = doc.getDocumentInformation(); assertEquals("One Upon a Time", docInfo.getTitle()); assertEquals("Fairyland", docInfo.getSubject()); assertEquals("Cool Author", docInfo.getAuthor()); doc.close(); utils.untrack(doc); }
From source file:org.nuxeo.pdf.test.PDFUtilsTest.java
License:Open Source License
@Test public void test_setInfos() throws Exception { PDDocument doc = PDDocument.load(pdfFile); utils.track(doc);/* w ww. j av a2 s . c o m*/ PDDocumentInformation docInfoOriginal = doc.getDocumentInformation(); // Check original document has the expected values assertEquals("Untitled 3", docInfoOriginal.getTitle()); assertNull(docInfoOriginal.getSubject()); assertNull(docInfoOriginal.getAuthor()); // Now, modify // First, actually, don't modify PDFUtils.setInfos(doc, null, "", null); PDDocumentInformation newDocInfo = doc.getDocumentInformation(); assertEquals(docInfoOriginal.getTitle(), newDocInfo.getTitle()); assertEquals(docInfoOriginal.getSubject(), newDocInfo.getSubject()); assertEquals(docInfoOriginal.getAuthor(), newDocInfo.getAuthor()); // Now, modify PDFUtils.setInfos(doc, "The Title", "The Subject", "The Author"); newDocInfo = doc.getDocumentInformation(); assertEquals("The Title", newDocInfo.getTitle()); assertEquals("The Subject", newDocInfo.getSubject()); assertEquals("The Author", newDocInfo.getAuthor()); doc.close(); utils.untrack(doc); }
From source file:org.paxle.parser.pdf.impl.PdfParser.java
License:Open Source License
/** * A function to extract metadata from the PDF-document. *//*from ww w. j a v a 2s .co m*/ protected void extractMetaData(IParserDocument parserDoc, PDDocument pddDoc) throws IOException { // extract metadata final PDDocumentInformation metadata = pddDoc.getDocumentInformation(); if (metadata == null) return; // document title final String title = metadata.getTitle(); if (title != null && title.length() > 0) parserDoc.setTitle(title); // document author(s) final String author = metadata.getAuthor(); if (author != null && author.length() > 0) parserDoc.setAuthor(author); ; // subject final String summary = metadata.getSubject(); if (summary != null && summary.length() > 0) parserDoc.setSummary(summary); // keywords final String keywords = metadata.getKeywords(); if (keywords != null && keywords.length() > 0) { String[] keywordArray = keywords.split("[,;\\s]"); if (keywordArray != null && keywordArray.length > 0) { parserDoc.setKeywords(Arrays.asList(keywordArray)); } } // last modification date final Calendar lastMod = metadata.getModificationDate(); if (lastMod != null) { parserDoc.setLastChanged(lastMod.getTime()); } }
From source file:org.pdfsam.pdf.DefaultPDFBoxLoader.java
License:Open Source License
/**
 * Fills the descriptor from the loaded document: page count, version, the
 * document information entries (title, author, creator, subject, keywords,
 * producer) and, when a creation date is available, its formatted value.
 *
 * @param document the loaded PDF document
 * @param descriptor the descriptor to populate
 */
public void accept(PDDocument document, PdfDocumentDescriptor descriptor) {
    descriptor.pages(document.getNumberOfPages());

    String rawVersion = Float.toString(document.getVersion());
    descriptor.setVersion(getVersion(rawVersion));

    PDDocumentInformation info = document.getDocumentInformation();

    String titleKey = PdfMetadataKey.TITLE.getKey();
    descriptor.putInformation(titleKey, info.getTitle());

    String authorKey = PdfMetadataKey.AUTHOR.getKey();
    descriptor.putInformation(authorKey, info.getAuthor());

    String creatorKey = PdfMetadataKey.CREATOR.getKey();
    descriptor.putInformation(creatorKey, info.getCreator());

    String subjectKey = PdfMetadataKey.SUBJECT.getKey();
    descriptor.putInformation(subjectKey, info.getSubject());

    String keywordsKey = PdfMetadataKey.KEYWORDS.getKey();
    descriptor.putInformation(keywordsKey, info.getKeywords());

    descriptor.putInformation("Producer", info.getProducer());

    // The formatted creation date is stored only when a date is present
    Optional.ofNullable(info.getCreationDate())
            .map(FORMATTER::format)
            .ifPresent(formatted -> descriptor.putInformation("FormattedCreationDate", formatted));
}
From source file:org.terrier.indexing.PDFDocument.java
License:Mozilla Public License
/** * Returns the reader of text, which is suitable for parsing terms out of, * and which is created by converting the file represented by * parameter docStream. This method involves running the stream * through the PDFParser etc provided in the org.pdfbox library. * On error, it returns null, and sets EOD to true, so no terms * can be read from this document.//from ww w . j a v a2s . c o m * @param is the input stream that represents the document's file. * @return Reader a reader that is fed to an indexer. */ protected Reader getReader(InputStream is) { if ((Files.length(filename) / 1048576) > 300) { logger.info("Skipping document " + filename + " because it's size exceeds 300Mb"); return new StringReader(""); } PDDocument pdfDocument = null; Reader rtr = null; try { pdfDocument = PDDocument.load(is); if (pdfDocument.isEncrypted()) { //Just try using the default password and move on pdfDocument.decrypt(""); } //create a writer where to append the text content. StringWriter writer = new StringWriter(); PDFTextStripper stripper = new PDFTextStripper(); stripper.writeText(pdfDocument, writer); String contents = writer.getBuffer().toString(); int spaceCount = StringUtils.countMatches(contents, " "); for (char badChar : new char[] { '\u00A0', '\u2029', '#' }) { final int count = StringUtils.countMatches(contents, "" + badChar); if (count > spaceCount / 2) { contents = contents.replace(badChar, ' '); spaceCount += count; } } rtr = new StringReader(contents); PDDocumentInformation info = pdfDocument.getDocumentInformation(); if (info != null && USE_PDF_TITLE) { setProperty("title", info.getTitle()); } else { setProperty("title", new java.io.File(super.filename).getName()); } } catch (CryptographyException e) { throw new RuntimeException("Error decrypting PDF document: " + e); } catch (InvalidPasswordException e) { //they didn't suppply a password and the default of "" was wrong. 
throw new RuntimeException("Error: The PDF document is encrypted and will not be indexed."); } catch (Exception e) { throw new RuntimeException("Error extracting PDF document", e); } finally { if (pdfDocument != null) { try { pdfDocument.close(); } catch (IOException ioe) { } } } return rtr; }
From source file:org.wandora.application.tools.extractors.fng.ExtractFNGTextEnrichment.java
License:Open Source License
public void _extractTopicsFromStream(String locator, InputStream inputStream, TopicMap topicMap, Topic textTopic) {//from w w w . j ava 2 s .c o m try { String lowerCaseLocator = locator.toLowerCase(); // --- HANDLE PDF ENRICHMENT TEXT --- if (lowerCaseLocator.endsWith("pdf")) { PDDocument doc = PDDocument.load(new URL(locator)); PDDocumentInformation info = doc.getDocumentInformation(); // --- PDF SUBJECT --- String subject = info.getSubject(); if (subject != null && subject.length() > 0) { Topic subjectType = createTopic(topicMap, "subject"); setData(textTopic, subjectType, defaultLang, subject.trim()); } // --- PDF TITLE --- String title = info.getTitle(); if (title != null && title.length() > 0) { Topic titleType = createTopic(topicMap, "title"); setData(textTopic, titleType, defaultLang, title.trim()); } // --- PDF KEYWORDS --- String keywords = info.getKeywords(); if (keywords != null && keywords.length() > 0) { Topic keywordType = createTopic(topicMap, "keywords"); setData(textTopic, keywordType, defaultLang, keywords.trim()); } // --- PDF TEXT CONTENT --- PDFTextStripper stripper = new PDFTextStripper(); String content = stripper.getText(doc); setTextEnrichment(textTopic, topicMap, content); doc.close(); } // --- HANDLE RTF DOCUMENTS --- else if (lowerCaseLocator.endsWith("rtf")) { String content = Textbox.RTF2PlainText(inputStream); setTextEnrichment(textTopic, topicMap, content); } // --- HANDLE OFFICE DOCUMENTS --- else if (lowerCaseLocator.endsWith("doc") || lowerCaseLocator.endsWith("docx") || lowerCaseLocator.endsWith("ppt") || lowerCaseLocator.endsWith("xsl") || lowerCaseLocator.endsWith("vsd")) { String content = MSOfficeBox.getText(inputStream); if (content != null) { setTextEnrichment(textTopic, topicMap, content); } } // --- HANDLE TXT DOCUMENTS --- else { String content = IObox.loadFile(new InputStreamReader(inputStream)); setTextEnrichment(textTopic, topicMap, content); } } catch (Exception e) { log(e); } }
From source file:org.wandora.piccolo.utils.crawler.handlers.PDFHandler.java
License:Open Source License
/**
 * Loads the PDF found at the given URL, builds an index document from its
 * subject, title, keywords, text content and location, and hands that
 * document to the crawler.
 * <p>
 * The PDF is loaded from <code>page</code>; <code>in</code> and
 * <code>depth</code> are not used by this handler. Failures are printed to
 * the standard error stream and not propagated.
 *
 * @param crawler the crawler receiving the index document
 * @param in the content stream (unused here)
 * @param depth the crawling depth (unused here)
 * @param page the URL of the PDF document
 */
public void handle(CrawlerAccess crawler, InputStream in, int depth, URL page) {
    try {
        Document d = new Document();

        final String subject;
        final String title;
        final String keywords;
        final String content;

        PDDocument doc = PDDocument.load(page);
        // try/finally: the PDF must be closed even if text extraction
        // fails. Metadata is read while the document is still open.
        try {
            PDDocumentInformation info = doc.getDocumentInformation();
            PDFTextStripper stripper = new PDFTextStripper();
            content = stripper.getText(doc);
            subject = info.getSubject();
            title = info.getTitle();
            keywords = info.getKeywords();
        } finally {
            doc.close();
        }

        d.add(LuceneCrawler.subject(subject));
        d.add(LuceneCrawler.title(title));
        d.add(LuceneCrawler.keywords(keywords));
        d.add(LuceneCrawler.content(content));
        d.add(LuceneCrawler.location(page.toString()));
        crawler.addObject(d);
    } catch (Exception e) {
        // Covers IOException as well: both were handled identically
        e.printStackTrace();
    }
}
From source file:org.xstudiosys.pdfxmp.AddMetadataFromDocInfo.java
License:Apache License
/** * This will print the documents data./*from w w w .ja va 2 s . c o m*/ * * @param args The command line arguments. * * @throws Exception If there is an error parsing the document. */ public static void main(String[] args) throws Exception { if (args.length != 2) { usage(); } else { PDDocument document = null; try { document = PDDocument.load(args[0]); if (document.isEncrypted()) { System.err.println("Error: Cannot add metadata to encrypted document."); System.exit(1); } PDDocumentCatalog catalog = document.getDocumentCatalog(); PDDocumentInformation info = document.getDocumentInformation(); XMPMetadata metadata = new XMPMetadata(); XMPSchemaPDF pdfSchema = metadata.addPDFSchema(); pdfSchema.setKeywords(info.getKeywords()); pdfSchema.setProducer(info.getProducer()); XMPSchemaBasic basicSchema = metadata.addBasicSchema(); basicSchema.setModifyDate(info.getModificationDate()); basicSchema.setCreateDate(info.getCreationDate()); basicSchema.setCreatorTool(info.getCreator()); basicSchema.setMetadataDate(new GregorianCalendar()); XMPSchemaDublinCore dcSchema = metadata.addDublinCoreSchema(); dcSchema.setTitle(info.getTitle()); dcSchema.addCreator("PDFBox"); dcSchema.setDescription(info.getSubject()); PDMetadata metadataStream = new PDMetadata(document); metadataStream.importXMPMetadata(metadata); catalog.setMetadata(metadataStream); document.save(args[1]); } finally { if (document != null) { document.close(); } } } }
From source file:org.xstudiosys.pdfxmp.MarkBuilder.java
License:Open Source License
/**
 * Builds XMP metadata from the document information of the given document
 * and sets it as the metadata of the document catalog. Any exception is
 * printed to the standard error stream and not propagated.
 *
 * @param document the document to which the XMP metadata is added
 */
public void onComplete(PDDocument document) {
    try {
        PDDocumentCatalog docCatalog = document.getDocumentCatalog();
        PDDocumentInformation docInfo = document.getDocumentInformation();

        XMPMetadata xmp = new XMPMetadata();

        // PDF schema: keywords and producer
        XMPSchemaPDF pdfPart = xmp.addPDFSchema();
        pdfPart.setKeywords(docInfo.getKeywords());
        pdfPart.setProducer(docInfo.getProducer());

        // Basic schema: dates and creator tool
        XMPSchemaBasic basicPart = xmp.addBasicSchema();
        basicPart.setModifyDate(docInfo.getModificationDate());
        basicPart.setCreateDate(docInfo.getCreationDate());
        basicPart.setCreatorTool(docInfo.getCreator());
        basicPart.setMetadataDate(new GregorianCalendar());

        // Dublin Core schema: title, creator and description
        XMPSchemaDublinCore dublinCorePart = xmp.addDublinCoreSchema();
        dublinCorePart.setTitle(docInfo.getTitle());
        dublinCorePart.addCreator("PDFBox");
        dublinCorePart.setDescription(docInfo.getSubject());

        PDMetadata xmpStream = new PDMetadata(document);
        xmpStream.importXMPMetadata(xmp);
        docCatalog.setMetadata(xmpStream);
    } catch (Exception e) {
        e.printStackTrace();
    }
}