List of usage examples for org.apache.pdfbox.pdmodel PDDocument close
@Override public void close() throws IOException
From source file:org.wandora.application.tools.extractors.files.SimplePDFExtractor.java
License:Open Source License
public void _extractTopicsFromStream(String locator, InputStream inputStream, TopicMap topicMap, Topic pdfTopic) {/* w ww . j a va 2 s .c o m*/ PDDocument doc = null; try { if (locator.startsWith("http://")) { doc = PDDocument.load(new URL(locator)); } else { doc = PDDocument.load(new File(locator)); } PDDocumentInformation info = doc.getDocumentInformation(); DateFormat dateFormatter = new SimpleDateFormat(DEFAULT_DATE_FORMAT); // --- PDF PRODUCER --- String producer = info.getProducer(); if (producer != null && producer.length() > 0) { Topic producerType = createTopic(topicMap, "pdf-producer"); setData(pdfTopic, producerType, defaultLang, producer.trim()); } // --- PDF MODIFICATION DATE --- Calendar mCal = info.getModificationDate(); if (mCal != null) { String mdate = dateFormatter.format(mCal.getTime()); if (mdate != null && mdate.length() > 0) { Topic modificationDateType = createTopic(topicMap, "pdf-modification-date"); setData(pdfTopic, modificationDateType, defaultLang, mdate.trim()); } } // --- PDF CREATOR --- String creator = info.getCreator(); if (creator != null && creator.length() > 0) { Topic creatorType = createTopic(topicMap, "pdf-creator"); setData(pdfTopic, creatorType, defaultLang, creator.trim()); } // --- PDF CREATION DATE --- Calendar cCal = info.getCreationDate(); if (cCal != null) { String cdate = dateFormatter.format(cCal.getTime()); if (cdate != null && cdate.length() > 0) { Topic creationDateType = createTopic(topicMap, "pdf-creation-date"); setData(pdfTopic, creationDateType, defaultLang, cdate.trim()); } } // --- PDF AUTHOR --- String author = info.getAuthor(); if (author != null && author.length() > 0) { Topic authorType = createTopic(topicMap, "pdf-author"); setData(pdfTopic, authorType, defaultLang, author.trim()); } // --- PDF SUBJECT --- String subject = info.getSubject(); if (subject != null && subject.length() > 0) { Topic subjectType = createTopic(topicMap, "pdf-subject"); setData(pdfTopic, subjectType, defaultLang, 
subject.trim()); } // --- PDF TITLE --- String title = info.getSubject(); if (title != null && title.length() > 0) { if (makeVariantFromTitle) { pdfTopic.setDisplayName(defaultLang, title); } else { Topic titleType = createTopic(topicMap, "pdf-title"); setData(pdfTopic, titleType, defaultLang, title.trim()); } } // --- PDF KEYWORDS (SEPARATED WITH SEMICOLON) --- String keywords = info.getKeywords(); if (keywords != null && keywords.length() > 0) { Topic keywordType = createTopic(topicMap, "pdf-keyword"); String[] keywordArray = keywords.split(";"); String keyword = null; for (int i = 0; i < keywordArray.length; i++) { keyword = Textbox.trimExtraSpaces(keywordArray[i]); if (keyword != null && keyword.length() > 0) { Topic keywordTopic = createTopic(topicMap, keyword, keywordType); createAssociation(topicMap, keywordType, new Topic[] { pdfTopic, keywordTopic }); } } } // --- PDF TEXT CONTENT --- PDFTextStripper stripper = new PDFTextStripper(); String content = new String(); if (makePageTopics) { int pages = doc.getNumberOfPages(); String pageContent = null; for (int i = 0; i < pages; i++) { stripper.setStartPage(i); stripper.setEndPage(i); pageContent = stripper.getText(doc); Topic pageType = createTopic(topicMap, "pdf-page"); Topic pageTopic = createTopic(topicMap, pdfTopic.getBaseName() + " (page " + i + ")", pageType); Topic orderType = createTopic(topicMap, "order"); Topic orderTopic = createTopic(topicMap, i + ".", orderType); Topic contentType = createTopic(topicMap, "pdf-text"); setData(pageTopic, contentType, defaultLang, pageContent.trim()); createAssociation(topicMap, pageType, new Topic[] { pdfTopic, pageTopic, orderTopic }); } } else { content = stripper.getText(doc); } if (!makePageTopics && content != null && content.length() > 0) { Topic contentType = createTopic(topicMap, "pdf-text"); setData(pdfTopic, contentType, defaultLang, content.trim()); } doc.close(); } catch (Exception e) { e.printStackTrace(); try { if (doc != null) doc.close(); } catch 
(Exception ix) { e.printStackTrace(); } } }
From source file:org.wandora.application.tools.extractors.fng.ExtractFNGTextEnrichment.java
License:Open Source License
public void _extractTopicsFromStream(String locator, InputStream inputStream, TopicMap topicMap, Topic textTopic) {/*from w w w . j a v a 2s . c om*/ try { String lowerCaseLocator = locator.toLowerCase(); // --- HANDLE PDF ENRICHMENT TEXT --- if (lowerCaseLocator.endsWith("pdf")) { PDDocument doc = PDDocument.load(new URL(locator)); PDDocumentInformation info = doc.getDocumentInformation(); // --- PDF SUBJECT --- String subject = info.getSubject(); if (subject != null && subject.length() > 0) { Topic subjectType = createTopic(topicMap, "subject"); setData(textTopic, subjectType, defaultLang, subject.trim()); } // --- PDF TITLE --- String title = info.getTitle(); if (title != null && title.length() > 0) { Topic titleType = createTopic(topicMap, "title"); setData(textTopic, titleType, defaultLang, title.trim()); } // --- PDF KEYWORDS --- String keywords = info.getKeywords(); if (keywords != null && keywords.length() > 0) { Topic keywordType = createTopic(topicMap, "keywords"); setData(textTopic, keywordType, defaultLang, keywords.trim()); } // --- PDF TEXT CONTENT --- PDFTextStripper stripper = new PDFTextStripper(); String content = stripper.getText(doc); setTextEnrichment(textTopic, topicMap, content); doc.close(); } // --- HANDLE RTF DOCUMENTS --- else if (lowerCaseLocator.endsWith("rtf")) { String content = Textbox.RTF2PlainText(inputStream); setTextEnrichment(textTopic, topicMap, content); } // --- HANDLE OFFICE DOCUMENTS --- else if (lowerCaseLocator.endsWith("doc") || lowerCaseLocator.endsWith("docx") || lowerCaseLocator.endsWith("ppt") || lowerCaseLocator.endsWith("xsl") || lowerCaseLocator.endsWith("vsd")) { String content = MSOfficeBox.getText(inputStream); if (content != null) { setTextEnrichment(textTopic, topicMap, content); } } // --- HANDLE TXT DOCUMENTS --- else { String content = IObox.loadFile(new InputStreamReader(inputStream)); setTextEnrichment(textTopic, topicMap, content); } } catch (Exception e) { log(e); } }
From source file:org.wandora.piccolo.utils.crawler.handlers.PDFHandler.java
License:Open Source License
/**
 * Loads the PDF at {@code page}, builds a {@code Document} carrying its
 * subject, title, keywords, text content and location, and hands it to
 * {@code crawler.addObject}.
 *
 * <p>The PDF is loaded from the {@code page} URL; {@code in} and
 * {@code depth} are not used by this method. Any exception is caught and its
 * stack trace printed.
 *
 * @param crawler receiver of the built document
 * @param in      unused here
 * @param depth   unused here
 * @param page    URL of the PDF to index
 */
public void handle(CrawlerAccess crawler, InputStream in, int depth, URL page) {
    try {
        Document d = new Document();

        PDDocument doc = PDDocument.load(page);
        PDDocumentInformation info;
        String content;
        try {
            info = doc.getDocumentInformation();
            PDFTextStripper stripper = new PDFTextStripper();
            content = stripper.getText(doc);
        } finally {
            // Close even when text extraction fails so the document is not leaked.
            doc.close();
        }

        d.add(LuceneCrawler.subject(info.getSubject()));
        d.add(LuceneCrawler.title(info.getTitle()));
        d.add(LuceneCrawler.keywords(info.getKeywords()));
        d.add(LuceneCrawler.content(content));
        d.add(LuceneCrawler.location(page.toString()));
        crawler.addObject(d);
    } catch (Exception e) {
        // IOException was previously caught separately but handled identically.
        e.printStackTrace();
    }
}
From source file:org.wandora.utils.PDFbox.java
License:Open Source License
public static String extractTextOutOfPDF(String url) { PDDocument doc = null; try {// www .ja va 2 s .com if (url.startsWith("file:")) { doc = PDDocument.load(new File(url)); } else { doc = PDDocument.load(new URL(url)); } PDFTextStripper stripper = new PDFTextStripper(); String content = stripper.getText(doc); doc.close(); return content; } catch (Exception e) { e.printStackTrace(); } return null; }
From source file:org.wangwei.pdf.AddImageToPDF.java
License:Apache License
/** * Add an image to an existing PDF document. * * @param inputFile The input PDF to add the image to. * @param image The filename of the image to put in the PDF. * @param outputFile The file to write to the pdf to. * @throws IOException If there is an error writing the data. * @throws COSVisitorException If there is an error writing the PDF. *//*from w w w . ja va 2 s . c o m*/ public void createPDFFromImage(String inputFile, String image, String outputFile) throws IOException, COSVisitorException { // the document PDDocument doc = null; try { doc = PDDocument.load(inputFile); // we will add the image to the first page. PDPage page = (PDPage) doc.getDocumentCatalog().getAllPages().get(0); PDXObjectImage ximage = null; if (image.toLowerCase().endsWith(".jpg")) { ximage = new PDJpeg(doc, new FileInputStream(image)); } else if (image.toLowerCase().endsWith(".tif") || image.toLowerCase().endsWith(".tiff")) { ximage = new PDCcitt(doc, new RandomAccessFile(new File(image), "r")); } else { BufferedImage awtImage = ImageIO.read(new File(image)); ximage = new PDPixelMap(doc, awtImage); } PDPageContentStream contentStream = new PDPageContentStream(doc, page, true, true); // contentStream.drawImage(ximage, 20, 20 ); // better method inspired by http://stackoverflow.com/a/22318681/535646 float scale = 1f; // reduce this value if the image is too large contentStream.drawXObject(ximage, 20, 20, ximage.getWidth() * scale, ximage.getHeight() * scale); contentStream.close(); doc.save(outputFile); } finally { if (doc != null) { doc.close(); } } }
From source file:org.wso2.carbon.apimgt.impl.reportgen.ReportGenerator.java
License:Open Source License
/** * Generate PDF file for API microgateway request summary * * @param table object containing table headers and row data * @return InputStream pdf as a stream/*from w w w . ja v a 2s . c om*/ * @throws IOException * @throws COSVisitorException */ public InputStream generateMGRequestSummeryPDF(TableData table) throws IOException, COSVisitorException { String[] columnHeaders = table.getColumnHeaders(); PDDocument document = new PDDocument(); PDPage page = new PDPage(); page.setMediaBox(PDPage.PAGE_SIZE_A4); page.setRotation(0); document.addPage(page); PDPageContentStream contentStream = new PDPageContentStream(document, page, false, false); // add logo InputStream in = APIManagerComponent.class.getResourceAsStream("/report/wso2-logo.jpg"); PDJpeg img = new PDJpeg(document, in); contentStream.drawImage(img, 375, 755); // Add topic contentStream.setFont(PDType1Font.HELVETICA_BOLD, 16); writeContent(contentStream, CELL_MARGIN, 770, "API Microgateway request summary"); // Add generated time contentStream.setFont(PDType1Font.HELVETICA_BOLD, FONT_SIZE); writeContent(contentStream, CELL_MARGIN, 730, "Report generated on: " + new Date().toString()); contentStream.setFont(TEXT_FONT, FONT_SIZE); // add table with data drowTableGrid(contentStream, table.getRows().size()); writeRowsContent(contentStream, columnHeaders, table.getRows()); // Add meta data // Whenever the summary report structure is updated this should be changed String requestCount = table.getRows().get(0).getEntries().get(2); document.getDocumentInformation().setCustomMetadataValue(MGW_META, getMetaCount(requestCount)); contentStream.close(); ByteArrayOutputStream out = new ByteArrayOutputStream(); document.save(out); document.close(); return new ByteArrayInputStream(out.toByteArray()); }
From source file:org.xcmis.renditions.impl.PDFDocumentRenditionProvider.java
License:Open Source License
/** * {@inheritDoc}/*from w w w .ja v a2 s. com*/ */ public RenditionContentStream getRenditionStream(ContentStream stream) throws IOException { PDDocument pdf = null; try { pdf = PDDocument.load(stream.getStream()); PDPage page = (PDPage) pdf.getDocumentCatalog().getAllPages().get(0); BufferedImage image = page.convertToImage(); // Determine scale and be sure both width and height are not greater the max int scale = (int) Math.max(Math.floor((image.getHeight() / maxHeight) + 1.0d), Math.floor((image.getWidth() / maxWidth) + 1.0d)); int height = image.getHeight() / scale; int width = image.getWidth() / scale; BufferedImage scaledImage = new BufferedImage(width, height, BufferedImage.TYPE_INT_RGB); Graphics2D graphics2D = scaledImage.createGraphics(); graphics2D.setRenderingHint(RenderingHints.KEY_INTERPOLATION, RenderingHints.VALUE_INTERPOLATION_BILINEAR); graphics2D.drawImage(image, 0, 0, width, height, null); graphics2D.dispose(); ByteArrayOutputStream out = new ByteArrayOutputStream(); ImageIO.write(scaledImage, "png", out); RenditionContentStream renditionStream = new RenditionContentStream(out.toByteArray(), null, new MimeType("image", " png"), getKind(), height, width); return renditionStream; } finally { if (pdf != null) { pdf.close(); } } }
From source file:org.xstudiosys.pdfxmp.AddMetadataFromDocInfo.java
License:Apache License
/**
 * Copies the document-information entries of a PDF into a new XMP metadata
 * stream, attaches that stream to the document catalog and saves the result.
 *
 * <p>Expects exactly two arguments: the input PDF ({@code args[0]}) and the
 * output file ({@code args[1]}). With any other argument count
 * {@code usage()} is called instead. Encrypted documents are rejected with
 * an error message and exit status 1.
 *
 * @param args The command line arguments.
 *
 * @throws Exception If there is an error parsing the document.
 */
public static void main(String[] args) throws Exception {
    if (args.length != 2) {
        usage();
        return;
    }

    PDDocument document = PDDocument.load(args[0]);
    try {
        if (document.isEncrypted()) {
            System.err.println("Error: Cannot add metadata to encrypted document.");
            System.exit(1);
        }
        PDDocumentCatalog catalog = document.getDocumentCatalog();
        PDDocumentInformation docInfo = document.getDocumentInformation();

        XMPMetadata xmp = new XMPMetadata();

        // PDF schema: keywords and producer.
        XMPSchemaPDF pdf = xmp.addPDFSchema();
        pdf.setKeywords(docInfo.getKeywords());
        pdf.setProducer(docInfo.getProducer());

        // Basic schema: dates and creator tool; the metadata date is "now".
        XMPSchemaBasic basic = xmp.addBasicSchema();
        basic.setModifyDate(docInfo.getModificationDate());
        basic.setCreateDate(docInfo.getCreationDate());
        basic.setCreatorTool(docInfo.getCreator());
        basic.setMetadataDate(new GregorianCalendar());

        // Dublin Core schema: title, creator and description.
        XMPSchemaDublinCore dublinCore = xmp.addDublinCoreSchema();
        dublinCore.setTitle(docInfo.getTitle());
        dublinCore.addCreator("PDFBox");
        dublinCore.setDescription(docInfo.getSubject());

        PDMetadata xmpStream = new PDMetadata(document);
        xmpStream.importXMPMetadata(xmp);
        catalog.setMetadata(xmpStream);

        document.save(args[1]);
    } finally {
        document.close();
    }
}
From source file:org.xstudiosys.pdfxmp.Main.java
License:Open Source License
/**
 * Parses the PDF read from {@code in}, re-sets its document information
 * dictionary and saves the document to {@code outputFile}.
 *
 * <p>The {@code xmp} argument is currently not applied: the code that would
 * copy its entries into the information dictionary is disabled (see the
 * commented-out block below).
 *
 * @param in         stream positioned at the start of the source PDF
 * @param outputFile path of the file to write
 * @param xmp        XMP packet bytes; unused while the copy step is disabled
 * @throws IOException         if the PDF cannot be parsed or written
 * @throws COSVisitorException if the document cannot be saved
 */
public static void writeInfoDictionary(FileInputStream in, String outputFile, byte[] xmp)
        throws IOException, COSVisitorException {

    PDFParser parser = new PDFParser(in);
    parser.parse();
    PDDocument document = parser.getPDDocument();
    try {
        PDDocumentInformation info = document.getDocumentInformation();
        /*
        for (Entry<String, String> entry : XmpUtils.toInfo(xmp).entrySet()) {
            info.setCustomMetadataValue(entry.getKey(), entry.getValue());
        }
        */
        document.setDocumentInformation(info);
        document.save(outputFile);
    } finally {
        // Close even when saving fails so the parsed document is not leaked.
        document.close();
    }
}
From source file:org.xstudiosys.pdfxmp.XMPUtil.java
License:Open Source License
/** * Try to read the given BibTexEntry from the XMP-stream of the given * inputstream containing a PDF-file.//w w w . ja va2 s . c o m * * @param inputStream * The inputstream to read from. * * @throws IOException * Throws an IOException if the file cannot be read, so the user * than remove a lock or cancel the operation. */ @SuppressWarnings("unchecked") public static List<BibtexEntry> readXMP(InputStream inputStream) throws IOException { List<BibtexEntry> result = new LinkedList<BibtexEntry>(); PDDocument document = null; try { document = PDDocument.load(inputStream); if (document.isEncrypted()) { throw new EncryptionNotSupportedException("Error: Cannot read metadata from encrypted document."); } XMPMetadata meta = getXMPMetadata(document); // If we did not find any XMP metadata, search for non XMP metadata if (meta != null) { List<XMPSchema> schemas = meta.getSchemasByNamespaceURI(XMPSchemaBibtex.NAMESPACE); for (XMPSchema schema : schemas) { XMPSchemaBibtex bib = (XMPSchemaBibtex) schema; result.add(bib.getBibtexEntry()); } // If we did not find anything have a look if a Dublin Core exists if (result.size() == 0) { schemas = meta.getSchemasByNamespaceURI(XMPSchemaDublinCore.NAMESPACE); for (XMPSchema schema : schemas) { XMPSchemaDublinCore dc = (XMPSchemaDublinCore) schema; BibtexEntry entry = getBibtexEntryFromDublinCore(dc); if (entry != null) result.add(entry); } } } if (result.size() == 0) { BibtexEntry entry = getBibtexEntryFromDocumentInformation(document.getDocumentInformation()); if (entry != null) result.add(entry); } } finally { if (document != null) document.close(); } // return null, if no metadata was found if (result.size() == 0) return null; return result; }