List of usage examples for org.apache.pdfbox.pdmodel PDDocument getNumberOfPages
public int getNumberOfPages()
From source file:org.ghost4j.document.PDFDocument.java
License:LGPL
public int getPageCount() throws DocumentException { int pageCount = 0; if (content == null) { return pageCount; }// w w w . j a va 2s .co m ByteArrayInputStream bais = null; PDDocument document = null; try { bais = new ByteArrayInputStream(content); document = PDDocument.load(bais); pageCount = document.getNumberOfPages(); } catch (Exception e) { throw new DocumentException(e); } finally { if (document != null) try { document.close(); } catch (IOException e) { e.printStackTrace(); } IOUtils.closeQuietly(bais); } return pageCount; }
From source file:org.ghost4j.document.PDFDocument.java
License:LGPL
@Override public void append(Document document) throws DocumentException { super.append(document); ByteArrayOutputStream baos = null; PDDocument mergedDocument = new PDDocument(); try {//w ww .j av a 2 s . com baos = new ByteArrayOutputStream(); ByteArrayInputStream bais = new ByteArrayInputStream(content); PDDocument pDocument = PDDocument.load(bais); int pageCount = pDocument.getNumberOfPages(); for (int i = 0; i < pageCount; i++) { mergedDocument.addPage((PDPage) pDocument.getDocumentCatalog().getAllPages().get(i)); } // copy new document ByteArrayInputStream baisNewDoc = new ByteArrayInputStream(document.getContent()); PDDocument pNewDocument = PDDocument.load(baisNewDoc); pageCount = pNewDocument.getNumberOfPages(); for (int i = 0; i < pageCount; i++) { mergedDocument.addPage((PDPage) pNewDocument.getDocumentCatalog().getAllPages().get(i)); } mergedDocument.save(baos); mergedDocument.close(); // replace content with new content content = baos.toByteArray(); } catch (Exception e) { throw new DocumentException(e); } finally { IOUtils.closeQuietly(baos); } }
From source file:org.knoesis.matvocab.indexer.LucenePDFDocument.java
License:Apache License
/** * This will add the contents to the lucene document. * * @param document The document to add the contents to. * @param is The stream to get the contents from. * @param documentLocation The location of the document, used just for debug messages. * * @throws IOException If there is an error parsing the document. *//*from w w w .jav a 2s . c om*/ private void addContent(Document document, InputStream is, String documentLocation, PDFTextStripper stripper) throws IOException { PDDocument pdfDocument = null; try { pdfDocument = PDDocument.load(is); if (pdfDocument.isEncrypted()) { //Just try using the default password and move on pdfDocument.decrypt(""); } //create a writer where to append the text content. StringWriter writer = new StringWriter(); if (stripper == null) { stripper = new PDFTextStripper(); } else { stripper.resetEngine(); } stripper.writeText(pdfDocument, writer); // Note: the buffer to string operation is costless; // the char array value of the writer buffer and the content string // is shared as long as the buffer content is not modified, which will // not occur here. String contents = writer.getBuffer().toString(); // Add the tag-stripped contents as a Reader-valued Text field so it will // get tokenized and indexed. 
addField(document, "contents", contents); addField(document, "stemmedcontents", contents); PDDocumentInformation info = pdfDocument.getDocumentInformation(); if (info != null) { addField(document, "Author", info.getAuthor()); try { addField(document, "CreationDate", info.getCreationDate()); } catch (IOException io) { //ignore, bad date but continue with indexing } addField(document, "Creator", info.getCreator()); addField(document, "Keywords", info.getKeywords()); try { addField(document, "ModificationDate", info.getModificationDate()); } catch (IOException io) { //ignore, bad date but continue with indexing } addField(document, "Producer", info.getProducer()); addField(document, "Subject", info.getSubject()); addField(document, "Title", info.getTitle()); addField(document, "Trapped", info.getTrapped()); } int summarySize = Math.min(contents.length(), 500); String summary = contents.substring(0, summarySize); // Add the summary as an UnIndexed field, so that it is stored and returned // with hit documents for display. addField(document, "summary", summary); addField(document, "numpages", String.valueOf(pdfDocument.getNumberOfPages())); } catch (CryptographyException e) { throw new IOException("Error decrypting document(" + documentLocation + "): " + e); } catch (InvalidPasswordException e) { //they didn't suppply a password and the default of "" was wrong. throw new IOException( "Error: The document(" + documentLocation + ") is encrypted and will not be indexed."); } finally { if (pdfDocument != null) { pdfDocument.close(); } } }
From source file:org.mycore.media.MCRMediaPDFParser.java
License:Open Source License
/** * Parse file and store metadata in related Object. * /* www.jav a2 s . c om*/ * @return MCRMediaObject * can be held any MCRMediaObject * @see MCRMediaObject#clone() */ @SuppressWarnings("unchecked") public synchronized MCRMediaObject parse(File file) throws Exception { if (!file.exists()) throw new IOException("File \"" + file.getName() + "\" doesn't exists!"); MCRPDFObject media = new MCRPDFObject(); LOGGER.info("parse " + file.getName() + "..."); PDDocument pdf = PDDocument.load(file); try { media.fileName = file.getName(); media.fileSize = file.length(); media.folderName = (file.getAbsolutePath()).replace(file.getName(), ""); PDPageTree pages = pdf.getDocumentCatalog().getPages(); media.numPages = pdf.getNumberOfPages(); PDPage page = (PDPage) pages.get(0); PDRectangle rect = page.getMediaBox(); media.width = Math.round(rect.getWidth()); media.height = Math.round(rect.getHeight()); PDDocumentInformation info = pdf.getDocumentInformation(); if (info != null) { media.tags = new MCRMediaTagObject(); media.tags.author = info.getAuthor(); media.tags.creator = info.getCreator(); media.tags.producer = info.getProducer(); media.tags.title = info.getTitle(); media.tags.subject = info.getSubject(); media.tags.keywords = info.getKeywords(); } } catch (Exception e) { LOGGER.error(e.getMessage()); throw new Exception(e.getMessage()); } finally { pdf.close(); } return media; }
From source file:org.nines.NinesStatementHandlerTest.java
License:Apache License
@Test public void testPdfStrip() { try {/* w w w .j a v a2 s. c o m*/ FileInputStream is = new FileInputStream(new File("test_data/sample.pdf")); PDDocument pdfDoc = PDDocument.load(is); assertEquals(2, pdfDoc.getNumberOfPages()); PDFTextStripper pdfStrip = new PDFTextStripper(); String text = pdfStrip.getText(pdfDoc); assertNotNull(text); System.out.println(text); } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } }
From source file:org.nuxeo.pdf.test.PDFMergeTest.java
License:Open Source License
protected void checkMergedPDF(Blob inBlob, boolean jutsFirst2Pages) throws IOException { File tempFile = File.createTempFile("testmergepdf", ".pdf"); utils.track(tempFile);//w w w. ja va 2 s . c om inBlob.transferTo(tempFile); PDDocument doc = PDDocument.load(tempFile); assertNotNull(doc); utils.track(doc); // 2 + 3 + 1 if (jutsFirst2Pages) { assertEquals(5, doc.getNumberOfPages()); } else { assertEquals(6, doc.getNumberOfPages()); } String txt; txt = utils.extractText(doc, 1, 1); assertTrue(txt.indexOf(MERGEPDF_CHECK_PREFIX + "1") > -1); txt = utils.extractText(doc, 3, 3); assertTrue(txt.indexOf(MERGEPDF_CHECK_PREFIX + "2") > -1); if (!jutsFirst2Pages) { txt = utils.extractText(doc, 6, 6); assertTrue(txt.indexOf(MERGEPDF_CHECK_PREFIX + "3") > -1); } doc.close(); utils.untrack(doc); tempFile.delete(); utils.untrack(tempFile); }
From source file:org.nuxeo.pdf.test.PDFPageExtractorTest.java
License:Open Source License
/**
 * Sanity check on the test fixture: {@code pdfFile} must load and have 13 pages.
 *
 * @throws IOException if the PDF cannot be loaded or closed
 */
protected void checkPDFBeforeTest() throws IOException {
    final PDDocument fixturePdf = PDDocument.load(pdfFile);
    assertNotNull(fixturePdf);
    utils.track(fixturePdf);

    final int actualPageCount = fixturePdf.getNumberOfPages();
    assertEquals(13, actualPageCount);

    fixturePdf.close();
    utils.untrack(fixturePdf);
}
From source file:org.nuxeo.pdf.test.PDFPageExtractorTest.java
License:Open Source License
/**
 * Loads the extracted PDF from the blob's stream and checks its page count
 * and that the text of page 1 starts with the expected string.
 *
 * @param inBlob the extracted PDF to verify
 * @param inExpectedPageCount the number of pages the PDF must have
 * @param inExpectedTextAtPos0 the text expected at index 0 of page 1's text
 * @throws Exception if the PDF cannot be loaded or its text cannot be extracted
 */
protected void checkExtractedPdf(Blob inBlob, int inExpectedPageCount, String inExpectedTextAtPos0)
        throws Exception {
    final PDDocument extractedPdf = PDDocument.load(inBlob.getStream());
    utils.track(extractedPdf);

    final int actualPageCount = extractedPdf.getNumberOfPages();
    assertEquals(inExpectedPageCount, actualPageCount);

    final String firstPageText = utils.extractText(extractedPdf, 1, 1);
    final int position = firstPageText.indexOf(inExpectedTextAtPos0);
    assertEquals(0, position);

    extractedPdf.close();
    utils.untrack(extractedPdf);
}
From source file:org.nuxeo.pdf.test.PDFPageNumberingTest.java
License:Open Source License
/**
 * Sanity check on the test fixture: {@code pdfFile} must load, have 13 pages
 * and contain none of the digits 0 to 9 anywhere in its text.
 *
 * @throws IOException if the PDF cannot be loaded, stripped or closed
 */
protected void checkPDFBeforeTest() throws IOException {
    final PDDocument fixturePdf = PDDocument.load(pdfFile);
    assertNotNull(fixturePdf);
    utils.track(fixturePdf);

    assertEquals(13, fixturePdf.getNumberOfPages());

    final String fullText = new PDFTextStripper().getText(fixturePdf);
    // No digit may be present in the fixture's text.
    for (int digit = 0; digit <= 9; digit++) {
        final int position = fullText.indexOf(String.valueOf(digit));
        assertEquals(-1, position);
    }

    fixturePdf.close();
    utils.untrack(fixturePdf);
}
From source file:org.nuxeo.pdf.test.PDFTextExtractorTest.java
License:Open Source License
protected void checkPDFBeforeTest() throws IOException { PDDocument doc = PDDocument.load(pdfFile); assertNotNull(doc);// ww w. j a v a 2 s. co m utils.track(doc); assertEquals(6, doc.getNumberOfPages()); doc.close(); utils.untrack(doc); }