List of usage examples for org.apache.pdfbox.pdmodel.PDDocument#isEncrypted()
public boolean isEncrypted()
From source file:ddf.catalog.transformer.input.pdf.PdfInputTransformer.java
License:Open Source License
private Metacard transformPdf(String id, PDDocument pdfDocument, String contentInput) throws IOException { MetacardImpl metacard = initializeMetacard(id, contentInput); if (pdfDocument.isEncrypted()) { LOGGER.debug("Cannot transform encrypted pdf"); return metacard; }//from w w w . j a v a2 s . co m extractPdfMetadata(pdfDocument, metacard); pdfThumbnailGenerator.apply(pdfDocument).ifPresent(metacard::setThumbnail); Optional.ofNullable(geoParser.apply(pdfDocument)).ifPresent(metacard::setLocation); return metacard; }
From source file:de.catma.document.source.contenthandler.PDFContentHandler.java
License:Open Source License
public void load(InputStream is) throws IOException { PDDocument document = null; try {/*from ww w . j a va 2 s. c o m*/ document = PDDocument.load(is, false); if (document.isEncrypted()) { throw new IOException("can not open pdf document because it is encrypted"); } AccessPermission ap = document.getCurrentAccessPermission(); if (!ap.canExtractContent()) { throw new IOException("You do not have permission to extract text"); } PDFTextStripper stripper = new PDFTextStripper("UTF-8"); stripper.setForceParsing(false); stripper.setSortByPosition(false); stripper.setShouldSeparateByBeads(true); stripper.setStartPage(1); stripper.setEndPage(Integer.MAX_VALUE); ByteArrayOutputStream os = new ByteArrayOutputStream(); Writer w = new OutputStreamWriter(os); try { stripper.writeText(document, w); } finally { w.close(); } // some pdfs seem to include non valid unicode characters // and this causes problems when converting text to HTML // for GUI delivery and during indexing setContent(os.toString().replaceAll("[^\\x09\\x0A\\x0D\\x20-\\uD7FF\\uE000-\\uFFFD\\u10000-\\u10FFFF]", "?")); } finally { if (document != null) { document.close(); } } }
From source file:de.ilias.services.lucene.index.file.PDFBoxPDFHandler.java
License:Open Source License
/** * @throws IOException /*from w w w . j a v a 2s . c om*/ * @see de.ilias.services.lucene.index.file.FileHandler#getContent(java.io.InputStream) */ public String getContent(InputStream is) throws FileHandlerException { PDDocument pddo = null; PDFTextStripper stripper = null; String str = new String(""); try { pddo = PDDocument.load(is); if (pddo.isEncrypted()) { logger.warn("PDF Document is encrypted. Trying empty password..."); return ""; } stripper = new PDFTextStripper(); str = stripper.getText(pddo); } catch (NumberFormatException e) { logger.warn("Invalid PDF version number given. Aborting"); } catch (IOException e) { logger.warn(e.getMessage()); throw new FileHandlerException(e); } catch (Exception e) { logger.error(e.getMessage()); throw new FileHandlerException(e); } finally { try { if (pddo != null) pddo.close(); } catch (IOException e) { ; } } return str; }
From source file:de.tudarmstadt.ukp.dkpro.core.io.pdf.Pdf2CasConverter.java
License:Apache License
public void writeText(final CAS aCas, final InputStream aIs) throws IOException { final PDDocument doc = PDDocument.load(aIs); try {//from w w w . j a va 2 s . c o m if (doc.isEncrypted()) { throw new IOException("Encrypted documents currently not supported"); } cas = aCas; text = new StringBuilder(); writeText(doc); } finally { doc.close(); } }
From source file:editorframework.pdfbox.PDFBoxDocumentAdaptee.java
/**
 * Loads the PDF at the given path and, if it is encrypted, decrypts it with an
 * empty password.
 *
 * <p>If decryption fails the document is closed and the failure is reported to
 * the caller, rather than printing a stack trace and returning a document that
 * is still encrypted.
 *
 * @param filename path of the PDF file to load
 * @return the loaded (and, where necessary, decrypted) document; the caller must close it
 * @throws IOException if the file cannot be loaded or cannot be decrypted with an empty password
 */
private static PDDocument parseDocument(String filename) throws IOException {
    PDDocument document = PDDocument.load(filename);
    boolean usable = false;
    try {
        if (document.isEncrypted()) {
            document.decrypt("");
        }
        usable = true;
        return document;
    } catch (org.apache.pdfbox.exceptions.CryptographyException e) {
        throw new IOException("Could not decrypt PDF document with an empty password: " + filename, e);
    } finally {
        if (!usable) {
            // Do not leak the document when it cannot be handed to the caller.
            try {
                document.close();
            } catch (IOException ignored) {
                // The decryption failure is the error worth reporting.
            }
        }
    }
}
From source file:editorframework.pdfbox.testes.PDFReaderAdaptor.java
License:Apache License
/**
 * Loads a PDF from the given stream and, if it is encrypted, decrypts it with an
 * empty password.
 *
 * <p>If decryption fails the document is closed and the failure is reported to
 * the caller, rather than printing a stack trace and returning a document that
 * is still encrypted.
 *
 * @param input stream providing the PDF data
 * @return the loaded (and, where necessary, decrypted) document; the caller must close it
 * @throws IOException if the stream cannot be read or the document cannot be decrypted
 *                     with an empty password
 */
private static PDDocument parseDocument(InputStream input) throws IOException {
    PDDocument document = PDDocument.load(input);
    boolean usable = false;
    try {
        if (document.isEncrypted()) {
            document.decrypt("");
        }
        usable = true;
        return document;
    } catch (org.apache.pdfbox.exceptions.CryptographyException e) {
        throw new IOException("Could not decrypt PDF document with an empty password", e);
    } finally {
        if (!usable) {
            // Do not leak the document when it cannot be handed to the caller.
            try {
                document.close();
            } catch (IOException ignored) {
                // The decryption failure is the error worth reporting.
            }
        }
    }
}
From source file:edu.ur.ir.index.DefaultPdfTextExtractor.java
License:Apache License
/** * Extract text from the PDF document/* ww w . j a va 2 s . c o m*/ * @throws Exception * * @see edu.ur.ir.index.FileTextExtractor#getText(java.io.File) */ public String getText(File f) throws Exception { String text = null; // don't even try if the file is too large if (isFileTooLarge(f) || f.length() <= 0l) { return text; } PDDocument pdDoc = null; try { pdDoc = PDDocument.load(f); // don't do anything with decripted docs if (!pdDoc.isEncrypted()) { PDFTextStripper stripper = new PDFTextStripper(); String myText = stripper.getText(pdDoc); if (myText != null && !myText.trim().equals("")) { text = myText; } } else { log.error("pdf " + f.getAbsolutePath() + " is encrypted and " + " cannot be decrypted because we don't have a password"); } } catch (OutOfMemoryError oome) { text = null; log.error("could not extract text", oome); throw (oome); } catch (Exception e) { log.error("could not extract text with other error", e); text = null; throw (e); } finally { closePDDocument(pdDoc); pdDoc = null; } return text; }
From source file:edu.uwm.jiaoduan.lab.ExtractTextByArea.java
License:Apache License
/** * This will print the documents text in a certain area. * * @param args The command line arguments. * * @throws Exception If there is an error parsing the document. *///from w w w . j a v a 2 s. co m public static void main(String[] args) throws Exception { args = new String[] { "test.pdf" }; if (args.length != 1) { usage(); } else { PDDocument document = null; try { document = PDDocument.load(args[0]); if (document.isEncrypted()) { try { document.decrypt(""); } catch (InvalidPasswordException e) { System.err.println("Error: Document is encrypted with a password."); System.exit(1); } } PDFTextStripperByArea stripper = new PDFTextStripperByArea(); stripper.setSortByPosition(true); //Rectangle rect = new Rectangle( 99,219,80,15 ); //convert xfdf coordinate to rectangle Rectangle2D.Double rect = new Rectangle2D.Double(); List allPages = document.getDocumentCatalog().getAllPages(); PDPage firstPage = (PDPage) allPages.get(0); double pageHeight = firstPage.getMediaBox().getHeight(); //125.824906,672.39502,390.577109,694.679017 double[] coords = new double[] { 58.50615, 500.847504, 302.919073, 552.419312 }; //rect.height = 694.679017 - 672.39502; rect.height = coords[3] - coords[1]; //rect.width = 390.577109 - 125.824906; rect.width = coords[2] - coords[0]; ; //rect.x = 125.824906; rect.x = coords[0]; //rect.y = pageHeight -672.39502 - rect.height; rect.y = pageHeight - coords[1] - rect.height; System.out.println(rect); stripper.addRegion("class1", rect); stripper.extractRegions(firstPage); System.out.println("Text in the area:" + rect); System.out.println(stripper.getTextForRegion("class1")); } finally { if (document != null) { document.close(); } } } }
From source file:fr.acxio.tools.agia.file.pdf.AbstractPDDocumentFactory.java
License:Apache License
/**
 * Loads a PDF document from a stream and decrypts it when it is encrypted.
 *
 * <p>The password is read from {@code sParameters} under
 * {@code PDDocumentFactoryConstants.PARAM_PASSWORD}. If decryption fails, the
 * loaded document is closed before the exception propagates so it is not leaked.
 *
 * @param sInputStream stream providing the PDF data
 * @param sParameters  loading parameters; only consulted for the password when the
 *                     document is encrypted
 * @return the loaded (and, where necessary, decrypted) document; the caller must close it
 * @throws IOException              if the document cannot be read
 * @throws CryptographyException    if decryption fails
 * @throws InvalidPasswordException if the supplied password is rejected
 */
protected PDDocument loadDocument(InputStream sInputStream, Map<String, Object> sParameters)
        throws IOException, CryptographyException, InvalidPasswordException {
    PDDocument aDocument = PDDocument.load(sInputStream);
    boolean aLoaded = false;
    try {
        if (aDocument.isEncrypted()) {
            aDocument.decrypt((String) sParameters.get(PDDocumentFactoryConstants.PARAM_PASSWORD));
        }
        aLoaded = true;
        return aDocument;
    } finally {
        if (!aLoaded) {
            // Release the document when it cannot be handed to the caller.
            try {
                aDocument.close();
            } catch (IOException ignored) {
                // The original failure is the one worth propagating.
            }
        }
    }
}
From source file:fr.acxio.tools.agia.file.pdf.AbstractPDDocumentFactory.java
License:Apache License
protected PDDocument loadDocument(File sFile, Map<String, Object> sParameters) throws IOException, CryptographyException, InvalidPasswordException { PDDocument aDocument = null; if (Boolean.TRUE.equals(sParameters.get(PARAM_NONSEQ))) { aDocument = PDDocument.loadNonSeq(sFile, (RandomAccess) sParameters.get(PDDocumentFactoryConstants.PARAM_SCRATCHFILE), (String) sParameters.get(PDDocumentFactoryConstants.PARAM_PASSWORD)); } else {//from w w w . java2s. c om aDocument = PDDocument.load(sFile); if (aDocument.isEncrypted()) { aDocument.decrypt((String) sParameters.get(PDDocumentFactoryConstants.PARAM_PASSWORD)); } } return aDocument; }