List of usage examples for the org.apache.commons.io.IOExceptionWithCause constructor IOExceptionWithCause(String, Throwable)
public IOExceptionWithCause(String message, Throwable cause)
From source file:org.apache.tika.parser.pdf.AbstractPDF2XHTML.java
void doOCROnCurrentPage() throws IOException, TikaException, SAXException { if (config.getOcrStrategy().equals(NO_OCR)) { return;// w w w .j av a 2 s. co m } TesseractOCRConfig tesseractConfig = context.get(TesseractOCRConfig.class, DEFAULT_TESSERACT_CONFIG); TesseractOCRParser tesseractOCRParser = new TesseractOCRParser(); if (!tesseractOCRParser.hasTesseract(tesseractConfig)) { throw new TikaException("Tesseract is not available. " + "Please set the OCR_STRATEGY to NO_OCR or configure Tesseract correctly"); } PDFRenderer renderer = new PDFRenderer(pdDocument); TemporaryResources tmp = new TemporaryResources(); try { BufferedImage image = renderer.renderImage(pageIndex, 2.0f, config.getOcrImageType()); Path tmpFile = tmp.createTempFile(); try (OutputStream os = Files.newOutputStream(tmpFile)) { //TODO: get output format from TesseractConfig ImageIOUtil.writeImage(image, config.getOcrImageFormatName(), os, config.getOcrDPI(), config.getOcrImageQuality()); } try (InputStream is = TikaInputStream.get(tmpFile)) { tesseractOCRParser.parseInline(is, xhtml, tesseractConfig); } } catch (IOException e) { handleCatchableIOE(e); } catch (SAXException e) { throw new IOExceptionWithCause("error writing OCR content from PDF", e); } finally { tmp.dispose(); } }
From source file:org.apache.tika.parser.pdf.AbstractPDF2XHTML.java
@Override protected void endPage(PDPage page) throws IOException { try {//w ww . ja va 2 s . co m for (PDAnnotation annotation : page.getAnnotations()) { if (annotation instanceof PDAnnotationFileAttachment) { PDAnnotationFileAttachment fann = (PDAnnotationFileAttachment) annotation; PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) fann.getFile(); try { AttributesImpl attributes = new AttributesImpl(); attributes.addAttribute("", "source", "source", "CDATA", "annotation"); extractMultiOSPDEmbeddedFiles(fann.getAttachmentName(), fileSpec, attributes); } catch (SAXException e) { throw new IOExceptionWithCause("file embedded in annotation sax exception", e); } catch (TikaException e) { throw new IOExceptionWithCause("file embedded in annotation tika exception", e); } catch (IOException e) { handleCatchableIOE(e); } } else if (annotation instanceof PDAnnotationWidget) { handleWidget((PDAnnotationWidget) annotation); } // TODO: remove once PDFBOX-1143 is fixed: if (config.getExtractAnnotationText()) { PDActionURI uri = getActionURI(annotation); if (uri != null) { String link = uri.getURI(); if (link != null && link.trim().length() > 0) { xhtml.startElement("div", "class", "annotation"); xhtml.startElement("a", "href", link); xhtml.characters(link); xhtml.endElement("a"); xhtml.endElement("div"); } } if (annotation instanceof PDAnnotationMarkup) { PDAnnotationMarkup annotationMarkup = (PDAnnotationMarkup) annotation; String title = annotationMarkup.getTitlePopup(); String subject = annotationMarkup.getSubject(); String contents = annotationMarkup.getContents(); // TODO: maybe also annotationMarkup.getRichContents()? 
if (title != null || subject != null || contents != null) { xhtml.startElement("div", "class", "annotation"); if (title != null) { xhtml.startElement("div", "class", "annotationTitle"); xhtml.characters(title); xhtml.endElement("div"); } if (subject != null) { xhtml.startElement("div", "class", "annotationSubject"); xhtml.characters(subject); xhtml.endElement("div"); } if (contents != null) { xhtml.startElement("div", "class", "annotationContents"); xhtml.characters(contents); xhtml.endElement("div"); } xhtml.endElement("div"); } } } } if (config.getOcrStrategy().equals(PDFParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION)) { doOCROnCurrentPage(); } PDPageAdditionalActions pageActions = page.getActions(); if (pageActions != null) { handleDestinationOrAction(pageActions.getC(), ActionTrigger.PAGE_CLOSE); handleDestinationOrAction(pageActions.getO(), ActionTrigger.PAGE_OPEN); } xhtml.endElement("div"); } catch (SAXException | TikaException e) { throw new IOExceptionWithCause("Unable to end a page", e); } catch (IOException e) { exceptions.add(e); } finally { pageIndex++; } }
From source file:org.apache.tika.parser.pdf.AbstractPDF2XHTML.java
@Override protected void startDocument(PDDocument pdf) throws IOException { try {//from w ww .jav a 2 s. c o m xhtml.startDocument(); try { handleDestinationOrAction(pdf.getDocumentCatalog().getOpenAction(), ActionTrigger.DOCUMENT_OPEN); } catch (IOException e) { //See PDFBOX-3773 //swallow -- no need to report this } } catch (TikaException | SAXException e) { throw new IOExceptionWithCause("Unable to start a document", e); } }
From source file:org.apache.tika.parser.pdf.AbstractPDF2XHTML.java
@Override protected void endDocument(PDDocument pdf) throws IOException { try {/*from www. java 2s.c o m*/ // Extract text for any bookmarks: if (config.getExtractBookmarksText()) { extractBookmarkText(); } try { extractEmbeddedDocuments(pdf); } catch (IOException e) { handleCatchableIOE(e); } //extract acroform data at end of doc if (config.getExtractAcroFormContent() == true) { try { extractAcroForm(pdf); } catch (IOException e) { handleCatchableIOE(e); } } PDDocumentCatalogAdditionalActions additionalActions = pdf.getDocumentCatalog().getActions(); handleDestinationOrAction(additionalActions.getDP(), ActionTrigger.AFTER_DOCUMENT_PRINT); handleDestinationOrAction(additionalActions.getDS(), ActionTrigger.AFTER_DOCUMENT_SAVE); handleDestinationOrAction(additionalActions.getWC(), ActionTrigger.BEFORE_DOCUMENT_CLOSE); handleDestinationOrAction(additionalActions.getWP(), ActionTrigger.BEFORE_DOCUMENT_PRINT); handleDestinationOrAction(additionalActions.getWS(), ActionTrigger.BEFORE_DOCUMENT_SAVE); xhtml.endDocument(); } catch (TikaException e) { throw new IOExceptionWithCause("Unable to end a document", e); } catch (SAXException e) { throw new IOExceptionWithCause("Unable to end a document", e); } }
From source file:org.apache.tika.parser.pdf.PDF2XHTML.java
/**
 * Signals the start of the document to {@code handler}.
 *
 * @param pdf the document whose processing is starting (not read by this method)
 * @throws IOException wrapping a {@link SAXException} raised by {@code handler.startDocument()}
 */
@Override
protected void startDocument(PDDocument pdf) throws IOException {
    try {
        handler.startDocument();
    } catch (SAXException saxFailure) {
        throw new IOExceptionWithCause("Unable to start a document", saxFailure);
    }
}
From source file:org.apache.tika.parser.pdf.PDF2XHTML.java
@Override protected void endDocument(PDDocument pdf) throws IOException { try {/*ww w. j ava 2 s . c o m*/ // Extract text for any bookmarks: extractBookmarkText(); extractEmbeddedDocuments(pdf, originalHandler); //extract acroform data at end of doc if (config.getExtractAcroFormContent() == true) { extractAcroForm(pdf, handler); } handler.endDocument(); } catch (TikaException e) { throw new IOExceptionWithCause("Unable to end a document", e); } catch (SAXException e) { throw new IOExceptionWithCause("Unable to end a document", e); } }
From source file:org.apache.tika.parser.pdf.PDF2XHTML.java
/**
 * Opens a {@code div} element with {@code class="page"} on {@code handler}, then calls
 * {@code writeParagraphStart()}.
 *
 * @param page the page whose processing is starting (not read by this method)
 * @throws IOException wrapping a {@link SAXException} raised while opening the element, or
 *                     as thrown by {@code writeParagraphStart()}
 */
@Override
protected void startPage(PDPage page) throws IOException {
    try {
        handler.startElement("div", "class", "page");
    } catch (SAXException saxFailure) {
        throw new IOExceptionWithCause("Unable to start a page", saxFailure);
    }

    writeParagraphStart();
}
From source file:org.apache.tika.parser.pdf.PDF2XHTML.java
@Override protected void endPage(PDPage page) throws IOException { try {/* w w w. j ava 2s.com*/ writeParagraphEnd(); extractImages(page.getResources(), new HashSet<COSBase>()); EmbeddedDocumentExtractor extractor = getEmbeddedDocumentExtractor(); for (PDAnnotation annotation : page.getAnnotations()) { if (annotation instanceof PDAnnotationFileAttachment) { PDAnnotationFileAttachment fann = (PDAnnotationFileAttachment) annotation; PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) fann.getFile(); try { extractMultiOSPDEmbeddedFiles("", fileSpec, extractor); } catch (SAXException e) { throw new IOExceptionWithCause("file embedded in annotation sax exception", e); } catch (TikaException e) { throw new IOExceptionWithCause("file embedded in annotation tika exception", e); } } // TODO: remove once PDFBOX-1143 is fixed: if (config.getExtractAnnotationText()) { if (annotation instanceof PDAnnotationLink) { PDAnnotationLink annotationlink = (PDAnnotationLink) annotation; if (annotationlink.getAction() != null) { PDAction action = annotationlink.getAction(); if (action instanceof PDActionURI) { PDActionURI uri = (PDActionURI) action; String link = uri.getURI(); if (link != null) { handler.startElement("div", "class", "annotation"); handler.startElement("a", "href", link); handler.endElement("a"); handler.endElement("div"); } } } } if (annotation instanceof PDAnnotationMarkup) { PDAnnotationMarkup annotationMarkup = (PDAnnotationMarkup) annotation; String title = annotationMarkup.getTitlePopup(); String subject = annotationMarkup.getSubject(); String contents = annotationMarkup.getContents(); // TODO: maybe also annotationMarkup.getRichContents()? 
if (title != null || subject != null || contents != null) { handler.startElement("div", "class", "annotation"); if (title != null) { handler.startElement("div", "class", "annotationTitle"); handler.characters(title); handler.endElement("div"); } if (subject != null) { handler.startElement("div", "class", "annotationSubject"); handler.characters(subject); handler.endElement("div"); } if (contents != null) { handler.startElement("div", "class", "annotationContents"); handler.characters(contents); handler.endElement("div"); } handler.endElement("div"); } } } } handler.endElement("div"); } catch (SAXException e) { throw new IOExceptionWithCause("Unable to end a page", e); } }
From source file:org.apache.tika.parser.pdf.PDF2XHTML.java
@Override protected void writeParagraphStart() throws IOException { super.writeParagraphStart(); try {//from ww w . j a v a 2 s.c o m handler.startElement("p"); } catch (SAXException e) { throw new IOExceptionWithCause("Unable to start a paragraph", e); } }
From source file:org.apache.tika.parser.pdf.PDF2XHTML.java
@Override protected void writeParagraphEnd() throws IOException { super.writeParagraphEnd(); try {// w w w. j a v a2 s .c o m handler.endElement("p"); } catch (SAXException e) { throw new IOExceptionWithCause("Unable to end a paragraph", e); } }