Example usage for the org.apache.commons.io.IOExceptionWithCause constructor IOExceptionWithCause(String, Throwable)

Introduction

On this page you can find example usages of the org.apache.commons.io.IOExceptionWithCause constructor IOExceptionWithCause(String message, Throwable cause).

Prototype

public IOExceptionWithCause(String message, Throwable cause)

Source Link

Document

Constructs a new instance with the given message and cause.

Usage

From source file:org.apache.tika.parser.pdf.AbstractPDF2XHTML.java

/**
 * Runs OCR over the page identified by {@code pageIndex} and streams the result into {@code xhtml}.
 *
 * <p>Returns immediately when the configured OCR strategy equals {@code NO_OCR}. Otherwise the page
 * is rendered to an image, the image is written to a temporary file using the configured format,
 * DPI and quality, and that file is handed to {@code TesseractOCRParser.parseInline}. The temporary
 * resources are disposed whether or not the OCR step succeeds.
 *
 * @throws IOException wrapping any {@link SAXException} raised while rendering, writing or parsing;
 *         other I/O failures from those steps are passed to {@code handleCatchableIOE}
 * @throws TikaException if Tesseract is not available for the resolved configuration
 * @throws SAXException declared by the signature; SAX failures caught in this body are rethrown
 *         wrapped in an {@code IOExceptionWithCause}
 */
void doOCROnCurrentPage() throws IOException, TikaException, SAXException {
    if (config.getOcrStrategy().equals(NO_OCR)) {
        return;
    }

    // Use the Tesseract configuration from the parse context, or the default when none is set.
    TesseractOCRConfig ocrConfig = context.get(TesseractOCRConfig.class, DEFAULT_TESSERACT_CONFIG);
    TesseractOCRParser ocrParser = new TesseractOCRParser();
    if (!tesseractIsUsable(ocrParser, ocrConfig)) {
        throw new TikaException("Tesseract is not available. "
                + "Please set the OCR_STRATEGY to NO_OCR or configure Tesseract correctly");
    }

    PDFRenderer pageRenderer = new PDFRenderer(pdDocument);
    TemporaryResources tempResources = new TemporaryResources();
    try {
        // 2.0f is passed as the scale argument of renderImage.
        BufferedImage pageImage = pageRenderer.renderImage(pageIndex, 2.0f, config.getOcrImageType());
        Path imagePath = tempResources.createTempFile();

        // Step 1: persist the rendered page so the OCR parser can read it back as a stream.
        try (OutputStream out = Files.newOutputStream(imagePath)) {
            //TODO: get output format from TesseractConfig
            ImageIOUtil.writeImage(pageImage, config.getOcrImageFormatName(), out, config.getOcrDPI(),
                    config.getOcrImageQuality());
        }

        // Step 2: feed the image file to Tesseract, writing its output inline into xhtml.
        try (InputStream in = TikaInputStream.get(imagePath)) {
            ocrParser.parseInline(in, xhtml, ocrConfig);
        }
    } catch (IOException ioFailure) {
        handleCatchableIOE(ioFailure);
    } catch (SAXException saxFailure) {
        throw new IOExceptionWithCause("error writing OCR content from PDF", saxFailure);
    } finally {
        tempResources.dispose();
    }
}

/** Reports whether the given parser finds a Tesseract installation for the given configuration. */
private static boolean tesseractIsUsable(TesseractOCRParser ocrParser, TesseractOCRConfig ocrConfig) {
    return ocrParser.hasTesseract(ocrConfig);
}

From source file:org.apache.tika.parser.pdf.AbstractPDF2XHTML.java

/**
 * Finishes the current page: processes every annotation on {@code page}, optionally runs OCR,
 * dispatches the page's additional actions and ends a {@code div} element on {@code xhtml}
 * (presumably the one opened when the page started; confirm against {@code startPage}).
 *
 * <p>Per annotation:
 * <ul>
 *   <li>file attachments are passed to {@code extractMultiOSPDEmbeddedFiles} with a
 *       {@code source="annotation"} attribute;</li>
 *   <li>widgets are passed to {@code handleWidget};</li>
 *   <li>when annotation text extraction is enabled, non-blank URI actions are written as a link
 *       and markup annotations have their title, subject and contents written as nested
 *       {@code div} elements.</li>
 * </ul>
 *
 * <p>Error handling: a {@link SAXException} or {@code TikaException} reaching the outer
 * {@code try} is rethrown as an {@code IOExceptionWithCause}. Any {@link IOException} reaching the
 * outer {@code try} is added to {@code exceptions} instead of being rethrown. This includes the
 * {@code IOExceptionWithCause} instances created for embedded-file SAX/Tika failures, since they
 * are thrown from inside that {@code try}. {@code pageIndex} is incremented in every case.
 *
 * @param page the page that has just been processed
 * @throws IOException if a SAX or Tika failure occurs outside the embedded-file handling
 */
@Override
protected void endPage(PDPage page) throws IOException {

    try {
        for (PDAnnotation annotation : page.getAnnotations()) {

            if (annotation instanceof PDAnnotationFileAttachment) {
                PDAnnotationFileAttachment fann = (PDAnnotationFileAttachment) annotation;
                // getFile() may yield a file specification that is not a
                // PDComplexFileSpecification (e.g. a simple, string-only one). Casting blindly
                // would raise a ClassCastException that bypasses all error handling below, so
                // check the type first. A null specification is still passed through, as before.
                Object rawSpec = fann.getFile();
                if (rawSpec == null || rawSpec instanceof PDComplexFileSpecification) {
                    PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) rawSpec;
                    try {
                        AttributesImpl attributes = new AttributesImpl();
                        attributes.addAttribute("", "source", "source", "CDATA", "annotation");
                        extractMultiOSPDEmbeddedFiles(fann.getAttachmentName(), fileSpec, attributes);
                    } catch (SAXException e) {
                        throw new IOExceptionWithCause("file embedded in annotation sax exception", e);
                    } catch (TikaException e) {
                        throw new IOExceptionWithCause("file embedded in annotation tika exception", e);
                    } catch (IOException e) {
                        handleCatchableIOE(e);
                    }
                }
            } else if (annotation instanceof PDAnnotationWidget) {
                handleWidget((PDAnnotationWidget) annotation);
            }
            // TODO: remove once PDFBOX-1143 is fixed:
            if (config.getExtractAnnotationText()) {
                PDActionURI uri = getActionURI(annotation);
                if (uri != null) {
                    String link = uri.getURI();
                    // Blank URIs are skipped; the URI is used as both href and visible text.
                    if (link != null && link.trim().length() > 0) {
                        xhtml.startElement("div", "class", "annotation");
                        xhtml.startElement("a", "href", link);
                        xhtml.characters(link);
                        xhtml.endElement("a");
                        xhtml.endElement("div");
                    }
                }

                if (annotation instanceof PDAnnotationMarkup) {
                    PDAnnotationMarkup annotationMarkup = (PDAnnotationMarkup) annotation;
                    String title = annotationMarkup.getTitlePopup();
                    String subject = annotationMarkup.getSubject();
                    String contents = annotationMarkup.getContents();
                    // TODO: maybe also annotationMarkup.getRichContents()?
                    // Emit the wrapper only when at least one of the three parts is present.
                    if (title != null || subject != null || contents != null) {
                        xhtml.startElement("div", "class", "annotation");

                        if (title != null) {
                            xhtml.startElement("div", "class", "annotationTitle");
                            xhtml.characters(title);
                            xhtml.endElement("div");
                        }

                        if (subject != null) {
                            xhtml.startElement("div", "class", "annotationSubject");
                            xhtml.characters(subject);
                            xhtml.endElement("div");
                        }

                        if (contents != null) {
                            xhtml.startElement("div", "class", "annotationContents");
                            xhtml.characters(contents);
                            xhtml.endElement("div");
                        }

                        xhtml.endElement("div");
                    }
                }
            }
        }
        if (config.getOcrStrategy().equals(PDFParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION)) {
            doOCROnCurrentPage();
        }

        PDPageAdditionalActions pageActions = page.getActions();
        if (pageActions != null) {
            handleDestinationOrAction(pageActions.getC(), ActionTrigger.PAGE_CLOSE);
            handleDestinationOrAction(pageActions.getO(), ActionTrigger.PAGE_OPEN);
        }
        xhtml.endElement("div");
    } catch (SAXException | TikaException e) {
        throw new IOExceptionWithCause("Unable to end a page", e);
    } catch (IOException e) {
        // Recorded rather than rethrown so that processing can continue.
        exceptions.add(e);
    } finally {
        pageIndex++;
    }
}

From source file:org.apache.tika.parser.pdf.AbstractPDF2XHTML.java

/**
 * Starts the XHTML output and then dispatches the document's open action, if any, with the
 * {@code DOCUMENT_OPEN} trigger.
 *
 * <p>An {@link IOException} raised while obtaining or handling the open action is deliberately
 * ignored (see PDFBOX-3773).
 *
 * @param pdf the document being processed
 * @throws IOException wrapping any {@link SAXException} or {@code TikaException} raised here
 */
@Override
protected void startDocument(PDDocument pdf) throws IOException {
    try {
        xhtml.startDocument();
        try {
            handleDestinationOrAction(
                    pdf.getDocumentCatalog().getOpenAction(),
                    ActionTrigger.DOCUMENT_OPEN);
        } catch (IOException ignored) {
            //See PDFBOX-3773
            //swallow -- no need to report this
        }
    } catch (SAXException | TikaException failure) {
        throw new IOExceptionWithCause("Unable to start a document", failure);
    }
}

From source file:org.apache.tika.parser.pdf.AbstractPDF2XHTML.java

/**
 * Emits the end-of-document content and closes the XHTML output.
 *
 * <p>In order: bookmark text (when enabled in the configuration), embedded documents, AcroForm
 * content (when enabled), then the document catalog's additional actions. An {@link IOException}
 * from the embedded-document or AcroForm steps is passed to {@code handleCatchableIOE}, so the
 * remaining steps still run unless that handler rethrows.
 *
 * @param pdf the document being processed
 * @throws IOException wrapping any {@code TikaException} or {@link SAXException} raised here
 */
@Override
protected void endDocument(PDDocument pdf) throws IOException {
    try {
        // Extract text for any bookmarks:
        if (config.getExtractBookmarksText()) {
            extractBookmarkText();
        }

        try {
            extractEmbeddedDocuments(pdf);
        } catch (IOException embeddedFailure) {
            handleCatchableIOE(embeddedFailure);
        }

        //extract acroform data at end of doc
        if (config.getExtractAcroFormContent()) {
            try {
                extractAcroForm(pdf);
            } catch (IOException acroFormFailure) {
                handleCatchableIOE(acroFormFailure);
            }
        }

        // Dispatch each document-level additional action with its matching trigger.
        PDDocumentCatalogAdditionalActions catalogActions = pdf.getDocumentCatalog().getActions();
        handleDestinationOrAction(catalogActions.getDP(), ActionTrigger.AFTER_DOCUMENT_PRINT);
        handleDestinationOrAction(catalogActions.getDS(), ActionTrigger.AFTER_DOCUMENT_SAVE);
        handleDestinationOrAction(catalogActions.getWC(), ActionTrigger.BEFORE_DOCUMENT_CLOSE);
        handleDestinationOrAction(catalogActions.getWP(), ActionTrigger.BEFORE_DOCUMENT_PRINT);
        handleDestinationOrAction(catalogActions.getWS(), ActionTrigger.BEFORE_DOCUMENT_SAVE);
        xhtml.endDocument();
    } catch (TikaException | SAXException failure) {
        throw new IOExceptionWithCause("Unable to end a document", failure);
    }
}

From source file:org.apache.tika.parser.pdf.PDF2XHTML.java

/**
 * Signals the start of the document to {@code handler}.
 *
 * @param pdf the document being processed (not used by this method)
 * @throws IOException wrapping any {@link SAXException} raised by the handler
 */
@Override
protected void startDocument(PDDocument pdf) throws IOException {
    try {
        handler.startDocument();
    } catch (SAXException saxFailure) {
        throw new IOExceptionWithCause("Unable to start a document", saxFailure);
    }
}

From source file:org.apache.tika.parser.pdf.PDF2XHTML.java

/**
 * Emits the end-of-document content and signals the end of the document to {@code handler}.
 *
 * <p>In order: bookmark text, embedded documents (sent to {@code originalHandler}), and AcroForm
 * content when enabled in the configuration.
 *
 * @param pdf the document being processed
 * @throws IOException wrapping any {@code TikaException} or {@link SAXException} raised here
 */
@Override
protected void endDocument(PDDocument pdf) throws IOException {
    try {
        // Extract text for any bookmarks:
        extractBookmarkText();
        extractEmbeddedDocuments(pdf, originalHandler);

        //extract acroform data at end of doc
        if (config.getExtractAcroFormContent()) {
            extractAcroForm(pdf, handler);
        }
        handler.endDocument();
    } catch (TikaException tikaFailure) {
        throw new IOExceptionWithCause("Unable to end a document", tikaFailure);
    } catch (SAXException saxFailure) {
        throw new IOExceptionWithCause("Unable to end a document", saxFailure);
    }
}

From source file:org.apache.tika.parser.pdf.PDF2XHTML.java

/**
 * Opens a {@code div} element with class {@code page} on {@code handler}, then calls
 * {@code writeParagraphStart()}.
 *
 * @param page the page about to be processed (not used by this method)
 * @throws IOException wrapping any {@link SAXException} raised while opening the element
 */
@Override
protected void startPage(PDPage page) throws IOException {
    try {
        handler.startElement("div", "class", "page");
    } catch (SAXException saxFailure) {
        throw new IOExceptionWithCause("Unable to start a page", saxFailure);
    }
    writeParagraphStart();
}

From source file:org.apache.tika.parser.pdf.PDF2XHTML.java

/**
 * Finishes the current page: ends the open paragraph, extracts the page's images, processes every
 * annotation on {@code page} and ends a {@code div} element on {@code handler} (presumably the
 * page {@code div} opened by {@code startPage}).
 *
 * <p>Per annotation:
 * <ul>
 *   <li>file attachments are passed to {@code extractMultiOSPDEmbeddedFiles};</li>
 *   <li>when annotation text extraction is enabled, link annotations with a non-blank URI action
 *       are written as an {@code a} element whose {@code href} and text are both the URI, and
 *       markup annotations have their title, subject and contents written as nested {@code div}
 *       elements.</li>
 * </ul>
 *
 * @param page the page that has just been processed
 * @throws IOException wrapping any {@link SAXException} raised here or any {@code TikaException}
 *         raised during embedded-file extraction; I/O failures from called code propagate as-is
 */
@Override
protected void endPage(PDPage page) throws IOException {
    try {
        writeParagraphEnd();

        extractImages(page.getResources(), new HashSet<COSBase>());

        EmbeddedDocumentExtractor extractor = getEmbeddedDocumentExtractor();
        for (PDAnnotation annotation : page.getAnnotations()) {

            if (annotation instanceof PDAnnotationFileAttachment) {
                PDAnnotationFileAttachment fann = (PDAnnotationFileAttachment) annotation;
                // getFile() may yield a file specification that is not a
                // PDComplexFileSpecification (e.g. a simple, string-only one). Casting blindly
                // would raise a ClassCastException, so check the type first. A null
                // specification is still passed through, as before.
                Object rawSpec = fann.getFile();
                if (rawSpec == null || rawSpec instanceof PDComplexFileSpecification) {
                    PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) rawSpec;
                    try {
                        extractMultiOSPDEmbeddedFiles("", fileSpec, extractor);
                    } catch (SAXException e) {
                        throw new IOExceptionWithCause("file embedded in annotation sax exception", e);
                    } catch (TikaException e) {
                        throw new IOExceptionWithCause("file embedded in annotation tika exception", e);
                    }
                }
            }
            // TODO: remove once PDFBOX-1143 is fixed:
            if (config.getExtractAnnotationText()) {
                if (annotation instanceof PDAnnotationLink) {
                    PDAnnotationLink annotationlink = (PDAnnotationLink) annotation;
                    // instanceof is false for null, so a missing action is skipped as well.
                    PDAction action = annotationlink.getAction();
                    if (action instanceof PDActionURI) {
                        String link = ((PDActionURI) action).getURI();
                        // Skip blank URIs and write the URI as the anchor text: an empty <a>
                        // element would leave the link invisible in the extracted content.
                        if (link != null && link.trim().length() > 0) {
                            handler.startElement("div", "class", "annotation");
                            handler.startElement("a", "href", link);
                            handler.characters(link);
                            handler.endElement("a");
                            handler.endElement("div");
                        }
                    }
                }

                if (annotation instanceof PDAnnotationMarkup) {
                    PDAnnotationMarkup annotationMarkup = (PDAnnotationMarkup) annotation;
                    String title = annotationMarkup.getTitlePopup();
                    String subject = annotationMarkup.getSubject();
                    String contents = annotationMarkup.getContents();
                    // TODO: maybe also annotationMarkup.getRichContents()?
                    // Emit the wrapper only when at least one of the three parts is present.
                    if (title != null || subject != null || contents != null) {
                        handler.startElement("div", "class", "annotation");

                        if (title != null) {
                            handler.startElement("div", "class", "annotationTitle");
                            handler.characters(title);
                            handler.endElement("div");
                        }

                        if (subject != null) {
                            handler.startElement("div", "class", "annotationSubject");
                            handler.characters(subject);
                            handler.endElement("div");
                        }

                        if (contents != null) {
                            handler.startElement("div", "class", "annotationContents");
                            handler.characters(contents);
                            handler.endElement("div");
                        }

                        handler.endElement("div");
                    }
                }
            }
        }

        handler.endElement("div");
    } catch (SAXException e) {
        throw new IOExceptionWithCause("Unable to end a page", e);
    }
}

From source file:org.apache.tika.parser.pdf.PDF2XHTML.java

/**
 * Runs the superclass paragraph-start logic and then opens a {@code p} element on
 * {@code handler}.
 *
 * @throws IOException wrapping any {@link SAXException} raised while opening the element, or
 *         propagated from the superclass call
 */
@Override
protected void writeParagraphStart() throws IOException {
    super.writeParagraphStart();
    try {
        handler.startElement("p");
    } catch (SAXException saxFailure) {
        throw new IOExceptionWithCause("Unable to start a paragraph", saxFailure);
    }
}

From source file:org.apache.tika.parser.pdf.PDF2XHTML.java

@Override
protected void writeParagraphEnd() throws IOException {
    super.writeParagraphEnd();
    try {//  w w w.  j a v a2  s .c  o m
        handler.endElement("p");
    } catch (SAXException e) {
        throw new IOExceptionWithCause("Unable to end a paragraph", e);
    }
}