Example usage for the org.apache.commons.io.IOExceptionWithCause constructor IOExceptionWithCause(String, Throwable)

List of usage examples for the org.apache.commons.io.IOExceptionWithCause constructor IOExceptionWithCause(String, Throwable)

Introduction

On this page you can find example usages of the org.apache.commons.io.IOExceptionWithCause constructor IOExceptionWithCause(String, Throwable).

Prototype

public IOExceptionWithCause(String message, Throwable cause) 

Source Link

Document

Constructs a new instance with the given message and cause.

Usage

From source file:org.apache.tika.parser.pdf.AbstractPDF2XHTML.java

/**
 * Renders the page at {@code pageIndex} to an image, writes it to a temporary file and
 * feeds that file to the Tesseract OCR parser, which emits its output inline to
 * {@code xhtml}.
 *
 * <p>Returns immediately when the configured OCR strategy equals {@code NO_OCR}.
 *
 * @throws IOException   wrapping any {@link SAXException} raised while rendering or
 *                       parsing; an {@code IOException} raised there is passed to
 *                       {@code handleCatchableIOE}
 * @throws TikaException if Tesseract is reported as unavailable for the active
 *                       configuration
 * @throws SAXException  declared by the signature
 */
void doOCROnCurrentPage() throws IOException, TikaException, SAXException {
    if (config.getOcrStrategy().equals(NO_OCR)) {
        return;
    }

    TesseractOCRConfig ocrConfig = context.get(TesseractOCRConfig.class, DEFAULT_TESSERACT_CONFIG);
    TesseractOCRParser ocrParser = new TesseractOCRParser();
    if (!ocrParser.hasTesseract(ocrConfig)) {
        throw new TikaException("Tesseract is not available. "
                + "Please set the OCR_STRATEGY to NO_OCR or configure Tesseract correctly");
    }

    PDFRenderer pageRenderer = new PDFRenderer(pdDocument);
    TemporaryResources tempResources = new TemporaryResources();
    try {
        // 2.0f is passed as the second renderImage argument (the render scale).
        BufferedImage pageImage = pageRenderer.renderImage(pageIndex, 2.0f, config.getOcrImageType());
        Path imagePath = tempResources.createTempFile();

        // The image is fully written and the stream closed before the file is read back.
        try (OutputStream imageOut = Files.newOutputStream(imagePath)) {
            //TODO: get output format from TesseractConfig
            ImageIOUtil.writeImage(pageImage, config.getOcrImageFormatName(), imageOut,
                    config.getOcrDPI(), config.getOcrImageQuality());
        }

        try (InputStream imageIn = TikaInputStream.get(imagePath)) {
            ocrParser.parseInline(imageIn, xhtml, ocrConfig);
        }
    } catch (IOException ioFailure) {
        handleCatchableIOE(ioFailure);
    } catch (SAXException saxFailure) {
        throw new IOExceptionWithCause("error writing OCR content from PDF", saxFailure);
    } finally {
        // Always release the temporary file, whether or not OCR succeeded.
        tempResources.dispose();
    }
}

From source file:org.apache.tika.parser.pdf.AbstractPDF2XHTML.java

/**
 * Finishes processing of a page: walks the page's annotations, optionally runs OCR,
 * handles the page's additional actions and closes the page's {@code div} element.
 *
 * <p>For each annotation:
 * <ul>
 *   <li>file attachment annotations have their embedded files extracted;</li>
 *   <li>widget annotations are passed to {@code handleWidget};</li>
 *   <li>when {@code config.getExtractAnnotationText()} is true, the annotation's URI
 *       action (if any, and non-blank) and markup title/subject/contents are written
 *       to {@code xhtml}.</li>
 * </ul>
 *
 * <p>{@code pageIndex} is incremented in all cases, including when an exception is
 * thrown.
 *
 * @param page the page that has just been processed
 * @throws IOException wrapping a {@link SAXException} or {@code TikaException} raised
 *                     by the page-level processing; an {@code IOException} raised
 *                     inside the method is added to {@code exceptions} instead of
 *                     being thrown
 */
@Override
protected void endPage(PDPage page) throws IOException {

    try {
        for (PDAnnotation annotation : page.getAnnotations()) {

            if (annotation instanceof PDAnnotationFileAttachment) {
                PDAnnotationFileAttachment fann = (PDAnnotationFileAttachment) annotation;
                // getFile() is not guaranteed to return a complex file specification;
                // casting unconditionally would abort the whole parse with a
                // ClassCastException. Only a null or complex specification is passed on
                // (null is forwarded exactly as before); anything else is skipped.
                Object rawSpec = fann.getFile();
                if (rawSpec == null || rawSpec instanceof PDComplexFileSpecification) {
                    PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) rawSpec;
                    try {
                        AttributesImpl attributes = new AttributesImpl();
                        attributes.addAttribute("", "source", "source", "CDATA", "annotation");
                        extractMultiOSPDEmbeddedFiles(fann.getAttachmentName(), fileSpec, attributes);
                    } catch (SAXException e) {
                        // NOTE: this wrapper is itself an IOException, so it is caught by the
                        // outer IOException handler below and recorded in `exceptions`.
                        throw new IOExceptionWithCause("file embedded in annotation sax exception", e);
                    } catch (TikaException e) {
                        // Same as above: ends up in the outer IOException handler.
                        throw new IOExceptionWithCause("file embedded in annotation tika exception", e);
                    } catch (IOException e) {
                        handleCatchableIOE(e);
                    }
                }
            } else if (annotation instanceof PDAnnotationWidget) {
                handleWidget((PDAnnotationWidget) annotation);
            }
            // TODO: remove once PDFBOX-1143 is fixed:
            if (config.getExtractAnnotationText()) {
                PDActionURI uri = getActionURI(annotation);
                if (uri != null) {
                    String link = uri.getURI();
                    // Skip null and whitespace-only links.
                    if (link != null && link.trim().length() > 0) {
                        xhtml.startElement("div", "class", "annotation");
                        xhtml.startElement("a", "href", link);
                        xhtml.characters(link);
                        xhtml.endElement("a");
                        xhtml.endElement("div");
                    }
                }

                if (annotation instanceof PDAnnotationMarkup) {
                    PDAnnotationMarkup annotationMarkup = (PDAnnotationMarkup) annotation;
                    String title = annotationMarkup.getTitlePopup();
                    String subject = annotationMarkup.getSubject();
                    String contents = annotationMarkup.getContents();
                    // TODO: maybe also annotationMarkup.getRichContents()?
                    // Emit the wrapper div only when at least one field is present.
                    if (title != null || subject != null || contents != null) {
                        xhtml.startElement("div", "class", "annotation");

                        if (title != null) {
                            xhtml.startElement("div", "class", "annotationTitle");
                            xhtml.characters(title);
                            xhtml.endElement("div");
                        }

                        if (subject != null) {
                            xhtml.startElement("div", "class", "annotationSubject");
                            xhtml.characters(subject);
                            xhtml.endElement("div");
                        }

                        if (contents != null) {
                            xhtml.startElement("div", "class", "annotationContents");
                            xhtml.characters(contents);
                            xhtml.endElement("div");
                        }

                        xhtml.endElement("div");
                    }
                }
            }
        }
        if (config.getOcrStrategy().equals(PDFParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION)) {
            doOCROnCurrentPage();
        }

        PDPageAdditionalActions pageActions = page.getActions();
        if (pageActions != null) {
            handleDestinationOrAction(pageActions.getC(), ActionTrigger.PAGE_CLOSE);
            handleDestinationOrAction(pageActions.getO(), ActionTrigger.PAGE_OPEN);
        }
        // Closes the page-level element.
        xhtml.endElement("div");
    } catch (SAXException | TikaException e) {
        throw new IOExceptionWithCause("Unable to end a page", e);
    } catch (IOException e) {
        // Recorded rather than rethrown so processing can continue with the next page.
        exceptions.add(e);
    } finally {
        pageIndex++;
    }
}

From source file:org.apache.tika.parser.pdf.AbstractPDF2XHTML.java

/**
 * Starts the XHTML document and then processes the document catalog's open action
 * with the {@code DOCUMENT_OPEN} trigger.
 *
 * @param pdf the document being processed
 * @throws IOException wrapping any {@code TikaException} or {@link SAXException}
 *                     raised while starting the document
 */
@Override
protected void startDocument(PDDocument pdf) throws IOException {
    try {
        xhtml.startDocument();
        try {
            handleDestinationOrAction(
                    pdf.getDocumentCatalog().getOpenAction(),
                    ActionTrigger.DOCUMENT_OPEN);
        } catch (IOException ignored) {
            // Intentionally swallowed: see PDFBOX-3773; there is no need to report this.
        }
    } catch (SAXException | TikaException failure) {
        throw new IOExceptionWithCause("Unable to start a document", failure);
    }
}

From source file:org.apache.tika.parser.pdf.AbstractPDF2XHTML.java

/**
 * Finishes the document: extracts bookmark text (when enabled), embedded documents and
 * AcroForm content (when enabled), processes the catalog's additional actions, and
 * ends the XHTML document.
 *
 * <p>An {@code IOException} from embedded-document or AcroForm extraction is passed to
 * {@code handleCatchableIOE}.
 *
 * @param pdf the document being processed
 * @throws IOException wrapping any {@code TikaException} or {@link SAXException}
 *                     raised along the way
 */
@Override
protected void endDocument(PDDocument pdf) throws IOException {
    try {
        // Bookmark text is optional and controlled by configuration.
        if (config.getExtractBookmarksText()) {
            extractBookmarkText();
        }

        try {
            extractEmbeddedDocuments(pdf);
        } catch (IOException embeddedFailure) {
            handleCatchableIOE(embeddedFailure);
        }

        // AcroForm data is extracted at the end of the document.
        if (config.getExtractAcroFormContent()) {
            try {
                extractAcroForm(pdf);
            } catch (IOException acroFormFailure) {
                handleCatchableIOE(acroFormFailure);
            }
        }

        PDDocumentCatalogAdditionalActions catalogActions = pdf.getDocumentCatalog().getActions();
        handleDestinationOrAction(catalogActions.getDP(), ActionTrigger.AFTER_DOCUMENT_PRINT);
        handleDestinationOrAction(catalogActions.getDS(), ActionTrigger.AFTER_DOCUMENT_SAVE);
        handleDestinationOrAction(catalogActions.getWC(), ActionTrigger.BEFORE_DOCUMENT_CLOSE);
        handleDestinationOrAction(catalogActions.getWP(), ActionTrigger.BEFORE_DOCUMENT_PRINT);
        handleDestinationOrAction(catalogActions.getWS(), ActionTrigger.BEFORE_DOCUMENT_SAVE);

        xhtml.endDocument();
    } catch (TikaException | SAXException failure) {
        throw new IOExceptionWithCause("Unable to end a document", failure);
    }
}

From source file:org.apache.tika.parser.pdf.PDF2XHTML.java

/**
 * Signals the start of the document to {@code handler}.
 *
 * @param pdf the document being processed (not used by this method)
 * @throws IOException wrapping any {@link SAXException} raised by the handler
 */
@Override
protected void startDocument(PDDocument pdf) throws IOException {
    try {
        handler.startDocument();
    } catch (final SAXException saxFailure) {
        throw new IOExceptionWithCause("Unable to start a document", saxFailure);
    }
}

From source file:org.apache.tika.parser.pdf.PDF2XHTML.java

/**
 * Finishes the document: extracts bookmark text and embedded documents, extracts
 * AcroForm content when enabled in {@code config}, and signals the end of the document
 * to {@code handler}.
 *
 * @param pdf the document being processed
 * @throws IOException wrapping any {@link SAXException} or {@code TikaException}
 *                     raised along the way
 */
@Override
protected void endDocument(PDDocument pdf) throws IOException {
    try {
        // Bookmarks first, then embedded documents.
        extractBookmarkText();
        extractEmbeddedDocuments(pdf, originalHandler);

        // AcroForm data is extracted at the end of the document.
        if (config.getExtractAcroFormContent()) {
            extractAcroForm(pdf, handler);
        }

        handler.endDocument();
    } catch (final SAXException saxFailure) {
        throw new IOExceptionWithCause("Unable to end a document", saxFailure);
    } catch (final TikaException tikaFailure) {
        throw new IOExceptionWithCause("Unable to end a document", tikaFailure);
    }
}

From source file:org.apache.tika.parser.pdf.PDF2XHTML.java

/**
 * Opens the {@code <div class="page">} element for a new page and then starts its
 * first paragraph.
 *
 * @param page the page about to be processed (not used by this method)
 * @throws IOException wrapping any {@link SAXException} raised by the handler
 */
@Override
protected void startPage(PDPage page) throws IOException {
    try {
        handler.startElement("div", "class", "page");
    } catch (final SAXException saxFailure) {
        throw new IOExceptionWithCause("Unable to start a page", saxFailure);
    }

    // Paragraph start happens only after the page element was opened successfully.
    writeParagraphStart();
}

From source file:org.apache.tika.parser.pdf.PDF2XHTML.java

/**
 * Finishes processing of a page: closes the current paragraph, extracts the page's
 * images, walks the page's annotations and closes the page's {@code div} element.
 *
 * <p>For each annotation, embedded files of file attachment annotations are
 * extracted. When {@code config.getExtractAnnotationText()} is true, link annotations
 * with a URI action and markup annotations (title, subject, contents) are written to
 * {@code handler}.
 *
 * @param page the page that has just been processed
 * @throws IOException wrapping any {@link SAXException} or {@code TikaException}
 *                     raised along the way, or propagated unchanged from the
 *                     underlying calls
 */
@Override
protected void endPage(PDPage page) throws IOException {
    try {
        writeParagraphEnd();

        extractImages(page.getResources(), new HashSet<COSBase>());

        EmbeddedDocumentExtractor extractor = getEmbeddedDocumentExtractor();
        for (PDAnnotation annotation : page.getAnnotations()) {

            if (annotation instanceof PDAnnotationFileAttachment) {
                PDAnnotationFileAttachment fann = (PDAnnotationFileAttachment) annotation;
                // getFile() is not guaranteed to return a complex file specification;
                // casting unconditionally would abort the whole parse with a
                // ClassCastException. Only a null or complex specification is passed on
                // (null is forwarded exactly as before); anything else is skipped.
                Object rawSpec = fann.getFile();
                if (rawSpec == null || rawSpec instanceof PDComplexFileSpecification) {
                    PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) rawSpec;
                    try {
                        extractMultiOSPDEmbeddedFiles("", fileSpec, extractor);
                    } catch (SAXException e) {
                        throw new IOExceptionWithCause("file embedded in annotation sax exception", e);
                    } catch (TikaException e) {
                        throw new IOExceptionWithCause("file embedded in annotation tika exception", e);
                    }
                }
            }
            // TODO: remove once PDFBOX-1143 is fixed:
            if (config.getExtractAnnotationText()) {
                if (annotation instanceof PDAnnotationLink) {
                    PDAnnotationLink annotationlink = (PDAnnotationLink) annotation;
                    // Fetched once; instanceof is false for null, so no separate null check.
                    PDAction action = annotationlink.getAction();
                    if (action instanceof PDActionURI) {
                        PDActionURI uri = (PDActionURI) action;
                        String link = uri.getURI();
                        if (link != null) {
                            // The anchor carries the link only in its href attribute.
                            handler.startElement("div", "class", "annotation");
                            handler.startElement("a", "href", link);
                            handler.endElement("a");
                            handler.endElement("div");
                        }
                    }
                }

                if (annotation instanceof PDAnnotationMarkup) {
                    PDAnnotationMarkup annotationMarkup = (PDAnnotationMarkup) annotation;
                    String title = annotationMarkup.getTitlePopup();
                    String subject = annotationMarkup.getSubject();
                    String contents = annotationMarkup.getContents();
                    // TODO: maybe also annotationMarkup.getRichContents()?
                    // Emit the wrapper div only when at least one field is present.
                    if (title != null || subject != null || contents != null) {
                        handler.startElement("div", "class", "annotation");

                        if (title != null) {
                            handler.startElement("div", "class", "annotationTitle");
                            handler.characters(title);
                            handler.endElement("div");
                        }

                        if (subject != null) {
                            handler.startElement("div", "class", "annotationSubject");
                            handler.characters(subject);
                            handler.endElement("div");
                        }

                        if (contents != null) {
                            handler.startElement("div", "class", "annotationContents");
                            handler.characters(contents);
                            handler.endElement("div");
                        }

                        handler.endElement("div");
                    }
                }
            }
        }

        // Closes the page-level element.
        handler.endElement("div");
    } catch (SAXException e) {
        throw new IOExceptionWithCause("Unable to end a page", e);
    }
}

From source file:org.apache.tika.parser.pdf.PDF2XHTML.java

/**
 * Runs the superclass paragraph-start logic and then opens a {@code <p>} element on
 * {@code handler}.
 *
 * @throws IOException from the superclass call, or wrapping any {@link SAXException}
 *                     raised by the handler
 */
@Override
protected void writeParagraphStart() throws IOException {
    super.writeParagraphStart();
    try {
        handler.startElement("p");
    } catch (final SAXException saxFailure) {
        throw new IOExceptionWithCause("Unable to start a paragraph", saxFailure);
    }
}

From source file:org.apache.tika.parser.pdf.PDF2XHTML.java

/**
 * Runs the superclass paragraph-end logic and then closes the {@code <p>} element on
 * {@code handler}.
 *
 * @throws IOException from the superclass call, or wrapping any {@link SAXException}
 *                     raised by the handler
 */
@Override
protected void writeParagraphEnd() throws IOException {
    super.writeParagraphEnd();
    try {
        handler.endElement("p");
    } catch (final SAXException saxFailure) {
        throw new IOExceptionWithCause("Unable to end a paragraph", saxFailure);
    }
}