Example usage for the org.apache.commons.io.input.CloseShieldInputStream constructor CloseShieldInputStream(InputStream)

Introduction

This page collects example usages of the org.apache.commons.io.input.CloseShieldInputStream constructor CloseShieldInputStream(InputStream).

Prototype

public CloseShieldInputStream(InputStream in)

Source Link

Document

Creates a proxy that shields the given input stream from being closed.

Usage

From source file:org.apache.tika.parser.html.HtmlParser.java

/**
 * Parses an HTML document read from {@code stream} with TagSoup and forwards the resulting
 * events to {@code handler}.
 *
 * <p>The character encoding is detected first. The detected charset is recorded under
 * {@code Metadata.CONTENT_ENCODING} and, when the previously declared content type is absent
 * or is one of the recognised HTML-like types, under {@code Metadata.CONTENT_TYPE} as well.
 *
 * @param stream   the document to parse; it is shielded and therefore not closed here
 * @param handler  receiver of the generated content events
 * @param metadata document metadata, read and updated
 * @param context  parse context supplying the optional {@code HtmlMapper} and {@code Schema}
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {

    // The close shield keeps the caller's stream open when try-with-resources closes the reader.
    try (AutoDetectReader reader = new AutoDetectReader(new CloseShieldInputStream(stream), metadata,
            getEncodingDetector(context))) {
        final Charset detected = reader.getCharset();
        final String declaredType = metadata.get(Metadata.CONTENT_TYPE);

        // Choose the base media type matching the declared content type, if it is one we know.
        MediaType baseType = null;
        if (declaredType == null || declaredType.startsWith("text/html")) {
            baseType = MediaType.TEXT_HTML;
        } else if (declaredType.startsWith("application/xhtml+xml")) {
            baseType = XHTML;
        } else if (declaredType.startsWith("application/vnd.wap.xhtml+xml")) {
            baseType = WAP_XHTML;
        } else if (declaredType.startsWith("application/x-asp")) {
            baseType = X_ASP;
        }
        if (baseType != null) {
            metadata.set(Metadata.CONTENT_TYPE, new MediaType(baseType, detected).toString());
        }
        // deprecated, see TIKA-431
        metadata.set(Metadata.CONTENT_ENCODING, detected.name());

        // Mapper and schema come from the parse context, falling back to the defaults
        final HtmlMapper htmlMapper = context.get(HtmlMapper.class, new HtmlParserMapper());
        final Schema tagSchema = context.get(Schema.class, HTML_SCHEMA);

        final org.ccil.cowan.tagsoup.Parser tagSoup = new org.ccil.cowan.tagsoup.Parser();
        // TIKA-528: Reuse share schema to avoid heavy instantiation
        tagSoup.setProperty(org.ccil.cowan.tagsoup.Parser.schemaProperty, tagSchema);
        // TIKA-599: Shared schema is thread-safe only if bogons are ignored
        tagSoup.setFeature(org.ccil.cowan.tagsoup.Parser.ignoreBogonsFeature, true);

        final HtmlHandler htmlHandler = new HtmlHandler(htmlMapper, handler, metadata);
        tagSoup.setContentHandler(new XHTMLDowngradeHandler(htmlHandler));

        // Parse the HTML document
        tagSoup.parse(reader.asInputSource());
    }
}

From source file:org.apache.tika.parser.image.ImageParser.java

/**
 * Extracts image metadata through ImageIO and emits an empty XHTML document.
 *
 * <p>When {@code Metadata.CONTENT_TYPE} is set, the first ImageIO reader registered for that
 * MIME type (if any) is used to read the dimensions and the image metadata of image 0.
 *
 * @param stream   the image data; it is shielded and therefore not closed here
 * @param handler  receiver of the (empty) XHTML document
 * @param metadata document metadata, read and updated
 * @param context  parse context (not used by this method)
 * @throws TikaException if ImageIO fails with anything other than the known GIF problem
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
    String type = metadata.get(Metadata.CONTENT_TYPE);
    if (type != null) {
        // If the old (pre-RFC7903) BMP mime type is given,
        //  fix it up to the new one, so Java is happy
        if (OLD_BMP_TYPE.toString().equals(type)) {
            type = MAIN_BMP_TYPE.toString();
        }

        try {
            final Iterator<ImageReader> candidates = ImageIO.getImageReadersByMIMEType(type);
            if (candidates.hasNext()) {
                final ImageReader imageReader = candidates.next();
                // The image stream is closed first, then the reader is disposed in finally.
                try (ImageInputStream input = ImageIO
                        .createImageInputStream(new CloseShieldInputStream(stream))) {
                    imageReader.setInput(input);

                    metadata.set(Metadata.IMAGE_WIDTH, Integer.toString(imageReader.getWidth(0)));
                    metadata.set(Metadata.IMAGE_LENGTH, Integer.toString(imageReader.getHeight(0)));
                    metadata.set("height", Integer.toString(imageReader.getHeight(0)));
                    metadata.set("width", Integer.toString(imageReader.getWidth(0)));

                    loadMetadata(imageReader.getImageMetadata(0), metadata);
                } finally {
                    imageReader.dispose();
                }
            }

            // Translate certain Metadata tags from the ImageIO
            //  specific namespace into the general Tika one
            setIfPresent(metadata, "CommentExtensions CommentExtension", TikaCoreProperties.COMMENTS);
            setIfPresent(metadata, "markerSequence com", TikaCoreProperties.COMMENTS);
            setIfPresent(metadata, "Data BitsPerSample", Metadata.BITS_PER_SAMPLE);
        } catch (IIOException e) {
            // TIKA-619: There is a known bug in the Sun API when dealing with GIF images
            //  which Tika will just ignore.
            final boolean knownGifBug = "Unexpected block type 0!".equals(e.getMessage())
                    && type.equals("image/gif");
            if (!knownGifBug) {
                throw new TikaException(type + " parse error", e);
            }
        }
    }

    // No text is produced for images: just open and close the document.
    final XHTMLContentHandler document = new XHTMLContentHandler(handler, metadata);
    document.startDocument();
    document.endDocument();
}

From source file:org.apache.tika.parser.isatab.ISATabUtils.java

/**
 * Reads an investigation stream with automatic charset detection and hands the reader to
 * {@code extractMetadata} together with {@code metadata} and {@code studyFileName}.
 *
 * @param stream        the investigation data; it is shielded and therefore not closed here
 * @param handler       XHTML handler (not used by this method)
 * @param metadata      metadata passed to the encoding detection and to {@code extractMetadata}
 * @param context       parse context that may carry a {@code TikaConfig}
 * @param studyFileName study file name forwarded to {@code extractMetadata}
 */
public static void parseInvestigation(InputStream stream, XHTMLContentHandler handler, Metadata metadata,
        ParseContext context, String studyFileName) throws IOException, TikaException, SAXException {

    // Use the TikaConfig from the context when there is one, otherwise the default config
    final TikaConfig fromContext = context.get(TikaConfig.class);
    final TikaConfig config = (fromContext != null) ? fromContext : TikaConfig.getDefaultConfig();

    // Automatically detect the character encoding
    try (AutoDetectReader reader = new AutoDetectReader(new CloseShieldInputStream(stream), metadata,
            config.getEncodingDetector())) {
        extractMetadata(reader, metadata, studyFileName);
    }
}

From source file:org.apache.tika.parser.isatab.ISATabUtils.java

/**
 * Renders a tab-delimited study stream as an XHTML table: the first record becomes the
 * {@code thead} cells, every following record becomes a {@code tr} inside {@code tbody}.
 *
 * @param stream   the study data; the wrapping stream is shielded and not closed here
 * @param xhtml    handler receiving the table elements
 * @param metadata metadata passed to the encoding detection
 * @param context  parse context that may carry a {@code TikaConfig}
 */
public static void parseStudy(InputStream stream, XHTMLContentHandler xhtml, Metadata metadata,
        ParseContext context) throws IOException, TikaException, SAXException {
    final TikaInputStream tikaStream = TikaInputStream.get(stream);

    // Use the TikaConfig from the context when there is one, otherwise the default config
    final TikaConfig fromContext = context.get(TikaConfig.class);
    final TikaConfig config = (fromContext != null) ? fromContext : TikaConfig.getDefaultConfig();

    // Automatically detect the character encoding
    try (AutoDetectReader reader = new AutoDetectReader(new CloseShieldInputStream(tikaStream), metadata,
            config.getEncodingDetector()); CSVParser csvParser = new CSVParser(reader, CSVFormat.TDF)) {
        final Iterator<CSVRecord> rows = csvParser.iterator();

        xhtml.startElement("table");

        // Header: only the first record, if the input has any record at all
        xhtml.startElement("thead");
        if (rows.hasNext()) {
            for (String heading : rows.next()) {
                xhtml.startElement("th");
                xhtml.characters(heading);
                xhtml.endElement("th");
            }
        }
        xhtml.endElement("thead");

        // Body: one table row per remaining record
        xhtml.startElement("tbody");
        while (rows.hasNext()) {
            xhtml.startElement("tr");
            for (String cell : rows.next()) {
                xhtml.startElement("td");
                xhtml.characters(cell);
                xhtml.endElement("td");
            }
            xhtml.endElement("tr");
        }
        xhtml.endElement("tbody");

        xhtml.endElement("table");
    }
}

From source file:org.apache.tika.parser.isatab.ISATabUtils.java

/**
 * Renders a tab-delimited assay stream as an XHTML table: the first record becomes the
 * {@code thead} cells, every following record becomes a {@code tr} inside {@code tbody}.
 *
 * @param stream   the assay data; the wrapping stream is shielded and not closed here
 * @param xhtml    handler receiving the table elements
 * @param metadata metadata passed to the encoding detection
 * @param context  parse context that may carry a {@code TikaConfig}
 */
public static void parseAssay(InputStream stream, XHTMLContentHandler xhtml, Metadata metadata,
        ParseContext context) throws IOException, TikaException, SAXException {
    final TikaInputStream tikaStream = TikaInputStream.get(stream);

    // Use the TikaConfig from the context when there is one, otherwise the default config
    final TikaConfig fromContext = context.get(TikaConfig.class);
    final TikaConfig config = (fromContext != null) ? fromContext : TikaConfig.getDefaultConfig();

    // Automatically detect the character encoding
    try (AutoDetectReader reader = new AutoDetectReader(new CloseShieldInputStream(tikaStream), metadata,
            config.getEncodingDetector()); CSVParser csvParser = new CSVParser(reader, CSVFormat.TDF)) {
        xhtml.startElement("table");

        final Iterator<CSVRecord> rows = csvParser.iterator();

        // Header: only the first record, if the input has any record at all
        xhtml.startElement("thead");
        if (rows.hasNext()) {
            for (String heading : rows.next()) {
                xhtml.startElement("th");
                xhtml.characters(heading);
                xhtml.endElement("th");
            }
        }
        xhtml.endElement("thead");

        // Body: one table row per remaining record
        xhtml.startElement("tbody");
        while (rows.hasNext()) {
            xhtml.startElement("tr");
            for (String cell : rows.next()) {
                xhtml.startElement("td");
                xhtml.characters(cell);
                xhtml.endElement("td");
            }
            xhtml.endElement("tr");
        }
        xhtml.endElement("tbody");

        xhtml.endElement("table");
    }
}

From source file:org.apache.tika.parser.iwork.IWorkPackageParser.java

/**
 * Walks the entries of an iWork zip package and, for each entry listed in
 * {@code IWORK_CONTENT_ENTRIES} whose document type can be detected, emits one XHTML document
 * produced by the content handler matching that type.
 *
 * @param stream   the zip package; it is deliberately left open (TIKA-1117)
 * @param handler  receiver of the generated content events
 * @param metadata document metadata; the detected type is added under {@code CONTENT_TYPE}
 * @param context  parse context supplying the SAX parser
 * @throws TikaException if a detected type has no handling branch
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
    final ZipArchiveInputStream zip = new ZipArchiveInputStream(stream);

    for (ZipArchiveEntry entry = zip.getNextZipEntry(); entry != null;
            entry = zip.getNextZipEntry()) {
        // Only the known content entries are of interest
        if (!IWORK_CONTENT_ENTRIES.contains(entry.getName())) {
            continue;
        }

        // mark/reset so that the bytes read for type detection can be read again for parsing
        final InputStream entryStream = new BufferedInputStream(zip, 4096);
        entryStream.mark(4096);
        final IWORKDocumentType type = IWORKDocumentType.detectType(entryStream);
        entryStream.reset();
        if (type == null) {
            continue;
        }

        final XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        final ContentHandler typeHandler;
        switch (type) {
        case KEYNOTE:
            typeHandler = new KeynoteContentHandler(xhtml, metadata);
            break;
        case NUMBERS:
            typeHandler = new NumbersContentHandler(xhtml, metadata);
            break;
        case PAGES:
            typeHandler = new PagesContentHandler(xhtml, metadata);
            break;
        case ENCRYPTED:
            // We can't do anything for the file right now
            typeHandler = null;
            break;
        default:
            throw new TikaException("Unhandled iWorks file " + type);
        }

        metadata.add(Metadata.CONTENT_TYPE, type.getType().toString());
        xhtml.startDocument();
        if (typeHandler != null) {
            // Shield the entry stream so that the SAX parser cannot close the zip stream
            context.getSAXParser().parse(new CloseShieldInputStream(entryStream),
                    new OfflineContentHandler(typeHandler));
        }
        xhtml.endDocument();
    }
    // Don't close the zip InputStream (TIKA-1117).
}

From source file:org.apache.tika.parser.microsoft.OfficeParser.java

/**
 * Extracts properties and text from an MS Document input stream.
 *
 * <p>The root directory is taken from the container already opened on a
 * {@code TikaInputStream} when there is one; otherwise a new file system is opened from the
 * backing file or from the (shielded) stream. Only a file system opened for a
 * non-{@code TikaInputStream} input is closed by this method.
 *
 * @param stream   the document; it is never closed here
 * @param handler  receiver of the generated content events
 * @param metadata document metadata
 * @param context  parse context; the {@code OfficeParserConfig} is read from it after
 *                 {@code configure(context)} has run
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {

    configure(context);
    final XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();

    final TikaInputStream tikaStream = TikaInputStream.cast(stream);
    // Stays null unless this method itself owns the file system and has to close it
    NPOIFSFileSystem ownedFs = null;
    try {
        final DirectoryNode root;
        if (tikaStream == null) {
            ownedFs = new NPOIFSFileSystem(new CloseShieldInputStream(stream));
            root = ownedFs.getRoot();
        } else {
            final Object openContainer = tikaStream.getOpenContainer();
            if (openContainer instanceof NPOIFSFileSystem) {
                root = ((NPOIFSFileSystem) openContainer).getRoot();
            } else if (openContainer instanceof DirectoryNode) {
                root = (DirectoryNode) openContainer;
            } else {
                // Prefer the backing file when the stream has one, else read the shielded stream
                final NPOIFSFileSystem sharedFs = tikaStream.hasFile()
                        ? new NPOIFSFileSystem(tikaStream.getFile(), true)
                        : new NPOIFSFileSystem(new CloseShieldInputStream(tikaStream));
                //tstream will close the fs, no need to close this below
                tikaStream.setOpenContainer(sharedFs);
                root = sharedFs.getRoot();
            }
        }
        parse(root, context, metadata, xhtml);

        if (context.get(OfficeParserConfig.class).getExtractMacros()) {
            //now try to get macros
            extractMacros(root.getNFileSystem(), xhtml,
                    EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context));
        }
    } finally {
        IOUtils.closeQuietly(ownedFs);
    }
    xhtml.endDocument();
}

From source file:org.apache.tika.parser.microsoft.ooxml.OOXMLExtractorFactory.java

/**
 * Opens (or reuses) the OPC package behind {@code stream}, picks a matching OOXML extractor
 * and lets it write metadata and XHTML content.
 *
 * <p>Unsupported or undetectable package types are delegated to {@code EmptyParser}.
 * POI/XmlBeans failures are rethrown wrapped in a {@code TikaException}.
 *
 * @param stream      the document; when a package has to be opened from the stream itself,
 *                    the stream is wrapped in a close shield first
 * @param baseHandler receiver of the generated content events
 * @param metadata    document metadata, updated with the detected content type
 * @param context     parse context; supplies the optional {@code Locale} and the
 *                    {@code OfficeParserConfig}
 */
public static void parse(InputStream stream, ContentHandler baseHandler, Metadata metadata,
        ParseContext context) throws IOException, SAXException, TikaException {
    // Locale from the context, defaulting to the JVM default locale
    Locale locale = context.get(Locale.class, Locale.getDefault());
    ExtractorFactory.setThreadPrefersEventExtractors(true);

    try {
        OOXMLExtractor extractor;
        OPCPackage pkg;

        // Locate or Open the OPCPackage for the file
        TikaInputStream tis = TikaInputStream.cast(stream);
        if (tis != null && tis.getOpenContainer() instanceof OPCPackage) {
            // Reuse the package that is already open on the stream
            pkg = (OPCPackage) tis.getOpenContainer();
        } else if (tis != null && tis.hasFile()) {
            // Open read-only from the backing file and register it as the open container
            pkg = OPCPackage.open(tis.getFile().getPath(), PackageAccess.READ);
            tis.setOpenContainer(pkg);
        } else {
            // Open from the stream itself, shielded against being closed
            InputStream shield = new CloseShieldInputStream(stream);
            pkg = OPCPackage.open(shield);
        }

        // Get the type, and ensure it's one we handle
        MediaType type = ZipContainerDetector.detectOfficeOpenXML(pkg);
        if (type == null || OOXMLParser.UNSUPPORTED_OOXML_TYPES.contains(type)) {
            // Not a supported type, delegate to Empty Parser
            EmptyParser.INSTANCE.parse(stream, baseHandler, metadata, context);
            return;
        }
        metadata.set(Metadata.CONTENT_TYPE, type.toString());

        // Have the appropriate OOXML text extractor picked
        POIXMLTextExtractor poiExtractor = null;
        // This has already been set by OOXMLParser's call to configure()
        // We can rely on this being non-null.
        OfficeParserConfig config = context.get(OfficeParserConfig.class);
        // Try the SAX-based docx extractor first, then the SAX-based pptx one, when enabled
        if (config.getUseSAXDocxExtractor()) {
            poiExtractor = trySXWPF(pkg);
        }
        if (poiExtractor == null && config.getUseSAXPptxExtractor()) {
            poiExtractor = trySXSLF(pkg);
        }
        // Fall back to whatever extractor the POI factory creates for the package
        if (poiExtractor == null) {
            poiExtractor = ExtractorFactory.createExtractor(pkg);
        }

        // Wrap the POI extractor in the matching decorator: first by extractor class,
        // then by the class of the document it exposes.
        // NOTE(review): the order of these instanceof checks may matter if the extractor
        // classes are related by inheritance - confirm before reordering.
        POIXMLDocument document = poiExtractor.getDocument();
        if (poiExtractor instanceof XSSFBEventBasedExcelExtractor) {
            extractor = new XSSFBExcelExtractorDecorator(context, poiExtractor, locale);
        } else if (poiExtractor instanceof XSSFEventBasedExcelExtractor) {
            extractor = new XSSFExcelExtractorDecorator(context, poiExtractor, locale);
        } else if (poiExtractor instanceof XWPFEventBasedWordExtractor) {
            extractor = new SXWPFWordExtractorDecorator(metadata, context,
                    (XWPFEventBasedWordExtractor) poiExtractor);
            metadata.add("X-Parsed-By", XWPFEventBasedWordExtractor.class.getCanonicalName());
        } else if (poiExtractor instanceof XSLFEventBasedPowerPointExtractor) {
            extractor = new SXSLFPowerPointExtractorDecorator(metadata, context,
                    (XSLFEventBasedPowerPointExtractor) poiExtractor);
            metadata.add("X-Parsed-By", XSLFEventBasedPowerPointExtractor.class.getCanonicalName());
        } else if (document == null) {
            throw new TikaException(
                    "Expecting UserModel based POI OOXML extractor with a document, but none found. "
                            + "The extractor returned was a " + poiExtractor);
        } else if (document instanceof XMLSlideShow) {
            extractor = new XSLFPowerPointExtractorDecorator(context,
                    (org.apache.poi.xslf.extractor.XSLFPowerPointExtractor) poiExtractor);
        } else if (document instanceof XWPFDocument) {
            extractor = new XWPFWordExtractorDecorator(context, (XWPFWordExtractor) poiExtractor);
        } else {
            extractor = new POIXMLTextExtractorDecorator(context, poiExtractor);
        }

        // Get the bulk of the metadata first, so that it's accessible during
        //  parsing if desired by the client (see TIKA-1109)
        extractor.getMetadataExtractor().extract(metadata);

        // Extract the text, along with any in-document metadata
        extractor.getXHTML(baseHandler, metadata, context);
    } catch (IllegalArgumentException e) {
        // A specific message is singled out and reported with a TIKA-418 reference
        if (e.getMessage() != null && e.getMessage().startsWith("No supported documents found")) {
            throw new TikaException(
                    "TIKA-418: RuntimeException while getting content" + " for thmx and xps file types", e);
        } else {
            throw new TikaException("Error creating OOXML extractor", e);
        }
    } catch (InvalidFormatException e) {
        throw new TikaException("Error creating OOXML extractor", e);
    } catch (OpenXML4JException e) {
        throw new TikaException("Error creating OOXML extractor", e);
    } catch (XmlException e) {
        throw new TikaException("Error creating OOXML extractor", e);

    }
}

From source file:org.apache.tika.parser.microsoft.ooxml.SXSLFPowerPointExtractorDecorator.java

/**
 * Parses every comment-authors part related to the main document with an
 * {@code XSLFCommentAuthorHandler}.
 *
 * <p>This is best effort: any failure to resolve a relationship or to parse a part is
 * recorded in the metadata under {@code TIKA_META_EXCEPTION_WARNING} and processing
 * continues with the next part.
 */
private void loadCommentAuthors() {
    PackageRelationshipCollection prc = null;
    try {
        prc = mainDocument.getRelationshipsByType(XSLFRelation.COMMENT_AUTHORS.getRelation());
    } catch (InvalidFormatException e) {
        metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ExceptionUtils.getStackTrace(e));
    }
    // Nothing to do when the lookup failed or there are no comment-author relationships
    if (prc == null || prc.size() == 0) {
        return;
    }

    for (int i = 0; i < prc.size(); i++) {
        PackagePart commentAuthorsPart = null;
        try {
            // Single assignment; the original assigned the variable to itself a second time
            commentAuthorsPart = mainDocument.getRelatedPart(prc.getRelationship(i));
        } catch (InvalidFormatException e) {
            metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ExceptionUtils.getStackTrace(e));
        }
        if (commentAuthorsPart == null) {
            continue;
        }
        // The part's stream is closed by try-with-resources; the shield keeps the SAX parser
        // from closing it earlier.
        try (InputStream stream = commentAuthorsPart.getInputStream()) {
            context.getSAXParser().parse(new CloseShieldInputStream(stream),
                    new OfflineContentHandler(new XSLFCommentAuthorHandler()));

        } catch (TikaException | SAXException | IOException e) {
            metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ExceptionUtils.getStackTrace(e));
        }
    }

}

From source file:org.apache.tika.parser.microsoft.ooxml.SXSLFPowerPointExtractorDecorator.java

/**
 * Writes one slide: the slide's own content inside a {@code div class="slide-content"},
 * followed by its related layout, notes, notes-master and comments parts.
 *
 * <p>A {@code TikaException} raised while parsing the slide content is recorded in the
 * metadata under {@code TIKA_META_EXCEPTION_WARNING} instead of being propagated.
 *
 * @param slidePart the package part holding the slide
 * @param xhtml     handler receiving the generated elements
 */
private void handleSlidePart(PackagePart slidePart, XHTMLContentHandler xhtml)
        throws IOException, SAXException {
    final Map<String, String> relationships = loadLinkedRelationships(slidePart, false, metadata);

    xhtml.startElement("div", "class", "slide-content");
    try (InputStream slideStream = slidePart.getInputStream()) {
        // The shield keeps the SAX parser from closing the part's stream early
        context.getSAXParser().parse(new CloseShieldInputStream(slideStream),
                new OfflineContentHandler(new EmbeddedContentHandler(new OOXMLWordAndPowerPointTextHandler(
                        new OOXMLTikaBodyPartHandler(xhtml), relationships))));
    } catch (TikaException e) {
        metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ExceptionUtils.getStackTrace(e));
    }
    xhtml.endElement("div");

    // Each related-part pass gets its own freshly created text handler
    final PlaceHolderSkipper layoutHandler = new PlaceHolderSkipper(
            new OOXMLWordAndPowerPointTextHandler(new OOXMLTikaBodyPartHandler(xhtml), relationships));
    handleBasicRelatedParts(XSLFRelation.SLIDE_LAYOUT.getRelation(), "slide-master-content", slidePart,
            layoutHandler);

    final OOXMLWordAndPowerPointTextHandler notesHandler = new OOXMLWordAndPowerPointTextHandler(
            new OOXMLTikaBodyPartHandler(xhtml), relationships);
    handleBasicRelatedParts(XSLFRelation.NOTES.getRelation(), "slide-notes", slidePart, notesHandler);

    final OOXMLWordAndPowerPointTextHandler notesMasterHandler = new OOXMLWordAndPowerPointTextHandler(
            new OOXMLTikaBodyPartHandler(xhtml), relationships);
    handleBasicRelatedParts(XSLFRelation.NOTES_MASTER.getRelation(), "slide-notes-master", slidePart,
            notesMasterHandler);

    final XSLFCommentsHandler commentsHandler = new XSLFCommentsHandler(xhtml);
    handleBasicRelatedParts(XSLFRelation.COMMENTS.getRelation(), null, slidePart, commentsHandler);

}