Example usage for the org.apache.commons.io.input.CloseShieldInputStream constructor

List of usage examples for the org.apache.commons.io.input.CloseShieldInputStream constructor

Introduction

On this page you can find example usages of the org.apache.commons.io.input.CloseShieldInputStream constructor.

Prototype

public CloseShieldInputStream(InputStream in) 

Source Link

Document

Creates a proxy that shields the given input stream from being closed.

Usage

From source file:org.apache.tika.parser.html.HtmlParser.java

/**
 * Parses an HTML document with TagSoup after detecting its character encoding.
 *
 * <p>The detected charset is written to the metadata (as part of the content
 * type when the declared type is absent or one of the recognised HTML
 * flavours, and always as the content encoding) before the document is parsed.
 *
 * @param stream   the document to parse; wrapped in a close shield so that
 *                 closing the encoding-detecting reader does not close it
 * @param handler  receives the events produced for the document
 * @param metadata read for the declared content type, updated with the
 *                 detected content type and encoding
 * @param context  supplies the optional {@code HtmlMapper} and {@code Schema}
 * @throws IOException   if the stream cannot be read
 * @throws SAXException  if the handler or the parser reports an error
 * @throws TikaException if encoding detection fails
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {

    // The reader is closed at the end of the try block; the shield keeps the caller's stream open.
    try (AutoDetectReader encodingReader = new AutoDetectReader(new CloseShieldInputStream(stream), metadata,
            getEncodingDetector(context))) {
        Charset detectedCharset = encodingReader.getCharset();
        String declaredType = metadata.get(Metadata.CONTENT_TYPE);

        // Pick the base media type matching the declared content type, if any is recognised
        MediaType baseType = null;
        if (declaredType == null || declaredType.startsWith("text/html")) {
            baseType = MediaType.TEXT_HTML;
        } else if (declaredType.startsWith("application/xhtml+xml")) {
            baseType = XHTML;
        } else if (declaredType.startsWith("application/vnd.wap.xhtml+xml")) {
            baseType = WAP_XHTML;
        } else if (declaredType.startsWith("application/x-asp")) {
            baseType = X_ASP;
        }
        if (baseType != null) {
            MediaType typeWithCharset = new MediaType(baseType, detectedCharset);
            metadata.set(Metadata.CONTENT_TYPE, typeWithCharset.toString());
        }
        // deprecated, see TIKA-431
        metadata.set(Metadata.CONTENT_ENCODING, detectedCharset.name());

        // The mapper and the schema both come from the parse context, with defaults as fallback
        HtmlMapper htmlMapper = context.get(HtmlMapper.class, new HtmlParserMapper());
        org.ccil.cowan.tagsoup.Parser tagSoup = new org.ccil.cowan.tagsoup.Parser();
        Schema htmlSchema = context.get(Schema.class, HTML_SCHEMA);

        // TIKA-528: Reuse share schema to avoid heavy instantiation
        tagSoup.setProperty(org.ccil.cowan.tagsoup.Parser.schemaProperty, htmlSchema);
        // TIKA-599: Shared schema is thread-safe only if bogons are ignored
        tagSoup.setFeature(org.ccil.cowan.tagsoup.Parser.ignoreBogonsFeature, true);

        HtmlHandler htmlHandler = new HtmlHandler(htmlMapper, handler, metadata);
        tagSoup.setContentHandler(new XHTMLDowngradeHandler(htmlHandler));

        // Parse the HTML document
        tagSoup.parse(encodingReader.asInputSource());
    }
}

From source file:org.apache.tika.parser.image.ImageParser.java

/**
 * Extracts image dimensions and ImageIO metadata for the content type
 * declared in the metadata, then emits an empty XHTML document.
 *
 * <p>Nothing is read from the stream when no content type is declared or no
 * ImageIO reader is registered for it.
 *
 * @param stream   the image data; wrapped in a close shield so that closing
 *                 the ImageIO stream does not close it
 * @param handler  receives the (empty) XHTML document
 * @param metadata read for the content type, updated with the image properties
 * @param context  the parse context (not used by this method)
 * @throws IOException   if the image cannot be read
 * @throws SAXException  if the handler reports an error
 * @throws TikaException if ImageIO fails for a reason other than the known
 *                       GIF problem (TIKA-619)
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
    String mimeType = metadata.get(Metadata.CONTENT_TYPE);
    if (mimeType != null) {
        // If the old (pre-RFC7903) BMP mime type is given,
        //  fix it up to the new one, so Java is happy
        if (OLD_BMP_TYPE.toString().equals(mimeType)) {
            mimeType = MAIN_BMP_TYPE.toString();
        }

        try {
            Iterator<ImageReader> candidates = ImageIO.getImageReadersByMIMEType(mimeType);
            if (candidates.hasNext()) {
                ImageReader imageReader = candidates.next();
                // The image stream is closed first, then the reader is disposed
                try (ImageInputStream imageInput = ImageIO
                        .createImageInputStream(new CloseShieldInputStream(stream))) {
                    imageReader.setInput(imageInput);

                    metadata.set(Metadata.IMAGE_WIDTH, Integer.toString(imageReader.getWidth(0)));
                    metadata.set(Metadata.IMAGE_LENGTH, Integer.toString(imageReader.getHeight(0)));
                    metadata.set("height", Integer.toString(imageReader.getHeight(0)));
                    metadata.set("width", Integer.toString(imageReader.getWidth(0)));

                    loadMetadata(imageReader.getImageMetadata(0), metadata);
                } finally {
                    imageReader.dispose();
                }
            }

            // Translate certain Metadata tags from the ImageIO
            //  specific namespace into the general Tika one
            setIfPresent(metadata, "CommentExtensions CommentExtension", TikaCoreProperties.COMMENTS);
            setIfPresent(metadata, "markerSequence com", TikaCoreProperties.COMMENTS);
            setIfPresent(metadata, "Data BitsPerSample", Metadata.BITS_PER_SAMPLE);
        } catch (IIOException e) {
            // TIKA-619: There is a known bug in the Sun API when dealing with GIF images
            //  which Tika will just ignore.
            boolean knownGifBug = "Unexpected block type 0!".equals(e.getMessage())
                    && mimeType.equals("image/gif");
            if (!knownGifBug) {
                throw new TikaException(mimeType + " parse error", e);
            }
        }
    }

    // Images have no text content, so the document is opened and closed straight away
    XHTMLContentHandler emptyDocument = new XHTMLContentHandler(handler, metadata);
    emptyDocument.startDocument();
    emptyDocument.endDocument();
}

From source file:org.apache.tika.parser.isatab.ISATabUtils.java

/**
 * Reads an investigation stream with automatic character-encoding detection
 * and passes the resulting reader to {@code extractMetadata}.
 *
 * @param stream        the investigation data; wrapped in a close shield so
 *                      that closing the reader does not close it
 * @param handler       the XHTML handler (not used by this method)
 * @param metadata      passed to the encoding detection and to {@code extractMetadata}
 * @param context       may supply a {@code TikaConfig}; the default
 *                      configuration is used when it does not
 * @param studyFileName forwarded to {@code extractMetadata}
 * @throws IOException   if the stream cannot be read
 * @throws TikaException if encoding detection or metadata extraction fails
 * @throws SAXException  declared for callers; propagated from the helpers
 */
public static void parseInvestigation(InputStream stream, XHTMLContentHandler handler, Metadata metadata,
        ParseContext context, String studyFileName) throws IOException, TikaException, SAXException {

    // Fall back to the default configuration only when the context carries none
    TikaConfig configured = context.get(TikaConfig.class);
    TikaConfig tikaConfig = (configured != null) ? configured : TikaConfig.getDefaultConfig();

    // Automatically detect the character encoding
    try (AutoDetectReader decoded = new AutoDetectReader(new CloseShieldInputStream(stream), metadata,
            tikaConfig.getEncodingDetector())) {
        extractMetadata(decoded, metadata, studyFileName);
    }
}

From source file:org.apache.tika.parser.isatab.ISATabUtils.java

/**
 * Renders a tab-delimited study stream as an XHTML table: the first record
 * becomes the {@code thead} cells, every following record a {@code tr} in
 * the {@code tbody}.
 *
 * @param stream   the study data
 * @param xhtml    receives the table elements
 * @param metadata passed to the encoding detection
 * @param context  may supply a {@code TikaConfig}; the default configuration
 *                 is used when it does not
 * @throws IOException   if the stream cannot be read
 * @throws TikaException if encoding detection fails
 * @throws SAXException  if the handler reports an error
 */
public static void parseStudy(InputStream stream, XHTMLContentHandler xhtml, Metadata metadata,
        ParseContext context) throws IOException, TikaException, SAXException {
    TikaInputStream tis = TikaInputStream.get(stream);

    TikaConfig configured = context.get(TikaConfig.class);
    TikaConfig tikaConfig = (configured != null) ? configured : TikaConfig.getDefaultConfig();

    // Automatically detect the character encoding; the shield keeps tis open when the reader closes
    try (AutoDetectReader decoded = new AutoDetectReader(new CloseShieldInputStream(tis), metadata,
            tikaConfig.getEncodingDetector()); CSVParser tsv = new CSVParser(decoded, CSVFormat.TDF)) {
        Iterator<CSVRecord> rows = tsv.iterator();

        xhtml.startElement("table");

        // Header: first record, one <th> per column
        xhtml.startElement("thead");
        if (rows.hasNext()) {
            CSVRecord header = rows.next();
            int columns = header.size();
            for (int col = 0; col < columns; col++) {
                xhtml.startElement("th");
                xhtml.characters(header.get(col));
                xhtml.endElement("th");
            }
        }
        xhtml.endElement("thead");

        // Body: every remaining record, one <td> per column
        xhtml.startElement("tbody");
        while (rows.hasNext()) {
            CSVRecord row = rows.next();
            int columns = row.size();
            xhtml.startElement("tr");
            for (int col = 0; col < columns; col++) {
                xhtml.startElement("td");
                xhtml.characters(row.get(col));
                xhtml.endElement("td");
            }
            xhtml.endElement("tr");
        }
        xhtml.endElement("tbody");

        xhtml.endElement("table");
    }
}

From source file:org.apache.tika.parser.isatab.ISATabUtils.java

/**
 * Renders a tab-delimited assay stream as an XHTML table: the first record
 * becomes the {@code thead} cells, every following record a {@code tr} in
 * the {@code tbody}.
 *
 * @param stream   the assay data
 * @param xhtml    receives the table elements
 * @param metadata passed to the encoding detection
 * @param context  may supply a {@code TikaConfig}; the default configuration
 *                 is used when it does not
 * @throws IOException   if the stream cannot be read
 * @throws TikaException if encoding detection fails
 * @throws SAXException  if the handler reports an error
 */
public static void parseAssay(InputStream stream, XHTMLContentHandler xhtml, Metadata metadata,
        ParseContext context) throws IOException, TikaException, SAXException {
    TikaInputStream tis = TikaInputStream.get(stream);

    TikaConfig configured = context.get(TikaConfig.class);
    TikaConfig tikaConfig = (configured != null) ? configured : TikaConfig.getDefaultConfig();

    // Automatically detect the character encoding; the shield keeps tis open when the reader closes
    try (AutoDetectReader decoded = new AutoDetectReader(new CloseShieldInputStream(tis), metadata,
            tikaConfig.getEncodingDetector()); CSVParser tsv = new CSVParser(decoded, CSVFormat.TDF)) {
        xhtml.startElement("table");

        Iterator<CSVRecord> rows = tsv.iterator();

        // Header: first record, one <th> per column
        xhtml.startElement("thead");
        if (rows.hasNext()) {
            CSVRecord header = rows.next();
            int columns = header.size();
            for (int col = 0; col < columns; col++) {
                xhtml.startElement("th");
                xhtml.characters(header.get(col));
                xhtml.endElement("th");
            }
        }
        xhtml.endElement("thead");

        // Body: every remaining record, one <td> per column
        xhtml.startElement("tbody");
        while (rows.hasNext()) {
            CSVRecord row = rows.next();
            int columns = row.size();
            xhtml.startElement("tr");
            for (int col = 0; col < columns; col++) {
                xhtml.startElement("td");
                xhtml.characters(row.get(col));
                xhtml.endElement("td");
            }
            xhtml.endElement("tr");
        }
        xhtml.endElement("tbody");

        xhtml.endElement("table");
    }
}

From source file:org.apache.tika.parser.iwork.IWorkPackageParser.java

/**
 * Walks the entries of an iWork zip package and SAX-parses every entry whose
 * name is listed in {@code IWORK_CONTENT_ENTRIES} and whose document type can
 * be detected.
 *
 * @param stream   the zip package; it is deliberately left open (TIKA-1117)
 * @param handler  receives the XHTML events
 * @param metadata receives the detected content type of each parsed entry
 * @param context  supplies the SAX parser
 * @throws IOException   if the package cannot be read
 * @throws SAXException  if the handler or the SAX parser reports an error
 * @throws TikaException if an entry has a document type this method does not handle
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
    ZipArchiveInputStream zip = new ZipArchiveInputStream(stream);

    for (ZipArchiveEntry entry = zip.getNextZipEntry(); entry != null; entry = zip.getNextZipEntry()) {
        if (!IWORK_CONTENT_ENTRIES.contains(entry.getName())) {
            continue;
        }

        // Peek at the start of the entry to detect its type, then rewind
        InputStream entryStream = new BufferedInputStream(zip, 4096);
        entryStream.mark(4096);
        IWORKDocumentType type = IWORKDocumentType.detectType(entryStream);
        entryStream.reset();

        if (type == null) {
            continue;
        }

        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        ContentHandler contentHandler;

        switch (type) {
        case KEYNOTE:
            contentHandler = new KeynoteContentHandler(xhtml, metadata);
            break;
        case NUMBERS:
            contentHandler = new NumbersContentHandler(xhtml, metadata);
            break;
        case PAGES:
            contentHandler = new PagesContentHandler(xhtml, metadata);
            break;
        case ENCRYPTED:
            // We can't do anything for the file right now
            contentHandler = null;
            break;
        default:
            throw new TikaException("Unhandled iWorks file " + type);
        }

        metadata.add(Metadata.CONTENT_TYPE, type.getType().toString());
        xhtml.startDocument();
        if (contentHandler != null) {
            // The shield stops the SAX parser from closing the shared zip stream
            InputStream shielded = new CloseShieldInputStream(entryStream);
            context.getSAXParser().parse(shielded, new OfflineContentHandler(contentHandler));
        }
        xhtml.endDocument();
    }
    // Don't close the zip InputStream (TIKA-1117).
}

From source file:org.apache.tika.parser.microsoft.OfficeParser.java

/**
 * Extracts properties and text from an MS Document input stream.
 *
 * <p>The POIFS root directory is obtained in one of three ways: from a file
 * system opened here on the plain stream (closed again in the
 * {@code finally} block), from a container already attached to the
 * {@code TikaInputStream}, or from a file system opened here and attached to
 * the {@code TikaInputStream}.
 *
 * @param stream   the document to parse
 * @param handler  receives the XHTML events
 * @param metadata passed to the XHTML handler and to the root-directory parse
 * @param context  the parse context; read for the {@code OfficeParserConfig}
 * @throws IOException   if the document cannot be read
 * @throws SAXException  if the handler reports an error
 * @throws TikaException if parsing fails
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {

    configure(context);
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();

    final DirectoryNode root;
    // NOTE(review): cast() presumably returns null when stream is not a TikaInputStream - confirm
    TikaInputStream tstream = TikaInputStream.cast(stream);
    // Only set when this method opens a file system that nothing else will close
    NPOIFSFileSystem mustCloseFs = null;
    try {
        if (tstream == null) {
            // Plain stream: shield it so closing the file system leaves the caller's stream open
            mustCloseFs = new NPOIFSFileSystem(new CloseShieldInputStream(stream));
            root = mustCloseFs.getRoot();
        } else {
            // Reuse a container that was already opened on this stream, if there is one
            final Object container = tstream.getOpenContainer();
            if (container instanceof NPOIFSFileSystem) {
                root = ((NPOIFSFileSystem) container).getRoot();
            } else if (container instanceof DirectoryNode) {
                root = (DirectoryNode) container;
            } else {
                NPOIFSFileSystem fs = null;
                if (tstream.hasFile()) {
                    fs = new NPOIFSFileSystem(tstream.getFile(), true);
                } else {
                    fs = new NPOIFSFileSystem(new CloseShieldInputStream(tstream));
                }
                //tstream will close the fs, no need to close this below
                tstream.setOpenContainer(fs);
                root = fs.getRoot();

            }
        }
        parse(root, context, metadata, xhtml);
        // NOTE(review): presumably configure(context) guarantees this is non-null - confirm
        OfficeParserConfig officeParserConfig = context.get(OfficeParserConfig.class);

        if (officeParserConfig.getExtractMacros()) {
            //now try to get macros
            extractMacros(root.getNFileSystem(), xhtml,
                    EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context));
        }
    } finally {
        // No-op unless the file system was opened on a plain (non-Tika) stream above
        IOUtils.closeQuietly(mustCloseFs);
    }
    // Only reached when parsing completed without an exception
    xhtml.endDocument();
}

From source file:org.apache.tika.parser.microsoft.ooxml.OOXMLExtractorFactory.java

/**
 * Parses an OOXML document: locates or opens its {@code OPCPackage}, picks a
 * POI text extractor for it, wraps that extractor in the matching Tika
 * decorator, and extracts metadata followed by XHTML content.
 *
 * <p>Documents whose type is not detected, or is listed in
 * {@code OOXMLParser.UNSUPPORTED_OOXML_TYPES}, are delegated to
 * {@code EmptyParser}.
 *
 * @param stream      the document to parse
 * @param baseHandler receives the XHTML events
 * @param metadata    updated with the detected content type and the
 *                    extracted document metadata
 * @param context     read for the {@code Locale} and the {@code OfficeParserConfig}
 * @throws IOException   if the document cannot be read
 * @throws SAXException  if the handler reports an error
 * @throws TikaException if no extractor can be created for the document
 */
public static void parse(InputStream stream, ContentHandler baseHandler, Metadata metadata,
        ParseContext context) throws IOException, SAXException, TikaException {
    Locale locale = context.get(Locale.class, Locale.getDefault());
    ExtractorFactory.setThreadPrefersEventExtractors(true);

    try {
        // Locate or Open the OPCPackage for the file
        OPCPackage opcPackage;
        TikaInputStream tis = TikaInputStream.cast(stream);
        if (tis != null && tis.getOpenContainer() instanceof OPCPackage) {
            opcPackage = (OPCPackage) tis.getOpenContainer();
        } else if (tis != null && tis.hasFile()) {
            opcPackage = OPCPackage.open(tis.getFile().getPath(), PackageAccess.READ);
            tis.setOpenContainer(opcPackage);
        } else {
            // Shield the caller's stream from being closed by the package
            opcPackage = OPCPackage.open(new CloseShieldInputStream(stream));
        }

        // Get the type, and ensure it's one we handle
        MediaType detectedType = ZipContainerDetector.detectOfficeOpenXML(opcPackage);
        if (detectedType == null || OOXMLParser.UNSUPPORTED_OOXML_TYPES.contains(detectedType)) {
            // Not a supported type, delegate to Empty Parser
            EmptyParser.INSTANCE.parse(stream, baseHandler, metadata, context);
            return;
        }
        metadata.set(Metadata.CONTENT_TYPE, detectedType.toString());

        // This has already been set by OOXMLParser's call to configure()
        // We can rely on this being non-null.
        OfficeParserConfig config = context.get(OfficeParserConfig.class);

        // Have the appropriate OOXML text extractor picked: SAX docx, then SAX pptx, then POI's default
        POIXMLTextExtractor poiExtractor = config.getUseSAXDocxExtractor() ? trySXWPF(opcPackage) : null;
        if (poiExtractor == null && config.getUseSAXPptxExtractor()) {
            poiExtractor = trySXSLF(opcPackage);
        }
        if (poiExtractor == null) {
            poiExtractor = ExtractorFactory.createExtractor(opcPackage);
        }

        // Event-based extractors are matched by extractor class, the rest by document class
        POIXMLDocument document = poiExtractor.getDocument();
        OOXMLExtractor extractor;
        if (poiExtractor instanceof XSSFBEventBasedExcelExtractor) {
            extractor = new XSSFBExcelExtractorDecorator(context, poiExtractor, locale);
        } else if (poiExtractor instanceof XSSFEventBasedExcelExtractor) {
            extractor = new XSSFExcelExtractorDecorator(context, poiExtractor, locale);
        } else if (poiExtractor instanceof XWPFEventBasedWordExtractor) {
            extractor = new SXWPFWordExtractorDecorator(metadata, context,
                    (XWPFEventBasedWordExtractor) poiExtractor);
            metadata.add("X-Parsed-By", XWPFEventBasedWordExtractor.class.getCanonicalName());
        } else if (poiExtractor instanceof XSLFEventBasedPowerPointExtractor) {
            extractor = new SXSLFPowerPointExtractorDecorator(metadata, context,
                    (XSLFEventBasedPowerPointExtractor) poiExtractor);
            metadata.add("X-Parsed-By", XSLFEventBasedPowerPointExtractor.class.getCanonicalName());
        } else if (document == null) {
            throw new TikaException(
                    "Expecting UserModel based POI OOXML extractor with a document, but none found. "
                            + "The extractor returned was a " + poiExtractor);
        } else if (document instanceof XMLSlideShow) {
            extractor = new XSLFPowerPointExtractorDecorator(context,
                    (org.apache.poi.xslf.extractor.XSLFPowerPointExtractor) poiExtractor);
        } else if (document instanceof XWPFDocument) {
            extractor = new XWPFWordExtractorDecorator(context, (XWPFWordExtractor) poiExtractor);
        } else {
            extractor = new POIXMLTextExtractorDecorator(context, poiExtractor);
        }

        // Get the bulk of the metadata first, so that it's accessible during
        //  parsing if desired by the client (see TIKA-1109)
        extractor.getMetadataExtractor().extract(metadata);

        // Extract the text, along with any in-document metadata
        extractor.getXHTML(baseHandler, metadata, context);
    } catch (IllegalArgumentException e) {
        String message = e.getMessage();
        if (message != null && message.startsWith("No supported documents found")) {
            throw new TikaException(
                    "TIKA-418: RuntimeException while getting content" + " for thmx and xps file types", e);
        }
        throw new TikaException("Error creating OOXML extractor", e);
    } catch (OpenXML4JException | XmlException e) {
        // InvalidFormatException is an OpenXML4JException, so it is handled here as well
        throw new TikaException("Error creating OOXML extractor", e);
    }
}

From source file:org.apache.tika.parser.microsoft.ooxml.SXSLFPowerPointExtractorDecorator.java

/**
 * Parses every comment-authors part related to the main document with an
 * {@code XSLFCommentAuthorHandler}.
 *
 * <p>This is best effort: any failure while resolving or parsing a part is
 * recorded in the metadata under
 * {@code TikaCoreProperties.TIKA_META_EXCEPTION_WARNING} and processing
 * continues with the next part.
 */
private void loadCommentAuthors() {
    PackageRelationshipCollection prc = null;
    try {
        prc = mainDocument.getRelationshipsByType(XSLFRelation.COMMENT_AUTHORS.getRelation());
    } catch (InvalidFormatException e) {
        metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ExceptionUtils.getStackTrace(e));
    }
    // Nothing to do when the lookup failed or there are no comment-author relationships
    if (prc == null || prc.size() == 0) {
        return;
    }

    for (int i = 0; i < prc.size(); i++) {
        PackagePart commentAuthorsPart = null;
        try {
            commentAuthorsPart = mainDocument.getRelatedPart(prc.getRelationship(i));
        } catch (InvalidFormatException e) {
            metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ExceptionUtils.getStackTrace(e));
        }
        if (commentAuthorsPart == null) {
            continue;
        }
        // The part's stream is closed by try-with-resources; the shield keeps the
        // SAX parser from closing it first.
        try (InputStream stream = commentAuthorsPart.getInputStream()) {
            context.getSAXParser().parse(new CloseShieldInputStream(stream),
                    new OfflineContentHandler(new XSLFCommentAuthorHandler()));

        } catch (TikaException | SAXException | IOException e) {
            metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ExceptionUtils.getStackTrace(e));
        }
    }

}

From source file:org.apache.tika.parser.microsoft.ooxml.SXSLFPowerPointExtractorDecorator.java

/**
 * Writes the content of one slide: the slide's own text inside a
 * {@code div} with class {@code slide-content}, followed by its related
 * layout, notes, notes-master and comments parts.
 *
 * <p>A {@code TikaException} raised while parsing the slide itself is
 * recorded in the metadata under
 * {@code TikaCoreProperties.TIKA_META_EXCEPTION_WARNING} instead of being
 * propagated.
 *
 * @param slidePart the package part holding the slide
 * @param xhtml     receives the XHTML events
 * @throws IOException  if the slide part cannot be read
 * @throws SAXException if the handler or the SAX parser reports an error
 */
private void handleSlidePart(PackagePart slidePart, XHTMLContentHandler xhtml)
        throws IOException, SAXException {
    Map<String, String> linkedRelationships = loadLinkedRelationships(slidePart, false, metadata);

    xhtml.startElement("div", "class", "slide-content");
    try (InputStream slideStream = slidePart.getInputStream()) {
        // The shield keeps the SAX parser from closing the part's stream itself
        InputStream shielded = new CloseShieldInputStream(slideStream);
        OOXMLWordAndPowerPointTextHandler slideText = new OOXMLWordAndPowerPointTextHandler(
                new OOXMLTikaBodyPartHandler(xhtml), linkedRelationships);
        context.getSAXParser().parse(shielded,
                new OfflineContentHandler(new EmbeddedContentHandler(slideText)));
    } catch (TikaException e) {
        metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ExceptionUtils.getStackTrace(e));
    }
    xhtml.endElement("div");

    // Slide layout: same text handler, wrapped in a PlaceHolderSkipper
    handleBasicRelatedParts(XSLFRelation.SLIDE_LAYOUT.getRelation(), "slide-master-content", slidePart,
            new PlaceHolderSkipper(new OOXMLWordAndPowerPointTextHandler(
                    new OOXMLTikaBodyPartHandler(xhtml), linkedRelationships)));

    // Notes and notes master
    handleBasicRelatedParts(XSLFRelation.NOTES.getRelation(), "slide-notes", slidePart,
            new OOXMLWordAndPowerPointTextHandler(
                    new OOXMLTikaBodyPartHandler(xhtml), linkedRelationships));
    handleBasicRelatedParts(XSLFRelation.NOTES_MASTER.getRelation(), "slide-notes-master", slidePart,
            new OOXMLWordAndPowerPointTextHandler(
                    new OOXMLTikaBodyPartHandler(xhtml), linkedRelationships));

    // Comments have their own handler and no section class
    handleBasicRelatedParts(XSLFRelation.COMMENTS.getRelation(), null, slidePart,
            new XSLFCommentsHandler(xhtml));

}