Example usage of the org.apache.commons.io.input.CloseShieldInputStream constructor CloseShieldInputStream(InputStream)

Introduction

On this page you can find example usages of the org.apache.commons.io.input.CloseShieldInputStream constructor CloseShieldInputStream(InputStream).

Prototype

public CloseShieldInputStream(InputStream in)

Source Link

Document

Creates a proxy that shields the given input stream from being closed.

Usage

From source file:org.apache.tika.parser.microsoft.ooxml.SXWPFWordExtractorDecorator.java

/**
 * Parses a single package part with the context's SAX parser, routing the output to
 * {@code xhtml} through the OOXML text handler chain.
 *
 * <p>The part's stream is handed to the parser behind a {@link CloseShieldInputStream};
 * the try-with-resources statement, not the parser, closes the underlying stream. A
 * {@link TikaException} is not propagated: its stack trace is added to the metadata under
 * {@code TIKA_META_EXCEPTION_WARNING} instead.
 *
 * @param packagePart the part whose stream is parsed
 * @param styles      passed through to the body part handler
 * @param listManager passed through to the body part handler
 * @param xhtml       passed through to the body part handler
 * @throws IOException  declared for the stream and parser operations
 * @throws SAXException declared for the SAX parse
 */
private void handlePart(PackagePart packagePart, XWPFStylesShim styles, XWPFListManager listManager,
        XHTMLContentHandler xhtml) throws IOException, SAXException {
    final Map<String, String> relationships = loadLinkedRelationships(packagePart, true, metadata);

    try (InputStream partStream = packagePart.getInputStream()) {
        context.getSAXParser().parse(
                new CloseShieldInputStream(partStream),
                new OfflineContentHandler(
                        new EmbeddedContentHandler(
                                new OOXMLWordAndPowerPointTextHandler(
                                        new OOXMLTikaBodyPartHandler(xhtml, styles, listManager, config),
                                        relationships,
                                        config.getIncludeShapeBasedContent()))));
    } catch (TikaException parseFailure) {
        // Record the failure as a metadata warning instead of aborting the caller.
        metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
                ExceptionUtils.getStackTrace(parseFailure));
    }
}

From source file:org.apache.tika.parser.microsoft.ooxml.xps.XPSExtractorDecorator.java

/**
 * SAX-parses the package part referenced by {@code packageRelationship}, sending events to a
 * {@code FixedDocSeqHandler} that wraps {@code xhtml}.
 *
 * <p>The parser only ever sees a {@link CloseShieldInputStream}; the part's real stream is
 * closed by the try-with-resources statement.
 *
 * @param packageRelationship relationship used to look up the part in {@code pkg}
 * @param xhtml               handler wrapped by the {@code FixedDocSeqHandler}
 */
private void handleDocuments(PackageRelationship packageRelationship, XHTMLContentHandler xhtml)
        throws IOException, SAXException, TikaException {
    try (InputStream partStream = pkg.getPart(packageRelationship).getInputStream()) {
        CloseShieldInputStream shielded = new CloseShieldInputStream(partStream);
        OfflineContentHandler offlineHandler =
                new OfflineContentHandler(new EmbeddedContentHandler(new FixedDocSeqHandler(xhtml)));
        XMLReaderUtils.parseSAX(shielded, offlineHandler, context);
    }
}

From source file:org.apache.tika.parser.microsoft.ooxml.xwpf.ml2006.Word2006MLParser.java

/**
 * Parses {@code stream} as XML with the context's SAX parser and writes the result as XHTML
 * to {@code handler}.
 *
 * <p>The input is wrapped in a {@link CloseShieldInputStream} so the SAX parser cannot close
 * the caller's stream. A {@link SAXException} from the parse is rethrown as a
 * {@link TikaException}; {@code endDocument()} is issued whether or not parsing succeeds.
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
    // Set an OfficeParserConfig if the user hasn't specified one.
    configure(context);

    final XHTMLContentHandler xhtmlOut = new XHTMLContentHandler(handler, metadata);
    xhtmlOut.startDocument();
    try {
        context.getSAXParser().parse(
                new CloseShieldInputStream(stream),
                new OfflineContentHandler(
                        new EmbeddedContentHandler(
                                new Word2006MLDocHandler(xhtmlOut, metadata, context))));
    } catch (SAXException saxFailure) {
        throw new TikaException("XML parse error", saxFailure);
    } finally {
        xhtmlOut.endDocument();
    }
}

From source file:org.apache.tika.parser.microsoft.ooxml.xwpf.Word2006MLParser.java

/**
 * Parses {@code stream} as XML with the context's SAX parser and writes the result as XHTML
 * to {@code handler}.
 *
 * <p>The input is wrapped in a {@link CloseShieldInputStream} so the SAX parser cannot close
 * the caller's stream. A {@link SAXException} from the parse is rethrown as a
 * {@link TikaException}; {@code endDocument()} is issued whether or not parsing succeeds.
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
    final XHTMLContentHandler xhtmlOut = new XHTMLContentHandler(handler, metadata);
    xhtmlOut.startDocument();
    try {
        context.getSAXParser().parse(
                new CloseShieldInputStream(stream),
                new OfflineContentHandler(
                        new EmbeddedContentHandler(
                                new Word2006MLHandler(xhtmlOut, metadata, context))));
    } catch (SAXException saxFailure) {
        throw new TikaException("XML parse error", saxFailure);
    } finally {
        xhtmlOut.endDocument();
    }
}

From source file:org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor.java

/**
 * Parses a single package part and appends the extracted text to {@code buffer}.
 *
 * <p>The part's stream is given to the XML reader behind a {@link CloseShieldInputStream};
 * the try-with-resources statement, not the reader, closes the underlying stream.
 *
 * @param packagePart     the part whose stream is parsed
 * @param xwpfListManager not used by this method
 * @param buffer          receives the text produced by the content handler
 * @throws IOException  declared for the stream and reader operations
 * @throws SAXException if parsing fails, or if no XML reader could be configured (the
 *                      {@link ParserConfigurationException} is attached as the cause)
 */
private void handlePart(PackagePart packagePart, XWPFListManager xwpfListManager, StringBuilder buffer)
        throws IOException, SAXException {

    Map<String, String> hyperlinks = loadHyperlinkRelationships(packagePart);
    try (InputStream stream = packagePart.getInputStream()) {
        XMLReader reader = SAXHelper.newXMLReader();
        reader.setContentHandler(
                new OOXMLWordAndPowerPointTextHandler(new XWPFToTextContentHandler(buffer), hyperlinks));
        reader.parse(new InputSource(new CloseShieldInputStream(stream)));
    } catch (ParserConfigurationException e) {
        // Without a reader nothing was parsed; surface that to the caller instead of
        // printing a stack trace and returning as if the part had been extracted.
        throw new SAXException("Unable to configure XML reader", e);
    }

}

From source file:org.apache.tika.parser.microsoft.xml.AbstractXML2003Parser.java

/**
 * Parses {@code stream} as XML with the context's SAX parser, using the content handler
 * supplied by {@code getContentHandler}, and writes XHTML to {@code handler}.
 *
 * <p>The input is wrapped in a {@link CloseShieldInputStream} so the SAX parser cannot close
 * the caller's stream. On a {@link SAXException}, {@code throwIfCauseOf} on the tagged
 * handler is consulted first; otherwise the exception is rethrown as a
 * {@link TikaException}. {@code endDocument()} is issued in all cases.
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
    setContentType(metadata);

    final XHTMLContentHandler xhtmlOut = new XHTMLContentHandler(handler, metadata);
    xhtmlOut.startDocument();

    TaggedContentHandler taggedOut = new TaggedContentHandler(xhtmlOut);
    try {
        context.getSAXParser().parse(
                new CloseShieldInputStream(stream),
                new OfflineContentHandler(
                        new EmbeddedContentHandler(
                                getContentHandler(taggedOut, metadata, context))));
    } catch (SAXException saxFailure) {
        // Give the tagged handler the first chance to rethrow this exception.
        taggedOut.throwIfCauseOf(saxFailure);
        throw new TikaException("XML parse error", saxFailure);
    } finally {
        xhtmlOut.endDocument();
    }
}

From source file:org.apache.tika.parser.odf.OpenDocumentContentParser.java

/**
 * SAX-parses {@code stream}, sending events through a namespace normalizer into an
 * element-mapping handler built on {@code handler} and {@code MAPPINGS}.
 *
 * <p>The parser receives a {@link CloseShieldInputStream}, so it cannot close the caller's
 * stream.
 */
void parseInternal(InputStream stream, final ContentHandler handler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
    DefaultHandler mappingHandler = new OpenDocumentElementMappingContentHandler(handler, MAPPINGS);
    SAXParser saxParser = context.getSAXParser();

    CloseShieldInputStream shielded = new CloseShieldInputStream(stream);
    OfflineContentHandler offlineHandler =
            new OfflineContentHandler(new NSNormalizerContentHandler(mappingHandler));
    saxParser.parse(shielded, offlineHandler);
}

From source file:org.apache.tika.parser.pdf.PDFParser.java

/**
 * Loads a PDF with PDFBox, records its metadata and, when {@code handler} is non-null,
 * emits its content.
 *
 * <p>If {@code stream} is a {@code TikaInputStream} that has a file, the file is passed to
 * PDFBox directly; otherwise PDFBox reads from the stream behind a
 * {@link CloseShieldInputStream}, so it cannot close the caller's stream. An
 * {@code InvalidPasswordException} marks the document as encrypted in the metadata and is
 * rethrown as an {@code EncryptedDocumentException}. The loaded document is always closed.
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
    PDFParserConfig localConfig = context.get(PDFParserConfig.class, defaultConfig);
    PDDocument pdfDocument = null;

    try {
        // PDFBox can process entirely in memory, or can use a temp file for
        // unpacked / processed resources; which one is used depends on whether
        // the input is already backed by a file.
        //TODO: make this configurable via MemoryUsageSetting
        TikaInputStream tstream = TikaInputStream.cast(stream);
        String password = getPassword(metadata, context);
        boolean fileBacked = tstream != null && tstream.hasFile();
        pdfDocument = fileBacked
                ? PDDocument.load(tstream.getPath().toFile(), password)
                : PDDocument.load(new CloseShieldInputStream(stream), password);

        metadata.set(PDF.IS_ENCRYPTED, Boolean.toString(pdfDocument.isEncrypted()));
        metadata.set(Metadata.CONTENT_TYPE, MEDIA_TYPE.toString());
        extractMetadata(pdfDocument, metadata, context);

        AccessChecker checker = localConfig.getAccessChecker();
        checker.check(metadata);

        if (handler == null) {
            // Metadata-only request: nothing to emit.
            return;
        }

        if (shouldHandleXFAOnly(pdfDocument, localConfig)) {
            handleXFAOnly(pdfDocument, handler, metadata, context);
        } else if (localConfig.getOcrStrategy().equals(PDFParserConfig.OCR_STRATEGY.OCR_ONLY)) {
            metadata.add("X-Parsed-By", TesseractOCRParser.class.toString());
            OCR2XHTML.process(pdfDocument, handler, context, metadata, localConfig);
        } else {
            if (localConfig.getOcrStrategy().equals(PDFParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION)) {
                metadata.add("X-Parsed-By", TesseractOCRParser.class.toString());
            }
            PDF2XHTML.process(pdfDocument, handler, context, metadata, localConfig);
        }
    } catch (InvalidPasswordException badPassword) {
        metadata.set(PDF.IS_ENCRYPTED, "true");
        throw new EncryptedDocumentException(badPassword);
    } finally {
        if (pdfDocument != null) {
            pdfDocument.close();
        }
    }
}

From source file:org.apache.tika.parser.pdf18.PDFParser.java

/**
 * Loads a PDF from {@code stream} with PDFBox, records its metadata and, when
 * {@code handler} is non-null, emits its content via {@code PDF2XHTML} (or the XFA-only
 * path).
 *
 * <p>PDFBox always reads through a {@link CloseShieldInputStream}, so it cannot close the
 * caller's stream. Whether the non-sequential or the classic loader is used is decided by
 * {@code localConfig.getUseNonSequentialParser()}. Bad-password failures from either loader
 * are rethrown as {@code EncryptedDocumentException}. The document, the temporary
 * resources and the PDFBox font resources are released in the {@code finally} block.
 *
 * @param stream   the PDF input
 * @param handler  receives the extracted content; may be {@code null} to skip content
 *                 extraction
 * @param metadata receives {@code pdf:encrypted}, the content type and the extracted
 *                 metadata
 * @param context  supplies an optional {@code PDFParserConfig}
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {

    PDDocument pdfDocument = null;
    TemporaryResources tmp = new TemporaryResources();
    //config from context, or default if not set via context
    PDFParserConfig localConfig = context.get(PDFParserConfig.class, defaultConfig);
    String password = "";
    try {
        // PDFBox can process entirely in memory, or can use a temp file
        //  for unpacked / processed resources
        // Decide which to do based on if we're reading from a file or not already
        TikaInputStream tstream = TikaInputStream.cast(stream);
        password = getPassword(metadata, context);
        if (tstream != null && tstream.hasFile()) {
            // File based, take that as a cue to use a temporary file
            // NOTE(review): scratchFile is not closed in this method; presumably
            // pdfDocument.close() or tmp.dispose() releases it -- confirm.
            RandomAccess scratchFile = new RandomAccessFile(tmp.createTemporaryFile(), "rw");
            if (localConfig.getUseNonSequentialParser() == true) {
                pdfDocument = PDDocument.loadNonSeq(new CloseShieldInputStream(stream), scratchFile, password);
            } else {
                pdfDocument = PDDocument.load(new CloseShieldInputStream(stream), scratchFile, true);
            }
        } else {
            // Go for the normal, stream based in-memory parsing
            if (localConfig.getUseNonSequentialParser() == true) {
                pdfDocument = PDDocument.loadNonSeq(new CloseShieldInputStream(stream),
                        new RandomAccessBuffer(), password);
            } else {
                pdfDocument = PDDocument.load(new CloseShieldInputStream(stream), true);
            }
        }
        metadata.set("pdf:encrypted", Boolean.toString(pdfDocument.isEncrypted()));

        //if using the classic parser and the doc is encrypted, we must manually decrypt
        if (!localConfig.getUseNonSequentialParser() && pdfDocument.isEncrypted()) {
            pdfDocument.decrypt(password);
        }

        metadata.set(Metadata.CONTENT_TYPE, "application/pdf");
        extractMetadata(pdfDocument, metadata);

        // Run the configured access check against the metadata collected so far,
        // before any content is emitted.
        AccessChecker checker = localConfig.getAccessChecker();
        checker.check(metadata);
        if (handler != null) {
            if (shouldHandleXFAOnly(pdfDocument, localConfig)) {
                handleXFAOnly(pdfDocument, handler, metadata);
            } else {
                PDF2XHTML.process(pdfDocument, handler, context, metadata, localConfig);
            }
        }

    } catch (CryptographyException e) {
        //seq parser throws CryptographyException for bad password
        throw new EncryptedDocumentException(e);
    } catch (IOException e) {
        //nonseq parser throws IOException for bad password
        //At the Tika level, we want the same exception to be thrown
        if (e.getMessage() != null && e.getMessage().contains("Error (CryptographyException)")) {
            metadata.set("pdf:encrypted", Boolean.toString(true));
            throw new EncryptedDocumentException(e);
        }
        //rethrow any other IOExceptions
        throw e;
    } finally {
        // Release the document first, then the temporary resources created above.
        if (pdfDocument != null) {
            pdfDocument.close();
        }
        tmp.dispose();
        //TODO: once we migrate to PDFBox 2.0, remove this (PDFBOX-2200)
        PDFont.clearResources();
    }
}

From source file:org.apache.tika.parser.pkg.CompressorParser.java

/**
 * Decompresses {@code stream} and hands the uncompressed content to the embedded document
 * extractor.
 *
 * <p>The caller's stream is always shielded with a {@link CloseShieldInputStream}: the
 * compression stream is closed at the end to release its resources, but the underlying
 * document stream must stay open. A {@link BufferedInputStream} is added when the input
 * does not support mark. The embedded resource name is derived from the outer resource
 * name by stripping or replacing the compression suffix.
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
    final boolean markable = stream.markSupported();
    stream = new CloseShieldInputStream(stream);
    if (!markable) {
        // Ensure that the stream supports the mark feature
        stream = new BufferedInputStream(stream);
    }

    CompressorInputStream cis;
    try {
        CompressorParserOptions fallbackOptions = new CompressorParserOptions() {
            @Override
            public boolean decompressConcatenated(Metadata metadata) {
                return false;
            }
        };
        CompressorParserOptions options = context.get(CompressorParserOptions.class, fallbackOptions);
        TikaCompressorStreamFactory factory = new TikaCompressorStreamFactory(
                options.decompressConcatenated(metadata), memoryLimitInKb);
        cis = factory.createCompressorInputStream(stream);
    } catch (CompressorException e) {
        String message = e.getMessage();
        if (message != null && message.startsWith("MemoryLimitException:")) {
            throw new TikaMemoryLimitException(e.getMessage());
        }
        throw new TikaException("Unable to uncompress document stream", e);
    }

    MediaType type = getMediaType(cis);
    if (!type.equals(MediaType.OCTET_STREAM)) {
        metadata.set(CONTENT_TYPE, type.toString());
    }

    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();

    try {
        Metadata entrydata = new Metadata();
        String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
        if (name != null) {
            // The suffixes below are mutually exclusive, so branches that strip the
            // same number of characters are grouped together.
            final int length = name.length();
            if (name.endsWith(".tbz")) {
                name = name.substring(0, length - 4) + ".tar";
            } else if (name.endsWith(".tbz2")) {
                name = name.substring(0, length - 5) + ".tar";
            } else if (name.endsWith(".bz") || name.endsWith(".xz")) {
                name = name.substring(0, length - 3);
            } else if (name.endsWith(".bz2")) {
                name = name.substring(0, length - 4);
            } else if (name.endsWith(".zlib") || name.endsWith(".pack")) {
                name = name.substring(0, length - 5);
            } else if (length > 0) {
                name = GzipUtils.getUncompressedFilename(name);
            }
            entrydata.set(Metadata.RESOURCE_NAME_KEY, name);
        }

        // Use the delegate parser to parse the compressed document
        EmbeddedDocumentExtractor extractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
        if (extractor.shouldParseEmbedded(entrydata)) {
            extractor.parseEmbedded(cis, xhtml, entrydata, true);
        }
    } finally {
        cis.close();
    }

    xhtml.endDocument();
}