Example usage for the org.apache.commons.io.input.CloseShieldInputStream constructor CloseShieldInputStream(InputStream)

List of usage examples for the org.apache.commons.io.input.CloseShieldInputStream constructor CloseShieldInputStream(InputStream)

Introduction

This page collects usage examples for the constructor CloseShieldInputStream(InputStream) of the class org.apache.commons.io.input.CloseShieldInputStream.

Prototype

public CloseShieldInputStream(InputStream in) 

Source Link

Document

Creates a proxy that shields the given input stream from being closed.

Usage

From source file:org.apache.tika.parser.microsoft.ooxml.SXWPFWordExtractorDecorator.java

/**
 * Runs a SAX parse over the content of one package part, routing the events through
 * an {@code OOXMLWordAndPowerPointTextHandler} that writes into {@code xhtml}.
 *
 * <p>The part's stream is owned by the try-with-resources statement; the parser only
 * sees a {@link CloseShieldInputStream}, so a {@code close()} issued by the parser
 * does not reach the underlying stream.
 *
 * <p>A {@code TikaException} raised inside the try block is not propagated: its stack
 * trace is recorded in the metadata under
 * {@code TikaCoreProperties.TIKA_META_EXCEPTION_WARNING}.
 *
 * @param packagePart the part whose stream is parsed
 * @param styles      passed to the body part handler
 * @param listManager passed to the body part handler
 * @param xhtml       destination for the generated content
 * @throws IOException  if the part stream cannot be read
 * @throws SAXException if the SAX parse fails
 */
private void handlePart(PackagePart packagePart, XWPFStylesShim styles, XWPFListManager listManager,
        XHTMLContentHandler xhtml) throws IOException, SAXException {
    final Map<String, String> relationships = loadLinkedRelationships(packagePart, true, metadata);

    try (InputStream partStream = packagePart.getInputStream()) {
        // Same order as a left-to-right evaluation of the single nested call:
        // parser first, then the shielded stream, then the handler chain.
        final javax.xml.parsers.SAXParser saxParser = context.getSAXParser();
        final InputStream shielded = new CloseShieldInputStream(partStream);

        final OOXMLTikaBodyPartHandler bodyHandler =
                new OOXMLTikaBodyPartHandler(xhtml, styles, listManager, config);
        final OOXMLWordAndPowerPointTextHandler textHandler = new OOXMLWordAndPowerPointTextHandler(
                bodyHandler, relationships, config.getIncludeShapeBasedContent());
        final OfflineContentHandler offlineHandler =
                new OfflineContentHandler(new EmbeddedContentHandler(textHandler));

        saxParser.parse(shielded, offlineHandler);
    } catch (TikaException failure) {
        // Keep going: the problem is reported as a metadata warning instead of an exception.
        final String trace = ExceptionUtils.getStackTrace(failure);
        metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, trace);
    }
}

From source file:org.apache.tika.parser.microsoft.ooxml.xps.XPSExtractorDecorator.java

/**
 * Parses the package part referenced by {@code packageRelationship} with
 * {@code XMLReaderUtils.parseSAX}, feeding the events to a {@code FixedDocSeqHandler}
 * built around {@code xhtml}.
 *
 * <p>The part stream is closed by the try-with-resources statement; the parser is
 * handed a {@link CloseShieldInputStream} wrapper so it cannot close the stream itself.
 *
 * @param packageRelationship relationship identifying the part to read from {@code pkg}
 * @param xhtml               destination for the generated content
 * @throws IOException   if the part stream cannot be read
 * @throws SAXException  if the SAX parse fails
 * @throws TikaException if raised by the parsing utility
 */
private void handleDocuments(PackageRelationship packageRelationship, XHTMLContentHandler xhtml)
        throws IOException, SAXException, TikaException {
    try (InputStream partStream = pkg.getPart(packageRelationship).getInputStream()) {
        final InputStream shielded = new CloseShieldInputStream(partStream);
        final OfflineContentHandler offlineHandler =
                new OfflineContentHandler(new EmbeddedContentHandler(new FixedDocSeqHandler(xhtml)));
        XMLReaderUtils.parseSAX(shielded, offlineHandler, context);
    }
}

From source file:org.apache.tika.parser.microsoft.ooxml.xwpf.ml2006.Word2006MLParser.java

/**
 * Parses the stream as XML and emits the result as XHTML through {@code handler}.
 *
 * <p>{@code configure(context)} is invoked first so that an {@code OfficeParserConfig}
 * is set when the caller has not supplied one. The XHTML document is always ended,
 * even when parsing fails. The caller's stream is wrapped in a
 * {@link CloseShieldInputStream}, so the SAX parser cannot close it.
 *
 * @throws IOException   if the stream cannot be read
 * @throws SAXException  if starting or ending the XHTML document fails
 * @throws TikaException wrapping any {@code SAXException} raised inside the parse
 *                       step, with the message {@code "XML parse error"}
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
    // Set OfficeParserConfig if the user hasn't specified one.
    configure(context);

    final XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();

    try {
        // Parser, shielded stream and handler chain are obtained in call-evaluation order.
        final javax.xml.parsers.SAXParser saxParser = context.getSAXParser();
        final InputStream shielded = new CloseShieldInputStream(stream);
        final Word2006MLDocHandler docHandler = new Word2006MLDocHandler(xhtml, metadata, context);
        final OfflineContentHandler offlineHandler =
                new OfflineContentHandler(new EmbeddedContentHandler(docHandler));
        saxParser.parse(shielded, offlineHandler);
    } catch (SAXException cause) {
        throw new TikaException("XML parse error", cause);
    } finally {
        xhtml.endDocument();
    }
}

From source file:org.apache.tika.parser.microsoft.ooxml.xwpf.Word2006MLParser.java

/**
 * Parses the stream as XML and emits the result as XHTML through {@code handler}.
 *
 * <p>The XHTML document is started before the SAX parse and always ended afterwards,
 * including when parsing fails. The caller's stream is wrapped in a
 * {@link CloseShieldInputStream}, so the SAX parser cannot close it.
 *
 * @throws IOException   if the stream cannot be read
 * @throws SAXException  if starting or ending the XHTML document fails
 * @throws TikaException wrapping any {@code SAXException} raised inside the parse
 *                       step, with the message {@code "XML parse error"}
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
    final XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();

    try {
        // Parser, shielded stream and handler chain are obtained in call-evaluation order.
        final javax.xml.parsers.SAXParser saxParser = context.getSAXParser();
        final InputStream shielded = new CloseShieldInputStream(stream);
        final Word2006MLHandler mlHandler = new Word2006MLHandler(xhtml, metadata, context);
        final OfflineContentHandler offlineHandler =
                new OfflineContentHandler(new EmbeddedContentHandler(mlHandler));
        saxParser.parse(shielded, offlineHandler);
    } catch (SAXException cause) {
        throw new TikaException("XML parse error", cause);
    } finally {
        xhtml.endDocument();
    }
}

From source file:org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor.java

/**
 * Extracts the text of one package part into {@code buffer} by running an XML reader
 * over the part's content with an {@code OOXMLWordAndPowerPointTextHandler}.
 *
 * <p>The part stream is closed by the try-with-resources statement; the reader is
 * given a {@link CloseShieldInputStream} wrapper so it cannot close the stream itself.
 *
 * <p>A failure to create the XML reader is reported to the caller as a
 * {@link SAXException} carrying the original {@code ParserConfigurationException} as
 * its cause. Previously the exception was only printed to standard error, which made
 * the method return normally with nothing appended to {@code buffer}.
 *
 * @param packagePart     the part whose content is read
 * @param xwpfListManager not referenced by this method body
 * @param buffer          receives the extracted text
 * @throws IOException  if the part stream cannot be read
 * @throws SAXException if the parse fails or no XML reader could be configured
 */
private void handlePart(PackagePart packagePart, XWPFListManager xwpfListManager, StringBuilder buffer)
        throws IOException, SAXException {

    Map<String, String> hyperlinks = loadHyperlinkRelationships(packagePart);
    try (InputStream stream = packagePart.getInputStream()) {
        XMLReader reader = SAXHelper.newXMLReader();
        reader.setContentHandler(
                new OOXMLWordAndPowerPointTextHandler(new XWPFToTextContentHandler(buffer), hyperlinks));
        reader.parse(new InputSource(new CloseShieldInputStream(stream)));
    } catch (ParserConfigurationException e) {
        // Surface the configuration problem instead of printing it and silently
        // producing no text; SAXException is already part of this method's contract.
        throw new SAXException("Unable to configure an XML reader for the package part", e);
    }
}

From source file:org.apache.tika.parser.microsoft.xml.AbstractXML2003Parser.java

/**
 * Parses the stream as XML and emits the result as XHTML through {@code handler}.
 *
 * <p>The content type is recorded via {@code setContentType(metadata)} before any
 * output is produced. The XHTML handler is wrapped in a {@code TaggedContentHandler}
 * so that a {@code SAXException} originating from it can be rethrown via
 * {@code throwIfCauseOf}; any other {@code SAXException} from the parse step is
 * wrapped in a {@code TikaException}. The XHTML document is always ended.
 *
 * <p>The caller's stream is wrapped in a {@link CloseShieldInputStream}, so the SAX
 * parser cannot close it.
 *
 * @throws IOException   if the stream cannot be read
 * @throws SAXException  if raised by the XHTML handler
 * @throws TikaException with the message {@code "XML parse error"} for other parse failures
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
    setContentType(metadata);

    final XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();

    final TaggedContentHandler tagged = new TaggedContentHandler(xhtml);
    try {
        // Parser, shielded stream and handler chain are obtained in call-evaluation order.
        final javax.xml.parsers.SAXParser saxParser = context.getSAXParser();
        final InputStream shielded = new CloseShieldInputStream(stream);
        final OfflineContentHandler offlineHandler = new OfflineContentHandler(
                new EmbeddedContentHandler(getContentHandler(tagged, metadata, context)));
        saxParser.parse(shielded, offlineHandler);
    } catch (SAXException cause) {
        // Let the tagged handler rethrow first; otherwise report a parse error.
        tagged.throwIfCauseOf(cause);
        throw new TikaException("XML parse error", cause);
    } finally {
        xhtml.endDocument();
    }
}

From source file:org.apache.tika.parser.odf.OpenDocumentContentParser.java

/**
 * Runs a SAX parse over {@code stream}, mapping elements through an
 * {@code OpenDocumentElementMappingContentHandler} configured with {@code MAPPINGS}
 * and delivering the result to {@code handler}.
 *
 * <p>The stream is wrapped in a {@link CloseShieldInputStream}, so the parser cannot
 * close the caller's stream.
 *
 * @throws IOException   if the stream cannot be read
 * @throws SAXException  if the SAX parse fails
 * @throws TikaException if raised while obtaining the parser
 */
void parseInternal(InputStream stream, final ContentHandler handler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
    final DefaultHandler mappingHandler = new OpenDocumentElementMappingContentHandler(handler, MAPPINGS);

    final SAXParser saxParser = context.getSAXParser();
    final InputStream shielded = new CloseShieldInputStream(stream);
    final OfflineContentHandler offlineHandler =
            new OfflineContentHandler(new NSNormalizerContentHandler(mappingHandler));
    saxParser.parse(shielded, offlineHandler);
}

From source file:org.apache.tika.parser.pdf.PDFParser.java

/**
 * Loads a PDF from {@code stream}, records its metadata and, when {@code handler} is
 * not {@code null}, emits its content.
 *
 * <p>When the stream is a {@code TikaInputStream} backed by a file, the file is passed
 * to PDFBox directly; otherwise PDFBox reads from a {@link CloseShieldInputStream}
 * wrapper so that it cannot close the caller's stream. The loaded document is always
 * closed before returning.
 *
 * <p>Content is produced by one of three routes: XFA-only handling, OCR-only
 * processing, or regular text extraction. For the two OCR strategies an
 * {@code "X-Parsed-By"} entry naming {@code TesseractOCRParser} is added to the metadata.
 *
 * @throws IOException   if the document cannot be read
 * @throws SAXException  if content emission fails
 * @throws TikaException including {@code EncryptedDocumentException} when PDFBox
 *                       reports an invalid password
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {

    final PDFParserConfig localConfig = context.get(PDFParserConfig.class, defaultConfig);

    PDDocument pdfDocument = null;
    try {
        // PDFBox can work entirely in memory or from a file; choose based on whether
        // the input is already file-backed.
        //TODO: make this configurable via MemoryUsageSetting
        final TikaInputStream tstream = TikaInputStream.cast(stream);
        final String password = getPassword(metadata, context);
        pdfDocument = (tstream != null && tstream.hasFile())
                ? PDDocument.load(tstream.getPath().toFile(), password)
                : PDDocument.load(new CloseShieldInputStream(stream), password);

        metadata.set(PDF.IS_ENCRYPTED, Boolean.toString(pdfDocument.isEncrypted()));
        metadata.set(Metadata.CONTENT_TYPE, MEDIA_TYPE.toString());
        extractMetadata(pdfDocument, metadata, context);

        final AccessChecker checker = localConfig.getAccessChecker();
        checker.check(metadata);

        if (handler != null) {
            if (shouldHandleXFAOnly(pdfDocument, localConfig)) {
                handleXFAOnly(pdfDocument, handler, metadata, context);
            } else {
                final boolean ocrOnly =
                        localConfig.getOcrStrategy().equals(PDFParserConfig.OCR_STRATEGY.OCR_ONLY);
                // Both OCR strategies advertise the OCR parser in the metadata.
                if (ocrOnly || localConfig.getOcrStrategy()
                        .equals(PDFParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION)) {
                    metadata.add("X-Parsed-By", TesseractOCRParser.class.toString());
                }
                if (ocrOnly) {
                    OCR2XHTML.process(pdfDocument, handler, context, metadata, localConfig);
                } else {
                    PDF2XHTML.process(pdfDocument, handler, context, metadata, localConfig);
                }
            }
        }
    } catch (InvalidPasswordException badPassword) {
        metadata.set(PDF.IS_ENCRYPTED, "true");
        throw new EncryptedDocumentException(badPassword);
    } finally {
        if (pdfDocument != null) {
            pdfDocument.close();
        }
    }
}

From source file:org.apache.tika.parser.pdf18.PDFParser.java

/**
 * Loads a PDF from {@code stream}, records its metadata and, when {@code handler} is
 * not {@code null}, emits its content.
 *
 * <p>Two independent choices select the PDFBox load call: whether the input is a
 * file-backed {@code TikaInputStream} (a temporary scratch file is used) or not
 * (in-memory parsing), and whether the configuration asks for the non-sequential
 * parser. In every case PDFBox reads from a {@link CloseShieldInputStream} wrapper so
 * that it cannot close the caller's stream.
 *
 * <p>A bad password is reported uniformly as {@code EncryptedDocumentException},
 * whether it arrives as a {@code CryptographyException} or as an {@code IOException}
 * whose message contains {@code "Error (CryptographyException)"}. The document, the
 * temporary resources and the PDFBox font resources are always released.
 *
 * @throws IOException   if the document cannot be read
 * @throws SAXException  if content emission fails
 * @throws TikaException including {@code EncryptedDocumentException} for a bad password
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {

    PDDocument pdfDocument = null;
    final TemporaryResources tmp = new TemporaryResources();
    // Config from the context, or the default when the context carries none.
    final PDFParserConfig localConfig = context.get(PDFParserConfig.class, defaultConfig);
    try {
        // PDFBox can work entirely in memory or with a temp file for unpacked
        // resources; choose based on whether the input is already file-backed.
        final TikaInputStream tstream = TikaInputStream.cast(stream);
        final String password = getPassword(metadata, context);
        final boolean fileBacked = tstream != null && tstream.hasFile();

        if (fileBacked) {
            // File-based input is the cue to use a temporary scratch file.
            final RandomAccess scratchFile = new RandomAccessFile(tmp.createTemporaryFile(), "rw");
            if (localConfig.getUseNonSequentialParser()) {
                pdfDocument = PDDocument.loadNonSeq(new CloseShieldInputStream(stream), scratchFile, password);
            } else {
                pdfDocument = PDDocument.load(new CloseShieldInputStream(stream), scratchFile, true);
            }
        } else if (localConfig.getUseNonSequentialParser()) {
            // Stream-based, in-memory parsing with the non-sequential parser.
            pdfDocument = PDDocument.loadNonSeq(new CloseShieldInputStream(stream),
                    new RandomAccessBuffer(), password);
        } else {
            // Stream-based, in-memory parsing with the classic parser.
            pdfDocument = PDDocument.load(new CloseShieldInputStream(stream), true);
        }
        metadata.set("pdf:encrypted", Boolean.toString(pdfDocument.isEncrypted()));

        // The classic parser does not decrypt on load, so do it explicitly.
        if (!localConfig.getUseNonSequentialParser() && pdfDocument.isEncrypted()) {
            pdfDocument.decrypt(password);
        }

        metadata.set(Metadata.CONTENT_TYPE, "application/pdf");
        extractMetadata(pdfDocument, metadata);

        final AccessChecker checker = localConfig.getAccessChecker();
        checker.check(metadata);

        if (handler != null) {
            if (shouldHandleXFAOnly(pdfDocument, localConfig)) {
                handleXFAOnly(pdfDocument, handler, metadata);
            } else {
                PDF2XHTML.process(pdfDocument, handler, context, metadata, localConfig);
            }
        }

    } catch (CryptographyException e) {
        // The sequential parser signals a bad password with CryptographyException.
        throw new EncryptedDocumentException(e);
    } catch (IOException e) {
        // The non-sequential parser signals a bad password with an IOException;
        // at the Tika level the same exception type should surface for both.
        final boolean badPassword =
                e.getMessage() != null && e.getMessage().contains("Error (CryptographyException)");
        if (!badPassword) {
            // Any other IOException is passed through untouched.
            throw e;
        }
        metadata.set("pdf:encrypted", Boolean.toString(true));
        throw new EncryptedDocumentException(e);
    } finally {
        if (pdfDocument != null) {
            pdfDocument.close();
        }
        tmp.dispose();
        //TODO: once we migrate to PDFBox 2.0, remove this (PDFBOX-2200)
        PDFont.clearResources();
    }
}

From source file:org.apache.tika.parser.pkg.CompressorParser.java

/**
 * Opens {@code stream} as a compressed stream and hands the decompressed content to
 * the embedded document extractor.
 *
 * <p>The compression stream is closed at the end to release its resources, but the
 * caller's stream is protected by a {@link CloseShieldInputStream}. When the incoming
 * stream lacks mark support it is additionally buffered.
 *
 * <p>If the detected media type is not {@code MediaType.OCTET_STREAM} it is stored as
 * the content type. The embedded entry's resource name is derived from the outer
 * resource name with the compression suffix removed.
 *
 * @throws IOException   if the stream cannot be read
 * @throws SAXException  if XHTML emission fails
 * @throws TikaException if the stream cannot be decompressed; a
 *                       {@code TikaMemoryLimitException} is thrown when the failure
 *                       message starts with {@code "MemoryLimitException:"}
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
    // Shield the caller's stream from close(); add buffering only when mark
    // support is missing.
    final boolean canMark = stream.markSupported();
    final InputStream shielded = new CloseShieldInputStream(stream);
    final InputStream source = canMark ? shielded : new BufferedInputStream(shielded);

    final CompressorInputStream compressed;
    try {
        final CompressorParserOptions fallbackOptions = new CompressorParserOptions() {
            public boolean decompressConcatenated(Metadata metadata) {
                return false;
            }
        };
        final CompressorParserOptions options = context.get(CompressorParserOptions.class, fallbackOptions);
        final TikaCompressorStreamFactory factory = new TikaCompressorStreamFactory(
                options.decompressConcatenated(metadata), memoryLimitInKb);
        compressed = factory.createCompressorInputStream(source);
    } catch (CompressorException e) {
        final String message = e.getMessage();
        if (message != null && message.startsWith("MemoryLimitException:")) {
            throw new TikaMemoryLimitException(e.getMessage());
        }
        throw new TikaException("Unable to uncompress document stream", e);
    }

    final MediaType type = getMediaType(compressed);
    if (!type.equals(MediaType.OCTET_STREAM)) {
        metadata.set(CONTENT_TYPE, type.toString());
    }

    final XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();

    try {
        final Metadata entrydata = new Metadata();
        final String outerName = metadata.get(Metadata.RESOURCE_NAME_KEY);
        if (outerName != null) {
            entrydata.set(Metadata.RESOURCE_NAME_KEY, stripCompressionSuffix(outerName));
        }

        // Delegate the decompressed content to the embedded document extractor.
        final EmbeddedDocumentExtractor extractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
        if (extractor.shouldParseEmbedded(entrydata)) {
            extractor.parseEmbedded(compressed, xhtml, entrydata, true);
        }
    } finally {
        compressed.close();
    }

    xhtml.endDocument();
}

/**
 * Derives the name of the decompressed resource from a compressed resource name.
 * {@code .tbz} and {@code .tbz2} become {@code .tar}; {@code .bz}, {@code .bz2},
 * {@code .xz}, {@code .zlib} and {@code .pack} are removed; any other non-empty name
 * is passed to {@code GzipUtils.getUncompressedFilename}. An empty name is returned
 * unchanged.
 */
private static String stripCompressionSuffix(String name) {
    final int length = name.length();
    if (name.endsWith(".tbz")) {
        return name.substring(0, length - 4) + ".tar";
    }
    if (name.endsWith(".tbz2")) {
        return name.substring(0, length - 5) + ".tar";
    }
    if (name.endsWith(".bz")) {
        return name.substring(0, length - 3);
    }
    if (name.endsWith(".bz2")) {
        return name.substring(0, length - 4);
    }
    if (name.endsWith(".xz")) {
        return name.substring(0, length - 3);
    }
    if (name.endsWith(".zlib") || name.endsWith(".pack")) {
        return name.substring(0, length - 5);
    }
    if (length > 0) {
        return GzipUtils.getUncompressedFilename(name);
    }
    return name;
}