List of usage examples for org.apache.commons.io.input CloseShieldInputStream CloseShieldInputStream
public CloseShieldInputStream(InputStream in)
From source file:org.apache.tika.parser.microsoft.ooxml.SXWPFWordExtractorDecorator.java
/**
 * Parses a single package part with the SAX parser obtained from {@code context}
 * and forwards the extracted content to {@code xhtml}.
 *
 * <p>The part's stream is opened here and closed by the try-with-resources block;
 * the parser only ever sees a {@link CloseShieldInputStream} wrapper around it.
 * A {@link TikaException} raised while parsing is not propagated: its stack trace
 * is recorded in {@code metadata} under
 * {@code TikaCoreProperties.TIKA_META_EXCEPTION_WARNING}.
 *
 * @param packagePart the part to parse
 * @param styles      style information handed to the body part handler
 * @param listManager list information handed to the body part handler
 * @param xhtml       destination for the extracted content
 * @throws IOException  if the part's stream cannot be opened or read
 * @throws SAXException if the SAX parse fails
 */
private void handlePart(PackagePart packagePart, XWPFStylesShim styles, XWPFListManager listManager,
        XHTMLContentHandler xhtml) throws IOException, SAXException {
    Map<String, String> relationshipsById = loadLinkedRelationships(packagePart, true, metadata);

    try (InputStream partStream = packagePart.getInputStream()) {
        // Build the handler chain from the inside out.
        OOXMLTikaBodyPartHandler bodyHandler =
                new OOXMLTikaBodyPartHandler(xhtml, styles, listManager, config);
        OOXMLWordAndPowerPointTextHandler textHandler = new OOXMLWordAndPowerPointTextHandler(
                bodyHandler, relationshipsById, config.getIncludeShapeBasedContent());
        OfflineContentHandler saxHandler =
                new OfflineContentHandler(new EmbeddedContentHandler(textHandler));

        context.getSAXParser().parse(new CloseShieldInputStream(partStream), saxHandler);
    } catch (TikaException parseProblem) {
        // Keep going with the remaining parts; just leave a trace of the failure.
        metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
                ExceptionUtils.getStackTrace(parseProblem));
    }
}
From source file:org.apache.tika.parser.microsoft.ooxml.xps.XPSExtractorDecorator.java
private void handleDocuments(PackageRelationship packageRelationship, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException { try (InputStream stream = pkg.getPart(packageRelationship).getInputStream()) { XMLReaderUtils.parseSAX(new CloseShieldInputStream(stream), new OfflineContentHandler(new EmbeddedContentHandler(new FixedDocSeqHandler(xhtml))), context); }//from w w w . jav a 2 s. c om }
From source file:org.apache.tika.parser.microsoft.ooxml.xwpf.ml2006.Word2006MLParser.java
@Override public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { //set OfficeParserConfig if the user hasn't specified one configure(context);//from w ww . j a v a 2s .c o m final XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); xhtml.startDocument(); try { context.getSAXParser().parse(new CloseShieldInputStream(stream), new OfflineContentHandler( new EmbeddedContentHandler(new Word2006MLDocHandler(xhtml, metadata, context)))); } catch (SAXException e) { throw new TikaException("XML parse error", e); } finally { xhtml.endDocument(); } }
From source file:org.apache.tika.parser.microsoft.ooxml.xwpf.Word2006MLParser.java
/**
 * Parses the document in {@code stream} with the SAX parser from {@code context},
 * routing events through a {@code Word2006MLHandler} into an XHTML wrapper
 * around {@code handler}.
 *
 * <p>{@code stream} is wrapped in a {@link CloseShieldInputStream}, so the parser
 * cannot close the caller's stream. The XHTML document is always ended, even when
 * parsing fails.
 *
 * @param stream   document content; not closed by this method
 * @param handler  receiver of the generated XHTML events
 * @param metadata document metadata, passed to the XHTML and document handlers
 * @param context  parse context supplying the SAX parser
 * @throws IOException   if the stream cannot be read
 * @throws SAXException  if starting or ending the XHTML document fails
 * @throws TikaException with message {@code "XML parse error"} when the SAX parse fails
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
    final XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    try {
        OfflineContentHandler documentHandler = new OfflineContentHandler(
                new EmbeddedContentHandler(new Word2006MLHandler(xhtml, metadata, context)));
        context.getSAXParser().parse(new CloseShieldInputStream(stream), documentHandler);
    } catch (SAXException saxFailure) {
        throw new TikaException("XML parse error", saxFailure);
    } finally {
        xhtml.endDocument();
    }
}
From source file:org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor.java
/**
 * Parses one package part with a fresh {@link XMLReader} and appends the extracted
 * text to {@code buffer}.
 *
 * <p>The part's stream is closed by the try-with-resources block; the reader is
 * given a {@link CloseShieldInputStream} wrapper around it.
 *
 * <p>A {@link ParserConfigurationException} from creating the reader used to be
 * dumped with {@code printStackTrace()} and then ignored, which left the buffer
 * silently missing this part's text. It is now rethrown as a {@link SAXException}
 * carrying the original exception as its cause, so callers can see and handle it.
 *
 * @param packagePart     the part to parse
 * @param xwpfListManager list manager; not used by this method
 * @param buffer          receives the extracted text
 * @throws IOException  if the part's stream cannot be opened or read
 * @throws SAXException if the reader cannot be configured or the parse fails
 */
private void handlePart(PackagePart packagePart, XWPFListManager xwpfListManager, StringBuilder buffer)
        throws IOException, SAXException {
    Map<String, String> hyperlinks = loadHyperlinkRelationships(packagePart);
    try (InputStream stream = packagePart.getInputStream()) {
        XMLReader reader = SAXHelper.newXMLReader();
        reader.setContentHandler(
                new OOXMLWordAndPowerPointTextHandler(new XWPFToTextContentHandler(buffer), hyperlinks));
        reader.parse(new InputSource(new CloseShieldInputStream(stream)));
    } catch (ParserConfigurationException e) {
        // Surface the failure instead of swallowing it; keep the cause for diagnosis.
        throw new SAXException("Unable to configure an XMLReader for package part", e);
    }
}
From source file:org.apache.tika.parser.microsoft.xml.AbstractXML2003Parser.java
/**
 * Sets the content type on {@code metadata}, then parses {@code stream} with the
 * SAX parser from {@code context}, using the handler returned by
 * {@code getContentHandler} to produce XHTML events for {@code handler}.
 *
 * <p>{@code stream} is wrapped in a {@link CloseShieldInputStream}, so the parser
 * cannot close the caller's stream. The XHTML document is always ended, even when
 * parsing fails.
 *
 * @param stream   document content; not closed by this method
 * @param handler  receiver of the generated XHTML events
 * @param metadata document metadata
 * @param context  parse context supplying the SAX parser
 * @throws IOException   if the stream cannot be read
 * @throws SAXException  if the failure is rethrown by the tagged handler's
 *                       {@code throwIfCauseOf}, or if starting/ending the XHTML document fails
 * @throws TikaException with message {@code "XML parse error"} for any other SAX parse failure
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
    setContentType(metadata);

    final XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();

    TaggedContentHandler taggedXhtml = new TaggedContentHandler(xhtml);
    try {
        context.getSAXParser().parse(
                new CloseShieldInputStream(stream),
                new OfflineContentHandler(
                        new EmbeddedContentHandler(
                                getContentHandler(taggedXhtml, metadata, context))));
    } catch (SAXException saxFailure) {
        // Give the tagged handler the first chance to rethrow; otherwise wrap.
        taggedXhtml.throwIfCauseOf(saxFailure);
        throw new TikaException("XML parse error", saxFailure);
    } finally {
        xhtml.endDocument();
    }
}
From source file:org.apache.tika.parser.odf.OpenDocumentContentParser.java
/**
 * Parses {@code stream} with the SAX parser from {@code context}, sending events
 * through an {@code OpenDocumentElementMappingContentHandler} (built from
 * {@code MAPPINGS}) to {@code handler}.
 *
 * <p>{@code stream} is wrapped in a {@link CloseShieldInputStream}, so the parser
 * cannot close the caller's stream.
 *
 * @param stream   content to parse; not closed by this method
 * @param handler  receiver of the mapped events
 * @param metadata document metadata; not read by this method
 * @param context  parse context supplying the SAX parser
 * @throws IOException   if the stream cannot be read
 * @throws SAXException  if the SAX parse fails
 * @throws TikaException if the parse context cannot provide a parser
 */
void parseInternal(InputStream stream, final ContentHandler handler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
    DefaultHandler mappingHandler = new OpenDocumentElementMappingContentHandler(handler, MAPPINGS);

    context.getSAXParser().parse(
            new CloseShieldInputStream(stream),
            new OfflineContentHandler(new NSNormalizerContentHandler(mappingHandler)));
}
From source file:org.apache.tika.parser.pdf.PDFParser.java
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { PDFParserConfig localConfig = context.get(PDFParserConfig.class, defaultConfig); PDDocument pdfDocument = null;/*w w w . ja va 2s . com*/ String password = ""; try { // PDFBox can process entirely in memory, or can use a temp file // for unpacked / processed resources // Decide which to do based on if we're reading from a file or not already //TODO: make this configurable via MemoryUsageSetting TikaInputStream tstream = TikaInputStream.cast(stream); password = getPassword(metadata, context); if (tstream != null && tstream.hasFile()) { // File based -- send file directly to PDFBox pdfDocument = PDDocument.load(tstream.getPath().toFile(), password); } else { pdfDocument = PDDocument.load(new CloseShieldInputStream(stream), password); } metadata.set(PDF.IS_ENCRYPTED, Boolean.toString(pdfDocument.isEncrypted())); metadata.set(Metadata.CONTENT_TYPE, MEDIA_TYPE.toString()); extractMetadata(pdfDocument, metadata, context); AccessChecker checker = localConfig.getAccessChecker(); checker.check(metadata); if (handler != null) { if (shouldHandleXFAOnly(pdfDocument, localConfig)) { handleXFAOnly(pdfDocument, handler, metadata, context); } else if (localConfig.getOcrStrategy().equals(PDFParserConfig.OCR_STRATEGY.OCR_ONLY)) { metadata.add("X-Parsed-By", TesseractOCRParser.class.toString()); OCR2XHTML.process(pdfDocument, handler, context, metadata, localConfig); } else { if (localConfig.getOcrStrategy().equals(PDFParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION)) { metadata.add("X-Parsed-By", TesseractOCRParser.class.toString()); } PDF2XHTML.process(pdfDocument, handler, context, metadata, localConfig); } } } catch (InvalidPasswordException e) { metadata.set(PDF.IS_ENCRYPTED, "true"); throw new EncryptedDocumentException(e); } finally { if (pdfDocument != null) { pdfDocument.close(); } } }
From source file:org.apache.tika.parser.pdf18.PDFParser.java
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { PDDocument pdfDocument = null;//from w w w.jav a 2 s .c o m TemporaryResources tmp = new TemporaryResources(); //config from context, or default if not set via context PDFParserConfig localConfig = context.get(PDFParserConfig.class, defaultConfig); String password = ""; try { // PDFBox can process entirely in memory, or can use a temp file // for unpacked / processed resources // Decide which to do based on if we're reading from a file or not already TikaInputStream tstream = TikaInputStream.cast(stream); password = getPassword(metadata, context); if (tstream != null && tstream.hasFile()) { // File based, take that as a cue to use a temporary file RandomAccess scratchFile = new RandomAccessFile(tmp.createTemporaryFile(), "rw"); if (localConfig.getUseNonSequentialParser() == true) { pdfDocument = PDDocument.loadNonSeq(new CloseShieldInputStream(stream), scratchFile, password); } else { pdfDocument = PDDocument.load(new CloseShieldInputStream(stream), scratchFile, true); } } else { // Go for the normal, stream based in-memory parsing if (localConfig.getUseNonSequentialParser() == true) { pdfDocument = PDDocument.loadNonSeq(new CloseShieldInputStream(stream), new RandomAccessBuffer(), password); } else { pdfDocument = PDDocument.load(new CloseShieldInputStream(stream), true); } } metadata.set("pdf:encrypted", Boolean.toString(pdfDocument.isEncrypted())); //if using the classic parser and the doc is encrypted, we must manually decrypt if (!localConfig.getUseNonSequentialParser() && pdfDocument.isEncrypted()) { pdfDocument.decrypt(password); } metadata.set(Metadata.CONTENT_TYPE, "application/pdf"); extractMetadata(pdfDocument, metadata); AccessChecker checker = localConfig.getAccessChecker(); checker.check(metadata); if (handler != null) { if (shouldHandleXFAOnly(pdfDocument, localConfig)) { handleXFAOnly(pdfDocument, 
handler, metadata); } else { PDF2XHTML.process(pdfDocument, handler, context, metadata, localConfig); } } } catch (CryptographyException e) { //seq parser throws CryptographyException for bad password throw new EncryptedDocumentException(e); } catch (IOException e) { //nonseq parser throws IOException for bad password //At the Tika level, we want the same exception to be thrown if (e.getMessage() != null && e.getMessage().contains("Error (CryptographyException)")) { metadata.set("pdf:encrypted", Boolean.toString(true)); throw new EncryptedDocumentException(e); } //rethrow any other IOExceptions throw e; } finally { if (pdfDocument != null) { pdfDocument.close(); } tmp.dispose(); //TODO: once we migrate to PDFBox 2.0, remove this (PDFBOX-2200) PDFont.clearResources(); } }
From source file:org.apache.tika.parser.pkg.CompressorParser.java
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { // At the end we want to close the compression stream to release // any associated resources, but the underlying document stream // should not be closed if (stream.markSupported()) { stream = new CloseShieldInputStream(stream); } else {/*from www . jav a2 s . c o m*/ // Ensure that the stream supports the mark feature stream = new BufferedInputStream(new CloseShieldInputStream(stream)); } CompressorInputStream cis; try { CompressorParserOptions options = context.get(CompressorParserOptions.class, new CompressorParserOptions() { public boolean decompressConcatenated(Metadata metadata) { return false; } }); TikaCompressorStreamFactory factory = new TikaCompressorStreamFactory( options.decompressConcatenated(metadata), memoryLimitInKb); cis = factory.createCompressorInputStream(stream); } catch (CompressorException e) { if (e.getMessage() != null && e.getMessage().startsWith("MemoryLimitException:")) { throw new TikaMemoryLimitException(e.getMessage()); } throw new TikaException("Unable to uncompress document stream", e); } MediaType type = getMediaType(cis); if (!type.equals(MediaType.OCTET_STREAM)) { metadata.set(CONTENT_TYPE, type.toString()); } XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); xhtml.startDocument(); try { Metadata entrydata = new Metadata(); String name = metadata.get(Metadata.RESOURCE_NAME_KEY); if (name != null) { if (name.endsWith(".tbz")) { name = name.substring(0, name.length() - 4) + ".tar"; } else if (name.endsWith(".tbz2")) { name = name.substring(0, name.length() - 5) + ".tar"; } else if (name.endsWith(".bz")) { name = name.substring(0, name.length() - 3); } else if (name.endsWith(".bz2")) { name = name.substring(0, name.length() - 4); } else if (name.endsWith(".xz")) { name = name.substring(0, name.length() - 3); } else if (name.endsWith(".zlib")) { name = 
name.substring(0, name.length() - 5); } else if (name.endsWith(".pack")) { name = name.substring(0, name.length() - 5); } else if (name.length() > 0) { name = GzipUtils.getUncompressedFilename(name); } entrydata.set(Metadata.RESOURCE_NAME_KEY, name); } // Use the delegate parser to parse the compressed document EmbeddedDocumentExtractor extractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context); if (extractor.shouldParseEmbedded(entrydata)) { extractor.parseEmbedded(cis, xhtml, entrydata, true); } } finally { cis.close(); } xhtml.endDocument(); }