List of usage examples for the org.apache.commons.io.input.CloseShieldInputStream constructor CloseShieldInputStream(InputStream)
public CloseShieldInputStream(InputStream in)
From source file:org.apache.tika.parser.html.HtmlParser.java
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { // Automatically detect the character encoding try (AutoDetectReader reader = new AutoDetectReader(new CloseShieldInputStream(stream), metadata, getEncodingDetector(context))) { Charset charset = reader.getCharset(); String previous = metadata.get(Metadata.CONTENT_TYPE); MediaType contentType = null;/*from w w w . jav a 2s . com*/ if (previous == null || previous.startsWith("text/html")) { contentType = new MediaType(MediaType.TEXT_HTML, charset); } else if (previous.startsWith("application/xhtml+xml")) { contentType = new MediaType(XHTML, charset); } else if (previous.startsWith("application/vnd.wap.xhtml+xml")) { contentType = new MediaType(WAP_XHTML, charset); } else if (previous.startsWith("application/x-asp")) { contentType = new MediaType(X_ASP, charset); } if (contentType != null) { metadata.set(Metadata.CONTENT_TYPE, contentType.toString()); } // deprecated, see TIKA-431 metadata.set(Metadata.CONTENT_ENCODING, charset.name()); // Get the HTML mapper from the parse context HtmlMapper mapper = context.get(HtmlMapper.class, new HtmlParserMapper()); // Parse the HTML document org.ccil.cowan.tagsoup.Parser parser = new org.ccil.cowan.tagsoup.Parser(); // Use schema from context or default Schema schema = context.get(Schema.class, HTML_SCHEMA); // TIKA-528: Reuse share schema to avoid heavy instantiation parser.setProperty(org.ccil.cowan.tagsoup.Parser.schemaProperty, schema); // TIKA-599: Shared schema is thread-safe only if bogons are ignored parser.setFeature(org.ccil.cowan.tagsoup.Parser.ignoreBogonsFeature, true); parser.setContentHandler(new XHTMLDowngradeHandler(new HtmlHandler(mapper, handler, metadata))); parser.parse(reader.asInputSource()); } }
From source file:org.apache.tika.parser.image.ImageParser.java
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { String type = metadata.get(Metadata.CONTENT_TYPE); if (type != null) { // If the old (pre-RFC7903) BMP mime type is given, // fix it up to the new one, so Java is happy if (OLD_BMP_TYPE.toString().equals(type)) { type = MAIN_BMP_TYPE.toString(); }//from w w w . j av a 2 s . com try { Iterator<ImageReader> iterator = ImageIO.getImageReadersByMIMEType(type); if (iterator.hasNext()) { ImageReader reader = iterator.next(); try { try (ImageInputStream imageStream = ImageIO .createImageInputStream(new CloseShieldInputStream(stream))) { reader.setInput(imageStream); metadata.set(Metadata.IMAGE_WIDTH, Integer.toString(reader.getWidth(0))); metadata.set(Metadata.IMAGE_LENGTH, Integer.toString(reader.getHeight(0))); metadata.set("height", Integer.toString(reader.getHeight(0))); metadata.set("width", Integer.toString(reader.getWidth(0))); loadMetadata(reader.getImageMetadata(0), metadata); } } finally { reader.dispose(); } } // Translate certain Metadata tags from the ImageIO // specific namespace into the general Tika one setIfPresent(metadata, "CommentExtensions CommentExtension", TikaCoreProperties.COMMENTS); setIfPresent(metadata, "markerSequence com", TikaCoreProperties.COMMENTS); setIfPresent(metadata, "Data BitsPerSample", Metadata.BITS_PER_SAMPLE); } catch (IIOException e) { // TIKA-619: There is a known bug in the Sun API when dealing with GIF images // which Tika will just ignore. if (!(e.getMessage() != null && e.getMessage().equals("Unexpected block type 0!") && type.equals("image/gif"))) { throw new TikaException(type + " parse error", e); } } } XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); xhtml.startDocument(); xhtml.endDocument(); }
From source file:org.apache.tika.parser.isatab.ISATabUtils.java
public static void parseInvestigation(InputStream stream, XHTMLContentHandler handler, Metadata metadata, ParseContext context, String studyFileName) throws IOException, TikaException, SAXException { TikaConfig tikaConfig = context.get(TikaConfig.class); if (tikaConfig == null) { tikaConfig = TikaConfig.getDefaultConfig(); }/* w ww .j a v a 2s.c o m*/ // Automatically detect the character encoding try (AutoDetectReader reader = new AutoDetectReader(new CloseShieldInputStream(stream), metadata, tikaConfig.getEncodingDetector())) { extractMetadata(reader, metadata, studyFileName); } }
From source file:org.apache.tika.parser.isatab.ISATabUtils.java
public static void parseStudy(InputStream stream, XHTMLContentHandler xhtml, Metadata metadata, ParseContext context) throws IOException, TikaException, SAXException { TikaInputStream tis = TikaInputStream.get(stream); // Automatically detect the character encoding TikaConfig tikaConfig = context.get(TikaConfig.class); if (tikaConfig == null) { tikaConfig = TikaConfig.getDefaultConfig(); }/*from w ww . jav a2 s .co m*/ try (AutoDetectReader reader = new AutoDetectReader(new CloseShieldInputStream(tis), metadata, tikaConfig.getEncodingDetector()); CSVParser csvParser = new CSVParser(reader, CSVFormat.TDF)) { Iterator<CSVRecord> iterator = csvParser.iterator(); xhtml.startElement("table"); xhtml.startElement("thead"); if (iterator.hasNext()) { CSVRecord record = iterator.next(); for (int i = 0; i < record.size(); i++) { xhtml.startElement("th"); xhtml.characters(record.get(i)); xhtml.endElement("th"); } } xhtml.endElement("thead"); xhtml.startElement("tbody"); while (iterator.hasNext()) { CSVRecord record = iterator.next(); xhtml.startElement("tr"); for (int j = 0; j < record.size(); j++) { xhtml.startElement("td"); xhtml.characters(record.get(j)); xhtml.endElement("td"); } xhtml.endElement("tr"); } xhtml.endElement("tbody"); xhtml.endElement("table"); } }
From source file:org.apache.tika.parser.isatab.ISATabUtils.java
public static void parseAssay(InputStream stream, XHTMLContentHandler xhtml, Metadata metadata, ParseContext context) throws IOException, TikaException, SAXException { TikaInputStream tis = TikaInputStream.get(stream); // Automatically detect the character encoding TikaConfig tikaConfig = context.get(TikaConfig.class); if (tikaConfig == null) { tikaConfig = TikaConfig.getDefaultConfig(); }//from w w w .j av a2s .c o m try (AutoDetectReader reader = new AutoDetectReader(new CloseShieldInputStream(tis), metadata, tikaConfig.getEncodingDetector()); CSVParser csvParser = new CSVParser(reader, CSVFormat.TDF)) { xhtml.startElement("table"); Iterator<CSVRecord> iterator = csvParser.iterator(); xhtml.startElement("thead"); if (iterator.hasNext()) { CSVRecord record = iterator.next(); for (int i = 0; i < record.size(); i++) { xhtml.startElement("th"); xhtml.characters(record.get(i)); xhtml.endElement("th"); } } xhtml.endElement("thead"); xhtml.startElement("tbody"); while (iterator.hasNext()) { CSVRecord record = iterator.next(); xhtml.startElement("tr"); for (int j = 0; j < record.size(); j++) { xhtml.startElement("td"); xhtml.characters(record.get(j)); xhtml.endElement("td"); } xhtml.endElement("tr"); } xhtml.endElement("tbody"); xhtml.endElement("table"); } }
From source file:org.apache.tika.parser.iwork.IWorkPackageParser.java
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { ZipArchiveInputStream zip = new ZipArchiveInputStream(stream); ZipArchiveEntry entry = zip.getNextZipEntry(); while (entry != null) { if (!IWORK_CONTENT_ENTRIES.contains(entry.getName())) { entry = zip.getNextZipEntry(); continue; }/*from w ww .j av a 2 s . c o m*/ InputStream entryStream = new BufferedInputStream(zip, 4096); entryStream.mark(4096); IWORKDocumentType type = IWORKDocumentType.detectType(entryStream); entryStream.reset(); if (type != null) { XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); ContentHandler contentHandler; switch (type) { case KEYNOTE: contentHandler = new KeynoteContentHandler(xhtml, metadata); break; case NUMBERS: contentHandler = new NumbersContentHandler(xhtml, metadata); break; case PAGES: contentHandler = new PagesContentHandler(xhtml, metadata); break; case ENCRYPTED: // We can't do anything for the file right now contentHandler = null; break; default: throw new TikaException("Unhandled iWorks file " + type); } metadata.add(Metadata.CONTENT_TYPE, type.getType().toString()); xhtml.startDocument(); if (contentHandler != null) { context.getSAXParser().parse(new CloseShieldInputStream(entryStream), new OfflineContentHandler(contentHandler)); } xhtml.endDocument(); } entry = zip.getNextZipEntry(); } // Don't close the zip InputStream (TIKA-1117). }
From source file:org.apache.tika.parser.microsoft.OfficeParser.java
/** * Extracts properties and text from an MS Document input stream *//*from w ww . j a va2s . com*/ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { configure(context); XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); xhtml.startDocument(); final DirectoryNode root; TikaInputStream tstream = TikaInputStream.cast(stream); NPOIFSFileSystem mustCloseFs = null; try { if (tstream == null) { mustCloseFs = new NPOIFSFileSystem(new CloseShieldInputStream(stream)); root = mustCloseFs.getRoot(); } else { final Object container = tstream.getOpenContainer(); if (container instanceof NPOIFSFileSystem) { root = ((NPOIFSFileSystem) container).getRoot(); } else if (container instanceof DirectoryNode) { root = (DirectoryNode) container; } else { NPOIFSFileSystem fs = null; if (tstream.hasFile()) { fs = new NPOIFSFileSystem(tstream.getFile(), true); } else { fs = new NPOIFSFileSystem(new CloseShieldInputStream(tstream)); } //tstream will close the fs, no need to close this below tstream.setOpenContainer(fs); root = fs.getRoot(); } } parse(root, context, metadata, xhtml); OfficeParserConfig officeParserConfig = context.get(OfficeParserConfig.class); if (officeParserConfig.getExtractMacros()) { //now try to get macros extractMacros(root.getNFileSystem(), xhtml, EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context)); } } finally { IOUtils.closeQuietly(mustCloseFs); } xhtml.endDocument(); }
From source file:org.apache.tika.parser.microsoft.ooxml.OOXMLExtractorFactory.java
public static void parse(InputStream stream, ContentHandler baseHandler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { Locale locale = context.get(Locale.class, Locale.getDefault()); ExtractorFactory.setThreadPrefersEventExtractors(true); try {/*from w ww . j av a 2 s.c o m*/ OOXMLExtractor extractor; OPCPackage pkg; // Locate or Open the OPCPackage for the file TikaInputStream tis = TikaInputStream.cast(stream); if (tis != null && tis.getOpenContainer() instanceof OPCPackage) { pkg = (OPCPackage) tis.getOpenContainer(); } else if (tis != null && tis.hasFile()) { pkg = OPCPackage.open(tis.getFile().getPath(), PackageAccess.READ); tis.setOpenContainer(pkg); } else { InputStream shield = new CloseShieldInputStream(stream); pkg = OPCPackage.open(shield); } // Get the type, and ensure it's one we handle MediaType type = ZipContainerDetector.detectOfficeOpenXML(pkg); if (type == null || OOXMLParser.UNSUPPORTED_OOXML_TYPES.contains(type)) { // Not a supported type, delegate to Empty Parser EmptyParser.INSTANCE.parse(stream, baseHandler, metadata, context); return; } metadata.set(Metadata.CONTENT_TYPE, type.toString()); // Have the appropriate OOXML text extractor picked POIXMLTextExtractor poiExtractor = null; // This has already been set by OOXMLParser's call to configure() // We can rely on this being non-null. 
OfficeParserConfig config = context.get(OfficeParserConfig.class); if (config.getUseSAXDocxExtractor()) { poiExtractor = trySXWPF(pkg); } if (poiExtractor == null && config.getUseSAXPptxExtractor()) { poiExtractor = trySXSLF(pkg); } if (poiExtractor == null) { poiExtractor = ExtractorFactory.createExtractor(pkg); } POIXMLDocument document = poiExtractor.getDocument(); if (poiExtractor instanceof XSSFBEventBasedExcelExtractor) { extractor = new XSSFBExcelExtractorDecorator(context, poiExtractor, locale); } else if (poiExtractor instanceof XSSFEventBasedExcelExtractor) { extractor = new XSSFExcelExtractorDecorator(context, poiExtractor, locale); } else if (poiExtractor instanceof XWPFEventBasedWordExtractor) { extractor = new SXWPFWordExtractorDecorator(metadata, context, (XWPFEventBasedWordExtractor) poiExtractor); metadata.add("X-Parsed-By", XWPFEventBasedWordExtractor.class.getCanonicalName()); } else if (poiExtractor instanceof XSLFEventBasedPowerPointExtractor) { extractor = new SXSLFPowerPointExtractorDecorator(metadata, context, (XSLFEventBasedPowerPointExtractor) poiExtractor); metadata.add("X-Parsed-By", XSLFEventBasedPowerPointExtractor.class.getCanonicalName()); } else if (document == null) { throw new TikaException( "Expecting UserModel based POI OOXML extractor with a document, but none found. 
" + "The extractor returned was a " + poiExtractor); } else if (document instanceof XMLSlideShow) { extractor = new XSLFPowerPointExtractorDecorator(context, (org.apache.poi.xslf.extractor.XSLFPowerPointExtractor) poiExtractor); } else if (document instanceof XWPFDocument) { extractor = new XWPFWordExtractorDecorator(context, (XWPFWordExtractor) poiExtractor); } else { extractor = new POIXMLTextExtractorDecorator(context, poiExtractor); } // Get the bulk of the metadata first, so that it's accessible during // parsing if desired by the client (see TIKA-1109) extractor.getMetadataExtractor().extract(metadata); // Extract the text, along with any in-document metadata extractor.getXHTML(baseHandler, metadata, context); } catch (IllegalArgumentException e) { if (e.getMessage() != null && e.getMessage().startsWith("No supported documents found")) { throw new TikaException( "TIKA-418: RuntimeException while getting content" + " for thmx and xps file types", e); } else { throw new TikaException("Error creating OOXML extractor", e); } } catch (InvalidFormatException e) { throw new TikaException("Error creating OOXML extractor", e); } catch (OpenXML4JException e) { throw new TikaException("Error creating OOXML extractor", e); } catch (XmlException e) { throw new TikaException("Error creating OOXML extractor", e); } }
From source file:org.apache.tika.parser.microsoft.ooxml.SXSLFPowerPointExtractorDecorator.java
private void loadCommentAuthors() { PackageRelationshipCollection prc = null; try {//from ww w. jav a 2 s. c om prc = mainDocument.getRelationshipsByType(XSLFRelation.COMMENT_AUTHORS.getRelation()); } catch (InvalidFormatException e) { metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ExceptionUtils.getStackTrace(e)); } if (prc == null || prc.size() == 0) { return; } for (int i = 0; i < prc.size(); i++) { PackagePart commentAuthorsPart = null; try { commentAuthorsPart = commentAuthorsPart = mainDocument.getRelatedPart(prc.getRelationship(i)); } catch (InvalidFormatException e) { metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ExceptionUtils.getStackTrace(e)); } if (commentAuthorsPart == null) { continue; } try (InputStream stream = commentAuthorsPart.getInputStream()) { context.getSAXParser().parse(new CloseShieldInputStream(stream), new OfflineContentHandler(new XSLFCommentAuthorHandler())); } catch (TikaException | SAXException | IOException e) { metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ExceptionUtils.getStackTrace(e)); } } }
From source file:org.apache.tika.parser.microsoft.ooxml.SXSLFPowerPointExtractorDecorator.java
private void handleSlidePart(PackagePart slidePart, XHTMLContentHandler xhtml) throws IOException, SAXException { Map<String, String> linkedRelationships = loadLinkedRelationships(slidePart, false, metadata); // Map<String, String> hyperlinks = loadHyperlinkRelationships(packagePart); xhtml.startElement("div", "class", "slide-content"); try (InputStream stream = slidePart.getInputStream()) { context.getSAXParser().parse(new CloseShieldInputStream(stream), new OfflineContentHandler(new EmbeddedContentHandler(new OOXMLWordAndPowerPointTextHandler( new OOXMLTikaBodyPartHandler(xhtml), linkedRelationships)))); } catch (TikaException e) { metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ExceptionUtils.getStackTrace(e)); }/*from w ww . ja v a2 s. c om*/ xhtml.endElement("div"); handleBasicRelatedParts(XSLFRelation.SLIDE_LAYOUT.getRelation(), "slide-master-content", slidePart, new PlaceHolderSkipper(new OOXMLWordAndPowerPointTextHandler(new OOXMLTikaBodyPartHandler(xhtml), linkedRelationships))); handleBasicRelatedParts(XSLFRelation.NOTES.getRelation(), "slide-notes", slidePart, new OOXMLWordAndPowerPointTextHandler(new OOXMLTikaBodyPartHandler(xhtml), linkedRelationships)); handleBasicRelatedParts(XSLFRelation.NOTES_MASTER.getRelation(), "slide-notes-master", slidePart, new OOXMLWordAndPowerPointTextHandler(new OOXMLTikaBodyPartHandler(xhtml), linkedRelationships)); handleBasicRelatedParts(XSLFRelation.COMMENTS.getRelation(), null, slidePart, new XSLFCommentsHandler(xhtml)); }