Usage examples for the org.xml.sax.helpers.DefaultHandler constructor DefaultHandler()
DefaultHandler
From source file:org.apache.tika.parser.microsoft.JackcessParserTest.java
@Test public void testBasic() throws Exception { Parser p = new AutoDetectParser(); RecursiveParserWrapper w = new RecursiveParserWrapper(p, new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1)); for (String fName : new String[] { "testAccess2.accdb", "testAccess2_2000.mdb", "testAccess2_2002-2003.mdb" }) { InputStream is = null;// w w w . j a v a 2 s . c o m try { is = this.getResourceAsStream("/test-documents/" + fName); Metadata meta = new Metadata(); ParseContext c = new ParseContext(); w.parse(is, new DefaultHandler(), meta, c); } finally { IOUtils.closeQuietly(is); } List<Metadata> list = w.getMetadata(); assertEquals(4, list.size()); String mainContent = list.get(0).get(RecursiveParserWrapper.TIKA_CONTENT); //make sure there's a thead and tbody assertContains("</thead><tbody>", mainContent); //assert table header assertContains("<th>ShortTextField</th>", mainContent); //test date format assertContains("6/24/15", mainContent); //test that markup is stripped assertContains("over the bold italic dog", mainContent); //test unicode assertContains("\u666E\u6797\u65AF\u987F\u5927\u5B66", mainContent); //test embedded document handling assertContains("Test Document with embedded pdf", list.get(3).get(RecursiveParserWrapper.TIKA_CONTENT)); w.reset(); } }
From source file:org.apache.tika.parser.ocr.TesseractOCRParser.java
@Override public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext parseContext) throws IOException, SAXException, TikaException { TesseractOCRConfig config = parseContext.get(TesseractOCRConfig.class, DEFAULT_CONFIG); // If Tesseract is not on the path with the current config, do not try to run OCR // getSupportedTypes shouldn't have listed us as handling it, so this should only // occur if someone directly calls this parser, not via DefaultParser or similar if (!hasTesseract(config)) return;//from w ww . j av a2 s. c o m TemporaryResources tmp = new TemporaryResources(); try { TikaInputStream tikaStream = TikaInputStream.get(stream, tmp); //trigger the spooling to a tmp file if the stream wasn't //already a TikaInputStream that contained a file tikaStream.getPath(); //this is the text output file name specified on the tesseract //commandline. The actual output file name will have a suffix added. File tmpOCROutputFile = tmp.createTemporaryFile(); // Temporary workaround for TIKA-1445 - until we can specify // composite parsers with strategies (eg Composite, Try In Turn), // always send the image onwards to the regular parser to have // the metadata for them extracted as well _TMP_IMAGE_METADATA_PARSER.parse(tikaStream, new DefaultHandler(), metadata, parseContext); XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); xhtml.startDocument(); parse(tikaStream, tmpOCROutputFile, parseContext, xhtml, config); xhtml.endDocument(); } finally { tmp.dispose(); } }
From source file:org.apache.tika.parser.odf.OpenDocumentParser.java
private void handleZipEntry(ZipEntry entry, InputStream zip, Metadata metadata, ParseContext context, EndDocumentShieldingContentHandler handler) throws IOException, SAXException, TikaException { if (entry == null) return;//w w w .java 2s .c o m if (entry.getName().equals("mimetype")) { String type = IOUtils.toString(zip, UTF_8); metadata.set(Metadata.CONTENT_TYPE, type); } else if (entry.getName().equals(META_NAME)) { meta.parse(zip, new DefaultHandler(), metadata, context); } else if (entry.getName().endsWith("content.xml")) { if (content instanceof OpenDocumentContentParser) { ((OpenDocumentContentParser) content).parseInternal(zip, handler, metadata, context); } else { // Foreign content parser was set: content.parse(zip, handler, metadata, context); } } else if (entry.getName().endsWith("styles.xml")) { if (content instanceof OpenDocumentContentParser) { ((OpenDocumentContentParser) content).parseInternal(zip, handler, metadata, context); } else { // Foreign content parser was set: content.parse(zip, handler, metadata, context); } } else { String embeddedName = entry.getName(); //scrape everything under Thumbnails/ and Pictures/ if (embeddedName.contains("Thumbnails/") || embeddedName.contains("Pictures/")) { EmbeddedDocumentExtractor embeddedDocumentExtractor = EmbeddedDocumentUtil .getEmbeddedDocumentExtractor(context); Metadata embeddedMetadata = new Metadata(); embeddedMetadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, entry.getName()); /* if (embeddedName.startsWith("Thumbnails/")) { embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.THUMBNAIL); }*/ if (embeddedName.contains("Pictures/")) { embeddedMetadata.set(TikaMetadataKeys.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.INLINE.toString()); } if (embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) { embeddedDocumentExtractor.parseEmbedded(zip, new EmbeddedContentHandler(handler), embeddedMetadata, false); } } } }
From source file:org.apache.tika.parser.pkg.ZipParserTest.java
@Test // TIKA-936 public void testCustomEncoding() throws Exception { ArchiveStreamFactory factory = new ArchiveStreamFactory(); factory.setEntryEncoding("SJIS"); trackingContext.set(ArchiveStreamFactory.class, factory); try (InputStream stream = TikaInputStream .get(Base64.decodeBase64("UEsDBBQAAAAIAI+CvUCDo3+zIgAAACgAAAAOAAAAk/qWe4zqg4GDgi50" + "eHRr2tj0qulsc2pzRHN609Gm7Y1OvFxNYLHJv6ZV97yCiQEAUEsBAh" + "QLFAAAAAgAj4K9QIOjf7MiAAAAKAAAAA4AAAAAAAAAAAAgAAAAAAAA" + "AJP6lnuM6oOBg4IudHh0UEsFBgAAAAABAAEAPAAAAE4AAAAAAA=="))) { autoDetectParser.parse(stream, new DefaultHandler(), new Metadata(), trackingContext); }/*w w w.ja v a2s .c o m*/ assertEquals(1, tracker.filenames.size()); assertEquals("\u65E5\u672C\u8A9E\u30E1\u30E2.txt", tracker.filenames.get(0)); }
From source file:org.apache.tika.parser.RecursiveParserWrapperTest.java
/**
 * Parses the recursive-embedded sample document with a TEXT handler factory
 * constructed with a limit argument of 60 and checks that five metadata
 * objects are produced, exactly one of which reports
 * {@link RecursiveParserWrapper#WRITE_LIMIT_REACHED} as {@code "true"}.
 */
@Test
public void testCharLimit() throws Exception {
    ParseContext context = new ParseContext();
    Metadata metadata = new Metadata();
    Parser wrapped = new AutoDetectParser();
    RecursiveParserWrapper wrapper = new RecursiveParserWrapper(wrapped,
            new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, 60));

    // try-with-resources so the test resource is closed even when parsing throws
    try (InputStream stream = RecursiveParserWrapperTest.class
            .getResourceAsStream("/test-documents/test_recursive_embedded.docx")) {
        wrapper.parse(stream, new DefaultHandler(), metadata, context);
    }

    List<Metadata> list = wrapper.getMetadata();
    assertEquals(5, list.size());

    // count the documents that report having hit the write limit
    int wlr = 0;
    for (Metadata m : list) {
        // constant-first equals is null-safe: absent values simply do not count
        if ("true".equals(m.get(RecursiveParserWrapper.WRITE_LIMIT_REACHED))) {
            wlr++;
        }
    }
    assertEquals(1, wlr);
}
From source file:org.apache.tika.parser.RecursiveParserWrapperTest.java
@Test public void testMaxEmbedded() throws Exception { int maxEmbedded = 4; int totalNoLimit = 12;//including outer container file ParseContext context = new ParseContext(); Metadata metadata = new Metadata(); String limitReached = null;/*from w ww . j ava 2 s.c o m*/ Parser wrapped = new AutoDetectParser(); RecursiveParserWrapper wrapper = new RecursiveParserWrapper(wrapped, new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1)); InputStream stream = RecursiveParserWrapperTest.class .getResourceAsStream("/test-documents/test_recursive_embedded.docx"); wrapper.parse(stream, new DefaultHandler(), metadata, context); List<Metadata> list = wrapper.getMetadata(); //test default assertEquals(totalNoLimit, list.size()); limitReached = list.get(0).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_LIMIT_REACHED); assertNull(limitReached); wrapper.reset(); stream.close(); //test setting value metadata = new Metadata(); stream = RecursiveParserWrapperTest.class .getResourceAsStream("/test-documents/test_recursive_embedded.docx"); wrapper.setMaxEmbeddedResources(maxEmbedded); wrapper.parse(stream, new DefaultHandler(), metadata, context); list = wrapper.getMetadata(); //add 1 for outer container file assertEquals(maxEmbedded + 1, list.size()); limitReached = list.get(0).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_LIMIT_REACHED); assertEquals("true", limitReached); wrapper.reset(); stream.close(); //test setting value < 0 metadata = new Metadata(); stream = RecursiveParserWrapperTest.class .getResourceAsStream("/test-documents/test_recursive_embedded.docx"); wrapper.setMaxEmbeddedResources(-2); wrapper.parse(stream, new DefaultHandler(), metadata, context); assertEquals(totalNoLimit, list.size()); limitReached = list.get(0).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_LIMIT_REACHED); assertNull(limitReached); }
From source file:org.apache.tika.parser.RecursiveParserWrapperTest.java
@Test public void testPrimaryExcWEmbedded() throws Exception { //if embedded content is handled and then //the parser hits an exception in the container document, //that the first element of the returned list is the container document //and the second is the embedded content Metadata metadata = new Metadata(); metadata.set(Metadata.RESOURCE_NAME_KEY, "embedded_then_npe.xml"); ParseContext context = new ParseContext(); Parser wrapped = new AutoDetectParser(); RecursiveParserWrapper wrapper = new RecursiveParserWrapper(wrapped, new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1), true); String path = "/test-documents/mock/embedded_then_npe.xml"; InputStream stream = null;/*from w ww . java2 s . co m*/ boolean npe = false; try { stream = RecursiveParserWrapperTest.class.getResourceAsStream(path); wrapper.parse(stream, new DefaultHandler(), metadata, context); } catch (TikaException e) { if (e.getCause().getClass().equals(NullPointerException.class)) { npe = true; } } finally { IOUtils.closeQuietly(stream); } assertTrue("npe", npe); List<Metadata> metadataList = wrapper.getMetadata(); assertEquals(2, metadataList.size()); Metadata outerMetadata = metadataList.get(0); Metadata embeddedMetadata = metadataList.get(1); assertContains("main_content", outerMetadata.get(RecursiveParserWrapper.TIKA_CONTENT)); assertEquals("embedded_then_npe.xml", outerMetadata.get(TikaMetadataKeys.RESOURCE_NAME_KEY)); assertEquals("Nikolai Lobachevsky", outerMetadata.get("author")); assertContains("some_embedded_content", embeddedMetadata.get(RecursiveParserWrapper.TIKA_CONTENT)); assertEquals("embed1.xml", embeddedMetadata.get(TikaMetadataKeys.RESOURCE_NAME_KEY)); assertEquals("embeddedAuthor", embeddedMetadata.get("author")); }
From source file:org.apache.tika.parser.RecursiveParserWrapperTest.java
/**
 * Parses a test document through a {@link RecursiveParserWrapper} and returns
 * the metadata list the wrapper collected.
 *
 * <p>The document is the classpath resource {@code /test-documents/<name>},
 * where {@code <name>} is the {@code RESOURCE_NAME_KEY} value of
 * {@code metadata}; when that value is absent,
 * {@code /test-documents/test_recursive_embedded.docx} is used.
 *
 * @param metadata                metadata passed to the parse; may name the resource
 * @param contentHandlerFactory   factory handed to the wrapper
 * @param catchEmbeddedExceptions flag handed to the wrapper's constructor
 * @param digester                when non-null, the auto-detect parser is wrapped
 *                                in a {@link DigestingParser} using it
 * @return the wrapper's metadata list after parsing
 */
private List<Metadata> getMetadata(Metadata metadata, ContentHandlerFactory contentHandlerFactory,
        boolean catchEmbeddedExceptions, DigestingParser.Digester digester) throws Exception {
    ParseContext parseContext = new ParseContext();
    Parser autoDetect = new AutoDetectParser();
    Parser delegate = (digester == null) ? autoDetect : new DigestingParser(autoDetect, digester);
    RecursiveParserWrapper recursiveWrapper = new RecursiveParserWrapper(delegate, contentHandlerFactory,
            catchEmbeddedExceptions);

    String resourceName = metadata.get(Metadata.RESOURCE_NAME_KEY);
    String resourcePath = (resourceName == null)
            ? "/test-documents/test_recursive_embedded.docx"
            : "/test-documents/" + resourceName;

    InputStream input = null;
    try {
        input = TikaInputStream.get(RecursiveParserWrapperTest.class.getResource(resourcePath).toURI());
        recursiveWrapper.parse(input, new DefaultHandler(), metadata, parseContext);
    } finally {
        IOUtils.closeQuietly(input);
    }
    return recursiveWrapper.getMetadata();
}
From source file:org.apache.tika.server.MetadataEP.java
/**
 * Gets all metadata that can be parsed from the specified input stream.
 * Parse failures are not caught here; they propagate to the caller.
 *
 * @param is
 *          an input stream
 * @return a 200 OK response whose entity is the {@code metadata} object the
 *         parser was given
 * @throws Exception
 *           if the input stream cannot be parsed
 */
@POST
public Response getMetadata(InputStream is) throws Exception {
    // only the metadata is of interest, so SAX events go to a no-op handler
    final DefaultHandler discardingHandler = new DefaultHandler();
    parser.parse(is, discardingHandler, metadata);

    final Response.ResponseBuilder ok = Response.ok(metadata);
    return ok.build();
}
From source file:org.apache.tika.server.MetadataEP.java
/** * Get a specific TIKA metadata field as a simple text string. If the field is * multivalued, then only the first value is returned. If the input stream * cannot be parsed, but a value was found for the given metadata field, then * the value of the field is returned as part of a 200 OK response; otherwise * a {@link Status#BAD_REQUEST} is generated. If the stream was successfully * parsed but the specific metadata field was not found, then a * {@link Status#NOT_FOUND} is returned. * <p>/* w w w .j a va 2s . co m*/ * * @param field * the tika metadata field name * @param is * the document stream * @return one of {@link Status#OK}, {@link Status#NOT_FOUND}, or * {@link Status#BAD_REQUEST} * @throws Exception */ @POST @Path("{field}") @Produces(MediaType.TEXT_PLAIN) public Response getSimpleMetadataField(@PathParam("field") String field, InputStream is) throws Exception { // use BAD request to indicate that we may not have had enough data to // process the request Status defaultErrorResponse = Status.BAD_REQUEST; try { parser.parse(is, new DefaultHandler(), metadata); // once we've parsed the document successfully, we should use NOT_FOUND // if we did not see the field defaultErrorResponse = Status.NOT_FOUND; } catch (Exception e) { logger.info("Failed to process field " + field, e); } String value = metadata.get(field); if (value == null) { return Response.status(defaultErrorResponse).entity("Failed to get metadata field " + field).build(); } return Response.ok(value, MediaType.TEXT_PLAIN_TYPE).build(); }