Example usage for the org.xml.sax.helpers.DefaultHandler constructor DefaultHandler()

Introduction

On this page you can find example usages of the org.xml.sax.helpers.DefaultHandler constructor DefaultHandler().

Prototype

DefaultHandler

Source Link

Usage

From source file:org.apache.tika.parser.microsoft.JackcessParserTest.java

/**
 * Parses each sample Access database through a {@link RecursiveParserWrapper}
 * configured for XML output and checks the extracted content of the container
 * document and of one embedded document.
 *
 * @throws Exception if a resource cannot be read or parsing fails
 */
@Test
public void testBasic() throws Exception {

    Parser p = new AutoDetectParser();

    RecursiveParserWrapper w = new RecursiveParserWrapper(p,
            new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1));

    for (String fName : new String[] { "testAccess2.accdb", "testAccess2_2000.mdb",
            "testAccess2_2002-2003.mdb" }) {
        // try-with-resources closes the stream whether or not parsing throws,
        // and surfaces close() failures instead of silently discarding them
        try (InputStream is = this.getResourceAsStream("/test-documents/" + fName)) {
            Metadata meta = new Metadata();
            ParseContext c = new ParseContext();
            w.parse(is, new DefaultHandler(), meta, c);
        }
        List<Metadata> list = w.getMetadata();
        assertEquals(4, list.size());
        String mainContent = list.get(0).get(RecursiveParserWrapper.TIKA_CONTENT);

        //make sure there's a thead and tbody
        assertContains("</thead><tbody>", mainContent);

        //assert table header
        assertContains("<th>ShortTextField</th>", mainContent);

        //test date format
        assertContains("6/24/15", mainContent);

        //test that markup is stripped
        assertContains("over the bold italic dog", mainContent);

        //test unicode
        assertContains("\u666E\u6797\u65AF\u987F\u5927\u5B66", mainContent);

        //test embedded document handling
        assertContains("Test Document with embedded pdf", list.get(3).get(RecursiveParserWrapper.TIKA_CONTENT));

        // clear the wrapper's collected state before the next file
        w.reset();
    }
}

From source file:org.apache.tika.parser.ocr.TesseractOCRParser.java

/**
 * Runs OCR on the supplied stream and emits the result as XHTML through
 * {@code handler}. Returns immediately, producing no output, when
 * {@code hasTesseract(config)} reports that Tesseract is unavailable.
 *
 * @param stream       the document to process
 * @param handler      receiver of the XHTML SAX events
 * @param metadata     metadata object passed to the image metadata parser and the XHTML handler
 * @param parseContext context from which the {@link TesseractOCRConfig} is looked up
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext parseContext)
        throws IOException, SAXException, TikaException {
    final TesseractOCRConfig config = parseContext.get(TesseractOCRConfig.class, DEFAULT_CONFIG);

    // No Tesseract on the path for this config: skip OCR entirely.
    // getSupportedTypes should not have advertised this parser in that case,
    // so this is only reached when the parser is invoked directly rather
    // than via DefaultParser or similar.
    if (!hasTesseract(config)) {
        return;
    }

    final TemporaryResources resources = new TemporaryResources();
    try {
        final TikaInputStream tis = TikaInputStream.get(stream, resources);

        // Force spooling to a temp file in case the incoming stream was not
        // already a file-backed TikaInputStream.
        tis.getPath();

        // Output file name handed to the tesseract command line; the real
        // output file will carry an added suffix.
        final File ocrOutputBase = resources.createTemporaryFile();

        // Temporary workaround for TIKA-1445: until composite parsers with
        // strategies (eg Composite, Try In Turn) can be specified, always
        // forward the image to the regular parser so its metadata is
        // extracted as well.
        _TMP_IMAGE_METADATA_PARSER.parse(tis, new DefaultHandler(), metadata, parseContext);

        final XHTMLContentHandler out = new XHTMLContentHandler(handler, metadata);
        out.startDocument();
        parse(tis, ocrOutputBase, parseContext, out, config);
        out.endDocument();
    } finally {
        resources.dispose();
    }
}

From source file:org.apache.tika.parser.odf.OpenDocumentParser.java

/**
 * Routes one zip entry of the package to the appropriate handling, chosen by
 * entry name:
 * <ul>
 *   <li>{@code mimetype}: its text becomes the content type in {@code metadata};</li>
 *   <li>{@code META_NAME}: parsed by the {@code meta} parser with a no-op handler;</li>
 *   <li>names ending in {@code content.xml} or {@code styles.xml}: parsed by the
 *       {@code content} parser into {@code handler};</li>
 *   <li>names containing {@code Thumbnails/} or {@code Pictures/}: offered to the
 *       embedded document extractor;</li>
 *   <li>anything else, or a {@code null} entry: ignored.</li>
 * </ul>
 *
 * @param entry    the zip entry to handle; may be {@code null}
 * @param zip      stream positioned at the entry's data
 * @param metadata metadata of the container document
 * @param context  the parse context
 * @param handler  content handler receiving the document's SAX events
 */
private void handleZipEntry(ZipEntry entry, InputStream zip, Metadata metadata, ParseContext context,
        EndDocumentShieldingContentHandler handler) throws IOException, SAXException, TikaException {
    if (entry == null) {
        return;
    }

    final String name = entry.getName();

    if (name.equals("mimetype")) {
        metadata.set(Metadata.CONTENT_TYPE, IOUtils.toString(zip, UTF_8));
        return;
    }

    if (name.equals(META_NAME)) {
        meta.parse(zip, new DefaultHandler(), metadata, context);
        return;
    }

    if (name.endsWith("content.xml") || name.endsWith("styles.xml")) {
        if (content instanceof OpenDocumentContentParser) {
            ((OpenDocumentContentParser) content).parseInternal(zip, handler, metadata, context);
        } else {
            // Foreign content parser was set:
            content.parse(zip, handler, metadata, context);
        }
        return;
    }

    // Only entries under Thumbnails/ and Pictures/ are scraped as embedded resources.
    final boolean underPictures = name.contains("Pictures/");
    if (!underPictures && !name.contains("Thumbnails/")) {
        return;
    }

    final EmbeddedDocumentExtractor extractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
    final Metadata embeddedMetadata = new Metadata();
    embeddedMetadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, name);

    // NOTE: only Pictures/ entries are tagged with an embedded resource type;
    // Thumbnails/ entries are left untagged.
    if (underPictures) {
        embeddedMetadata.set(TikaMetadataKeys.EMBEDDED_RESOURCE_TYPE,
                TikaCoreProperties.EmbeddedResourceType.INLINE.toString());
    }

    if (extractor.shouldParseEmbedded(embeddedMetadata)) {
        extractor.parseEmbedded(zip, new EmbeddedContentHandler(handler), embeddedMetadata, false);
    }
}

From source file:org.apache.tika.parser.pkg.ZipParserTest.java

/**
 * Parses a small in-memory zip archive with the entry encoding set to SJIS
 * and checks that the single tracked file name is decoded as expected.
 */
@Test // TIKA-936
public void testCustomEncoding() throws Exception {
    final ArchiveStreamFactory sjisFactory = new ArchiveStreamFactory();
    sjisFactory.setEntryEncoding("SJIS");
    trackingContext.set(ArchiveStreamFactory.class, sjisFactory);

    // Base64-encoded zip archive used as the test fixture.
    final String zipBase64 = "UEsDBBQAAAAIAI+CvUCDo3+zIgAAACgAAAAOAAAAk/qWe4zqg4GDgi50"
            + "eHRr2tj0qulsc2pzRHN609Gm7Y1OvFxNYLHJv6ZV97yCiQEAUEsBAh"
            + "QLFAAAAAgAj4K9QIOjf7MiAAAAKAAAAA4AAAAAAAAAAAAgAAAAAAAA"
            + "AJP6lnuM6oOBg4IudHh0UEsFBgAAAAABAAEAPAAAAE4AAAAAAA==";

    try (InputStream zipStream = TikaInputStream.get(Base64.decodeBase64(zipBase64))) {
        autoDetectParser.parse(zipStream, new DefaultHandler(), new Metadata(), trackingContext);
    }

    assertEquals(1, tracker.filenames.size());
    assertEquals("\u65E5\u672C\u8A9E\u30E1\u30E2.txt", tracker.filenames.get(0));
}

From source file:org.apache.tika.parser.RecursiveParserWrapperTest.java

/**
 * Parses the recursive-embedded sample document with a text handler factory
 * constructed with the value 60 (presumably the per-document write limit in
 * characters; confirm against {@code BasicContentHandlerFactory}) and checks
 * that five metadata entries are produced, exactly one of which is flagged
 * with {@code WRITE_LIMIT_REACHED}.
 *
 * @throws Exception if the resource cannot be read or parsing fails
 */
@Test
public void testCharLimit() throws Exception {
    ParseContext context = new ParseContext();
    Metadata metadata = new Metadata();

    Parser wrapped = new AutoDetectParser();
    RecursiveParserWrapper wrapper = new RecursiveParserWrapper(wrapped,
            new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, 60));

    // The stream is closed here; previously it was never closed at all.
    try (InputStream stream = RecursiveParserWrapperTest.class
            .getResourceAsStream("/test-documents/test_recursive_embedded.docx")) {
        wrapper.parse(stream, new DefaultHandler(), metadata, context);
    }
    List<Metadata> list = wrapper.getMetadata();

    assertEquals(5, list.size());

    int wlr = 0;
    for (Metadata m : list) {
        String limitReached = m.get(RecursiveParserWrapper.WRITE_LIMIT_REACHED);
        // constant-first equals is null-safe, so no separate null check is needed
        if ("true".equals(limitReached)) {
            wlr++;
        }
    }
    assertEquals(1, wlr);

}

From source file:org.apache.tika.parser.RecursiveParserWrapperTest.java

/**
 * Exercises {@code setMaxEmbeddedResources} on {@link RecursiveParserWrapper}
 * in three scenarios against the same sample document:
 * <ol>
 *   <li>no limit set: all documents are returned and the limit flag is absent;</li>
 *   <li>limit of {@code maxEmbedded}: that many embedded documents plus the
 *       container are returned and the limit flag is {@code "true"};</li>
 *   <li>a negative limit: behaves like no limit.</li>
 * </ol>
 *
 * @throws Exception if the resource cannot be read or parsing fails
 */
@Test
public void testMaxEmbedded() throws Exception {
    final String testDoc = "/test-documents/test_recursive_embedded.docx";
    int maxEmbedded = 4;
    int totalNoLimit = 12;//including outer container file
    ParseContext context = new ParseContext();
    Metadata metadata = new Metadata();

    Parser wrapped = new AutoDetectParser();
    RecursiveParserWrapper wrapper = new RecursiveParserWrapper(wrapped,
            new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1));

    //test default
    try (InputStream stream = RecursiveParserWrapperTest.class.getResourceAsStream(testDoc)) {
        wrapper.parse(stream, new DefaultHandler(), metadata, context);
    }
    List<Metadata> list = wrapper.getMetadata();
    assertEquals(totalNoLimit, list.size());

    String limitReached = list.get(0).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_LIMIT_REACHED);
    assertNull(limitReached);

    wrapper.reset();

    //test setting value
    metadata = new Metadata();
    wrapper.setMaxEmbeddedResources(maxEmbedded);
    try (InputStream stream = RecursiveParserWrapperTest.class.getResourceAsStream(testDoc)) {
        wrapper.parse(stream, new DefaultHandler(), metadata, context);
    }
    list = wrapper.getMetadata();

    //add 1 for outer container file
    assertEquals(maxEmbedded + 1, list.size());

    limitReached = list.get(0).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_LIMIT_REACHED);
    assertEquals("true", limitReached);

    wrapper.reset();

    //test setting value < 0
    metadata = new Metadata();
    wrapper.setMaxEmbeddedResources(-2);
    try (InputStream stream = RecursiveParserWrapperTest.class.getResourceAsStream(testDoc)) {
        wrapper.parse(stream, new DefaultHandler(), metadata, context);
    }
    // Re-fetch the results: asserting on the previous reference is only correct
    // if getMetadata() hands back the wrapper's live internal list.
    list = wrapper.getMetadata();
    assertEquals(totalNoLimit, list.size());
    limitReached = list.get(0).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_LIMIT_REACHED);
    assertNull(limitReached);
}

From source file:org.apache.tika.parser.RecursiveParserWrapperTest.java

/**
 * Checks the ordering of results when embedded content is handled first and
 * the parser then fails in the container document: the first element of the
 * returned list must be the container document and the second the embedded
 * content.
 *
 * @throws Exception if the resource cannot be read or an unexpected failure occurs
 */
@Test
public void testPrimaryExcWEmbedded() throws Exception {
    Metadata metadata = new Metadata();
    metadata.set(Metadata.RESOURCE_NAME_KEY, "embedded_then_npe.xml");

    ParseContext context = new ParseContext();
    Parser wrapped = new AutoDetectParser();
    RecursiveParserWrapper wrapper = new RecursiveParserWrapper(wrapped,
            new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1), true);
    String path = "/test-documents/mock/embedded_then_npe.xml";

    boolean npe = false;
    try (InputStream stream = RecursiveParserWrapperTest.class.getResourceAsStream(path)) {
        wrapper.parse(stream, new DefaultHandler(), metadata, context);
    } catch (TikaException e) {
        // instanceof is false for a null cause, so a TikaException without a
        // cause fails the "npe" assertion below instead of throwing an
        // unrelated NullPointerException from this catch block.
        if (e.getCause() instanceof NullPointerException) {
            npe = true;
        }
    }
    assertTrue("npe", npe);

    List<Metadata> metadataList = wrapper.getMetadata();
    assertEquals(2, metadataList.size());
    Metadata outerMetadata = metadataList.get(0);
    Metadata embeddedMetadata = metadataList.get(1);
    assertContains("main_content", outerMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
    assertEquals("embedded_then_npe.xml", outerMetadata.get(TikaMetadataKeys.RESOURCE_NAME_KEY));
    assertEquals("Nikolai Lobachevsky", outerMetadata.get("author"));

    assertContains("some_embedded_content", embeddedMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
    assertEquals("embed1.xml", embeddedMetadata.get(TikaMetadataKeys.RESOURCE_NAME_KEY));
    assertEquals("embeddedAuthor", embeddedMetadata.get("author"));
}

From source file:org.apache.tika.parser.RecursiveParserWrapperTest.java

/**
 * Parses a test document through a {@link RecursiveParserWrapper} and returns
 * the metadata list it collected.
 *
 * @param metadata                metadata passed to the parse; its
 *                                {@code RESOURCE_NAME_KEY} value, if present, names the
 *                                file under {@code /test-documents/} to parse, otherwise
 *                                {@code test_recursive_embedded.docx} is used
 * @param contentHandlerFactory   factory handed to the wrapper
 * @param catchEmbeddedExceptions flag handed to the wrapper
 * @param digester                when non-null, the base parser is wrapped in a
 *                                {@link DigestingParser} using this digester
 * @return the wrapper's metadata list after parsing
 */
private List<Metadata> getMetadata(Metadata metadata, ContentHandlerFactory contentHandlerFactory,
        boolean catchEmbeddedExceptions, DigestingParser.Digester digester) throws Exception {
    final ParseContext parseContext = new ParseContext();

    final Parser autoDetect = new AutoDetectParser();
    final Parser toWrap = (digester == null) ? autoDetect : new DigestingParser(autoDetect, digester);
    final RecursiveParserWrapper recursive = new RecursiveParserWrapper(toWrap, contentHandlerFactory,
            catchEmbeddedExceptions);

    final String resourceName = metadata.get(Metadata.RESOURCE_NAME_KEY);
    final String resourcePath = (resourceName == null)
            ? "/test-documents/test_recursive_embedded.docx"
            : "/test-documents/" + resourceName;

    InputStream in = null;
    try {
        in = TikaInputStream.get(RecursiveParserWrapperTest.class.getResource(resourcePath).toURI());
        recursive.parse(in, new DefaultHandler(), metadata, parseContext);
    } finally {
        IOUtils.closeQuietly(in);
    }
    return recursive.getMetadata();

}

From source file:org.apache.tika.server.MetadataEP.java

/**
 * Parses the posted input stream and responds with the collected metadata in
 * a 200 OK response. Any exception raised while parsing propagates to the
 * caller.
 *
 * @param is
 *          an input stream
 * @return the metadata
 * @throws Exception
 *           if the input stream cannot be parsed
 */
@POST
public Response getMetadata(InputStream is) throws Exception {
    // Only metadata is of interest, so content events go to a no-op handler.
    final DefaultHandler discardContent = new DefaultHandler();
    parser.parse(is, discardContent, metadata);
    return Response.ok(metadata).build();
}

From source file:org.apache.tika.server.MetadataEP.java

/**
 * Get a specific TIKA metadata field as a simple text string. If the field is
 * multivalued, then only the first value is returned. If the input stream
 * cannot be parsed, but a value was found for the given metadata field, then
 * the value of the field is returned as part of a 200 OK response; otherwise
 * a {@link Status#BAD_REQUEST} is generated. If the stream was successfully
 * parsed but the specific metadata field was not found, then a
 * {@link Status#NOT_FOUND} is returned.
 * <p>/*  w w w .j  a  va  2s . co  m*/
 * 
 * @param field
 *          the tika metadata field name
 * @param is
 *          the document stream
 * @return one of {@link Status#OK}, {@link Status#NOT_FOUND}, or
 *         {@link Status#BAD_REQUEST}
 * @throws Exception
 */
@POST
@Path("{field}")
@Produces(MediaType.TEXT_PLAIN)
public Response getSimpleMetadataField(@PathParam("field") String field, InputStream is) throws Exception {

    // use BAD request to indicate that we may not have had enough data to
    // process the request
    Status defaultErrorResponse = Status.BAD_REQUEST;
    try {
        parser.parse(is, new DefaultHandler(), metadata);
        // once we've parsed the document successfully, we should use NOT_FOUND
        // if we did not see the field
        defaultErrorResponse = Status.NOT_FOUND;
    } catch (Exception e) {
        logger.info("Failed to process field " + field, e);
    }
    String value = metadata.get(field);
    if (value == null) {
        return Response.status(defaultErrorResponse).entity("Failed to get metadata field " + field).build();
    }
    return Response.ok(value, MediaType.TEXT_PLAIN_TYPE).build();
}