Example usage for org.xml.sax.helpers DefaultHandler DefaultHandler

List of usage examples for org.xml.sax.helpers DefaultHandler DefaultHandler

Introduction

On this page you can find usage examples for the DefaultHandler() constructor of org.xml.sax.helpers.DefaultHandler.

Prototype

DefaultHandler

Source Link

Usage

From source file:org.apache.tika.parser.microsoft.JackcessParserTest.java

/**
 * Parses each sample Access database through a {@link RecursiveParserWrapper} that
 * produces XML content, then checks the container's content and the content of the
 * fourth metadata entry.
 */
@Test
public void testBasic() throws Exception {
    Parser autoDetect = new AutoDetectParser();
    RecursiveParserWrapper wrapper = new RecursiveParserWrapper(autoDetect,
            new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1));

    String[] sampleFiles = { "testAccess2.accdb", "testAccess2_2000.mdb", "testAccess2_2002-2003.mdb" };
    for (String sampleFile : sampleFiles) {
        InputStream input = null;
        try {
            input = this.getResourceAsStream("/test-documents/" + sampleFile);
            Metadata containerMetadata = new Metadata();
            ParseContext parseContext = new ParseContext();
            wrapper.parse(input, new DefaultHandler(), containerMetadata, parseContext);
        } finally {
            IOUtils.closeQuietly(input);
        }

        List<Metadata> results = wrapper.getMetadata();
        assertEquals(4, results.size());

        // Content extracted for the first entry in the result list.
        String containerXml = results.get(0).get(RecursiveParserWrapper.TIKA_CONTENT);

        // the table must have both a thead and a tbody
        assertContains("</thead><tbody>", containerXml);
        // table header cell
        assertContains("<th>ShortTextField</th>", containerXml);
        // date format
        assertContains("6/24/15", containerXml);
        // markup inside the text is stripped
        assertContains("over the bold italic dog", containerXml);
        // unicode content
        assertContains("\u666E\u6797\u65AF\u987F\u5927\u5B66", containerXml);

        // embedded document handling
        String embeddedXml = results.get(3).get(RecursiveParserWrapper.TIKA_CONTENT);
        assertContains("Test Document with embedded pdf", embeddedXml);

        // clear the wrapper's state before the next file
        wrapper.reset();
    }
}

From source file:org.apache.tika.parser.ocr.TesseractOCRParser.java

/**
 * Extracts image metadata with the regular image parser and then runs the OCR parse,
 * emitting the result as XHTML to {@code handler}. Returns without doing anything when
 * {@code hasTesseract(config)} is false.
 *
 * @param stream       the document stream
 * @param handler      receives the XHTML events
 * @param metadata     metadata passed to the image parser and the XHTML handler
 * @param parseContext source of the {@link TesseractOCRConfig}; {@code DEFAULT_CONFIG}
 *                     is used when none is set
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext parseContext)
        throws IOException, SAXException, TikaException {
    TesseractOCRConfig config = parseContext.get(TesseractOCRConfig.class, DEFAULT_CONFIG);

    // If Tesseract is not on the path with the current config, do not try to run OCR.
    // getSupportedTypes shouldn't have listed us as handling it, so this should only
    // occur if someone directly calls this parser, not via DefaultParser or similar.
    if (!hasTesseract(config)) {
        return;
    }

    TemporaryResources tempResources = new TemporaryResources();
    try {
        TikaInputStream spooledStream = TikaInputStream.get(stream, tempResources);

        // Trigger the spooling to a tmp file if the stream wasn't
        // already a TikaInputStream that contained a file.
        spooledStream.getPath();

        // This is the text output file name specified on the tesseract
        // command line. The actual output file name will have a suffix added.
        File ocrOutputFile = tempResources.createTemporaryFile();

        // Temporary workaround for TIKA-1445 - until we can specify
        // composite parsers with strategies (eg Composite, Try In Turn),
        // always send the image onwards to the regular parser to have
        // the metadata for them extracted as well.
        _TMP_IMAGE_METADATA_PARSER.parse(spooledStream, new DefaultHandler(), metadata, parseContext);

        XHTMLContentHandler xhtmlHandler = new XHTMLContentHandler(handler, metadata);
        xhtmlHandler.startDocument();
        parse(spooledStream, ocrOutputFile, parseContext, xhtmlHandler, config);
        xhtmlHandler.endDocument();
    } finally {
        tempResources.dispose();
    }
}

From source file:org.apache.tika.parser.odf.OpenDocumentParser.java

/**
 * Routes one zip entry to the matching handling based on the entry name:
 * the {@code mimetype} entry sets the content type, the {@code META_NAME} entry goes to
 * the {@code meta} parser, {@code content.xml} / {@code styles.xml} go to the
 * {@code content} parser, and anything under {@code Thumbnails/} or {@code Pictures/}
 * is offered to the embedded document extractor. Other entries are ignored, as is a
 * {@code null} entry.
 */
private void handleZipEntry(ZipEntry entry, InputStream zip, Metadata metadata, ParseContext context,
        EndDocumentShieldingContentHandler handler) throws IOException, SAXException, TikaException {
    if (entry == null) {
        return;
    }

    String entryName = entry.getName();

    if (entryName.equals("mimetype")) {
        metadata.set(Metadata.CONTENT_TYPE, IOUtils.toString(zip, UTF_8));
        return;
    }

    if (entryName.equals(META_NAME)) {
        meta.parse(zip, new DefaultHandler(), metadata, context);
        return;
    }

    if (entryName.endsWith("content.xml") || entryName.endsWith("styles.xml")) {
        if (content instanceof OpenDocumentContentParser) {
            ((OpenDocumentContentParser) content).parseInternal(zip, handler, metadata, context);
        } else {
            // Foreign content parser was set:
            content.parse(zip, handler, metadata, context);
        }
        return;
    }

    // scrape everything under Thumbnails/ and Pictures/
    boolean isPicture = entryName.contains("Pictures/");
    if (!entryName.contains("Thumbnails/") && !isPicture) {
        return;
    }

    EmbeddedDocumentExtractor extractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
    Metadata embeddedMetadata = new Metadata();
    embeddedMetadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, entryName);
    /* if (embeddedName.startsWith("Thumbnails/")) {
    embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
            TikaCoreProperties.EmbeddedResourceType.THUMBNAIL);
    }*/
    if (isPicture) {
        embeddedMetadata.set(TikaMetadataKeys.EMBEDDED_RESOURCE_TYPE,
                TikaCoreProperties.EmbeddedResourceType.INLINE.toString());
    }
    if (extractor.shouldParseEmbedded(embeddedMetadata)) {
        extractor.parseEmbedded(zip, new EmbeddedContentHandler(handler), embeddedMetadata, false);
    }
}

From source file:org.apache.tika.parser.pkg.ZipParserTest.java

/**
 * Parses a small Base64-encoded zip with an {@link ArchiveStreamFactory} whose entry
 * encoding is set to SJIS, and checks the single file name recorded by the tracker.
 */
@Test // TIKA-936
public void testCustomEncoding() throws Exception {
    ArchiveStreamFactory sjisFactory = new ArchiveStreamFactory();
    sjisFactory.setEntryEncoding("SJIS");
    trackingContext.set(ArchiveStreamFactory.class, sjisFactory);

    // Base64 form of the zip archive used as test input.
    String encodedZip = "UEsDBBQAAAAIAI+CvUCDo3+zIgAAACgAAAAOAAAAk/qWe4zqg4GDgi50"
            + "eHRr2tj0qulsc2pzRHN609Gm7Y1OvFxNYLHJv6ZV97yCiQEAUEsBAh"
            + "QLFAAAAAgAj4K9QIOjf7MiAAAAKAAAAA4AAAAAAAAAAAAgAAAAAAAA"
            + "AJP6lnuM6oOBg4IudHh0UEsFBgAAAAABAAEAPAAAAE4AAAAAAA==";

    try (InputStream zipStream = TikaInputStream.get(Base64.decodeBase64(encodedZip))) {
        autoDetectParser.parse(zipStream, new DefaultHandler(), new Metadata(), trackingContext);
    }

    assertEquals(1, tracker.filenames.size());
    assertEquals("\u65E5\u672C\u8A9E\u30E1\u30E2.txt", tracker.filenames.get(0));
}

From source file:org.apache.tika.parser.RecursiveParserWrapperTest.java

/**
 * Parses the recursive-embedded sample with a text handler limited to 60 characters and
 * checks that five metadata entries are produced, exactly one of which reports that the
 * write limit was reached.
 */
@Test
public void testCharLimit() throws Exception {
    ParseContext context = new ParseContext();
    Metadata metadata = new Metadata();

    Parser wrapped = new AutoDetectParser();
    RecursiveParserWrapper wrapper = new RecursiveParserWrapper(wrapped,
            new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, 60));

    // Close the resource stream even when parsing fails; the original never closed it.
    InputStream stream = null;
    try {
        stream = RecursiveParserWrapperTest.class
                .getResourceAsStream("/test-documents/test_recursive_embedded.docx");
        wrapper.parse(stream, new DefaultHandler(), metadata, context);
    } finally {
        IOUtils.closeQuietly(stream);
    }
    List<Metadata> list = wrapper.getMetadata();

    assertEquals(5, list.size());

    // Count the entries flagged as having hit the write limit.
    int wlr = 0;
    for (Metadata m : list) {
        // constant-first equals is null-safe, so no separate null check is needed
        if ("true".equals(m.get(RecursiveParserWrapper.WRITE_LIMIT_REACHED))) {
            wlr++;
        }
    }
    assertEquals(1, wlr);
}

From source file:org.apache.tika.parser.RecursiveParserWrapperTest.java

/**
 * Checks the embedded-resource limit of {@link RecursiveParserWrapper} in three passes
 * over the same document: with the default setting, with a limit of {@code maxEmbedded},
 * and with a negative limit (expected to behave like the default).
 */
@Test
public void testMaxEmbedded() throws Exception {
    final String docPath = "/test-documents/test_recursive_embedded.docx";
    int maxEmbedded = 4;
    int totalNoLimit = 12;//including outer container file
    ParseContext context = new ParseContext();
    Metadata metadata = new Metadata();
    String limitReached = null;

    Parser wrapped = new AutoDetectParser();
    RecursiveParserWrapper wrapper = new RecursiveParserWrapper(wrapped,
            new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1));

    //test default
    InputStream stream = null;
    try {
        stream = RecursiveParserWrapperTest.class.getResourceAsStream(docPath);
        wrapper.parse(stream, new DefaultHandler(), metadata, context);
    } finally {
        IOUtils.closeQuietly(stream);
    }
    List<Metadata> list = wrapper.getMetadata();
    assertEquals(totalNoLimit, list.size());

    limitReached = list.get(0).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_LIMIT_REACHED);
    assertNull(limitReached);

    wrapper.reset();

    //test setting value
    metadata = new Metadata();
    wrapper.setMaxEmbeddedResources(maxEmbedded);
    stream = null;
    try {
        stream = RecursiveParserWrapperTest.class.getResourceAsStream(docPath);
        wrapper.parse(stream, new DefaultHandler(), metadata, context);
    } finally {
        IOUtils.closeQuietly(stream);
    }
    list = wrapper.getMetadata();

    //add 1 for outer container file
    assertEquals(maxEmbedded + 1, list.size());

    limitReached = list.get(0).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_LIMIT_REACHED);
    assertEquals("true", limitReached);

    wrapper.reset();

    //test setting value < 0
    metadata = new Metadata();
    wrapper.setMaxEmbeddedResources(-2);
    stream = null;
    try {
        stream = RecursiveParserWrapperTest.class.getResourceAsStream(docPath);
        wrapper.parse(stream, new DefaultHandler(), metadata, context);
    } finally {
        IOUtils.closeQuietly(stream);
    }
    // Re-fetch the results explicitly: the original reused the variable from the previous
    // pass, which only checks this parse if getMetadata() hands out a live list.
    list = wrapper.getMetadata();
    assertEquals(totalNoLimit, list.size());
    limitReached = list.get(0).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_LIMIT_REACHED);
    assertNull(limitReached);
}

From source file:org.apache.tika.parser.RecursiveParserWrapperTest.java

/**
 * If embedded content is handled and the parser then hits an exception in the container
 * document, the first element of the returned list must be the container document and
 * the second the embedded content.
 */
@Test
public void testPrimaryExcWEmbedded() throws Exception {
    final String resourcePath = "/test-documents/mock/embedded_then_npe.xml";

    Metadata containerMetadata = new Metadata();
    containerMetadata.set(Metadata.RESOURCE_NAME_KEY, "embedded_then_npe.xml");

    ParseContext parseContext = new ParseContext();
    Parser autoDetect = new AutoDetectParser();
    RecursiveParserWrapper wrapper = new RecursiveParserWrapper(autoDetect,
            new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1), true);

    boolean sawNpe = false;
    InputStream input = null;
    try {
        input = RecursiveParserWrapperTest.class.getResourceAsStream(resourcePath);
        wrapper.parse(input, new DefaultHandler(), containerMetadata, parseContext);
    } catch (TikaException e) {
        // only an exception whose cause is exactly a NullPointerException counts
        Class<?> causeType = e.getCause().getClass();
        if (causeType.equals(NullPointerException.class)) {
            sawNpe = true;
        }
    } finally {
        IOUtils.closeQuietly(input);
    }
    assertTrue("npe", sawNpe);

    List<Metadata> results = wrapper.getMetadata();
    assertEquals(2, results.size());

    // container document comes first
    Metadata outer = results.get(0);
    assertContains("main_content", outer.get(RecursiveParserWrapper.TIKA_CONTENT));
    assertEquals("embedded_then_npe.xml", outer.get(TikaMetadataKeys.RESOURCE_NAME_KEY));
    assertEquals("Nikolai Lobachevsky", outer.get("author"));

    // embedded document comes second
    Metadata embedded = results.get(1);
    assertContains("some_embedded_content", embedded.get(RecursiveParserWrapper.TIKA_CONTENT));
    assertEquals("embed1.xml", embedded.get(TikaMetadataKeys.RESOURCE_NAME_KEY));
    assertEquals("embeddedAuthor", embedded.get("author"));
}

From source file:org.apache.tika.parser.RecursiveParserWrapperTest.java

/**
 * Parses a test document through a {@link RecursiveParserWrapper} and returns the
 * wrapper's metadata list.
 *
 * @param metadata                metadata passed to the parse; its resource name, when
 *                                set, selects the file under {@code /test-documents/}
 *                                (otherwise {@code test_recursive_embedded.docx} is used)
 * @param contentHandlerFactory   factory handed to the wrapper
 * @param catchEmbeddedExceptions flag handed to the wrapper
 * @param digester                when non-null, the parser is wrapped in a
 *                                {@link DigestingParser} using this digester
 * @return the list returned by {@code wrapper.getMetadata()}
 */
private List<Metadata> getMetadata(Metadata metadata, ContentHandlerFactory contentHandlerFactory,
        boolean catchEmbeddedExceptions, DigestingParser.Digester digester) throws Exception {
    ParseContext parseContext = new ParseContext();

    Parser baseParser = new AutoDetectParser();
    Parser parserToWrap = (digester == null) ? baseParser : new DigestingParser(baseParser, digester);
    RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parserToWrap, contentHandlerFactory,
            catchEmbeddedExceptions);

    String resourceName = metadata.get(Metadata.RESOURCE_NAME_KEY);
    String resourcePath = (resourceName == null)
            ? "/test-documents/test_recursive_embedded.docx"
            : "/test-documents/" + resourceName;

    InputStream input = null;
    try {
        input = TikaInputStream.get(RecursiveParserWrapperTest.class.getResource(resourcePath).toURI());
        wrapper.parse(input, new DefaultHandler(), metadata, parseContext);
    } finally {
        IOUtils.closeQuietly(input);
    }
    return wrapper.getMetadata();
}

From source file:org.apache.tika.server.MetadataEP.java

/**
 * Get all metadata that can be parsed from the specified input stream. An
 * error is produced if the input stream cannot be parsed.
 * <p>
 * The stream is handed to {@code parser} together with a no-op
 * {@code DefaultHandler}, so only the values written into {@code metadata}
 * are used for the response ({@code parser} and {@code metadata} are declared
 * outside this snippet).
 *
 * @param is
 *          an input stream
 * @return an OK response whose entity is the metadata
 * @throws Exception
 *           if parsing fails; the exception is not caught here
 */
@POST
public Response getMetadata(InputStream is) throws Exception {
    parser.parse(is, new DefaultHandler(), metadata);
    return Response.ok(metadata).build();
}

From source file:org.apache.tika.server.MetadataEP.java

/**
 * Get a specific TIKA metadata field as a simple text string. If the field is
 * multivalued, then only the first value is returned. If the input stream
 * cannot be parsed, but a value was found for the given metadata field, then
 * the value of the field is returned as part of a 200 OK response; otherwise
 * a {@link Status#BAD_REQUEST} is generated. If the stream was successfully
 * parsed but the specific metadata field was not found, then a
 * {@link Status#NOT_FOUND} is returned.
 * <p>/*  w w w .j  a  va  2s . co  m*/
 * 
 * @param field
 *          the tika metadata field name
 * @param is
 *          the document stream
 * @return one of {@link Status#OK}, {@link Status#NOT_FOUND}, or
 *         {@link Status#BAD_REQUEST}
 * @throws Exception
 */
@POST
@Path("{field}")
@Produces(MediaType.TEXT_PLAIN)
public Response getSimpleMetadataField(@PathParam("field") String field, InputStream is) throws Exception {

    // use BAD request to indicate that we may not have had enough data to
    // process the request
    Status defaultErrorResponse = Status.BAD_REQUEST;
    try {
        parser.parse(is, new DefaultHandler(), metadata);
        // once we've parsed the document successfully, we should use NOT_FOUND
        // if we did not see the field
        defaultErrorResponse = Status.NOT_FOUND;
    } catch (Exception e) {
        logger.info("Failed to process field " + field, e);
    }
    String value = metadata.get(field);
    if (value == null) {
        return Response.status(defaultErrorResponse).entity("Failed to get metadata field " + field).build();
    }
    return Response.ok(value, MediaType.TEXT_PLAIN_TYPE).build();
}