org.apache.tika.parser.RecursiveParserWrapperTest.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.tika.parser.RecursiveParserWrapperTest.java

Source

package org.apache.tika.parser;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import static org.apache.tika.TikaTest.assertContains;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;

import java.io.InputStream;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.commons.io.IOUtils;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaMetadataKeys;
import org.apache.tika.parser.utils.CommonsDigester;
import org.apache.tika.sax.BasicContentHandlerFactory;
import org.apache.tika.sax.ContentHandlerFactory;
import org.junit.Test;
import org.xml.sax.helpers.DefaultHandler;

/**
 * Tests for {@link RecursiveParserWrapper}.
 * <p>
 * Each test wraps an {@link AutoDetectParser} in a {@link RecursiveParserWrapper}, parses a
 * document from the <code>/test-documents</code> classpath directory and then inspects the
 * list of {@link Metadata} objects returned by {@link RecursiveParserWrapper#getMetadata()}.
 * The tests treat the first element of that list as the container document.
 */
public class RecursiveParserWrapperTest {

    /** Classpath location of the document used by most tests. */
    private static final String RECURSIVE_EMBEDDED_DOCX = "/test-documents/test_recursive_embedded.docx";

    /**
     * With the XML handler, the container's content must contain a self-closing
     * header paragraph.
     */
    @Test
    public void testBasicXML() throws Exception {
        List<Metadata> list = getMetadata(new Metadata(),
                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1));
        Metadata container = list.get(0);
        String content = container.get(RecursiveParserWrapper.TIKA_CONTENT);
        //not much differentiates html from xml in this test file
        assertContains("<p class=\"header\" />", content);
    }

    /**
     * With the HTML handler, the container's content must contain the header
     * paragraph written with an explicit closing tag.
     */
    @Test
    public void testBasicHTML() throws Exception {
        List<Metadata> list = getMetadata(new Metadata(),
                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.HTML, -1));
        Metadata container = list.get(0);
        String content = container.get(RecursiveParserWrapper.TIKA_CONTENT);
        //not much differentiates html from xml in this test file
        assertContains("<p class=\"header\"></p>", content);
    }

    /**
     * With the TEXT handler, the container's content must contain no paragraph markup
     * but must still contain the text "embed_0".
     */
    @Test
    public void testBasicText() throws Exception {
        List<Metadata> list = getMetadata(new Metadata(),
                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1));
        Metadata container = list.get(0);
        String content = container.get(RecursiveParserWrapper.TIKA_CONTENT);
        assertTrue("unexpected markup in text content", content.indexOf("<p ") < 0);
        assertContains("embed_0", content);
    }

    /**
     * With the IGNORE handler, no content must be stored for the container.
     */
    @Test
    public void testIgnoreContent() throws Exception {
        List<Metadata> list = getMetadata(new Metadata(),
                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1));
        Metadata container = list.get(0);
        String content = container.get(RecursiveParserWrapper.TIKA_CONTENT);
        assertNull(content);
    }

    /**
     * With a write limit of 60 on the TEXT handler, the test expects five metadata
     * objects, exactly one of which is flagged as having reached the write limit.
     */
    @Test
    public void testCharLimit() throws Exception {
        ParseContext context = new ParseContext();
        Metadata metadata = new Metadata();

        Parser wrapped = new AutoDetectParser();
        RecursiveParserWrapper wrapper = new RecursiveParserWrapper(wrapped,
                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, 60));
        List<Metadata> list = parseClasspathResource(wrapper, RECURSIVE_EMBEDDED_DOCX, metadata, context);

        assertEquals(5, list.size());

        int wlr = 0;
        for (Metadata m : list) {
            // constant-first equals() also covers a missing (null) value
            if ("true".equals(m.get(RecursiveParserWrapper.WRITE_LIMIT_REACHED))) {
                wlr++;
            }
        }
        assertEquals(1, wlr);
    }

    /**
     * Exercises {@link RecursiveParserWrapper#setMaxEmbeddedResources(int)} in three phases
     * on the same wrapper: the default (no limit), an explicit limit, and a negative value,
     * which the test expects to behave like "no limit".
     */
    @Test
    public void testMaxEmbedded() throws Exception {
        int maxEmbedded = 4;
        int totalNoLimit = 12;//including outer container file
        ParseContext context = new ParseContext();

        Parser wrapped = new AutoDetectParser();
        RecursiveParserWrapper wrapper = new RecursiveParserWrapper(wrapped,
                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1));

        //test default
        List<Metadata> list = parseClasspathResource(wrapper, RECURSIVE_EMBEDDED_DOCX, new Metadata(), context);
        assertEquals(totalNoLimit, list.size());
        assertNull(list.get(0).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_LIMIT_REACHED));
        wrapper.reset();

        //test setting value
        wrapper.setMaxEmbeddedResources(maxEmbedded);
        list = parseClasspathResource(wrapper, RECURSIVE_EMBEDDED_DOCX, new Metadata(), context);
        //add 1 for outer container file
        assertEquals(maxEmbedded + 1, list.size());
        assertEquals("true", list.get(0).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_LIMIT_REACHED));
        wrapper.reset();

        //test setting value < 0
        wrapper.setMaxEmbeddedResources(-2);
        //re-read the result list explicitly instead of asserting on a reference
        //that was obtained before this parse
        list = parseClasspathResource(wrapper, RECURSIVE_EMBEDDED_DOCX, new Metadata(), context);
        assertEquals(totalNoLimit, list.size());
        assertNull(list.get(0).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_LIMIT_REACHED));
    }

    /**
     * The set of embedded resource paths recorded across all metadata objects must
     * equal the expected set of paths.
     */
    @Test
    public void testEmbeddedResourcePath() throws Exception {

        Set<String> targets = new HashSet<String>();
        targets.add("/embed1.zip");
        targets.add("/embed1.zip/embed2.zip");
        targets.add("/embed1.zip/embed2.zip/embed3.zip");
        targets.add("/embed1.zip/embed2.zip/embed3.zip/embed4.zip");
        targets.add("/embed1.zip/embed2.zip/embed3.zip/embed4.zip/embed4.txt");
        targets.add("/embed1.zip/embed2.zip/embed3.zip/embed3.txt");
        targets.add("/embed1.zip/embed2.zip/embed2a.txt");
        targets.add("/embed1.zip/embed2.zip/embed2b.txt");
        targets.add("/embed1.zip/embed1b.txt");
        targets.add("/embed1.zip/embed1a.txt");
        targets.add("/image1.emf");

        Metadata metadata = new Metadata();
        metadata.set(Metadata.RESOURCE_NAME_KEY, "test_recursive_embedded.docx");
        List<Metadata> list = getMetadata(metadata,
                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1));
        Metadata container = list.get(0);
        String content = container.get(RecursiveParserWrapper.TIKA_CONTENT);
        assertContains("<p class=\"header\" />", content);

        Set<String> seen = new HashSet<String>();
        for (Metadata m : list) {
            String path = m.get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH);
            //entries without a recorded path are skipped
            if (path != null) {
                seen.add(path);
            }
        }
        assertEquals(targets, seen);
    }

    /**
     * Parses a document with an embedded file that triggers a NullPointerException,
     * once catching embedded exceptions and once not catching them, and checks the
     * number of metadata objects returned in each case.
     */
    @Test
    public void testEmbeddedNPE() throws Exception {
        Metadata metadata = new Metadata();
        metadata.set(Metadata.RESOURCE_NAME_KEY, "test_recursive_embedded_npe.docx");
        List<Metadata> list = getMetadata(metadata,
                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1));
        //default behavior (when the user doesn't specify whether or not to catch
        //embedded exceptions) is to catch the exception
        assertEquals(13, list.size());
        Metadata mockNPEMetadata = list.get(10);
        assertContains("java.lang.NullPointerException",
                mockNPEMetadata.get(RecursiveParserWrapper.EMBEDDED_EXCEPTION));

        metadata = new Metadata();
        metadata.set(Metadata.RESOURCE_NAME_KEY, "test_recursive_embedded_npe.docx");
        list = getMetadata(metadata,
                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1), false, null);

        //Composite parser swallows caught TikaExceptions, IOExceptions and SAXExceptions
        //and just doesn't bother to report that there was an exception.
        assertEquals(12, list.size());
    }

    /**
     * Verifies that when embedded content is handled and the parser then hits an
     * exception in the container document, the first element of the returned list
     * is still the container document and the second is the embedded content.
     */
    @Test
    public void testPrimaryExcWEmbedded() throws Exception {
        Metadata metadata = new Metadata();
        metadata.set(Metadata.RESOURCE_NAME_KEY, "embedded_then_npe.xml");

        ParseContext context = new ParseContext();
        Parser wrapped = new AutoDetectParser();
        RecursiveParserWrapper wrapper = new RecursiveParserWrapper(wrapped,
                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1), true);
        String path = "/test-documents/mock/embedded_then_npe.xml";

        InputStream stream = null;
        boolean npe = false;
        try {
            stream = RecursiveParserWrapperTest.class.getResourceAsStream(path);
            wrapper.parse(stream, new DefaultHandler(), metadata, context);
        } catch (TikaException e) {
            //instanceof is null-safe: a TikaException without a cause must fail the
            //assertion below rather than raise a second NPE in this catch block
            if (e.getCause() instanceof NullPointerException) {
                npe = true;
            }
        } finally {
            IOUtils.closeQuietly(stream);
        }
        assertTrue("npe", npe);

        List<Metadata> metadataList = wrapper.getMetadata();
        assertEquals(2, metadataList.size());
        Metadata outerMetadata = metadataList.get(0);
        Metadata embeddedMetadata = metadataList.get(1);
        assertContains("main_content", outerMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
        assertEquals("embedded_then_npe.xml", outerMetadata.get(TikaMetadataKeys.RESOURCE_NAME_KEY));
        assertEquals("Nikolai Lobachevsky", outerMetadata.get("author"));

        assertContains("some_embedded_content", embeddedMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
        assertEquals("embed1.xml", embeddedMetadata.get(TikaMetadataKeys.RESOURCE_NAME_KEY));
        assertEquals("embeddedAuthor", embeddedMetadata.get("author"));
    }

    /**
     * With the wrapped parser decorated by a {@link DigestingParser} computing MD5,
     * checks the digest values stored for the container and for the entries at
     * indices 6 and 7 of the result list.
     */
    @Test
    public void testDigesters() throws Exception {
        Metadata metadata = new Metadata();
        metadata.set(Metadata.RESOURCE_NAME_KEY, "test_recursive_embedded.docx");
        List<Metadata> list = getMetadata(metadata,
                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1), true,
                new CommonsDigester(100000, CommonsDigester.DigestAlgorithm.MD5));
        String md5Key = "X-TIKA:digest:MD5";
        assertEquals("59f626e09a8c16ab6dbc2800c685f772", list.get(0).get(md5Key));
        assertEquals("ccdf3882e7e4c2454e28884db9b0a54d", list.get(6).get(md5Key));
        assertEquals("a869bf6432ebd14e19fc79416274e0c9", list.get(7).get(md5Key));
    }

    /**
     * Parses a classpath resource through the given wrapper and returns the wrapper's
     * metadata list. The stream is always closed, whether or not parsing succeeds.
     *
     * @param wrapper      wrapper to parse with; its configuration is left untouched
     * @param resourcePath absolute classpath location of the document to parse
     * @param metadata     metadata object handed to the parse call
     * @param context      parse context handed to the parse call
     * @return the list returned by {@link RecursiveParserWrapper#getMetadata()} after the parse
     * @throws Exception if parsing fails
     */
    private static List<Metadata> parseClasspathResource(RecursiveParserWrapper wrapper, String resourcePath,
            Metadata metadata, ParseContext context) throws Exception {
        InputStream stream = null;
        try {
            stream = RecursiveParserWrapperTest.class.getResourceAsStream(resourcePath);
            wrapper.parse(stream, new DefaultHandler(), metadata, context);
        } finally {
            IOUtils.closeQuietly(stream);
        }
        return wrapper.getMetadata();
    }

    /**
     * Builds a fresh {@link RecursiveParserWrapper} around an {@link AutoDetectParser}
     * and parses a test document with it.
     * <p>
     * The document is chosen by the resource name stored in <code>metadata</code>: the name
     * is resolved relative to <code>/test-documents/</code>, and if no name is set the
     * default recursive-embedded docx is used.
     *
     * @param metadata                metadata handed to the parser; may carry the resource name
     * @param contentHandlerFactory   factory for the handlers used by the wrapper
     * @param catchEmbeddedExceptions passed through to the wrapper's constructor
     * @param digester                if non-null, the wrapped parser is decorated with a
     *                                {@link DigestingParser} using this digester
     * @return the list returned by {@link RecursiveParserWrapper#getMetadata()} after the parse
     * @throws Exception if parsing fails
     */
    private List<Metadata> getMetadata(Metadata metadata, ContentHandlerFactory contentHandlerFactory,
            boolean catchEmbeddedExceptions, DigestingParser.Digester digester) throws Exception {
        ParseContext context = new ParseContext();
        Parser wrapped = new AutoDetectParser();
        if (digester != null) {
            wrapped = new DigestingParser(wrapped, digester);
        }
        RecursiveParserWrapper wrapper = new RecursiveParserWrapper(wrapped, contentHandlerFactory,
                catchEmbeddedExceptions);
        String path = metadata.get(Metadata.RESOURCE_NAME_KEY);
        if (path == null) {
            path = RECURSIVE_EMBEDDED_DOCX;
        } else {
            path = "/test-documents/" + path;
        }
        InputStream stream = null;
        try {
            stream = TikaInputStream.get(RecursiveParserWrapperTest.class.getResource(path).toURI());
            wrapper.parse(stream, new DefaultHandler(), metadata, context);
        } finally {
            IOUtils.closeQuietly(stream);
        }
        return wrapper.getMetadata();
    }

    /**
     * Convenience overload: catches embedded exceptions and uses no digester.
     *
     * @param metadata              metadata handed to the parser; may carry the resource name
     * @param contentHandlerFactory factory for the handlers used by the wrapper
     * @return the list returned by {@link RecursiveParserWrapper#getMetadata()} after the parse
     * @throws Exception if parsing fails
     */
    private List<Metadata> getMetadata(Metadata metadata, ContentHandlerFactory contentHandlerFactory)
            throws Exception {
        return getMetadata(metadata, contentHandlerFactory, true, null);
    }
}