org.apache.tika.parser.microsoft.JackcessParserTest.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.tika.parser.microsoft.JackcessParserTest.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.tika.parser.microsoft;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;

import java.io.InputStream;
import java.util.List;

import org.apache.commons.io.IOUtils;
import org.apache.tika.TikaTest;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.OfficeOpenXMLExtended;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.PasswordProvider;
import org.apache.tika.parser.RecursiveParserWrapper;
import org.apache.tika.sax.BasicContentHandlerFactory;
import org.junit.Test;
import org.xml.sax.helpers.DefaultHandler;

/**
 * Tests parsing of Microsoft Access test databases ({@code .accdb} / {@code .mdb})
 * through {@link AutoDetectParser}: table/markup extraction, embedded documents,
 * password handling and normalized metadata.
 */
public class JackcessParserTest extends TikaTest {

    /** Classpath location of the password-protected test database. */
    private static final String ENCRYPTED_DB = "/test-documents/testAccess2_encrypted.accdb";

    /** Classpath location of the test database that needs no password. */
    private static final String UNENCRYPTED_DB = "/test-documents/testAccess2.accdb";

    /**
     * Parses the same test database saved in three Access formats and checks the
     * XML content of the container document and of the last embedded document.
     *
     * @throws Exception if a resource cannot be read, closed or parsed
     */
    @Test
    public void testBasic() throws Exception {

        Parser p = new AutoDetectParser();

        RecursiveParserWrapper w = new RecursiveParserWrapper(p,
                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1));

        for (String fName : new String[] { "testAccess2.accdb", "testAccess2_2000.mdb",
                "testAccess2_2002-2003.mdb" }) {
            // try-with-resources, consistent with the other tests in this class;
            // a failure while closing the stream now fails the test instead of
            // being silently ignored.
            try (InputStream is = this.getResourceAsStream("/test-documents/" + fName)) {
                w.parse(is, new DefaultHandler(), new Metadata(), new ParseContext());
            }
            List<Metadata> list = w.getMetadata();
            assertEquals(4, list.size());
            String mainContent = list.get(0).get(RecursiveParserWrapper.TIKA_CONTENT);

            //make sure there's a thead and tbody
            assertContains("</thead><tbody>", mainContent);

            //assert table header
            assertContains("<th>ShortTextField</th>", mainContent);

            //test date format
            assertContains("6/24/15", mainContent);

            //test that markup is stripped
            assertContains("over the bold italic dog", mainContent);

            //test unicode
            assertContains("\u666E\u6797\u65AF\u987F\u5927\u5B66", mainContent);

            //test embedded document handling
            assertContains("Test Document with embedded pdf", list.get(3).get(RecursiveParserWrapper.TIKA_CONTENT));

            // clear the wrapper's collected metadata before the next file
            w.reset();
        }
    }

    /**
     * Checks password handling: the correct password yields content; a wrong
     * password, a {@code null} password and a missing {@link PasswordProvider}
     * each raise {@link EncryptedDocumentException}; supplying a password for an
     * unencrypted file is harmless.
     *
     * @throws Exception if a resource cannot be read, closed or parsed
     */
    @Test
    public void testPassword() throws Exception {
        Parser p = new AutoDetectParser();

        //correct password
        String content;
        try (InputStream is = this.getResourceAsStream(ENCRYPTED_DB)) {
            content = getText(is, p, contextWithPassword("tika"));
        }
        assertContains("red and brown", content);

        //now try wrong password
        assertTrue("failed to throw encrypted document exception for wrong password",
                throwsEncryptedDocumentException(ENCRYPTED_DB, p, contextWithPassword("WRONG")));

        //now try null
        assertTrue("failed to throw encrypted document exception for null password",
                throwsEncryptedDocumentException(ENCRYPTED_DB, p, contextWithPassword(null)));

        //now try missing password provider
        assertTrue("failed to throw encrypted document exception for missing password provider",
                throwsEncryptedDocumentException(ENCRYPTED_DB, p, new ParseContext()));

        //now try password on file that doesn't need a password
        String unencryptedContent = null;
        boolean thrown = false;
        try (InputStream is = this.getResourceAsStream(UNENCRYPTED_DB)) {
            unencryptedContent = getText(is, p, contextWithPassword("tika"));
        } catch (EncryptedDocumentException e) {
            thrown = true;
        }
        assertFalse("shouldn't have thrown encrypted document exception for "
                + "opening unencrypted file that doesn't need password", thrown);
        assertContains("red and brown", unencryptedContent);
    }

    /**
     * Regression test for TIKA-1681: parsing this file must not throw.
     *
     * @throws Exception if the file cannot be parsed
     */
    @Test
    public void testReadOnly() throws Exception {
        //TIKA-1681: just make sure an exception is not thrown
        XMLResult r = getXML("testAccess_V1997.mdb");
        assertContains("hijklmnop", r.xml);
    }

    /**
     * Basic checks that creator, company and title are exposed under the
     * normalized metadata keys.
     *
     * @throws Exception if the file cannot be parsed
     */
    @Test
    public void testMetadata() throws Exception {
        //basic tests for normalized metadata
        XMLResult r = getXML("testAccess_V1997.mdb");
        assertEquals("tmccune", r.metadata.get(TikaCoreProperties.CREATOR));
        assertEquals("Health Market Science", r.metadata.get(OfficeOpenXMLExtended.COMPANY));
        assertEquals("test", r.metadata.get(TikaCoreProperties.TITLE));
    }

    /**
     * Builds a fresh {@link ParseContext} whose {@link PasswordProvider} always
     * returns the given password. A new context is created per call so that no
     * state from an earlier parse can carry over into the next case.
     *
     * @param password value to hand back from the provider; may be {@code null}
     * @return a new context with the provider registered
     */
    private static ParseContext contextWithPassword(final String password) {
        ParseContext context = new ParseContext();
        context.set(PasswordProvider.class, new PasswordProvider() {
            @Override
            public String getPassword(Metadata metadata) {
                return password;
            }
        });
        return context;
    }

    /**
     * Extracts text from the given classpath resource and reports whether the
     * attempt raised {@link EncryptedDocumentException}.
     *
     * @param resource classpath location of the document to parse
     * @param parser   parser to use
     * @param context  parse context, with or without a password provider
     * @return {@code true} if {@link EncryptedDocumentException} was thrown,
     *         {@code false} if extraction completed
     * @throws Exception any other failure while reading, closing or parsing
     */
    private boolean throwsEncryptedDocumentException(String resource, Parser parser,
            ParseContext context) throws Exception {
        try (InputStream is = this.getResourceAsStream(resource)) {
            getText(is, parser, context);
        } catch (EncryptedDocumentException e) {
            return true;
        }
        return false;
    }
}