org.apache.tika.parser.pdf18.PDFParserTest.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.tika.parser.pdf18.PDFParserTest.java

Source

package org.apache.tika.parser.pdf18;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import static org.junit.Assert.assertArrayEquals;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;

import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;

import org.apache.commons.io.IOUtils;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.tika.TikaTest;
import org.apache.tika.exception.AccessPermissionException;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.ContainerExtractor;
import org.apache.tika.extractor.DocumentSelector;
import org.apache.tika.extractor.ParserContainerExtractor;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.OfficeOpenXMLCore;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.metadata.XMPMM;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.PasswordProvider;
import org.apache.tika.parser.RecursiveParserWrapper;
import org.apache.tika.sax.BasicContentHandlerFactory;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.ContentHandlerDecorator;
import org.apache.tika.sax.ToXMLContentHandler;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Ignore;
import org.junit.Test;
import org.xml.sax.ContentHandler;

/**
 * Test case for parsing pdf files.
 */
public class PDFParserTest extends TikaTest {

    // Media type constants; TYPE_EMF, TYPE_PDF and TYPE_DOCX are compared against the
    // types collected by the container-extraction test in this class.
    public static final MediaType TYPE_TEXT = MediaType.TEXT_PLAIN;
    public static final MediaType TYPE_EMF = MediaType.application("x-emf");
    public static final MediaType TYPE_PDF = MediaType.application("pdf");
    public static final MediaType TYPE_DOCX = MediaType
            .application("vnd.openxmlformats-officedocument.wordprocessingml.document");
    public static final MediaType TYPE_DOC = MediaType.application("msword");
    // Level of the "org.apache.pdfbox" logger. Level.INFO is only the initial value:
    // setup() overwrites it with the level found at that time and tearDown() re-applies it.
    public static Level PDFBOX_LOG_LEVEL = Level.INFO;

    /**
     * Stores the level currently set on the {@code org.apache.pdfbox} logger in
     * {@link #PDFBOX_LOG_LEVEL} and then switches that logger off.
     */
    @BeforeClass
    public static void setup() {
        final Logger pdfBoxLogger = Logger.getLogger("org.apache.pdfbox");
        // keep the previous level so that tearDown() can put it back
        PDFBOX_LOG_LEVEL = pdfBoxLogger.getLevel();
        pdfBoxLogger.setLevel(Level.OFF);
    }

    /**
     * Re-applies the level held in {@link #PDFBOX_LOG_LEVEL} to the
     * {@code org.apache.pdfbox} logger.
     */
    @AfterClass
    public static void tearDown() {
        final Logger pdfBoxLogger = Logger.getLogger("org.apache.pdfbox");
        pdfBoxLogger.setLevel(PDFBOX_LOG_LEVEL);
    }

    /**
     * Counts how many times {@code needle} occurs in {@code haystack}. Overlapping
     * occurrences are counted individually, e.g. {@code "aa"} occurs three times in
     * {@code "aaaa"}.
     *
     * @param needle   the text to look for
     * @param haystack the text to search in
     * @return the number of occurrences; {@code 0} when {@code needle} is empty
     */
    private static int substringCount(String needle, String haystack) {
        // An empty needle matches at every index up to and including haystack.length(),
        // so indexOf would never report -1 and the search below would never terminate.
        if (needle.isEmpty()) {
            return 0;
        }

        int count = 0;
        // advance by one character (not by needle.length()) so overlapping hits are counted
        for (int at = haystack.indexOf(needle); at != -1; at = haystack.indexOf(needle, at + 1)) {
            count++;
        }
        return count;
    }

    /**
     * Extracts testPDF.pdf through the auto-detecting parser and verifies the reported
     * content type, author/tool/title metadata and several text fragments, including
     * that neighbouring paragraphs are not glued together.
     */
    @Test
    public void testPdfParsing() throws Exception {
        final String expectedAuthor = "Bertrand Delacr\u00e9taz";

        Metadata metadata = new Metadata();
        Parser parser = new AutoDetectParser(); // Should auto-detect!
        InputStream stream = PDFParserTest.class.getResourceAsStream("/test-documents/testPDF.pdf");
        String content = getText(stream, parser, metadata);

        // --- metadata ---
        assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE));
        assertEquals(expectedAuthor, metadata.get(TikaCoreProperties.CREATOR));
        assertEquals(expectedAuthor, metadata.get(Metadata.AUTHOR));
        assertEquals("Firefox", metadata.get(TikaCoreProperties.CREATOR_TOOL));
        assertEquals("Apache Tika - Apache Tika", metadata.get(TikaCoreProperties.TITLE));

        // Creation / last-modified dates are deliberately not asserted: they cannot be
        // tested reliably yet - see TIKA-451.

        // --- text ---
        assertContains("Apache Tika", content);
        assertContains("Tika - Content Analysis Toolkit", content);
        assertContains("incubator", content);
        assertContains("Apache Software Foundation", content);

        // the end of one paragraph must be separated from the start of the next one
        assertFalse("should have word boundary after headline", content.contains("ToolkitApache"));
        assertFalse("should have word boundary between paragraphs", content.contains("libraries.Apache"));
    }

    /**
     * Parses testPDF.pdf with a {@code null} content handler and checks that the
     * metadata is still populated.
     */
    @Test
    public void testPdfParsingMetadataOnly() throws Exception {
        Metadata metadata = new Metadata();
        Parser parser = new AutoDetectParser(); // Should auto-detect!
        ParseContext context = new ParseContext();

        try (InputStream stream = PDFParserTest.class.getResourceAsStream("/test-documents/testPDF.pdf")) {
            // no handler: only the metadata object is inspected afterwards
            parser.parse(stream, null, metadata, context);
        }

        String contentType = metadata.get(Metadata.CONTENT_TYPE);
        assertEquals("application/pdf", contentType);
        String creator = metadata.get(TikaCoreProperties.CREATOR);
        assertEquals("Bertrand Delacr\u00e9taz", creator);
        String creatorTool = metadata.get(TikaCoreProperties.CREATOR_TOOL);
        assertEquals("Firefox", creatorTool);
        String title = metadata.get(TikaCoreProperties.TITLE);
        assertEquals("Apache Tika - Apache Tika", title);
    }

    /**
     * Checks that standard metadata, a custom single-valued property and a custom
     * two-valued property are read from testPDF-custommetadata.pdf, along with the text.
     */
    @Test
    public void testCustomMetadata() throws Exception {
        final String author = "Document author";

        Metadata metadata = new Metadata();
        Parser parser = new AutoDetectParser(); // Should auto-detect!
        InputStream stream = PDFParserTest.class.getResourceAsStream("/test-documents/testPDF-custommetadata.pdf");
        String content = getText(stream, parser, metadata);

        // standard properties
        assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE));
        assertEquals(author, metadata.get(TikaCoreProperties.CREATOR));
        assertEquals(author, metadata.get(Metadata.AUTHOR));
        assertEquals("Document title", metadata.get(TikaCoreProperties.TITLE));

        // custom single-valued property
        assertEquals("Custom Value", metadata.get("Custom Property"));

        // custom multi-valued property: get() yields the first entry, getValues() all of them
        assertEquals("Array Entry 1", metadata.get("Custom Array"));
        String[] arrayEntries = metadata.getValues("Custom Array");
        assertEquals(2, arrayEntries.length);
        assertEquals("Array Entry 1", arrayEntries[0]);
        assertEquals("Array Entry 2", arrayEntries[1]);

        assertContains("Hello World!", content);
    }

    /**
     * PDFs can be "protected" with the default password. This means
     * they're encrypted (potentially both text and metadata),
     * but we can decrypt them easily.
     * <p>
     * The same document is parsed four times: without a password provider, with an
     * explicit empty password, with a wrong password, and with a wrong password while
     * the non-sequential parser is enabled. The first two runs must yield the full
     * metadata and text; the two wrong-password runs must fail with an
     * {@link EncryptedDocumentException}, expose only three metadata entries and
     * produce no text.
     */
    @Test
    public void testProtectedPDF() throws Exception {
        final String expectedTitle =
                "Rethinking the Financial Network, Speech by Andrew G Haldane, Executive Director, Financial Stability delivered at the Financial Student Association, Amsterdam on 28 April 2009";

        Parser parser = new AutoDetectParser(); // Should auto-detect!
        ContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();
        ParseContext context = new ParseContext();

        try (InputStream stream = PDFParserTest.class
                .getResourceAsStream("/test-documents/testPDF_protected.pdf")) {
            parser.parse(stream, handler, metadata, context);
        }

        assertEquals("true", metadata.get("pdf:encrypted"));
        assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE));
        assertEquals("The Bank of England", metadata.get(TikaCoreProperties.CREATOR));
        assertEquals("The Bank of England", metadata.get(Metadata.AUTHOR));
        assertEquals("Speeches by Andrew G Haldane", metadata.get(OfficeOpenXMLCore.SUBJECT));
        assertEquals("Speeches by Andrew G Haldane", metadata.get(Metadata.SUBJECT));
        assertEquals(expectedTitle, metadata.get(TikaCoreProperties.TITLE));

        String content = handler.toString();
        assertContains("RETHINKING THE FINANCIAL NETWORK", content);
        assertContains("On 16 November 2002", content);
        assertContains("In many important respects", content);

        // Try again with an explicit empty password
        handler = new BodyContentHandler();
        metadata = new Metadata();

        context = new ParseContext();
        context.set(PasswordProvider.class, new PasswordProvider() {
            @Override
            public String getPassword(Metadata metadata) {
                return "";
            }
        });

        try (InputStream stream = PDFParserTest.class
                .getResourceAsStream("/test-documents/testPDF_protected.pdf")) {
            parser.parse(stream, handler, metadata, context);
        }
        assertEquals("true", metadata.get("pdf:encrypted"));

        assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE));
        assertEquals("The Bank of England", metadata.get(TikaCoreProperties.CREATOR));
        assertEquals("Speeches by Andrew G Haldane", metadata.get(OfficeOpenXMLCore.SUBJECT));
        assertEquals("Speeches by Andrew G Haldane", metadata.get(Metadata.SUBJECT));
        assertEquals(expectedTitle, metadata.get(TikaCoreProperties.TITLE));

        // Re-read the text from the handler used for THIS parse; otherwise the assertions
        // below would only re-check the text extracted by the first run.
        content = handler.toString();
        assertContains("RETHINKING THE FINANCIAL NETWORK", content);
        assertContains("On 16 November 2002", content);
        assertContains("In many important respects", content);

        //now test wrong password
        handler = new BodyContentHandler();
        metadata = new Metadata();
        context = new ParseContext();
        context.set(PasswordProvider.class, new PasswordProvider() {
            @Override
            public String getPassword(Metadata metadata) {
                return "WRONG!!!!";
            }
        });

        boolean ex = false;
        try (InputStream stream = PDFParserTest.class
                .getResourceAsStream("/test-documents/testPDF_protected.pdf")) {
            parser.parse(stream, handler, metadata, context);
        } catch (EncryptedDocumentException e) {
            ex = true;
        }
        content = handler.toString();

        assertTrue("encryption exception", ex);
        assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE));
        assertEquals("true", metadata.get("pdf:encrypted"));
        //pdf:encrypted, X-Parsed-By and Content-Type
        assertEquals("very little metadata should be parsed", 3, metadata.names().length);
        assertEquals(0, content.length());

        //now test wrong password with non sequential parser
        handler = new BodyContentHandler();
        metadata = new Metadata();
        context = new ParseContext();
        context.set(PasswordProvider.class, new PasswordProvider() {
            @Override
            public String getPassword(Metadata metadata) {
                return "WRONG!!!!";
            }
        });
        PDFParserConfig config = new PDFParserConfig();
        config.setUseNonSequentialParser(true);
        context.set(PDFParserConfig.class, config);

        ex = false;
        try (InputStream stream = PDFParserTest.class
                .getResourceAsStream("/test-documents/testPDF_protected.pdf")) {
            parser.parse(stream, handler, metadata, context);
        } catch (EncryptedDocumentException e) {
            ex = true;
        }
        content = handler.toString();
        assertTrue("encryption exception", ex);
        assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE));
        assertEquals("true", metadata.get("pdf:encrypted"));

        //pdf:encrypted, X-Parsed-By and Content-Type
        assertEquals("very little metadata should be parsed", 3, metadata.names().length);
        assertEquals(0, content.length());
    }

    /**
     * Checks the order in which the text of the two text boxes in
     * testPDFTwoTextBoxes.pdf is emitted (left box first, then right box).
     */
    @Test
    public void testTwoTextBoxes() throws Exception {
        Parser parser = new AutoDetectParser(); // Should auto-detect!
        InputStream stream = PDFParserTest.class.getResourceAsStream("/test-documents/testPDFTwoTextBoxes.pdf");
        // collapse every whitespace run so that line breaks do not matter for the comparison
        String normalized = getText(stream, parser).replaceAll("\\s+", " ");
        assertContains("Left column line 1 Left column line 2 Right column line 1 Right column line 2", normalized);
    }

    /**
     * Extracts testPDFVarious.pdf and checks a broad range of content: footnotes,
     * header/footer, text box, formatting words, citation, caption, tables, hyperlink
     * text, bulleted and numbered lists, keyword/subject metadata and Japanese text.
     */
    @Test
    public void testVarious() throws Exception {
        Metadata metadata = new Metadata();
        Parser parser = new AutoDetectParser(); // Should auto-detect!
        InputStream stream = PDFParserTest.class.getResourceAsStream("/test-documents/testPDFVarious.pdf");

        String content = getText(stream, parser, metadata);
        // Whitespace-collapsed copy; used only for the two table checks, where the cells
        // are matched as one run of text.
        String collapsed = content.replaceAll("\\s+", " ");

        String[] leadingFragments = {
                "Footnote appears here",
                "This is a footnote.",
                "This is the header text.",
                "This is the footer text.",
                "Here is a text box",
                "Bold",
                "italic",
                "underline",
                "superscript",
                "subscript",
                "Here is a citation:",
                "Figure 1 This is a caption for Figure 1",
                "(Kramer)",
        };
        for (String fragment : leadingFragments) {
            assertContains(fragment, content);
        }

        // tables
        assertContains("Row 1 Col 1 Row 1 Col 2 Row 1 Col 3 Row 2 Col 1 Row 2 Col 2 Row 2 Col 3", collapsed);
        assertContains("Row 1 column 1 Row 2 column 1 Row 1 column 2 Row 2 column 2", collapsed);

        assertContains("This is a hyperlink", content);

        // bulleted list: only the label is matched, not the bullet character or the tab
        assertContains("Here is a list:", content);
        for (int bullet = 1; bullet < 4; bullet++) {
            assertContains("Bullet " + bullet, content);
        }

        // numbered list: matched with a single space after the closing parenthesis
        assertContains("Here is a numbered list:", content);
        for (int number = 1; number < 4; number++) {
            assertContains(number + ") Number bullet " + number, content);
        }

        // every individual cell of the 2 x 3 table
        for (int r = 1; r < 3; r++) {
            for (int c = 1; c < 4; c++) {
                assertContains("Row " + r + " Col " + c, content);
            }
        }

        // keywords and subject appear both in the text and in the metadata
        assertContains("Keyword1 Keyword2", content);
        assertEquals("Keyword1 Keyword2", metadata.get(Metadata.KEYWORDS));

        assertContains("Subject is here", content);
        assertEquals("Subject is here", metadata.get(OfficeOpenXMLCore.SUBJECT));
        assertEquals("Subject is here", metadata.get(Metadata.SUBJECT));

        assertContains("Suddenly some Japanese text:", content);
        // Special version of (GHQ)
        assertContains("\uff08\uff27\uff28\uff31\uff09", content);
        // 6 other characters
        assertContains("\u30be\u30eb\u30b2\u3068\u5c3e\u5d0e\u3001\u6de1\u3005\u3068\u6700\u671f", content);

        assertContains("And then some Gothic text:", content);
        // TODO: the Word document was saved as a PDF and that process apparently lost
        // the Gothic characters, so they cannot be tested here:
        //assertContains("\uD800\uDF32\uD800\uDF3f\uD800\uDF44\uD800\uDF39\uD800\uDF43\uD800\uDF3A", content);
    }

    /**
     * Verifies that annotation text is extracted with the default configuration, that it
     * is omitted when annotation extraction is switched off (on the parser's own config
     * and through a {@link ParseContext}), and that the XML output has balanced
     * {@code <p>} tags (TIKA-738).
     */
    @Test
    public void testAnnotations() throws Exception {
        final String resource = "/test-documents/testAnnotations.pdf";
        final String bodyText = "Here is some text";
        final String commentText = "Here is a comment";

        // 1) auto-detecting parser, no explicit configuration
        Parser parser = new AutoDetectParser(); // Should auto-detect!
        InputStream stream = getResourceAsStream(resource);
        String content = getText(stream, parser).replaceAll("[\\s\u00a0]+", " ");
        assertContains(bodyText, content);
        assertContains(commentText, content);

        // 2) annotation text disabled on the PDFParser's own configuration
        PDFParser pdfParser = new PDFParser();
        pdfParser.getPDFParserConfig().setExtractAnnotationText(false);
        stream = getResourceAsStream(resource);
        content = getText(stream, pdfParser).replaceAll("[\\s\u00a0]+", " ");
        assertContains(bodyText, content);
        assertEquals(-1, content.indexOf(commentText));

        // 3) annotation text disabled through the ParseContext
        PDFParserConfig config = new PDFParserConfig();
        config.setExtractAnnotationText(false);
        ParseContext context = new ParseContext();
        context.set(PDFParserConfig.class, config);
        stream = getResourceAsStream(resource);
        content = getText(stream, parser, context).replaceAll("[\\s\u00a0]+", " ");
        assertContains(bodyText, content);
        assertEquals(-1, content.indexOf(commentText));

        // TIKA-738: make sure no extra </p> tags
        String xml = getXML("testAnnotations.pdf").xml;
        int openingTags = substringCount("<p>", xml);
        int closingTags = substringCount("</p>", xml);
        assertEquals(openingTags, closingTags);
    }

    /**
     * TIKA-981: checks that two strings expected from the annotation in
     * testPopupAnnotation.pdf are part of the extracted text.
     */
    @Test
    public void testPopupAnnotation() throws Exception {
        InputStream stream = getResourceAsStream("/test-documents/testPopupAnnotation.pdf");
        Parser parser = new AutoDetectParser(); // Should auto-detect!
        String extracted = getText(stream, parser);
        assertContains("this is the note", extracted);
        assertContains("igalsh", extracted);
    }

    /**
     * Checks that the XML produced for testPDFPackage.pdf contains both
     * "PDF1" and "PDF2".
     */
    @Test
    public void testEmbeddedPDFs() throws Exception {
        final String packageXml = getXML("testPDFPackage.pdf").xml;
        for (String marker : new String[] {"PDF1", "PDF2"}) {
            assertContains(marker, packageXml);
        }
    }

    /**
     * Checks that the XML output for testPageNumber.pdf contains a paragraph holding
     * just "1" once all whitespace has been removed.
     */
    @Test
    public void testPageNumber() throws Exception {
        // strip every whitespace character so that layout differences do not matter
        String compactXml = getXML("testPageNumber.pdf").xml.replaceAll("\\s+", "");
        assertContains("<p>1</p>", compactXml);
    }

    /**
     * Test to ensure that links are extracted from the text.
     * <p>
     * Note - the PDF contains the text "This is a hyperlink", which has a hyperlink
     * annotation on it that links to the Tika site. This test will need updating once
     * the annotation can be applied to the text itself, rather than being emitted
     * afterwards as it is now.
     */
    @Test
    public void testLinks() throws Exception {
        String xml = getXML("testPDFVarious.pdf").xml;
        String expectedAnnotation = "<div class=\"annotation\"><a href=\"http://tika.apache.org/\" /></div>";
        assertContains(expectedAnnotation, xml);
    }

    /**
     * Checks the effect of the "enable auto space" setting on testExtraSpaces.pdf, both
     * when it is set on the {@link PDFParser}'s own configuration and when it is handed
     * to an {@link AutoDetectParser} through a {@link ParseContext}: with auto space off
     * the phrase "Here is some formatted text" is extracted intact, with auto space on
     * it is not found.
     */
    @Test
    public void testDisableAutoSpace() throws Exception {
        PDFParser parser = new PDFParser();
        parser.getPDFParserConfig().setEnableAutoSpace(false);
        InputStream stream = getResourceAsStream("/test-documents/testExtraSpaces.pdf");
        String content = getText(stream, parser);
        content = content.replaceAll("[\\s\u00a0]+", " ");
        // Text is correct when autoSpace is off:
        assertContains("Here is some formatted text", content);

        parser.getPDFParserConfig().setEnableAutoSpace(true);
        stream = getResourceAsStream("/test-documents/testExtraSpaces.pdf");
        content = getText(stream, parser);
        content = content.replaceAll("[\\s\u00a0]+", " ");
        // Text has extra spaces when autoSpace is on
        assertEquals(-1, content.indexOf("Here is some formatted text"));

        //now try with autodetect
        Parser autoParser = new AutoDetectParser();
        ParseContext context = new ParseContext();
        PDFParserConfig config = new PDFParserConfig();
        context.set(PDFParserConfig.class, config);
        //default is true
        stream = getResourceAsStream("/test-documents/testExtraSpaces.pdf");
        content = getText(stream, autoParser, context);
        content = content.replaceAll("[\\s\u00a0]+", " ");
        // Text has extra spaces when autoSpace is on
        assertEquals(-1, content.indexOf("Here is some formatted text"));

        config.setEnableAutoSpace(false);

        stream = getResourceAsStream("/test-documents/testExtraSpaces.pdf");
        // Use the auto-detecting parser here as well: this half of the test is about the
        // configuration travelling through the ParseContext, not about the PDFParser
        // instance configured at the top of the method.
        content = getText(stream, autoParser, context);
        content = content.replaceAll("[\\s\u00a0]+", " ");
        // Text is correct when autoSpace is off:
        assertContains("Here is some formatted text", content);
    }

    /**
     * Checks the "suppress duplicate overlapping text" setting on testOverlappingText.pdf,
     * first on a {@link PDFParser}'s own configuration and then on a configuration passed
     * to an {@link AutoDetectParser} through a {@link ParseContext}.
     */
    @Test
    public void testDuplicateOverlappingText() throws Exception {
        final String resource = "/test-documents/testOverlappingText.pdf";
        final String keptText = "Text the first timeText the second time";
        final String dedupedText = "Text the first timesecond time";

        // --- configuration held by the PDFParser itself ---
        PDFParser parser = new PDFParser();
        InputStream stream = getResourceAsStream(resource);
        // suppression has not been switched on, so the overlapping text is kept
        assertContains(keptText, getText(stream, parser));

        parser.getPDFParserConfig().setSuppressDuplicateOverlappingText(true);
        stream = getResourceAsStream(resource);
        // "Text the first" was dedup'd:
        assertContains(dedupedText, getText(stream, parser));

        // --- configuration supplied through the ParseContext ---
        PDFParserConfig config = new PDFParserConfig();
        ParseContext context = new ParseContext();
        context.set(PDFParserConfig.class, config);
        Parser autoParser = new AutoDetectParser();

        stream = getResourceAsStream(resource);
        // freshly created config: overlapping text is kept
        assertContains(keptText, getText(stream, autoParser, context));

        config.setSuppressDuplicateOverlappingText(true);
        stream = getResourceAsStream(resource);
        // "Text the first" was dedup'd:
        assertContains(dedupedText, getText(stream, autoParser, context));
    }

    /**
     * Checks the "sort by position" setting on testPDFTwoTextBoxes.pdf: without sorting
     * the left column is emitted before the right one, with sorting the lines of the two
     * columns are interleaved. Both cases are exercised on a {@link PDFParser}'s own
     * configuration and on a configuration passed to an {@link AutoDetectParser} through
     * a {@link ParseContext}.
     */
    @Test
    public void testSortByPosition() throws Exception {
        PDFParser parser = new PDFParser();
        parser.getPDFParserConfig().setEnableAutoSpace(false);
        InputStream stream = getResourceAsStream("/test-documents/testPDFTwoTextBoxes.pdf");
        // Default is false (do not sort):
        String content = getText(stream, parser);
        content = content.replaceAll("\\s+", " ");
        assertContains("Left column line 1 Left column line 2 Right column line 1 Right column line 2", content);

        parser.getPDFParserConfig().setSortByPosition(true);
        stream = getResourceAsStream("/test-documents/testPDFTwoTextBoxes.pdf");
        content = getText(stream, parser);
        content = content.replaceAll("\\s+", " ");
        // Column text is now interleaved:
        assertContains("Left column line 1 Right column line 1 Left colu mn line 2 Right column line 2", content);

        //now try setting autodetect via parsecontext
        AutoDetectParser autoParser = new AutoDetectParser();
        ParseContext context = new ParseContext();
        PDFParserConfig config = new PDFParserConfig();
        // Mirror the settings of the directly configured parser above, so that the same
        // expected strings (including the "colu mn" artefact) apply to both halves.
        config.setEnableAutoSpace(false);
        context.set(PDFParserConfig.class, config);
        stream = getResourceAsStream("/test-documents/testPDFTwoTextBoxes.pdf");
        // Default is false (do not sort):
        content = getText(stream, autoParser, context);
        content = content.replaceAll("\\s+", " ");
        assertContains("Left column line 1 Left column line 2 Right column line 1 Right column line 2", content);

        // config is the instance already registered in the context, so changing it is enough
        config.setSortByPosition(true);
        stream = getResourceAsStream("/test-documents/testPDFTwoTextBoxes.pdf");
        // Parse with the auto-detecting parser AND the context; calling the pre-configured
        // PDFParser without the context would not exercise the context-supplied setting.
        content = getText(stream, autoParser, context);
        content = content.replaceAll("\\s+", " ");
        // Column text is now interleaved:
        assertContains("Left column line 1 Right column line 1 Left colu mn line 2 Right column line 2", content);
    }

    /**
     * TIKA-1035: the bookmark text must be present in the XML output and must appear
     * before the closing {@code </body>} tag.
     */
    @Test
    public void testBookmarks() throws Exception {
        final String xml = getXML("testPDF_bookmarks.pdf").xml;
        final int bookmarkAt = xml.indexOf("Denmark bookmark is here");
        final int bodyEndAt = xml.indexOf("</body>");
        // indexOf never returns less than -1, so ">= 0" means "found"
        assertTrue(bookmarkAt >= 0);
        assertTrue(bodyEndAt >= 0);
        assertTrue(bodyEndAt > bookmarkAt);
    }

    /**
     * TIKA-1124: a docx that embeds a PDF which in turn embeds another docx.
     * <p>
     * First checks that the text of all three nesting levels is extracted in nesting
     * order, then checks the file names and media types reported to a tracking handler
     * by a {@link ParserContainerExtractor}.
     */
    @Test
    @Ignore("need to have other parsers available")
    public void testEmbeddedPDFEmbeddingAnotherDocument() throws Exception {
        /* format of test doc:
           docx/
              pdf/
                 docx
        */
        Parser parser = new AutoDetectParser(); // Should auto-detect!
        ContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();
        ParseContext context = new ParseContext();
        context.set(org.apache.tika.parser.Parser.class, parser);

        String content;
        // try-with-resources: closes the stream on every path and, unlike an explicit
        // stream.close() in a finally block, does not throw a NullPointerException
        // when the resource could not be opened.
        try (InputStream stream = getResourceAsStream("/test-documents/testPDFEmbeddingAndEmbedded.docx")) {
            parser.parse(stream, handler, metadata, context);
            content = handler.toString();
        }
        int outerHaystack = content.indexOf("Outer_haystack");
        int pdfHaystack = content.indexOf("pdf_haystack");
        int needle = content.indexOf("Needle");
        assertTrue(outerHaystack > -1);
        assertTrue(pdfHaystack > -1);
        assertTrue(needle > -1);
        assertTrue(needle > pdfHaystack && pdfHaystack > outerHaystack);

        TrackingHandler tracker = new TrackingHandler();
        ContainerExtractor ex = new ParserContainerExtractor();
        // Close the TikaInputStream that is actually used for extraction (the previous
        // code closed the already-closed stream of the first parse instead).
        try (TikaInputStream tis = TikaInputStream
                .get(getResourceAsStream("/test-documents/testPDFEmbeddingAndEmbedded.docx"))) {
            ex.extract(tis, ex, tracker);
            // must be queried while the stream is still open
            assertTrue(ex.isSupported(tis));
        }
        assertEquals(3, tracker.filenames.size());
        assertEquals(3, tracker.mediaTypes.size());
        assertEquals("image1.emf", tracker.filenames.get(0));
        assertNull(tracker.filenames.get(1));
        assertEquals("Test.docx", tracker.filenames.get(2));
        assertEquals(TYPE_EMF, tracker.mediaTypes.get(0));
        assertEquals(TYPE_PDF, tracker.mediaTypes.get(1));
        assertEquals(TYPE_DOCX, tracker.mediaTypes.get(2));
    }

    /**
     * Tests for equality between the traditional sequential parser
     * and the newer non-sequential parser.
     * <p>
     * Every file ending in ".pdf" in the test-documents directory is parsed once with
     * each configuration; extracted text and metadata must be equal, except for files
     * listed in the (currently empty) known-difference sets, which must differ. Files
     * for which the sequential run throws an {@link EncryptedDocumentException} are
     * skipped.
     * <p>
     * TODO: more testing
     */
    @Test
    public void testSequentialParser() throws Exception {

        Parser sequentialParser = new AutoDetectParser();
        Parser nonSequentialParser = new AutoDetectParser();

        ParseContext seqContext = new ParseContext();
        PDFParserConfig seqConfig = new PDFParserConfig();
        seqConfig.setUseNonSequentialParser(false);
        seqContext.set(PDFParserConfig.class, seqConfig);

        ParseContext nonSeqContext = new ParseContext();
        PDFParserConfig nonSeqConfig = new PDFParserConfig();
        nonSeqConfig.setUseNonSequentialParser(true);
        nonSeqContext.set(PDFParserConfig.class, nonSeqConfig);

        File testDocs = new File(this.getClass().getResource("/test-documents").toURI());
        int pdfs = 0;
        //empty as of PDFBox 1.8.11
        //leave this in for the 1.8.x series in case something new happens
        Set<String> knownMetadataDiffs = new HashSet<String>();

        //empty for now
        Set<String> knownContentDiffs = new HashSet<String>();

        // listFiles() returns null when the path is not a readable directory; fail with a
        // clear message instead of a NullPointerException in the loop header.
        File[] candidates = testDocs.listFiles();
        assertTrue("could not list test documents in " + testDocs, candidates != null);

        for (File f : candidates) {
            if (!f.getName().toLowerCase(Locale.ROOT).endsWith(".pdf")) {
                continue;
            }

            String sequentialContent = null;
            Metadata sequentialMetadata = new Metadata();
            // try-with-resources guarantees the file handle is released on every path,
            // including the "skip encrypted file" one.
            try (InputStream in = new FileInputStream(f)) {
                sequentialContent = getText(in, sequentialParser, seqContext, sequentialMetadata);
            } catch (EncryptedDocumentException e) {
                //silently skip a file that requires a user password
                continue;
            } catch (Exception e) {
                throw new TikaException("Sequential Parser failed on test file " + f, e);
            }

            pdfs++;

            String nonSequentialContent = null;
            Metadata nonSequentialMetadata = new Metadata();
            try (InputStream in = new FileInputStream(f)) {
                nonSequentialContent = getText(in, nonSequentialParser, nonSeqContext, nonSequentialMetadata);
            } catch (Exception e) {
                throw new TikaException("Non-Sequential Parser failed on test file " + f, e);
            }

            if (knownContentDiffs.contains(f.getName())) {
                assertFalse(f.getName(), sequentialContent.equals(nonSequentialContent));
            } else {
                assertEquals(f.getName(), sequentialContent, nonSequentialContent);
            }

            //files with known metadata differences must actually differ
            if (knownMetadataDiffs.contains(f.getName())) {
                assertFalse(f.getName(), sequentialMetadata.equals(nonSequentialMetadata));
            } else {
                assertEquals(f.getName(), sequentialMetadata, nonSequentialMetadata);
            }
        }
        //make sure nothing went wrong with getting the resource to test-documents
        //must have tested >= 15 pdfs
        boolean ge15 = (pdfs >= 15);
        assertTrue("Number of pdf files tested >= 15 in non-sequential parser test", ge15);
    }

    // TIKA-973
    //commented out until test documents that are unambiguously
    //consistent with Apache License v2.0 are contributed.
    //TODO: add back test for AcroForm extraction; test document should include
    //recursive forms
    /*    public void testAcroForm() throws Exception{
           Parser p = new AutoDetectParser();
           ParseContext context = new ParseContext();
           InputStream stream = getResourceAsStream("/test-documents/testPDF_acroForm1.pdf");
           String txt = getText(stream, p, context);
           stream.close();
        
           //simple first level form contents
           assertContains("to: John Doe", txt);
           //checkbox
           assertContains("xpackaging: Yes", txt);
        
           //this guarantees that the form processor
           //worked recursively at least once...i.e. it didn't just
           //take the first form
           stream = getResourceAsStream("/test-documents/testPDF_acroForm2.pdf");
           txt = getText(stream, p, context);
           stream.close();
           assertContains("123 Main St.", txt);
        
        
           //now test with nonsequential parser
           PDFParserConfig config = new PDFParserConfig();
           config.setUseNonSequentialParser(true);
           context.set(PDFParserConfig.class, config);
           stream = getResourceAsStream("/test-documents/testPDF_acroForm1.pdf");
           txt = getText(stream, p, context);
           stream.close();
        
           //simple first level form contents
           assertContains("to: John Doe", txt);
           //checkbox
           assertContains("xpackaging: Yes", txt);
        
           //this guarantees that the form processor
           //worked recursively at least once...i.e. it didn't just
           //take the first form
           stream = getResourceAsStream("/test-documents/testPDF_acroForm2.pdf");
           txt = getText(stream, p, context);
           assertContains("123 Main St.", txt);
           stream.close();
        }
    */

    //TIKA-1226
    @Test
    public void testSignatureInAcroForm() throws Exception {
        // The test document has no content in its signature area, so the only
        // signature-related behavior exercised here is that parsing completes
        // without a RuntimeException.
        // TODO: find a better test file for this issue.
        final String expectedField = "<li>aTextField: TIKA-1226</li>";
        XMLResult result = getXML("/testPDF_acroform3.pdf");
        boolean hasField = result.xml.contains(expectedField);
        assertTrue("found", hasField);
    }

    @Test // TIKA-1228, TIKA-1268
    @Ignore("need to have other parsers available")
    public void testEmbeddedFilesInChildren() throws Exception {
        //the word "regressiveness" occurs only in the attached Unit10.doc,
        //not in the container pdf document
        String containerXml = getXML("/testPDF_childAttachments.pdf").xml;
        assertTrue(containerXml.contains("regressiveness"));

        //extract every inline image, including repeated ones
        PDFParserConfig pdfConfig = new PDFParserConfig();
        pdfConfig.setExtractInlineImages(true);
        pdfConfig.setExtractUniqueInlineImagesOnly(false);

        RecursiveParserWrapper wrapper = new RecursiveParserWrapper(new AutoDetectParser(),
                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1));

        ParseContext parseContext = new ParseContext();
        parseContext.set(org.apache.tika.parser.pdf18.PDFParserConfig.class, pdfConfig);
        parseContext.set(org.apache.tika.parser.Parser.class, wrapper);

        try (TikaInputStream in = TikaInputStream
                .get(getResourceAsStream("/test-documents/testPDF_childAttachments.pdf"))) {
            wrapper.parse(in, new BodyContentHandler(-1), new Metadata(), parseContext);
        }

        //one Metadata entry for the container plus one per embedded resource
        List<Metadata> results = wrapper.getMetadata();
        assertEquals(5, results.size());

        Metadata container = results.get(0);
        Metadata firstImage = results.get(1);
        Metadata secondImage = results.get(2);
        Metadata jobOptions = results.get(3);
        Metadata wordDoc = results.get(4);

        //resource names
        assertNull(container.get(Metadata.RESOURCE_NAME_KEY));
        assertEquals("image0.jpg", firstImage.get(Metadata.RESOURCE_NAME_KEY));
        assertEquals("Press Quality(1).joboptions", jobOptions.get(Metadata.RESOURCE_NAME_KEY));
        assertEquals("Unit10.doc", wordDoc.get(Metadata.RESOURCE_NAME_KEY));

        //detected content types
        assertEquals(MediaType.image("jpeg").toString(), firstImage.get(Metadata.CONTENT_TYPE));
        assertEquals(MediaType.image("tiff").toString(), secondImage.get(Metadata.CONTENT_TYPE));
        assertEquals("text/plain; charset=ISO-8859-1", jobOptions.get(Metadata.CONTENT_TYPE));
        assertEquals(TYPE_DOC.toString(), wordDoc.get(Metadata.CONTENT_TYPE));
    }

    @Test
    @Ignore("need to have office parser available")
    public void testEmbeddedFilesInAnnotations() throws Exception {
        //the expected text is looked up in the XHTML produced for the whole PDF
        XMLResult result = getXML("/testPDFFileEmbInAnnotation.pdf");
        boolean hasSpreadsheetText = result.xml.contains("This is a Excel");
        assertTrue(hasSpreadsheetText);
    }

    /**
     * TIKA-1341: parsing a PDF must deliver exactly one endDocument event to
     * the content handler.
     */
    @Test
    public void testSingleCloseDoc() throws Exception {
        Parser p = new AutoDetectParser();
        Metadata m = new Metadata();
        ParseContext c = new ParseContext();
        //declared with the concrete type so the counter can be read without a cast
        EventCountingHandler h = new EventCountingHandler();
        //try-with-resources: the stream is released even if parse() throws
        try (InputStream is = PDFParserTest.class
                .getResourceAsStream("/test-documents/testPDFTripleLangTitle.pdf")) {
            p.parse(is, h, m, c);
        }
        assertEquals(1, h.getEndDocument());
    }

    /**
     * Checks the version metadata ({@code dc:format}, {@code pdf:PDFVersion}
     * and, where listed, {@code pdf:PDFExtensionVersion}) reported for each
     * testPDF_Version.*.pdf document, then checks all expected
     * {@code dc:format} values and the pdfaid properties of the 11.x PDF/A-1b
     * document.
     */
    @Test
    public void testVersions() throws Exception {

        //file-name infix -> dc:format value that must be among those reported
        Map<String, String> dcFormat = new HashMap<String, String>();
        dcFormat.put("4.x", "application/pdf; version=1.3");
        dcFormat.put("5.x", "application/pdf; version=1.4");
        dcFormat.put("6.x", "application/pdf; version=1.5");
        dcFormat.put("7.x", "application/pdf; version=1.6");
        dcFormat.put("8.x", "application/pdf; version=1.7");
        dcFormat.put("9.x", "application/pdf; version=1.7");
        dcFormat.put("10.x", "application/pdf; version=1.7");
        dcFormat.put("11.x.PDFA-1b", "application/pdf; version=1.7");

        //file-name infix -> expected pdf:PDFVersion
        Map<String, String> pdfVersions = new HashMap<String, String>();
        pdfVersions.put("4.x", "1.3");
        pdfVersions.put("5.x", "1.4");
        pdfVersions.put("6.x", "1.5");
        pdfVersions.put("7.x", "1.6");
        pdfVersions.put("8.x", "1.7");
        pdfVersions.put("9.x", "1.7");
        pdfVersions.put("10.x", "1.7");
        pdfVersions.put("11.x.PDFA-1b", "1.7");

        //file-name infix -> expected pdf:PDFExtensionVersion; documents that are
        //not listed here are not checked for an extension version
        Map<String, String> pdfExtensionVersions = new HashMap<String, String>();
        pdfExtensionVersions.put("9.x", "1.7 Adobe Extension Level 3");
        pdfExtensionVersions.put("10.x", "1.7 Adobe Extension Level 8");
        pdfExtensionVersions.put("11.x.PDFA-1b", "1.7 Adobe Extension Level 8");

        Parser p = new AutoDetectParser();
        for (Map.Entry<String, String> e : dcFormat.entrySet()) {
            String fName = "testPDF_Version." + e.getKey() + ".pdf";
            Metadata m = new Metadata();
            //try-with-resources: the stream is released even if parse() throws
            try (InputStream is = PDFParserTest.class.getResourceAsStream("/test-documents/" + fName)) {
                p.parse(is, new BodyContentHandler(), m, new ParseContext());
            }

            //dc:format is multi-valued; the expected value only has to be one of them
            boolean foundDC = false;
            for (String v : m.getValues("dc:format")) {
                if (v.equals(e.getValue())) {
                    foundDC = true;
                    break;
                }
            }
            assertTrue("dc:format ::" + e.getValue(), foundDC);

            String extensionVersionTruth = pdfExtensionVersions.get(e.getKey());
            if (extensionVersionTruth != null) {
                assertEquals("pdf:PDFExtensionVersion :: " + extensionVersionTruth, extensionVersionTruth,
                        m.get("pdf:PDFExtensionVersion"));
            }
            assertEquals("pdf:PDFVersion", pdfVersions.get(e.getKey()), m.get("pdf:PDFVersion"));
        }

        //now test full 11.x
        String fName = "testPDF_Version.11.x.PDFA-1b.pdf";
        Metadata m = new Metadata();
        try (InputStream is = PDFParserTest.class.getResourceAsStream("/test-documents/" + fName)) {
            p.parse(is, new BodyContentHandler(), m, new ParseContext());
        }
        Set<String> versions = new HashSet<String>();
        for (String fmt : m.getValues("dc:format")) {
            versions.add(fmt);
        }

        for (String hit : new String[] { "application/pdf; version=1.7", "application/pdf; version=\"A-1b\"",
                "application/pdf; version=\"1.7 Adobe Extension Level 8\"" }) {
            assertTrue(hit, versions.contains(hit));
        }

        //JUnit argument order is (message, expected, actual)
        assertEquals("pdfaid:conformance", "B", m.get("pdfaid:conformance"));
        assertEquals("pdfaid:part", "1", m.get("pdfaid:part"));
    }

    /**
     * Checks that both authors of testPDF_twoAuthors.pdf are reported under
     * every author-related metadata key.
     */
    @Test
    public void testMultipleAuthors() throws Exception {
        String fName = "testPDF_twoAuthors.pdf";
        Parser p = new AutoDetectParser();
        Metadata m = new Metadata();
        //try-with-resources: the stream is released even if parse() throws
        try (InputStream is = PDFParserTest.class.getResourceAsStream("/test-documents/" + fName)) {
            p.parse(is, new BodyContentHandler(), m, new ParseContext());
        }

        String[] keys = new String[] { "dc:creator", "meta:author", "creator", "Author" };

        for (String k : keys) {
            String[] vals = m.getValues(k);
            assertEquals("number of authors == 2 for key: " + k, 2, vals.length);
            //the order of the two values is not checked, only their presence
            Set<String> set = new HashSet<String>();
            set.add(vals[0]);
            set.add(vals[1]);
            assertTrue("Sample Author 1", set.contains("Sample Author 1"));
            assertTrue("Sample Author 2", set.contains("Sample Author 2"));
        }
    }

    //STUB test: extend it once TIKA-1295 is fixed
    /**
     * Checks that the default {@code dc:title} of testPDFTripleLangTitle.pdf
     * is "Hello World".
     */
    @Test
    public void testMultipleTitles() throws Exception {
        Parser p = new AutoDetectParser();
        Metadata m = new Metadata();
        //try-with-resources: the stream is released even if parse() throws
        try (InputStream is = PDFParserTest.class
                .getResourceAsStream("/test-documents/testPDFTripleLangTitle.pdf")) {
            p.parse(is, new BodyContentHandler(), m, new ParseContext());
        }
        //TODO: add other tests as part of TIKA-1295
        //dc:title-fr-ca (or whatever we decide) should be "Bonjour World"
        //dc:title-zh-ch is currently hosed...bug in PDFBox while injecting xmp?
        //
        assertEquals("Hello World", m.get("dc:title"));
    }

    /**
     * Parses testPDF_childAttachments.pdf twice with inline-image extraction
     * enabled: first without a DocumentSelector (2 inline + 2 attachments
     * expected), then with {@link AvoidInlineSelector} (0 inline + 2
     * attachments expected).
     */
    @Test
    public void testInlineSelector() throws Exception {

        PDFParserConfig config = new PDFParserConfig();
        config.setExtractInlineImages(true);
        config.setExtractUniqueInlineImagesOnly(false);

        Parser defaultParser = new AutoDetectParser();

        RecursiveParserWrapper p = new RecursiveParserWrapper(defaultParser,
                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1));
        ParseContext context = new ParseContext();
        context.set(org.apache.tika.parser.pdf18.PDFParserConfig.class, config);
        context.set(org.apache.tika.parser.Parser.class, p);
        String path = "/test-documents/testPDF_childAttachments.pdf";

        //values of the embedded-resource-type property that are counted below
        String inlineType = TikaCoreProperties.EmbeddedResourceType.INLINE.toString();
        String attachmentType = TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString();

        //try-with-resources: the stream is released even if parse() throws
        try (InputStream stream = TikaInputStream.get(this.getClass().getResource(path))) {
            p.parse(stream, new BodyContentHandler(-1), new Metadata(), context);
        }

        int inline = 0;
        int attach = 0;
        for (Metadata m : p.getMetadata()) {
            String v = m.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE);
            if (inlineType.equals(v)) {
                inline++;
            } else if (attachmentType.equals(v)) {
                attach++;
            }
        }
        assertEquals(2, inline);
        assertEquals(2, attach);

        p.reset();

        //now try turning off inline
        context.set(org.apache.tika.extractor.DocumentSelector.class, new AvoidInlineSelector());
        try (InputStream stream = TikaInputStream.get(this.getClass().getResource(path))) {
            p.parse(stream, new BodyContentHandler(-1), new Metadata(), context);
        }

        inline = 0;
        attach = 0;
        for (Metadata m : p.getMetadata()) {
            String v = m.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE);
            if (inlineType.equals(v)) {
                inline++;
            } else if (attachmentType.equals(v)) {
                attach++;
            }
        }
        assertEquals(0, inline);
        assertEquals(2, attach);

    }

    /**
     * Parses testPDF_childAttachments.pdf twice: first with no PDFParserConfig
     * in the context (0 inline + 2 attachments expected), then with a config
     * that enables inline-image extraction (2 inline + 2 attachments expected).
     */
    @Test
    public void testInlineConfig() throws Exception {

        Parser defaultParser = new AutoDetectParser();
        RecursiveParserWrapper p = new RecursiveParserWrapper(defaultParser,
                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1));
        ParseContext context = new ParseContext();
        context.set(org.apache.tika.parser.Parser.class, p);
        String path = "/test-documents/testPDF_childAttachments.pdf";

        //values of the embedded-resource-type property that are counted below
        String inlineType = TikaCoreProperties.EmbeddedResourceType.INLINE.toString();
        String attachmentType = TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString();

        //try-with-resources: the stream is released even if parse() throws
        try (InputStream stream = TikaInputStream.get(this.getClass().getResource(path))) {
            p.parse(stream, new BodyContentHandler(-1), new Metadata(), context);
        }

        int inline = 0;
        int attach = 0;
        for (Metadata m : p.getMetadata()) {
            String v = m.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE);
            if (inlineType.equals(v)) {
                inline++;
            } else if (attachmentType.equals(v)) {
                attach++;
            }
        }
        assertEquals(0, inline);
        assertEquals(2, attach);

        p.reset();

        //now turn inline image extraction on
        PDFParserConfig config = new PDFParserConfig();
        config.setExtractInlineImages(true);
        config.setExtractUniqueInlineImagesOnly(false);
        context.set(org.apache.tika.parser.pdf18.PDFParserConfig.class, config);

        try (InputStream stream = TikaInputStream.get(this.getClass().getResource(path))) {
            p.parse(stream, new BodyContentHandler(-1), new Metadata(), context);
        }

        inline = 0;
        attach = 0;
        for (Metadata m : p.getMetadata()) {
            String v = m.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE);
            if (inlineType.equals(v)) {
                inline++;
            } else if (attachmentType.equals(v)) {
                attach++;
            }
        }
        assertEquals(2, inline);
        assertEquals(2, attach);
    }

    /**
     * Checks that the first attachment of testPDF_multiFormatEmbFiles.pdf is
     * reported with the resource name "Test.txt".
     */
    @Test //TIKA-1376
    public void testEmbeddedFileNameExtraction() throws Exception {
        RecursiveParserWrapper p = new RecursiveParserWrapper(new AutoDetectParser(),
                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1));
        Metadata m = new Metadata();
        ParseContext c = new ParseContext();
        c.set(org.apache.tika.parser.Parser.class, p);
        ContentHandler h = new BodyContentHandler();
        //try-with-resources: the stream is released even if parse() throws
        try (InputStream is = PDFParserTest.class
                .getResourceAsStream("/test-documents/testPDF_multiFormatEmbFiles.pdf")) {
            p.parse(is, h, m, c);
        }
        List<Metadata> metadatas = p.getMetadata();
        assertEquals("metadata size", 5, metadatas.size());
        //index 0 is the container document; index 1 is the first attachment
        Metadata firstAttachment = metadatas.get(1);
        assertEquals("attachment file name", "Test.txt", firstAttachment.get(Metadata.RESOURCE_NAME_KEY));
    }

    /**
     * Checks that recursively parsing testPDF_multiFormatEmbFiles.pdf yields
     * five Metadata entries. The per-attachment name and content assertions
     * are currently commented out.
     */
    @Test //TIKA-1374
    public void testOSSpecificEmbeddedFileExtraction() throws Exception {
        RecursiveParserWrapper p = new RecursiveParserWrapper(new AutoDetectParser(),
                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1));
        Metadata m = new Metadata();
        ParseContext c = new ParseContext();
        c.set(org.apache.tika.parser.Parser.class, p);
        ContentHandler h = new BodyContentHandler();
        //try-with-resources: the stream is released even if parse() throws
        try (InputStream is = PDFParserTest.class
                .getResourceAsStream("/test-documents/testPDF_multiFormatEmbFiles.pdf")) {
            p.parse(is, h, m, c);
        }
        List<Metadata> metadatas = p.getMetadata();
        assertEquals("metadata size", 5, metadatas.size());
        /*
                assertEquals("file name", "Test.txt", metadatas.get(1).get(Metadata.RESOURCE_NAME_KEY));
                assertContains("os specific", metadatas.get(1).get(RecursiveParserWrapper.TIKA_CONTENT));
                assertEquals("file name", "TestMac.txt", metadatas.get(2).get(Metadata.RESOURCE_NAME_KEY));
                assertContains("mac embedded", metadatas.get(2).get(RecursiveParserWrapper.TIKA_CONTENT));
                assertEquals("file name", "TestDos.txt", metadatas.get(3).get(Metadata.RESOURCE_NAME_KEY));
                assertContains("dos embedded", metadatas.get(3).get(RecursiveParserWrapper.TIKA_CONTENT));
                assertEquals("file name", "TestUnix.txt", metadatas.get(4).get(Metadata.RESOURCE_NAME_KEY));
                assertContains("unix embedded", metadatas.get(4).get(RecursiveParserWrapper.TIKA_CONTENT));
        */
    }

    @Test //TIKA-1427
    public void testEmbeddedFileMarkup() throws Exception {
        //extract every inline image, including repeated ones
        PDFParserConfig pdfConfig = new PDFParserConfig();
        pdfConfig.setExtractInlineImages(true);
        pdfConfig.setExtractUniqueInlineImagesOnly(false);

        Parser autoParser = new AutoDetectParser();
        ParseContext parseContext = new ParseContext();
        parseContext.set(org.apache.tika.parser.Parser.class, autoParser);
        parseContext.set(org.apache.tika.parser.pdf18.PDFParserConfig.class, pdfConfig);

        ContentHandler xmlHandler = new ToXMLContentHandler();
        InputStream in = null;
        try {
            in = TikaInputStream.get(this.getClass().getResource("/test-documents/testPDF_childAttachments.pdf"));
            autoParser.parse(in, xmlHandler, new Metadata(), parseContext);
        } finally {
            IOUtils.closeQuietly(in);
        }

        String attachmentsXml = xmlHandler.toString();
        //a regular attachment is rendered as an "embedded" div
        assertContains("<div class=\"embedded\" id=\"Unit10.doc\" />", attachmentsXml);
        //an inline image is rendered as an img element
        assertContains("<img src=\"embedded:image1.tif\" alt=\"image1.tif\" />", attachmentsXml);

        //a document embedded inside an annotation also gets an "embedded" div
        String annotationXml = getXML("testPDFFileEmbInAnnotation.pdf").xml;
        assertContains("<div class=\"embedded\" id=\"Excel.xlsx\" />", annotationXml);
    }

    //Access checker tests

    @Test
    public void testLegacyAccessChecking() throws Exception {
        //default behavior must not throw an AccessPermissionException
        String[] emptyPasswordDocs = {
                "testPDF_no_extract_no_accessibility_owner_empty.pdf",
                "testPDF_no_extract_yes_accessibility_owner_empty.pdf" };
        for (String doc : emptyPasswordDocs) {
            assertContains("Hello World", getXML(doc).xml);
        }

        //same expectation when the password "user" is supplied
        ParseContext context = new ParseContext();
        context.set(PasswordProvider.class, new PasswordProvider() {
            @Override
            public String getPassword(Metadata metadata) {
                return "user";
            }
        });
        Parser parser = new AutoDetectParser();

        String[] userPasswordDocs = {
                "testPDF_no_extract_no_accessibility_owner_user.pdf",
                "testPDF_no_extract_yes_accessibility_owner_user.pdf" };
        for (String doc : userPasswordDocs) {
            InputStream in = null;
            try {
                in = TikaInputStream.get(this.getClass().getResource("/test-documents/" + doc));
                assertContains("Hello World", getText(in, parser, context));
            } finally {
                IOUtils.closeQuietly(in);
            }
        }
    }

    @Test
    public void testAccessCheckingEmptyPassword() throws Exception {
        final String noAccessibilityDoc = "/test-documents/"
                + "testPDF_no_extract_no_accessibility_owner_empty.pdf";
        final String yesAccessibilityDoc = "/test-documents/"
                + "testPDF_no_extract_yes_accessibility_owner_empty.pdf";

        Parser parser = new AutoDetectParser();
        PDFParserConfig config = new PDFParserConfig();
        ParseContext context = new ParseContext();
        //the context keeps a reference to config, so later setAccessChecker calls take effect
        context.set(PDFParserConfig.class, config);

        //don't allow extraction, not even for accessibility:
        //both documents must be refused when opened with the empty password
        config.setAccessChecker(new AccessChecker(false));
        assertException(noAccessibilityDoc, parser, context, AccessPermissionException.class);
        assertException(yesAccessibilityDoc, parser, context, AccessPermissionException.class);

        //with AccessChecker(true) the "no_accessibility" document is still refused...
        config.setAccessChecker(new AccessChecker(true));
        assertException(noAccessibilityDoc, parser, context, AccessPermissionException.class);

        //...while the "yes_accessibility" document yields its text
        InputStream in = null;
        try {
            in = getResourceAsStream(yesAccessibilityDoc);
            assertContains("Hello World", getText(in, parser, context));
        } finally {
            IOUtils.closeQuietly(in);
        }
    }

    @Test
    public void testAccessCheckingUserPassword() throws Exception {
        //documents for which the supplied password "user" is expected to be rejected
        String[] ownerEmptyDocs = {
                "testPDF_no_extract_no_accessibility_owner_empty.pdf",
                "testPDF_no_extract_yes_accessibility_owner_empty.pdf" };
        //documents that require the "user" password
        String[] ownerUserDocs = {
                "testPDF_no_extract_no_accessibility_owner_user.pdf",
                "testPDF_no_extract_yes_accessibility_owner_user.pdf" };

        PDFParserConfig config = new PDFParserConfig();
        ParseContext context = new ParseContext();
        context.set(PasswordProvider.class, new PasswordProvider() {
            @Override
            public String getPassword(Metadata metadata) {
                return "user";
            }
        });
        context.set(PDFParserConfig.class, config);

        Parser parser = new AutoDetectParser();

        //don't allow extraction, not even for accessibility: bad passwords are reported as such
        config.setAccessChecker(new AccessChecker(false));
        for (String doc : ownerEmptyDocs) {
            assertException("/test-documents/" + doc, parser, context, EncryptedDocumentException.class);
        }

        //bad password is still a bad password
        config.setAccessChecker(new AccessChecker(true));
        for (String doc : ownerEmptyDocs) {
            assertException("/test-documents/" + doc, parser, context, EncryptedDocumentException.class);
        }

        //now test documents that require this "user" password
        assertException("/test-documents/" + ownerUserDocs[0], parser, context,
                AccessPermissionException.class);

        InputStream in = null;
        try {
            in = getResourceAsStream("/test-documents/" + ownerUserDocs[1]);
            assertContains("Hello World", getText(in, parser, context));
        } finally {
            IOUtils.closeQuietly(in);
        }

        //back to AccessChecker(false): both "user" documents are refused
        config.setAccessChecker(new AccessChecker(false));
        for (String doc : ownerUserDocs) {
            assertException("/test-documents/" + doc, parser, context, AccessPermissionException.class);
        }
    }

    /**
     * Checks that, when the password "owner" is supplied, text can be
     * extracted from all four access-restricted test documents under both
     * AccessChecker settings.
     */
    @Test
    public void testAccessCheckingOwnerPassword() throws Exception {
        ParseContext context = new ParseContext();

        PDFParserConfig config = new PDFParserConfig();
        //start with AccessChecker(true); AccessChecker(false) is exercised further down
        config.setAccessChecker(new AccessChecker(true));
        PasswordProvider passwordProvider = new PasswordProvider() {
            @Override
            public String getPassword(Metadata metadata) {
                return "owner";
            }
        };

        context.set(PasswordProvider.class, passwordProvider);
        context.set(PDFParserConfig.class, config);

        Parser parser = new AutoDetectParser();

        String[] paths = new String[] { "testPDF_no_extract_no_accessibility_owner_user.pdf",
                "testPDF_no_extract_yes_accessibility_owner_user.pdf",
                "testPDF_no_extract_no_accessibility_owner_empty.pdf",
                "testPDF_no_extract_yes_accessibility_owner_empty.pdf", };

        //with owner's password, text can be extracted, no matter the AccessibilityChecker's settings
        for (String path : paths) {
            //parse the file named by the loop variable, not one fixed document
            try (InputStream is = getResourceAsStream("/test-documents/" + path)) {
                assertContains("Hello World", getText(is, parser, context));
            }
        }

        //really, with owner's password, all extraction is allowed
        config.setAccessChecker(new AccessChecker(false));
        for (String path : paths) {
            try (InputStream is = getResourceAsStream("/test-documents/" + path)) {
                assertContains("Hello World", getText(is, parser, context));
            }
        }
    }

    @Test
    public void testPDFEncodedStringsInXMP() throws Exception {
        //TIKA-1678: the title metadata must come back as plain "Microsoft"
        Metadata metadata = getXML("testPDF_PDFEncodedStringInXMP.pdf").metadata;
        String title = metadata.get(TikaCoreProperties.TITLE);
        assertEquals("Microsoft", title);
    }

    @Test
    public void testXFAExtractionBasic() throws Exception {
        String xml = getXML("testPDF_XFA_govdocs1_258578.pdf").xml;
        //text that exists only in the "regular" pdf content
        assertContains("Mount Rushmore National Memorial", xml);
        //xfa field together with its data
        assertContains("<li fieldName=\"School_Name\">School Name: my_school</li>", xml);
    }

    @Test
    public void testXFAOnly() throws Exception {
        //configure the parser with the "if XFA, extract only XFA" flag set
        PDFParserConfig xfaOnlyConfig = new PDFParserConfig();
        xfaOnlyConfig.setIfXFAExtractOnlyXFA(true);

        ParseContext parseContext = new ParseContext();
        parseContext.set(PDFParserConfig.class, xfaOnlyConfig);

        ContentHandler xmlHandler = new ToXMLContentHandler(StandardCharsets.UTF_8.name());
        Metadata docMetadata = new Metadata();
        Parser autoParser = new AutoDetectParser();
        try (InputStream in = getResourceAsStream("/test-documents/testPDF_XFA_govdocs1_258578.pdf")) {
            autoParser.parse(in, xmlHandler, docMetadata, parseContext);
        }

        String xml = xmlHandler.toString();
        //xfa field content is present and the xfa section closes the body
        assertContains("<li fieldName=\"Room_1\">Room [1]: my_room1</li>", xml);
        assertContains("</xfa_content></body></html>", xml);

        //text from the regular pdf content must be absent
        assertNotContained("Mount Rushmore National Memorial", xml);
    }

    @Test
    public void testXMPMM() throws Exception {
        //document id of the two-author test file
        Metadata twoAuthors = getXML("testPDF_twoAuthors.pdf").metadata;
        assertEquals("uuid:0e46913c-72b9-40c0-8232-69e362abcd1e", twoAuthors.get(XMPMM.DOCUMENTID));

        //document id of the PDF/A-1b test file
        Metadata pdfa = getXML("testPDF_Version.11.x.PDFA-1b.pdf").metadata;
        assertEquals("uuid:cccee1fc-51b3-4b52-ac86-672af3974d25", pdfa.get(XMPMM.DOCUMENTID));

        //the history section must yield 7 elements in each parallel array
        String[] expectedInstanceIds = {
                "uuid:0313504b-a0b0-4dac-a9f0-357221f2eadf",
                "uuid:edc4279e-0d5f-465e-b13e-1298402fd11c",
                "uuid:f565b775-43f3-4a9a-8541-e98c4115db6d",
                "uuid:9fd5e0a8-14a5-4920-ad7f-870c0b8ee65f",
                "uuid:09b6cfba-efde-4e07-a77f-70de858cc0aa",
                "uuid:1e4ffbd7-dabc-4aae-801c-15b3404ade36",
                "uuid:c1669773-a6ca-4bdd-aade-519030d0af00" };
        assertArrayEquals(expectedInstanceIds, pdfa.getValues(XMPMM.HISTORY_EVENT_INSTANCEID));

        String[] expectedActions = {
                "converted", "converted", "converted", "converted",
                "converted", "converted", "converted" };
        assertArrayEquals(expectedActions, pdfa.getValues(XMPMM.HISTORY_ACTION));

        String[] expectedAgents = {
                "Preflight", "Preflight", "Preflight", "Preflight",
                "Preflight", "Preflight", "Preflight" };
        assertArrayEquals(expectedAgents, pdfa.getValues(XMPMM.HISTORY_SOFTWARE_AGENT));

        String[] expectedTimestamps = {
                "2014-03-04T23:50:41Z", "2014-03-04T23:50:42Z", "2014-03-04T23:51:34Z",
                "2014-03-04T23:51:36Z", "2014-03-04T23:51:37Z", "2014-03-04T23:52:22Z",
                "2014-03-04T23:54:48Z" };
        assertArrayEquals(expectedTimestamps, pdfa.getValues(XMPMM.HISTORY_WHEN));
    }

    /**
     * Asserts that extracting text from the given resource throws an exception
     * whose class is exactly {@code expected}.
     *
     * @param path     resource path of the document to parse
     * @param parser   parser to run
     * @param context  parse context handed to the parser
     * @param expected exact class of the exception that must be thrown
     */
    private void assertException(String path, Parser parser, ParseContext context, Class<?> expected) {
        boolean noEx = false;
        InputStream is = getResourceAsStream(path);
        try {
            //the extracted text is irrelevant; only whether the call throws matters
            getText(is, parser, context);
            noEx = true;
        } catch (Exception e) {
            assertEquals("Not the right exception: " + path, expected, e.getClass());
        } finally {
            IOUtils.closeQuietly(is);
        }
        //asserted outside the try block: an AssertionError is not an Exception,
        //so it would not be caught above anyway, but this keeps the flow explicit
        assertFalse(path + " should have thrown exception", noEx);
    }

    /**
     * Simple class to count end of document events.  If functionality is useful,
     * move to org.apache.tika in src/test
     */
    private class EventCountingHandler extends ContentHandlerDecorator {
        private int endDocument = 0;

        @Override
        public void endDocument() {
            endDocument++;
        }

        public int getEndDocument() {
            return endDocument;
        }
    }

    private class AvoidInlineSelector implements DocumentSelector {

        @Override
        public boolean select(Metadata metadata) {
            String v = metadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE);
            if (v != null && v.equals(TikaCoreProperties.EmbeddedResourceType.INLINE.toString())) {
                return false;
            }
            return true;
        }
    }
}