Example usage for org.xml.sax ContentHandler toString

Introduction

On this page you can find example usages of the org.xml.sax.ContentHandler toString() method.

Prototype

public String toString()

Source Link

Document

Returns a string representation of the object.

Usage

From source file:org.apache.tika.parser.pdf.PDFParserTest.java

/**
 * PDFs can be "protected" with the default password: they are encrypted
 * (potentially both text and metadata), yet can be decrypted without any
 * user-supplied secret.
 *
 * <p>The test parses the same protected file three times: without a
 * password provider, with a provider returning an explicit empty password,
 * and with a provider returning a wrong password, which is expected to end
 * in an {@link EncryptedDocumentException}.
 */
@Test
public void testProtectedPDF() throws Exception {
    final String protectedPdf = "testPDF_protected.pdf";
    final String expectedTitle =
            "Rethinking the Financial Network, Speech by Andrew G Haldane, Executive Director, Financial Stability delivered at the Financial Student Association, Amsterdam on 28 April 2009";

    // Pass 1: no password provider registered.
    XMLResult parsed = getXML(protectedPdf);
    Metadata meta = parsed.metadata;
    assertEquals("true", meta.get("pdf:encrypted"));
    assertEquals("application/pdf", meta.get(Metadata.CONTENT_TYPE));
    assertEquals("The Bank of England", meta.get(TikaCoreProperties.CREATOR));
    assertEquals("The Bank of England", meta.get(Metadata.AUTHOR));
    assertEquals("Speeches by Andrew G Haldane", meta.get(OfficeOpenXMLCore.SUBJECT));
    assertEquals("Speeches by Andrew G Haldane", meta.get(Metadata.SUBJECT));
    assertEquals(expectedTitle, meta.get(TikaCoreProperties.TITLE));

    assertContains("RETHINKING THE FINANCIAL NETWORK", parsed.xml);
    assertContains("On 16 November 2002", parsed.xml);
    assertContains("In many important respects", parsed.xml);

    // Pass 2: a provider that hands out an explicit empty password.
    ParseContext parseContext = new ParseContext();
    parseContext.set(PasswordProvider.class, new PasswordProvider() {
        public String getPassword(Metadata ignored) {
            return "";
        }
    });
    parsed = getXML(protectedPdf, parseContext);
    meta = parsed.metadata;
    assertEquals("true", meta.get("pdf:encrypted"));

    assertEquals("application/pdf", meta.get(Metadata.CONTENT_TYPE));
    assertEquals("The Bank of England", meta.get(TikaCoreProperties.CREATOR));
    assertEquals("Speeches by Andrew G Haldane", meta.get(OfficeOpenXMLCore.SUBJECT));
    assertEquals("Speeches by Andrew G Haldane", meta.get(Metadata.SUBJECT));
    assertEquals(expectedTitle, meta.get(TikaCoreProperties.TITLE));

    assertContains("RETHINKING THE FINANCIAL NETWORK", parsed.xml);
    assertContains("On 16 November 2002", parsed.xml);
    assertContains("In many important respects", parsed.xml);

    // Pass 3: a wrong password must be rejected.
    parseContext.set(PasswordProvider.class, new PasswordProvider() {
        public String getPassword(Metadata ignored) {
            return "WRONG!!!!";
        }
    });

    boolean encryptionRejected = false;
    ContentHandler bodyHandler = new BodyContentHandler();
    meta = new Metadata();
    try (InputStream in = PDFParserTest.class
            .getResourceAsStream("/test-documents/testPDF_protected.pdf")) {
        Parser autoParser = new AutoDetectParser();
        autoParser.parse(in, bodyHandler, meta, parseContext);
    } catch (EncryptedDocumentException expected) {
        encryptionRejected = true;
    }
    assertTrue("encryption exception", encryptionRejected);
    assertEquals("application/pdf", meta.get(Metadata.CONTENT_TYPE));
    assertEquals("true", meta.get("pdf:encrypted"));
    // Only pdf:encrypted, X-Parsed-By and Content-Type are expected.
    assertEquals("very little metadata should be parsed", 3, meta.names().length);
    // No body text may have reached the handler.
    assertEquals(0, bodyHandler.toString().length());
}

From source file:org.apache.tika.parser.pdf.PDFParserTest.java

/**
 * Parses a PDF with a bad page twice. With the default context the test
 * expects a {@code TikaException}, one recorded exception warning and the
 * text "1309.61" in the output. With
 * {@code setCatchIntermediateIOExceptions(false)} it expects the exception,
 * no recorded warning and no "1309.61" in the output.
 */
@Test
public void testSkipBadPage() throws Exception {
    // The test file comes from govdocs1. The TikaTest shortcuts can't be
    // used here because the parse is expected to throw.
    final String badPagePdf = "/test-documents/testPDF_bad_page_303226.pdf";
    Parser autoParser = new AutoDetectParser();
    ParseContext parseContext = new ParseContext();

    // Run 1: default context.
    ContentHandler firstHandler = new BodyContentHandler(-1);
    Metadata firstMeta = new Metadata();
    boolean firstThrew = false;
    try (InputStream in = getResourceAsStream(badPagePdf)) {
        autoParser.parse(in, firstHandler, firstMeta, parseContext);
    } catch (TikaException expected) {
        firstThrew = true;
    }
    String firstText = firstHandler.toString();
    assertTrue("Should have thrown exception", firstThrew);
    assertEquals(1, firstMeta.getValues(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING).length);
    assertContains("Unknown dir", firstMeta.get(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING));
    assertContains("1309.61", firstText);

    // Run 2: ask the parser to throw the exception immediately.
    PDFParserConfig failFastConfig = new PDFParserConfig();
    failFastConfig.setCatchIntermediateIOExceptions(false);
    parseContext.set(PDFParserConfig.class, failFastConfig);

    ContentHandler secondHandler = new BodyContentHandler(-1);
    Metadata secondMeta = new Metadata();
    boolean secondThrew = false;
    try (InputStream in = getResourceAsStream(badPagePdf)) {
        autoParser.parse(in, secondHandler, secondMeta, parseContext);
    } catch (TikaException expected) {
        secondThrew = true;
    }
    String secondText = secondHandler.toString();
    assertTrue("Should have thrown exception", secondThrew);
    assertEquals(0, secondMeta.getValues(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING).length);
    assertNotContained("1309.61", secondText);
}

From source file:org.apache.tika.parser.pdf18.PDFParserTest.java

/**
 * PDFs can be "protected" with the default password. This means
 * they're encrypted (potentially both text and metadata),
 * but we can decrypt them easily.
 *
 * <p>Four parses of the same protected file are checked: no password
 * provider, an explicit empty password, a wrong password, and a wrong
 * password with the non-sequential parser enabled. The two wrong-password
 * parses must end in an {@link EncryptedDocumentException} and produce no
 * body text.
 */
@Test
public void testProtectedPDF() throws Exception {
    Parser parser = new AutoDetectParser(); // Should auto-detect!
    ContentHandler handler = new BodyContentHandler();
    Metadata metadata = new Metadata();
    ParseContext context = new ParseContext();

    try (InputStream stream = PDFParserTest.class
            .getResourceAsStream("/test-documents/testPDF_protected.pdf")) {
        parser.parse(stream, handler, metadata, context);
    }

    assertEquals("true", metadata.get("pdf:encrypted"));
    assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE));
    assertEquals("The Bank of England", metadata.get(TikaCoreProperties.CREATOR));
    assertEquals("The Bank of England", metadata.get(Metadata.AUTHOR));
    assertEquals("Speeches by Andrew G Haldane", metadata.get(OfficeOpenXMLCore.SUBJECT));
    assertEquals("Speeches by Andrew G Haldane", metadata.get(Metadata.SUBJECT));
    assertEquals(
            "Rethinking the Financial Network, Speech by Andrew G Haldane, Executive Director, Financial Stability delivered at the Financial Student Association, Amsterdam on 28 April 2009",
            metadata.get(TikaCoreProperties.TITLE));

    String content = handler.toString();
    assertContains("RETHINKING THE FINANCIAL NETWORK", content);
    assertContains("On 16 November 2002", content);
    assertContains("In many important respects", content);

    // Try again with an explicit empty password
    handler = new BodyContentHandler();
    metadata = new Metadata();

    context = new ParseContext();
    context.set(PasswordProvider.class, new PasswordProvider() {
        public String getPassword(Metadata metadata) {
            return "";
        }
    });

    try (InputStream stream = PDFParserTest.class
            .getResourceAsStream("/test-documents/testPDF_protected.pdf")) {
        parser.parse(stream, handler, metadata, context);
    }
    assertEquals("true", metadata.get("pdf:encrypted"));

    assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE));
    assertEquals("The Bank of England", metadata.get(TikaCoreProperties.CREATOR));
    assertEquals("Speeches by Andrew G Haldane", metadata.get(OfficeOpenXMLCore.SUBJECT));
    assertEquals("Speeches by Andrew G Haldane", metadata.get(Metadata.SUBJECT));
    assertEquals(
            "Rethinking the Financial Network, Speech by Andrew G Haldane, Executive Director, Financial Stability delivered at the Financial Student Association, Amsterdam on 28 April 2009",
            metadata.get(TikaCoreProperties.TITLE));

    // Re-read the text from the fresh handler; otherwise the assertions
    // below would only re-check the output of the first parse.
    content = handler.toString();
    assertContains("RETHINKING THE FINANCIAL NETWORK", content);
    assertContains("On 16 November 2002", content);
    assertContains("In many important respects", content);

    //now test wrong password
    handler = new BodyContentHandler();
    metadata = new Metadata();
    context = new ParseContext();
    context.set(PasswordProvider.class, new PasswordProvider() {
        public String getPassword(Metadata metadata) {
            return "WRONG!!!!";
        }
    });

    boolean ex = false;
    try (InputStream stream = PDFParserTest.class
            .getResourceAsStream("/test-documents/testPDF_protected.pdf")) {
        parser.parse(stream, handler, metadata, context);
    } catch (EncryptedDocumentException e) {
        ex = true;
    }
    content = handler.toString();

    assertTrue("encryption exception", ex);
    assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE));
    assertEquals("true", metadata.get("pdf:encrypted"));
    //pdf:encrypted, X-Parsed-By and Content-Type
    assertEquals("very little metadata should be parsed", 3, metadata.names().length);
    assertEquals(0, content.length());

    //now test wrong password with non sequential parser
    handler = new BodyContentHandler();
    metadata = new Metadata();
    context = new ParseContext();
    context.set(PasswordProvider.class, new PasswordProvider() {
        public String getPassword(Metadata metadata) {
            return "WRONG!!!!";
        }
    });
    PDFParserConfig config = new PDFParserConfig();
    config.setUseNonSequentialParser(true);
    context.set(PDFParserConfig.class, config);

    ex = false;
    try (InputStream stream = PDFParserTest.class
            .getResourceAsStream("/test-documents/testPDF_protected.pdf")) {
        parser.parse(stream, handler, metadata, context);
    } catch (EncryptedDocumentException e) {
        ex = true;
    }
    content = handler.toString();
    assertTrue("encryption exception", ex);
    assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE));
    assertEquals("true", metadata.get("pdf:encrypted"));

    //pdf:encrypted, X-Parsed-By and Content-Type
    assertEquals("very little metadata should be parsed", 3, metadata.names().length);
    assertEquals(0, content.length());
}

From source file:org.apache.tika.parser.pdf18.PDFParserTest.java

@Test
@Ignore("need to have other parsers available")
public void testEmbeddedPDFEmbeddingAnotherDocument() throws Exception {
    /* format of test doc:
      docx//*from   w  w  w.  ja  va  2 s  . co m*/
    pdf/
       docx
    */
    Parser parser = new AutoDetectParser(); // Should auto-detect!
    ContentHandler handler = new BodyContentHandler();
    Metadata metadata = new Metadata();
    ParseContext context = new ParseContext();
    String content = "";
    InputStream stream = null;
    try {
        context.set(org.apache.tika.parser.Parser.class, parser);
        stream = getResourceAsStream("/test-documents/testPDFEmbeddingAndEmbedded.docx");
        parser.parse(stream, handler, metadata, context);
        content = handler.toString();
    } finally {
        stream.close();
    }
    int outerHaystack = content.indexOf("Outer_haystack");
    int pdfHaystack = content.indexOf("pdf_haystack");
    int needle = content.indexOf("Needle");
    assertTrue(outerHaystack > -1);
    assertTrue(pdfHaystack > -1);
    assertTrue(needle > -1);
    assertTrue(needle > pdfHaystack && pdfHaystack > outerHaystack);

    TrackingHandler tracker = new TrackingHandler();
    TikaInputStream tis;
    ContainerExtractor ex = new ParserContainerExtractor();
    try {
        tis = TikaInputStream.get(getResourceAsStream("/test-documents/testPDFEmbeddingAndEmbedded.docx"));
        ex.extract(tis, ex, tracker);
    } finally {
        stream.close();
    }
    assertEquals(true, ex.isSupported(tis));
    assertEquals(3, tracker.filenames.size());
    assertEquals(3, tracker.mediaTypes.size());
    assertEquals("image1.emf", tracker.filenames.get(0));
    assertNull(tracker.filenames.get(1));
    assertEquals("Test.docx", tracker.filenames.get(2));
    assertEquals(TYPE_EMF, tracker.mediaTypes.get(0));
    assertEquals(TYPE_PDF, tracker.mediaTypes.get(1));
    assertEquals(TYPE_DOCX, tracker.mediaTypes.get(2));
}

From source file:org.apache.tika.parser.pdf18.PDFParserTest.java

/**
 * Checks the XHTML markup emitted for embedded resources: a regular
 * attachment, an inline image (inline image extraction is switched on in
 * the parser config) and a document embedded inside an annotation.
 */
@Test //TIKA-1427
public void testEmbeddedFileMarkup() throws Exception {
    Parser autoParser = new AutoDetectParser();

    PDFParserConfig pdfConfig = new PDFParserConfig();
    pdfConfig.setExtractInlineImages(true);
    pdfConfig.setExtractUniqueInlineImagesOnly(false);

    ParseContext parseContext = new ParseContext();
    parseContext.set(org.apache.tika.parser.Parser.class, autoParser);
    parseContext.set(org.apache.tika.parser.pdf18.PDFParserConfig.class, pdfConfig);

    ContentHandler xmlHandler = new ToXMLContentHandler();
    Metadata meta = new Metadata();
    InputStream in = null;
    try {
        in = TikaInputStream.get(this.getClass().getResource("/test-documents/testPDF_childAttachments.pdf"));
        autoParser.parse(in, xmlHandler, meta, parseContext);
    } finally {
        IOUtils.closeQuietly(in);
    }

    String attachmentMarkup = xmlHandler.toString();
    // A regular attachment is rendered as an "embedded" div.
    assertContains("<div class=\"embedded\" id=\"Unit10.doc\" />", attachmentMarkup);
    // An inline image is rendered as an img element.
    assertContains("<img src=\"embedded:image1.tif\" alt=\"image1.tif\" />", attachmentMarkup);

    // A document embedded inside an annotation.
    String annotationMarkup = getXML("testPDFFileEmbInAnnotation.pdf").xml;
    assertContains("<div class=\"embedded\" id=\"Excel.xlsx\" />", annotationMarkup);
}

From source file:org.apache.tika.parser.pdf18.PDFParserTest.java

/**
 * Parses an XFA PDF with {@code setIfXFAExtractOnlyXFA(true)} and checks
 * that the XFA field content is present in the XML output while the text
 * "Mount Rushmore National Memorial" is absent.
 */
@Test
public void testXFAOnly() throws Exception {
    PDFParserConfig xfaOnlyConfig = new PDFParserConfig();
    xfaOnlyConfig.setIfXFAExtractOnlyXFA(true);

    ParseContext parseContext = new ParseContext();
    parseContext.set(PDFParserConfig.class, xfaOnlyConfig);

    ContentHandler xmlHandler = new ToXMLContentHandler(StandardCharsets.UTF_8.name());
    Metadata meta = new Metadata();
    Parser autoParser = new AutoDetectParser();
    try (InputStream in = getResourceAsStream("/test-documents/testPDF_XFA_govdocs1_258578.pdf")) {
        autoParser.parse(in, xmlHandler, meta, parseContext);
    }

    String markup = xmlHandler.toString();
    assertContains("<li fieldName=\"Room_1\">Room [1]: my_room1</li>", markup);
    assertContains("</xfa_content></body></html>", markup);

    assertNotContained("Mount Rushmore National Memorial", markup);
}

From source file:org.apache.tika.parser.pkg.ZipParserTest.java

/**
 * Parses a zip archive of sample documents with the recursing context and
 * checks that the detected type is {@code application/zip} and that the
 * extracted text contains each entry's file name followed by a phrase
 * expected from that entry.
 */
@Test
public void testZipParsing() throws Exception {
    Parser autoParser = new AutoDetectParser(); // Should auto-detect!
    ContentHandler bodyHandler = new BodyContentHandler();
    Metadata meta = new Metadata();

    try (InputStream in = ZipParserTest.class.getResourceAsStream("/test-documents/test-documents.zip")) {
        autoParser.parse(in, bodyHandler, meta, recursingContext);
    }

    assertEquals("application/zip", meta.get(Metadata.CONTENT_TYPE));

    // Entry name followed by a phrase expected from that entry, in the
    // order they are asserted.
    String[] expectedFragments = {
        "testEXCEL.xls", "Sample Excel Worksheet",
        "testHTML.html", "Test Indexation Html",
        "testOpenOffice2.odt", "This is a sample Open Office document",
        "testPDF.pdf", "Apache Tika",
        "testPPT.ppt", "Sample Powerpoint Slide",
        "testRTF.rtf", "indexation Word",
        "testTXT.txt", "Test d'indexation de Txt",
        "testWORD.doc", "This is a sample Microsoft Word Document",
        "testXML.xml", "Rida Benjelloun",
    };
    String extracted = bodyHandler.toString();
    for (String fragment : expectedFragments) {
        assertContains(fragment, extracted);
    }
}

From source file:org.ednovo.gooru.domain.service.resource.ResourceParser.java

/**
 * Extracts the title and the whitespace-normalized body text of a resource.
 *
 * <p>Extraction is best effort: if the resource cannot be opened or parsed,
 * whatever text and metadata were collected up to that point (possibly
 * nothing) are still returned.
 *
 * @param url an {@code http://} or {@code https://} URL, or otherwise a
 *            local file path
 * @return the title from the parsed metadata, or the first 50 characters
 *         of the text when that title is blank, together with the text in
 *         which every whitespace run is collapsed to a single space
 */
public TitleAndText getTextAndTitle(String url) {
    ContentHandler textHandler = new BodyContentHandler();
    Metadata metadata = new Metadata();

    // try-with-resources guarantees the stream is closed even when parsing
    // fails; previously it was never closed at all.
    try (InputStream in = (url.startsWith("http://") || url.startsWith("https://"))
            ? new URL(url).openConnection().getInputStream()
            : new FileInputStream(url)) {
        parser.parse(in, textHandler, metadata, new ParseContext());
    } catch (Exception ignored) {
        // Deliberately ignored: callers get whatever was extracted before
        // the failure (see the method documentation).
    }

    String title = metadata.get(Metadata.TITLE);
    String text = textHandler.toString().trim().replaceAll("\\s+", " ");
    if (StringUtils.isBlank(title)) {
        // Fall back to the leading part of the text as a title.
        title = StringUtils.substring(text, 0, 50);
    }
    return new TitleAndText(title, text);
}

From source file:org.sakaiproject.search.component.adapter.contenthosting.TikaContentDigester.java

/**
 * Extracts the text of a content resource by streaming it through an
 * auto-detecting parser, seeding the parse metadata with the resource's
 * declared content type.
 *
 * @param contentResource the resource to digest; must not be {@code null}
 * @return the text collected by the body content handler
 * @throws RuntimeException if {@code contentResource} is {@code null} or
 *         if streaming or parsing the resource fails
 */
public String getContent(ContentResource contentResource) {
    log.debug("Digesting with TikaContentDigester");
    if (contentResource == null) {
        throw new RuntimeException("Null contentResource passed to getContent");
    }

    InputStream in = null;
    try {
        in = contentResource.streamContent();

        Metadata meta = new Metadata();
        meta.set(Metadata.CONTENT_TYPE, contentResource.getContentType());

        ContentHandler bodyHandler = new BodyContentHandler();
        Parser autoParser = new AutoDetectParser();
        autoParser.parse(in, bodyHandler, meta, new ParseContext());
        return bodyHandler.toString();
    } catch (Exception cause) {
        log.debug("Cannot index", cause);
        throw new RuntimeException("Failed to read content for indexing ", cause);
    } finally {
        // A failure to close is only logged; it must not mask the result
        // or the exception produced above.
        if (in != null) {
            try {
                in.close();
            } catch (IOException closeFailure) {
                log.debug(closeFailure);
            }
        }
    }
}

From source file:searcher.CollStat.java

/** Matches an unterminated tag ("&lt;name ..." without a closing "&gt;") at the end of the input. */
private static final Pattern DANGLING_TAG = Pattern.compile("<(\\s*)[a-zA-Z0-9]+[^>]+$");

/**
 * Builds a highlighted snippet for a document with respect to a query.
 *
 * <p>The document's stored HTML is decompressed, converted to plain text
 * with an HTML parser, and the best-scoring highlighted fragments (at most
 * five are requested) are concatenated. A trailing unterminated tag, if
 * any, is stripped from the result.
 *
 * @param q     the query whose terms are highlighted
 * @param doc   the document holding the compressed HTML field
 * @param docid not used by this method
 * @return the concatenated highlighted fragments; empty if no fragment
 *         has a positive score
 * @throws Exception if decompression, parsing or highlighting fails
 */
String getSnippet(Query q, Document doc, int docid) throws Exception {
    SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter();
    Highlighter highlighter = new Highlighter(htmlFormatter, new QueryScorer(q));

    // Get the decompressed html
    String html = IndexHtmlToText.decompress(doc.getBinaryValue(WTDocument.WTDOC_FIELD_HTML).bytes);

    // Convert the HTML to plain text; -1 lifts the handler's write limit.
    InputStream input = new ByteArrayInputStream(html.getBytes(StandardCharsets.UTF_8));
    ContentHandler handler = new BodyContentHandler(-1);
    Metadata metadata = new Metadata();
    new HtmlParser().parse(input, handler, metadata, new ParseContext());
    String text = handler.toString();

    // Generate snippet...
    TokenStream tokenStream = analyzer.tokenStream("dummy", new StringReader(text));
    TextFragment[] fragments = highlighter.getBestTextFragments(tokenStream, text, false, 5);

    // The builder is confined to this call, so no synchronization is needed.
    StringBuilder snippet = new StringBuilder();
    for (TextFragment fragment : fragments) {
        if (fragment != null && fragment.getScore() > 0) {
            snippet.append(fragment.toString());
        }
    }

    // replaceAll returns the input unchanged when nothing matches, so no
    // separate find() check is required.
    return DANGLING_TAG.matcher(snippet.toString()).replaceAll("");
}