Example usage for org.xml.sax ContentHandler toString

List of usage examples for org.xml.sax ContentHandler toString

Introduction

On this page you can find example usages of org.xml.sax ContentHandler toString.

Prototype

public String toString() 

Source Link

Documentation

Returns a string representation of the object.

Usage

From source file:org.apache.tika.parser.pdf.PDFParserTest.java

/**
 * Exercises a PDF that is "protected" with the default password: the file
 * is encrypted (potentially both text and metadata), yet it can be
 * decrypted easily. Covers parsing with no password provider, with an
 * explicit empty password, and with a wrong password.
 */
@Test
public void testProtectedPDF() throws Exception {
    final String expectedTitle =
            "Rethinking the Financial Network, Speech by Andrew G Haldane, Executive Director, Financial Stability delivered at the Financial Student Association, Amsterdam on 28 April 2009";
    final String[] expectedSnippets = {
            "RETHINKING THE FINANCIAL NETWORK",
            "On 16 November 2002",
            "In many important respects" };

    // Pass 1: no PasswordProvider registered.
    XMLResult result = getXML("testPDF_protected.pdf");
    Metadata meta = result.metadata;
    assertEquals("true", meta.get("pdf:encrypted"));
    assertEquals("application/pdf", meta.get(Metadata.CONTENT_TYPE));
    assertEquals("The Bank of England", meta.get(TikaCoreProperties.CREATOR));
    assertEquals("The Bank of England", meta.get(Metadata.AUTHOR));
    assertEquals("Speeches by Andrew G Haldane", meta.get(OfficeOpenXMLCore.SUBJECT));
    assertEquals("Speeches by Andrew G Haldane", meta.get(Metadata.SUBJECT));
    assertEquals(expectedTitle, meta.get(TikaCoreProperties.TITLE));
    for (String snippet : expectedSnippets) {
        assertContains(snippet, result.xml);
    }

    // Pass 2: an explicit empty password must give the same outcome.
    PasswordProvider emptyPassword = new PasswordProvider() {
        public String getPassword(Metadata metadata) {
            return "";
        }
    };
    ParseContext context = new ParseContext();
    context.set(PasswordProvider.class, emptyPassword);
    result = getXML("testPDF_protected.pdf", context);
    meta = result.metadata;
    assertEquals("true", meta.get("pdf:encrypted"));
    assertEquals("application/pdf", meta.get(Metadata.CONTENT_TYPE));
    assertEquals("The Bank of England", meta.get(TikaCoreProperties.CREATOR));
    assertEquals("Speeches by Andrew G Haldane", meta.get(OfficeOpenXMLCore.SUBJECT));
    assertEquals("Speeches by Andrew G Haldane", meta.get(Metadata.SUBJECT));
    assertEquals(expectedTitle, meta.get(TikaCoreProperties.TITLE));
    for (String snippet : expectedSnippets) {
        assertContains(snippet, result.xml);
    }

    // Pass 3: a wrong password must be rejected. The same context is reused,
    // so this provider replaces the empty-password one.
    PasswordProvider wrongPassword = new PasswordProvider() {
        public String getPassword(Metadata metadata) {
            return "WRONG!!!!";
        }
    };
    context.set(PasswordProvider.class, wrongPassword);

    boolean rejected = false;
    ContentHandler bodyHandler = new BodyContentHandler();
    meta = new Metadata();
    try (InputStream in = PDFParserTest.class
            .getResourceAsStream("/test-documents/testPDF_protected.pdf")) {
        Parser autoDetect = new AutoDetectParser();
        autoDetect.parse(in, bodyHandler, meta, context);
    } catch (EncryptedDocumentException expected) {
        rejected = true;
    }
    assertTrue("encryption exception", rejected);
    assertEquals("application/pdf", meta.get(Metadata.CONTENT_TYPE));
    assertEquals("true", meta.get("pdf:encrypted"));
    // Only pdf:encrypted, X-Parsed-By and Content-Type are expected.
    assertEquals("very little metadata should be parsed", 3, meta.names().length);
    // No body text may have reached the handler.
    assertEquals(0, bodyHandler.toString().length());
}

From source file:org.apache.tika.parser.pdf.PDFParserTest.java

/**
 * Parses a PDF containing a bad page twice: once with the default
 * configuration and once with intermediate IOException catching disabled.
 * Both runs are expected to end in a TikaException; the assertions check
 * what was recorded in the metadata and the handler before that happened.
 */
@Test
public void testSkipBadPage() throws Exception {
    // The test file comes from govdocs1. The TikaTest shortcuts cannot be
    // used here because the parse is expected to throw.
    final String badPagePdf = "/test-documents/testPDF_bad_page_303226.pdf";
    Parser autoDetect = new AutoDetectParser();
    ParseContext context = new ParseContext();

    // Run 1: default configuration.
    ContentHandler bodyHandler = new BodyContentHandler(-1);
    Metadata meta = new Metadata();
    boolean threw = false;
    try (InputStream in = getResourceAsStream(badPagePdf)) {
        autoDetect.parse(in, bodyHandler, meta, context);
    } catch (TikaException expected) {
        threw = true;
    }
    String extracted = bodyHandler.toString();
    assertTrue("Should have thrown exception", threw);
    assertEquals(1, meta.getValues(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING).length);
    assertContains("Unknown dir", meta.get(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING));
    assertContains("1309.61", extracted);

    // Run 2: now try throwing the exception immediately.
    PDFParserConfig failFast = new PDFParserConfig();
    failFast.setCatchIntermediateIOExceptions(false);
    context.set(PDFParserConfig.class, failFast);

    bodyHandler = new BodyContentHandler(-1);
    meta = new Metadata();
    threw = false;
    try (InputStream in = getResourceAsStream(badPagePdf)) {
        autoDetect.parse(in, bodyHandler, meta, context);
    } catch (TikaException expected) {
        threw = true;
    }
    extracted = bodyHandler.toString();
    assertTrue("Should have thrown exception", threw);
    assertEquals(0, meta.getValues(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING).length);
    assertNotContained("1309.61", extracted);
}

From source file:org.apache.tika.parser.pdf18.PDFParserTest.java

/**
 * PDFs can be "protected" with the default password. This means
 * they're encrypted (potentially both text and metadata),
 * but we can decrypt them easily.
 *
 * <p>The test parses the same protected file four times: with no password
 * provider, with an explicit empty password, with a wrong password, and
 * with a wrong password while the non-sequential parser is enabled.
 */
@Test
public void testProtectedPDF() throws Exception {
    Parser parser = new AutoDetectParser(); // Should auto-detect!
    ContentHandler handler = new BodyContentHandler();
    Metadata metadata = new Metadata();
    ParseContext context = new ParseContext();

    try (InputStream stream = PDFParserTest.class
            .getResourceAsStream("/test-documents/testPDF_protected.pdf")) {
        parser.parse(stream, handler, metadata, context);
    }

    assertEquals("true", metadata.get("pdf:encrypted"));
    assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE));
    assertEquals("The Bank of England", metadata.get(TikaCoreProperties.CREATOR));
    assertEquals("The Bank of England", metadata.get(Metadata.AUTHOR));
    assertEquals("Speeches by Andrew G Haldane", metadata.get(OfficeOpenXMLCore.SUBJECT));
    assertEquals("Speeches by Andrew G Haldane", metadata.get(Metadata.SUBJECT));
    assertEquals(
            "Rethinking the Financial Network, Speech by Andrew G Haldane, Executive Director, Financial Stability delivered at the Financial Student Association, Amsterdam on 28 April 2009",
            metadata.get(TikaCoreProperties.TITLE));

    String content = handler.toString();
    assertContains("RETHINKING THE FINANCIAL NETWORK", content);
    assertContains("On 16 November 2002", content);
    assertContains("In many important respects", content);

    // Try again with an explicit empty password
    handler = new BodyContentHandler();
    metadata = new Metadata();

    context = new ParseContext();
    context.set(PasswordProvider.class, new PasswordProvider() {
        public String getPassword(Metadata metadata) {
            return "";
        }
    });

    try (InputStream stream = PDFParserTest.class
            .getResourceAsStream("/test-documents/testPDF_protected.pdf")) {
        parser.parse(stream, handler, metadata, context);
    }
    assertEquals("true", metadata.get("pdf:encrypted"));

    assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE));
    assertEquals("The Bank of England", metadata.get(TikaCoreProperties.CREATOR));
    assertEquals("Speeches by Andrew G Haldane", metadata.get(OfficeOpenXMLCore.SUBJECT));
    assertEquals("Speeches by Andrew G Haldane", metadata.get(Metadata.SUBJECT));
    assertEquals(
            "Rethinking the Financial Network, Speech by Andrew G Haldane, Executive Director, Financial Stability delivered at the Financial Student Association, Amsterdam on 28 April 2009",
            metadata.get(TikaCoreProperties.TITLE));

    // Re-read the text from the fresh handler; otherwise the checks below
    // would only re-verify the output of the first parse.
    content = handler.toString();
    assertContains("RETHINKING THE FINANCIAL NETWORK", content);
    assertContains("On 16 November 2002", content);
    assertContains("In many important respects", content);

    //now test wrong password
    handler = new BodyContentHandler();
    metadata = new Metadata();
    context = new ParseContext();
    context.set(PasswordProvider.class, new PasswordProvider() {
        public String getPassword(Metadata metadata) {
            return "WRONG!!!!";
        }
    });

    boolean ex = false;
    try (InputStream stream = PDFParserTest.class
            .getResourceAsStream("/test-documents/testPDF_protected.pdf")) {
        parser.parse(stream, handler, metadata, context);
    } catch (EncryptedDocumentException e) {
        ex = true;
    }
    content = handler.toString();

    assertTrue("encryption exception", ex);
    assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE));
    assertEquals("true", metadata.get("pdf:encrypted"));
    //pdf:encrypted, X-Parsed-By and Content-Type
    assertEquals("very little metadata should be parsed", 3, metadata.names().length);
    assertEquals(0, content.length());

    //now test wrong password with non sequential parser
    handler = new BodyContentHandler();
    metadata = new Metadata();
    context = new ParseContext();
    context.set(PasswordProvider.class, new PasswordProvider() {
        public String getPassword(Metadata metadata) {
            return "WRONG!!!!";
        }
    });
    PDFParserConfig config = new PDFParserConfig();
    config.setUseNonSequentialParser(true);
    context.set(PDFParserConfig.class, config);

    ex = false;
    try (InputStream stream = PDFParserTest.class
            .getResourceAsStream("/test-documents/testPDF_protected.pdf")) {
        parser.parse(stream, handler, metadata, context);
    } catch (EncryptedDocumentException e) {
        ex = true;
    }
    content = handler.toString();
    assertTrue("encryption exception", ex);
    assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE));
    assertEquals("true", metadata.get("pdf:encrypted"));

    //pdf:encrypted, X-Parsed-By and Content-Type
    assertEquals("very little metadata should be parsed", 3, metadata.names().length);
    assertEquals(0, content.length());
}

From source file:org.apache.tika.parser.pdf18.PDFParserTest.java

@Test
@Ignore("need to have other parsers available")
public void testEmbeddedPDFEmbeddingAnotherDocument() throws Exception {
    /* format of test doc:
      docx//*from   w  w  w.  ja  va  2 s  . co m*/
    pdf/
       docx
    */
    Parser parser = new AutoDetectParser(); // Should auto-detect!
    ContentHandler handler = new BodyContentHandler();
    Metadata metadata = new Metadata();
    ParseContext context = new ParseContext();
    String content = "";
    InputStream stream = null;
    try {
        context.set(org.apache.tika.parser.Parser.class, parser);
        stream = getResourceAsStream("/test-documents/testPDFEmbeddingAndEmbedded.docx");
        parser.parse(stream, handler, metadata, context);
        content = handler.toString();
    } finally {
        stream.close();
    }
    int outerHaystack = content.indexOf("Outer_haystack");
    int pdfHaystack = content.indexOf("pdf_haystack");
    int needle = content.indexOf("Needle");
    assertTrue(outerHaystack > -1);
    assertTrue(pdfHaystack > -1);
    assertTrue(needle > -1);
    assertTrue(needle > pdfHaystack && pdfHaystack > outerHaystack);

    TrackingHandler tracker = new TrackingHandler();
    TikaInputStream tis;
    ContainerExtractor ex = new ParserContainerExtractor();
    try {
        tis = TikaInputStream.get(getResourceAsStream("/test-documents/testPDFEmbeddingAndEmbedded.docx"));
        ex.extract(tis, ex, tracker);
    } finally {
        stream.close();
    }
    assertEquals(true, ex.isSupported(tis));
    assertEquals(3, tracker.filenames.size());
    assertEquals(3, tracker.mediaTypes.size());
    assertEquals("image1.emf", tracker.filenames.get(0));
    assertNull(tracker.filenames.get(1));
    assertEquals("Test.docx", tracker.filenames.get(2));
    assertEquals(TYPE_EMF, tracker.mediaTypes.get(0));
    assertEquals(TYPE_PDF, tracker.mediaTypes.get(1));
    assertEquals(TYPE_DOCX, tracker.mediaTypes.get(2));
}

From source file:org.apache.tika.parser.pdf18.PDFParserTest.java

/**
 * Verifies the XHTML markup emitted for embedded resources in a PDF: a
 * regular attachment, an inline image, and a document embedded inside an
 * annotation.
 */
@Test //TIKA-1427
public void testEmbeddedFileMarkup() throws Exception {
    // Inline images are extracted, with the unique-only filter switched off.
    PDFParserConfig pdfConfig = new PDFParserConfig();
    pdfConfig.setExtractInlineImages(true);
    pdfConfig.setExtractUniqueInlineImagesOnly(false);

    Parser autoDetect = new AutoDetectParser();
    ParseContext parseContext = new ParseContext();
    parseContext.set(org.apache.tika.parser.Parser.class, autoDetect);
    parseContext.set(org.apache.tika.parser.pdf18.PDFParserConfig.class, pdfConfig);

    ContentHandler xmlHandler = new ToXMLContentHandler();
    Metadata meta = new Metadata();
    InputStream in = null;
    try {
        in = TikaInputStream.get(this.getClass().getResource("/test-documents/testPDF_childAttachments.pdf"));
        autoDetect.parse(in, xmlHandler, meta, parseContext);
    } finally {
        // Close failures are deliberately ignored here.
        IOUtils.closeQuietly(in);
    }

    String markup = xmlHandler.toString();
    // regular attachment
    assertContains("<div class=\"embedded\" id=\"Unit10.doc\" />", markup);
    // inline image
    assertContains("<img src=\"embedded:image1.tif\" alt=\"image1.tif\" />", markup);

    // doc embedded inside an annotation
    String annotationMarkup = getXML("testPDFFileEmbInAnnotation.pdf").xml;
    assertContains("<div class=\"embedded\" id=\"Excel.xlsx\" />", annotationMarkup);
}

From source file:org.apache.tika.parser.pdf18.PDFParserTest.java

/**
 * With the "extract only XFA" option enabled, the XML output must contain
 * the XFA form content and must not contain the regular page text.
 */
@Test
public void testXFAOnly() throws Exception {
    PDFParserConfig xfaOnly = new PDFParserConfig();
    xfaOnly.setIfXFAExtractOnlyXFA(true);
    ParseContext parseContext = new ParseContext();
    parseContext.set(PDFParserConfig.class, xfaOnly);

    Parser autoDetect = new AutoDetectParser();
    Metadata meta = new Metadata();
    ContentHandler xmlHandler = new ToXMLContentHandler(StandardCharsets.UTF_8.name());
    try (InputStream in = getResourceAsStream("/test-documents/testPDF_XFA_govdocs1_258578.pdf")) {
        autoDetect.parse(in, xmlHandler, meta, parseContext);
    }

    String markup = xmlHandler.toString();
    // XFA field content is present and closes the document body.
    assertContains("<li fieldName=\"Room_1\">Room [1]: my_room1</li>", markup);
    assertContains("</xfa_content></body></html>", markup);

    // This phrase must be absent when only XFA is extracted.
    assertNotContained("Mount Rushmore National Memorial", markup);
}

From source file:org.apache.tika.parser.pkg.ZipParserTest.java

/**
 * Parses a zip archive of sample documents through the auto-detecting
 * parser and checks the detected content type and the extracted text.
 */
@Test
public void testZipParsing() throws Exception {
    // File names, each paired with a phrase that presumably comes from
    // that file's content; checked in this order.
    final String[] expectedFragments = {
            "testEXCEL.xls", "Sample Excel Worksheet",
            "testHTML.html", "Test Indexation Html",
            "testOpenOffice2.odt", "This is a sample Open Office document",
            "testPDF.pdf", "Apache Tika",
            "testPPT.ppt", "Sample Powerpoint Slide",
            "testRTF.rtf", "indexation Word",
            "testTXT.txt", "Test d'indexation de Txt",
            "testWORD.doc", "This is a sample Microsoft Word Document",
            "testXML.xml", "Rida Benjelloun" };

    Parser autoDetect = new AutoDetectParser(); // Should auto-detect!
    ContentHandler bodyHandler = new BodyContentHandler();
    Metadata meta = new Metadata();

    try (InputStream zip = ZipParserTest.class.getResourceAsStream("/test-documents/test-documents.zip")) {
        autoDetect.parse(zip, bodyHandler, meta, recursingContext);
    }

    assertEquals("application/zip", meta.get(Metadata.CONTENT_TYPE));
    String extracted = bodyHandler.toString();
    for (String fragment : expectedFragments) {
        assertContains(fragment, extracted);
    }
}

From source file:org.ednovo.gooru.domain.service.resource.ResourceParser.java

/**
 * Extracts the body text and title of the resource at the given location.
 *
 * <p>Locations starting with {@code http://} or {@code https://} are opened
 * as URLs; anything else is treated as a local file path. Extraction is
 * best-effort: if the resource cannot be opened or parsed, whatever text
 * and metadata were collected up to that point are used.
 *
 * @param url URL or file path of the resource to read
 * @return the title (falling back to the first 50 characters of the text
 *         when the metadata has no title) and the whitespace-normalised text
 */
public TitleAndText getTextAndTitle(String url) {
    ContentHandler textHandler = new BodyContentHandler();
    Metadata metadata = new Metadata();
    // try-with-resources so the stream is closed on every path; previously
    // it was never closed.
    try (InputStream in = (url.startsWith("http://") || url.startsWith("https://"))
            ? new URL(url).openConnection().getInputStream()
            : new FileInputStream(url)) {
        parser.parse(in, textHandler, metadata, new ParseContext());
    } catch (Exception ignored) {
        // Deliberately best-effort: a failed fetch or parse (including a
        // null url) must not abort the caller. Fall through with whatever
        // the handler and metadata hold.
    }

    String title = metadata.get(Metadata.TITLE);
    // Collapse every run of whitespace into a single space.
    String text = textHandler.toString().trim().replaceAll("\\s+", " ");
    if (StringUtils.isBlank(title)) {
        title = StringUtils.substring(text, 0, 50);
    }
    return new TitleAndText(title, text);
}

From source file:org.sakaiproject.search.component.adapter.contenthosting.TikaContentDigester.java

/**
 * Extracts the text of a content resource by running it through an
 * auto-detecting parser, passing the resource's declared content type
 * along in the metadata.
 *
 * @param contentResource the resource to digest; must not be null
 * @return the text collected by the body content handler
 * @throws RuntimeException if {@code contentResource} is null, or wrapping
 *         any exception raised while streaming or parsing the content
 */
public String getContent(ContentResource contentResource) {
    log.debug("Digesting with TikaContentDigester");
    if (contentResource == null) {
        throw new RuntimeException("Null contentResource passed to getContent");
    }

    InputStream in = null;
    try {
        in = contentResource.streamContent();

        Metadata meta = new Metadata();
        meta.set(Metadata.CONTENT_TYPE, contentResource.getContentType());

        ContentHandler bodyHandler = new BodyContentHandler();
        Parser autoDetect = new AutoDetectParser();
        autoDetect.parse(in, bodyHandler, meta, new ParseContext());
        return bodyHandler.toString();
    } catch (Exception e) {
        log.debug("Cannot index", e);
        throw new RuntimeException("Failed to read content for indexing ", e);
    } finally {
        // A failure to close is only logged; it never masks the result.
        try {
            if (in != null) {
                in.close();
            }
        } catch (IOException e) {
            log.debug(e);
        }
    }
}

From source file:searcher.CollStat.java

/**
 * Builds a highlighted snippet for a search hit.
 *
 * <p>The document's stored, compressed HTML is decompressed and reduced to
 * plain text, the best-scoring fragments for the query are concatenated,
 * and a dangling unterminated tag at the very end of the result is removed.
 *
 * @param q     the query whose terms are highlighted
 * @param doc   the indexed document holding the compressed HTML field
 * @param docid the document id (not used by this method)
 * @return the concatenated highlighted fragments; empty if none scored above zero
 * @throws Exception if decompression, HTML parsing or highlighting fails
 */
String getSnippet(Query q, Document doc, int docid) throws Exception {
    SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter();
    Highlighter highlighter = new Highlighter(htmlFormatter, new QueryScorer(q));

    // Get the decompressed html
    String html = IndexHtmlToText.decompress(doc.getBinaryValue(WTDocument.WTDOC_FIELD_HTML).bytes);

    // Reduce the HTML to plain text; -1 is passed as the handler's write limit.
    InputStream input = new ByteArrayInputStream(html.getBytes(StandardCharsets.UTF_8));
    ContentHandler handler = new BodyContentHandler(-1);
    Metadata metadata = new Metadata();
    new HtmlParser().parse(input, handler, metadata, new ParseContext());
    String text = handler.toString();

    // Ask for at most 5 fragments and keep only those with a positive score.
    TokenStream tokenStream = analyzer.tokenStream("dummy", new StringReader(text));
    TextFragment[] fragments = highlighter.getBestTextFragments(tokenStream, text, false, 5);
    StringBuilder snippet = new StringBuilder();
    for (TextFragment fragment : fragments) {
        if (fragment != null && fragment.getScore() > 0) {
            snippet.append(fragment.toString());
        }
    }

    // Strip an opening tag left unterminated at the end of the snippet.
    // replaceAll returns the input unchanged when nothing matches, so no
    // separate find() is needed.
    return Pattern.compile("<(\\s*)[a-zA-Z0-9]+[^>]+$").matcher(snippet).replaceAll("");
}