List of usage examples for org.xml.sax.ContentHandler#toString()
public String toString()
From source file:org.apache.tika.parser.pdf.PDFParserTest.java
/** * PDFs can be "protected" with the default password. This means * they're encrypted (potentially both text and metadata), * but we can decrypt them easily.//from w w w.java 2s. c om */ @Test public void testProtectedPDF() throws Exception { XMLResult r = getXML("testPDF_protected.pdf"); Metadata metadata = r.metadata; assertEquals("true", metadata.get("pdf:encrypted")); assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE)); assertEquals("The Bank of England", metadata.get(TikaCoreProperties.CREATOR)); assertEquals("The Bank of England", metadata.get(Metadata.AUTHOR)); assertEquals("Speeches by Andrew G Haldane", metadata.get(OfficeOpenXMLCore.SUBJECT)); assertEquals("Speeches by Andrew G Haldane", metadata.get(Metadata.SUBJECT)); assertEquals( "Rethinking the Financial Network, Speech by Andrew G Haldane, Executive Director, Financial Stability delivered at the Financial Student Association, Amsterdam on 28 April 2009", metadata.get(TikaCoreProperties.TITLE)); assertContains("RETHINKING THE FINANCIAL NETWORK", r.xml); assertContains("On 16 November 2002", r.xml); assertContains("In many important respects", r.xml); // Try again with an explicit empty password ParseContext context = new ParseContext(); context.set(PasswordProvider.class, new PasswordProvider() { public String getPassword(Metadata metadata) { return ""; } }); r = getXML("testPDF_protected.pdf", context); metadata = r.metadata; assertEquals("true", metadata.get("pdf:encrypted")); assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE)); assertEquals("The Bank of England", metadata.get(TikaCoreProperties.CREATOR)); assertEquals("Speeches by Andrew G Haldane", metadata.get(OfficeOpenXMLCore.SUBJECT)); assertEquals("Speeches by Andrew G Haldane", metadata.get(Metadata.SUBJECT)); assertEquals( "Rethinking the Financial Network, Speech by Andrew G Haldane, Executive Director, Financial Stability delivered at the Financial Student Association, Amsterdam on 28 April 2009", 
metadata.get(TikaCoreProperties.TITLE)); assertContains("RETHINKING THE FINANCIAL NETWORK", r.xml); assertContains("On 16 November 2002", r.xml); assertContains("In many important respects", r.xml); //now test wrong password context.set(PasswordProvider.class, new PasswordProvider() { public String getPassword(Metadata metadata) { return "WRONG!!!!"; } }); boolean ex = false; ContentHandler handler = new BodyContentHandler(); metadata = new Metadata(); try (InputStream stream = PDFParserTest.class .getResourceAsStream("/test-documents/testPDF_protected.pdf")) { Parser parser = new AutoDetectParser(); parser.parse(stream, handler, metadata, context); } catch (EncryptedDocumentException e) { ex = true; } assertTrue("encryption exception", ex); assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE)); assertEquals("true", metadata.get("pdf:encrypted")); //pdf:encrypted, X-Parsed-By and Content-Type assertEquals("very little metadata should be parsed", 3, metadata.names().length); assertEquals(0, handler.toString().length()); }
From source file:org.apache.tika.parser.pdf.PDFParserTest.java
@Test public void testSkipBadPage() throws Exception { //test file comes from govdocs1 //can't use TikaTest shortcuts because of exception Parser p = new AutoDetectParser(); ContentHandler handler = new BodyContentHandler(-1); Metadata m = new Metadata(); ParseContext context = new ParseContext(); boolean tikaEx = false; try (InputStream is = getResourceAsStream("/test-documents/testPDF_bad_page_303226.pdf")) { p.parse(is, handler, m, context); } catch (TikaException e) { tikaEx = true;/*from w w w . j a va2s.c o m*/ } String content = handler.toString(); assertTrue("Should have thrown exception", tikaEx); assertEquals(1, m.getValues(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING).length); assertContains("Unknown dir", m.get(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING)); assertContains("1309.61", content); //now try throwing exception immediately PDFParserConfig config = new PDFParserConfig(); config.setCatchIntermediateIOExceptions(false); context.set(PDFParserConfig.class, config); handler = new BodyContentHandler(-1); m = new Metadata(); tikaEx = false; try (InputStream is = getResourceAsStream("/test-documents/testPDF_bad_page_303226.pdf")) { p.parse(is, handler, m, context); } catch (TikaException e) { tikaEx = true; } content = handler.toString(); assertTrue("Should have thrown exception", tikaEx); assertEquals(0, m.getValues(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING).length); assertNotContained("1309.61", content); }
From source file:org.apache.tika.parser.pdf18.PDFParserTest.java
/** * PDFs can be "protected" with the default password. This means * they're encrypted (potentially both text and metadata), * but we can decrypt them easily.//w w w . ja v a 2s .c o m */ @Test public void testProtectedPDF() throws Exception { Parser parser = new AutoDetectParser(); // Should auto-detect! ContentHandler handler = new BodyContentHandler(); Metadata metadata = new Metadata(); ParseContext context = new ParseContext(); try (InputStream stream = PDFParserTest.class .getResourceAsStream("/test-documents/testPDF_protected.pdf")) { parser.parse(stream, handler, metadata, context); } assertEquals("true", metadata.get("pdf:encrypted")); assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE)); assertEquals("The Bank of England", metadata.get(TikaCoreProperties.CREATOR)); assertEquals("The Bank of England", metadata.get(Metadata.AUTHOR)); assertEquals("Speeches by Andrew G Haldane", metadata.get(OfficeOpenXMLCore.SUBJECT)); assertEquals("Speeches by Andrew G Haldane", metadata.get(Metadata.SUBJECT)); assertEquals( "Rethinking the Financial Network, Speech by Andrew G Haldane, Executive Director, Financial Stability delivered at the Financial Student Association, Amsterdam on 28 April 2009", metadata.get(TikaCoreProperties.TITLE)); String content = handler.toString(); assertContains("RETHINKING THE FINANCIAL NETWORK", content); assertContains("On 16 November 2002", content); assertContains("In many important respects", content); // Try again with an explicit empty password handler = new BodyContentHandler(); metadata = new Metadata(); context = new ParseContext(); context.set(PasswordProvider.class, new PasswordProvider() { public String getPassword(Metadata metadata) { return ""; } }); try (InputStream stream = PDFParserTest.class .getResourceAsStream("/test-documents/testPDF_protected.pdf")) { parser.parse(stream, handler, metadata, context); } assertEquals("true", metadata.get("pdf:encrypted")); assertEquals("application/pdf", 
metadata.get(Metadata.CONTENT_TYPE)); assertEquals("The Bank of England", metadata.get(TikaCoreProperties.CREATOR)); assertEquals("Speeches by Andrew G Haldane", metadata.get(OfficeOpenXMLCore.SUBJECT)); assertEquals("Speeches by Andrew G Haldane", metadata.get(Metadata.SUBJECT)); assertEquals( "Rethinking the Financial Network, Speech by Andrew G Haldane, Executive Director, Financial Stability delivered at the Financial Student Association, Amsterdam on 28 April 2009", metadata.get(TikaCoreProperties.TITLE)); assertContains("RETHINKING THE FINANCIAL NETWORK", content); assertContains("On 16 November 2002", content); assertContains("In many important respects", content); //now test wrong password handler = new BodyContentHandler(); metadata = new Metadata(); context = new ParseContext(); context.set(PasswordProvider.class, new PasswordProvider() { public String getPassword(Metadata metadata) { return "WRONG!!!!"; } }); boolean ex = false; try (InputStream stream = PDFParserTest.class .getResourceAsStream("/test-documents/testPDF_protected.pdf")) { parser.parse(stream, handler, metadata, context); } catch (EncryptedDocumentException e) { ex = true; } content = handler.toString(); assertTrue("encryption exception", ex); assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE)); assertEquals("true", metadata.get("pdf:encrypted")); //pdf:encrypted, X-Parsed-By and Content-Type assertEquals("very little metadata should be parsed", 3, metadata.names().length); assertEquals(0, content.length()); //now test wrong password with non sequential parser handler = new BodyContentHandler(); metadata = new Metadata(); context = new ParseContext(); context.set(PasswordProvider.class, new PasswordProvider() { public String getPassword(Metadata metadata) { return "WRONG!!!!"; } }); PDFParserConfig config = new PDFParserConfig(); config.setUseNonSequentialParser(true); context.set(PDFParserConfig.class, config); ; ex = false; try (InputStream stream = PDFParserTest.class 
.getResourceAsStream("/test-documents/testPDF_protected.pdf")) { parser.parse(stream, handler, metadata, context); } catch (EncryptedDocumentException e) { ex = true; } content = handler.toString(); assertTrue("encryption exception", ex); assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE)); assertEquals("true", metadata.get("pdf:encrypted")); //pdf:encrypted, X-Parsed-By and Content-Type assertEquals("very little metadata should be parsed", 3, metadata.names().length); assertEquals(0, content.length()); }
From source file:org.apache.tika.parser.pdf18.PDFParserTest.java
@Test @Ignore("need to have other parsers available") public void testEmbeddedPDFEmbeddingAnotherDocument() throws Exception { /* format of test doc: docx//*from w w w. ja va 2 s . co m*/ pdf/ docx */ Parser parser = new AutoDetectParser(); // Should auto-detect! ContentHandler handler = new BodyContentHandler(); Metadata metadata = new Metadata(); ParseContext context = new ParseContext(); String content = ""; InputStream stream = null; try { context.set(org.apache.tika.parser.Parser.class, parser); stream = getResourceAsStream("/test-documents/testPDFEmbeddingAndEmbedded.docx"); parser.parse(stream, handler, metadata, context); content = handler.toString(); } finally { stream.close(); } int outerHaystack = content.indexOf("Outer_haystack"); int pdfHaystack = content.indexOf("pdf_haystack"); int needle = content.indexOf("Needle"); assertTrue(outerHaystack > -1); assertTrue(pdfHaystack > -1); assertTrue(needle > -1); assertTrue(needle > pdfHaystack && pdfHaystack > outerHaystack); TrackingHandler tracker = new TrackingHandler(); TikaInputStream tis; ContainerExtractor ex = new ParserContainerExtractor(); try { tis = TikaInputStream.get(getResourceAsStream("/test-documents/testPDFEmbeddingAndEmbedded.docx")); ex.extract(tis, ex, tracker); } finally { stream.close(); } assertEquals(true, ex.isSupported(tis)); assertEquals(3, tracker.filenames.size()); assertEquals(3, tracker.mediaTypes.size()); assertEquals("image1.emf", tracker.filenames.get(0)); assertNull(tracker.filenames.get(1)); assertEquals("Test.docx", tracker.filenames.get(2)); assertEquals(TYPE_EMF, tracker.mediaTypes.get(0)); assertEquals(TYPE_PDF, tracker.mediaTypes.get(1)); assertEquals(TYPE_DOCX, tracker.mediaTypes.get(2)); }
From source file:org.apache.tika.parser.pdf18.PDFParserTest.java
@Test //TIKA-1427 public void testEmbeddedFileMarkup() throws Exception { Parser parser = new AutoDetectParser(); ParseContext context = new ParseContext(); context.set(org.apache.tika.parser.Parser.class, parser); PDFParserConfig config = new PDFParserConfig(); config.setExtractInlineImages(true); config.setExtractUniqueInlineImagesOnly(false); context.set(org.apache.tika.parser.pdf18.PDFParserConfig.class, config); Metadata metadata = new Metadata(); ContentHandler handler = new ToXMLContentHandler(); String path = "/test-documents/testPDF_childAttachments.pdf"; InputStream stream = null;/*from ww w . j a va 2s .c o m*/ try { stream = TikaInputStream.get(this.getClass().getResource(path)); parser.parse(stream, handler, metadata, context); } finally { IOUtils.closeQuietly(stream); } String xml = handler.toString(); //regular attachment assertContains("<div class=\"embedded\" id=\"Unit10.doc\" />", xml); //inline image assertContains("<img src=\"embedded:image1.tif\" alt=\"image1.tif\" />", xml); //doc embedded inside an annotation xml = getXML("testPDFFileEmbInAnnotation.pdf").xml; assertContains("<div class=\"embedded\" id=\"Excel.xlsx\" />", xml); }
From source file:org.apache.tika.parser.pdf18.PDFParserTest.java
/**
 * With {@code ifXFAExtractOnlyXFA} enabled, the output is expected to contain
 * the XFA field markup and the closing {@code xfa_content} element, and must
 * not contain the "Mount Rushmore National Memorial" text.
 */
@Test
public void testXFAOnly() throws Exception {
    PDFParserConfig xfaOnly = new PDFParserConfig();
    xfaOnly.setIfXFAExtractOnlyXFA(true);
    ParseContext parseContext = new ParseContext();
    parseContext.set(PDFParserConfig.class, xfaOnly);

    ContentHandler xmlHandler = new ToXMLContentHandler(StandardCharsets.UTF_8.name());
    Metadata meta = new Metadata();
    Parser autoDetect = new AutoDetectParser();
    try (InputStream in = getResourceAsStream("/test-documents/testPDF_XFA_govdocs1_258578.pdf")) {
        autoDetect.parse(in, xmlHandler, meta, parseContext);
    }

    String markup = xmlHandler.toString();
    assertContains("<li fieldName=\"Room_1\">Room [1]: my_room1</li>", markup);
    assertContains("</xfa_content></body></html>", markup);
    assertNotContained("Mount Rushmore National Memorial", markup);
}
From source file:org.apache.tika.parser.pkg.ZipParserTest.java
@Test public void testZipParsing() throws Exception { Parser parser = new AutoDetectParser(); // Should auto-detect! ContentHandler handler = new BodyContentHandler(); Metadata metadata = new Metadata(); try (InputStream stream = ZipParserTest.class.getResourceAsStream("/test-documents/test-documents.zip")) { parser.parse(stream, handler, metadata, recursingContext); }/*from w w w .jav a 2s. c o m*/ assertEquals("application/zip", metadata.get(Metadata.CONTENT_TYPE)); String content = handler.toString(); assertContains("testEXCEL.xls", content); assertContains("Sample Excel Worksheet", content); assertContains("testHTML.html", content); assertContains("Test Indexation Html", content); assertContains("testOpenOffice2.odt", content); assertContains("This is a sample Open Office document", content); assertContains("testPDF.pdf", content); assertContains("Apache Tika", content); assertContains("testPPT.ppt", content); assertContains("Sample Powerpoint Slide", content); assertContains("testRTF.rtf", content); assertContains("indexation Word", content); assertContains("testTXT.txt", content); assertContains("Test d'indexation de Txt", content); assertContains("testWORD.doc", content); assertContains("This is a sample Microsoft Word Document", content); assertContains("testXML.xml", content); assertContains("Rida Benjelloun", content); }
From source file:org.ednovo.gooru.domain.service.resource.ResourceParser.java
/**
 * Extracts the title and body text of the resource at {@code url}.
 *
 * <p>A value starting with {@code http://} or {@code https://} is fetched over
 * the network; anything else is opened as a local file path. The body text is
 * trimmed and runs of whitespace are collapsed to single spaces. If the
 * metadata carries no title, the first 50 characters of the text are used.
 *
 * <p>Extraction is best effort: if opening or parsing fails, whatever title
 * and text were collected up to that point are returned.
 *
 * @param url an http(s) URL or a local file path
 * @return the extracted title and normalised text
 */
public TitleAndText getTextAndTitle(String url) {
    ContentHandler textHandler = new BodyContentHandler();
    Metadata metadata = new Metadata();
    // The resource is opened inside the try header so the stream is always
    // closed and any failure while opening it (including a null url) is
    // handled by the same catch as a parse failure.
    try (InputStream in = (url.startsWith("http://") || url.startsWith("https://"))
            ? new URL(url).openConnection().getInputStream()
            : new FileInputStream(url)) {
        parser.parse(in, textHandler, metadata, new ParseContext());
    } catch (Exception ignored) {
        // Deliberately best effort: fall through and return whatever the
        // handler and metadata captured before the failure.
    }
    String title = metadata.get(Metadata.TITLE);
    String text = textHandler.toString().trim().replaceAll("\\s+", " ");
    if (StringUtils.isBlank(title)) {
        title = StringUtils.substring(text, 0, 50);
    }
    return new TitleAndText(title, text);
}
From source file:org.sakaiproject.search.component.adapter.contenthosting.TikaContentDigester.java
public String getContent(ContentResource contentResource) { log.debug("Digesting with TikaContentDigester"); if (contentResource == null) { throw new RuntimeException("Null contentResource passed to getContent"); }//from w w w .j a v a 2s . c om InputStream contentStream = null; try { contentStream = contentResource.streamContent(); Metadata metadata = new Metadata(); metadata.set(Metadata.CONTENT_TYPE, contentResource.getContentType()); ContentHandler handler = new BodyContentHandler(); Parser parser = new AutoDetectParser(); parser.parse(contentStream, handler, metadata, new ParseContext()); return handler.toString(); } catch (Exception e) { log.debug("Cannot index", e); throw new RuntimeException("Failed to read content for indexing ", e); } finally { if (contentStream != null) { try { contentStream.close(); } catch (IOException e) { log.debug(e); } } } }
From source file:searcher.CollStat.java
String getSnippet(Query q, Document doc, int docid) throws Exception { StringBuffer buff = new StringBuffer(); SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter(); Highlighter highlighter = new Highlighter(htmlFormatter, new QueryScorer(q)); System.out.println("enterEd"); // Get the decompressed html String html = IndexHtmlToText.decompress(doc.getBinaryValue(WTDocument.WTDOC_FIELD_HTML).bytes); // Generate snippet... InputStream input = new ByteArrayInputStream(html.getBytes(StandardCharsets.UTF_8)); ContentHandler handler = new BodyContentHandler(-1); Metadata metadata = new Metadata(); new HtmlParser().parse(input, handler, metadata, new ParseContext()); String text = handler.toString(); TokenStream tokenStream = analyzer.tokenStream("dummy", new StringReader(text)); TextFragment[] frag = highlighter.getBestTextFragments(tokenStream, text, false, 5); for (int j = 0; j < frag.length; j++) { if ((frag[j] != null) && (frag[j].getScore() > 0)) { buff.append((frag[j].toString())); }/*from w w w . j ava 2s.c om*/ } String snippet = buff.toString(); String modifiedText = snippet; String pattern = "<(\\s*)[a-zA-Z0-9]+[^>]+$"; Pattern r = Pattern.compile(pattern); Matcher m = r.matcher(snippet); if (m.find()) { modifiedText = m.replaceAll(""); } snippet = modifiedText; return snippet; //byte[] encodedBytes = Base64.encodeBase64(snippet.getBytes()); //return new String(encodedBytes); }