List of usage examples for org.xml.sax ContentHandler toString
public String toString()
From source file:edu.ucsd.library.dams.api.DAMSAPIServlet.java
public Map extractText(String objid, String cmpid, String fileid, FileStore fs) { Map info = new LinkedHashMap(); InputStream in = null;/*from w ww . j av a2 s . co m*/ String fn = objid; if (cmpid != null) { fn += "/" + cmpid; } fn += "/" + fileid; try { // make sure objid and fileid are specified if (objid == null || fileid == null) { return error(SC_BAD_REQUEST, "Object and file must be specified", null); } // make sure file exists if (!fs.exists(objid, cmpid, fileid)) { return error(SC_NOT_FOUND, "File does not exist", null); } // extract text in = fs.getInputStream(objid, cmpid, fileid); //String text = PDFParser.getContent( in, objid ); ContentHandler contentHandler = new BodyContentHandler((int) maxUploadSize * 4); Metadata metadata = new Metadata(); metadata.set(Metadata.RESOURCE_NAME_KEY, fileid); Parser parser = new AutoDetectParser(); ParseContext parserContext = new ParseContext(); parser.parse(in, contentHandler, metadata, parserContext); info.put("text", escapeForXml(contentHandler.toString())); } catch (Exception ex) { log.error("Error extracting text from " + fn, ex); return error("Error extracting text from " + fn, ex); } return info; }
From source file:org.apache.tika.example.MyFirstTika.java
public static String parseUsingComponents(String filename, TikaConfig tikaConfig, Metadata metadata) throws Exception { MimeTypes mimeRegistry = tikaConfig.getMimeRepository(); System.out.println("Examining: [" + filename + "]"); metadata.set(Metadata.RESOURCE_NAME_KEY, filename); System.out.println("The MIME type (based on filename) is: [" + mimeRegistry.detect(null, metadata) + "]"); InputStream stream = TikaInputStream.get(new File(filename)); System.out.println("The MIME type (based on MAGIC) is: [" + mimeRegistry.detect(stream, metadata) + "]"); stream = TikaInputStream.get(new File(filename)); Detector detector = tikaConfig.getDetector(); System.out.println(//from w w w. j a v a2 s. c om "The MIME type (based on the Detector interface) is: [" + detector.detect(stream, metadata) + "]"); LanguageDetector langDetector = new OptimaizeLangDetector().loadModels(); LanguageResult lang = langDetector.detect(FileUtils.readFileToString(new File(filename), UTF_8)); System.out.println("The language of this content is: [" + lang.getLanguage() + "]"); // Get a non-detecting parser that handles all the types it can Parser parser = tikaConfig.getParser(); // Tell it what we think the content is MediaType type = detector.detect(stream, metadata); metadata.set(Metadata.CONTENT_TYPE, type.toString()); // Have the file parsed to get the content and metadata ContentHandler handler = new BodyContentHandler(); parser.parse(stream, handler, metadata, new ParseContext()); return handler.toString(); }
From source file:org.apache.tika.parser.font.FontParsersTest.java
@Test public void testAdobeFontMetricParsing() throws Exception { Parser parser = new AutoDetectParser(); // Should auto-detect! ContentHandler handler = new BodyContentHandler(); Metadata metadata = new Metadata(); ParseContext context = new ParseContext(); try (TikaInputStream stream = TikaInputStream .get(FontParsersTest.class.getResource("/test-documents/testAFM.afm"))) { parser.parse(stream, handler, metadata, context); }/*w w w. ja va2 s . c om*/ assertEquals("application/x-font-adobe-metric", metadata.get(Metadata.CONTENT_TYPE)); assertEquals("TestFullName", metadata.get(TikaCoreProperties.TITLE)); assertEquals("Fri Jul 15 17:50:51 2011", metadata.get(TikaCoreProperties.CREATED)); assertEquals("TestFontName", metadata.get(MET_FONT_NAME)); assertEquals("TestFullName", metadata.get(MET_FONT_FULL_NAME)); assertEquals("TestSymbol", metadata.get(MET_FONT_FAMILY_NAME)); assertEquals("Medium", metadata.get(MET_FONT_WEIGHT)); assertEquals("001.008", metadata.get(MET_FONT_VERSION)); String content = handler.toString(); // Test that the comments got extracted assertContains("Comments", content); assertContains("This is a comment in a sample file", content); assertContains("UniqueID 12345", content); }
From source file:org.apache.tika.parser.font.FontParsersTest.java
@Test public void testTTFParsing() throws Exception { Parser parser = new AutoDetectParser(); // Should auto-detect! ContentHandler handler = new BodyContentHandler(); Metadata metadata = new Metadata(); ParseContext context = new ParseContext(); //Open Sans font is ASL 2.0 according to //http://www.google.com/fonts/specimen/Open+Sans //...despite the copyright in the file's metadata. try (TikaInputStream stream = TikaInputStream .get(FontParsersTest.class.getResource("/test-documents/testTrueType3.ttf"))) { parser.parse(stream, handler, metadata, context); }//w ww . j a v a 2s . c o m assertEquals("application/x-font-ttf", metadata.get(Metadata.CONTENT_TYPE)); assertEquals("Open Sans Bold", metadata.get(TikaCoreProperties.TITLE)); assertEquals("2010-12-30T11:04:00Z", metadata.get(TikaCoreProperties.CREATED)); assertEquals("2011-05-05T12:37:53Z", metadata.get(TikaCoreProperties.MODIFIED)); assertEquals("Open Sans Bold", metadata.get(MET_FONT_NAME)); assertEquals("Open Sans", metadata.get(MET_FONT_FAMILY_NAME)); assertEquals("Bold", metadata.get(MET_FONT_SUB_FAMILY_NAME)); assertEquals("OpenSans-Bold", metadata.get(MET_PS_NAME)); assertEquals("Digitized", metadata.get("Copyright").substring(0, 9)); assertEquals("Open Sans", metadata.get("Trademark").substring(0, 9)); // Not extracted assertEquals(null, metadata.get(MET_FONT_FULL_NAME)); assertEquals(null, metadata.get(MET_FONT_WEIGHT)); assertEquals(null, metadata.get(MET_FONT_VERSION)); // Currently, the parser doesn't extract any contents String content = handler.toString(); assertEquals("", content); }
From source file:org.apache.tika.parser.fork.ForkParserIntegrationTest.java
/** * Simple text parsing/* ww w. j av a 2 s .c om*/ */ @Test public void testForkedTextParsing() throws Exception { try (ForkParser parser = new ForkParser(ForkParserIntegrationTest.class.getClassLoader(), tika.getParser())) { ContentHandler output = new BodyContentHandler(); InputStream stream = ForkParserIntegrationTest.class.getResourceAsStream("/test-documents/testTXT.txt"); ParseContext context = new ParseContext(); parser.parse(stream, output, new Metadata(), context); String content = output.toString(); assertContains("Test d'indexation", content); assertContains("http://www.apache.org", content); } }
From source file:org.apache.tika.parser.fork.ForkParserIntegrationTest.java
/**
 * TIKA-832: the forked parser must still work when its java command carries
 * debugger (JDWP) options.
 *
 * <p>Parser and resource stream are both held in try-with-resources, in line
 * with the other tests in this class, so each is closed on every exit path.
 */
@Test
public void testAttachingADebuggerOnTheForkedParserShouldWork() throws Exception {
    ParseContext context = new ParseContext();
    context.set(Parser.class, tika.getParser());

    try (ForkParser parser = new ForkParser(ForkParserIntegrationTest.class.getClassLoader(), tika.getParser());
            InputStream stream = ForkParserIntegrationTest.class
                    .getResourceAsStream("/test-documents/testTXT.txt")) {
        // Must be configured before the first parse call.
        parser.setJavaCommand(Arrays.asList("java", "-Xmx32m", "-Xdebug",
                "-Xrunjdwp:transport=dt_socket,address=54321,server=y,suspend=n"));

        ContentHandler body = new BodyContentHandler();
        parser.parse(stream, body, new Metadata(), context);

        String content = body.toString();
        assertContains("Test d'indexation", content);
        assertContains("http://www.apache.org", content);
    }
}
From source file:org.apache.tika.parser.fork.ForkParserIntegrationTest.java
/**
 * TIKA-808 - Ensure that parsing of our test PDFs work under
 * the Fork Parser, to ensure that complex parsing behaves
 *
 * <p>The resource stream is managed by the same try-with-resources as the
 * parser so it is closed even when parsing or an assertion fails.
 */
@Test
public void testForkedPDFParsing() throws Exception {
    try (ForkParser parser = new ForkParser(ForkParserIntegrationTest.class.getClassLoader(), tika.getParser());
            InputStream stream = ForkParserIntegrationTest.class
                    .getResourceAsStream("/test-documents/testPDF.pdf")) {
        ContentHandler output = new BodyContentHandler();
        ParseContext context = new ParseContext();
        // An EmptyParser is registered as the Parser in the context for this run.
        context.set(Parser.class, new EmptyParser());
        parser.parse(stream, output, new Metadata(), context);

        String content = output.toString();
        assertContains("Apache Tika", content);
        assertContains("Tika - Content Analysis Toolkit", content);
        assertContains("incubator", content);
        assertContains("Apache Software Foundation", content);
    }
}
From source file:org.apache.tika.parser.fork.ForkParserIntegrationTest.java
@Test public void testForkedPackageParsing() throws Exception { try (ForkParser parser = new ForkParser(ForkParserIntegrationTest.class.getClassLoader(), tika.getParser())) {// w ww .ja v a 2s.co m ContentHandler output = new BodyContentHandler(); InputStream stream = ForkParserIntegrationTest.class.getResourceAsStream("/test-documents/moby.zip"); ParseContext context = new ParseContext(); parser.parse(stream, output, new Metadata(), context); assertContains("Moby Dick", output.toString()); } }
From source file:org.apache.tika.parser.jdbc.SQLite3ParserTest.java
@Test public void testSpacesInBodyContentHandler() throws Exception { Parser p = new AutoDetectParser(); Metadata metadata = new Metadata(); metadata.set(Metadata.RESOURCE_NAME_KEY, TEST_FILE_NAME); ContentHandler handler = new BodyContentHandler(-1); ParseContext ctx = new ParseContext(); ctx.set(Parser.class, p); try (InputStream stream = getResourceAsStream(TEST_FILE1)) { p.parse(stream, handler, metadata, ctx); }// w ww . j ava2s . c o m String s = handler.toString(); assertContains("0\t2.3\t2.4\tlorem", s); assertContains("tempor\n", s); }
From source file:org.apache.tika.parser.jdbc.SQLite3ParserTest.java
@Test public void testNotAddingEmbeddedParserToParseContext() throws Exception { Parser p = new AutoDetectParser(); ContentHandler handler = new ToXMLContentHandler(); Metadata metadata = new Metadata(); ParseContext parseContext = new ParseContext(); parseContext.set(Parser.class, new EmptyParser()); try (InputStream is = getResourceAsStream(TEST_FILE1)) { metadata.set(Metadata.RESOURCE_NAME_KEY, TEST_FILE_NAME); p.parse(is, handler, metadata, parseContext); }//from w ww . j a va 2s .co m String xml = handler.toString(); //just includes headers for embedded documents assertContains("<table name=\"my_table1\"><thead><tr>", xml); assertContains( "<td><span type=\"blob\" column_name=\"BYTES_COL\" row_number=\"0\"><div class=\"package-entry\"><h1>BYTES_COL_0.doc</h1>", xml); //but no other content assertNotContained("dog", xml); assertNotContained("alt=\"image1.png\"", xml); //second embedded doc's image tag assertNotContained("alt=\"A description...\"", xml); }