Example usage for org.xml.sax ContentHandler toString

Introduction

On this page you can find example usages of the toString() method of org.xml.sax.ContentHandler.

Prototype

public String toString()

Source Link

Document

Returns a string representation of the object.

Usage

From source file:edu.ucsd.library.dams.api.DAMSAPIServlet.java

/**
 * Extracts the text of a stored file with Tika's auto-detecting parser.
 *
 * @param objid  object identifier; must not be null
 * @param cmpid  component identifier; when null it is left out of the
 *               path used in log and error messages
 * @param fileid file identifier; must not be null
 * @param fs     file store that is checked for the file and read from
 * @return a map holding the XML-escaped text under the key {@code "text"},
 *         or whatever {@code error(...)} returns when an argument is
 *         missing, the file does not exist, or extraction fails
 */
public Map extractText(String objid, String cmpid, String fileid, FileStore fs) {
    Map info = new LinkedHashMap();
    InputStream in = null;

    // Path-like label, used only in log and error messages.
    String fn = objid;
    if (cmpid != null) {
        fn += "/" + cmpid;
    }
    fn += "/" + fileid;

    try {
        // make sure objid and fileid are specified
        if (objid == null || fileid == null) {
            return error(SC_BAD_REQUEST, "Object and file must be specified", null);
        }

        // make sure file exists
        if (!fs.exists(objid, cmpid, fileid)) {
            return error(SC_NOT_FOUND, "File does not exist", null);
        }

        // extract text
        in = fs.getInputStream(objid, cmpid, fileid);
        // NOTE(review): the cast binds tighter than the multiplication, so
        // maxUploadSize is narrowed to int before being multiplied by 4 --
        // confirm this cannot overflow for the configured upload limit.
        ContentHandler contentHandler = new BodyContentHandler((int) maxUploadSize * 4);
        Metadata metadata = new Metadata();
        metadata.set(Metadata.RESOURCE_NAME_KEY, fileid);
        Parser parser = new AutoDetectParser();
        ParseContext parserContext = new ParseContext();
        parser.parse(in, contentHandler, metadata, parserContext);
        info.put("text", escapeForXml(contentHandler.toString()));
    } catch (Exception ex) {
        log.error("Error extracting text from " + fn, ex);
        return error("Error extracting text from " + fn, ex);
    } finally {
        // The stream was previously never closed; release it on every exit
        // path (success, early return, or failure).
        if (in != null) {
            try {
                in.close();
            } catch (Exception closeEx) {
                // A failed close must not replace the method's result.
                log.error("Error closing stream for " + fn, closeEx);
            }
        }
    }
    return info;
}

From source file:org.apache.tika.example.MyFirstTika.java

/**
 * Walks through Tika's individual components for one file: prints the MIME
 * type as detected from the file name, from the MIME registry with the
 * stream, and from the configured Detector, prints the detected language,
 * then parses the file and returns the body text.
 *
 * @param filename   path of the file to examine
 * @param tikaConfig configuration supplying the MIME registry, detector and parser
 * @param metadata   metadata object; the resource name and content type are set on it
 * @return the string form of the body content handler after parsing
 * @throws Exception propagated from the underlying file, detection or parsing calls
 */
public static String parseUsingComponents(String filename, TikaConfig tikaConfig, Metadata metadata)
        throws Exception {
    MimeTypes mimeRegistry = tikaConfig.getMimeRepository();

    System.out.println("Examining: [" + filename + "]");

    metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
    System.out.println("The MIME type (based on filename) is: [" + mimeRegistry.detect(null, metadata) + "]");

    // Each stream gets its own try-with-resources; previously the first
    // stream was orphaned on reassignment and the second was never closed.
    try (InputStream stream = TikaInputStream.get(new File(filename))) {
        System.out.println("The MIME type (based on MAGIC) is: [" + mimeRegistry.detect(stream, metadata) + "]");
    }

    try (InputStream stream = TikaInputStream.get(new File(filename))) {
        Detector detector = tikaConfig.getDetector();
        System.out.println(
                "The MIME type (based on the Detector interface) is: [" + detector.detect(stream, metadata) + "]");

        LanguageDetector langDetector = new OptimaizeLangDetector().loadModels();
        LanguageResult lang = langDetector.detect(FileUtils.readFileToString(new File(filename), UTF_8));

        System.out.println("The language of this content is: [" + lang.getLanguage() + "]");

        // Get a non-detecting parser that handles all the types it can
        Parser parser = tikaConfig.getParser();
        // Tell it what we think the content is
        MediaType type = detector.detect(stream, metadata);
        metadata.set(Metadata.CONTENT_TYPE, type.toString());
        // Have the file parsed to get the content and metadata
        ContentHandler handler = new BodyContentHandler();
        parser.parse(stream, handler, metadata, new ParseContext());

        return handler.toString();
    }
}

From source file:org.apache.tika.parser.font.FontParsersTest.java

/**
 * Parses the sample Adobe Font Metrics file through the auto-detecting
 * parser and checks the reported content type, the font metadata and the
 * comment text found in the extracted body.
 */
@Test
public void testAdobeFontMetricParsing() throws Exception {
    // No parser is chosen explicitly: detection has to pick the right one.
    Parser autoParser = new AutoDetectParser();
    ContentHandler bodyHandler = new BodyContentHandler();
    Metadata afmMetadata = new Metadata();
    ParseContext parseContext = new ParseContext();

    try (TikaInputStream afmStream = TikaInputStream
            .get(FontParsersTest.class.getResource("/test-documents/testAFM.afm"))) {
        autoParser.parse(afmStream, bodyHandler, afmMetadata, parseContext);
    }

    // General document properties
    assertEquals("application/x-font-adobe-metric", afmMetadata.get(Metadata.CONTENT_TYPE));
    assertEquals("TestFullName", afmMetadata.get(TikaCoreProperties.TITLE));
    assertEquals("Fri Jul 15 17:50:51 2011", afmMetadata.get(TikaCoreProperties.CREATED));

    // Font naming properties
    assertEquals("TestFontName", afmMetadata.get(MET_FONT_NAME));
    assertEquals("TestFullName", afmMetadata.get(MET_FONT_FULL_NAME));
    assertEquals("TestSymbol", afmMetadata.get(MET_FONT_FAMILY_NAME));

    // Weight and version
    assertEquals("Medium", afmMetadata.get(MET_FONT_WEIGHT));
    assertEquals("001.008", afmMetadata.get(MET_FONT_VERSION));

    // The comments of the sample file must show up in the body text
    String extractedText = bodyHandler.toString();
    assertContains("Comments", extractedText);
    assertContains("This is a comment in a sample file", extractedText);
    assertContains("UniqueID 12345", extractedText);
}

From source file:org.apache.tika.parser.font.FontParsersTest.java

/**
 * Parses the sample TrueType font through the auto-detecting parser and
 * checks the reported content type and font metadata, the properties that
 * are expected to be absent, and that the body text is empty.
 */
@Test
public void testTTFParsing() throws Exception {
    // No parser is chosen explicitly: detection has to pick the right one.
    Parser autoParser = new AutoDetectParser();
    ContentHandler bodyHandler = new BodyContentHandler();
    Metadata ttfMetadata = new Metadata();
    ParseContext parseContext = new ParseContext();

    // The Open Sans font is ASL 2.0 according to
    // http://www.google.com/fonts/specimen/Open+Sans
    // ...despite the copyright in the file's metadata.
    try (TikaInputStream ttfStream = TikaInputStream
            .get(FontParsersTest.class.getResource("/test-documents/testTrueType3.ttf"))) {
        autoParser.parse(ttfStream, bodyHandler, ttfMetadata, parseContext);
    }

    // General document properties
    assertEquals("application/x-font-ttf", ttfMetadata.get(Metadata.CONTENT_TYPE));
    assertEquals("Open Sans Bold", ttfMetadata.get(TikaCoreProperties.TITLE));

    // Creation and modification timestamps
    assertEquals("2010-12-30T11:04:00Z", ttfMetadata.get(TikaCoreProperties.CREATED));
    assertEquals("2011-05-05T12:37:53Z", ttfMetadata.get(TikaCoreProperties.MODIFIED));

    // Font naming properties
    assertEquals("Open Sans Bold", ttfMetadata.get(MET_FONT_NAME));
    assertEquals("Open Sans", ttfMetadata.get(MET_FONT_FAMILY_NAME));
    assertEquals("Bold", ttfMetadata.get(MET_FONT_SUB_FAMILY_NAME));
    assertEquals("OpenSans-Bold", ttfMetadata.get(MET_PS_NAME));

    // Only the first nine characters of these values are compared
    assertEquals("Digitized", ttfMetadata.get("Copyright").substring(0, 9));
    assertEquals("Open Sans", ttfMetadata.get("Trademark").substring(0, 9));

    // Properties that are not extracted for this font
    assertEquals(null, ttfMetadata.get(MET_FONT_FULL_NAME));
    assertEquals(null, ttfMetadata.get(MET_FONT_WEIGHT));
    assertEquals(null, ttfMetadata.get(MET_FONT_VERSION));

    // Currently, the parser doesn't extract any contents
    String extractedText = bodyHandler.toString();
    assertEquals("", extractedText);
}

From source file:org.apache.tika.parser.fork.ForkParserIntegrationTest.java

/**
 * Simple text parsing/* ww  w.  j av  a 2 s  .c om*/
 */
@Test
public void testForkedTextParsing() throws Exception {
    try (ForkParser parser = new ForkParser(ForkParserIntegrationTest.class.getClassLoader(),
            tika.getParser())) {
        ContentHandler output = new BodyContentHandler();
        InputStream stream = ForkParserIntegrationTest.class.getResourceAsStream("/test-documents/testTXT.txt");
        ParseContext context = new ParseContext();
        parser.parse(stream, output, new Metadata(), context);

        String content = output.toString();
        assertContains("Test d'indexation", content);
        assertContains("http://www.apache.org", content);
    }
}

From source file:org.apache.tika.parser.fork.ForkParserIntegrationTest.java

/**
 * TIKA-832: starts the forked parser with a custom java command that
 * includes JDWP debug options and checks that text parsing still works.
 */
@Test
public void testAttachingADebuggerOnTheForkedParserShouldWork() throws Exception {
    ParseContext context = new ParseContext();
    context.set(Parser.class, tika.getParser());

    // Parser and stream are both resources: the parser is now closed even
    // if setJavaCommand fails, and the stream is no longer left open.
    try (ForkParser parser = new ForkParser(ForkParserIntegrationTest.class.getClassLoader(),
            tika.getParser());
            InputStream stream = ForkParserIntegrationTest.class
                    .getResourceAsStream("/test-documents/testTXT.txt")) {
        parser.setJavaCommand(Arrays.asList("java", "-Xmx32m", "-Xdebug",
                "-Xrunjdwp:transport=dt_socket,address=54321,server=y,suspend=n"));

        ContentHandler body = new BodyContentHandler();
        parser.parse(stream, body, new Metadata(), context);

        String content = body.toString();
        assertContains("Test d'indexation", content);
        assertContains("http://www.apache.org", content);
    }
}

From source file:org.apache.tika.parser.fork.ForkParserIntegrationTest.java

/**
 * TIKA-808 - Ensure that parsing of our test PDFs work under
 * the Fork Parser, to ensure that complex parsing behaves
 */
@Test
public void testForkedPDFParsing() throws Exception {
    // The stream is declared as a resource so it is closed after parsing;
    // it was previously left open.
    try (ForkParser parser = new ForkParser(ForkParserIntegrationTest.class.getClassLoader(),
            tika.getParser());
            InputStream stream = ForkParserIntegrationTest.class
                    .getResourceAsStream("/test-documents/testPDF.pdf")) {
        ContentHandler output = new BodyContentHandler();
        ParseContext context = new ParseContext();
        // Register an EmptyParser under Parser.class in the parse context.
        context.set(Parser.class, new EmptyParser());
        parser.parse(stream, output, new Metadata(), context);

        String content = output.toString();
        assertContains("Apache Tika", content);
        assertContains("Tika - Content Analysis Toolkit", content);
        assertContains("incubator", content);
        assertContains("Apache Software Foundation", content);
    }
}

From source file:org.apache.tika.parser.fork.ForkParserIntegrationTest.java

/**
 * Runs the zip test archive through a ForkParser and checks that the
 * extracted body text contains "Moby Dick".
 */
@Test
public void testForkedPackageParsing() throws Exception {
    // The stream is declared as a resource so it is closed after parsing;
    // it was previously left open.
    try (ForkParser parser = new ForkParser(ForkParserIntegrationTest.class.getClassLoader(),
            tika.getParser());
            InputStream stream = ForkParserIntegrationTest.class
                    .getResourceAsStream("/test-documents/moby.zip")) {
        ContentHandler output = new BodyContentHandler();
        ParseContext context = new ParseContext();
        parser.parse(stream, output, new Metadata(), context);
        assertContains("Moby Dick", output.toString());
    }
}

From source file:org.apache.tika.parser.jdbc.SQLite3ParserTest.java

/**
 * Parses the first test file into a BodyContentHandler created with -1
 * and checks that the tab separators and the trailing newline are present
 * in the extracted text.
 */
@Test
public void testSpacesInBodyContentHandler() throws Exception {
    Parser autoParser = new AutoDetectParser();

    Metadata fileMetadata = new Metadata();
    fileMetadata.set(Metadata.RESOURCE_NAME_KEY, TEST_FILE_NAME);

    ContentHandler bodyHandler = new BodyContentHandler(-1);

    // Register the same parser under Parser.class in the parse context.
    ParseContext parseContext = new ParseContext();
    parseContext.set(Parser.class, autoParser);

    try (InputStream dbStream = getResourceAsStream(TEST_FILE1)) {
        autoParser.parse(dbStream, bodyHandler, fileMetadata, parseContext);
    }

    String extractedText = bodyHandler.toString();
    assertContains("0\t2.3\t2.4\tlorem", extractedText);
    assertContains("tempor\n", extractedText);
}

From source file:org.apache.tika.parser.jdbc.SQLite3ParserTest.java

/**
 * Parses the first test file to XML with an EmptyParser registered under
 * Parser.class in the context, then checks that the table and embedded
 * document headers are present and the embedded content is not.
 */
@Test
public void testNotAddingEmbeddedParserToParseContext() throws Exception {
    Parser autoParser = new AutoDetectParser();
    ContentHandler xmlHandler = new ToXMLContentHandler();
    Metadata fileMetadata = new Metadata();

    ParseContext emptyParserContext = new ParseContext();
    emptyParserContext.set(Parser.class, new EmptyParser());

    try (InputStream dbStream = getResourceAsStream(TEST_FILE1)) {
        fileMetadata.set(Metadata.RESOURCE_NAME_KEY, TEST_FILE_NAME);
        autoParser.parse(dbStream, xmlHandler, fileMetadata, emptyParserContext);
    }

    String renderedXml = xmlHandler.toString();

    // Only the headers of embedded documents are expected...
    assertContains("<table name=\"my_table1\"><thead><tr>", renderedXml);
    assertContains(
            "<td><span type=\"blob\" column_name=\"BYTES_COL\" row_number=\"0\"><div class=\"package-entry\"><h1>BYTES_COL_0.doc</h1>",
            renderedXml);

    // ...and none of their content
    assertNotContained("dog", renderedXml);
    assertNotContained("alt=\"image1.png\"", renderedXml);
    // image tag of the second embedded document
    assertNotContained("alt=\"A description...\"", renderedXml);
}