Example usage for org.xml.sax ContentHandler toString

List of usage examples for org.xml.sax ContentHandler toString

Introduction

On this page you can find example usages of org.xml.sax ContentHandler toString.

Prototype

public String toString() 

Source Link

Document

Returns a string representation of the object.

Usage

From source file:edu.ucsd.library.dams.api.DAMSAPIServlet.java

/**
 * Extracts the text of a stored file by running it through an
 * {@code AutoDetectParser} and collecting the output in a
 * {@code BodyContentHandler}.
 *
 * @param objid  object identifier; a null value yields a bad-request error
 * @param cmpid  component identifier; when null it is left out of the name
 *               used in log and error messages
 * @param fileid file identifier; a null value yields a bad-request error
 * @param fs     file store used to check for and open the file
 * @return a map whose "text" entry holds the extracted text after it has been
 *         passed through {@code escapeForXml}, or the map produced by
 *         {@code error(...)} when validation or extraction fails
 */
public Map extractText(String objid, String cmpid, String fileid, FileStore fs) {
    Map info = new LinkedHashMap();
    InputStream in = null;

    // Human-readable name of the file, used only in log and error messages
    String fn = objid;
    if (cmpid != null) {
        fn += "/" + cmpid;
    }
    fn += "/" + fileid;

    try {
        // make sure objid and fileid are specified
        if (objid == null || fileid == null) {
            return error(SC_BAD_REQUEST, "Object and file must be specified", null);
        }

        // make sure file exists
        if (!fs.exists(objid, cmpid, fileid)) {
            return error(SC_NOT_FOUND, "File does not exist", null);
        }

        // extract text
        in = fs.getInputStream(objid, cmpid, fileid);
        // NOTE(review): the cast binds to maxUploadSize before the multiplication,
        // so the product is computed in int arithmetic - confirm it cannot overflow
        ContentHandler contentHandler = new BodyContentHandler((int) maxUploadSize * 4);
        Metadata metadata = new Metadata();
        metadata.set(Metadata.RESOURCE_NAME_KEY, fileid);
        Parser parser = new AutoDetectParser();
        ParseContext parserContext = new ParseContext();
        parser.parse(in, contentHandler, metadata, parserContext);
        info.put("text", escapeForXml(contentHandler.toString()));
    } catch (Exception ex) {
        log.error("Error extracting text from " + fn, ex);
        return error("Error extracting text from " + fn, ex);
    } finally {
        // The stream is opened by this method, so it must be released here on
        // every path; a failure to close must not replace the real result.
        if (in != null) {
            try {
                in.close();
            } catch (Exception closeEx) {
                log.error("Error closing input stream for " + fn, closeEx);
            }
        }
    }
    return info;
}

From source file:org.apache.tika.example.MyFirstTika.java

/**
 * Walks through detection and parsing of a file step by step, printing each
 * intermediate result to standard output, and returns the text collected by a
 * {@code BodyContentHandler}.
 *
 * @param filename   path of the file to examine
 * @param tikaConfig configuration supplying the MIME registry, detector and parser
 * @param metadata   metadata object to populate; the resource name and content
 *                   type are set on it by this method
 * @return the string form of the body content handler after parsing
 * @throws Exception if reading, detection or parsing fails
 */
public static String parseUsingComponents(String filename, TikaConfig tikaConfig, Metadata metadata)
        throws Exception {
    MimeTypes mimeRegistry = tikaConfig.getMimeRepository();

    System.out.println("Examining: [" + filename + "]");

    metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
    System.out.println("The MIME type (based on filename) is: [" + mimeRegistry.detect(null, metadata) + "]");

    // This stream is only needed for the magic-based probe, so close it right away
    // instead of orphaning it when a second stream is opened below.
    try (InputStream stream = TikaInputStream.get(new File(filename))) {
        System.out.println("The MIME type (based on MAGIC) is: [" + mimeRegistry.detect(stream, metadata) + "]");
    }

    // A fresh stream serves detector-based detection and the actual parse
    try (InputStream stream = TikaInputStream.get(new File(filename))) {
        Detector detector = tikaConfig.getDetector();
        System.out.println(
                "The MIME type (based on the Detector interface) is: [" + detector.detect(stream, metadata) + "]");

        LanguageDetector langDetector = new OptimaizeLangDetector().loadModels();
        LanguageResult lang = langDetector.detect(FileUtils.readFileToString(new File(filename), UTF_8));

        System.out.println("The language of this content is: [" + lang.getLanguage() + "]");

        // Get a non-detecting parser that handles all the types it can
        Parser parser = tikaConfig.getParser();
        // Tell it what we think the content is
        MediaType type = detector.detect(stream, metadata);
        metadata.set(Metadata.CONTENT_TYPE, type.toString());
        // Have the file parsed to get the content and metadata
        ContentHandler handler = new BodyContentHandler();
        parser.parse(stream, handler, metadata, new ParseContext());

        return handler.toString();
    }
}

From source file:org.apache.tika.parser.font.FontParsersTest.java

@Test
public void testAdobeFontMetricParsing() throws Exception {
    // No specific parser is selected here; AutoDetectParser should pick one itself
    Parser autoParser = new AutoDetectParser();
    ContentHandler body = new BodyContentHandler();
    Metadata meta = new Metadata();
    ParseContext parseContext = new ParseContext();

    try (TikaInputStream afm = TikaInputStream
            .get(FontParsersTest.class.getResource("/test-documents/testAFM.afm"))) {
        autoParser.parse(afm, body, meta, parseContext);
    }

    // General document properties
    assertEquals("application/x-font-adobe-metric", meta.get(Metadata.CONTENT_TYPE));
    assertEquals("TestFullName", meta.get(TikaCoreProperties.TITLE));
    assertEquals("Fri Jul 15 17:50:51 2011", meta.get(TikaCoreProperties.CREATED));

    // Font naming properties
    assertEquals("TestFontName", meta.get(MET_FONT_NAME));
    assertEquals("TestFullName", meta.get(MET_FONT_FULL_NAME));
    assertEquals("TestSymbol", meta.get(MET_FONT_FAMILY_NAME));

    // Font weight and version
    assertEquals("Medium", meta.get(MET_FONT_WEIGHT));
    assertEquals("001.008", meta.get(MET_FONT_VERSION));

    // The comments of the sample file must show up in the extracted text
    String extracted = body.toString();
    assertContains("Comments", extracted);
    assertContains("This is a comment in a sample file", extracted);
    assertContains("UniqueID 12345", extracted);
}

From source file:org.apache.tika.parser.font.FontParsersTest.java

@Test
public void testTTFParsing() throws Exception {
    // No specific parser is selected here; AutoDetectParser should pick one itself
    Parser autoParser = new AutoDetectParser();
    ContentHandler body = new BodyContentHandler();
    Metadata meta = new Metadata();
    ParseContext parseContext = new ParseContext();

    // The Open Sans font is ASL 2.0 according to
    // http://www.google.com/fonts/specimen/Open+Sans
    // ...despite the copyright in the file's metadata.
    try (TikaInputStream ttf = TikaInputStream
            .get(FontParsersTest.class.getResource("/test-documents/testTrueType3.ttf"))) {
        autoParser.parse(ttf, body, meta, parseContext);
    }

    // General document properties
    assertEquals("application/x-font-ttf", meta.get(Metadata.CONTENT_TYPE));
    assertEquals("Open Sans Bold", meta.get(TikaCoreProperties.TITLE));

    // Creation and modification timestamps
    assertEquals("2010-12-30T11:04:00Z", meta.get(TikaCoreProperties.CREATED));
    assertEquals("2011-05-05T12:37:53Z", meta.get(TikaCoreProperties.MODIFIED));

    // Font naming properties
    assertEquals("Open Sans Bold", meta.get(MET_FONT_NAME));
    assertEquals("Open Sans", meta.get(MET_FONT_FAMILY_NAME));
    assertEquals("Bold", meta.get(MET_FONT_SUB_FAMILY_NAME));
    assertEquals("OpenSans-Bold", meta.get(MET_PS_NAME));

    // Only the first nine characters of these two values are compared
    String copyrightPrefix = meta.get("Copyright").substring(0, 9);
    assertEquals("Digitized", copyrightPrefix);
    String trademarkPrefix = meta.get("Trademark").substring(0, 9);
    assertEquals("Open Sans", trademarkPrefix);

    // These properties are not extracted for this file
    assertEquals(null, meta.get(MET_FONT_FULL_NAME));
    assertEquals(null, meta.get(MET_FONT_WEIGHT));
    assertEquals(null, meta.get(MET_FONT_VERSION));

    // Currently, the parser doesn't extract any contents
    String extracted = body.toString();
    assertEquals("", extracted);
}

From source file:org.apache.tika.parser.fork.ForkParserIntegrationTest.java

/**
 * Simple text parsing/* ww  w.  j av  a 2 s  .c om*/
 */
@Test
public void testForkedTextParsing() throws Exception {
    try (ForkParser parser = new ForkParser(ForkParserIntegrationTest.class.getClassLoader(),
            tika.getParser())) {
        ContentHandler output = new BodyContentHandler();
        InputStream stream = ForkParserIntegrationTest.class.getResourceAsStream("/test-documents/testTXT.txt");
        ParseContext context = new ParseContext();
        parser.parse(stream, output, new Metadata(), context);

        String content = output.toString();
        assertContains("Test d'indexation", content);
        assertContains("http://www.apache.org", content);
    }
}

From source file:org.apache.tika.parser.fork.ForkParserIntegrationTest.java

/**
 * TIKA-832
 */
@Test
public void testAttachingADebuggerOnTheForkedParserShouldWork() throws Exception {
    ParseContext context = new ParseContext();
    context.set(Parser.class, tika.getParser());

    // try-with-resources closes the parser and the resource stream, including
    // when configuring the java command or parsing fails
    try (ForkParser parser = new ForkParser(ForkParserIntegrationTest.class.getClassLoader(),
            tika.getParser());
            InputStream stream = ForkParserIntegrationTest.class
                    .getResourceAsStream("/test-documents/testTXT.txt")) {
        // Launch the forked JVM with the debug agent flags enabled
        parser.setJavaCommand(Arrays.asList("java", "-Xmx32m", "-Xdebug",
                "-Xrunjdwp:transport=dt_socket,address=54321,server=y,suspend=n"));

        ContentHandler body = new BodyContentHandler();
        parser.parse(stream, body, new Metadata(), context);
        String content = body.toString();
        assertContains("Test d'indexation", content);
        assertContains("http://www.apache.org", content);
    }
}

From source file:org.apache.tika.parser.fork.ForkParserIntegrationTest.java

/**
 * TIKA-808 - Ensure that parsing of our test PDFs work under
 * the Fork Parser, to ensure that complex parsing behaves
 */
@Test
public void testForkedPDFParsing() throws Exception {
    // Both the fork parser and the resource stream are released when the block exits
    try (ForkParser parser = new ForkParser(ForkParserIntegrationTest.class.getClassLoader(),
            tika.getParser());
            InputStream stream = ForkParserIntegrationTest.class
                    .getResourceAsStream("/test-documents/testPDF.pdf")) {
        ContentHandler output = new BodyContentHandler();
        ParseContext context = new ParseContext();
        // An EmptyParser is registered as the Parser in the parse context
        context.set(Parser.class, new EmptyParser());
        parser.parse(stream, output, new Metadata(), context);

        String content = output.toString();
        assertContains("Apache Tika", content);
        assertContains("Tika - Content Analysis Toolkit", content);
        assertContains("incubator", content);
        assertContains("Apache Software Foundation", content);
    }
}

From source file:org.apache.tika.parser.fork.ForkParserIntegrationTest.java

/**
 * Parses a zip test document through the fork parser and checks that the
 * extracted text contains "Moby Dick".
 */
@Test
public void testForkedPackageParsing() throws Exception {
    // Both the fork parser and the resource stream are released when the block exits
    try (ForkParser parser = new ForkParser(ForkParserIntegrationTest.class.getClassLoader(),
            tika.getParser());
            InputStream stream = ForkParserIntegrationTest.class
                    .getResourceAsStream("/test-documents/moby.zip")) {
        ContentHandler output = new BodyContentHandler();
        ParseContext context = new ParseContext();
        parser.parse(stream, output, new Metadata(), context);
        assertContains("Moby Dick", output.toString());
    }
}

From source file:org.apache.tika.parser.jdbc.SQLite3ParserTest.java

/**
 * Parses the test database into a BodyContentHandler and checks that tab and
 * newline characters appear in the extracted text.
 */
@Test
public void testSpacesInBodyContentHandler() throws Exception {
    Parser autoParser = new AutoDetectParser();

    Metadata meta = new Metadata();
    meta.set(Metadata.RESOURCE_NAME_KEY, TEST_FILE_NAME);

    ContentHandler body = new BodyContentHandler(-1);

    // Register the same parser in the parse context
    ParseContext parseContext = new ParseContext();
    parseContext.set(Parser.class, autoParser);

    try (InputStream db = getResourceAsStream(TEST_FILE1)) {
        autoParser.parse(db, body, meta, parseContext);
    }

    String extracted = body.toString();
    assertContains("0\t2.3\t2.4\tlorem", extracted);
    assertContains("tempor\n", extracted);
}

From source file:org.apache.tika.parser.jdbc.SQLite3ParserTest.java

/**
 * Parses the test database with an EmptyParser registered in the parse context
 * and checks that only the headers of embedded documents reach the XML output.
 */
@Test
public void testNotAddingEmbeddedParserToParseContext() throws Exception {
    Parser autoParser = new AutoDetectParser();
    ContentHandler xmlHandler = new ToXMLContentHandler();
    Metadata meta = new Metadata();

    // An EmptyParser is registered as the Parser in the parse context
    ParseContext ctx = new ParseContext();
    ctx.set(Parser.class, new EmptyParser());

    try (InputStream db = getResourceAsStream(TEST_FILE1)) {
        meta.set(Metadata.RESOURCE_NAME_KEY, TEST_FILE_NAME);
        autoParser.parse(db, xmlHandler, meta, ctx);
    }

    String output = xmlHandler.toString();

    // just includes headers for embedded documents
    String tableHeader = "<table name=\"my_table1\"><thead><tr>";
    String embeddedDocHeader =
            "<td><span type=\"blob\" column_name=\"BYTES_COL\" row_number=\"0\"><div class=\"package-entry\"><h1>BYTES_COL_0.doc</h1>";
    assertContains(tableHeader, output);
    assertContains(embeddedDocHeader, output);

    // but no other content
    assertNotContained("dog", output);
    assertNotContained("alt=\"image1.png\"", output);
    // second embedded doc's image tag
    assertNotContained("alt=\"A description...\"", output);
}