Example usage for org.xml.sax ContentHandler toString

Introduction

This page shows example usages of the toString() method of org.xml.sax.ContentHandler.

Prototype

public String toString()

Source Link

Document

Returns a string representation of the object.

Usage

From source file:uib.scratch.WriteIndex.java

/**
 * Rebuilds the Lucene index in {@code INDEX_DIRECTORY} from the regular files
 * found directly inside the {@code documents} directory.
 *
 * <p>Each file is parsed with an {@code AutoDetectParser}; the extracted body
 * text is indexed under {@code "text"}, the file name is stored under
 * {@code "file"}, and every non-empty metadata entry is stored under its
 * lower-cased key. Parsing is best-effort: a file that fails to parse is
 * reported on stderr and indexed with whatever text was extracted before the
 * failure.
 *
 * @param args command-line arguments (unused)
 * @throws java.io.IOException if the {@code documents} directory cannot be
 *         listed, or if reading a file or writing the index fails
 * @throws org.xml.sax.SAXException if the content handler rejects the parsed
 *         content
 */
public static void main(String[] args) throws IOException, SAXException {

    File docs = new File("documents");
    File indexDir = new File(INDEX_DIRECTORY);

    // listFiles() returns null (not an empty array) when the path does not
    // exist or is not a directory; fail with a clear message before the
    // existing index is touched instead of hitting a NullPointerException.
    File[] files = docs.listFiles();
    if (files == null) {
        throw new IOException("Cannot list files in " + docs.getAbsolutePath());
    }

    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
    IndexWriter writer = new IndexWriter(FSDirectory.open(indexDir), analyzer, true,
            IndexWriter.MaxFieldLength.LIMITED);
    boolean committed = false;
    try {
        System.out.println(indexDir);
        writer.deleteAll();

        for (File file : files) {
            // Sub-directories cannot be opened as a stream; skip them rather
            // than aborting the whole run.
            if (!file.isFile()) {
                continue;
            }

            Metadata metadata = new Metadata();
            ContentHandler handler = new BodyContentHandler();
            ParseContext context = new ParseContext();
            Parser parser = new AutoDetectParser();
            InputStream stream = new FileInputStream(file);
            try {
                parser.parse(stream, handler, metadata, context);
            } catch (TikaException e) {
                // Best-effort: keep going, but never hide the failure.
                System.err.println("Failed to parse " + file + "; indexing partial content");
                e.printStackTrace();
            } catch (IOException e) {
                System.err.println("Failed to read " + file + "; indexing partial content");
                e.printStackTrace();
            } finally {
                stream.close();
            }

            String text = handler.toString();
            String fileName = file.getName();

            Document doc = new Document();
            doc.add(new Field("file", fileName, Store.YES, Index.NO));

            for (String key : metadata.names()) {
                String name = key.toLowerCase();
                String value = metadata.get(key);

                if (StringUtils.isEmpty(value)) {
                    continue;
                }

                if ("keywords".equalsIgnoreCase(key)) {
                    // One un-analyzed field per keyword so each can be matched exactly.
                    for (String keyword : value.split(",?(\\s+)")) {
                        doc.add(new Field(name, keyword, Store.YES, Index.NOT_ANALYZED));
                    }
                } else if ("title".equalsIgnoreCase(key)) {
                    doc.add(new Field(name, value, Store.YES, Index.ANALYZED));
                } else {
                    // Store the metadata value itself (the original stored the
                    // file name under every metadata key).
                    doc.add(new Field(name, value, Store.YES, Index.NOT_ANALYZED));
                }
            }
            doc.add(new Field("text", text, Store.NO, Index.ANALYZED));
            writer.addDocument(doc);
        }

        writer.commit();
        committed = true;

        System.out.println(writer.maxDoc() + " documents written");
    } finally {
        // Always release the writer. On failure, discard uncommitted changes
        // so a half-built index is not written out.
        if (committed) {
            writer.close();
        } else {
            writer.rollback();
        }
    }
}

From source file:us.colloquy.sandbox.TestExtractor.java

/**
 * Parses a fixed XHTML sample with {@code XMLParser} and prints the metadata
 * title followed by the extracted body text, one line per output line.
 */
@Test
public void getContent() throws TikaException, SAXException, IOException {

    byte[] rawXhtml = FileUtils.readFileToByteArray(new File("temp/OEBPS/Text/0001_1006_2002.xhtml"));
    InputStream source = new ByteArrayInputStream(rawXhtml);

    // Body text and links go to separate handlers; both are handed to the
    // parser through a single TeeContentHandler.
    ContentHandler bodyText = new BodyContentHandler();
    LinkContentHandler linkCollector = new LinkContentHandler();
    ContentHandler combined = new TeeContentHandler(linkCollector, bodyText);

    Metadata metadata = new Metadata();
    Parser xmlParser = new XMLParser();
    xmlParser.parse(source, combined, metadata, new ParseContext());

    // Helper defined elsewhere in this class.
    listAvailableMetaDataFields(metadata);

    System.out.println("Title: " + metadata.get(Metadata.TITLE));

    // The collected links are not printed; only the body text is.
    for (String bodyLine : bodyText.toString().split("\n")) {
        System.out.println(bodyLine);
    }
}

From source file:us.colloquy.sandbox.TestExtractor.java

/**
 * Parses a fixed XHTML sample with {@code AutoDetectParser}, passing the SAX
 * events through an XPath-based matcher into a {@code ToXMLContentHandler},
 * and prints the handler's string form.
 */
@Test
public void parseOnePartToHTML() throws IOException, SAXException, TikaException {
    // NOTE(review): an earlier comment said this keeps only
    // html -> body -> div (class=header), but the expression passed is just
    // "*" - confirm what this matcher actually selects.
    Matcher contentMatcher = new XPathParser("", XHTMLContentHandler.XHTML).parse("*");
    ContentHandler xmlSink = new MatchingContentHandler(new ToXMLContentHandler(), contentMatcher);

    AutoDetectParser detectingParser = new AutoDetectParser();
    Metadata parsedMetadata = new Metadata();

    File sample = new File("temp/OEBPS/Text/0001_1006_2002.xhtml");

    try (InputStream in = new ByteArrayInputStream(FileUtils.readFileToByteArray(sample))) {
        detectingParser.parse(in, xmlSink, parsedMetadata);
        System.out.println(xmlSink.toString());
    }
}