Example usage for org.xml.sax ContentHandler toString

List of usage examples for org.xml.sax ContentHandler toString

Introduction

On this page you can find example usages of org.xml.sax.ContentHandler.toString().

Prototype

public String toString() 

Source Link

Document

Returns a string representation of the object.

Usage

From source file:uib.scratch.WriteIndex.java

/**
 * Rebuilds the Lucene index located at {@code INDEX_DIRECTORY} from the regular files
 * found directly inside the {@code documents} directory.
 *
 * <p>For each file the body text and metadata are extracted with an
 * {@code AutoDetectParser}. The file name is stored in the {@code file} field, each
 * non-empty metadata entry is added under its lower-cased name, and the extracted text
 * is added to the {@code text} field. A file that fails to parse is reported on
 * {@code System.err} and is still indexed with whatever text was extracted before the
 * failure.
 *
 * @param args command-line arguments (unused)
 * @throws java.io.IOException if the documents directory cannot be listed, or a file
 *         or the index cannot be opened, written or closed
 * @throws org.xml.sax.SAXException if the content handler reports a SAX error while parsing
 */
public static void main(String[] args) throws IOException, SAXException {

    File docs = new File("documents");
    File indexDir = new File(INDEX_DIRECTORY);

    // listFiles() returns null when the path is not a directory or cannot be read;
    // fail with a clear message before the existing index is touched.
    File[] files = docs.listFiles();
    if (files == null) {
        throw new IOException("Cannot list files in directory: " + docs.getAbsolutePath());
    }

    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
    IndexWriter writer = new IndexWriter(FSDirectory.open(indexDir), analyzer, true,
            IndexWriter.MaxFieldLength.LIMITED);
    boolean committed = false;
    try {
        System.out.println(indexDir);
        writer.deleteAll();

        // The parser does not depend on the file being processed, so build it once.
        Parser parser = new AutoDetectParser();

        for (File file : files) {
            // Sub-directories cannot be opened as a stream; skip anything that is not a plain file.
            if (!file.isFile()) {
                continue;
            }

            Metadata metadata = new Metadata();
            ContentHandler handler = new BodyContentHandler();
            ParseContext context = new ParseContext();
            InputStream stream = new FileInputStream(file);
            try {
                parser.parse(stream, handler, metadata, context);
            } catch (TikaException e) {
                // Best effort: report the failure and index whatever was extracted so far.
                System.err.println("Failed to parse " + file + ": " + e.getMessage());
                e.printStackTrace();
            } catch (IOException e) {
                System.err.println("Failed to read " + file + ": " + e.getMessage());
                e.printStackTrace();
            } finally {
                stream.close();
            }

            String text = handler.toString();
            String fileName = file.getName();

            Document doc = new Document();
            doc.add(new Field("file", fileName, Store.YES, Index.NO));

            for (String key : metadata.names()) {
                String name = key.toLowerCase();
                String value = metadata.get(key);

                if (StringUtils.isEmpty(value)) {
                    continue;
                }

                if ("keywords".equalsIgnoreCase(key)) {
                    // One field instance per keyword so each can be matched exactly.
                    for (String keyword : value.split(",?(\\s+)")) {
                        doc.add(new Field(name, keyword, Store.YES, Index.NOT_ANALYZED));
                    }
                } else if ("title".equalsIgnoreCase(key)) {
                    doc.add(new Field(name, value, Store.YES, Index.ANALYZED));
                } else {
                    // Store the metadata value itself (previously the file name was stored here).
                    doc.add(new Field(name, value, Store.YES, Index.NOT_ANALYZED));
                }
            }
            doc.add(new Field("text", text, Store.NO, Index.ANALYZED));
            writer.addDocument(doc);
        }

        writer.commit();
        committed = true;

        System.out.println(writer.maxDoc() + " documents written");
    } finally {
        // Always release the writer; on failure discard the uncommitted changes
        // instead of committing a partially built index.
        if (committed) {
            writer.close();
        } else {
            writer.rollback();
        }
    }
}

From source file:us.colloquy.sandbox.TestExtractor.java

/**
 * Parses a sample XHTML file with {@code XMLParser}, feeding the SAX events to both a
 * body-text handler and a link handler, then prints the available metadata fields,
 * the title, and the extracted body text one line at a time.
 */
@Test
public void getContent() throws TikaException, SAXException, IOException {

    File source = new File("temp/OEBPS/Text/0001_1006_2002.xhtml");
    byte[] rawBytes = FileUtils.readFileToByteArray(source);
    InputStream in = new ByteArrayInputStream(rawBytes);

    // Both handlers are wrapped in a single TeeContentHandler for the one parse call.
    LinkContentHandler linkCollector = new LinkContentHandler();
    ContentHandler bodyText = new BodyContentHandler();
    ContentHandler combined = new TeeContentHandler(linkCollector, bodyText);

    Metadata meta = new Metadata();
    ParseContext parseContext = new ParseContext();
    Parser xmlParser = new XMLParser();
    xmlParser.parse(in, combined, meta, parseContext);

    listAvailableMetaDataFields(meta);

    String title = meta.get(Metadata.TITLE);
    System.out.println("Title: " + title);

    // Print the extracted body text, one output line per source line.
    for (String bodyLine : bodyText.toString().split("\n")) {
        System.out.println(bodyLine);
    }
}

From source file:us.colloquy.sandbox.TestExtractor.java

/**
 * Parses a sample XHTML file with {@code AutoDetectParser}, routing the SAX events
 * through a {@code MatchingContentHandler} into a {@code ToXMLContentHandler}, and
 * prints the handler's string form.
 */
@Test
public void parseOnePartToHTML() throws IOException, SAXException, TikaException {
    File source = new File("temp/OEBPS/Text/0001_1006_2002.xhtml");
    Metadata meta = new Metadata();
    AutoDetectParser autoParser = new AutoDetectParser();

    // The matcher is built from the expression "*" in the XHTML namespace with an empty prefix.
    // NOTE(review): confirm which elements "*" selects; no narrower path is used here.
    Matcher contentMatcher = new XPathParser("", XHTMLContentHandler.XHTML).parse("*");
    ContentHandler filtered = new MatchingContentHandler(new ToXMLContentHandler(), contentMatcher);

    try (InputStream in = new ByteArrayInputStream(FileUtils.readFileToByteArray(source))) {
        autoParser.parse(in, filtered, meta);
        String rendered = filtered.toString();
        System.out.println(rendered);
    }
}