Example usage for org.apache.poi.hwpf.usermodel HeaderStories getFirstFooterSubrange

List of usage examples for org.apache.poi.hwpf.usermodel HeaderStories getFirstFooterSubrange

Introduction

In this page you can find the example usage for org.apache.poi.hwpf.usermodel HeaderStories getFirstFooterSubrange.

Prototype

public Range getFirstFooterSubrange() 

Source Link

Usage

From source file:mj.ocraptor.extraction.tika.parser.microsoft.WordExtractor.java

License:Apache License

protected void parse(DirectoryNode root, XHTMLContentHandler xhtml)
        throws IOException, SAXException, TikaException {
    HWPFDocument document;//from w w w.  j a  v  a 2 s . com
    try {
        document = new HWPFDocument(root);
    } catch (OldWordFileFormatException e) {
        parseWord6(root, xhtml);
        return;
    }

    org.apache.poi.hwpf.extractor.WordExtractor wordExtractor = new org.apache.poi.hwpf.extractor.WordExtractor(
            document);

    // mj
    extractImageText(xhtml, document);

    HeaderStories headerFooter = new HeaderStories(document);

    // Grab the list of pictures. As far as we can tell,
    // the pictures should be in order, and may be directly
    // placed or referenced from an anchor
    PicturesTable pictureTable = document.getPicturesTable();
    PicturesSource pictures = new PicturesSource(document);

    // Do any headers, if present
    Range[] headers = new Range[] { headerFooter.getFirstHeaderSubrange(), headerFooter.getEvenHeaderSubrange(),
            headerFooter.getOddHeaderSubrange() };
    handleHeaderFooter(headers, "header", document, pictures, pictureTable, xhtml);

    // Do the main paragraph text
    Range r = document.getRange();
    for (int i = 0; i < r.numParagraphs(); i++) {
        Paragraph p = r.getParagraph(i);
        i += handleParagraph(p, 0, r, document, FieldsDocumentPart.MAIN, pictures, pictureTable, xhtml);
    }

    // Do everything else
    for (String paragraph : wordExtractor.getMainTextboxText()) {
        xhtml.element("p", paragraph);
    }

    for (String paragraph : wordExtractor.getFootnoteText()) {
        xhtml.element("p", paragraph);
    }

    for (String paragraph : wordExtractor.getCommentsText()) {
        xhtml.element("p", paragraph);
    }

    for (String paragraph : wordExtractor.getEndnoteText()) {
        xhtml.element("p", paragraph);
    }

    // Do any footers, if present
    Range[] footers = new Range[] { headerFooter.getFirstFooterSubrange(), headerFooter.getEvenFooterSubrange(),
            headerFooter.getOddFooterSubrange() };
    handleHeaderFooter(footers, "footer", document, pictures, pictureTable, xhtml);

    // Handle any pictures that we haven't output yet
    for (Picture p = pictures.nextUnclaimed(); p != null;) {
        handlePictureCharacterRun(null, p, pictures, xhtml);
        p = pictures.nextUnclaimed();
    }

    // Handle any embeded office documents
    try {
        DirectoryEntry op = (DirectoryEntry) root.getEntry("ObjectPool");
        for (Entry entry : op) {
            if (entry.getName().startsWith("_") && entry instanceof DirectoryEntry) {
                handleEmbeddedOfficeDoc((DirectoryEntry) entry, xhtml);
            }
        }
    } catch (FileNotFoundException e) {
    }
}

From source file:org.apache.tika.parser.microsoft.WordExtractor.java

License:Apache License

protected void parse(DirectoryNode root, XHTMLContentHandler xhtml)
        throws IOException, SAXException, TikaException {
    HWPFDocument document;/*from w w w . ja va2 s . com*/
    try {
        document = new HWPFDocument(root);
    } catch (OldWordFileFormatException e) {
        parseWord6(root, xhtml);
        return;
    }
    org.apache.poi.hwpf.extractor.WordExtractor wordExtractor = new org.apache.poi.hwpf.extractor.WordExtractor(
            document);
    HeaderStories headerFooter = new HeaderStories(document);

    // Grab the list of pictures. As far as we can tell,
    //  the pictures should be in order, and may be directly
    //  placed or referenced from an anchor
    PicturesTable pictureTable = document.getPicturesTable();
    PicturesSource pictures = new PicturesSource(document);

    // Do any headers, if present
    Range[] headers = new Range[] { headerFooter.getFirstHeaderSubrange(), headerFooter.getEvenHeaderSubrange(),
            headerFooter.getOddHeaderSubrange() };
    handleHeaderFooter(headers, "header", document, pictures, pictureTable, xhtml);

    // Do the main paragraph text
    Range r = document.getRange();
    ListManager listManager = new ListManager(document);
    for (int i = 0; i < r.numParagraphs(); i++) {
        Paragraph p = r.getParagraph(i);
        i += handleParagraph(p, 0, r, document, FieldsDocumentPart.MAIN, pictures, pictureTable, listManager,
                xhtml);
    }

    // Do everything else
    for (String paragraph : wordExtractor.getMainTextboxText()) {
        xhtml.element("p", paragraph);
    }

    for (String paragraph : wordExtractor.getFootnoteText()) {
        xhtml.element("p", paragraph);
    }

    for (String paragraph : wordExtractor.getCommentsText()) {
        xhtml.element("p", paragraph);
    }

    for (String paragraph : wordExtractor.getEndnoteText()) {
        xhtml.element("p", paragraph);
    }

    // Do any footers, if present
    Range[] footers = new Range[] { headerFooter.getFirstFooterSubrange(), headerFooter.getEvenFooterSubrange(),
            headerFooter.getOddFooterSubrange() };
    handleHeaderFooter(footers, "footer", document, pictures, pictureTable, xhtml);

    // Handle any pictures that we haven't output yet
    for (Picture p = pictures.nextUnclaimed(); p != null;) {
        handlePictureCharacterRun(null, p, pictures, xhtml);
        p = pictures.nextUnclaimed();
    }

    // Handle any embeded office documents
    try {
        DirectoryEntry op = (DirectoryEntry) root.getEntry("ObjectPool");
        for (Entry entry : op) {
            if (entry.getName().startsWith("_") && entry instanceof DirectoryEntry) {
                handleEmbeddedOfficeDoc((DirectoryEntry) entry, xhtml);
            }
        }
    } catch (FileNotFoundException e) {
    }
}