Example usage for org.apache.pdfbox.pdmodel.interactive.form PDXFAResource getBytes

Introduction

On this page you can find example usages of org.apache.pdfbox.pdmodel.interactive.form.PDXFAResource.getBytes().

Prototype

public byte[] getBytes() throws IOException

Source Link

Document

Gets the XFA content as a byte array.

Usage

From source file: org.apache.tika.parser.pdf.AbstractPDF2XHTML.java

License:Apache License

/**
 * Writes the interactive form content of a PDF to the XHTML output.
 *
 * <p>If the form carries an XFA resource, its bytes are handed to an
 * {@code XFAExtractor} first; when that extraction completes, the method
 * returns without touching the AcroForm fields. If the XFA bytes cannot be
 * read, or the extractor throws an {@link XMLStreamException}, the problem is
 * recorded on the metadata and the AcroForm fields are written instead, as an
 * {@code <ol>} nested in a {@code <div class="acroform">}.
 *
 * <p>Nothing is written when the document has no catalog, no AcroForm, or
 * (after the XFA attempt) no field list.
 *
 * @param pdf the document whose form should be extracted
 * @throws IOException declared by the catalog/form accessors and the extractor
 * @throws SAXException declared by the XHTML handler calls
 * @throws TikaException declared by the extraction helpers
 */
void extractAcroForm(PDDocument pdf) throws IOException, SAXException, TikaException {
    // Thank you, Ben Litchfield, for org.apache.pdfbox.examples.fdf.PrintFields;
    // this code derives from Ben's code.
    PDDocumentCatalog docCatalog = pdf.getDocumentCatalog();
    if (docCatalog == null) {
        return;
    }

    PDAcroForm acroForm = docCatalog.getAcroForm();
    if (acroForm == null) {
        return;
    }

    // Prefer XFA when present; fall through to the traditional AcroForm
    // if it is absent or cannot be processed.
    PDXFAResource xfaResource = acroForm.getXFA();
    if (xfaResource != null) {
        XFAExtractor extractor = new XFAExtractor();
        InputStream xfaStream = null;
        try {
            byte[] xfaBytes = xfaResource.getBytes();
            xfaStream = new BufferedInputStream(new ByteArrayInputStream(xfaBytes));
        } catch (IOException readFailure) {
            EmbeddedDocumentUtil.recordEmbeddedStreamException(readFailure, metadata);
        }

        if (xfaStream != null) {
            try {
                extractor.extract(xfaStream, xhtml, metadata, context);
                // XFA extraction succeeded, so the AcroForm pass is skipped.
                return;
            } catch (XMLStreamException parseFailure) {
                // An XML parse problem in the XFA is not fatal: record it
                // and try the AcroForm below.
                EmbeddedDocumentUtil.recordException(parseFailure, metadata);
            } finally {
                IOUtils.closeQuietly(xfaStream);
            }
        }
    }

    List<?> formFields = acroForm.getFields();
    if (formFields == null) {
        return;
    }

    ListIterator<?> fieldCursor = formFields.listIterator();
    if (fieldCursor == null) {
        return;
    }

    xhtml.startElement("div", "class", "acroform");
    xhtml.startElement("ol");

    while (fieldCursor.hasNext()) {
        Object candidate = fieldCursor.next();
        // instanceof is false for null, so no separate null check is needed.
        if (!(candidate instanceof PDField)) {
            continue;
        }
        processAcroField((PDField) candidate, 0);
    }

    xhtml.endElement("ol");
    xhtml.endElement("div");
}

From source file: org.apache.tika.parser.pdf.EnhancedPDFParser.java

License:Apache License

/**
 * Extracts text from the XFA form of a PDF document.
 *
 * <p>The XFA content is read in full, decoded as UTF-8, and scanned with
 * {@code PATTERN_TEXT}. For every match the captured value is appended to the
 * result followed by a newline, except that:
 * <ul>
 *   <li>{@code text} elements whose attributes contain
 *       {@code name="embeddedHref"} are skipped;</li>
 *   <li>{@code exData} elements with an XML or HTML content type have their
 *       markup replaced by spaces via {@code PATTERN_STRIP_MARKUP}.</li>
 * </ul>
 *
 * @param pdfDocument the document to read the XFA form from
 * @return the extracted text, or {@code null} when the document has no
 *         catalog, no AcroForm, or no XFA resource
 * @throws IOException if the XFA content cannot be read
 */
private String extractXFAText(PDDocument pdfDocument) throws IOException {
    PDDocumentCatalog catalog = pdfDocument.getDocumentCatalog();
    String xfaXml = null;
    if (catalog != null) {
        PDAcroForm acroForm = catalog.getAcroForm();
        if (acroForm != null) {
            PDXFAResource xfa = acroForm.getXFA();
            if (xfa != null) {
                //TODO consider streaming and writing as we read along
                //to preserve memory.
                //See and replicate how xfa getBytes() does it.
                // Decode explicitly as UTF-8 (the XML default) rather than the
                // platform charset, which would corrupt non-ASCII text on
                // JVMs whose default is not UTF-8.
                // NOTE(review): an XFA packet declaring another encoding in
                // its XML prolog would still be mis-decoded - confirm whether
                // that occurs in practice.
                xfaXml = new String(xfa.getBytes(), java.nio.charset.StandardCharsets.UTF_8);
            }
        }
    }
    // No XFA, do nothing
    if (xfaXml == null) {
        return null;
    }

    // Extract text from XFA
    StringBuilder b = new StringBuilder();
    Matcher m = PATTERN_TEXT.matcher(xfaXml);
    while (m.find()) {
        String tag = getMatchGroup(m, 1);
        boolean isText = "text".equals(tag);
        String attribs = getMatchGroup(m, 2);
        String value = getMatchGroup(m, 3);

        // Reject href text
        if (isText && attribs.contains("name=\"embeddedHref\"")) {
            continue;
        }

        // Get text from free-form exData
        if ("exData".equals(tag)) {
            if (attribs.contains("contentType=\"application/xml\"")
                    || attribs.contains("contentType=\"text/html\"")
                    || attribs.contains("contentType=\"text/xml\"")) {
                value = PATTERN_STRIP_MARKUP.matcher(value).replaceAll(" ");
            }
        }
        b.append(value);
        b.append("\n");
    }
    return b.toString();
}