Example usage for org.apache.pdfbox.pdmodel.interactive.form PDXFAResource getBytes

List of usage examples for org.apache.pdfbox.pdmodel.interactive.form PDXFAResource getBytes

Introduction

On this page you can find example usages of the getBytes() method of org.apache.pdfbox.pdmodel.interactive.form.PDXFAResource.

Prototype

public byte[] getBytes() throws IOException 

Source Link

Document

Gets the XFA content as a byte array.

Usage

From source file: org.apache.tika.parser.pdf.AbstractPDF2XHTML.java

License:Apache License

/**
 * Writes the document's interactive form content to {@code xhtml}.
 *
 * <p>If the AcroForm carries an XFA resource, that is tried first: its bytes are
 * handed to an {@link XFAExtractor} and, on success, the method returns. If the
 * XFA bytes cannot be read ({@link IOException}) or the XFA fails to parse
 * ({@link XMLStreamException}), the problem is recorded on {@code metadata} and
 * the method falls back to walking the traditional AcroForm fields, which are
 * emitted as an {@code <ol>} inside a {@code <div class="acroform">}.
 *
 * <p>Nothing is written when the document has no catalog, no AcroForm, or no
 * field list.
 *
 * @param pdf the document whose form should be extracted
 * @throws IOException   declared by this method; may propagate from the calls made here
 * @throws SAXException  declared by this method; may propagate from the calls made here
 * @throws TikaException declared by this method; may propagate from the calls made here
 */
void extractAcroForm(PDDocument pdf) throws IOException, SAXException, TikaException {
    //Thank you, Ben Litchfield, for org.apache.pdfbox.examples.fdf.PrintFields
    //this code derives from Ben's code
    PDDocumentCatalog docCatalog = pdf.getDocumentCatalog();
    if (docCatalog == null) {
        return;
    }

    PDAcroForm acroForm = docCatalog.getAcroForm();
    if (acroForm == null) {
        return;
    }

    // Prefer XFA when present; any failure below falls through to the AcroForm path.
    PDXFAResource xfaResource = acroForm.getXFA();
    if (xfaResource != null) {
        XFAExtractor extractor = new XFAExtractor();

        InputStream xfaStream = null;
        try {
            byte[] xfaBytes = xfaResource.getBytes();
            xfaStream = new BufferedInputStream(new ByteArrayInputStream(xfaBytes));
        } catch (IOException e) {
            // Could not read the XFA bytes: record it and keep going.
            EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
        }

        if (xfaStream != null) {
            try {
                extractor.extract(xfaStream, xhtml, metadata, context);
                // XFA extraction succeeded, so the AcroForm fields are not processed.
                return;
            } catch (XMLStreamException e) {
                // XML parse failure in the XFA: record it and try the AcroForm instead.
                EmbeddedDocumentUtil.recordException(e, metadata);
            } finally {
                IOUtils.closeQuietly(xfaStream);
            }
        }
    }

    @SuppressWarnings("rawtypes")
    List formFields = acroForm.getFields();
    if (formFields == null) {
        return;
    }

    @SuppressWarnings("rawtypes")
    ListIterator fieldIterator = formFields.listIterator();
    if (fieldIterator == null) {
        return;
    }

    xhtml.startElement("div", "class", "acroform");
    xhtml.startElement("ol");

    while (fieldIterator.hasNext()) {
        Object candidate = fieldIterator.next();
        // instanceof is false for null, so no separate null check is needed.
        if (candidate instanceof PDField) {
            processAcroField((PDField) candidate, 0);
        }
    }

    xhtml.endElement("ol");
    xhtml.endElement("div");
}

From source file: org.apache.tika.parser.pdf.EnhancedPDFParser.java

License:Apache License

/**
 * Extracts text from the XFA resource of a PDF's AcroForm, if there is one.
 *
 * <p>The XFA bytes are decoded to a string and scanned with {@code PATTERN_TEXT}.
 * For every match, the captured value is appended to the result followed by a
 * newline, except that {@code text} elements whose attributes contain
 * {@code name="embeddedHref"} are skipped. For {@code exData} elements whose
 * content type is {@code application/xml}, {@code text/html} or {@code text/xml},
 * everything matched by {@code PATTERN_STRIP_MARKUP} in the value is replaced
 * with a single space first.
 *
 * @param pdfDocument the document to read the XFA resource from
 * @return the extracted text (possibly empty when nothing matched), or
 *         {@code null} when the document has no catalog, AcroForm or XFA resource
 * @throws IOException if the XFA bytes cannot be read
 */
private String extractXFAText(PDDocument pdfDocument) throws IOException {
    PDDocumentCatalog catalog = pdfDocument.getDocumentCatalog();
    String xfaXml = null;
    if (catalog != null) {
        PDAcroForm acroForm = catalog.getAcroForm();
        if (acroForm != null) {
            PDXFAResource xfa = acroForm.getXFA();
            if (xfa != null) {
                //TODO consider streaming and writing as we read along
                //to preserve memory.
                //See and replicate how xfa getBytes() does it.
                // Decode with an explicit charset: the no-charset String constructor
                // uses the platform default, which makes extraction host-dependent.
                // NOTE(review): UTF-8 is assumed here as the usual XML default;
                // confirm if XFA packets in other encodings must be supported.
                xfaXml = new String(xfa.getBytes(), java.nio.charset.StandardCharsets.UTF_8);
            }
        }
    }
    // No XFA, do nothing
    if (xfaXml == null) {
        return null;
    }

    // Extract text from XFA
    StringBuilder b = new StringBuilder();
    Matcher m = PATTERN_TEXT.matcher(xfaXml);
    while (m.find()) {
        // Group 1 = element name, group 2 = its attributes, group 3 = its content.
        String tag = getMatchGroup(m, 1);
        boolean isText = "text".equals(tag);
        String attribs = getMatchGroup(m, 2);
        String value = getMatchGroup(m, 3);

        // Reject href text
        if (isText && attribs.contains("name=\"embeddedHref\"")) {
            continue;
        }

        // Get text from free-form exData
        if ("exData".equals(tag)) {
            if (attribs.contains("contentType=\"application/xml\"")
                    || attribs.contains("contentType=\"text/html\"")
                    || attribs.contains("contentType=\"text/xml\"")) {
                value = PATTERN_STRIP_MARKUP.matcher(value).replaceAll(" ");
            }
        }
        b.append(value);
        b.append("\n");
    }
    return b.toString();
}