Example usage for org.apache.pdfbox.pdmodel.interactive.form PDAcroForm getXFA

List of usage examples for org.apache.pdfbox.pdmodel.interactive.form PDAcroForm getXFA

Introduction

On this page you can find example usages of org.apache.pdfbox.pdmodel.interactive.form PDAcroForm getXFA.

Prototype

public PDXFAResource getXFA() 

Source Link

Document

Get the XFA resource. The XFA resource is only used for PDF 1.5+ forms.

Usage

From source file:org.apache.tika.parser.pdf.AbstractPDF2XHTML.java

License:Apache License

/**
 * Emits the form content of a PDF document through the enclosing object's
 * {@code xhtml} handler.
 *
 * <p>If the AcroForm carries an XFA resource, that is extracted first and the
 * method returns on success. If the XFA bytes cannot be read, or the XFA
 * cannot be parsed as XML, the failure is recorded in {@code metadata} and the
 * classic AcroForm fields are written instead, as an ordered list inside a
 * {@code div} with class {@code acroform}.
 *
 * <p>Derived from Ben Litchfield's
 * {@code org.apache.pdfbox.examples.fdf.PrintFields} example.
 *
 * @param pdf the document whose form should be extracted
 * @throws IOException   propagated from XFA extraction or field processing
 * @throws SAXException  propagated from the XHTML handler
 * @throws TikaException propagated from the called extraction helpers
 */
void extractAcroForm(PDDocument pdf) throws IOException, SAXException, TikaException {
    PDDocumentCatalog docCatalog = pdf.getDocumentCatalog();
    if (docCatalog == null) {
        return;
    }

    PDAcroForm acroForm = docCatalog.getAcroForm();
    if (acroForm == null) {
        return;
    }

    // Prefer XFA when present; any recorded failure below falls through to
    // the traditional AcroForm field walk.
    PDXFAResource xfaResource = acroForm.getXFA();
    if (xfaResource != null) {
        XFAExtractor extractor = new XFAExtractor();
        InputStream xfaStream = null;
        try {
            byte[] xfaBytes = xfaResource.getBytes();
            xfaStream = new BufferedInputStream(new ByteArrayInputStream(xfaBytes));
        } catch (IOException e) {
            // Could not read the XFA bytes: note it and fall back.
            EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
        }

        if (xfaStream != null) {
            try {
                extractor.extract(xfaStream, xhtml, metadata, context);
                // XFA extraction succeeded, nothing more to do.
                return;
            } catch (XMLStreamException e) {
                // Malformed XFA XML: note it and fall back.
                EmbeddedDocumentUtil.recordException(e, metadata);
            } finally {
                IOUtils.closeQuietly(xfaStream);
            }
        }
    }

    @SuppressWarnings("rawtypes")
    List formFields = acroForm.getFields();
    if (formFields == null) {
        return;
    }

    @SuppressWarnings("rawtypes")
    ListIterator fieldCursor = formFields.listIterator();
    if (fieldCursor == null) {
        return;
    }

    xhtml.startElement("div", "class", "acroform");
    xhtml.startElement("ol");

    while (fieldCursor.hasNext()) {
        Object candidate = fieldCursor.next();
        // instanceof is already false for null, so no separate null test is needed.
        if (candidate instanceof PDField) {
            // second argument is passed as 0 for top-level fields
            // (presumably a nesting depth; confirm against processAcroField)
            processAcroField((PDField) candidate, 0);
        }
    }

    xhtml.endElement("ol");
    xhtml.endElement("div");
}

From source file:org.apache.tika.parser.pdf.EnhancedPDFParser.java

License:Apache License

/**
 * Extracts the textual content of a document's XFA form, if it has one.
 *
 * <p>The XFA bytes are decoded as UTF-8 and scanned with {@code PATTERN_TEXT}.
 * For every match the captured value is appended to the result followed by a
 * newline. Matches whose tag is {@code text} and whose attributes contain
 * {@code name="embeddedHref"} are skipped. For {@code exData} matches declaring
 * an XML or HTML content type, markup is replaced by spaces using
 * {@code PATTERN_STRIP_MARKUP}.
 *
 * @param pdfDocument the document to inspect
 * @return the extracted text, one matched value per line, or {@code null} when
 *         the document has no catalog, no AcroForm or no XFA resource
 * @throws IOException if the XFA bytes cannot be read
 */
private String extractXFAText(PDDocument pdfDocument) throws IOException {
    PDDocumentCatalog catalog = pdfDocument.getDocumentCatalog();
    String xfaXml = null;
    if (catalog != null) {
        PDAcroForm acroForm = catalog.getAcroForm();
        if (acroForm != null) {
            PDXFAResource xfa = acroForm.getXFA();
            if (xfa != null) {
                //TODO consider streaming and writing as we read along
                //to preserve memory.
                //See and replicate how xfa getBytes() does it.
                // Decode explicitly as UTF-8 (the XML default encoding) rather than
                // the platform charset, which would garble non-ASCII text on JVMs
                // whose default is not UTF-8.
                xfaXml = new String(xfa.getBytes(), java.nio.charset.StandardCharsets.UTF_8);
            }
        }
    }
    // No XFA, do nothing
    if (xfaXml == null) {
        return null;
    }

    // Extract text from XFA
    StringBuilder b = new StringBuilder();
    Matcher m = PATTERN_TEXT.matcher(xfaXml);
    while (m.find()) {
        // Groups 1..3 are used as tag name, attribute string and element value.
        // NOTE(review): assumes getMatchGroup never returns null - confirm.
        String tag = getMatchGroup(m, 1);
        boolean isText = "text".equals(tag);
        String attribs = getMatchGroup(m, 2);
        String value = getMatchGroup(m, 3);

        // Reject href text
        if (isText && attribs.contains("name=\"embeddedHref\"")) {
            continue;
        }

        // Get text from free-form exData
        if ("exData".equals(tag)) {
            if (attribs.contains("contentType=\"application/xml\"")
                    || attribs.contains("contentType=\"text/html\"")
                    || attribs.contains("contentType=\"text/xml\"")) {
                value = PATTERN_STRIP_MARKUP.matcher(value).replaceAll(" ");
            }
        }
        b.append(value);
        b.append("\n");
    }
    return b.toString();
}

From source file:org.apache.tika.parser.pdf.PDF2XHTML.java

License:Apache License

/**
 * Emits the form content of a PDF document to the given XHTML handler.
 *
 * <p>If the AcroForm carries XFA data, that is extracted first and the method
 * returns on success. If XFA extraction fails with an XML parse error or an
 * I/O error, the failure is deliberately ignored and the classic AcroForm
 * fields are written instead, as an ordered list inside a {@code div} with
 * class {@code acroform}.
 *
 * <p>Derived from Ben Litchfield's
 * {@code org.apache.pdfbox.examples.fdf.PrintFields} example.
 *
 * @param pdf     the document whose form should be extracted
 * @param handler the XHTML handler that receives the output
 * @throws IOException  propagated from field processing (I/O errors during
 *                      XFA extraction are swallowed by the fallback)
 * @throws SAXException propagated from the handler
 */
private void extractAcroForm(PDDocument pdf, XHTMLContentHandler handler) throws IOException, SAXException {
    PDDocumentCatalog catalog = pdf.getDocumentCatalog();
    if (catalog == null) {
        return;
    }

    PDAcroForm form = catalog.getAcroForm();
    if (form == null) {
        return;
    }

    //if it has xfa, try that.
    //if it doesn't exist or there's an exception,
    //go with traditional AcroForm
    PDXFA pdxfa = form.getXFA();
    if (pdxfa != null) {
        XFAExtractor xfaExtractor = new XFAExtractor();
        // try-with-resources guarantees the stream is closed on every path,
        // including when a SAXException propagates out of extract().
        try (BufferedInputStream xfaStream =
                new BufferedInputStream(new ByteArrayInputStream(pdxfa.getBytes()))) {
            xfaExtractor.extract(xfaStream, handler, metadata);
            return;
        } catch (XMLStreamException | IOException ignored) {
            // Intentionally ignored: unreadable or malformed XFA is best-effort,
            // so fall through to the traditional AcroForm fields below.
        }
    }

    @SuppressWarnings("rawtypes")
    List fields = form.getFields();
    if (fields == null) {
        return;
    }

    handler.startElement("div", "class", "acroform");
    handler.startElement("ol");

    // List.listIterator() never returns null, so a plain for-each replaces the
    // former iterator null check. instanceof already rejects null elements.
    for (Object obj : fields) {
        if (obj instanceof PDField) {
            // last argument is passed as 0 for top-level fields
            // (presumably a nesting depth; confirm against processAcroField)
            processAcroField((PDField) obj, handler, 0);
        }
    }

    handler.endElement("ol");
    handler.endElement("div");
}