List of usage examples for org.apache.pdfbox.pdmodel.interactive.form.PDXFAResource#getBytes()
public byte[] getBytes() throws IOException
From source file:org.apache.tika.parser.pdf.AbstractPDF2XHTML.java
License:Apache License
void extractAcroForm(PDDocument pdf) throws IOException, SAXException, TikaException { //Thank you, Ben Litchfield, for org.apache.pdfbox.examples.fdf.PrintFields //this code derives from Ben's code PDDocumentCatalog catalog = pdf.getDocumentCatalog(); if (catalog == null) return;//from w w w. j av a2s .com PDAcroForm form = catalog.getAcroForm(); if (form == null) return; //if it has xfa, try that. //if it doesn't exist or there's an exception, //go with traditional AcroForm PDXFAResource pdxfa = form.getXFA(); if (pdxfa != null) { //if successful, return XFAExtractor xfaExtractor = new XFAExtractor(); InputStream is = null; try { is = new BufferedInputStream(new ByteArrayInputStream(pdxfa.getBytes())); } catch (IOException e) { EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata); } if (is != null) { try { xfaExtractor.extract(is, xhtml, metadata, context); return; } catch (XMLStreamException e) { //if there was an xml parse exception in xfa, try the AcroForm EmbeddedDocumentUtil.recordException(e, metadata); } finally { IOUtils.closeQuietly(is); } } } @SuppressWarnings("rawtypes") List fields = form.getFields(); if (fields == null) return; @SuppressWarnings("rawtypes") ListIterator itr = fields.listIterator(); if (itr == null) return; xhtml.startElement("div", "class", "acroform"); xhtml.startElement("ol"); while (itr.hasNext()) { Object obj = itr.next(); if (obj != null && obj instanceof PDField) { processAcroField((PDField) obj, 0); } } xhtml.endElement("ol"); xhtml.endElement("div"); }
From source file:org.apache.tika.parser.pdf.EnhancedPDFParser.java
License:Apache License
private String extractXFAText(PDDocument pdfDocument) throws IOException { PDDocumentCatalog catalog = pdfDocument.getDocumentCatalog(); String xfaXml = null;/* w w w . j a v a 2 s. co m*/ if (catalog != null) { PDAcroForm acroForm = catalog.getAcroForm(); if (acroForm != null) { PDXFAResource xfa = acroForm.getXFA(); if (xfa != null) { //TODO consider streaming and writing as we read along //to preserve memory. //See and replicate how xfa getBytes() does it. xfaXml = new String(xfa.getBytes()); } } } // No XFA, do nothing if (xfaXml == null) { return null; } // Extract text from XFA StringBuilder b = new StringBuilder(); Matcher m = PATTERN_TEXT.matcher(xfaXml); while (m.find()) { String tag = getMatchGroup(m, 1); boolean isText = "text".equals(tag); String attribs = getMatchGroup(m, 2); String value = getMatchGroup(m, 3); // Reject href text if (isText && attribs.contains("name=\"embeddedHref\"")) { continue; } // Get text from free-form exData if ("exData".equals(tag)) { if (attribs.contains("contentType=\"application/xml\"") || attribs.contains("contentType=\"text/html\"") || attribs.contains("contentType=\"text/xml\"")) { value = PATTERN_STRIP_MARKUP.matcher(value).replaceAll(" "); } } b.append(value); b.append("\n"); } return b.toString(); }