List of usage examples for the org.xml.sax.helpers.AttributesImpl constructor AttributesImpl()
public AttributesImpl()
From source file:org.apache.tika.parser.microsoft.ooxml.SXSLFPowerPointExtractorDecorator.java
/** * This should handle the comments, master, notes, etc * * @param contentType/*from w w w . jav a 2 s . co m*/ * @param xhtmlClassLabel * @param parentPart * @param contentHandler */ private void handleBasicRelatedParts(String contentType, String xhtmlClassLabel, PackagePart parentPart, ContentHandler contentHandler) throws SAXException { PackageRelationshipCollection relatedPartPRC = null; try { relatedPartPRC = parentPart.getRelationshipsByType(contentType); } catch (InvalidFormatException e) { metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ExceptionUtils.getStackTrace(e)); } if (relatedPartPRC != null && relatedPartPRC.size() > 0) { AttributesImpl attributes = new AttributesImpl(); attributes.addAttribute("", "class", "class", "CDATA", xhtmlClassLabel); contentHandler.startElement("", "div", "div", attributes); for (int i = 0; i < relatedPartPRC.size(); i++) { PackageRelationship relatedPartPackageRelationship = relatedPartPRC.getRelationship(i); try { PackagePart relatedPartPart = parentPart.getRelatedPart(relatedPartPackageRelationship); try (InputStream stream = relatedPartPart.getInputStream()) { context.getSAXParser().parse(stream, new OfflineContentHandler(new EmbeddedContentHandler(contentHandler))); } catch (IOException | TikaException e) { metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ExceptionUtils.getStackTrace(e)); } } catch (InvalidFormatException e) { metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ExceptionUtils.getStackTrace(e)); } } contentHandler.endElement("", "div", "div"); } }
From source file:org.apache.tika.parser.pdf.AbstractPDF2XHTML.java
private void processEmbeddedDocNames(Map<String, PDComplexFileSpecification> embeddedFileNames) throws IOException, SAXException, TikaException { if (embeddedFileNames == null || embeddedFileNames.isEmpty()) { return;//from w w w . j a v a 2 s . c o m } for (Map.Entry<String, PDComplexFileSpecification> ent : embeddedFileNames.entrySet()) { processDoc(ent.getKey(), ent.getValue(), new AttributesImpl()); } }
From source file:org.apache.tika.parser.pdf.AbstractPDF2XHTML.java
@Override protected void endPage(PDPage page) throws IOException { try {//from w ww . jav a2 s. c o m for (PDAnnotation annotation : page.getAnnotations()) { if (annotation instanceof PDAnnotationFileAttachment) { PDAnnotationFileAttachment fann = (PDAnnotationFileAttachment) annotation; PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) fann.getFile(); try { AttributesImpl attributes = new AttributesImpl(); attributes.addAttribute("", "source", "source", "CDATA", "annotation"); extractMultiOSPDEmbeddedFiles(fann.getAttachmentName(), fileSpec, attributes); } catch (SAXException e) { throw new IOExceptionWithCause("file embedded in annotation sax exception", e); } catch (TikaException e) { throw new IOExceptionWithCause("file embedded in annotation tika exception", e); } catch (IOException e) { handleCatchableIOE(e); } } else if (annotation instanceof PDAnnotationWidget) { handleWidget((PDAnnotationWidget) annotation); } // TODO: remove once PDFBOX-1143 is fixed: if (config.getExtractAnnotationText()) { PDActionURI uri = getActionURI(annotation); if (uri != null) { String link = uri.getURI(); if (link != null && link.trim().length() > 0) { xhtml.startElement("div", "class", "annotation"); xhtml.startElement("a", "href", link); xhtml.characters(link); xhtml.endElement("a"); xhtml.endElement("div"); } } if (annotation instanceof PDAnnotationMarkup) { PDAnnotationMarkup annotationMarkup = (PDAnnotationMarkup) annotation; String title = annotationMarkup.getTitlePopup(); String subject = annotationMarkup.getSubject(); String contents = annotationMarkup.getContents(); // TODO: maybe also annotationMarkup.getRichContents()? 
if (title != null || subject != null || contents != null) { xhtml.startElement("div", "class", "annotation"); if (title != null) { xhtml.startElement("div", "class", "annotationTitle"); xhtml.characters(title); xhtml.endElement("div"); } if (subject != null) { xhtml.startElement("div", "class", "annotationSubject"); xhtml.characters(subject); xhtml.endElement("div"); } if (contents != null) { xhtml.startElement("div", "class", "annotationContents"); xhtml.characters(contents); xhtml.endElement("div"); } xhtml.endElement("div"); } } } } if (config.getOcrStrategy().equals(PDFParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION)) { doOCROnCurrentPage(); } PDPageAdditionalActions pageActions = page.getActions(); if (pageActions != null) { handleDestinationOrAction(pageActions.getC(), ActionTrigger.PAGE_CLOSE); handleDestinationOrAction(pageActions.getO(), ActionTrigger.PAGE_OPEN); } xhtml.endElement("div"); } catch (SAXException | TikaException e) { throw new IOExceptionWithCause("Unable to end a page", e); } catch (IOException e) { exceptions.add(e); } finally { pageIndex++; } }
From source file:org.apache.tika.parser.pdf.AbstractPDF2XHTML.java
private void handleDestinationOrAction(PDDestinationOrAction action, ActionTrigger actionTrigger) throws IOException, SAXException, TikaException { if (action == null || !config.getExtractActions()) { return;// w ww . j a v a2 s. c o m } AttributesImpl attributes = new AttributesImpl(); String actionOrDestString = (action instanceof PDAction) ? "action" : "destination"; addNonNullAttribute("class", actionOrDestString, attributes); addNonNullAttribute("type", action.getClass().getSimpleName(), attributes); addNonNullAttribute("trigger", actionTrigger.name(), attributes); if (action instanceof PDActionImportData) { processDoc("", ((PDActionImportData) action).getFile(), attributes); } else if (action instanceof PDActionLaunch) { PDActionLaunch pdActionLaunch = (PDActionLaunch) action; addNonNullAttribute("id", pdActionLaunch.getF(), attributes); addNonNullAttribute("defaultDirectory", pdActionLaunch.getD(), attributes); addNonNullAttribute("operation", pdActionLaunch.getO(), attributes); addNonNullAttribute("parameters", pdActionLaunch.getP(), attributes); processDoc(pdActionLaunch.getF(), pdActionLaunch.getFile(), attributes); } else if (action instanceof PDActionRemoteGoTo) { PDActionRemoteGoTo remoteGoTo = (PDActionRemoteGoTo) action; processDoc("", remoteGoTo.getFile(), attributes); } else if (action instanceof PDActionJavaScript) { PDActionJavaScript jsAction = (PDActionJavaScript) action; Metadata m = new Metadata(); m.set(Metadata.CONTENT_TYPE, "application/javascript"); m.set(Metadata.CONTENT_ENCODING, StandardCharsets.UTF_8.toString()); m.set(PDF.ACTION_TRIGGER, actionTrigger.toString()); m.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.MACRO.name()); String js = jsAction.getAction(); js = (js == null) ? 
"" : js; if (embeddedDocumentExtractor.shouldParseEmbedded(m)) { try (InputStream is = TikaInputStream.get(js.getBytes(StandardCharsets.UTF_8))) { embeddedDocumentExtractor.parseEmbedded(is, xhtml, m, false); } } addNonNullAttribute("class", "javascript", attributes); addNonNullAttribute("type", jsAction.getType(), attributes); addNonNullAttribute("subtype", jsAction.getSubType(), attributes); xhtml.startElement("div", attributes); xhtml.endElement("div"); } else { xhtml.startElement("div", attributes); xhtml.endElement("div"); } }
From source file:org.apache.tika.parser.pdf.AbstractPDF2XHTML.java
private void addFieldString(PDField field) throws SAXException { //Pick partial name to present in content and altName for attribute //Ignoring FullyQualifiedName for now String partName = field.getPartialName(); String altName = field.getAlternateFieldName(); StringBuilder sb = new StringBuilder(); AttributesImpl attrs = new AttributesImpl(); if (partName != null) { sb.append(partName).append(": "); }//from w ww. java 2s.c o m if (altName != null) { attrs.addAttribute("", "altName", "altName", "CDATA", altName); } //return early if PDSignature field if (field instanceof PDSignatureField) { handleSignature(attrs, (PDSignatureField) field); return; } String value = field.getValueAsString(); if (value != null && !value.equals("null")) { sb.append(value); } if (attrs.getLength() > 0 || sb.length() > 0) { xhtml.startElement("li", attrs); xhtml.characters(sb.toString()); xhtml.endElement("li"); } }
From source file:org.apache.tika.parser.pdf.AbstractPDF2XHTML.java
private void handleSignature(AttributesImpl parentAttributes, PDSignatureField sigField) throws SAXException { PDSignature sig = sigField.getSignature(); if (sig == null) { return;/* www . j a va2 s.c o m*/ } Map<String, String> vals = new TreeMap<>(); vals.put("name", sig.getName()); vals.put("contactInfo", sig.getContactInfo()); vals.put("location", sig.getLocation()); vals.put("reason", sig.getReason()); Calendar cal = sig.getSignDate(); if (cal != null) { dateFormat.setTimeZone(cal.getTimeZone()); vals.put("date", dateFormat.format(cal.getTime())); } //see if there is any data int nonNull = 0; for (String val : vals.keySet()) { if (val != null && !val.equals("")) { nonNull++; } } //if there is, process it if (nonNull > 0) { xhtml.startElement("li", parentAttributes); AttributesImpl attrs = new AttributesImpl(); attrs.addAttribute("", "type", "type", "CDATA", "signaturedata"); xhtml.startElement("ol", attrs); for (Map.Entry<String, String> e : vals.entrySet()) { if (e.getValue() == null || e.getValue().equals("")) { continue; } attrs = new AttributesImpl(); attrs.addAttribute("", "signdata", "signdata", "CDATA", e.getKey()); xhtml.startElement("li", attrs); xhtml.characters(e.getValue()); xhtml.endElement("li"); } xhtml.endElement("ol"); xhtml.endElement("li"); } }
From source file:org.apache.tika.parser.pdf.PDF2XHTML.java
private void extractImages(PDResources resources, Set<COSBase> seenThisPage) throws SAXException, IOException { if (resources == null || config.getExtractInlineImages() == false) { return;//w w w . j av a2s. com } for (COSName name : resources.getXObjectNames()) { PDXObject object = resources.getXObject(name); if (object == null) { continue; } COSBase cosObject = object.getCOSObject(); if (seenThisPage.contains(cosObject)) { //avoid infinite recursion TIKA-1742 continue; } seenThisPage.add(cosObject); if (object instanceof PDFormXObject) { extractImages(((PDFormXObject) object).getResources(), seenThisPage); } else if (object instanceof PDImageXObject) { PDImageXObject image = (PDImageXObject) object; Metadata metadata = new Metadata(); String extension = image.getSuffix(); if (extension == null) { metadata.set(Metadata.CONTENT_TYPE, "image/png"); extension = "png"; } else if (extension.equals("jpg")) { metadata.set(Metadata.CONTENT_TYPE, "image/jpeg"); } else if (extension.equals("tiff")) { metadata.set(Metadata.CONTENT_TYPE, "image/tiff"); extension = "tif"; } else { //TODO: determine if we need to add more image types //throw new RuntimeException("EXTEN:" + extension); } Integer imageNumber = processedInlineImages.get(name.getName()); if (imageNumber == null) { imageNumber = inlineImageCounter++; } String fileName = "image" + imageNumber + "." + extension; metadata.set(Metadata.RESOURCE_NAME_KEY, fileName); // Output the img tag AttributesImpl attr = new AttributesImpl(); attr.addAttribute("", "src", "src", "CDATA", "embedded:" + fileName); attr.addAttribute("", "alt", "alt", "CDATA", fileName); handler.startElement("img", attr); handler.endElement("img"); //Do we only want to process unique COSObject ids? //If so, have we already processed this one? 
if (config.getExtractUniqueInlineImagesOnly() == true) { String cosObjectId = name.getName(); if (processedInlineImages.containsKey(cosObjectId)) { continue; } processedInlineImages.put(cosObjectId, imageNumber); } metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.INLINE.toString()); EmbeddedDocumentExtractor extractor = getEmbeddedDocumentExtractor(); if (extractor.shouldParseEmbedded(metadata)) { ByteArrayOutputStream buffer = new ByteArrayOutputStream(); try { //TODO: handle image.getMetadata()? writeToBuffer(image, extension, buffer); extractor.parseEmbedded(new ByteArrayInputStream(buffer.toByteArray()), new EmbeddedContentHandler(handler), metadata, false); } catch (IOException e) { // could not extract this image, so just skip it... } } } } }
From source file:org.apache.tika.parser.pdf.PDF2XHTML.java
private void extractPDEmbeddedFile(String defaultName, String fileName, PDEmbeddedFile file, EmbeddedDocumentExtractor extractor) throws SAXException, IOException, TikaException { if (file == null) { //skip silently return;// www. j a va 2 s . com } fileName = (fileName == null) ? defaultName : fileName; // TODO: other metadata? Metadata metadata = new Metadata(); metadata.set(Metadata.RESOURCE_NAME_KEY, fileName); metadata.set(Metadata.CONTENT_TYPE, file.getSubtype()); metadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.getSize())); metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString()); if (extractor.shouldParseEmbedded(metadata)) { TikaInputStream stream = null; try { stream = TikaInputStream.get(file.createInputStream()); extractor.parseEmbedded(stream, new EmbeddedContentHandler(handler), metadata, false); AttributesImpl attributes = new AttributesImpl(); attributes.addAttribute("", "class", "class", "CDATA", "embedded"); attributes.addAttribute("", "id", "id", "CDATA", fileName); handler.startElement("div", attributes); handler.endElement("div"); } finally { IOUtils.closeQuietly(stream); } } }
From source file:org.apache.tika.parser.pdf.PDF2XHTML.java
private void addFieldString(PDField field, XHTMLContentHandler handler) throws SAXException { //Pick partial name to present in content and altName for attribute //Ignoring FullyQualifiedName for now String partName = field.getPartialName(); String altName = field.getAlternateFieldName(); StringBuilder sb = new StringBuilder(); AttributesImpl attrs = new AttributesImpl(); if (partName != null) { sb.append(partName).append(": "); }//from w w w .j av a 2 s. c o m if (altName != null) { attrs.addAttribute("", "altName", "altName", "CDATA", altName); } //return early if PDSignature field if (field instanceof PDSignatureField) { handleSignature(attrs, (PDSignatureField) field, handler); return; } String value = field.getValueAsString(); if (value != null && !value.equals("null")) { sb.append(value); } if (attrs.getLength() > 0 || sb.length() > 0) { handler.startElement("li", attrs); handler.characters(sb.toString()); handler.endElement("li"); } }
From source file:org.apache.tika.parser.pdf.PDF2XHTML.java
private void handleSignature(AttributesImpl parentAttributes, PDSignatureField sigField, XHTMLContentHandler handler) throws SAXException { PDSignature sig = sigField.getSignature(); if (sig == null) { return;// w ww . ja v a 2 s. com } Map<String, String> vals = new TreeMap<>(); vals.put("name", sig.getName()); vals.put("contactInfo", sig.getContactInfo()); vals.put("location", sig.getLocation()); vals.put("reason", sig.getReason()); Calendar cal = sig.getSignDate(); if (cal != null) { dateFormat.setTimeZone(cal.getTimeZone()); vals.put("date", dateFormat.format(cal.getTime())); } //see if there is any data int nonNull = 0; for (String val : vals.keySet()) { if (val != null && !val.equals("")) { nonNull++; } } //if there is, process it if (nonNull > 0) { handler.startElement("li", parentAttributes); AttributesImpl attrs = new AttributesImpl(); attrs.addAttribute("", "type", "type", "CDATA", "signaturedata"); handler.startElement("ol", attrs); for (Map.Entry<String, String> e : vals.entrySet()) { if (e.getValue() == null || e.getValue().equals("")) { continue; } attrs = new AttributesImpl(); attrs.addAttribute("", "signdata", "signdata", "CDATA", e.getKey()); handler.startElement("li", attrs); handler.characters(e.getValue()); handler.endElement("li"); } handler.endElement("ol"); handler.endElement("li"); } }