Example usage of the org.xml.sax.helpers.AttributesImpl constructor AttributesImpl()

List of usage examples for org.xml.sax.helpers AttributesImpl AttributesImpl

Introduction

On this page you can find example usages of the no-argument constructor of org.xml.sax.helpers.AttributesImpl.

Prototype

public AttributesImpl() 

Source Link

Document

Construct a new, empty AttributesImpl object.

Usage

From source file:org.apache.tika.parser.microsoft.ooxml.SXSLFPowerPointExtractorDecorator.java

/**
 * This should handle the comments, master, notes, etc
 *
 * @param contentType/*from  w w w . jav  a 2  s  .  co m*/
 * @param xhtmlClassLabel
 * @param parentPart
 * @param contentHandler
 */
private void handleBasicRelatedParts(String contentType, String xhtmlClassLabel, PackagePart parentPart,
        ContentHandler contentHandler) throws SAXException {

    PackageRelationshipCollection relatedPartPRC = null;

    try {
        relatedPartPRC = parentPart.getRelationshipsByType(contentType);
    } catch (InvalidFormatException e) {
        metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ExceptionUtils.getStackTrace(e));
    }
    if (relatedPartPRC != null && relatedPartPRC.size() > 0) {
        AttributesImpl attributes = new AttributesImpl();

        attributes.addAttribute("", "class", "class", "CDATA", xhtmlClassLabel);
        contentHandler.startElement("", "div", "div", attributes);
        for (int i = 0; i < relatedPartPRC.size(); i++) {
            PackageRelationship relatedPartPackageRelationship = relatedPartPRC.getRelationship(i);
            try {
                PackagePart relatedPartPart = parentPart.getRelatedPart(relatedPartPackageRelationship);
                try (InputStream stream = relatedPartPart.getInputStream()) {
                    context.getSAXParser().parse(stream,
                            new OfflineContentHandler(new EmbeddedContentHandler(contentHandler)));

                } catch (IOException | TikaException e) {
                    metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
                            ExceptionUtils.getStackTrace(e));
                }

            } catch (InvalidFormatException e) {
                metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ExceptionUtils.getStackTrace(e));
            }
        }
        contentHandler.endElement("", "div", "div");
    }

}

From source file:org.apache.tika.parser.pdf.AbstractPDF2XHTML.java

/**
 * Calls {@code processDoc} once per entry of {@code embeddedFileNames}, passing the entry's
 * key, its file specification and a fresh, empty {@link AttributesImpl}.
 *
 * @param embeddedFileNames name-to-file-specification map; {@code null} or empty is a no-op
 * @throws IOException   propagated from {@code processDoc}
 * @throws SAXException  propagated from {@code processDoc}
 * @throws TikaException propagated from {@code processDoc}
 */
private void processEmbeddedDocNames(Map<String, PDComplexFileSpecification> embeddedFileNames)
        throws IOException, SAXException, TikaException {
    // Only iterate when there is a non-empty map to walk.
    if (embeddedFileNames != null && !embeddedFileNames.isEmpty()) {
        for (Map.Entry<String, PDComplexFileSpecification> namedSpec : embeddedFileNames.entrySet()) {
            String docName = namedSpec.getKey();
            PDComplexFileSpecification spec = namedSpec.getValue();
            processDoc(docName, spec, new AttributesImpl());
        }
    }
}

From source file:org.apache.tika.parser.pdf.AbstractPDF2XHTML.java

/**
 * Finishes a page: processes its annotations, optionally runs OCR, reports the page's
 * close/open actions and closes the page's {@code div}.
 * <p>
 * {@code pageIndex} is incremented in a {@code finally} block, so it advances even when
 * this method fails.
 *
 * @param page the page that has just been processed
 * @throws IOException an {@code IOExceptionWithCause} ("Unable to end a page") when a
 *                     {@link SAXException} or {@link TikaException} escapes the page handling
 */
@Override
protected void endPage(PDPage page) throws IOException {

    try {
        for (PDAnnotation annotation : page.getAnnotations()) {

            if (annotation instanceof PDAnnotationFileAttachment) {
                PDAnnotationFileAttachment fann = (PDAnnotationFileAttachment) annotation;
                // NOTE(review): unchecked downcast; presumably getFile() can return other
                // file specification subtypes, which would raise ClassCastException — confirm.
                PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) fann.getFile();
                try {
                    // Tag the embedded file so consumers can tell it came from an annotation.
                    AttributesImpl attributes = new AttributesImpl();
                    attributes.addAttribute("", "source", "source", "CDATA", "annotation");
                    extractMultiOSPDEmbeddedFiles(fann.getAttachmentName(), fileSpec, attributes);
                } catch (SAXException e) {
                    // NOTE(review): IOExceptionWithCause is presumably an IOException subtype, in
                    // which case this lands in the outer catch (IOException) below — confirm.
                    throw new IOExceptionWithCause("file embedded in annotation sax exception", e);
                } catch (TikaException e) {
                    throw new IOExceptionWithCause("file embedded in annotation tika exception", e);
                } catch (IOException e) {
                    handleCatchableIOE(e);
                }
            } else if (annotation instanceof PDAnnotationWidget) {
                handleWidget((PDAnnotationWidget) annotation);
            }
            // TODO: remove once PDFBOX-1143 is fixed:
            if (config.getExtractAnnotationText()) {
                // Emit the annotation's URI action, if any, as div.annotation > a[href].
                PDActionURI uri = getActionURI(annotation);
                if (uri != null) {
                    String link = uri.getURI();
                    if (link != null && link.trim().length() > 0) {
                        xhtml.startElement("div", "class", "annotation");
                        xhtml.startElement("a", "href", link);
                        xhtml.characters(link);
                        xhtml.endElement("a");
                        xhtml.endElement("div");
                    }
                }

                // Markup annotations additionally contribute their title, subject and contents.
                if (annotation instanceof PDAnnotationMarkup) {
                    PDAnnotationMarkup annotationMarkup = (PDAnnotationMarkup) annotation;
                    String title = annotationMarkup.getTitlePopup();
                    String subject = annotationMarkup.getSubject();
                    String contents = annotationMarkup.getContents();
                    // TODO: maybe also annotationMarkup.getRichContents()?
                    if (title != null || subject != null || contents != null) {
                        xhtml.startElement("div", "class", "annotation");

                        if (title != null) {
                            xhtml.startElement("div", "class", "annotationTitle");
                            xhtml.characters(title);
                            xhtml.endElement("div");
                        }

                        if (subject != null) {
                            xhtml.startElement("div", "class", "annotationSubject");
                            xhtml.characters(subject);
                            xhtml.endElement("div");
                        }

                        if (contents != null) {
                            xhtml.startElement("div", "class", "annotationContents");
                            xhtml.characters(contents);
                            xhtml.endElement("div");
                        }

                        xhtml.endElement("div");
                    }
                }
            }
        }
        // OCR runs here only for the combined OCR-and-text strategy.
        if (config.getOcrStrategy().equals(PDFParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION)) {
            doOCROnCurrentPage();
        }

        // Report the page's additional actions: getC() with PAGE_CLOSE, getO() with PAGE_OPEN.
        PDPageAdditionalActions pageActions = page.getActions();
        if (pageActions != null) {
            handleDestinationOrAction(pageActions.getC(), ActionTrigger.PAGE_CLOSE);
            handleDestinationOrAction(pageActions.getO(), ActionTrigger.PAGE_OPEN);
        }
        // Closes a div that is not opened in this method (presumably the page div — confirm).
        xhtml.endElement("div");
    } catch (SAXException | TikaException e) {
        throw new IOExceptionWithCause("Unable to end a page", e);
    } catch (IOException e) {
        // I/O problems are collected in the exceptions field rather than rethrown.
        exceptions.add(e);
    } finally {
        pageIndex++;
    }
}

From source file:org.apache.tika.parser.pdf.AbstractPDF2XHTML.java

/**
 * Reports one PDF action or destination.
 * <p>
 * Nothing happens when {@code action} is {@code null} or {@code config.getExtractActions()}
 * is false. Otherwise the attributes {@code class} ("action" or "destination"), {@code type}
 * (simple class name of {@code action}) and {@code trigger} are collected, then:
 * <ul>
 *   <li>import-data, launch and remote-go-to actions hand their file to {@code processDoc};
 *       launch actions first add {@code id}, {@code defaultDirectory}, {@code operation}
 *       and {@code parameters};</li>
 *   <li>JavaScript actions offer their script (UTF-8 bytes, empty when absent) to the
 *       embedded document extractor and then emit an empty {@code div};</li>
 *   <li>anything else emits an empty {@code div} carrying the collected attributes.</li>
 * </ul>
 *
 * @param action        the action or destination to report; may be {@code null}
 * @param actionTrigger the event the action is bound to
 * @throws IOException   propagated from {@code processDoc} or the embedded extractor
 * @throws SAXException  propagated from the XHTML handler or the embedded extractor
 * @throws TikaException propagated from {@code processDoc}
 */
private void handleDestinationOrAction(PDDestinationOrAction action, ActionTrigger actionTrigger)
        throws IOException, SAXException, TikaException {
    boolean shouldReport = action != null && config.getExtractActions();
    if (!shouldReport) {
        return;
    }

    AttributesImpl divAttrs = new AttributesImpl();
    String kind;
    if (action instanceof PDAction) {
        kind = "action";
    } else {
        kind = "destination";
    }

    addNonNullAttribute("class", kind, divAttrs);
    addNonNullAttribute("type", action.getClass().getSimpleName(), divAttrs);
    addNonNullAttribute("trigger", actionTrigger.name(), divAttrs);

    // File-bearing actions are delegated to processDoc and emit nothing here.
    if (action instanceof PDActionImportData) {
        PDActionImportData importData = (PDActionImportData) action;
        processDoc("", importData.getFile(), divAttrs);
        return;
    }
    if (action instanceof PDActionLaunch) {
        PDActionLaunch launch = (PDActionLaunch) action;
        addNonNullAttribute("id", launch.getF(), divAttrs);
        addNonNullAttribute("defaultDirectory", launch.getD(), divAttrs);
        addNonNullAttribute("operation", launch.getO(), divAttrs);
        addNonNullAttribute("parameters", launch.getP(), divAttrs);
        processDoc(launch.getF(), launch.getFile(), divAttrs);
        return;
    }
    if (action instanceof PDActionRemoteGoTo) {
        PDActionRemoteGoTo goTo = (PDActionRemoteGoTo) action;
        processDoc("", goTo.getFile(), divAttrs);
        return;
    }

    if (action instanceof PDActionJavaScript) {
        PDActionJavaScript scriptAction = (PDActionJavaScript) action;

        Metadata scriptMeta = new Metadata();
        scriptMeta.set(Metadata.CONTENT_TYPE, "application/javascript");
        scriptMeta.set(Metadata.CONTENT_ENCODING, StandardCharsets.UTF_8.toString());
        scriptMeta.set(PDF.ACTION_TRIGGER, actionTrigger.toString());
        scriptMeta.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
                TikaCoreProperties.EmbeddedResourceType.MACRO.name());

        String script = scriptAction.getAction();
        if (script == null) {
            script = "";
        }
        if (embeddedDocumentExtractor.shouldParseEmbedded(scriptMeta)) {
            try (InputStream scriptStream = TikaInputStream.get(script.getBytes(StandardCharsets.UTF_8))) {
                embeddedDocumentExtractor.parseEmbedded(scriptStream, xhtml, scriptMeta, false);
            }
        }

        // NOTE(review): "class" and "type" were already added above; whether these calls
        // replace or duplicate them depends on addNonNullAttribute, not visible here.
        addNonNullAttribute("class", "javascript", divAttrs);
        addNonNullAttribute("type", scriptAction.getType(), divAttrs);
        addNonNullAttribute("subtype", scriptAction.getSubType(), divAttrs);
    }

    // JavaScript actions and every remaining action kind end with an empty div.
    xhtml.startElement("div", divAttrs);
    xhtml.endElement("div");
}

From source file:org.apache.tika.parser.pdf.AbstractPDF2XHTML.java

/**
 * Writes one form field as an {@code li} element on {@code xhtml}.
 * <p>
 * The element text is {@code "<partialName>: <value>"} (each piece only when present; a value
 * equal to the string {@code "null"} is skipped) and the alternate field name, when present,
 * becomes the {@code altName} attribute. The fully qualified name is ignored. Signature
 * fields are delegated to {@code handleSignature} instead. Nothing is written when there is
 * neither an attribute nor any text.
 *
 * @param field the form field to render
 * @throws SAXException if the XHTML handler fails
 */
private void addFieldString(PDField field) throws SAXException {
    String partialName = field.getPartialName();
    String alternateName = field.getAlternateFieldName();

    AttributesImpl itemAttrs = new AttributesImpl();
    StringBuilder text = new StringBuilder();

    if (partialName != null) {
        text.append(partialName);
        text.append(": ");
    }
    if (alternateName != null) {
        itemAttrs.addAttribute("", "altName", "altName", "CDATA", alternateName);
    }

    // Signature fields have their own rendering and never reach the plain-value path.
    if (field instanceof PDSignatureField) {
        PDSignatureField signatureField = (PDSignatureField) field;
        handleSignature(itemAttrs, signatureField);
        return;
    }

    String fieldValue = field.getValueAsString();
    boolean usableValue = fieldValue != null && !"null".equals(fieldValue);
    if (usableValue) {
        text.append(fieldValue);
    }

    boolean hasOutput = itemAttrs.getLength() > 0 || text.length() > 0;
    if (!hasOutput) {
        return;
    }
    xhtml.startElement("li", itemAttrs);
    xhtml.characters(text.toString());
    xhtml.endElement("li");
}

From source file:org.apache.tika.parser.pdf.AbstractPDF2XHTML.java

/**
 * Writes a signature field as an {@code li} containing an {@code ol type="signaturedata"}
 * with one {@code li signdata="<key>"} per available signature value
 * (name, contactInfo, location, reason, date).
 * <p>
 * Nothing is written when the field has no signature or when every signature value is
 * {@code null} or empty. The emptiness check inspects the map's <em>values</em>; checking
 * the keys (fixed, non-empty literals) would always succeed and produce an empty list.
 *
 * @param parentAttributes attributes for the outer {@code li}
 * @param sigField         the signature field to render
 * @throws SAXException if the XHTML handler fails
 */
private void handleSignature(AttributesImpl parentAttributes, PDSignatureField sigField) throws SAXException {

    PDSignature sig = sigField.getSignature();
    if (sig == null) {
        return;
    }
    // TreeMap keeps the output order stable (sorted by key) and accepts null values.
    Map<String, String> vals = new TreeMap<>();
    vals.put("name", sig.getName());
    vals.put("contactInfo", sig.getContactInfo());
    vals.put("location", sig.getLocation());
    vals.put("reason", sig.getReason());

    Calendar cal = sig.getSignDate();
    if (cal != null) {
        // Format the date in the signature's own time zone; this mutates the dateFormat field.
        dateFormat.setTimeZone(cal.getTimeZone());
        vals.put("date", dateFormat.format(cal.getTime()));
    }

    //see if there is any data
    boolean hasData = false;
    for (String val : vals.values()) {
        if (val != null && !val.isEmpty()) {
            hasData = true;
            break;
        }
    }
    if (!hasData) {
        return;
    }

    //if there is, process it
    xhtml.startElement("li", parentAttributes);

    AttributesImpl attrs = new AttributesImpl();
    attrs.addAttribute("", "type", "type", "CDATA", "signaturedata");

    xhtml.startElement("ol", attrs);
    for (Map.Entry<String, String> e : vals.entrySet()) {
        String val = e.getValue();
        if (val == null || val.isEmpty()) {
            continue;
        }
        attrs = new AttributesImpl();
        attrs.addAttribute("", "signdata", "signdata", "CDATA", e.getKey());
        xhtml.startElement("li", attrs);
        xhtml.characters(val);
        xhtml.endElement("li");
    }
    xhtml.endElement("ol");
    xhtml.endElement("li");
}

From source file:org.apache.tika.parser.pdf.PDF2XHTML.java

/**
 * Walks the XObjects of {@code resources}, emitting an {@code img} element for every image
 * XObject and handing the image bytes to the embedded document extractor. Form XObjects are
 * searched recursively.
 * <p>
 * Does nothing when {@code resources} is {@code null} or inline image extraction is disabled
 * in {@code config}.
 *
 * @param resources    resource dictionary to scan; may be {@code null}
 * @param seenThisPage COS objects already visited; updated by this method and used to skip
 *                     objects that were seen before
 * @throws SAXException if the XHTML handler or the embedded extractor fails
 * @throws IOException  declared by the method; IOExceptions raised while writing or parsing
 *                      a single image are swallowed inside the loop
 */
private void extractImages(PDResources resources, Set<COSBase> seenThisPage) throws SAXException, IOException {
    if (resources == null || config.getExtractInlineImages() == false) {
        return;
    }

    for (COSName name : resources.getXObjectNames()) {

        PDXObject object = resources.getXObject(name);
        if (object == null) {
            continue;
        }
        COSBase cosObject = object.getCOSObject();
        if (seenThisPage.contains(cosObject)) {
            //avoid infinite recursion TIKA-1742
            continue;
        }
        seenThisPage.add(cosObject);

        if (object instanceof PDFormXObject) {
            // Forms carry their own resources; descend with the same visited set.
            extractImages(((PDFormXObject) object).getResources(), seenThisPage);
        } else if (object instanceof PDImageXObject) {

            PDImageXObject image = (PDImageXObject) object;

            // Derive the content type and file extension from the image suffix:
            // no suffix -> png, "jpg" -> image/jpeg, "tiff" -> image/tiff with extension "tif".
            // Any other suffix keeps its extension and gets no content type.
            Metadata metadata = new Metadata();
            String extension = image.getSuffix();
            if (extension == null) {
                metadata.set(Metadata.CONTENT_TYPE, "image/png");
                extension = "png";
            } else if (extension.equals("jpg")) {
                metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
            } else if (extension.equals("tiff")) {
                metadata.set(Metadata.CONTENT_TYPE, "image/tiff");
                extension = "tif";
            } else {
                //TODO: determine if we need to add more image types
                //throw new RuntimeException("EXTEN:" + extension);
            }

            // Reuse the number recorded for this XObject name, otherwise take the next counter
            // value (the counter advances even if the image is later skipped or fails).
            Integer imageNumber = processedInlineImages.get(name.getName());
            if (imageNumber == null) {
                imageNumber = inlineImageCounter++;
            }
            String fileName = "image" + imageNumber + "." + extension;
            metadata.set(Metadata.RESOURCE_NAME_KEY, fileName);

            // Output the img tag
            // (emitted before the uniqueness check below, so repeated images still get a tag)
            AttributesImpl attr = new AttributesImpl();
            attr.addAttribute("", "src", "src", "CDATA", "embedded:" + fileName);
            attr.addAttribute("", "alt", "alt", "CDATA", fileName);
            handler.startElement("img", attr);
            handler.endElement("img");

            //Do we only want to process unique COSObject ids?
            //If so, have we already processed this one?
            // NOTE(review): the key is the XObject's resource name (name.getName()), not an id
            // of the COS object itself — confirm that names are unique enough for this purpose.
            if (config.getExtractUniqueInlineImagesOnly() == true) {
                String cosObjectId = name.getName();
                if (processedInlineImages.containsKey(cosObjectId)) {
                    continue;
                }
                processedInlineImages.put(cosObjectId, imageNumber);
            }

            metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
                    TikaCoreProperties.EmbeddedResourceType.INLINE.toString());

            EmbeddedDocumentExtractor extractor = getEmbeddedDocumentExtractor();
            if (extractor.shouldParseEmbedded(metadata)) {
                // The image is rendered into memory first, then parsed from that buffer.
                ByteArrayOutputStream buffer = new ByteArrayOutputStream();
                try {
                    //TODO: handle image.getMetadata()?
                    writeToBuffer(image, extension, buffer);
                    extractor.parseEmbedded(new ByteArrayInputStream(buffer.toByteArray()),
                            new EmbeddedContentHandler(handler), metadata, false);
                } catch (IOException e) {
                    // could not extract this image, so just skip it...
                }
            }
        }
    }
}

From source file:org.apache.tika.parser.pdf.PDF2XHTML.java

/**
 * Hands one embedded PDF file to {@code extractor} and, after a successful parse, emits an
 * empty {@code div class="embedded" id="<fileName>"} on {@code handler}.
 * <p>
 * A {@code null} file is skipped silently. The resource name is {@code fileName}, or
 * {@code defaultName} when {@code fileName} is {@code null}. The input stream is always
 * closed quietly, whether or not parsing succeeds.
 *
 * @param defaultName name used when {@code fileName} is {@code null}
 * @param fileName    preferred resource name; may be {@code null}
 * @param file        the embedded file; may be {@code null}
 * @param extractor   extractor that decides whether to parse and performs the parse
 * @throws SAXException  propagated from the extractor or the handler
 * @throws IOException   propagated from opening or parsing the stream
 * @throws TikaException declared by the method signature
 */
private void extractPDEmbeddedFile(String defaultName, String fileName, PDEmbeddedFile file,
        EmbeddedDocumentExtractor extractor) throws SAXException, IOException, TikaException {

    //skip silently
    if (file == null) {
        return;
    }

    if (fileName == null) {
        fileName = defaultName;
    }

    // TODO: other metadata?
    Metadata embeddedMeta = new Metadata();
    embeddedMeta.set(Metadata.RESOURCE_NAME_KEY, fileName);
    embeddedMeta.set(Metadata.CONTENT_TYPE, file.getSubtype());
    embeddedMeta.set(Metadata.CONTENT_LENGTH, Long.toString(file.getSize()));
    embeddedMeta.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
            TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString());

    if (!extractor.shouldParseEmbedded(embeddedMeta)) {
        return;
    }

    TikaInputStream in = null;
    try {
        in = TikaInputStream.get(file.createInputStream());
        extractor.parseEmbedded(in, new EmbeddedContentHandler(handler), embeddedMeta, false);

        // Marker element written only after the embedded parse returned normally.
        AttributesImpl markerAttrs = new AttributesImpl();
        markerAttrs.addAttribute("", "class", "class", "CDATA", "embedded");
        markerAttrs.addAttribute("", "id", "id", "CDATA", fileName);
        handler.startElement("div", markerAttrs);
        handler.endElement("div");
    } finally {
        IOUtils.closeQuietly(in);
    }
}

From source file:org.apache.tika.parser.pdf.PDF2XHTML.java

/**
 * Writes one form field as an {@code li} element on {@code handler}.
 * <p>
 * The element text is {@code "<partialName>: <value>"} (each piece only when present; a value
 * equal to the string {@code "null"} is skipped) and the alternate field name, when present,
 * becomes the {@code altName} attribute. The fully qualified name is ignored. Signature
 * fields are delegated to {@code handleSignature} instead. Nothing is written when there is
 * neither an attribute nor any text.
 *
 * @param field   the form field to render
 * @param handler XHTML handler that receives the output
 * @throws SAXException if the handler fails
 */
private void addFieldString(PDField field, XHTMLContentHandler handler) throws SAXException {
    String partialName = field.getPartialName();
    String alternateName = field.getAlternateFieldName();

    AttributesImpl itemAttrs = new AttributesImpl();
    StringBuilder text = new StringBuilder();

    if (partialName != null) {
        text.append(partialName);
        text.append(": ");
    }
    if (alternateName != null) {
        itemAttrs.addAttribute("", "altName", "altName", "CDATA", alternateName);
    }

    // Signature fields have their own rendering and never reach the plain-value path.
    if (field instanceof PDSignatureField) {
        PDSignatureField signatureField = (PDSignatureField) field;
        handleSignature(itemAttrs, signatureField, handler);
        return;
    }

    String fieldValue = field.getValueAsString();
    boolean usableValue = fieldValue != null && !"null".equals(fieldValue);
    if (usableValue) {
        text.append(fieldValue);
    }

    boolean hasOutput = itemAttrs.getLength() > 0 || text.length() > 0;
    if (!hasOutput) {
        return;
    }
    handler.startElement("li", itemAttrs);
    handler.characters(text.toString());
    handler.endElement("li");
}

From source file:org.apache.tika.parser.pdf.PDF2XHTML.java

/**
 * Writes a signature field as an {@code li} containing an {@code ol type="signaturedata"}
 * with one {@code li signdata="<key>"} per available signature value
 * (name, contactInfo, location, reason, date).
 * <p>
 * Nothing is written when the field has no signature or when every signature value is
 * {@code null} or empty. The emptiness check inspects the map's <em>values</em>; checking
 * the keys (fixed, non-empty literals) would always succeed and produce an empty list.
 *
 * @param parentAttributes attributes for the outer {@code li}
 * @param sigField         the signature field to render
 * @param handler          XHTML handler that receives the output
 * @throws SAXException if the handler fails
 */
private void handleSignature(AttributesImpl parentAttributes, PDSignatureField sigField,
        XHTMLContentHandler handler) throws SAXException {

    PDSignature sig = sigField.getSignature();
    if (sig == null) {
        return;
    }
    // TreeMap keeps the output order stable (sorted by key) and accepts null values.
    Map<String, String> vals = new TreeMap<>();
    vals.put("name", sig.getName());
    vals.put("contactInfo", sig.getContactInfo());
    vals.put("location", sig.getLocation());
    vals.put("reason", sig.getReason());

    Calendar cal = sig.getSignDate();
    if (cal != null) {
        // Format the date in the signature's own time zone; this mutates the dateFormat field.
        dateFormat.setTimeZone(cal.getTimeZone());
        vals.put("date", dateFormat.format(cal.getTime()));
    }

    //see if there is any data
    boolean hasData = false;
    for (String val : vals.values()) {
        if (val != null && !val.isEmpty()) {
            hasData = true;
            break;
        }
    }
    if (!hasData) {
        return;
    }

    //if there is, process it
    handler.startElement("li", parentAttributes);

    AttributesImpl attrs = new AttributesImpl();
    attrs.addAttribute("", "type", "type", "CDATA", "signaturedata");

    handler.startElement("ol", attrs);
    for (Map.Entry<String, String> e : vals.entrySet()) {
        String val = e.getValue();
        if (val == null || val.isEmpty()) {
            continue;
        }
        attrs = new AttributesImpl();
        attrs.addAttribute("", "signdata", "signdata", "CDATA", e.getKey());
        handler.startElement("li", attrs);
        handler.characters(val);
        handler.endElement("li");
    }
    handler.endElement("ol");
    handler.endElement("li");
}