List of usage examples for org.apache.poi.hmef.Attachment.getFilename()
public String getFilename()
From source file:mj.ocraptor.extraction.tika.parser.microsoft.TNEFParser.java
License:Apache License
/** * Extracts properties and text from an MS Document input stream *//*from w w w .j a v a 2 s. com*/ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { // We work by recursing, so get the appropriate bits EmbeddedDocumentExtractor ex = context.get(EmbeddedDocumentExtractor.class); EmbeddedDocumentExtractor embeddedExtractor; if (ex == null) { embeddedExtractor = new ParsingEmbeddedDocumentExtractor(context); } else { embeddedExtractor = ex; } // Ask POI to process the file for us HMEFMessage msg = new HMEFMessage(stream); // Set the message subject if known String subject = msg.getSubject(); if (subject != null && subject.length() > 0) { // TODO: Move to title in Tika 2.0 metadata.set(TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_TITLE, subject); } // Recurse into the message body RTF MAPIAttribute attr = msg.getMessageMAPIAttribute(MAPIProperty.RTF_COMPRESSED); if (attr != null && attr instanceof MAPIRtfAttribute) { MAPIRtfAttribute rtf = (MAPIRtfAttribute) attr; handleEmbedded("message.rtf", "application/rtf", rtf.getData(), embeddedExtractor, handler); } // Recurse into each attachment in turn for (Attachment attachment : msg.getAttachments()) { String name = attachment.getLongFilename(); if (name == null || name.length() == 0) { name = attachment.getFilename(); } if (name == null || name.length() == 0) { String ext = attachment.getExtension(); if (ext != null) { name = "unknown" + ext; } } handleEmbedded(name, null, attachment.getContents(), embeddedExtractor, handler); } }
From source file:org.apache.nifi.processors.email.ExtractTNEFAttachments.java
License:Apache License
@Override public void onTrigger(final ProcessContext context, final ProcessSession session) { final ComponentLog logger = getLogger(); final FlowFile originalFlowFile = session.get(); if (originalFlowFile == null) { return;/*from w ww . jav a2 s .c o m*/ } final List<FlowFile> attachmentsList = new ArrayList<>(); final List<FlowFile> invalidFlowFilesList = new ArrayList<>(); final List<FlowFile> originalFlowFilesList = new ArrayList<>(); session.read(originalFlowFile, new InputStreamCallback() { @Override public void process(final InputStream rawIn) throws IOException { try (final InputStream in = new BufferedInputStream(rawIn)) { Properties props = new Properties(); HMEFMessage hmefMessage = null; // This will trigger an exception in case content is not a TNEF. hmefMessage = new HMEFMessage(in); // Add otiginal flowfile (may revert later on in case of errors) // originalFlowFilesList.add(originalFlowFile); if (hmefMessage != null) { // Attachments isn empty, proceeding. if (!hmefMessage.getAttachments().isEmpty()) { final String originalFlowFileName = originalFlowFile .getAttribute(CoreAttributes.FILENAME.key()); try { for (final Attachment attachment : hmefMessage.getAttachments()) { FlowFile split = session.create(originalFlowFile); final Map<String, String> attributes = new HashMap<>(); if (StringUtils.isNotBlank(attachment.getLongFilename())) { attributes.put(CoreAttributes.FILENAME.key(), attachment.getFilename()); } String parentUuid = originalFlowFile.getAttribute(CoreAttributes.UUID.key()); attributes.put(ATTACHMENT_ORIGINAL_UUID, parentUuid); attributes.put(ATTACHMENT_ORIGINAL_FILENAME, originalFlowFileName); // TODO: Extract Mime Type (HMEF doesn't seem to be able to get this info. 
split = session.append(split, new OutputStreamCallback() { @Override public void process(OutputStream out) throws IOException { out.write(attachment.getContents()); } }); split = session.putAllAttributes(split, attributes); attachmentsList.add(split); } } catch (FlowFileHandlingException e) { // Something went wrong // Removing splits that may have been created session.remove(attachmentsList); // Removing the original flow from its list originalFlowFilesList.remove(originalFlowFile); logger.error( "Flowfile {} triggered error {} while processing message removing generated FlowFiles from sessions", new Object[] { originalFlowFile, e }); invalidFlowFilesList.add(originalFlowFile); } } } } catch (Exception e) { // Another error hit... // Removing the original flow from its list originalFlowFilesList.remove(originalFlowFile); logger.error("Could not parse the flowfile {} as an email, treating as failure", new Object[] { originalFlowFile, e }); // Message is invalid or triggered an error during parsing invalidFlowFilesList.add(originalFlowFile); } } }); session.transfer(attachmentsList, REL_ATTACHMENTS); // As per above code, originalFlowfile may be routed to invalid or // original depending on RFC2822 compliance. session.transfer(invalidFlowFilesList, REL_FAILURE); session.transfer(originalFlowFilesList, REL_ORIGINAL); // check if attachments have been extracted if (attachmentsList.size() != 0) { if (attachmentsList.size() > 10) { // If more than 10, summarise log logger.info("Split {} into {} files", new Object[] { originalFlowFile, attachmentsList.size() }); } else { // Otherwise be more verbose and list each individual split logger.info("Split {} into {} files: {}", new Object[] { originalFlowFile, attachmentsList.size(), attachmentsList }); } } }
From source file:org.apache.tika.parser.microsoft.TNEFParser.java
License:Apache License
/** * Extracts properties and text from an MS Document input stream *///from w w w . j a va 2 s . c o m public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { // We work by recursing, so get the appropriate bits EmbeddedDocumentExtractor ex = context.get(EmbeddedDocumentExtractor.class); EmbeddedDocumentExtractor embeddedExtractor; if (ex == null) { embeddedExtractor = new ParsingEmbeddedDocumentExtractor(context); } else { embeddedExtractor = ex; } // Ask POI to process the file for us HMEFMessage msg = new HMEFMessage(stream); // Set the message subject if known String subject = msg.getSubject(); if (subject != null && subject.length() > 0) { // TODO: Move to title in Tika 2.0 metadata.set(TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_TITLE, subject); } // Recurse into the message body RTF MAPIAttribute attr = msg.getMessageMAPIAttribute(MAPIProperty.RTF_COMPRESSED); if (attr != null && attr instanceof MAPIRtfAttribute) { MAPIRtfAttribute rtf = (MAPIRtfAttribute) attr; handleEmbedded("message.rtf", "application/rtf", rtf.getData(), embeddedExtractor, handler); } // Recurse into each attachment in turn for (Attachment attachment : msg.getAttachments()) { String name = attachment.getLongFilename(); if (name == null || name.length() == 0) { name = attachment.getFilename(); } if (name == null || name.length() == 0) { String ext = attachment.getExtension(); if (ext != null) { name = "unknown" + ext; } } handleEmbedded(name, null, attachment.getContents(), embeddedExtractor, handler); } }