org.opencrx.application.uses.com.auxilii.msgparser.MsgParser.java Source code

Java tutorial

Introduction

Here is the source code for org.opencrx.application.uses.com.auxilii.msgparser.MsgParser.java

Source

/*
 * msgparser - http://auxilii.com/msgparser
 * Copyright (C) 2007  Roman Kurmanowytsch
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see http://www.gnu.org/licenses/.
 */
package org.opencrx.application.uses.com.auxilii.msgparser;

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Iterator;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.poi.poifs.filesystem.DirectoryEntry;
import org.apache.poi.poifs.filesystem.DocumentEntry;
import org.apache.poi.poifs.filesystem.DocumentInputStream;
import org.apache.poi.poifs.filesystem.Entry;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.opencrx.application.uses.com.auxilii.msgparser.attachment.Attachment;
import org.opencrx.application.uses.com.auxilii.msgparser.attachment.FileAttachment;
import org.opencrx.application.uses.com.auxilii.msgparser.attachment.MsgAttachment;

/**
 * Main parser class that does the actual
 * parsing of the Outlook .msg file. It uses the
 * <a href="http://poi.apache.org/poifs/">POI</a>
 * library for parsing the .msg container file
 * and is based on a description posted by
 * Peter Fiskerstrand at
 * <a href="http://www.fileformat.info/format/outlookmsg/">fileformat.info</a>.
 * <br /><br />
 * It parses the .msg file and stores the information
 * in a {@link Message} object. Attachments are
 * put into a {@link FileAttachment} object. Hence, please
 * keep in mind that the complete mail is held in memory!
 * If an attachment is another .msg file, this
 * attachment is not processed as a normal attachment
 * but rather included as a {@link MsgAttachment}. This
 * attached mail is, again, a {@link Message} object
 * and may have further attachments and so on.
 * <br /><br />
 * Note: this code has not been tested on a wide
 * range of .msg files. Use in production level
 * (as in any other level) at your own risk.
 * <br /><br />
 * Usage:
 * <br /><br />
 * <code>
 * MsgParser msgp = new MsgParser();<br />
 * Message msg = msgp.parseMsg("test.msg");
 * </code>
 * @author roman.kurmanowytsch
 */
public class MsgParser {
    protected static final Logger logger = Logger.getLogger(MsgParser.class.getName());

    /** Name prefix of directory entries that describe an attachment. */
    private static final String ATTACHMENT_PREFIX = "__attach_version1.0";

    /** Name prefix of directory entries that describe a recipient. */
    private static final String RECIPIENT_PREFIX = "__recip_version1.0";

    /** Name prefix of document entries that carry a property value. */
    private static final String PROPERTY_PREFIX = "__substg1.";

    /**
     * Offset into a property entry name at which the 4-character class starts:
     * the prefix plus two further characters that are skipped.
     */
    private static final int CLASS_OFFSET = PROPERTY_PREFIX.length() + 2;

    /** Number of characters that make up the field class in an entry name. */
    private static final int CLASS_LENGTH = 4;

    /** Field type (lower case) whose data is decoded as ISO-8859-1 text. */
    private static final String TYPE_STRING8 = "001e";

    /** Field type (lower case) whose data is decoded as little-endian 16-bit characters. */
    private static final String TYPE_UNICODE = "001f";

    /** Field type (lower case) whose data is returned as raw bytes. */
    private static final String TYPE_BINARY = "0102";

    /**
     * Empty constructor.
     */
    public MsgParser() {
    }

    /**
     * Parses a .msg file provided in the specified file.
     * The file stream opened for reading is closed before this method returns.
     *
     * @param msgFile The .msg file.
     * @return A {@link Message} object representing the .msg file.
     * @throws IOException Thrown if the file could not be loaded or parsed.
     * @throws UnsupportedOperationException Thrown if the .msg file cannot
     *   be parsed correctly.
     */
    public Message parseMsg(File msgFile) throws IOException, UnsupportedOperationException {
        return this.parseMsg(new FileInputStream(msgFile), true);
    }

    /**
     * Parses a .msg file provided in the specified file.
     * The file stream opened for reading is closed before this method returns.
     *
     * @param msgFile The .msg file as a String path.
     * @return A {@link Message} object representing the .msg file.
     * @throws IOException Thrown if the file could not be loaded or parsed.
     * @throws UnsupportedOperationException Thrown if the .msg file cannot
     *   be parsed correctly.
     */
    public Message parseMsg(String msgFile) throws IOException, UnsupportedOperationException {
        return this.parseMsg(new FileInputStream(msgFile), true);
    }

    /**
     * Parses a .msg file provided by an input stream.
     * The stream is closed after the message has been read.
     *
     * @param msgFileStream The .msg file as a InputStream.
     * @return A {@link Message} object representing the .msg file.
     * @throws IOException Thrown if the file could not be loaded or parsed.
     * @throws UnsupportedOperationException Thrown if the .msg file cannot
     *   be parsed correctly.
     */
    public Message parseMsg(InputStream msgFileStream) throws IOException, UnsupportedOperationException {
        return this.parseMsg(msgFileStream, true);
    }

    /**
     * Parses a .msg file provided by an input stream.
     *
     * @param msgFileStream The .msg file as a InputStream.
     * @param closeStream Indicates whether the provided stream should
     *   be closed after the message has been read. Closing is attempted
     *   even when parsing fails; a failure to close is logged and
     *   otherwise ignored.
     * @return A {@link Message} object representing the .msg file.
     * @throws IOException Thrown if the file could not be loaded or parsed.
     * @throws UnsupportedOperationException Thrown if the .msg file cannot
     *   be parsed correctly.
     */
    public Message parseMsg(InputStream msgFileStream, boolean closeStream)
            throws IOException, UnsupportedOperationException {
        // the .msg file, like a file system, contains directories
        // and documents within these directories. we gain access to
        // the root node and recursively walk the complete 'filesystem'.
        try {
            POIFSFileSystem fs = new POIFSFileSystem(msgFileStream);
            Message msg = new Message();
            this.checkDirectoryEntry(fs.getRoot(), msg);
            return msg;
        } finally {
            if (closeStream && msgFileStream != null) {
                try {
                    msgFileStream.close();
                } catch (Exception e) {
                    // closing is best effort: a failure here must not hide the
                    // parse result (or the parse error), so it is only logged
                    logger.log(Level.FINE, "Could not close the .msg input stream", e);
                }
            }
        }
    }

    /**
     * Recursively parses the complete .msg file with the
     * help of the POI library. The parsed information is
     * put into the {@link Message} object.
     *
     * @param dir The current node in the .msg file.
     * @param msg The resulting {@link Message} object.
     * @throws IOException Thrown if the .msg file could not
     *  be parsed.
     * @throws UnsupportedOperationException Thrown if
     *  the .msg file contains unknown data.
     */
    protected void checkDirectoryEntry(DirectoryEntry dir, Message msg)
            throws IOException, UnsupportedOperationException {

        // we iterate through all entries in the current directory
        for (Iterator<?> iter = dir.getEntries(); iter.hasNext();) {
            Entry entry = (Entry) iter.next();

            if (entry.isDirectoryEntry()) {
                DirectoryEntry de = (DirectoryEntry) entry;
                String name = de.getName();

                if (name.startsWith(ATTACHMENT_PREFIX)) {
                    // attachments have a special name and are handled separately
                    this.parseAttachment(de, msg);
                } else if (name.startsWith(RECIPIENT_PREFIX)) {
                    // a recipient entry (which is also a directory entry itself)
                    this.checkRecipientDirectoryEntry(de, msg);
                } else {
                    // any other directory is checked recursively
                    this.checkDirectoryEntry(de, msg);
                }

            } else if (entry.isDocumentEntry()) {
                // a document entry contains information about
                // the mail (e.g, from, to, subject, ...)
                DocumentEntry de = (DocumentEntry) entry;
                FieldInformation info = this.analyzeDocumentEntry(de);
                Object data = this.readDocumentData(de, info);

                logDocumentData(data);
                // the data is written into the Message object
                msg.setProperty(info.getClazz(), data);
            }
            // any other entry type is not supported and silently skipped
        }
    }

    /**
     * Parses a recipient directory entry which holds information about one of possibly multiple recipients.
     * The parsed information is put into a new {@link RecipientEntry} which is
     * then added to the {@link Message} object.
     *
     * @param dir The current node in the .msg file.
     * @param msg The resulting {@link Message} object.
     * @throws IOException Thrown if the .msg file could not
     *  be parsed.
     * @throws UnsupportedOperationException Thrown if
     *  the .msg file contains unknown data.
     */
    protected void checkRecipientDirectoryEntry(DirectoryEntry dir, Message msg)
            throws IOException, UnsupportedOperationException {

        RecipientEntry recipient = new RecipientEntry();

        // we iterate through all entries in the current directory.
        // only document entries are of interest on this level;
        // sub directories are not expected within a recipient entry
        for (Iterator<?> iter = dir.getEntries(); iter.hasNext();) {
            Entry entry = (Entry) iter.next();
            if (!entry.isDocumentEntry()) {
                continue;
            }

            DocumentEntry de = (DocumentEntry) entry;
            FieldInformation info = this.analyzeDocumentEntry(de);
            Object data = this.readDocumentData(de, info);

            logDocumentData(data);
            recipient.setProperty(info.getClazz(), data);
        }

        // after all properties are set -> add recipient to msg object
        msg.addRecipient(recipient);
    }

    /**
     * Reads the information from the InputStream and
     * creates, based on the information in the
     * {@link FieldInformation} object, either a String
     * or a byte[] (e.g., for attachments) Object
     * containing this data. The stream is not closed by this method.
     *
     * @param dstream The InputStream of the Document Entry.
     * @param info The field information that is needed to
     *  determine the data type of the input stream.
     * @return The String/byte[] object representing the data, or
     *  <code>null</code> if there is no field information, the type is
     *  {@link FieldInformation#UNKNOWN} or the type is not supported.
     * @throws IOException Thrown if the .msg file could not
     *  be parsed.
     */
    protected Object getData(DocumentInputStream dstream, FieldInformation info) throws IOException {
        // if there is no field information available, we simply
        // return null. in that case, we're not interested in the
        // data anyway
        if (info == null) {
            return null;
        }
        String type = info.getType();
        // compare by value, not by reference, and tolerate a missing type
        if ((type == null) || type.equals(FieldInformation.UNKNOWN)) {
            return null;
        }

        // the type is lower case because analyzeDocumentEntry
        // stores it in lower case
        if (TYPE_STRING8.equals(type)) {
            // the encoding of the binary data is most probably
            // ISO-8859-1 (not pure ASCII)
            return new String(readFully(dstream), "ISO-8859-1");
        }
        if (TYPE_UNICODE.equals(type)) {
            // Unicode encoding with lowbyte followed by hibyte
            return decodeLittleEndianChars(readFully(dstream));
        }
        if (TYPE_BINARY.equals(type)) {
            return readFully(dstream);
        }

        // other data types are not yet supported
        logger.fine("Unknown field type " + type);
        return null;
    }

    /**
     * Analyzes the {@link DocumentEntry} and returns
     * a {@link FieldInformation} object containing the
     * class (the field name, so to say) and type of
     * the entry.
     *
     * @param de The {@link DocumentEntry} that should be examined.
     * @return A {@link FieldInformation} object containing class
     *  and type of the document entry or, if the entry is
     *  not an interesting field, an empty {@link FieldInformation}
     *  object. If the entry name has the expected prefix but is too
     *  short to hold a class, both class and type are
     *  {@link FieldInformation#UNKNOWN}.
     */
    protected FieldInformation analyzeDocumentEntry(DocumentEntry de) {
        String name = de.getName();
        logger.finest("Document entry: " + name);

        // we are only interested in document entries
        // with names starting with __substg1.
        if (!name.startsWith(PROPERTY_PREFIX)) {
            logger.finest("Ignoring entry with name " + name);
            // we are not interested in the field
            // and return an empty FieldInformation object
            return new FieldInformation();
        }

        // the name must be long enough to hold the skipped characters
        // and the 4 characters of the field class
        if (name.length() < CLASS_OFFSET + CLASS_LENGTH) {
            logger.fine("Could not parse directory entry " + name);
            return new FieldInformation(FieldInformation.UNKNOWN, FieldInformation.UNKNOWN);
        }

        // the first 4 characters of the remainder define the field
        // class (or field name) and everything after them is the data type
        String val = name.substring(CLASS_OFFSET).toLowerCase();
        String clazz = val.substring(0, CLASS_LENGTH);
        String type = val.substring(CLASS_LENGTH);
        logger.finest("  Found document entry: class=" + clazz + ", type=" + type);
        return new FieldInformation(clazz, type);
    }

    /**
     * Creates an {@link Attachment} object based on
     * the given directory entry. The entry may either
     * point to an attached file or to an
     * attached .msg file, which will be added
     * as a {@link MsgAttachment} object instead.
     *
     * @param dir The directory entry containing the attachment
     *  document entry and some other document entries
     *  describing the attachment (name, extension, mime type, ...)
     * @param msg The {@link Message} object that this
     *  attachment should be added to.
     * @throws IOException Thrown if the attachment could
     *  not be parsed/read.
     */
    protected void parseAttachment(DirectoryEntry dir, Message msg) throws IOException {

        FileAttachment attachment = new FileAttachment();

        for (Iterator<?> iter = dir.getEntries(); iter.hasNext();) {
            Entry entry = (Entry) iter.next();

            if (entry.isDocumentEntry()) {
                // the document entry may contain information
                // about the attachment
                DocumentEntry de = (DocumentEntry) entry;
                FieldInformation info = this.analyzeDocumentEntry(de);
                Object data = this.readDocumentData(de, info);

                // we provide the class and data of the document
                // entry to the attachment. the attachment implementation
                // has to know the semantics of the field names
                attachment.setProperty(info.getClazz(), data, de);

            } else if (entry.isDirectoryEntry()) {
                // a directory within the attachment directory
                // entry means that a .msg file is attached
                // at this point. we recursively parse
                // this .msg file and add it as a MsgAttachment
                // object to the current Message object.
                Message attachmentMsg = new Message();
                MsgAttachment msgAttachment = new MsgAttachment();
                msgAttachment.setMessage(attachmentMsg);
                msg.addAttachment(msgAttachment);

                this.checkDirectoryEntry((DirectoryEntry) entry, attachmentMsg);
            }
            // any other entry type is not supported and silently skipped
        }

        // only if there was really an attachment, we
        // add this object to the Message object
        if (attachment.getSize() > -1) {
            msg.addAttachment(attachment);
        }
    }

    /**
     * Opens the given document entry, converts its content with
     * {@link #getData(DocumentInputStream, FieldInformation)} and closes
     * the stream again, even if reading fails.
     */
    private Object readDocumentData(DocumentEntry de, FieldInformation info) throws IOException {
        DocumentInputStream dstream = new DocumentInputStream(de);
        try {
            return this.getData(dstream, info);
        } finally {
            dstream.close();
        }
    }

    /** Reads the stream until it reports no more data and returns all bytes read. */
    private static byte[] readFully(InputStream in) throws IOException {
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        byte[] buffer = new byte[1024];
        int read;
        while ((read = in.read(buffer)) > 0) {
            baos.write(buffer, 0, read);
        }
        return baos.toByteArray();
    }

    /**
     * Converts pairs of bytes (low byte first, high byte second) into
     * characters. A trailing single byte is ignored.
     */
    private static String decodeLittleEndianChars(byte[] bytes) {
        char[] characters = new char[bytes.length / 2];
        for (int c = 0; c < characters.length; c++) {
            // mask both bytes so that neither is sign-extended
            int low = bytes[2 * c] & 0xff;
            int high = bytes[2 * c + 1] & 0xff;
            characters[c] = (char) ((high << 8) | low);
        }
        return new String(characters);
    }

    /** Logs the parsed data at FINEST, building the message only if that level is enabled. */
    private static void logDocumentData(Object data) {
        if (logger.isLoggable(Level.FINEST)) {
            logger.finest("  Document data: " + ((data == null) ? "null" : data.toString()));
        }
    }
}