net.sf.mmm.content.parser.impl.poi.AbstractContentParserPoi.java Source code

Introduction

Here is the source code for net.sf.mmm.content.parser.impl.poi.AbstractContentParserPoi.java
Source

/* Copyright (c) The m-m-m Team, Licensed under the Apache License, Version 2.0
 * http://www.apache.org/licenses/LICENSE-2.0 */
package net.sf.mmm.content.parser.impl.poi;

import java.io.FileNotFoundException;
import java.io.InputStream;

import net.sf.mmm.content.parser.api.ContentParserOptions;
import net.sf.mmm.content.parser.base.AbstractContentParser;
import net.sf.mmm.util.context.api.MutableGenericContext;

import org.apache.poi.hpsf.PropertySetFactory;
import org.apache.poi.hpsf.SummaryInformation;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;

/**
 * This is the abstract base implementation of the
 * {@link net.sf.mmm.content.parser.api.ContentParser} interface for parsing
 * binary Microsoft office documents using apache POI.
 * <p>
 * The {@link #parse(InputStream, long, ContentParserOptions, MutableGenericContext) parse}
 * method opens the content as a POI filesystem, copies the title, author and
 * keywords of the summary information (if present) into the context and
 * delegates the actual text extraction to
 * {@link #extractText(POIFSFileSystem, long, ContentParserOptions)}.
 * 
 * @author Joerg Hohwiller (hohwille at users.sourceforge.net)
 */
public abstract class AbstractContentParserPoi extends AbstractContentParser {

    /** name of the entry for a word document in the POI filesystem */
    public static final String POIFS_WORD_DOC = "WordDocument";

    /** name of the entry for a powerpoint document in the POI filesystem */
    public static final String POIFS_POWERPOINT_DOC = "PowerPoint Document";

    /** name of the entry for a excel document in the POI filesystem */
    public static final String POIFS_EXCEL_DOC = "Workbook";

    /**
     * The constructor.
     */
    public AbstractContentParserPoi() {

        super();
    }

    /**
     * {@inheritDoc}
     * <p>
     * The metadata (title, creator, keywords) is only set if the document
     * contains a summary information entry and the according value is not
     * <code>null</code>. The text is always extracted, no matter whether
     * metadata is available or not. The given <code>inputStream</code> is NOT
     * closed by this method.
     */
    @Override
    public void parse(InputStream inputStream, long filesize, ContentParserOptions options,
            MutableGenericContext context) throws Exception {

        POIFSFileSystem poiFs = new POIFSFileSystem(inputStream);
        SummaryInformation summaryInfo = readSummaryInformation(poiFs);
        if (summaryInfo != null) {
            String title = summaryInfo.getTitle();
            if (title != null) {
                context.setVariable(VARIABLE_NAME_TITLE, title);
            }
            String author = summaryInfo.getAuthor();
            if (author != null) {
                context.setVariable(VARIABLE_NAME_CREATOR, author);
            }
            String keywords = summaryInfo.getKeywords();
            if (keywords != null) {
                context.setVariable(VARIABLE_NAME_KEYWORDS, keywords);
            }
        }
        context.setVariable(VARIABLE_NAME_TEXT, extractText(poiFs, filesize, options));
    }

    /**
     * This method reads the summary information from the given
     * <code>poiFs</code> and closes the according document stream afterwards.
     * 
     * @param poiFs is the POI filesystem of the office document.
     * @return the {@link SummaryInformation} or <code>null</code> if the
     *         document has no summary information entry or that entry does
     *         not contain a {@link SummaryInformation}.
     * @throws Exception if the summary information entry exists but could NOT
     *         be read.
     */
    private SummaryInformation readSummaryInformation(POIFSFileSystem poiFs) throws Exception {

        InputStream summaryStream;
        try {
            summaryStream = poiFs.createDocumentInputStream(SummaryInformation.DEFAULT_STREAM_NAME);
        } catch (FileNotFoundException ignored) {
            // The summary information is optional metadata: a document
            // without that entry can still have its text extracted.
            return null;
        }
        try {
            Object propertySet = PropertySetFactory.create(summaryStream);
            if (propertySet instanceof SummaryInformation) {
                return (SummaryInformation) propertySet;
            }
            // Some other kind of property set: treat as "no metadata" instead
            // of failing with a ClassCastException.
            return null;
        } finally {
            // try/finally instead of try-with-resources to stay compatible
            // with the (old) Java level this file is written for.
            summaryStream.close();
        }
    }

    /**
     * This method extracts the text from the office document given by
     * <code>poiFs</code>.
     * 
     * @param poiFs is the POI filesystem of the office document.
     * @param filesize is the size (content-length) of the content to parse in
     *        bytes or <code>0</code> if NOT available (unknown). If available,
     *        the parser may use this value for optimized allocations.
     * @param options are the {@link ContentParserOptions}.
     * @return the plain text extracted from the content.
     * @throws Exception if something goes wrong.
     */
    protected abstract String extractText(POIFSFileSystem poiFs, long filesize, ContentParserOptions options)
            throws Exception;

}