net.sf.mmm.content.parser.impl.poi.AbstractContentParserOoxml.java Source code

Java tutorial

Introduction

Here is the source code for net.sf.mmm.content.parser.impl.poi.AbstractContentParserOoxml.java

Source

/* Copyright (c) The m-m-m Team, Licensed under the Apache License, Version 2.0
 * http://www.apache.org/licenses/LICENSE-2.0 */
package net.sf.mmm.content.parser.impl.poi;

import java.io.InputStream;

import net.sf.mmm.content.parser.api.ContentParserOptions;
import net.sf.mmm.content.parser.base.AbstractContentParser;
import net.sf.mmm.util.context.api.MutableGenericContext;

import org.apache.poi.POIXMLProperties.CoreProperties;
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.extractor.ExtractorFactory;
import org.apache.poi.openxml4j.opc.OPCPackage;

/**
 * This is the abstract base implementation of the
 * {@link net.sf.mmm.content.parser.api.ContentParser} interface for parsing
 * Microsoft Office Open XML (OOXML) documents using apache POI.
 * 
 * @author Joerg Hohwiller (hohwille at users.sourceforge.net)
 */
public abstract class AbstractContentParserOoxml extends AbstractContentParser {

    /** The generic mimetype for OOXML. */
    public static final String KEY_MIMETYPE_GENERIC = "application/vnd.openxmlformats";

    /**
     * The constructor.
     */
    public AbstractContentParserOoxml() {

        super();
    }

    /**
     * This method creates the {@link POIXMLTextExtractor} for the given
     * <code>opcPackage</code>. This implementation delegates to
     * {@link ExtractorFactory#createExtractor(OPCPackage)}. Subclasses may
     * override it to supply a specific extractor.
     * 
     * @param opcPackage is the {@link OPCPackage}.
     * @return the {@link POIXMLTextExtractor}.
     * @throws Exception if something goes wrong.
     */
    protected POIXMLTextExtractor createExtractor(OPCPackage opcPackage) throws Exception {

        return ExtractorFactory.createExtractor(opcPackage);
    }

    /**
     * {@inheritDoc}
     * 
     * This implementation opens the <code>inputStream</code> as
     * {@link OPCPackage}, copies title, creator and keywords from the core
     * properties into the <code>context</code> (each only if it is not
     * <code>null</code>) and finally stores the extracted plain text. The
     * <code>options</code> are not evaluated here. The {@link OPCPackage} is
     * released before this method returns, no matter if parsing succeeded or
     * failed.
     */
    @Override
    public void parse(InputStream inputStream, long filesize, ContentParserOptions options,
            MutableGenericContext context) throws Exception {

        OPCPackage opcPackage = OPCPackage.open(inputStream);
        try {
            POIXMLTextExtractor extractor = createExtractor(opcPackage);
            CoreProperties coreProperties = extractor.getCoreProperties();
            String title = coreProperties.getTitle();
            if (title != null) {
                context.setVariable(VARIABLE_NAME_TITLE, title);
            }
            String author = coreProperties.getCreator();
            if (author != null) {
                context.setVariable(VARIABLE_NAME_CREATOR, author);
            }
            String keywords = coreProperties.getKeywords();
            if (keywords != null) {
                context.setVariable(VARIABLE_NAME_KEYWORDS, keywords);
            }
            context.setVariable(VARIABLE_NAME_TEXT, extractText(extractor, filesize));
        } finally {
            // The package is only read, so it is discarded via revert() rather
            // than close(): close() is meant for saving a modified package.
            opcPackage.revert();
        }
    }

    /**
     * This method extracts the text from the office document represented by
     * the given <code>extractor</code>. This implementation simply returns
     * {@link POIXMLTextExtractor#getText()} and ignores <code>filesize</code>.
     * Subclasses may override it.
     * 
     * @param extractor is the {@link POIXMLTextExtractor}.
     * @param filesize is the size (content-length) of the content to parse in
     *        bytes or <code>0</code> if NOT available (unknown). If available,
     *        the parser may use this value for optimized allocations.
     * @return the plain text extracted from the content.
     * @throws Exception if something goes wrong.
     */
    protected String extractText(POIXMLTextExtractor extractor, long filesize) throws Exception {

        return extractor.getText();
    }

}