com.opensearchserver.textextractor.parser.Pptx.java Source code

Introduction

Here is the source code for com.opensearchserver.textextractor.parser.Pptx.java
Source

/**   
 * License Agreement for OpenSearchServer
 *
 * Copyright (C) 2010-2013 Emmanuel Keller / Jaeksoft
 * 
 * http://www.open-search-server.com
 * 
 * This file is part of OpenSearchServer.
 *
 * OpenSearchServer is free software: you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 * OpenSearchServer is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with OpenSearchServer. 
 *  If not, see <http://www.gnu.org/licenses/>.
 **/

package com.opensearchserver.textextractor.parser;

import java.io.File;
import java.io.InputStream;

import org.apache.poi.POIXMLProperties.CoreProperties;
import org.apache.poi.xslf.XSLFSlideShow;
import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
import org.apache.poi.xslf.usermodel.DrawingParagraph;
import org.apache.poi.xslf.usermodel.DrawingTextBody;
import org.apache.poi.xslf.usermodel.DrawingTextPlaceholder;
import org.apache.poi.xslf.usermodel.XMLSlideShow;
import org.apache.poi.xslf.usermodel.XSLFCommentAuthors;
import org.apache.poi.xslf.usermodel.XSLFComments;
import org.apache.poi.xslf.usermodel.XSLFCommonSlideData;
import org.apache.poi.xslf.usermodel.XSLFNotes;
import org.apache.poi.xslf.usermodel.XSLFSlide;
import org.apache.poi.xslf.usermodel.XSLFSlideLayout;
import org.apache.poi.xslf.usermodel.XSLFSlideMaster;
import org.openxmlformats.schemas.presentationml.x2006.main.CTComment;
import org.openxmlformats.schemas.presentationml.x2006.main.CTCommentAuthor;

import com.opensearchserver.textextractor.ParserAbstract;
import com.opensearchserver.textextractor.ParserDocument;
import com.opensearchserver.textextractor.ParserField;

/**
 * Parser for PowerPoint OOXML (.pptx) presentations, built on Apache POI's
 * XSLF classes.
 * <p>
 * The core document properties (title, creator, subject, ...) are added to
 * {@code metas}; then one {@link ParserDocument} is created per slide,
 * carrying the slide text, the layout/master text, the comments and the
 * notes.
 */
public class Pptx extends ParserAbstract {

    protected static final ParserField TITLE = ParserField.newString("title", "The title of the document");

    protected static final ParserField CREATOR = ParserField.newString("creator", "The name of the creator");

    protected static final ParserField DESCRIPTION = ParserField.newString("description", null);

    protected static final ParserField KEYWORDS = ParserField.newString("keywords", null);

    protected static final ParserField SUBJECT = ParserField.newString("subject", "The subject of the document");

    protected static final ParserField CREATION_DATE = ParserField.newDate("creation_date", null);

    protected static final ParserField MODIFICATION_DATE = ParserField.newDate("modification_date", null);

    protected static final ParserField SLIDES = ParserField.newString("slides", null);

    protected static final ParserField MASTER = ParserField.newString("master", null);

    protected static final ParserField NOTES = ParserField.newString("notes", null);

    protected static final ParserField COMMENTS = ParserField.newString("comments", null);

    protected static final ParserField LANG_DETECTION = ParserField.newString("lang_detection",
            "Detection of the language");

    /** Every field this parser may emit, as returned by {@link #getFields()}. */
    protected static final ParserField[] FIELDS = { TITLE, CREATOR, DESCRIPTION, KEYWORDS, SUBJECT, CREATION_DATE,
            MODIFICATION_DATE, SLIDES, MASTER, NOTES, COMMENTS, LANG_DETECTION };

    public Pptx() {
    }

    /**
     * This parser declares no parameters.
     *
     * @return always {@code null}
     */
    @Override
    protected ParserField[] getParameters() {
        return null;
    }

    /**
     * @return the {@link #FIELDS} array
     */
    @Override
    protected ParserField[] getFields() {
        return FIELDS;
    }

    /**
     * Parses a presentation supplied as a stream. The stream is first copied
     * to a temporary file (via {@code ParserAbstract.createTempFile}), which
     * is then handed to {@link #parseContent(File)} and deleted afterwards,
     * whether or not parsing succeeded.
     *
     * @param inputStream
     *            the raw .pptx content
     * @throws Exception
     *             if the temporary file cannot be created or parsing fails
     */
    @Override
    protected void parseContent(InputStream inputStream) throws Exception {
        File tempFile = ParserAbstract.createTempFile(inputStream, "pptx");
        try {
            parseContent(tempFile);
        } finally {
            tempFile.delete();
        }
    }

    /**
     * Parses a presentation stored in a file: extracts the core properties
     * into {@code metas}, then the content of every slide.
     *
     * @param file
     *            the .pptx file to parse
     * @throws Exception
     *             if the file cannot be opened or read by POI
     */
    @Override
    protected void parseContent(File file) throws Exception {

        XSLFSlideShow pptSlideShow = new XSLFSlideShow(file.getAbsolutePath());
        XMLSlideShow slideshow = new XMLSlideShow(pptSlideShow.getPackage());

        // Extract metadata. The extractor is created before entering the try
        // block so that the finally clause never sees a null reference: a
        // failing constructor propagates its own exception instead of being
        // masked by a NullPointerException raised from close().
        XSLFPowerPointExtractor poiExtractor = new XSLFPowerPointExtractor(slideshow);
        try {
            CoreProperties info = poiExtractor.getCoreProperties();
            if (info != null) {
                metas.add(TITLE, info.getTitle());
                metas.add(CREATOR, info.getCreator());
                metas.add(SUBJECT, info.getSubject());
                metas.add(DESCRIPTION, info.getDescription());
                metas.add(KEYWORDS, info.getKeywords());
                metas.add(CREATION_DATE, info.getCreated());
                metas.add(MODIFICATION_DATE, info.getModified());
            }
        } finally {
            poiExtractor.close();
        }
        extractSlides(slideshow);
    }

    /**
     * Concatenates the text of every paragraph found in the given slide
     * data, one paragraph per line.
     * <p>
     * Adapted from XSLFPowerPointExtractor.java
     *
     * @param data
     *            the common slide data (of a slide, layout, master or notes
     *            page) to read the text bodies from
     * @param skipPlaceholders
     *            if {@code true}, placeholder text bodies that are not
     *            customised are ignored
     * @return the collected text, each paragraph followed by a line feed;
     *         empty if there is no text
     */
    private String extractText(XSLFCommonSlideData data, boolean skipPlaceholders) {
        StringBuilder sb = new StringBuilder();
        for (DrawingTextBody textBody : data.getDrawingText()) {
            if (skipPlaceholders && textBody instanceof DrawingTextPlaceholder) {
                DrawingTextPlaceholder ph = (DrawingTextPlaceholder) textBody;
                if (!ph.isPlaceholderCustom()) {
                    // Skip non-customised placeholder text
                    continue;
                }
            }

            for (DrawingParagraph p : textBody.getParagraphs()) {
                sb.append(p.getText());
                sb.append("\n");
            }
        }
        return sb.toString();
    }

    /**
     * Creates one {@link ParserDocument} per slide and fills it with the
     * slide text, the detected language, the layout and master text, the
     * comments (prefixed by the author name when known) and the notes.
     * <p>
     * Adapted from XSLFPowerPointExtractor.java
     *
     * @param slideshow
     *            the presentation whose slides are extracted
     */
    private void extractSlides(XMLSlideShow slideshow) {

        XSLFSlide[] slides = (XSLFSlide[]) slideshow.getSlides();
        XSLFCommentAuthors commentAuthors = slideshow.getCommentAuthors();

        for (XSLFSlide slide : slides) {

            // One document per slide
            ParserDocument result = getNewParserDocument();

            XSLFNotes notes = slide.getNotes();
            XSLFComments comments = slide.getComments();
            XSLFSlideLayout layout = slide.getSlideLayout();
            // Only ask for the master when there is a layout: dereferencing
            // a missing layout would throw before the null checks below.
            XSLFSlideMaster master = layout != null ? layout.getSlideMaster() : null;

            // TODO Do the slide's name
            // (Stored in docProps/app.xml)

            // Do the slide's text
            result.add(SLIDES, extractText(slide.getCommonSlideData(), false));
            // NOTE(review): languageDetection is inherited and not visible
            // here; presumably it inspects the SLIDES field content up to
            // 10000 characters - confirm against ParserAbstract.
            result.add(LANG_DETECTION, languageDetection(SLIDES, 10000));

            // Get the text from the layout and from its master, skipping
            // placeholders that were not customised
            if (layout != null) {
                result.add(MASTER, extractText(layout.getCommonSlideData(), true));
            }
            if (master != null) {
                result.add(MASTER, extractText(master.getCommonSlideData(), true));
            }

            // If the slide has comments, do those too
            if (comments != null) {
                for (CTComment comment : comments.getCTCommentsList().getCmList()) {
                    StringBuilder sbComment = new StringBuilder();
                    // Do the author if we can
                    if (commentAuthors != null) {
                        CTCommentAuthor author = commentAuthors.getAuthorById(comment.getAuthorId());
                        if (author != null) {
                            sbComment.append(author.getName());
                            sbComment.append(": ");
                        }
                    }

                    // Then the comment text, with a new line afterwards.
                    // The builder is never empty at this point (it holds at
                    // least the line feed), so the value is always added.
                    sbComment.append(comment.getText());
                    sbComment.append("\n");
                    result.add(COMMENTS, sbComment.toString());
                }
            }

            // Do the notes, when the slide has some
            if (notes != null) {
                result.add(NOTES, extractText(notes.getCommonSlideData(), false));
            }
        }
    }
}