org.wandora.application.tools.extractors.files.SimpleDocumentExtractor.java Source code

Java tutorial

Introduction

Here is the source code for org.wandora.application.tools.extractors.files.SimpleDocumentExtractor.java

Source

/*
 * WANDORA
 * Knowledge Extraction, Management, and Publishing Application
 * http://wandora.org
 * 
 * Copyright (C) 2004-2016 Wandora Team
 * 
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 * 
 * SimpleDocumentExtractor.java
 *
 * Created on 31 October 2007, 16:07
 *
 */

package org.wandora.application.tools.extractors.files;

import eu.medsea.mimeutil.MimeType;
import eu.medsea.mimeutil.MimeUtil;
import eu.medsea.mimeutil.detector.MagicMimeMimeDetector;

import org.wandora.application.tools.browserextractors.*;
import org.wandora.topicmap.TMBox;
import org.wandora.utils.IObox;
import org.wandora.utils.Textbox;
import org.wandora.utils.MSOfficeBox;
import org.wandora.application.tools.*;
import org.wandora.topicmap.*;
import org.wandora.application.*;
import org.wandora.*;
import org.wandora.utils.*;

import java.util.*;
import java.text.*;
import java.lang.*;
import java.io.*;
import java.net.*;
import javax.swing.Icon;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.util.PDFTextStripper;
import org.wandora.application.contexts.Context;
import org.wandora.application.gui.UIBox;
import org.wandora.application.tools.extractors.AbstractExtractor;
import org.wandora.application.tools.extractors.ExtractHelper;

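/**
 * Extractor tool that creates a document topic for a given string, URL or
 * file and stores the document content in an occurrence attached to that
 * topic. Also usable as a browser plugin extractor.
 *
 * @author akivela
 */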
public class SimpleDocumentExtractor extends AbstractExtractor implements WandoraTool, BrowserPluginExtractor {
    // Subject identifiers of the type topics created by getTopicType(),
    // getSourceType() and getDocumentType().
    protected static String TOPIC_SI = "http://wandora.org/si/topic";
    protected static String SOURCE_SI = "http://wandora.org/si/source";
    protected static String DOCUMENT_SI = "http://wandora.org/si/document";

    // SimpleDateFormat pattern used for every date occurrence written by this class.
    protected static String DEFAULT_DATE_FORMAT = "yyyy-MM-dd HH:mm:ss";

    // Language argument passed to setData(...) for all occurrences created here.
    private String defaultLang = "en";

    // Never assigned anywhere in this class, so it is always null and the
    // authorized-access branch in _extractTopicsFrom(URL, TopicMap) is not taken.
    private Wandora admin = null;

    // Re-created at the start of execute(); no active code in this class reads it.
    private ArrayList<String> visitedDirectories = new ArrayList<String>();

    /**
     * Creates a new instance of SimpleDocumentExtractor. No initialization is
     * performed beyond the field defaults.
     */
    public SimpleDocumentExtractor() {
    }

    /**
     * Returns the fixed display name of this tool.
     *
     * @return the string "Simple document extractor"
     */
    @Override
    public String getName() {
        return "Simple document extractor";
    }

    /**
     * Returns a one-sentence, human-readable description of this tool.
     *
     * @return the fixed description text
     */
    @Override
    public String getDescription() {
        return "Creates a topic for given document and stores document content to an occurrence attached to the topic.";
    }

    /**
     * Returns the icon of this tool, obtained from UIBox.getIcon for the code 0xf016.
     * NOTE(review): the code is presumably an icon-font glyph; 0xf15b is noted
     * as an alternative -- confirm against UIBox.
     *
     * @return the icon returned by UIBox.getIcon(0xf016)
     */
    @Override
    public Icon getIcon() {
        return UIBox.getIcon(0xf016); // or 0xf15b
    }

    /**
     * Always returns false.
     * NOTE(review): presumably tells the extractor framework not to use a
     * temporary topic map -- confirm against AbstractExtractor.
     *
     * @return false
     */
    @Override
    public boolean useTempTopicMap() {
        return false;
    }

    /**
     * Always returns false.
     * NOTE(review): presumably tells the extractor framework not to use its
     * URL crawler -- confirm against AbstractExtractor.
     *
     * @return false
     */
    @Override
    public boolean useURLCrawler() {
        return false;
    }

    /**
     * Looks up the user-interface text for one of the text type constants.
     *
     * @param textType the text type constant to resolve
     * @return the text for the given type, or an empty string when the type
     *         is not one of the constants handled here
     */
    @Override
    public String getGUIText(int textType) {
        final String text;
        switch (textType) {
        // --- dialog texts ---
        case SELECT_DIALOG_TITLE:
            text = "Select document(s) or directories containing documents!";
            break;
        case POINT_START_URL_TEXT:
            text = "Where would you like to start the crawl?";
            break;
        case INFO_WAIT_WHILE_WORKING:
            text = "Wait while seeking documents!";
            break;

        // --- accepted file name pattern ---
        case FILE_PATTERN:
            text = ".*";
            break;

        // --- final status messages ---
        case DONE_FAILED:
            text = "Ready. No extractions! %1 file(s) crawled!";
            break;
        case DONE_ONE:
            text = "Ready. Successful extraction! %1 file(s) crawled!";
            break;
        case DONE_MANY:
            text = "Ready. Total %0 successful extractions! %1 files crawled!";
            break;

        case LOG_TITLE:
            text = "Simple Text Document Extraction Log";
            break;

        default:
            text = "";
            break;
        }
        return text;
    }

    // ---------------------------------------------------- PLUGIN EXTRACTOR ---

    /**
     * Handles an extraction request coming from the browser plugin.
     * <p>
     * When the request carries a selection, a new document topic is created for
     * the selected text and associated with a source topic identified by the
     * request URL. Otherwise the whole content of the request is stored in a
     * topic identified by the request URL. In both cases the text is first run
     * through XMLbox.cleanUp and, if that yields a non-empty result, converted
     * with XMLbox.getAsText.
     *
     * @param request the browser extraction request
     * @param wandora the application instance whose topic map receives the topics
     * @return null on success, or RETURN_ERROR followed by the exception message
     * @throws TopicMapException declared by the interface
     */
    @Override
    public String doBrowserExtract(BrowserExtractRequest request, Wandora wandora) throws TopicMapException {
        final String sourceUrl = request.getSource();
        try {
            final TopicMap topicMap = wandora.getTopicMap();
            final String selection = request.getSelection();
            final boolean hasSelection = (selection != null);

            // Pick the raw text: the selected fragment if there is one,
            // otherwise the complete content of the request.
            String text = hasSelection ? selection : ExtractHelper.getContent(request);
            final String tidied = XMLbox.cleanUp(text);
            if (tidied != null && tidied.length() > 0) {
                text = XMLbox.getAsText(tidied, "ISO-8859-1");
            }

            if (hasSelection) {
                // SOURCE IS A FRACTION OF URL: the URL identifies a separate source topic.
                Topic sourceTopic = topicMap.getTopicBySubjectLocator(sourceUrl);
                if (sourceTopic == null) {
                    sourceTopic = topicMap.createTopic();
                    final Locator sourceLocator = topicMap.createLocator(sourceUrl);
                    sourceTopic.addSubjectIdentifier(sourceLocator);
                    sourceTopic.setSubjectLocator(sourceLocator);
                }

                final Topic fragmentTopic = topicMap.createTopic();
                fragmentTopic.addSubjectIdentifier(topicMap.makeSubjectIndicatorAsLocator());
                fillDocumentTopic(fragmentTopic, topicMap, text);

                final Association sourceLink = topicMap.createAssociation(getSourceType(topicMap));
                sourceLink.addPlayer(fragmentTopic, getDocumentType(topicMap));
                sourceLink.addPlayer(sourceTopic, getSourceType(topicMap));
            } else {
                // SOURCE IS A COMPLETE URL: the URL identifies the document topic itself.
                Topic pageTopic = topicMap.getTopicBySubjectLocator(sourceUrl);
                if (pageTopic == null) {
                    pageTopic = topicMap.createTopic();
                    final Locator pageLocator = topicMap.createLocator(sourceUrl);
                    pageTopic.addSubjectIdentifier(pageLocator);
                    pageTopic.setSubjectLocator(pageLocator);
                }
                fillDocumentTopic(pageTopic, topicMap, text);
            }
        } catch (Exception e) {
            e.printStackTrace();
            return BrowserPluginExtractor.RETURN_ERROR + e.getMessage();
        }
        wandora.doRefresh();
        return null;
    }

    /**
     * Always returns true.
     * NOTE(review): presumably signals that the browser plugin may hand plain
     * text to this extractor -- confirm against BrowserPluginExtractor.
     *
     * @return true
     */
    @Override
    public boolean browserExtractorConsumesPlainText() {
        return true;
    }

    // -------------------------------------------------------------------------

    /**
     * Replaces the visited-directories list with a fresh empty list and then
     * delegates to the superclass implementation.
     *
     * @param wandora the application instance, passed through to the superclass
     * @param context the execution context, passed through to the superclass
     */
    @Override
    public void execute(Wandora wandora, Context context) {
        visitedDirectories = new ArrayList<String>();
        super.execute(wandora, context);
    }

    /**
     * Extracts a document topic from a plain string.
     * <p>
     * The topic is identified by a subject identifier built from the hash code
     * of the string. Its base name is the string itself, or the first 80
     * characters followed by the hash code when the string is longer. The
     * extraction time is stored as an occurrence and the string's bytes are
     * handed to _extractTopicsFromStream.
     *
     * @param str      the document text; null or empty yields false
     * @param topicMap the topic map receiving the topic
     * @return true when extraction completed, false for empty input or after a
     *         logged exception
     * @throws Exception declared by the overridden method; exceptions raised
     *                   during extraction are caught and logged here
     */
    @Override
    public boolean _extractTopicsFrom(String str, TopicMap topicMap) throws Exception {
        if (str == null || str.length() == 0) {
            return false;
        }

        try {
            final int contentHash = str.hashCode();
            final Topic documentType = this.getDocumentType(topicMap);
            final String si = "http://wandora.org/si/simple-document-extractor/" + contentHash;

            // Long strings are shortened for the base name and made unique with the hash.
            final String baseName = (str.length() > 80)
                    ? str.substring(0, 80) + " (" + contentHash + ")"
                    : str;

            Topic documentTopic = topicMap.getTopic(si);
            if (documentTopic == null) {
                documentTopic = topicMap.createTopic();
            }
            documentTopic.addSubjectIdentifier(new Locator(si));
            documentTopic.setBaseName(baseName);
            documentTopic.addType(documentType);

            // --- ADD EXTRACTION TIME AS OCCURRENCE ---
            final DateFormat timeFormat = new SimpleDateFormat(DEFAULT_DATE_FORMAT);
            final Topic extractionTimeType = createTopic(topicMap, "extraction-time");
            final String now = timeFormat.format(new Date());
            setData(documentTopic, extractionTimeType, defaultLang, now);

            final InputStream textStream = new ByteArrayInputStream(str.getBytes());
            _extractTopicsFromStream(si, textStream, topicMap, documentTopic);
            return true;
        } catch (Exception e) {
            log("Exception occurred while extracting from input string.", e);
            takeNap(1000);
            return false;
        }
    }

    /**
     * Extracts a document topic from the resource behind a URL.
     * <p>
     * The external form of the URL is used as both subject identifier and
     * subject locator of the topic. The base name is the last path segment of
     * the URL (or the whole URL when it has no file part) followed by the hash
     * code of the URL. The extraction time is stored as an occurrence and the
     * content of the URL is handed to _extractTopicsFromStream.
     * <p>
     * The stream opened on the URL connection is always closed before this
     * method returns.
     *
     * @param url      the document URL; null or an empty external form yields false
     * @param topicMap the topic map receiving the topic
     * @return true when extraction completed, false for missing input or after
     *         a logged exception
     * @throws Exception declared by the overridden method; exceptions raised
     *                   during extraction are caught and logged here
     */
    @Override
    public boolean _extractTopicsFrom(URL url, TopicMap topicMap) throws Exception {
        if (url == null || url.toExternalForm().length() == 0)
            return false;

        try {
            Topic documentType = this.getDocumentType(topicMap);
            String locator = url.toExternalForm();
            int hash = locator.hashCode();
            String name = url.getFile();
            if (name != null && name.length() > 0) {
                if (name.lastIndexOf("/") > -1) {
                    name = name.substring(name.lastIndexOf("/") + 1);
                }
            } else {
                name = locator;
            }

            Topic documentTopic = topicMap.getTopic(locator);
            if (documentTopic == null)
                documentTopic = topicMap.createTopic();
            documentTopic.addSubjectIdentifier(new Locator(locator));
            documentTopic.setBaseName(name + " (" + hash + ")");
            documentTopic.setSubjectLocator(new Locator(locator));
            documentTopic.addType(documentType);

            // --- ADD EXTRACTION TIME AS OCCURRENCE ---
            DateFormat dateFormatter = new SimpleDateFormat(DEFAULT_DATE_FORMAT);
            Topic extractionTimeType = createTopic(topicMap, "extraction-time");
            String dateString = dateFormatter.format(new Date(System.currentTimeMillis()));
            setData(documentTopic, extractionTimeType, defaultLang, dateString);

            URLConnection uc = null;
            if (admin != null) {
                uc = admin.wandoraHttpAuthorizer.getAuthorizedAccess(url);
            } else {
                uc = url.openConnection();
                Wandora.initUrlConnection(uc);
            }

            InputStream in = uc.getInputStream();
            try {
                _extractTopicsFromStream(url.toExternalForm(), in, topicMap, documentTopic);
            } finally {
                try {
                    in.close();
                } catch (IOException ignored) {
                    // Extraction has already finished; a failure to close the
                    // stream must not turn a successful extraction into an error.
                }
            }
            return true;
        } catch (Exception e) {
            log("Exception occurred while extracting from url '" + url.toExternalForm() + "'.", e);
            takeNap(1000);
        }
        return false;
    }

    /**
     * Extracts a document topic from a file.
     * <p>
     * The file URL is used as both subject identifier and subject locator of
     * the topic, and the base name is the file name followed by the hash code
     * of the file URL. Extraction time, last modification time and absolute
     * file name are stored as occurrences, and the file content is handed to
     * _extractTopicsFromStream.
     * <p>
     * The file stream is always closed before this method returns.
     *
     * @param file     the document file; null yields false, a directory yields
     *                 true without extracting anything
     * @param topicMap the topic map receiving the topic
     * @return true when extraction completed or the file is a directory, false
     *         for null input or after a logged exception
     * @throws Exception declared by the overridden method; exceptions raised
     *                   during extraction are caught and logged here
     */
    @Override
    public boolean _extractTopicsFrom(File file, TopicMap topicMap) throws Exception {
        if (file == null)
            return false;

        try {
            if (file.isDirectory()) {
                // Directories are accepted but intentionally not traversed here:
                // recursive traversal is disabled in this extractor.
                return true;
            }
        } catch (Exception e) {
            log(e);
        }

        try {
            Topic documentType = this.getDocumentType(topicMap);
            String locator = file.toURI().toURL().toExternalForm();
            int hash = locator.hashCode();

            Topic documentTopic = topicMap.getTopic(locator);
            if (documentTopic == null)
                documentTopic = topicMap.createTopic();
            documentTopic.addSubjectIdentifier(new Locator(locator));
            documentTopic.setBaseName(file.getName() + " (" + hash + ")");
            documentTopic.setSubjectLocator(new Locator(locator));
            documentTopic.addType(documentType);

            // --- ADD EXTRACTION TIME AS OCCURRENCE ---
            DateFormat dateFormatter = new SimpleDateFormat(DEFAULT_DATE_FORMAT);
            Topic extractionTimeType = createTopic(topicMap, "extraction-time");
            String dateString = dateFormatter.format(new Date(System.currentTimeMillis()));
            setData(documentTopic, extractionTimeType, defaultLang, dateString);

            // --- ADD MODIFICATION TIME AS OCCURRENCE ---
            long lastModified = file.lastModified();
            Topic modificationTimeType = createTopic(topicMap, "modification-time");
            String modificationDateString = dateFormatter.format(new Date(lastModified));
            setData(documentTopic, modificationTimeType, defaultLang, modificationDateString);

            // --- ADD ABSOLUTE FILE NAME AS OCCURRENCE ---
            Topic fileType = createTopic(topicMap, "file-name");
            String fileString = file.getAbsolutePath();
            setData(documentTopic, fileType, defaultLang, fileString);

            InputStream in = new FileInputStream(file);
            try {
                _extractTopicsFromStream(file.getPath(), in, topicMap, documentTopic);
            } finally {
                try {
                    in.close();
                } catch (IOException ignored) {
                    // Extraction has already finished; a failure to close the
                    // stream must not turn a successful extraction into an error.
                }
            }

            return true;
        } catch (Exception e) {
            log("Exception occurred while extracting from file '" + file.getName() + "'.", e);
            takeNap(1000);
        }
        return false;
    }

    /**
     * Reads a document from a stream and stores its content, and where
     * available its metadata, as occurrences and associations of the given topic.
     * <p>
     * The handler is chosen by the (case-insensitive) ending of the locator:
     * PDF, RTF, MS Office, OpenDocument, HTML and plain text are handled
     * explicitly; anything else is classified by MIME type detection and stored
     * either as text or as a binary data URL. All exceptions are caught and logged.
     * <p>
     * The stream is not closed by this method; that is left to the caller.
     *
     * @param locator     file path, URL or other identifier of the document;
     *                    only used for choosing the handler, for MIME detection
     *                    and for deriving the document name
     * @param inputStream stream the document content is read from
     * @param topicMap    topic map in which type and keyword topics are created
     * @param topic       document topic that receives the occurrences
     */
    public void _extractTopicsFromStream(String locator, InputStream inputStream, TopicMap topicMap, Topic topic) {
        try {
            // The document name is the part of the locator after the last path separator.
            String name = locator;
            if (name.indexOf("/") != -1) {
                name = name.substring(name.lastIndexOf("/") + 1);
            } else if (name.indexOf("\\") != -1) {
                name = name.substring(name.lastIndexOf("\\") + 1);
            }
            String lowerCaseLocator = locator.toLowerCase();

            if (lowerCaseLocator.endsWith("pdf")) {
                // --- HANDLE PDF ENRICHMENT TEXT ---
                extractPdfDocument(inputStream, topicMap, topic, name);
            } else if (lowerCaseLocator.endsWith("rtf")) {
                // --- HANDLE RTF DOCUMENTS ---
                String content = Textbox.RTF2PlainText(inputStream);
                setTextEnrichment(topic, topicMap, content, name);
            } else if (endsWithAny(lowerCaseLocator, "doc", "docx", "ppt", "xls", "vsd")) {
                // --- HANDLE OFFICE DOCUMENTS ---
                String content = MSOfficeBox.getText(inputStream);
                if (content != null) {
                    setTextEnrichment(topic, topicMap, content, name);
                }
            } else if (endsWithAny(lowerCaseLocator, "odt", "odp", "odg", "ods")) {
                // --- HANDLE OPENDOCUMENT DOCUMENTS ---
                extractOpenDocument(inputStream, topicMap, topic, name);
            } else if (endsWithAny(lowerCaseLocator, "html", "htm", "txt", "text")) {
                // --- HANDLE HTML AND PLAIN TEXT DOCUMENTS ---
                String content = IObox.loadFile(new InputStreamReader(inputStream));
                setTextEnrichment(topic, topicMap, content, name);
            } else {
                // --- HANDLE ANY OTHER DOCUMENTS ---
                extractUnknownDocument(locator, inputStream, topicMap, topic, name);
            }
        } catch (Exception e) {
            log(e);
        }
    }

    /**
     * Stores PDF metadata and text content to the topic. The document is read
     * from the stream (so URL sources work as well as files) and is always
     * closed, even when extraction fails.
     */
    private void extractPdfDocument(InputStream inputStream, TopicMap topicMap, Topic topic, String name)
            throws Exception {
        PDDocument doc = PDDocument.load(inputStream);
        try {
            PDDocumentInformation info = doc.getDocumentInformation();

            setOccurrenceIfPresent(topic, topicMap, "pdf-producer", info.getProducer());
            setOccurrenceIfPresent(topic, topicMap, "pdf-modification-date",
                    formatCalendar(info.getModificationDate()));
            setOccurrenceIfPresent(topic, topicMap, "pdf-creator", info.getCreator());
            setOccurrenceIfPresent(topic, topicMap, "pdf-creation-date", formatCalendar(info.getCreationDate()));
            setOccurrenceIfPresent(topic, topicMap, "pdf-author", info.getAuthor());
            setOccurrenceIfPresent(topic, topicMap, "pdf-subject", info.getSubject());
            // The title comes from the title field, not from the subject field.
            setOccurrenceIfPresent(topic, topicMap, "pdf-title", info.getTitle());

            // --- PDF KEYWORDS (SEPARATED WITH SEMICOLON) ---
            String keywords = info.getKeywords();
            if (keywords != null && keywords.length() > 0) {
                Topic keywordType = createTopic(topicMap, "pdf-keyword");
                String[] keywordArray = keywords.split(";");
                for (int i = 0; i < keywordArray.length; i++) {
                    addKeyword(topic, topicMap, keywordType, Textbox.trimExtraSpaces(keywordArray[i]));
                }
            }

            // --- PDF TEXT CONTENT ---
            PDFTextStripper stripper = new PDFTextStripper();
            String content = stripper.getText(doc);
            setTextEnrichment(topic, topicMap, content, name);
        } finally {
            doc.close();
        }
    }

    /** Stores OpenDocument text content and metadata to the topic. */
    private void extractOpenDocument(InputStream inputStream, TopicMap topicMap, Topic topic, String name)
            throws Exception {
        org.odftoolkit.simple.Document oodocument = org.odftoolkit.simple.Document.loadDocument(inputStream);
        String content = OpenOfficeBox.getText(oodocument);
        setTextEnrichment(topic, topicMap, content, name);

        org.odftoolkit.simple.meta.Meta meta = oodocument.getOfficeMetadata();

        // --- OO KEYWORDS ---
        List<String> keywords = meta.getKeywords();
        if (keywords != null && !keywords.isEmpty()) {
            Topic keywordType = createTopic(topicMap, "oo-keyword");
            for (String keyword : keywords) {
                // Check for null before trimming so a null entry is skipped
                // instead of aborting the whole extraction.
                if (keyword != null) {
                    addKeyword(topic, topicMap, keywordType, keyword.trim());
                }
            }
        }

        setOccurrenceIfPresent(topic, topicMap, "oo-title", meta.getTitle());
        setOccurrenceIfPresent(topic, topicMap, "oo-subject", meta.getSubject());
        setOccurrenceIfPresent(topic, topicMap, "oo-author", meta.getCreator());
        setOccurrenceIfPresent(topic, topicMap, "oo-creation-date", formatCalendar(meta.getCreationDate()));
        setOccurrenceIfPresent(topic, topicMap, "oo-description", meta.getDescription());
        setOccurrenceIfPresent(topic, topicMap, "oo-generator", meta.getGenerator());
    }

    /**
     * Classifies a document of unknown kind by MIME type and stores it either
     * as text content or as binary content.
     */
    private void extractUnknownDocument(String locator, InputStream inputStream, TopicMap topicMap, Topic topic,
            String name) throws Exception {
        byte[] content = IObox.loadBFile(inputStream);
        MimeUtil.registerMimeDetector("eu.medsea.mimeutil.detector.MagicMimeMimeDetector");

        // Candidates are collected from the locator first and from the bytes second.
        Collection<MimeType> mimeTypes = new ArrayList<MimeType>();
        if (locator != null) {
            String knownType = MimeTypes.getMimeType(locator);
            if (knownType != null) {
                mimeTypes.add(new MimeType(knownType));
            }
            mimeTypes.addAll(MimeUtil.getMimeTypes(locator));
        }
        mimeTypes.addAll(MimeUtil.getMimeTypes(content));

        boolean isText = false;
        for (MimeType mime : mimeTypes) {
            if (MimeUtil.isTextMimeType(mime)) {
                isText = true;
                break;
            }
        }

        if (isText) {
            setTextEnrichment(topic, topicMap, new String(content), name);
        } else {
            // The first candidate, if any, labels the binary content.
            String mimeType = "";
            if (!mimeTypes.isEmpty()) {
                mimeType = mimeTypes.iterator().next().toString();
            }
            setBinaryEnrichment(topic, topicMap, content, mimeType);
        }
    }

    /** Stores the trimmed value as an occurrence of the named type unless the value is null or empty. */
    private void setOccurrenceIfPresent(Topic topic, TopicMap topicMap, String typeName, String value)
            throws Exception {
        if (value != null && value.length() > 0) {
            Topic occurrenceType = createTopic(topicMap, typeName);
            setData(topic, occurrenceType, defaultLang, value.trim());
        }
    }

    /** Creates a keyword topic and associates it with the document topic unless the keyword is null or empty. */
    private void addKeyword(Topic topic, TopicMap topicMap, Topic keywordType, String keyword) throws Exception {
        if (keyword != null && keyword.length() > 0) {
            Topic keywordTopic = createTopic(topicMap, keyword, keywordType);
            createAssociation(topicMap, keywordType, new Topic[] { topic, keywordTopic });
        }
    }

    /** Formats a calendar with DEFAULT_DATE_FORMAT, or returns null for a null calendar. */
    private static String formatCalendar(Calendar calendar) {
        if (calendar == null) {
            return null;
        }
        return new SimpleDateFormat(DEFAULT_DATE_FORMAT).format(calendar.getTime());
    }

    /** Tells whether the text ends with at least one of the given suffixes. */
    private static boolean endsWithAny(String text, String... suffixes) {
        for (String suffix : suffixes) {
            if (text.endsWith(suffix)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Stores binary content to the topic as a data URL in a "document-content"
     * occurrence. Null or empty content is ignored; exceptions are logged.
     *
     * @param textTopic topic that receives the occurrence
     * @param topicMap  topic map in which the occurrence type topic is created
     * @param content   raw bytes of the document
     * @param mimeType  MIME type handed to the data URL
     */
    public void setBinaryEnrichment(Topic textTopic, TopicMap topicMap, byte[] content, String mimeType) {
        if (content == null || content.length == 0) {
            return;
        }
        try {
            final Topic occurrenceType = createTopic(topicMap, "document-content");
            final String encoded = new DataURL(mimeType, content).toExternalForm();
            setData(textTopic, occurrenceType, defaultLang, encoded);
        } catch (Exception e) {
            log(e);
        }
    }

    /**
     * Stores text content to the topic in a "document-content" occurrence.
     * The text is first passed through Textbox.trimExtraSpaces; a null or
     * empty result is ignored. Exceptions are logged.
     *
     * @param topic    topic that receives the occurrence
     * @param topicMap topic map in which the occurrence type topic is created
     * @param content  text of the document
     * @param title    currently unused; kept so existing callers keep working
     */
    public void setTextEnrichment(Topic topic, TopicMap topicMap, String content, String title) {
        try {
            String trimmedText = Textbox.trimExtraSpaces(content);
            if (trimmedText != null && trimmedText.length() > 0) {
                Topic contentType = createTopic(topicMap, "document-content");
                setData(topic, contentType, defaultLang, trimmedText);
            }
            // The topic is named by the caller, so no title is derived or applied here.
        } catch (Exception e) {
            log(e);
        }
    }

    // -------------------------------------------------------------------------
    // -------------------------------------------------------------------------
    // -------------------------------------------------------------------------

    /**
     * Derives a short title from document content.
     * <p>
     * The title candidate is the first line of the content, or the whole
     * content when there is no line break after the first character. A
     * candidate of at most 80 characters is returned as is. A longer one is
     * cut to 80 characters, shortened back to the previous space (but never
     * below 10 characters), cleaned with Textbox.trimExtraSpaces and suffixed
     * with "...".
     *
     * @param content the document text
     * @return the derived title, or "empty-document" for null or empty content
     */
    public String solveTitle(String content) {
        if (content == null || content.length() == 0)
            return "empty-document";

        final int maxLength = 80;
        final int minLength = 10;

        int lineEnd = content.indexOf("\n");
        String title = (lineEnd > 0) ? content.substring(0, lineEnd) : content;

        // Truncate only when the candidate really is too long, so short
        // content is not cut at a word boundary or marked with an ellipsis.
        if (title.length() > maxLength) {
            title = title.substring(0, maxLength);
            while (!title.endsWith(" ") && title.length() > minLength) {
                title = title.substring(0, title.length() - 1);
            }
            title = Textbox.trimExtraSpaces(title) + "...";
        }
        return title;
    }

    /**
     * Fills a document topic from text content: stores the trimmed text in a
     * "document-content" occurrence, sets base name and display name from a
     * title derived with solveTitle, and adds the document type. Exceptions
     * are logged.
     * <p>
     * The base name is the title followed by the hash code of the untrimmed
     * content; null content contributes the hash 0, so the topic is still
     * named and typed.
     *
     * @param textTopic topic to fill
     * @param topicMap  topic map in which type topics are created
     * @param content   text of the document, may be null
     */
    public void fillDocumentTopic(Topic textTopic, TopicMap topicMap, String content) {
        try {
            String trimmedText = Textbox.trimExtraSpaces(content);
            if (trimmedText != null && trimmedText.length() > 0) {
                Topic contentType = createTopic(topicMap, "document-content");
                setData(textTopic, contentType, defaultLang, trimmedText);
            }
            String title = solveTitle(trimmedText);
            if (title != null) {
                // Guard against null content so naming and typing are not skipped.
                int contentHash = (content != null) ? content.hashCode() : 0;
                textTopic.setBaseName(title + " (" + contentHash + ")");
                textTopic.setDisplayName(defaultLang, title);
            }
            Topic documentType = getDocumentType(topicMap);
            textTopic.addType(documentType);
        } catch (Exception e) {
            log(e);
        }
    }

    /**
     * Gets or creates the topic identified by TMBox.WANDORACLASS_SI, using
     * the base name "Wandora class".
     *
     * @param tm topic map to look in
     * @return the topic returned by getOrCreateTopic
     * @throws TopicMapException propagated from getOrCreateTopic
     */
    public Topic getWandoraClass(TopicMap tm) throws TopicMapException {
        return getOrCreateTopic(tm, TMBox.WANDORACLASS_SI, "Wandora class");
    }

    /**
     * Gets or creates the topic identified by TOPIC_SI, using the base name "Topic".
     *
     * @param tm topic map to look in
     * @return the topic returned by getOrCreateTopic
     * @throws TopicMapException propagated from getOrCreateTopic
     */
    public Topic getTopicType(TopicMap tm) throws TopicMapException {
        return getOrCreateTopic(tm, TOPIC_SI, "Topic");
    }

    /**
     * Gets or creates the topic identified by SOURCE_SI, using the base name
     * "Source". doBrowserExtract uses it both as association type and as the
     * role of the source topic.
     *
     * @param tm topic map to look in
     * @return the topic returned by getOrCreateTopic
     * @throws TopicMapException propagated from getOrCreateTopic
     */
    public Topic getSourceType(TopicMap tm) throws TopicMapException {
        return getOrCreateTopic(tm, SOURCE_SI, "Source");
    }

    /**
     * Gets or creates the "Document" type topic identified by DOCUMENT_SI and
     * makes it a subclass of the Wandora class topic.
     *
     * @param tm topic map to look in
     * @return the document type topic
     * @throws TopicMapException propagated from the topic map helpers
     */
    public Topic getDocumentType(TopicMap tm) throws TopicMapException {
        final Topic documentType = getOrCreateTopic(tm, DOCUMENT_SI, "Document");
        makeSubclassOf(tm, documentType, getWandoraClass(tm));
        return documentType;
    }

    // --------

    /**
     * Convenience overload that calls the three-argument getOrCreateTopic
     * with a null base name.
     *
     * @param tm topic map to look in
     * @param si subject identifier of the topic
     * @return the topic returned by the three-argument overload
     * @throws TopicMapException propagated from the three-argument overload
     */
    protected Topic getOrCreateTopic(TopicMap tm, String si) throws TopicMapException {
        return getOrCreateTopic(tm, si, null);
    }

    /**
     * Delegates to ExtractHelper.getOrCreateTopic. Note that the helper takes
     * its arguments in a different order (si, bn, tm).
     *
     * @param tm topic map to look in
     * @param si subject identifier of the topic
     * @param bn base name handed to the helper, may be null
     * @return the topic returned by ExtractHelper.getOrCreateTopic
     * @throws TopicMapException propagated from the helper
     */
    protected Topic getOrCreateTopic(TopicMap tm, String si, String bn) throws TopicMapException {
        return ExtractHelper.getOrCreateTopic(si, bn, tm);
    }

    /**
     * Delegates to ExtractHelper.makeSubclassOf. Note that the helper takes
     * its arguments in a different order (t, superclass, tm).
     *
     * @param tm         topic map holding both topics
     * @param t          topic handed to the helper as the subclass
     * @param superclass topic handed to the helper as the superclass
     * @throws TopicMapException propagated from the helper
     */
    protected void makeSubclassOf(TopicMap tm, Topic t, Topic superclass) throws TopicMapException {
        ExtractHelper.makeSubclassOf(t, superclass, tm);
    }

    // -------------------------------------------------------------------------

    // MIME types listed for this extractor. The array itself is public and
    // mutable, so callers must treat it as read-only.
    public static final String[] contentTypes = new String[] { "application/pdf", "text/plain", "text/html",
            "application/rtf", "application/xml", "application/msword" };

    /**
     * Returns the content types listed in contentTypes.
     *
     * @return the shared contentTypes array itself, not a copy
     */
    public String[] getContentTypes() {
        return contentTypes;
    }

}