org.wandora.application.tools.extractors.fng.ExtractFNGTextEnrichment.java Source code

Introduction

Here is the source code for org.wandora.application.tools.extractors.fng.ExtractFNGTextEnrichment.java
Source

/*
 * WANDORA
 * Knowledge Extraction, Management, and Publishing Application
 * http://wandora.org
 * 
 * Copyright (C) 2004-2016 Wandora Team
 * 
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 * 
 * ExtractFNGTextEnrichment.java
 *
 * Created on 10. keskuuta 2006, 11:18
 *
 */

package org.wandora.application.tools.extractors.fng;

import org.wandora.utils.Textbox;
import org.wandora.utils.IObox;
import org.wandora.utils.MSOfficeBox;
import org.wandora.application.tools.extractors.AbstractExtractor;
import org.wandora.topicmap.Locator;
import org.wandora.topicmap.TopicTools;
import org.wandora.topicmap.TopicMap;
import org.wandora.application.Wandora;
import org.wandora.application.WandoraTool;
import org.wandora.topicmap.Topic;
import org.wandora.application.tools.extractors.*;
import org.wandora.application.tools.*;
import org.wandora.topicmap.*;
import org.wandora.application.*;
import org.wandora.*;
import org.wandora.utils.*;

import java.util.*;
import java.text.*;
import java.lang.*;
import java.io.*;
import java.net.*;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.util.PDFTextStripper;

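/**
 * Extractor tool that reads enrichment text documents (PDF, RTF, MS Office and
 * plain text files) and stores their content in a topic map for the FNG
 * collection browser.
 *
 * @author akivela
 */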
public class ExtractFNGTextEnrichment extends AbstractExtractor implements WandoraTool {
    protected Wandora admin = null;
    protected File keywordFile = null;
    private String defaultLang = "en";

    /** Creates a new instance of ExtractFNGTextEnrichment. */
    public ExtractFNGTextEnrichment() {
    }

    /**
     * Returns the name of this extractor tool.
     *
     * @return the constant name string of this tool
     */
    @Override
    public String getName() {
        return "FNG Enrichment Text Extractor";
    }

    /**
     * Returns a one-sentence description of what this extractor tool does.
     *
     * @return the constant description string of this tool
     */
    @Override
    public String getDescription() {
        return "Extract enrichment texts for FNG collection browser.";
    }

    /**
     * Looks up the text this extractor supplies for the given text slot.
     *
     * @param textType one of the text type constants handled below
     * @return the text for the requested slot, or an empty string when the
     *         given type is not one of the handled constants
     */
    @Override
    public String getGUIText(int textType) {
        String text = "";
        switch (textType) {
        // --- dialog and progress texts ---
        case SELECT_DIALOG_TITLE:
            text = "Select enrichment text file(s) or directories containing enrichment text files!";
            break;
        case POINT_START_URL_TEXT:
            text = "Where would you like to start the crawl?";
            break;
        case INFO_WAIT_WHILE_WORKING:
            text = "Wait while seeking enrichment text files!";
            break;

        // --- regular expression for accepted file names ---
        case FILE_PATTERN:
            text = ".*\\.(pdf|txt|rtf|doc)";
            break;

        // --- result summaries ---
        case DONE_FAILED:
            text = "Ready. No extractions! %1 enrichment text(s) and %2 other file(s) crawled!";
            break;
        case DONE_ONE:
            text = "Ready. Successful extraction! %1 enrichment text(s) and %2 other file(s) crawled!";
            break;
        case DONE_MANY:
            text = "Ready. Total %0 successful extractions! %1 enrichment text(s) and %2 other files crawled!";
            break;

        // --- log window title ---
        case LOG_TITLE:
            text = "Enrichment Text Extraction Log";
            break;

        default:
            break;
        }
        return text;
    }

    /**
     * Extracts an enrichment text topic from the document behind the given URL.
     *
     * A text topic typed as "tekstidokumentti" is created for the URL, the
     * cleaned external form of the URL is added as its subject identifier and
     * the document content is handed to
     * {@link #_extractTopicsFromStream(String, InputStream, TopicMap, Topic)}.
     * The connection is obtained through {@code admin.wandoraHttpAuthorizer}
     * when {@code admin} is set, otherwise it is opened directly.
     *
     * @param url      location of the document; {@code null} yields {@code false}
     * @param topicMap topic map receiving the created topics
     * @return {@code true} when the document was opened and passed on without
     *         an exception reaching this method, {@code false} otherwise. Note
     *         that the stream extraction logs its own failures, so {@code true}
     *         does not guarantee that any text was stored.
     * @throws Exception declared for interface compatibility; exceptions raised
     *                   inside the method body are caught and logged
     */
    public boolean _extractTopicsFrom(URL url, TopicMap topicMap) throws Exception {
        if (url == null) {
            return false;
        }

        InputStream in = null;
        try {
            Topic textType = createTopic(topicMap, "tekstidokumentti");
            Topic textTopic = createTopic(topicMap, url.getFile(), " (tekstidokumentti)", url.getFile(), textType);
            textTopic.addSubjectIdentifier(new Locator(TopicTools.cleanDirtyLocator(url.toExternalForm())));

            URLConnection uc = null;
            if (admin != null) {
                uc = admin.wandoraHttpAuthorizer.getAuthorizedAccess(url);
            } else {
                uc = url.openConnection();
                Wandora.initUrlConnection(uc);
            }
            in = uc.getInputStream();
            _extractTopicsFromStream(url.toExternalForm(), in, topicMap, textTopic);
            return true;
        } catch (Exception e) {
            log("Exception occurred while extracting from url\n" + url.toExternalForm(), e);
            takeNap(1000);
        } finally {
            // The stream was previously never closed, leaking the connection.
            if (in != null) {
                try {
                    in.close();
                } catch (IOException ignored) {
                    // Best effort: the content has already been processed (or the
                    // failure has already been logged above).
                }
            }
        }
        return false;
    }

    /**
     * String based extraction is not supported by this extractor.
     *
     * @param str      ignored
     * @param topicMap ignored
     * @return never returns normally
     * @throws Exception always, carrying STRING_EXTRACTOR_NOT_SUPPORTED_MESSAGE
     */
    public boolean _extractTopicsFrom(String str, TopicMap topicMap) throws Exception {
        final Exception notSupported = new Exception(STRING_EXTRACTOR_NOT_SUPPORTED_MESSAGE);
        throw notSupported;
    }

    /**
     * Extracts an enrichment text topic from the given file.
     *
     * A text topic typed as "tekstidokumentti" is created for the file name,
     * the cleaned file URL is added as its subject identifier, the file's last
     * modification date is stored as a "file-modified" occurrence (in "en" and
     * "fi") and the file content is handed to
     * {@link #_extractTopicsFromStream(String, InputStream, TopicMap, Topic)}.
     *
     * @param file     file to read; {@code null} or a directory yields {@code false}
     * @param topicMap topic map receiving the created topics
     * @return {@code true} when the file was opened and passed on without an
     *         exception reaching this method, {@code false} otherwise. Note
     *         that the stream extraction logs its own failures, so {@code true}
     *         does not guarantee that any text was stored.
     * @throws Exception declared for interface compatibility; exceptions raised
     *                   inside the method body are caught and logged
     */
    public boolean _extractTopicsFrom(File file, TopicMap topicMap) throws Exception {
        if (file == null || file.isDirectory()) {
            return false;
        }

        InputStream in = null;
        try {
            Topic textType = createTopic(topicMap, "tekstidokumentti");
            Topic textTopic = createTopic(topicMap, file.getName(), " (tekstidokumentti)", file.getName(),
                    textType);
            // NOTE(review): File.toURL() is deprecated because it does not escape
            // special characters. It is kept here on purpose: switching to
            // toURI().toURL() would change the subject identifiers generated for
            // such paths -- confirm before migrating.
            textTopic
                    .addSubjectIdentifier(new Locator(TopicTools.cleanDirtyLocator(file.toURL().toExternalForm())));

            // --- ADD LAST MODIFICATION TIME AS OCCURRENCE ---
            // Failure here is logged but does not prevent the text extraction below.
            try {
                Topic modType = createTopic(topicMap, "file-modified");
                String dateString = DateFormat.getDateInstance().format(new Date(file.lastModified()));
                setData(textTopic, modType, "en", dateString);
                setData(textTopic, modType, "fi", dateString);
            } catch (Exception e) {
                log("Exception occurred while setting enrichment topic's modification time!", e);
            }

            in = new FileInputStream(file);
            _extractTopicsFromStream(file.getPath(), in, topicMap, textTopic);
            return true;
        } catch (Exception e) {
            log("Exception occurred while extracting from file " + file.getName(), e);
            takeNap(1000);
        } finally {
            // The stream was previously never closed, leaking a file handle per file.
            if (in != null) {
                try {
                    in.close();
                } catch (IOException ignored) {
                    // Best effort: the content has already been processed (or the
                    // failure has already been logged above).
                }
            }
        }
        return false;
    }

    /**
     * Reads a document from the given stream and stores its text (and, for PDF
     * documents, selected metadata) in the given text topic.
     *
     * The document format is chosen by the lower-cased ending of
     * {@code locator}: "pdf", "rtf", an MS Office ending ("doc", "docx", "ppt",
     * "xls", "vsd"), or anything else, which is read as plain text. Any
     * exception is caught and logged; nothing is thrown to the caller.
     *
     * The stream is not closed here; it stays owned by the caller.
     *
     * @param locator     file path or URL of the document; used only to detect
     *                    the document format
     * @param inputStream stream providing the document content
     * @param topicMap    topic map receiving occurrence type topics
     * @param textTopic   topic receiving the extracted occurrences and names
     */
    public void _extractTopicsFromStream(String locator, InputStream inputStream, TopicMap topicMap,
            Topic textTopic) {
        try {
            String lowerCaseLocator = locator.toLowerCase();

            // --- HANDLE PDF ENRICHMENT TEXT ---
            if (lowerCaseLocator.endsWith("pdf")) {
                extractPdfEnrichment(inputStream, topicMap, textTopic);
            }

            // --- HANDLE RTF DOCUMENTS ---
            else if (lowerCaseLocator.endsWith("rtf")) {
                String content = Textbox.RTF2PlainText(inputStream);
                setTextEnrichment(textTopic, topicMap, content);
            }

            // --- HANDLE OFFICE DOCUMENTS ---
            // "xls" replaces the former "xsl", which was a typo: it sent XSLT
            // stylesheets to the Office reader and Excel files to the text reader.
            else if (lowerCaseLocator.endsWith("doc") || lowerCaseLocator.endsWith("docx")
                    || lowerCaseLocator.endsWith("ppt") || lowerCaseLocator.endsWith("xls")
                    || lowerCaseLocator.endsWith("vsd")) {
                String content = MSOfficeBox.getText(inputStream);
                if (content != null) {
                    setTextEnrichment(textTopic, topicMap, content);
                }
            }

            // --- HANDLE TXT DOCUMENTS ---
            else {
                // NOTE(review): the reader uses the platform default charset --
                // confirm whether a fixed encoding is expected for these files.
                String content = IObox.loadFile(new InputStreamReader(inputStream));
                setTextEnrichment(textTopic, topicMap, content);
            }
        } catch (Exception e) {
            log(e);
        }
    }

    /**
     * Loads a PDF document from the stream and stores its subject, title,
     * keywords and text content in the text topic.
     *
     * The document is read from the supplied stream instead of re-opening the
     * locator as a URL: a plain file path is not a valid URL, and re-opening a
     * URL would bypass the connection the caller already established.
     */
    private void extractPdfEnrichment(InputStream inputStream, TopicMap topicMap, Topic textTopic)
            throws Exception {
        PDDocument doc = PDDocument.load(inputStream);
        try {
            PDDocumentInformation info = doc.getDocumentInformation();
            setPdfMetadataOccurrence(textTopic, topicMap, "subject", info.getSubject());
            setPdfMetadataOccurrence(textTopic, topicMap, "title", info.getTitle());
            setPdfMetadataOccurrence(textTopic, topicMap, "keywords", info.getKeywords());

            // --- PDF TEXT CONTENT ---
            PDFTextStripper stripper = new PDFTextStripper();
            String content = stripper.getText(doc);
            setTextEnrichment(textTopic, topicMap, content);
        } finally {
            // Release the document even when metadata or text extraction fails.
            doc.close();
        }
    }

    /**
     * Stores a trimmed PDF metadata value as an occurrence of the given type
     * name in the default language; null and empty values are skipped.
     */
    private void setPdfMetadataOccurrence(Topic textTopic, TopicMap topicMap, String typeName, String value)
            throws Exception {
        if (value != null && value.length() > 0) {
            Topic occurrenceType = createTopic(topicMap, typeName);
            setData(textTopic, occurrenceType, defaultLang, value.trim());
        }
    }

    /**
     * Derives a short title from the beginning of the given text.
     *
     * The title is the first line of {@code content} (everything before the
     * first newline, or the whole text when there is none). A first line longer
     * than 80 characters is cut to 80 characters and then shortened to the last
     * space within that range, so the title does not end in the middle of a
     * word; when there is no space the hard 80 character cut is kept. The
     * result is finally passed through {@code Textbox.trimExtraSpaces}.
     *
     * @param content text to derive the title from; may be {@code null}
     * @return the title, or {@code null} when {@code content} is {@code null}
     *         or no non-empty title can be derived
     */
    public String solveTitle(String content) {
        if (content == null) {
            return null;
        }
        final int maxTitleLength = 80;

        // indexOf returns -1 when there is no newline; the whole text is then
        // the first line. (The former Math.max(80, indexOf) made substring throw
        // for every text shorter than 80 characters.)
        int lineEnd = content.indexOf('\n');
        String title = (lineEnd >= 0) ? content.substring(0, lineEnd) : content;

        if (title.length() > maxTitleLength) {
            title = title.substring(0, maxTitleLength);
            // Back off to a word boundary. lastIndexOf cannot run past the start
            // of the string the way the former character-by-character loop did.
            int lastSpace = title.lastIndexOf(' ');
            if (lastSpace > 0) {
                title = title.substring(0, lastSpace);
            }
        }

        title = Textbox.trimExtraSpaces(title);
        if (title == null || title.length() == 0) {
            return null;
        }
        return title;
    }

    /**
     * Stores the given text as the enrichment text of a text topic and derives
     * the topic's display name from it.
     *
     * The text is first passed through {@code Textbox.trimExtraSpaces}. When
     * the result is non-empty it is stored as a "teksti" occurrence for the
     * languages "en", "fi" and "se", and the title returned by
     * {@link #solveTitle(String)} (if any) is set as the display name for the
     * same languages. When the result is null or empty, nothing is stored. Any
     * exception is caught and logged.
     *
     * @param textTopic topic receiving the occurrence and display names
     * @param topicMap  topic map in which the occurrence type topic is created
     * @param content   raw text content of the document
     */
    public void setTextEnrichment(Topic textTopic, TopicMap topicMap, String content) {
        // NOTE(review): "se" is presumably meant as Swedish, whose ISO 639-1 code
        // is "sv"; it is left unchanged because existing data may rely on it.
        final String[] languages = { "en", "fi", "se" };
        try {
            String trimmedText = Textbox.trimExtraSpaces(content);
            if (trimmedText == null || trimmedText.length() == 0) {
                // No text to store. Returning here also avoids asking solveTitle
                // for a title of an empty text, which previously raised (and
                // logged) an exception for every empty document.
                return;
            }

            Topic contentType = createTopic(topicMap, "teksti");
            for (int i = 0; i < languages.length; i++) {
                setData(textTopic, contentType, languages[i], trimmedText);
            }

            String title = solveTitle(trimmedText);
            if (title != null) {
                for (int i = 0; i < languages.length; i++) {
                    textTopic.setDisplayName(languages[i], title);
                }
            }
        } catch (Exception e) {
            log(e);
        }
    }

    // -------------------------------------------------------------------------
    // -------------------------------------------------------------------------
    // -------------------------------------------------------------------------

    /**
     * MIME content types returned by {@link #getContentTypes()}: PDF, plain
     * text, RTF and MS Word.
     * NOTE(review): the array is public and mutable; callers must not modify it.
     */
    public static final String[] contentTypes = new String[] { "application/pdf", "text/plain", "application/rtf",
            "application/msword" };

    /**
     * Returns the MIME content types listed in {@link #contentTypes}.
     *
     * @return the shared {@link #contentTypes} array itself (not a copy)
     */
    public String[] getContentTypes() {
        return contentTypes;
    }
}