org.wandora.application.tools.extractors.files.SimplePDFExtractor.java Source code

Introduction

Here is the source code for org.wandora.application.tools.extractors.files.SimplePDFExtractor.java
Source

/*
 * WANDORA
 * Knowledge Extraction, Management, and Publishing Application
 * http://wandora.org
 * 
 * Copyright (C) 2004-2016 Wandora Team
 * 
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 * 
 * SimplePDFExtractor.java
 *
 * Created on 9 June 2006, 15:08
 *
 */

package org.wandora.application.tools.extractors.files;

import org.wandora.utils.Textbox;
import java.net.*;
import java.io.*;
import java.util.*;
import java.text.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.swing.Icon;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;

import org.apache.pdfbox.util.*;

import org.wandora.topicmap.*;
import org.wandora.application.*;
import org.wandora.application.gui.UIBox;
import org.wandora.application.tools.browserextractors.BrowserExtractRequest;
import org.wandora.application.tools.browserextractors.BrowserPluginExtractor;
import org.wandora.application.tools.extractors.AbstractExtractor;
import org.wandora.application.tools.extractors.ExtractHelper;
import org.wandora.utils.*;

/**
 * Extractor that converts PDF documents into topics.
 *
 * <p>For every PDF a topic typed as "PDF resource" is created. The document's
 * metadata (producer, creator, author, subject, title, dates, keywords) and its
 * text content are attached to that topic as occurrences and associations.
 * The extractor accepts local files and URLs, and it can also be driven by the
 * browser plugin, in which case quoted links ending with ".pdf" are harvested
 * from the page content.</p>
 *
 * @author akivela
 */
public class SimplePDFExtractor extends AbstractExtractor {
    /** Pattern used for every date occurrence written by this extractor. */
    protected static String DEFAULT_DATE_FORMAT = "yyyy-MM-dd HH:mm:ss";

    /** Matches a double-quoted string ending with ".pdf"; group 1 is the text between the quotes. */
    private static final Pattern QUOTED_PDF_LINK = Pattern.compile("\"([^\"]+?\\.pdf)\"");

    /** When true, each page becomes its own topic instead of one text occurrence for the whole document. */
    public boolean makePageTopics = false;

    /** When true, the PDF title becomes the display name of the topic; otherwise it is stored as an occurrence. */
    public boolean makeVariantFromTitle = true;

    /** Language used for all occurrences and display names created here. */
    private String defaultLang = "en";

    public SimplePDFExtractor() {
    }

    @Override
    public String getName() {
        return "Simple PDF extractor...";
    }

    @Override
    public String getDescription() {
        return "Extracts text and metadata from PDF files.";
    }

    @Override
    public Icon getIcon() {
        return UIBox.getIcon(0xf1c1);
    }

    @Override
    public int getExtractorType() {
        return FILE_EXTRACTOR | URL_EXTRACTOR;
    }

    /**
     * Returns the user interface text for the given text type.
     *
     * @param textType one of the text type constants handled in the switch below
     * @return the text for the type, or an empty string for an unhandled type
     */
    @Override
    public String getGUIText(int textType) {
        switch (textType) {
        case SELECT_DIALOG_TITLE:
            return "Select PDF file(s) or directories containing PDF files!";
        case POINT_START_URL_TEXT:
            return "Where would you like to start the crawl?";
        case INFO_WAIT_WHILE_WORKING:
            return "Wait while seeking PDF files!";

        case FILE_PATTERN:
            return ".*\\.(pdf|PDF)";

        case DONE_FAILED:
            return "Ready. No extractions! %1 pdf file(s) crawled!";
        case DONE_ONE:
            return "Ready. Successful extraction. %1 pdf file(s) crawled!";
        case DONE_MANY:
            return "Ready. Total %0 successful extractions. %1 pdf files crawled!";

        case LOG_TITLE:
            return "Simple PDF Extraction Log";
        default:
            return "";
        }
    }

    /**
     * Handles an extraction request coming from the browser plugin.
     *
     * <p>If the request source ends with ".pdf" it is extracted directly.
     * Otherwise the selection, the page content or, as a last resort, the
     * content read from the source URL is scanned for double-quoted strings
     * ending with ".pdf", and each of them is extracted. Relative links are
     * resolved against the request source when that source is a valid URL.</p>
     *
     * @param request the browser request
     * @param wandora the application instance whose topic map receives the topics
     * @return {@code null} on success, otherwise a message prefixed with
     *         {@link BrowserPluginExtractor#RETURN_ERROR}
     * @throws TopicMapException declared by the overridden method
     */
    @Override
    public String doBrowserExtract(BrowserExtractRequest request, Wandora wandora) throws TopicMapException {
        try {
            setWandora(wandora);
            String url = request.getSource();
            TopicMap tm = wandora.getTopicMap();
            if (url != null && url.endsWith(".pdf")) {
                _extractTopicsFrom(new URL(url), tm);
                wandora.doRefresh();
                return null;
            }

            String content = request.getSelection();
            if (content == null) {
                content = request.getContent();
            }
            if (content == null && url != null) {
                try {
                    System.out.println("Found no content. Reading the url content.");
                    content = IObox.doUrl(new URL(url));
                } catch (Exception e) {
                    log("Exception occurred while reading url content\n" + url, e);
                }
            }

            if (content == null) {
                return BrowserPluginExtractor.RETURN_ERROR
                        + "Couldn't solve browser extractor content. Nothing extracted.";
            }

            System.out.println("--- browser plugin processing content ---");
            System.out.println(content);

            List<String> pdfLinks = new ArrayList<String>();
            Matcher matcher = QUOTED_PDF_LINK.matcher(content);
            while (matcher.find()) {
                pdfLinks.add(matcher.group(1));
            }

            // Links found in a page are frequently relative. Resolve them against
            // the page URL when one is available. A null context makes
            // URL(URL, String) parse the link as an absolute URL.
            URL base = null;
            if (url != null) {
                try {
                    base = new URL(url);
                } catch (MalformedURLException ignored) {
                    // The source is not a usable URL; only absolute links can be extracted.
                    base = null;
                }
            }

            for (String link : pdfLinks) {
                System.out.println("Extracting pdf url: " + link);
                _extractTopicsFrom(new URL(base, link), tm);
            }
            wandora.doRefresh();
            return null;
        } catch (Exception e) {
            e.printStackTrace();
            return BrowserPluginExtractor.RETURN_ERROR + e.getMessage();
        }
    }

    // -------------------------------------------------------------------------

    /**
     * Extracts a PDF document located at the given URL.
     *
     * <p>The connection is opened through the application's HTTP authorizer when
     * an application instance is available. The opened stream is handed to
     * {@link #_extractTopicsFromStream} and closed afterwards.</p>
     *
     * @param url      location of the PDF document; {@code null} yields {@code false}
     * @param topicMap topic map receiving the extracted topics
     * @return {@code true} when the method ran to completion, {@code false} if
     *         the url was {@code null} or an exception was logged
     * @throws Exception declared by the overridden method
     */
    @Override
    public boolean _extractTopicsFrom(URL url, TopicMap topicMap) throws Exception {
        if (url == null) {
            return false;
        }

        try {
            Topic pdfType = createPDFTypeTopic(topicMap);
            String location = url.toExternalForm();
            long hash = location.hashCode();
            String urlfile = url.getFile();
            String name = urlfile;
            int lastSlash = urlfile.lastIndexOf("/");
            if (lastSlash > -1) {
                name = urlfile.substring(lastSlash + 1);
            }
            Topic pdfTopic = createTopic(topicMap, location, " (" + hash + ")", name, pdfType);
            pdfTopic.setSubjectLocator(new Locator(location));

            URLConnection uc = null;
            if (getWandora() != null) {
                uc = getWandora().wandoraHttpAuthorizer.getAuthorizedAccess(url);
            } else {
                uc = url.openConnection();
                Wandora.initUrlConnection(uc);
            }

            InputStream in = uc.getInputStream();
            try {
                _extractTopicsFromStream(location, in, topicMap, pdfTopic);
            } finally {
                if (in != null) {
                    in.close();
                }
            }

            addExtractionTimeOccurrence(topicMap, pdfTopic);
            return true;
        } catch (Exception e) {
            log("Exception occurred while extracting from url\n" + url.toExternalForm(), e);
            takeNap(1000);
        }
        return false;
    }

    /**
     * String extraction is not supported by this extractor.
     *
     * @throws Exception always, carrying {@code STRING_EXTRACTOR_NOT_SUPPORTED_MESSAGE}
     */
    @Override
    public boolean _extractTopicsFrom(String str, TopicMap topicMap) throws Exception {
        throw (new Exception(STRING_EXTRACTOR_NOT_SUPPORTED_MESSAGE));
    }

    /**
     * Extracts a PDF document stored in a local file.
     *
     * <p>Besides the PDF metadata and text, the file's last modification time
     * and size are stored as occurrences. A failure in either of those two steps
     * is logged and does not abort the extraction.</p>
     *
     * @param file     the PDF file; {@code null} or a directory yields {@code false}
     * @param topicMap topic map receiving the extracted topics
     * @return {@code true} when the method ran to completion, {@code false} if
     *         the file was rejected or an exception was logged
     * @throws Exception declared by the overridden method
     */
    @Override
    public boolean _extractTopicsFrom(File file, TopicMap topicMap) throws Exception {
        if (file == null || file.isDirectory()) {
            return false;
        }

        try {
            Topic pdfType = createPDFTypeTopic(topicMap);
            String location = file.toURI().toURL().toExternalForm();
            long hash = location.hashCode();
            Topic pdfTopic = createTopic(topicMap, location, " (" + hash + ")", file.getName(), pdfType);
            pdfTopic.setSubjectLocator(new Locator(location));

            // --- ADD LAST MODIFICATION TIME AS OCCURRENCE ---
            try {
                DateFormat dateFormatter = new SimpleDateFormat(DEFAULT_DATE_FORMAT);
                Topic modType = createTopic(topicMap, "file-modified");
                String dateString = dateFormatter.format(new Date(file.lastModified()));
                setData(pdfTopic, modType, defaultLang, dateString);
            } catch (Exception e) {
                log("Exception occurred while setting file topic's modification time!", e);
            }

            // --- ADD FILE SIZE AS OCCURRENCE ---
            try {
                Topic sizeType = createTopic(topicMap, "file-size");
                setData(pdfTopic, sizeType, defaultLang, "" + file.length());
            } catch (Exception e) {
                log("Exception occurred while setting file topic's file size!", e);
            }

            FileInputStream fis = new FileInputStream(file);
            try {
                _extractTopicsFromStream(file.getPath(), fis, topicMap, pdfTopic);
            } finally {
                fis.close();
            }

            addExtractionTimeOccurrence(topicMap, pdfTopic);
            return true;
        } catch (Exception e) {
            log("Exception occurred while extracting from file " + file.getName(), e);
            takeNap(1000);
        }
        return false;
    }

    /**
     * Reads a PDF document and attaches its metadata and text to the given topic.
     *
     * <p>The document is read from {@code inputStream} when one is given. Only
     * when the stream is {@code null} is the document loaded from
     * {@code locator}, which is then interpreted as an http(s) URL or a file
     * path. The stream is not closed by this method; that remains the caller's
     * responsibility. Any exception is logged and not propagated.</p>
     *
     * @param locator     URL or file path of the document, used for loading only
     *                    when no stream is given, and for log messages
     * @param inputStream stream positioned at the start of the PDF data, or {@code null}
     * @param topicMap    topic map receiving the extracted topics
     * @param pdfTopic    topic representing the PDF document
     */
    public void _extractTopicsFromStream(String locator, InputStream inputStream, TopicMap topicMap,
            Topic pdfTopic) {
        PDDocument doc = null;
        try {
            doc = loadPdfDocument(locator, inputStream);
            PDDocumentInformation info = doc.getDocumentInformation();
            DateFormat dateFormatter = new SimpleDateFormat(DEFAULT_DATE_FORMAT);

            // --- PDF PRODUCER ---
            addPdfOccurrence(topicMap, pdfTopic, "pdf-producer", info.getProducer());

            // --- PDF MODIFICATION DATE ---
            Calendar modified = info.getModificationDate();
            if (modified != null) {
                addPdfOccurrence(topicMap, pdfTopic, "pdf-modification-date",
                        dateFormatter.format(modified.getTime()));
            }

            // --- PDF CREATOR ---
            addPdfOccurrence(topicMap, pdfTopic, "pdf-creator", info.getCreator());

            // --- PDF CREATION DATE ---
            Calendar created = info.getCreationDate();
            if (created != null) {
                addPdfOccurrence(topicMap, pdfTopic, "pdf-creation-date",
                        dateFormatter.format(created.getTime()));
            }

            // --- PDF AUTHOR ---
            addPdfOccurrence(topicMap, pdfTopic, "pdf-author", info.getAuthor());

            // --- PDF SUBJECT ---
            addPdfOccurrence(topicMap, pdfTopic, "pdf-subject", info.getSubject());

            // --- PDF TITLE ---
            String title = info.getTitle();
            if (title != null && title.length() > 0) {
                if (makeVariantFromTitle) {
                    pdfTopic.setDisplayName(defaultLang, title);
                } else {
                    addPdfOccurrence(topicMap, pdfTopic, "pdf-title", title);
                }
            }

            // --- PDF KEYWORDS (SEPARATED WITH SEMICOLON) ---
            String keywords = info.getKeywords();
            if (keywords != null && keywords.length() > 0) {
                Topic keywordType = createTopic(topicMap, "pdf-keyword");
                String[] keywordArray = keywords.split(";");
                for (int i = 0; i < keywordArray.length; i++) {
                    String keyword = Textbox.trimExtraSpaces(keywordArray[i]);
                    if (keyword != null && keyword.length() > 0) {
                        Topic keywordTopic = createTopic(topicMap, keyword, keywordType);
                        createAssociation(topicMap, keywordType, new Topic[] { pdfTopic, keywordTopic });
                    }
                }
            }

            // --- PDF TEXT CONTENT ---
            PDFTextStripper stripper = new PDFTextStripper();
            if (makePageTopics) {
                int pages = doc.getNumberOfPages();
                // PDFTextStripper numbers pages from 1, so iterate 1..pages inclusive.
                for (int page = 1; page <= pages; page++) {
                    stripper.setStartPage(page);
                    stripper.setEndPage(page);
                    String pageContent = stripper.getText(doc);
                    Topic pageType = createTopic(topicMap, "pdf-page");
                    Topic pageTopic = createTopic(topicMap, pdfTopic.getBaseName() + " (page " + page + ")",
                            pageType);
                    Topic orderType = createTopic(topicMap, "order");
                    Topic orderTopic = createTopic(topicMap, page + ".", orderType);
                    Topic contentType = createTopic(topicMap, "pdf-text");
                    setData(pageTopic, contentType, defaultLang, pageContent.trim());
                    createAssociation(topicMap, pageType, new Topic[] { pdfTopic, pageTopic, orderTopic });
                }
            } else {
                addPdfOccurrence(topicMap, pdfTopic, "pdf-text", stripper.getText(doc));
            }
        } catch (Exception e) {
            log("Exception occurred while extracting PDF content from " + locator, e);
        } finally {
            closePdfDocument(doc, locator);
        }
    }

    /**
     * Opens the PDF document, preferring the already opened stream over the locator.
     */
    private PDDocument loadPdfDocument(String locator, InputStream inputStream) throws IOException {
        if (inputStream != null) {
            return PDDocument.load(inputStream);
        }
        if (locator == null) {
            throw new IOException("Neither an input stream nor a locator was given for the PDF document.");
        }
        if (locator.startsWith("http://") || locator.startsWith("https://")) {
            return PDDocument.load(new URL(locator));
        }
        return PDDocument.load(new File(locator));
    }

    /**
     * Closes the document if it was opened; a failure to close is logged only.
     */
    private void closePdfDocument(PDDocument doc, String locator) {
        if (doc == null) {
            return;
        }
        try {
            doc.close();
        } catch (Exception closeFailure) {
            log("Exception occurred while closing PDF document " + locator, closeFailure);
        }
    }

    /**
     * Stores the trimmed value as an occurrence of the named type, skipping null and empty values.
     */
    private void addPdfOccurrence(TopicMap topicMap, Topic pdfTopic, String typeName, String value)
            throws Exception {
        if (value != null && value.length() > 0) {
            Topic occurrenceType = createTopic(topicMap, typeName);
            setData(pdfTopic, occurrenceType, defaultLang, value.trim());
        }
    }

    /**
     * Stores the current time as the "extraction-time" occurrence of the topic.
     */
    private void addExtractionTimeOccurrence(TopicMap topicMap, Topic pdfTopic) throws Exception {
        DateFormat dateFormatter = new SimpleDateFormat(DEFAULT_DATE_FORMAT);
        Topic extractionTimeType = createTopic(topicMap, "extraction-time");
        String dateString = dateFormatter.format(new Date(System.currentTimeMillis()));
        setData(pdfTopic, extractionTimeType, defaultLang, dateString);
    }

    // -------------------------------------------------------------------------

    /** Content types this extractor accepts. */
    public static final String[] contentTypes = new String[] { "application/pdf" };

    @Override
    public String[] getContentTypes() {
        return contentTypes;
    }

    /**
     * Builds a subject identifier from the given string.
     *
     * <p>Strings starting with "http://", "https://" or "file:/" are used as
     * such. Anything else, after removing one leading slash, is URL encoded and
     * appended to "http://wandora.org/si/pdf/". A {@code null} argument is
     * replaced with a string built from the current time and a random number.</p>
     *
     * @param siend the identifier or its tail, may be {@code null}
     * @return the locator built from the argument
     */
    @Override
    public Locator buildSI(String siend) {
        if (siend == null) {
            siend = "" + System.currentTimeMillis() + Math.random() * 999999;
        }
        if (siend.startsWith("http://") || siend.startsWith("https://") || siend.startsWith("file:/")) {
            return new Locator(siend);
        }
        if (siend.startsWith("/")) {
            siend = siend.substring(1);
        }
        return new Locator("http://wandora.org/si/pdf/" + urlEncode(siend));
    }

    // -------------------------------------------------------------------------

    /**
     * Creates the "PDF resource" type topic and makes it a subclass of the Wandora class topic.
     *
     * @param tm topic map in which the topic is created
     * @return the "PDF resource" topic
     * @throws TopicMapException if the topic map operation fails
     */
    public Topic createPDFTypeTopic(TopicMap tm) throws TopicMapException {
        Topic t = createTopic(tm, "PDF resource");
        Topic w = getWandoraClass(tm);
        makeSubclassOf(tm, t, w);
        return t;
    }

    /**
     * Creates the "Wandora class" topic identified by {@code TMBox.WANDORACLASS_SI}.
     *
     * @param tm topic map in which the topic is created
     * @return the Wandora class topic
     * @throws TopicMapException if the topic map operation fails
     */
    public Topic getWandoraClass(TopicMap tm) throws TopicMapException {
        return createTopic(tm, TMBox.WANDORACLASS_SI, "Wandora class");
    }

    /**
     * Delegates to {@code ExtractHelper.makeSubclassOf} to relate the topic to its superclass.
     */
    protected void makeSubclassOf(TopicMap tm, Topic t, Topic superclass) throws TopicMapException {
        ExtractHelper.makeSubclassOf(t, superclass, tm);
    }
}