org.olat.search.service.document.file.PdfDocument.java Source code

Java tutorial

Introduction

Here is the source code for org.olat.search.service.document.file.PdfDocument.java

Source

/**
 * OLAT - Online Learning and Training<br>
 * http://www.olat.org
 * <p>
 * Licensed under the Apache License, Version 2.0 (the "License"); <br>
 * you may not use this file except in compliance with the License.<br>
 * You may obtain a copy of the License at
 * <p>
 * http://www.apache.org/licenses/LICENSE-2.0
 * <p>
 * Unless required by applicable law or agreed to in writing,<br>
 * software distributed under the License is distributed on an "AS IS" BASIS, <br>
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. <br>
 * See the License for the specific language governing permissions and <br>
 * limitations under the License.
 * <p>
 * Copyright (c) since 2004 at Multimedia- & E-Learning Services (MELS),<br>
 * University of Zurich, Switzerland.
 * <p>
 */

package org.olat.search.service.document.file;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;

import org.apache.lucene.document.Document;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
import org.olat.core.logging.OLog;
import org.olat.core.logging.Tracing;
import org.olat.core.util.FileUtils;
import org.olat.core.util.vfs.VFSLeaf;
import org.olat.search.service.SearchResourceContext;
import org.olat.search.service.SearchServiceFactory;

/**
 * Lucene document mapper for PDF files.
 * <p>
 * The text of a PDF is extracted with PDFBox. When text buffering is enabled in the search module
 * configuration, the extracted text is additionally written to a file below the configured buffer
 * path and is read from there on later calls, as long as the PDF has not been modified after the
 * buffer file was written.
 * 
 * @author Christian Guretzki
 */
public class PdfDocument extends FileDocument {
    private static final OLog log = Tracing.createLoggerFor(PdfDocument.class);

    /** File-type key set on every document created by this mapper. */
    public final static String FILE_TYPE = "type.file.pdf";

    /** Whether extracted text is stored in / read from the buffer directory. */
    private final boolean pdfTextBuffering;

    /** Root directory of the extracted-text buffer files. */
    private final String pdfTextBufferPath;

    /** Path of this document's buffer file, relative to {@link #pdfTextBufferPath} (without the ".tmp" suffix). */
    private String filePath;

    /**
     * Creates a mapper and reads the buffering settings from the search module configuration.
     */
    public PdfDocument() {
        super();
        pdfTextBuffering = SearchServiceFactory.getService().getSearchModuleConfig().getPdfTextBuffering();
        pdfTextBufferPath = SearchServiceFactory.getService().getSearchModuleConfig().getPdfTextBufferPath();
    }

    /**
     * Builds the Lucene document for a PDF leaf.
     * 
     * @param leafResourceContext search context of the leaf; its resource URL and file path
     *            determine the location of the text buffer file
     * @param leaf the PDF file to index
     * @return the Lucene document produced by the initialised mapper
     * @throws IOException declared for callers; passed through from the initialisation
     * @throws DocumentException if the PDF content can not be read
     * @throws DocumentAccessException if the PDF is encrypted and can not be decrypted
     */
    public static Document createDocument(final SearchResourceContext leafResourceContext, final VFSLeaf leaf)
            throws IOException, DocumentException, DocumentAccessException {
        final PdfDocument textDocument = new PdfDocument();
        // The buffer path must be known before init(), because init() is where the content is read.
        // NOTE(review): assumes FileDocument.init() invokes readContent() - confirm in FileDocument.
        textDocument.setFilePath(getPdfTextTmpFilePath(leafResourceContext));
        textDocument.init(leafResourceContext, leaf);
        textDocument.setFileType(FILE_TYPE);
        textDocument.setCssIcon("b_filetype_pdf");
        if (log.isDebug()) {
            log.debug(textDocument.toString());
        }
        return textDocument.getLuceneDocument();
    }

    private void setFilePath(final String filePath2) {
        this.filePath = filePath2;
    }

    /**
     * Create a file-path for certain SearchResourceContext. E.g. '04\1601914104anuale_print.pdf'
     * <p>
     * The first path element is built from the last two characters of the absolute hash code of
     * the resource URL, so that buffer files are spread over several sub-directories.
     */
    private static String getPdfTextTmpFilePath(final SearchResourceContext leafResourceContext) {
        // Math.abs(Integer.MIN_VALUE) stays negative; the resulting "-2147483648" still yields a
        // usable path, so it is deliberately left as is to keep existing buffer file locations.
        final int hashCode = Math.abs(leafResourceContext.getResourceUrl().hashCode());
        final String hashCodeAsString = Integer.toString(hashCode);
        final String splitDirName;
        if (hashCodeAsString.length() < 2) {
            // Single-digit hash codes (0-9) have no "last two characters"; pad instead of failing
            // with a StringIndexOutOfBoundsException.
            splitDirName = "0" + hashCodeAsString;
        } else {
            splitDirName = hashCodeAsString.substring(hashCodeAsString.length() - 2);
        }
        final String pdfTextTmpFilePath = splitDirName + File.separator + hashCodeAsString
                + leafResourceContext.getFilePath();
        if (log.isDebug()) {
            log.debug("PdfTextTmpFilePath=" + pdfTextTmpFilePath);
        }
        return pdfTextTmpFilePath;
    }

    /**
     * Returns the text of the PDF, either from the text buffer file or by extracting it from the
     * PDF itself.
     * 
     * @param leaf the PDF file
     * @return the extracted text
     * @throws DocumentAccessException if the PDF is encrypted and can not be decrypted
     * @throws DocumentException on any other failure while reading or buffering the text
     */
    @Override
    protected String readContent(final VFSLeaf leaf) throws DocumentException, DocumentAccessException {
        try {
            long startTime = 0;
            if (log.isDebug()) {
                startTime = System.currentTimeMillis();
            }
            String pdfText = null;
            final String fullPdfTextTmpFilePath = pdfTextBufferPath + File.separator + getFilePath() + ".tmp";
            final File pdfTextFile = new File(fullPdfTextTmpFilePath);
            if (pdfTextBuffering && !isNewPdfFile(leaf, pdfTextFile)) {
                // text file with extracted text exist => read pdf text from there
                pdfText = getPdfTextFromBuffer(pdfTextFile);
            } else {
                // no text file with extracted text exist => extract text from pdf
                pdfText = extractTextFromPdf(leaf);
                if (pdfTextBuffering) {
                    // store extracted pdf-text in the buffer file for later calls
                    storePdfTextInBuffer(pdfText, pdfTextFile);
                }
                if (log.isDebug()) {
                    log.debug("readContent from pdf done.");
                }
            }
            if (log.isDebug()) {
                final long time = System.currentTimeMillis() - startTime;
                log.debug("readContent time=" + time);
            }
            return pdfText;
        } catch (final DocumentAccessException ex) {
            // pass the original exception on unchanged, so that its stack trace is not lost
            throw ex;
        } catch (final Exception ex) {
            throw new DocumentException("Can not read PDF content. File=" + leaf.getName() + ";" + ex.getMessage());
        }
    }

    /**
     * Writes the extracted text UTF-8 encoded to the buffer file and creates the parent
     * directories of that file when they are missing.
     * 
     * @param pdfText the extracted text
     * @param pdfTextFile the buffer file to write
     * @throws IOException if the directory can not be created or the file can not be opened
     */
    private void storePdfTextInBuffer(final String pdfText, final File pdfTextFile) throws IOException {
        // Ask the File for its parent instead of searching for '/', which is not the path
        // separator on every platform.
        final File parentDir = pdfTextFile.getParentFile();
        // mkdirs() also returns false when the directory already exists (e.g. created by a
        // concurrent indexer), hence the additional isDirectory() check.
        if (parentDir != null && !parentDir.mkdirs() && !parentDir.isDirectory()) {
            throw new IOException("Can not create pdf-text buffer directory=" + parentDir.getAbsolutePath());
        }
        final FileOutputStream out = new FileOutputStream(pdfTextFile);
        try {
            FileUtils.save(out, pdfText, "utf-8");
        } finally {
            // Closing an already closed FileOutputStream has no effect, so this is safe whether
            // or not FileUtils.save closes the stream itself.
            out.close();
        }
    }

    /**
     * Extracts the text of the PDF with PDFBox. Encrypted documents are decrypted with the empty
     * password first.
     * 
     * @param leaf the PDF file
     * @return the text of the PDF
     * @throws IOException if the PDF can not be loaded or parsed
     * @throws DocumentAccessException if the PDF is encrypted and decryption fails
     */
    private String extractTextFromPdf(final VFSLeaf leaf) throws IOException, DocumentAccessException {
        if (log.isDebug()) {
            log.debug("readContent from pdf starts...");
        }
        final BufferedInputStream bis = new BufferedInputStream(leaf.getInputStream());
        try {
            final PDDocument document = PDDocument.load(bis);
            try {
                if (document.isEncrypted()) {
                    try {
                        document.decrypt("");
                    } catch (final Exception e) {
                        throw new DocumentAccessException(
                                "PDF is encrypted. Can not read content file=" + leaf.getName());
                    }
                }
                if (log.isDebug()) {
                    log.debug("readContent PDDocument loaded");
                }
                final PDFTextStripper stripper = new PDFTextStripper();
                return stripper.getText(document);
            } finally {
                document.close();
            }
        } finally {
            // Nested finally: the stream is closed even when closing the document fails.
            bis.close();
        }
    }

    /**
     * Reads the previously extracted text from the buffer file (UTF-8 encoded).
     * 
     * @param pdfTextFile the buffer file
     * @return the buffered text
     * @throws IOException if the buffer file can not be opened or closed
     */
    private String getPdfTextFromBuffer(final File pdfTextFile) throws IOException {
        if (log.isDebug()) {
            log.debug("readContent from text file start...");
        }
        final BufferedInputStream bis = new BufferedInputStream(new FileInputStream(pdfTextFile));
        try {
            final String pdfText = FileUtils.load(bis, "utf-8");
            if (log.isDebug()) {
                log.debug("readContent from text file done.");
            }
            return pdfText;
        } finally {
            bis.close();
        }
    }

    private String getFilePath() {
        return filePath;
    }

    /**
     * Decides whether the text has to be extracted from the PDF again.
     * <p>
     * Side effect: a buffer file that is older than the PDF is deleted.
     * 
     * @param leaf the PDF file
     * @param pdfTextFile the buffer file belonging to the PDF
     * @return <code>true</code> if there is no buffer file or the PDF was modified after the
     *         buffer file was written, <code>false</code> if the buffer file can be used
     */
    private boolean isNewPdfFile(final VFSLeaf leaf, final File pdfTextFile) {
        if (pdfTextFile == null) {
            return true;
        }
        if (!pdfTextFile.exists()) {
            return true;
        }
        if (leaf.getLastModified() > pdfTextFile.lastModified()) {
            // pdf file is newer than the buffered text => delete the outdated buffer file
            pdfTextFile.delete();
            return true;
        }
        return false;
    }

}