org.apache.james.mailbox.store.search.PDFTextExtractor.java Source code

Introduction

Here is the source code for org.apache.james.mailbox.store.search.PDFTextExtractor.java
Source

/****************************************************************
 * Licensed to the Apache Software Foundation (ASF) under one   *
 * or more contributor license agreements.  See the NOTICE file *
 * distributed with this work for additional information        *
 * regarding copyright ownership.  The ASF licenses this file   *
 * to you under the Apache License, Version 2.0 (the            *
 * "License"); you may not use this file except in compliance   *
 * with the License.  You may obtain a copy of the License at   *
 *                                                              *
 *   http://www.apache.org/licenses/LICENSE-2.0                 *
 *                                                              *
 * Unless required by applicable law or agreed to in writing,   *
 * software distributed under the License is distributed on an  *
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY       *
 * KIND, either express or implied.  See the License for the    *
 * specific language governing permissions and limitations      *
 * under the License.                                           *
 ****************************************************************/
package org.apache.james.mailbox.store.search;

import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.Optional;

import org.apache.commons.io.IOUtils;
import org.apache.james.mailbox.extractor.ParsedContent;
import org.apache.james.mailbox.extractor.TextExtractor;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;

import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableMap;

public class PDFTextExtractor implements TextExtractor {

    static final String PDF_TYPE = "application/pdf";

    @Override
    public ParsedContent extractContent(InputStream inputStream, String contentType) throws Exception {
        Preconditions.checkNotNull(inputStream);
        Preconditions.checkNotNull(contentType);

        if (isPDF(contentType)) {
            return extractTextFromPDF(inputStream);
        }
        return new ParsedContent(Optional.ofNullable(IOUtils.toString(inputStream, StandardCharsets.UTF_8)),
                ImmutableMap.of());
    }

    private boolean isPDF(String contentType) {
        return contentType.equals(PDF_TYPE);
    }

    private ParsedContent extractTextFromPDF(InputStream inputStream) throws IOException {
        return new ParsedContent(Optional.ofNullable(new PDFTextStripper().getText(PDDocument.load(inputStream))),
                ImmutableMap.of());
    }
}