org.nuxeo.ecm.platform.convert.tests.DocumentUTUtils.java Source code

Java tutorial

Introduction

Here is the source code for org.nuxeo.ecm.platform.convert.tests.DocumentUTUtils.java

Source

//
// Copyright (C) 2004-2006 - Mirko Nasato <mirko@artofsolving.com>
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
// Lesser General Public License for more details.
// http://www.gnu.org/copyleft/lesser.html
//
// JODConverter - The Open Source Java/OpenOffice Document Converter
//
package org.nuxeo.ecm.platform.convert.tests;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.StringWriter;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;

/**
 * Static helpers used by the conversion tests to read back the textual
 * content of files (PDF documents and UTF-8 encoded text files).
 */
public final class DocumentUTUtils {

    /** Unicode byte order mark (U+FEFF) as it appears once the stream is decoded. */
    private static final int BYTE_ORDER_MARK_CHAR = 0xFEFF;

    /** Size, in chars, of the buffer used when reading text files. */
    private static final int BUFFER_SIZE = 2048;

    /**
     * Utility class holding only static methods; not meant to be instantiated.
     */
    private DocumentUTUtils() {
    }

    /**
     * Extracts the text from a PDF file.
     *
     * @param pdfFile the PDF file to read
     * @return the document content as plain text, with leading and trailing
     *         whitespace removed
     * @throws IOException if the file cannot be loaded or its text cannot be
     *             extracted
     */
    public static String readPdfText(File pdfFile) throws IOException {
        PDFTextStripper textStripper = new PDFTextStripper();
        PDDocument document = PDDocument.load(pdfFile);
        try {
            return textStripper.getText(document).trim();
        } finally {
            // Always release the document, even when text extraction fails.
            document.close();
        }
    }

    /**
     * Reads a whole file as UTF-8 text.
     * <p>
     * A leading byte order mark, if present, is dropped, and the result is
     * trimmed of leading and trailing whitespace.
     *
     * @param file the file to read
     * @return the trimmed content of the file
     * @throws IOException if the file cannot be opened or read
     */
    public static String readContent(File file) throws IOException {
        FileInputStream input = new FileInputStream(file);
        try {
            InputStreamReader reader = new InputStreamReader(input, "UTF-8");
            StringWriter writer = new StringWriter();
            char[] buffer = new char[BUFFER_SIZE];
            int length;
            while ((length = reader.read(buffer, 0, buffer.length)) != -1) {
                writer.write(buffer, 0, length);
            }
            return stripByteOrderMarkChar(writer.toString()).trim();
        } finally {
            // Closing the underlying stream releases the file handle whether
            // or not reading succeeded.
            input.close();
        }
    }

    /**
     * Removes the first character of {@code content} when it is a byte order
     * mark; otherwise returns {@code content} unchanged.
     */
    private static String stripByteOrderMarkChar(String content) {
        if (content.length() > 0 && content.charAt(0) == BYTE_ORDER_MARK_CHAR) {
            return content.substring(1);
        }
        return content;
    }

}