net.sourceforge.docfetcher.model.parse.OpenOfficeParser.java Source code

Introduction

Here is the source code for net.sourceforge.docfetcher.model.parse.OpenOfficeParser.java
Source

/*******************************************************************************
 * Copyright (c) 2011 Tran Nam Quang.
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Eclipse Public License v1.0
 * which accompanies this distribution, and is available at
 * http://www.eclipse.org/legal/epl-v10.html
 *
 * Contributors:
 *    Tran Nam Quang - initial API and implementation
 *******************************************************************************/

package net.sourceforge.docfetcher.model.parse;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Enumeration;
import java.util.List;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;

import org.aspectj.lang.annotation.SuppressAjWarnings;

import net.htmlparser.jericho.CharacterReference;
import net.htmlparser.jericho.Element;
import net.htmlparser.jericho.Source;
import net.htmlparser.jericho.StartTag;
import net.sourceforge.docfetcher.enums.Msg;
import net.sourceforge.docfetcher.util.annotations.NotNull;
import net.sourceforge.docfetcher.util.annotations.Nullable;

import com.catcode.odf.OpenDocumentTextInputStream;
import com.google.common.io.CharStreams;
import com.google.common.io.Closeables;

/**
 * @author Tran Nam Quang
 */
abstract class OpenOfficeParser extends FileParser {

    /**
     * Parser variant for OpenOffice Writer files. It is registered for the
     * {@code odt} and {@code ott} file extensions and takes its type label
     * from {@code Msg.filetype_odt}.
     */
    public static final class OpenOfficeWriterParser extends OpenOfficeParser {
        public OpenOfficeWriterParser() {
            super(
                Msg.filetype_odt.get(),
                new String[] { "odt", "ott" }
            );
        }
    }

    /**
     * Parser variant for OpenOffice Calc files. It is registered for the
     * {@code ods} and {@code ots} file extensions and takes its type label
     * from {@code Msg.filetype_ods}.
     */
    public static final class OpenOfficeCalcParser extends OpenOfficeParser {
        public OpenOfficeCalcParser() {
            super(
                Msg.filetype_ods.get(),
                new String[] { "ods", "ots" }
            );
        }
    }

    /**
     * Parser variant for OpenOffice Draw files. It is registered for the
     * {@code odg} and {@code otg} file extensions and takes its type label
     * from {@code Msg.filetype_odg}.
     */
    public static final class OpenOfficeDrawParser extends OpenOfficeParser {
        public OpenOfficeDrawParser() {
            super(
                Msg.filetype_odg.get(),
                new String[] { "odg", "otg" }
            );
        }
    }

    /**
     * Parser variant for OpenOffice Impress files. It is registered for the
     * {@code odp} and {@code otp} file extensions and takes its type label
     * from {@code Msg.filetype_odp}.
     */
    public static final class OpenOfficeImpressParser extends OpenOfficeParser {
        public OpenOfficeImpressParser() {
            super(
                Msg.filetype_odp.get(),
                new String[] { "odp", "otp" }
            );
        }
    }

    // Media types shared by all OpenOffice parser variants; returned by
    // getTypes(). Presumably this denotes the "application/zip" MIME type --
    // confirm against MediaType.Col.
    private static final Collection<String> types = MediaType.Col.application("zip");

    // Label for the handled file type, as supplied by the concrete subclass;
    // returned by getTypeLabel().
    private final String typeLabel;
    // File extensions supplied by the concrete subclass; returned by
    // getExtensions().
    private final Collection<String> extensions;

    /**
     * Creates a parser with the given type label and file extensions. The
     * constructor is private, so only the nested subclasses of this class can
     * call it.
     *
     * @param typeLabel
     *            the label later returned by {@link #getTypeLabel()}
     * @param extensions
     *            the file extensions later returned by
     *            {@link #getExtensions()}; they are kept as a fixed-size list
     *            view of the given array
     */
    private OpenOfficeParser(@NotNull String typeLabel, @NotNull String... extensions) {
        Collection<String> extensionList = Arrays.asList(extensions);
        this.extensions = extensionList;
        this.typeLabel = typeLabel;
    }

    /**
     * Returns the file extensions this parser was constructed with.
     *
     * @return the collection built from the extensions given to the
     *         constructor
     */
    protected final Collection<String> getExtensions() {
        return this.extensions;
    }

    /**
     * Returns the type label this parser was constructed with.
     *
     * @return the label given to the constructor
     */
    public final String getTypeLabel() {
        return this.typeLabel;
    }

    /**
     * Returns the media types handled by this parser. All instances share the
     * same collection, which is created once via
     * {@code MediaType.Col.application("zip")}.
     *
     * @return the shared media type collection
     */
    protected final Collection<String> getTypes() {
        return OpenOfficeParser.types;
    }

    /**
     * Extracts text and metadata from the given OpenOffice file, which is
     * opened as a zip archive.
     * <p>
     * The method reads three kinds of archive entries:
     * <ul>
     * <li>{@code META-INF/manifest.xml}, to detect encrypted documents,</li>
     * <li>{@code meta.xml}, for title, creator, description, subject and
     * keyword,</li>
     * <li>the root {@code content.xml} plus every entry whose name ends with
     * {@code /content.xml}, for the text inside {@code office:body}.</li>
     * </ul>
     * The texts of the individual content entries are concatenated, each
     * followed by a single space. The {@code context} argument is not used by
     * this implementation.
     *
     * @param file
     *            the file to parse
     * @param context
     *            the parse context (unused here)
     * @return the parse result carrying the extracted text, the title, the
     *         author and the remaining metadata values
     * @throws ParseException
     *             if one of the three required entries is missing, if the
     *             manifest contains a {@code manifest:encryption-data} tag, or
     *             if an {@link IOException} occurs while reading the archive
     */
    @Override
    protected final ParseResult parse(@NotNull File file, @NotNull ParseContext context) throws ParseException {
        ZipFile zipFile = null;
        try {
            // Get zip entries
            zipFile = new ZipFile(file);
            ZipEntry manifZipEntry = zipFile.getEntry("META-INF/manifest.xml"); //$NON-NLS-1$
            ZipEntry metaZipEntry = zipFile.getEntry("meta.xml"); //$NON-NLS-1$
            ZipEntry contentZipEntry = zipFile.getEntry("content.xml"); //$NON-NLS-1$
            if (manifZipEntry == null || metaZipEntry == null || contentZipEntry == null)
                throw new ParseException(Msg.file_corrupted.get());

            // Find out if file is password protected
            Source manifSource = loadSource(zipFile, manifZipEntry);
            StartTag encryptTag = manifSource.getNextStartTag(0, "manifest:encryption-data"); //$NON-NLS-1$
            if (encryptTag != null)
                throw new ParseException(Msg.doc_pw_protected.get());

            // Get tags from meta.xml file
            Source metaSource = loadSource(zipFile, metaZipEntry);
            String title = getElementContent(metaSource, "dc:title"); //$NON-NLS-1$
            String author = getElementContent(metaSource, "dc:creator"); //$NON-NLS-1$
            String description = getElementContent(metaSource, "dc:description"); //$NON-NLS-1$
            String subject = getElementContent(metaSource, "dc:subject"); //$NON-NLS-1$
            String keyword = getElementContent(metaSource, "meta:keyword"); //$NON-NLS-1$

            // Get contents from the content.xml entries
            StringBuilder sb = new StringBuilder();
            for (ZipEntry entry : collectContentEntries(zipFile, contentZipEntry)) {
                Source contentSource = loadSource(zipFile, entry);
                Element contentElement = contentSource.getNextElement(0, "office:body"); //$NON-NLS-1$
                if (contentElement == null) // this content.xml file doesn't seem to contain text
                    continue;
                String content = contentElement.getContent().getTextExtractor().toString();
                sb.append(content).append(" "); //$NON-NLS-1$
            }

            // Create and return parse result
            ParseResult parseResult = new ParseResult(sb);
            parseResult.setTitle(title);
            parseResult.addAuthor(author);
            parseResult.addMiscMetadata(description);
            parseResult.addMiscMetadata(subject);
            parseResult.addMiscMetadata(keyword);
            return parseResult;
        } catch (IOException e) {
            throw new ParseException(e);
        } finally {
            // Closing the zip file also releases any entry streams still open
            closeZipFile(zipFile);
        }
    }

    /**
     * Loads the given zip entry into a markup source with logging disabled.
     * The entry's input stream is closed before this method returns, whether
     * or not loading succeeds.
     */
    @NotNull
    private static Source loadSource(@NotNull ZipFile zipFile, @NotNull ZipEntry entry) throws IOException {
        InputStream in = zipFile.getInputStream(entry);
        try {
            Source source = new Source(in);
            source.setLogger(null);
            return source;
        } finally {
            Closeables.closeQuietly(in);
        }
    }

    /**
     * Returns the given root content entry followed by all entries of the zip
     * file whose name ends with {@code /content.xml}, in enumeration order.
     */
    @NotNull
    private static List<ZipEntry> collectContentEntries(@NotNull ZipFile zipFile, @NotNull ZipEntry rootContentEntry) {
        List<ZipEntry> contentEntries = new ArrayList<ZipEntry>();
        contentEntries.add(rootContentEntry);
        Enumeration<? extends ZipEntry> zipEntries = zipFile.entries();
        while (zipEntries.hasMoreElements()) {
            ZipEntry entry = zipEntries.nextElement();
            if (entry.getName().endsWith("/content.xml")) //$NON-NLS-1$
                contentEntries.add(entry);
        }
        return contentEntries;
    }

    /**
     * Closes the given zip file if it is not null, ignoring any
     * {@link IOException} thrown while closing.
     *
     * @param zipFile
     *            the zip file to close, or null to do nothing
     */
    @SuppressAjWarnings
    private static void closeZipFile(@Nullable ZipFile zipFile) {
        // Closeables.closeQuietly cannot be used here: on Mac OS X, ZipFile
        // does not implement the Closeable interface.
        if (zipFile != null) {
            try {
                zipFile.close();
            } catch (IOException ignored) {
                // Best-effort close; a failure here is deliberately ignored.
            }
        }
    }

    /**
     * Looks up the next element with the given name in the given markup
     * source, searching from position 0, and returns its content with
     * character references decoded.
     *
     * @param source
     *            the markup source to search
     * @param elementName
     *            the name of the element to look for
     * @return the decoded element content, or null if no such element is
     *         found
     */
    @Nullable
    private String getElementContent(@NotNull Source source, @NotNull String elementName) {
        Element match = source.getNextElement(0, elementName);
        if (match == null) {
            return null;
        }
        return CharacterReference.decode(match.getContent());
    }

    /**
     * Reads the root {@code content.xml} entry of the given file through an
     * {@code OpenDocumentTextInputStream} and returns everything that stream
     * yields, decoded as UTF-8. Entries other than the root
     * {@code content.xml} are not read, and the {@code context} argument is
     * not used by this implementation.
     *
     * @param file
     *            the file to read, opened as a zip archive
     * @param context
     *            the parse context (unused here)
     * @return the characters read from the wrapped content stream
     * @throws ParseException
     *             if the archive has no {@code content.xml} entry or if an
     *             {@link IOException} occurs while reading
     */
    protected final String renderText(File file, ParseContext context) throws ParseException {
        ZipFile archive = null;
        Reader textReader = null;
        try {
            archive = new ZipFile(file);
            ZipEntry mainContent = archive.getEntry("content.xml"); //$NON-NLS-1$
            if (mainContent == null) {
                throw new ParseException(Msg.file_corrupted.get());
            }

            InputStream rawContent = archive.getInputStream(mainContent);
            InputStream textContent = new OpenDocumentTextInputStream(rawContent);
            InputStreamReader decoder = new InputStreamReader(textContent, "utf8");
            textReader = new BufferedReader(decoder);

            return CharStreams.toString(textReader);
        } catch (IOException e) {
            throw new ParseException(e);
        } finally {
            // Close the reader first, then the archive it reads from
            Closeables.closeQuietly(textReader);
            closeZipFile(archive);
        }
    }

}