net.sourceforge.vaticanfetcher.model.parse.ChmParser.java Source code

Introduction

Here is the source code for net.sourceforge.vaticanfetcher.model.parse.ChmParser.java
Source

/*******************************************************************************
 * Copyright (c) 2011 Tran Nam Quang.
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Eclipse Public License v1.0
 * which accompanies this distribution, and is available at
 * http://www.eclipse.org/legal/epl-v10.html
 *
 * Contributors:
 *    Tran Nam Quang - initial API and implementation
 *******************************************************************************/

package net.sourceforge.vaticanfetcher.model.parse;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Collection;
import java.util.Collections;

import net.htmlparser.jericho.Source;
import net.sourceforge.vaticanfetcher.enums.Msg;
import net.sourceforge.vaticanfetcher.util.annotations.NotNull;

import org.chm4j.ChmEntry;
import org.chm4j.ChmFile;

import com.google.common.io.Closeables;

/**
 * Parser for CHM files. The entries of a CHM file are read through chm4j, and every entry that looks like
 * an HTML document is converted to text with the Jericho HTML library; all other entries are skipped.
 */
final class ChmParser extends FileParser {

    /*
     * TODO post-release-1.1: replace this with Tika's CHM parser. Then:
     * - remove chm4j dependency
     * - enable CHM parser on Mac
     * - throw runtime exception when ParseService.findParser can't find a suitable parser
     */

    /** The only file extension this parser declares. */
    private static final Collection<String> extensions = Collections.singleton("chm");

    /**
     * Parses the given CHM file and wraps the text extracted from its HTML entries (raw, non-rendered form)
     * in a <tt>ParseResult</tt>.
     */
    @Override
    protected ParseResult parse(File file, ParseContext context) throws ParseException {
        String extractedText = renderText(file, false);
        return new ParseResult(extractedText);
    }

    /**
     * Returns the text of the given CHM file in rendered (readable) form. The <tt>filename</tt> argument is not used.
     */
    @Override
    protected String renderText(File file, String filename) throws ParseException {
        return renderText(file, true);
    }

    /**
     * Collects the text of all entries of the given CHM file into a single string.
     * 
     * @param renderText
     *            Whether the entries should be converted to a readable format (true) or to raw strings (false).
     * @throws ParseException
     *             if an <tt>IOException</tt> occurs while reading the CHM file; the exception is wrapped.
     */
    @NotNull
    private String renderText(@NotNull File file, boolean renderText) throws ParseException {
        final StringBuilder text = new StringBuilder();
        try {
            for (ChmEntry chmEntry : new ChmFile(file).entries(ChmEntry.Attribute.ALL)) {
                append(text, chmEntry, renderText);
            }
        } catch (IOException e) {
            throw new ParseException(e);
        }
        return text.toString();
    }

    /**
     * Appends the text of <tt>entry</tt> to the given <tt>StringBuilder</tt>. Directory entries are
     * descended into recursively; any other entry is read as text and appended only if it looks like HTML.
     * 
     * @param renderText
     *            Whether the textual contents of the <tt>ChmEntry</tt>s
     *            should be extracted in a readable format (true) or as raw strings (false).
     */
    private void append(@NotNull StringBuilder sb, @NotNull ChmEntry entry, boolean renderText) throws IOException {
        // Directories carry no content of their own; just walk their children.
        if (entry.hasAttribute(ChmEntry.Attribute.DIRECTORY)) {
            for (ChmEntry child : entry.entries(ChmEntry.Attribute.ALL)) {
                append(sb, child, renderText);
            }
            return;
        }

        BufferedReader reader = null;
        try {
            // The encoding is just a guess.
            reader = new BufferedReader(new InputStreamReader(entry.getInputStream(), "utf8")); //$NON-NLS-1$

            // Read the whole entry, terminating every line with two line breaks.
            StringBuilder entryText = new StringBuilder();
            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                entryText.append(line);
                entryText.append("\n\n"); //$NON-NLS-1$
            }

            /*
             * The current version of chm4j offers no way of telling binary files (such as images) apart
             * from HTML files, so the text is scanned for HTML tags and everything else is dropped.
             */
            if (!isHTML(entryText)) {
                return;
            }

            Source source = new Source(entryText);
            source.setLogger(null);
            String converted = renderText
                    ? source.getRenderer().setIncludeHyperlinkURLs(false).toString()
                    : source.getTextExtractor().toString();
            sb.append(converted);
        } catch (RuntimeException ignored) {
            // The HTML lib can throw these at us; deliberately skip the entry.
        } finally {
            Closeables.closeQuietly(reader);
        }
    }

    /**
     * Returns true if the given StringBuilder appears to contain HTML, i.e. if an html start tag occurs in it
     * and an html end tag occurs somewhere after that start tag. Both tags are matched case-insensitively.
     * <p>
     * Note: The input is scanned by hand rather than with regular expressions because the latter can crash
     * the program with a StackOverflowError, as seen in bug report #2948903.
     */
    private static boolean isHTML(@NotNull StringBuilder input) {
        boolean startTagSeen = false;
        final int length = input.length();
        for (int pos = 0; pos < length; pos++) {
            char c = input.charAt(pos);
            if (!startTagSeen) {
                /*
                 * Look for <html rather than <html>, because the start tag may carry
                 * additional attributes, e.g. <html attr="value">.
                 */
                boolean lastCharOfHtml = c == 'l' || c == 'L';
                if (lastCharOfHtml && tokenEndsAt(input, pos, "<html")) {
                    startTagSeen = true;
                }
            } else if (c == '>' && tokenEndsAt(input, pos, "</html>")) {
                return true;
            }
        }
        return false;
    }

    /**
     * Returns true if the characters of <tt>text</tt> ending at index <tt>lastIndex</tt> (inclusive) equal
     * <tt>lowerCaseToken</tt> after being converted to lower case. Returns false if there are not enough
     * characters before <tt>lastIndex</tt> to hold the token.
     */
    private static boolean tokenEndsAt(@NotNull StringBuilder text, int lastIndex, @NotNull String lowerCaseToken) {
        int firstIndex = lastIndex - lowerCaseToken.length() + 1;
        if (firstIndex < 0) {
            return false;
        }
        String candidate = text.substring(firstIndex, lastIndex + 1);
        return candidate.toLowerCase().equals(lowerCaseToken);
    }

    /** Returns the file extensions declared by this parser, i.e. the single extension "chm". */
    protected Collection<String> getExtensions() {
        return extensions;
    }

    /**
     * Returns an empty list, since the mime-util library doesn't seem to be able to detect CHM files.
     */
    protected Collection<String> getTypes() {
        return Collections.emptyList();
    }

    /** Returns the label for the CHM file type, taken from <tt>Msg.filetype_chm</tt>. */
    public String getTypeLabel() {
        return Msg.filetype_chm.get();
    }

}