com.mothsoft.alexis.engine.textual.WebContentParserImpl.java Source code

Introduction

Here is the source code for com.mothsoft.alexis.engine.textual.WebContentParserImpl.java
Source

/*   Copyright 2012 Tim Garrett, Mothsoft LLC
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package com.mothsoft.alexis.engine.textual;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;

import org.apache.commons.io.input.ReaderInputStream;
import org.apache.commons.lang.StringUtils;
import org.apache.tika.detect.DefaultDetector;
import org.apache.tika.detect.Detector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.html.BoilerpipeContentHandler;
import org.apache.tika.parser.html.HtmlParser;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

import de.l3s.boilerpipe.extractors.ArticleExtractor;
import de.l3s.boilerpipe.extractors.KeepEverythingExtractor;

public class WebContentParserImpl implements WebContentParser {

    private org.apache.tika.parser.AutoDetectParser autoDetectParser;
    private Detector detector;

    private static final Set<MediaType> HTML_TYPES = Collections.unmodifiableSet(
            new HashSet<MediaType>(Arrays.asList(MediaType.text("html"), MediaType.application("xhtml+xml"),
                    MediaType.application("vnd.wap.xhtml+xml"), MediaType.application("x-asp"))));

    public WebContentParserImpl() {
        this.autoDetectParser = new AutoDetectParser();
        this.detector = new DefaultDetector();
    }

    public String parse(final InputStream is) throws IOException {
        final InputStream bufferedStream = buffered(is);

        final StringBuffer buffer = new StringBuffer();
        final org.apache.tika.mime.MediaType mediaType = this.detector.detect(bufferedStream, new Metadata());

        final ContentHandler handler;
        if (HTML_TYPES.contains(mediaType)) {
            // if coming in as a stream and HTML, likely part of a larger
            // document (web page), we would like to do article extraction
            // FIXME - smarter handler?
            handler = new BoilerpipeContentHandler(new FullTextContentHandler(buffer), ArticleExtractor.INSTANCE);
        } else {
            // assuming full documents like Word or PDF are more about a single
            // topic
            handler = new FullTextContentHandler(buffer);
        }

        return parse(this.autoDetectParser, bufferedStream, handler, buffer);
    }

    private BufferedInputStream buffered(InputStream is) {
        return new BufferedInputStream(is, 1024 * 16);
    }

    public String parseHTML(final String string) throws IOException {
        final StringBuffer buffer = new StringBuffer();
        final HtmlParser htmlParser = new HtmlParser();
        final BoilerpipeContentHandler handler = new BoilerpipeContentHandler(new FullTextContentHandler(buffer),
                KeepEverythingExtractor.INSTANCE);
        return parse(htmlParser, new ReaderInputStream(new StringReader(string)), handler, buffer);
    }

    private String parse(org.apache.tika.parser.Parser parser, InputStream is, ContentHandler handler,
            StringBuffer buffer) throws IOException {
        final Metadata metadata = new Metadata();
        final ParseContext context = new ParseContext();

        try {
            parser.parse(is, handler, metadata, context);
            return StringUtils.trimToEmpty(buffer.toString());
        } catch (SAXException e) {
            throw new IOException(e.getLocalizedMessage());
        } catch (TikaException e) {
            throw new IOException(e.getLocalizedMessage());
        }
    }

    private class FullTextContentHandler extends DefaultHandler {
        private StringBuffer buffer;
        private boolean lastWasWhitespace = false;

        FullTextContentHandler(final StringBuffer buffer) {
            this.buffer = buffer;
        }

        @Override
        public void characters(char[] chars, int start, int length) throws SAXException {
            buffer.append(chars, start, length);
            lastWasWhitespace = false;
        }

        @Override
        public void ignorableWhitespace(char[] arg0, int arg1, int arg2) throws SAXException {
            if (!lastWasWhitespace) {
                buffer.append(" ");
                lastWasWhitespace = true;
            }
        }

    }
}