// Java tutorial
/*   Copyright 2012 Tim Garrett, Mothsoft LLC
 *
 *   Licensed under the Apache License, Version 2.0 (the "License");
 *   you may not use this file except in compliance with the License.
 *   You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 *   Unless required by applicable law or agreed to in writing, software
 *   distributed under the License is distributed on an "AS IS" BASIS,
 *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *   See the License for the specific language governing permissions and
 *   limitations under the License.
 */
package com.mothsoft.alexis.engine.textual;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;

import org.apache.commons.io.input.ReaderInputStream;
import org.apache.commons.lang.StringUtils;
import org.apache.tika.detect.DefaultDetector;
import org.apache.tika.detect.Detector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.html.BoilerpipeContentHandler;
import org.apache.tika.parser.html.HtmlParser;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

import de.l3s.boilerpipe.extractors.ArticleExtractor;
import de.l3s.boilerpipe.extractors.KeepEverythingExtractor;

/**
 * {@link WebContentParser} implementation that turns a document into plain
 * text by running a Tika parser over it and collecting the character events
 * it emits.
 *
 * <p>
 * HTML-like input is routed through a {@link BoilerpipeContentHandler};
 * everything else is captured verbatim.
 */
public class WebContentParserImpl implements WebContentParser {

    /** Media types that are handled as HTML, i.e. run through article extraction. */
    private static final Set<MediaType> HTML_TYPES = Collections.unmodifiableSet(
            new HashSet<MediaType>(Arrays.asList(
                    MediaType.text("html"),
                    MediaType.application("xhtml+xml"),
                    MediaType.application("vnd.wap.xhtml+xml"),
                    MediaType.application("x-asp"))));

    /** Size, in bytes, of the buffer wrapped around incoming streams. */
    private static final int BUFFER_SIZE = 1024 * 16;

    private final AutoDetectParser autoDetectParser;

    private final Detector detector;

    /**
     * Creates a parser backed by a fresh {@link AutoDetectParser} and
     * {@link DefaultDetector}.
     */
    public WebContentParserImpl() {
        this.autoDetectParser = new AutoDetectParser();
        this.detector = new DefaultDetector();
    }

    /**
     * Detects the media type of the stream and extracts its text.
     *
     * <p>
     * The stream is wrapped in a buffer but is not closed here.
     *
     * @param is the document to parse
     * @return the extracted text, trimmed; empty when nothing was extracted
     * @throws IOException if reading fails, or if the underlying parser
     *             reports a {@link SAXException} or {@link TikaException}
     *             (attached as the cause)
     */
    public String parse(final InputStream is) throws IOException {
        // NOTE(review): presumably buffered so that type detection can inspect
        // the leading bytes and rewind before the real parse - confirm against
        // the Detector contract.
        final InputStream bufferedStream = buffered(is);
        final StringBuilder buffer = new StringBuilder();

        final MediaType mediaType = this.detector.detect(bufferedStream, new Metadata());

        final ContentHandler handler;

        if (HTML_TYPES.contains(mediaType)) {
            // if coming in as a stream and HTML, likely part of a larger
            // document (web page), we would like to do article extraction
            // FIXME - smarter handler?
            handler = new BoilerpipeContentHandler(new FullTextContentHandler(buffer), ArticleExtractor.INSTANCE);
        } else {
            // assuming full documents like Word or PDF are more about a single
            // topic
            handler = new FullTextContentHandler(buffer);
        }

        return parse(this.autoDetectParser, bufferedStream, handler, buffer);
    }

    /**
     * Wraps the stream in a {@link BufferedInputStream} of {@link #BUFFER_SIZE} bytes.
     */
    private BufferedInputStream buffered(InputStream is) {
        return new BufferedInputStream(is, BUFFER_SIZE);
    }

    /**
     * Extracts text from an HTML string, keeping all of its content
     * ({@link KeepEverythingExtractor}) rather than only the main article.
     *
     * @param string the HTML markup
     * @return the extracted text, trimmed; empty when nothing was extracted
     * @throws IOException if reading fails, or if the underlying parser
     *             reports a {@link SAXException} or {@link TikaException}
     *             (attached as the cause)
     */
    public String parseHTML(final String string) throws IOException {
        final StringBuilder buffer = new StringBuilder();
        final HtmlParser htmlParser = new HtmlParser();
        final BoilerpipeContentHandler handler = new BoilerpipeContentHandler(new FullTextContentHandler(buffer),
                KeepEverythingExtractor.INSTANCE);
        // NOTE(review): this ReaderInputStream constructor takes no charset, so
        // the bytes handed to the parser presumably depend on the platform
        // default encoding - confirm whether an explicit charset is wanted.
        return parse(htmlParser, new ReaderInputStream(new StringReader(string)), handler, buffer);
    }

    /**
     * Runs {@code parser} over {@code is} with fresh metadata and context and
     * returns whatever {@code handler} accumulated in {@code buffer}.
     *
     * @return the trimmed buffer contents
     * @throws IOException if reading fails or the parser reports a
     *             {@link SAXException} or {@link TikaException}
     */
    private String parse(org.apache.tika.parser.Parser parser, InputStream is, ContentHandler handler,
            StringBuilder buffer) throws IOException {
        final Metadata metadata = new Metadata();
        final ParseContext context = new ParseContext();

        try {
            parser.parse(is, handler, metadata, context);
            return StringUtils.trimToEmpty(buffer.toString());
        } catch (SAXException e) {
            throw asIOException(e);
        } catch (TikaException e) {
            throw asIOException(e);
        }
    }

    /**
     * Converts a parser failure into an {@link IOException} carrying the same
     * localized message, keeping the original exception as the cause so its
     * stack trace is not lost.
     */
    private static IOException asIOException(final Exception cause) {
        final IOException wrapped = new IOException(cause.getLocalizedMessage());
        // initCause rather than the (String, Throwable) constructor, which
        // only exists from Java 6 onwards.
        wrapped.initCause(cause);
        return wrapped;
    }

    /**
     * SAX handler that appends every character event to a shared buffer and
     * collapses each run of ignorable whitespace into a single space.
     */
    private static final class FullTextContentHandler extends DefaultHandler {

        private final StringBuilder buffer;

        /** True while the most recent event appended the whitespace separator. */
        private boolean lastWasWhitespace = false;

        FullTextContentHandler(final StringBuilder buffer) {
            this.buffer = buffer;
        }

        @Override
        public void characters(char[] chars, int start, int length) throws SAXException {
            buffer.append(chars, start, length);
            lastWasWhitespace = false;
        }

        @Override
        public void ignorableWhitespace(char[] arg0, int arg1, int arg2) throws SAXException {
            // The whitespace itself is discarded; a single space stands in for
            // the whole consecutive run.
            if (!lastWasWhitespace) {
                buffer.append(" ");
                lastWasWhitespace = true;
            }
        }
    }
}