com.jejking.hh.nord.corpus.AllrisHtmlToRawDrucksache.java Source code

Introduction

Here is the source code for com.jejking.hh.nord.corpus.AllrisHtmlToRawDrucksache.java
Source

/* 
 *  Hamburg-Nord Geocoder, by John King.
 *  Copyright (C) 2014,  John King
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>
 *
 *    
 */
package com.jejking.hh.nord.corpus;

import java.io.BufferedInputStream;

import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;

import org.apache.commons.codec.Charsets;
import org.apache.commons.codec.DecoderException;
import org.apache.commons.codec.binary.Hex;
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
import org.joda.time.LocalDate;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import rx.functions.Func1;

import com.google.common.base.Optional;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;

import static com.google.common.base.Preconditions.checkNotNull;

/**
 * Maps a gzipped HTML file to an initial {@link RawDrucksache} representation we can work with.
 *
 * <p>The file is expected to have been downloaded from a URL beforehand and to be named
 * with the hex-encoded form of that URL followed by {@code .gz}. The original URL is
 * recovered from the file name and used to look up an optional date in the map supplied
 * at construction time.
 *
 * @author jejking
 *
 */
public class AllrisHtmlToRawDrucksache implements Func1<File, Optional<RawDrucksache>> {

    /** Base URI handed to JSoup when parsing the document. */
    private static final String BASE_URI =
            "http://ratsinformation.web.hamburg.de:85/bi/vo040.asp?showall=true";

    /** Suffix carried by the file names, after the hex-encoded URL. */
    private static final String GZIP_SUFFIX = ".gz";

    /** CSS class of the table cells that carry property keys. */
    private static final String KEY_CELL_CLASS = "kb1";

    /** Length of this prefix is cut off the heading text to obtain the id. */
    private static final String DRUCKSACHE_PREFIX = "Drucksache - ";

    private final ImmutableMap<URL, Optional<LocalDate>> urlToDateMap;

    /**
     * Creates a new mapper.
     *
     * @param urlToDateMap lookup from original URL to optional date, must not be {@code null}
     * @throws NullPointerException if the map is {@code null}
     */
    public AllrisHtmlToRawDrucksache(ImmutableMap<URL, Optional<LocalDate>> urlToDateMap) {
        this.urlToDateMap = checkNotNull(urlToDateMap);
    }

    /**
     * Reads and parses the given gzipped HTML file.
     *
     * @param file gzipped HTML file whose name is the hex-encoded original URL plus {@code .gz}
     * @return the extracted data, or {@link Optional#absent()} if anything at all goes wrong
     *         while reading or parsing (the failure is reported on standard error)
     */
    @Override
    public Optional<RawDrucksache> call(File file) {
        try (InputStream inputStream = new GzipCompressorInputStream(
                new BufferedInputStream(new FileInputStream(file)))) {

            URL originalUrl = originalUrlFromFileName(file);

            // null charset: JSoup determines the encoding from the document itself
            Document htmlDoc = Jsoup.parse(inputStream, null, BASE_URI);

            String druckSacheId = druckSacheId(htmlDoc);
            ImmutableMap<String, String> props = druckSachenProperties(htmlDoc);
            ImmutableList<String> contents = druckSachenContents(htmlDoc);

            // Map.get yields null for an unknown URL; never hand a null Optional onwards.
            Optional<LocalDate> optionalDate = this.urlToDateMap.get(originalUrl);
            if (optionalDate == null) {
                optionalDate = Optional.absent();
            }

            return Optional.of(
                    new RawDrucksache(druckSacheId, originalUrl, optionalDate, props, contents));

        } catch (Exception e) {
            // Deliberately best-effort: a single broken file must not abort the whole run.
            System.err.println("Could not map file to RawDrucksache: " + file);
            e.printStackTrace();
            return Optional.absent();
        }
    }

    /**
     * Recovers the original URL from the hex-encoded file name.
     *
     * @param file file named with the hex-encoded URL followed by {@code .gz}
     * @return the decoded URL
     * @throws IllegalArgumentException if the file name does not end in {@code .gz}
     * @throws DecoderException if the name (minus suffix) is not valid hex
     * @throws MalformedURLException if the decoded string is not a valid URL
     */
    private URL originalUrlFromFileName(File file) throws MalformedURLException, DecoderException {
        String fileName = file.getName();
        if (!fileName.endsWith(GZIP_SUFFIX)) {
            throw new IllegalArgumentException(
                    "File name does not end with " + GZIP_SUFFIX + ": " + fileName);
        }
        String hexName = fileName.substring(0, fileName.length() - GZIP_SUFFIX.length());
        return new URL(new String(Hex.decodeHex(hexName.toCharArray()), Charsets.UTF_8));
    }

    /**
     * Extracts the text of the "RTF"-like content fragments embedded in the page.
     *
     * @param htmlDoc parsed document
     * @return one entry per fragment that has non-blank text, in document order
     */
    private ImmutableList<String> druckSachenContents(Document htmlDoc) {
        /*
         * In this way we can identify the bits of "RTF" like text inserted into the overall HTML.
         * JSoup cleans up the broken HTML removing the xml declaration and inserted html roots
         * that ALLRIS manages to put in.
         */
        Elements contentMetaElements = htmlDoc.getElementsByAttributeValue("name", "generator");
        ImmutableList.Builder<String> listBuilder = ImmutableList.builder();

        /*
         * Iterate over our candidates. Sometimes there are several.
         */
        for (Element contentMetaElement : contentMetaElements) {
            StringBuilder contentAsTextBuilder = new StringBuilder();
            Element nextSibling = contentMetaElement.nextElementSibling();

            /*
             * In the cleaned up HTML DOM returned by JSoup the "RTF" content is
             * rendered as siblings of the meta node (JSoup having removed the html, head, body
             * elements which should never have been there in the first place).
             *
             * Stop at the next meta element so that each fragment only collects its own
             * content. The tag *name* has to be compared here: a Tag object never equals
             * a String.
             */
            while (nextSibling != null && !"meta".equals(nextSibling.tagName())) {
                contentAsTextBuilder.append(nextSibling.text());
                nextSibling = nextSibling.nextElementSibling();
            }

            /*
             * Only carry over non-empty content.
             */
            String contentAsText = contentAsTextBuilder.toString();
            if (!removeNonBreakingSpacesAndTrim(contentAsText).isEmpty()) {
                listBuilder.add(contentAsText);
            }
        }

        return listBuilder.build();
    }

    /**
     * Extracts key/value properties from the page. Keys are the texts of elements with
     * CSS class {@code kb1} (minus a trailing colon), values the text of the element
     * following each key element. Pairs with an empty key or value are dropped.
     *
     * @param htmlDoc parsed document
     * @return the properties found
     * @throws IllegalArgumentException if the same key occurs more than once, as
     *         {@link ImmutableMap.Builder#build()} rejects duplicate keys
     */
    private ImmutableMap<String, String> druckSachenProperties(Document htmlDoc) {

        ImmutableMap.Builder<String, String> mapBuilder = ImmutableMap.builder();
        Elements keyElements = htmlDoc.getElementsByClass(KEY_CELL_CLASS); // td elements
        for (Element keyElement : keyElements) {
            String key = removeNonBreakingSpacesAndTrim(keyElement.text());
            if (key.endsWith(":")) {
                key = key.substring(0, key.length() - 1);
            }

            Element valueElement = keyElement.nextElementSibling();
            // "kb1" is a CSS class, not an attribute: a following key cell is not a value.
            if (valueElement == null || valueElement.hasClass(KEY_CELL_CLASS)) {
                continue;
            }

            String value = removeNonBreakingSpacesAndTrim(valueElement.text());
            if (!key.isEmpty() && !value.isEmpty()) {
                mapBuilder.put(key, value);
            }
        }
        return mapBuilder.build();
    }

    /**
     * Replaces non-breaking spaces with ordinary spaces and trims the result.
     *
     * @param text text to clean
     * @return cleaned text
     */
    private String removeNonBreakingSpacesAndTrim(String text) {
        // the unicode character for non-breaking space...
        return text.replace('\u00A0', ' ').trim();
    }

    /**
     * Reads the id from the first {@code h1} directly below the element with id
     * {@code risname}.
     *
     * @param htmlDoc parsed document
     * @return the heading text with the leading "Drucksache - " portion cut off
     * @throws IllegalStateException if the heading is missing or too short to carry an id
     */
    private String druckSacheId(Document htmlDoc) {
        Element druckSacheIdElement = htmlDoc.select("#risname > h1").first();
        if (druckSacheIdElement == null) {
            throw new IllegalStateException("No '#risname > h1' element found in document");
        }
        String elementText = druckSacheIdElement.text();
        if (elementText.length() < DRUCKSACHE_PREFIX.length()) {
            throw new IllegalStateException("Heading too short to contain an id: '" + elementText + "'");
        }
        /*
         * Cut by length rather than by startsWith: the heading may contain non-breaking
         * spaces where the prefix constant has ordinary ones.
         */
        return removeNonBreakingSpacesAndTrim(elementText.substring(DRUCKSACHE_PREFIX.length()));
    }

}