app.data.parse.WebPageUtil.java Source code

Java tutorial

Introduction

Here is the source code for app.data.parse.WebPageUtil.java

Source

/*
 * Copyright 2016 TomeOkin
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *     http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package app.data.parse;

import app.data.model.WebPageInfo;
import com.google.common.cache.Cache;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.web.util.UriComponentsBuilder;

import javax.validation.constraints.NotNull;
import java.io.IOException;

/**
 * Static helpers that download a web page and extract a cleaned-up URL, a title and a
 * short description from it.
 */
public class WebPageUtil {
    private static final Logger logger = LoggerFactory.getLogger(WebPageUtil.class);

    // User-agent strings taken from http://www.atool.org/useragent.php
    /** User agent sent with every request issued by {@link #requestUrl(String)}. */
    public static final String GOOGLE_USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36";
    /**
     * Alternative user agent; not used inside this class.
     * NOTE(review): despite the name, the value carries Chrome and OPR tokens rather than a
     * Firefox one. The name is kept because the constant is public.
     */
    public static final String FIREFOX_USER_AGENT = "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.2357.125 Safari/537.36 OPR/30.0.1835.88";

    /** Connect/read timeout handed to jsoup, in milliseconds. */
    private static final int REQUEST_TIMEOUT_MILLIS = 20000;
    /** Maximum length of the description stored in the result. */
    private static final int DESCRIPTION_LIMIT = 140;
    /** A paragraph at least this long is accepted as the description fallback. */
    private static final int MIN_PARAGRAPH_LENGTH = 20;
    /** Query parameters stripped by {@link #smartUri(String)}. */
    private static final String[] TRACKING_PARAMS = {
        "utm_source", "utm_medium", "utm_campaign", "utm_term", "utm_content", "hmsr"
    };

    /**
     * Downloads {@code url} and extracts its final URL, title and description.
     *
     * <p>Title lookup order: {@code og:title}, {@code twitter:title}, then the document
     * title. Description lookup order: {@code og:description}, {@code twitter:description},
     * {@code meta[name=description]}, then the text of the page's paragraphs. The description
     * is shortened to at most 140 characters.
     *
     * @param url          address of the page to inspect; must not be {@code null}
     * @param urlInfoCache optional cache keyed by the (rewritten) request URL; may be
     *                     {@code null}, in which case nothing is cached
     * @return the cached entry if one exists, otherwise a freshly populated instance
     * @throws IOException if a page cannot be fetched
     */
    public static WebPageInfo parse(String url, Cache<String, WebPageInfo> urlInfoCache) throws IOException {
        String original = url;

        // hit toutiao.io
        // fixme http://toutiao.io/shares/640539/url
        if (original.startsWith("https://toutiao.io/posts/")) {
            original = original.replace("/posts/", "/k/");
        }

        // check cache
        if (urlInfoCache != null) {
            WebPageInfo cached = urlInfoCache.getIfPresent(original);
            if (cached != null) {
                return cached;
            }
        }

        WebPageInfo info = new WebPageInfo();

        // Fetch the page and take over the URL the document reports for itself.
        Document doc = requestUrl(original);
        info.url = doc.baseUri(); // or doc.location()

        // hit gold.xitu.io: the entry page only links to the real article, so follow that link.
        if (info.url.startsWith("http://gold.xitu.io/entry/")) {
            String shared = doc.select("div[class=ellipsis]").select("a[class=share-link]").attr("href");
            if (StringUtils.isEmpty(shared)) {
                // Without a share link there is nothing to follow; requesting "" cannot succeed,
                // so keep describing the entry page itself.
                logger.warn("no share link found on {}, keeping the entry page", info.url);
            } else {
                doc = requestUrl(shared);
                info.url = doc.baseUri(); // or doc.location()
            }
        }

        info.url = smartUri(info.url);
        info.title = extractTitle(doc);
        info.description = ellipsis(extractDescription(doc), DESCRIPTION_LIMIT, "...");

        // cache info
        if (urlInfoCache != null) {
            urlInfoCache.put(original, info);
        }
        return info;
    }

    /** Returns the first non-empty social-card title, falling back to the document title. */
    private static String extractTitle(Document doc) {
        String title = firstMetaContent(doc,
                "meta[property=og:title]",
                "meta[property=twitter:title]",
                "meta[name=twitter:title]");
        return StringUtils.isEmpty(title) ? doc.title() : title;
    }

    /** Returns the first non-empty meta description, falling back to paragraph text. */
    private static String extractDescription(Document doc) {
        String description = firstMetaContent(doc,
                "meta[property=og:description]",
                "meta[property=twitter:description]",
                "meta[name=twitter:description]",
                "meta[name=description]");
        if (!StringUtils.isEmpty(description)) {
            return description;
        }

        Element body = doc.body();
        if (body == null) {
            return description;
        }
        // Take the first paragraph that is long enough; if none qualifies, the text of the
        // last paragraph visited is kept.
        for (Element paragraph : body.select("p")) {
            description = paragraph.text();
            if (description != null && description.length() >= MIN_PARAGRAPH_LENGTH) {
                break;
            }
        }
        return description;
    }

    /**
     * Evaluates the selectors in order and returns the first non-empty {@code content}
     * attribute, or the empty string when none of them yields one.
     */
    private static String firstMetaContent(Document doc, String... selectors) {
        for (String selector : selectors) {
            String content = doc.select(selector).attr("content");
            if (!StringUtils.isEmpty(content)) {
                return content;
            }
        }
        return "";
    }

    /**
     * Issues a GET request for {@code url} using {@link #GOOGLE_USER_AGENT}.
     *
     * @throws IOException if the request fails
     */
    private static Document requestUrl(String url) throws IOException {
        // SECURITY NOTE(review): validateTLSCertificates(false) turns off certificate checks,
        // so HTTPS responses are not authenticated. Left as-is because removing it changes
        // which sites can be fetched; confirm whether this is still wanted.
        return Jsoup.connect(url)
                .userAgent(GOOGLE_USER_AGENT)
                .timeout(REQUEST_TIMEOUT_MILLIS)
                .validateTLSCertificates(false)
                .get();
    }

    /**
     * Returns {@code old} with the {@code utm_source}, {@code utm_medium},
     * {@code utm_campaign}, {@code utm_term}, {@code utm_content} and {@code hmsr} query
     * parameters removed; all other parts of the URI are kept.
     *
     * @param old the URI to clean
     * @return the URI string without the parameters listed above
     */
    public static String smartUri(String old) {
        UriComponentsBuilder builder = UriComponentsBuilder.fromUriString(old);
        for (String param : TRACKING_PARAMS) {
            // replaceQueryParam with no values removes the parameter.
            builder.replaceQueryParam(param);
        }
        return builder.build().toUriString();
    }

    /**
     * Shortens {@code text} to at most {@code limit} UTF-16 code units, ending the shortened
     * text with {@code append}.
     *
     * <p>The cut is always made on a code point boundary, so a surrogate pair is never split.
     * If {@code append} itself is longer than {@code limit}, it is omitted and the text is
     * simply cut to {@code limit}.
     *
     * @param text   the text to shorten; {@code null} is returned unchanged
     * @param limit  maximum length of the result; values below zero are treated as zero
     * @param append suffix marking the truncation, e.g. {@code "..."}; must not be {@code null}
     * @return {@code text} itself when it already fits, otherwise the shortened text
     */
    public static String ellipsis(String text, int limit, @NotNull String append) {
        if (text == null || text.length() <= limit) {
            return text;
        }

        // A suffix that does not fit would push the result past the limit, so leave it out.
        final String suffix = append.length() <= limit ? append : "";
        final int space = Math.max(limit, 0) - suffix.length();

        // Advance one code point at a time while the next boundary still fits into the space.
        int end = 0;
        while (end < space) {
            int next = end + Character.charCount(text.codePointAt(end));
            if (next > space) {
                break;
            }
            end = next;
        }
        return text.substring(0, end) + suffix;
    }
}