scrape.Scraper.java Source code

Java tutorial

Introduction

Here is the source code for scrape.Scraper.java

Source

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package scrape;

import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.HashSet;
import org.apache.commons.validator.routines.UrlValidator;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Crawls pages starting from a given URL, following links that stay on the
 * same host as a base URL, and records for every visited page the set of
 * assets it references.
 *
 * <p>State (pending URLs, visited URLs and the accumulated JSON result) lives
 * in the instance and is never reset, so successive calls to
 * {@link #request(String, String)} on one instance keep appending to the same
 * result. The class performs no synchronization.
 *
 * @author abdulhakim
 */
public class Scraper {

    /** URLs that were discovered but have not been fetched yet. */
    private final HashSet<String> newUrls;
    /** URLs that have already been fetched successfully. */
    private final HashSet<String> scrapedUrls;
    /** One {@code {"url": ..., "assets": [...]}} object per fetched page. */
    private final JSONArray json;
    /** Validator (default configuration) applied to every link and asset URL. */
    private final UrlValidator urlVal;

    /** Creates a scraper with empty crawl state. */
    public Scraper() {
        newUrls = new HashSet<>();
        scrapedUrls = new HashSet<>();
        json = new JSONArray();
        urlVal = new UrlValidator();
    }

    /**
     * Fetches {@code url} and then keeps fetching pending same-host links
     * until none are left.
     *
     * <p>The crawl is driven by a loop rather than by recursion, so the call
     * stack stays flat no matter how many pages the site has.
     *
     * @param url     the first page to fetch
     * @param baseUrl the URL whose host decides which links are followed
     * @return the scraper's accumulated result array (the same instance on
     *         every call), one object per fetched page with the keys
     *         {@code "url"} and {@code "assets"}
     * @throws IOException   if a page cannot be fetched, or if a URL handed to
     *                       {@link #checkSubOrCrossDomain(String, String)} is
     *                       malformed; the crawl stops at that point and the
     *                       failing URL is not marked as scraped
     * @throws JSONException if a result object cannot be built
     */
    public JSONArray request(String url, String baseUrl) throws IOException, JSONException {
        String current = url;
        while (current != null) {
            scrapePage(current, baseUrl);
            // Same pick as the former recursive step: whatever the set yields first.
            current = newUrls.isEmpty() ? null : newUrls.iterator().next();
        }
        return this.json;
    }

    /**
     * Reports whether two URLs point at the same host.
     *
     * <p>The comparison ignores case because host names are case-insensitive.
     *
     * @param bUrl the base URL
     * @param url  the URL to compare against the base
     * @return {@code true} if both URLs have the same host
     * @throws MalformedURLException if either argument cannot be parsed as a URL
     */
    public boolean checkSubOrCrossDomain(String bUrl, String url) throws MalformedURLException {
        String baseHost = new URL(bUrl).getHost();
        String otherHost = new URL(url).getHost();
        return baseHost.equalsIgnoreCase(otherHost);
    }

    /**
     * Checks a URL with this scraper's {@link UrlValidator}.
     *
     * @param url the URL to check
     * @return the validator's verdict for {@code url}
     */
    public boolean isValid(String url) {
        return urlVal.isValid(url);
    }

    /** Fetches one page, queues its unseen same-host links and records its assets. */
    private void scrapePage(String url, String baseUrl) throws IOException, JSONException {
        Document page = Jsoup.connect(url).get();

        // Bookkeeping happens only after a successful fetch.
        scrapedUrls.add(url);
        newUrls.remove(url);

        // "abs:" asks jsoup for the attribute resolved to an absolute URL.
        for (Element anchor : page.select("a[href]")) {
            String link = anchor.attr("abs:href");
            if (urlVal.isValid(link)
                    && checkSubOrCrossDomain(baseUrl, link)
                    && !scrapedUrls.contains(link)) {
                newUrls.add(link);
            }
        }

        // Media (anything with src) plus <link href> imports such as stylesheets.
        HashSet<String> assets = new HashSet<>();
        collectValidUrls(page.select("[src]"), "abs:src", assets);
        collectValidUrls(page.select("link[href]"), "abs:href", assets);

        JSONObject entry = new JSONObject();
        entry.put("url", url);
        entry.put("assets", assets);
        json.put(entry);
    }

    /** Adds the given attribute of every element to {@code target} when it is a valid URL. */
    private void collectValidUrls(Elements elements, String attribute, HashSet<String> target) {
        for (Element element : elements) {
            String value = element.attr(attribute);
            if (urlVal.isValid(value)) {
                target.add(value);
            }
        }
    }
}