com.gumtreescraper.scraper.GumtreeScraper.java Source code

Java tutorial

Introduction

Here is the source code for com.gumtreescraper.scraper.GumtreeScraper.java

Source

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package com.gumtreescraper.scraper;

import com.gumtreescraper.model.Gumtree;
import com.gumtreescraper.util.GumtreeUtils;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.commons.lang.time.DateUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.openqa.selenium.By;
import org.openqa.selenium.StaleElementReferenceException;
import org.openqa.selenium.TimeoutException;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.support.ui.ExpectedConditions;
import org.openqa.selenium.support.ui.WebDriverWait;

/**
 * Scrapes advertisement listings from gumtree.com.au.
 *
 * <p>Result pages are read either through the Selenium driver held by the
 * superclass ({@link #scrape}, {@link #scrapeWithClick}) or through plain HTTP
 * requests with jsoup ({@link #scrapeWithJSoup}). Individual ad pages are then
 * visited by {@link #updateGumtreeModel} to collect ads placed by owners and
 * append them to a CSV file.
 *
 * @author duonghung1269
 */
public class GumtreeScraper extends AbstractScraper {

    private static final Logger LOGGER = Logger.getLogger(GumtreeScraper.class.getName());

    /** Wait, in seconds, used for explicit Selenium waits on ad and login pages. */
    private static final long SPECIAL_TIMEOUT = 30;

    private static final String BASE_URL = "https://www.gumtree.com.au";
    private static final String LOGIN_URL = "https://www.gumtree.com.au/t-login-form.html";

    /** Number of buffered records that triggers a write to the CSV file. */
    private static final int NUMBER_OF_LINE_TO_SERIALIZE = 50;

    /** Maximum consecutive failed downloads of one result page before giving up. */
    private static final int MAX_FETCH_ATTEMPTS = 3;

    /** Maximum attempts to read the ad links of a page when elements go stale. */
    private static final int MAX_STALE_ATTEMPTS = 3;

    /** Locates the title link of every ad in the search result list. */
    private static final String AD_LINK_XPATH = "//ul[@id='srchrslt-adtable']/li//h6[@class='rs-ad-title']/a";
    private static final String NEXT_PAGE_XPATH = "//a[@class='rs-paginator-btn next']";
    private static final String LAST_PAGE_XPATH = "//a[@class='rs-paginator-btn last']";

    /** Path segment that carries the page number, e.g. {@code /page-109/}. */
    private static final String PAGE_SEGMENT = "/page-";

    private final String username;
    private final String password;
    private final String url;
    private final String fileName;
    /** Cut-off date used by the jsoup scraper to decide when to stop paging; may be null. */
    private final Date lastEditedDate;
    /** Timeout in seconds (multiplied by 1000 for jsoup, passed as-is to Selenium waits). */
    private final int timeout;

    /**
     * Creates the scraper and starts a new Chrome browser session.
     *
     * @param userName       account name typed into the login form
     * @param password       account password typed into the login form
     * @param url            start URL returned by {@link #getStartURL()}
     * @param fileName       CSV file that {@link #updateGumtreeModel} appends to
     * @param lastEditedDate cut-off date for {@link #scrapeWithJSoup}; may be null
     * @param timeout        timeout in seconds
     */
    public GumtreeScraper(String userName, String password, String url, String fileName, Date lastEditedDate,
            int timeout) {
        super();
        this.username = userName;
        this.password = password;
        this.url = url;
        this.fileName = fileName;
        // Date is mutable: keep a private copy so later changes by the caller do not leak in.
        this.lastEditedDate = (lastEditedDate == null) ? null : new Date(lastEditedDate.getTime());
        this.timeout = timeout;
        this.webDriver = new ChromeDriver();
    }

    /**
     * Opens the login page, submits the credentials and waits for the
     * sign-out element to appear.
     *
     * @return {@code true} if the sign-out element was found within
     *         {@link #SPECIAL_TIMEOUT} seconds, {@code false} otherwise
     */
    public boolean login() {
        openSite(LOGIN_URL);
        webDriver.findElement(By.id("login-email")).sendKeys(username);
        webDriver.findElement(By.id("login-password")).sendKeys(password);
        webDriver.findElement(By.className("login-form-submit")).findElement(By.tagName("button")).click();

        try {
            // The sign-out element is only expected once the login succeeded.
            waitForElement(By.className("item-sign-out"));
            return true;
        } catch (RuntimeException ex) {
            System.out.println(ex);
            return false;
        }
    }

    /**
     * Visits every ad in {@code gumtrees}, fills in its type and advertiser
     * name, and appends the ads placed by an owner to the CSV file.
     *
     * <p>Records are buffered and written every
     * {@link #NUMBER_OF_LINE_TO_SERIALIZE} entries; whatever is still buffered
     * when the method ends (normally or through an exception) is written as
     * well, so no collected record is dropped. Ads whose page times out get the
     * note {@code TIME_OUT} and are not written.
     *
     * @param gumtrees ads to visit; each must already carry its URL
     */
    public void updateGumtreeModel(List<Gumtree> gumtrees) {
        List<Gumtree> pending = new ArrayList<>();
        try {
            for (Gumtree gumtree : gumtrees) {
                try {
                    if (!loadOwnerDetails(gumtree)) {
                        continue;
                    }

                    pending.add(gumtree);
                    if (pending.size() >= NUMBER_OF_LINE_TO_SERIALIZE) {
                        writeToCsvFile(pending, fileName);
                        pending.clear();
                    }
                } catch (TimeoutException ex) {
                    gumtree.setNotes("TIME_OUT");
                    System.out.print(ex.getMessage());
                }
            }
        } finally {
            // Flush the partial batch even if an unexpected driver error aborts the loop.
            if (!pending.isEmpty()) {
                writeToCsvFile(pending, fileName);
            }
        }
    }

    /**
     * Opens the ad page and copies type and advertiser name into {@code gumtree}.
     *
     * @return {@code true} if the ad is offered by an owner, {@code false} otherwise
     * @throws TimeoutException if an expected element does not appear in time
     */
    private boolean loadOwnerDetails(Gumtree gumtree) {
        openSite(gumtree.getUrl());
        waitForPageToLoad();

        String category = waitForElement(By.xpath("//meta[@name='WT.cg_s']"))
                .getAttribute("content").toLowerCase();
        String type = category.contains("sale") ? "sale" : "rent";
        gumtree.setType(type);

        String saleRentId = "rent".equals(type) ? "forrentby_s-wrapper" : "forsaleby_s-wrapper";
        String saleRentBy = waitForElement(
                By.xpath("//div[@id='ad-attributes']/dl[contains(@id, '" + saleRentId + "')]/dd"))
                .getText().trim();

        System.out.println("===Sale/Rent by: " + saleRentBy);

        if (!"owner".equalsIgnoreCase(saleRentBy)) {
            return false;
        }

        String name = waitForElement(By.xpath("//div[@id='reply-form']//div[@class='reply-form-name']/a"))
                .getText().trim();
        gumtree.setName(name);
        return true;
    }

    /** Waits up to {@link #SPECIAL_TIMEOUT} seconds for the element to be present in the DOM. */
    private WebElement waitForElement(By locator) {
        return new WebDriverWait(this.webDriver, SPECIAL_TIMEOUT)
                .until(ExpectedConditions.presenceOfElementLocated(locator));
    }

    /**
     * Appends one line per ad (its {@code toString()} value) to the given file.
     * I/O failures are logged and not propagated.
     *
     * @param gumtreesNeedToWrite ads to write
     * @param outputCsvFileName   path of the file to append to
     */
    public static void writeToCsvFile(List<Gumtree> gumtreesNeedToWrite, String outputCsvFileName) {
        // try-with-resources closes (and thereby flushes) the writer even when a write fails.
        try (FileWriter writer = new FileWriter(outputCsvFileName, true)) {
            for (Gumtree gumtree : gumtreesNeedToWrite) {
                writer.append(gumtree.toString()).append("\n");
            }
        } catch (IOException ex) {
            LOGGER.log(Level.SEVERE, null, ex);
        }
    }

    /**
     * Collects ad URLs by opening {@code url} in the browser and clicking the
     * "next" paginator button until it is no longer present. Stops early if
     * the thread is interrupted while pausing between pages.
     *
     * @param gumtrees list that receives one entry per ad link found
     * @param url      first result page
     */
    public void scrapeWithClick(List<Gumtree> gumtrees, String url) {
        openSite(url);
        waitForPageToLoad();

        while (true) {
            List<WebElement> gumtreeAds = new WebDriverWait(this.webDriver, getTimeout())
                    .until(ExpectedConditions.presenceOfAllElementsLocatedBy(By.xpath(AD_LINK_XPATH)));

            for (WebElement ad : gumtreeAds) {
                gumtrees.add(newAd(ad.getAttribute("href")));
            }

            List<WebElement> nextElements = webDriver.findElements(By.xpath(NEXT_PAGE_XPATH));
            if (nextElements.isEmpty()) { // no more next page
                return;
            }

            nextElements.get(0).click();
            // Fixed pause to let the next result page render before reading it.
            if (!waitForSeconds(5)) {
                return;
            }
        }
    }

    /**
     * Collects ad URLs by downloading result pages with jsoup, starting at
     * {@code url} and following the "next" paginator link.
     *
     * <p>Only ads accepted by {@link #isOwner} are added. Paging stops when
     * there is no next link, or when the date shown on the last ad of a page
     * makes {@link #needToScrapeNextPage} return {@code false}. A page that
     * cannot be downloaded is retried; after {@link #MAX_FETCH_ATTEMPTS}
     * consecutive failures the last {@link IOException} is rethrown. An
     * unexpected runtime failure while processing a page is printed and ends
     * the scrape, keeping the ads collected so far.
     *
     * @param gumtrees list that receives one entry per owner ad found
     * @param url      first result page
     * @throws IOException if a result page repeatedly fails to download
     */
    public void scrapeWithJSoup(List<Gumtree> gumtrees, String url) throws IOException {
        String nextPageUrl = url;
        int consecutiveFetchFailures = 0;
        boolean needContinue = true;

        while (needContinue) {
            try {
                Document doc = Jsoup.connect(nextPageUrl).timeout(getTimeout() * 1000).userAgent("Mozilla")
                        .get();
                consecutiveFetchFailures = 0;

                Elements adElements = doc.select("#srchrslt-adtable > li");
                for (Element ad : adElements) {
                    if (!isOwner(ad)) {
                        continue;
                    }

                    Element linkElement = ad.select("h6.rs-ad-title > a").first();
                    if (linkElement == null) {
                        System.out.print(ad);
                        continue;
                    }

                    gumtrees.add(newAd(BASE_URL + linkElement.attr("href")));
                }

                // The stop decision uses the last ad on the page regardless of who
                // placed it, so an agent ad in last position cannot hide the date.
                if (!adElements.isEmpty()) {
                    Element lastAd = adElements.get(adElements.size() - 1);
                    Elements adDateElements = lastAd.select("div.rs-ad-date");
                    if (!adDateElements.isEmpty()
                            && !needToScrapeNextPage(adDateElements.first().text().trim())) {
                        needContinue = false;
                    }
                }

                Elements nextElements = doc.select("a.rs-paginator-btn.next");
                if (nextElements.isEmpty()) {
                    break;
                }

                nextPageUrl = BASE_URL + nextElements.first().attr("href");
                System.out.println("next page: " + nextPageUrl);
            } catch (IOException ex) {
                // Only the download can raise IOException, and it happens before any
                // ad of the page is added, so retrying cannot create duplicates.
                System.out.println(ex);
                consecutiveFetchFailures++;
                if (consecutiveFetchFailures >= MAX_FETCH_ATTEMPTS) {
                    throw ex;
                }
            } catch (RuntimeException ex) {
                // Retrying a parsing failure would fail the same way again; stop instead.
                System.out.println(ex);
                return;
            }
        }
    }

    /** Creates an ad record that carries only the given URL. */
    private static Gumtree newAd(String adUrl) {
        Gumtree gumtree = new Gumtree();
        gumtree.setUrl(adUrl);
        return gumtree;
    }

    /**
     * Tells whether a result-list entry should be treated as an owner ad.
     *
     * @param adElement one {@code li} of the result list
     * @return {@code false} if the "for sale by" or "for rent by" label reads
     *         "agency" or "agent" (case-insensitive); {@code true} otherwise,
     *         including when neither label is present
     */
    private boolean isOwner(Element adElement) {
        String[] advertiserSelectors = {
            "span.rs-ad-attributes-forsaleby_s",
            "span.rs-ad-attributes-forrentby_s"
        };

        for (String selector : advertiserSelectors) {
            Elements labels = adElement.select(selector);
            if (labels.isEmpty()) {
                // The label is sometimes not displayed; such ads are kept.
                continue;
            }

            String advertiser = labels.first().text().trim();
            if ("agency".equalsIgnoreCase(advertiser) || "agent".equalsIgnoreCase(advertiser)) {
                return false;
            }
        }

        return true;
    }

    /**
     * Decides from the date text of an ad whether the following result page
     * should still be scraped.
     *
     * @param dateStr date text shown on the ad (relative such as "minutes",
     *                "hours", "yesterday", or a form understood by
     *                {@code GumtreeUtils.convertStringToDate})
     * @return {@code true} to keep paging, {@code false} to stop
     */
    private boolean needToScrapeNextPage(String dateStr) {
        if (lastEditedDate == null) {
            // NOTE(review): without a cut-off date there is nothing to compare against;
            // paging continues until the paginator runs out. Confirm this is the
            // desired policy (previously this case threw from DateUtils.isSameDay).
            return true;
        }

        String lowered = dateStr.toLowerCase();
        Date today = new Date();

        if (DateUtils.isSameDay(today, lastEditedDate)) {
            if (lowered.contains("minutes") || lowered.contains("hours")) {
                return true;
            }

            if (lowered.contains("yesterday")) {
                return false;
            }
        }

        Calendar cal = Calendar.getInstance();
        cal.setTime(today);
        cal.add(Calendar.DAY_OF_YEAR, -1);
        Date yesterday = cal.getTime();

        if (DateUtils.isSameDay(yesterday, lastEditedDate) && lowered.contains("yesterday")) {
            return true;
        }

        Date adDate = GumtreeUtils.convertStringToDate(dateStr);
        return adDate != null && DateUtils.isSameDay(adDate, lastEditedDate);
    }

    /**
     * Collects ad URLs by opening every result page (page 1 up to the total
     * reported by the paginator) in the browser.
     *
     * <p>After the first page is opened the method pauses for 20 seconds; if
     * the thread is interrupted during that pause the scrape stops.
     *
     * @param gumtrees list that receives one entry per ad link found
     * @param url      first result page, e.g. {@code .../s-land-for-sale/c20031}
     * @throws StaleElementReferenceException if the ad links of a page keep
     *         going stale for {@link #MAX_STALE_ATTEMPTS} attempts
     */
    public void scrape(List<Gumtree> gumtrees, String url) {
        int totalPage = getTotalPage(url);

        for (int page = 1; page <= totalPage; page++) {
            openSite(buildPageUrl(url, page));
            waitForPageToLoad();

            if (page == 1 && !waitForSeconds(20)) {
                return;
            }

            for (String adUrl : readAdUrlsOnCurrentPage()) {
                gumtrees.add(newAd(adUrl));
            }
        }
    }

    /**
     * Reads the href of every ad link on the page currently shown. If an
     * element goes stale while being read, all links are located again and the
     * whole page is re-read, so a page contributes either all or none of its URLs.
     */
    private List<String> readAdUrlsOnCurrentPage() {
        StaleElementReferenceException lastFailure = null;

        for (int attempt = 1; attempt <= MAX_STALE_ATTEMPTS; attempt++) {
            try {
                List<WebElement> adLinks = new WebDriverWait(this.webDriver, SPECIAL_TIMEOUT)
                        .until(ExpectedConditions.presenceOfAllElementsLocatedBy(By.xpath(AD_LINK_XPATH)));

                List<String> adUrls = new ArrayList<>(adLinks.size());
                for (WebElement adLink : adLinks) {
                    adUrls.add(adLink.getAttribute("href"));
                }
                return adUrls;
            } catch (StaleElementReferenceException ex) {
                lastFailure = ex;
                LOGGER.log(Level.WARNING, "Ad links went stale, attempt " + attempt, ex);
            }
        }

        throw lastFailure;
    }

    /**
     * Inserts a {@code /page-N} segment before the last path segment of the URL,
     * e.g. {@code .../s-land-for-sale/c20031} becomes
     * {@code .../s-land-for-sale/page-109/c20031}.
     *
     * @throws IllegalArgumentException if {@code url} contains no {@code /}
     */
    private String buildPageUrl(String url, int pageNo) {
        int lastIndexOfSlash = url.lastIndexOf("/");
        if (lastIndexOfSlash < 0) {
            throw new IllegalArgumentException("URL has no path separator: " + url);
        }
        return url.substring(0, lastIndexOfSlash) + PAGE_SEGMENT + pageNo + url.substring(lastIndexOfSlash);
    }

    /**
     * Opens {@code url} and reads the number of result pages from the href of
     * the paginator's "last" button (e.g. {@code .../page-109/c20031}).
     *
     * @return the page count, or 1 when no "last" button appears within
     *         15 seconds or its href carries no page segment
     * @throws NumberFormatException if the page segment is not a number
     */
    private int getTotalPage(String url) {
        openSiteWithoutTimeout(url);

        String lastPageUrl;
        try {
            lastPageUrl = new WebDriverWait(this.webDriver, 15)
                    .until(ExpectedConditions.presenceOfElementLocated(By.xpath(LAST_PAGE_XPATH)))
                    .getAttribute("href");
        } catch (TimeoutException ex) {
            // No "last" button: treat the result as a single page.
            return 1;
        }

        if (lastPageUrl == null) {
            return 1;
        }

        int indexPage = lastPageUrl.indexOf(PAGE_SEGMENT);
        if (indexPage < 0) {
            return 1;
        }

        int start = indexPage + PAGE_SEGMENT.length();
        int end = lastPageUrl.indexOf('/', start);
        if (end < 0) {
            end = lastPageUrl.length();
        }
        return Integer.parseInt(lastPageUrl.substring(start, end));
    }

    /**
     * Sleeps for the given number of seconds.
     *
     * @return {@code true} if the full pause elapsed; {@code false} if the
     *         thread was interrupted, in which case the interrupt flag is restored
     */
    private boolean waitForSeconds(int seconds) {
        try {
            Thread.sleep(seconds * 1000L);
            return true;
        } catch (InterruptedException ex) {
            Thread.currentThread().interrupt();
            LOGGER.log(Level.SEVERE, null, ex);
            return false;
        }
    }

    /** Returns the URL passed to the constructor. */
    @Override
    protected String getStartURL() {
        return url;
    }

    /** Returns the project-relative path of the ChromeDriver executable. */
    @Override
    protected String getDriverPath() {
        return "src/main/java/driver/chromedriver.exe";
    }

    /** Returns the timeout, in seconds, passed to the constructor. */
    @Override
    protected int getTimeout() {
        return timeout;
    }

}