Java HTML Parse Jsoup parseWithAdultCheck(URL url, int timeout)

Here you can find the source of parseWithAdultCheck(URL url, int timeout)

Description

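Fetches and parses the HTML document at the given URL (retrying on I/O errors) and, if the page is titled "Adult Content Notice", submits the confirmation form found on that page and returns the resulting document.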

License

Open Source License

Declaration

public static Document parseWithAdultCheck(URL url, int timeout)
            throws IOException 

Method Source Code

//package com.java2s;
/*
 * Copyright 2009-2012 TauNova (http://taunova.com). All rights reserved.
 *
 * This file is subject to the terms and conditions defined in
 * file 'LICENSE.txt', which is part of this source code package.
 */

import java.io.IOException;

import java.net.URL;
import java.util.Iterator;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

/**
 * Static helpers for fetching HTML pages with jsoup, including automatic
 * confirmation of an "Adult Content Notice" interstitial page.
 */
public class Main {
    /** Page title that identifies the interstitial notice page. */
    private static final String ADULT_NOTICE = "Adult Content Notice";

    /** A failed fetch is retried while the failure counter is not above this value. */
    private static final int LIMIT = 10;

    /** Failures are retried without any pause while the counter is not above this value. */
    private static final int LIMIT_SLEEP = 2;

    /**
     * Fetches the page at {@code url} via {@link #parse(URL, int)} and passes
     * the result through {@link #verifyAdultNotice(Document)}.
     *
     * @param url     address of the page to fetch
     * @param timeout value handed to jsoup's {@code Connection.timeout(int)};
     *                also used as the base of the retry pause
     * @return the fetched document, or the document returned after submitting
     *         the notice form
     * @throws IOException if fetching the page or posting the form fails
     */
    public static Document parseWithAdultCheck(URL url, int timeout)
            throws IOException {
        return verifyAdultNotice(parse(url, timeout));
    }

    /**
     * If {@code doc} is titled "Adult Content Notice", submits the notice form
     * found in it (all named {@code <input>} values, via HTTP POST) and returns
     * the response document; otherwise returns {@code doc} unchanged.
     *
     * <p>{@code doc} is also returned unchanged when it is {@code null} or when
     * no form matching the expected action pattern is present.
     *
     * @param doc the document to inspect
     * @return the document obtained by posting the form, or {@code doc} itself
     * @throws IOException if posting the form fails
     */
    public static Document verifyAdultNotice(Document doc)
            throws IOException {
        if (doc == null || !ADULT_NOTICE.equals(doc.title())) {
            return doc;
        }

        Element form = doc.select("form[action~=adult_\\w+.bml$]").first();
        if (form == null) {
            // Title matched but there is no form to submit: nothing we can do.
            return doc;
        }

        // Resolve a relative action against the document's base URI; fall back
        // to the raw attribute when no absolute URL can be built.
        String action = form.absUrl("action");
        if (action.isEmpty()) {
            action = form.attr("action");
        }

        Connection conn = Jsoup.connect(action);
        for (Element input : form.select("input")) {
            String name = input.attr("name");
            if (name.isEmpty()) {
                // Inputs without a name carry no form data to send.
                continue;
            }
            conn = conn.data(name, input.attr("value"));
        }

        return conn.post();
    }

    /**
     * Fetches and parses the page at {@code url} with an HTTP GET, retrying
     * when an {@link IOException} occurs.
     *
     * <p>The first failures are retried immediately; once the failure counter
     * exceeds {@code LIMIT_SLEEP} the method pauses for
     * {@code timeout * counter} milliseconds before the next attempt. When the
     * counter exceeds {@code LIMIT} the last exception is rethrown.
     *
     * @param url     address of the page to fetch
     * @param timeout value handed to jsoup's {@code Connection.timeout(int)};
     *                also used as the base of the retry pause
     * @return the parsed document
     * @throws IOException the last I/O failure once the retries are exhausted
     */
    public static Document parse(URL url, int timeout) throws IOException {
        for (int iteration = 0;; iteration++) {
            try {
                return Jsoup
                        .connect(url.toString())
                        .timeout(timeout)
                        .referrer("http://www.google.com/search")
                        .userAgent(
                                "Mozilla/6.0 (Windows; U; WindowsNT 5.1; en-US; rv1.8.1.6) Gecko/20070725 Firefox/3.0.0.0")
                        .get();
            } catch (IOException e) {
                System.out.println("TIMEOUT: refetching: " + iteration);
                if (iteration > LIMIT) {
                    throw e;
                }

                if (iteration > LIMIT_SLEEP) {
                    // Multiply in long so a large timeout cannot overflow int.
                    long pause = (long) timeout * iteration;
                    sleep((int) Math.min(pause, Integer.MAX_VALUE));
                }
            }
        }
    }

    /**
     * Pauses the current thread for {@code delay} milliseconds. A negative
     * delay is ignored. If the thread is interrupted while sleeping, the
     * method returns early and leaves the thread's interrupt flag set.
     *
     * @param delay pause length in milliseconds
     */
    public static void sleep(int delay) {
        if (delay < 0) {
            return;
        }
        try {
            Thread.sleep(delay);
        } catch (InterruptedException e) {
            // Restore the flag so callers can still observe the interruption.
            Thread.currentThread().interrupt();
        }
    }
}

Related

  1. parsePropertyTable(Element table)
  2. parseTable2ArrayList(Document doc, String selectorRow, String selectorCol)
  3. parseTemplate1_1(Element element)
  4. parseTemplate1_2(Element element)
  5. parseUTF8HTMLDocument(String html)
  6. prettyPrint(String html)
  7. processHtml(String html)
  8. removeAllHtmlTags(String unsafe)
  9. removeHTMLTags(final String text)