Example usage for org.jsoup.select Elements select

List of usage examples for org.jsoup.select Elements select

Introduction

On this page you can find example usages of the org.jsoup.select.Elements select method.

Prototype

public Elements select(String query) 

Source Link

Document

Find matching elements within this element list.

Usage

From source file:com.normalexception.app.rx8club.fragment.pm.PrivateMessageViewFragment.java

/**
 * Format the user post by removing the vb style quotes and the 
 * duplicate youtube links/*from  w  ww . j  a v a 2 s. co m*/
 * @param innerPost   The element that contains the inner post
 * @return         The formatted string
 */
private String formatUserPost(Elements innerPost) {

    // Remove the duplicate youtube links (this is caused by a plugin on 
    // the forum that embeds youtube videos automatically)
    for (Element embedded : innerPost.select("div[id^=ame_doshow_post_]"))
        embedded.remove();

    // Remove the vbulletin quotes
    String upost = Utils.reformatQuotes(innerPost.html());

    return upost;
}

From source file:com.normalexception.app.rx8club.fragment.ProfileFragment.java

/**
 * Get the user information from the users profile
 * @param doc   The page document/*from ww  w .java  2s . c o  m*/
 */
private void getUserInformation(Document doc) {
    final UserProfile upInstance = UserProfile.getInstance();
    stubs = new ArrayList<ProfileModel>();

    // Title
    Elements userInfo = doc.select("div[id=main_userinfo]");
    Elements title = userInfo.select("h2");
    upInstance.setUserTitle(title.text());

    // Posts
    Elements statisticInfo = doc.select("fieldset[class=statistics_group]");
    Elements post = statisticInfo.select("li");

    // Profile Pic
    Elements profilePicInfo = doc.select("td[id=profilepic_cell] > img");

    // Grab image, trap
    try {
        upInstance.setUserImageLink(profilePicInfo.attr("src"));
    } catch (Exception e) {
    }

    // Grab Post count, trap exception
    try {
        upInstance.setUserPostCount(
                post.get(0).text() + " / " + post.get(1).text().split(" ", 4)[3] + " per day");
    } catch (Exception e) {
        upInstance.setUserPostCount("Error Getting Post Count");
    }

    // Grab Join Date, trap exception
    try {
        upInstance.setUserJoinDate(post.get(13).text());
    } catch (Exception e) {
        upInstance.setUserJoinDate("Error Getting Join Date");
    }

    // Threads
    String link = WebUrls.userUrl + upInstance.getUserId();
    doc = VBForumFactory.getInstance().get(getActivity(), link);
    if (doc != null) {
        Elements threadlist = doc.select("table[id^=post]");
        for (Element threadl : threadlist) {
            ProfileModel stub = new ProfileModel();
            Elements divs = threadl.getElementsByTag("div");
            Elements div = divs.get(1).getElementsByTag("a");
            stub.setLink(div.attr("href"));
            stub.setName(div.text());

            div = divs.get(5).getElementsByTag("a");
            stub.setText(div.text());
            stubs.add(stub);
        }
    }
}

From source file:com.normalexception.app.rx8club.fragment.thread.EditPostFragment.java

/**
 * Look up an input element by its name attribute and report its value.
 *
 * @param pan    The panel where all of the input elements reside
 * @param name   The name of the input to get the value for
 * @return       The value attribute of the matching input, as returned by
 *               {@code Elements.attr("value")}
 */
private String getInputElementValue(Elements pan, String name) {
    final String query = "input[name=" + name + "]";
    final Elements matches = pan.select(query);
    return matches.attr("value");
}

From source file:com.normalexception.app.rx8club.fragment.thread.ThreadFragment.java

/**
 * Parse a thread page: update the pagination fields, check whether the
 * moderation tools are present, capture the logged in user id, security
 * token, post number and thread number, and append one {@code PostModel}
 * to {@code postlist} for every post that could be parsed.
 *
 * @param doc   The document parsed from the link
 */
public void getThreadContents(Document doc) {
    // Update pagination
    try {
        Element pageNav = doc.select("div[class=pagenav]").first();
        if (pageNav != null) {
            // presumably the text reads like "Page X of Y" - the 2nd and
            // 4th words are taken as current and final page
            String[] pageWords = pageNav.select("td[class^=vbmenu_control]").text().split(" ");
            thisPage = pageWords[1];
            finalPage = pageWords[3];
            Log.d(TAG, String.format("This Page: %s, Final Page: %s", thisPage, finalPage));
        } else {
            Log.d(TAG, "Thread only contains one page");
        }
    } catch (Exception e) {
        Log.e(TAG, "We had an error with pagination", e);
    }

    // Is user thread admin??
    Elements threadTools = doc.select("div[id=threadtools_menu] > form > table");
    if (threadTools.text().contains(MODERATION_TOOLS)) {
        Log.d(TAG, "<><> User has administrative rights here! <><>");
    } else {
        lv.removeHeaderView(adminContent);
    }

    // Get the user's actual ID, there is a chance they never got it
    // before
    UserProfile.getInstance().setUserId(HtmlFormUtils.getInputElementValueByName(doc, "loggedinuser"));

    // Get Post Number and security token
    securityToken = HtmlFormUtils.getInputElementValueByName(doc, "securitytoken");

    Elements pNumber = doc.select("a[href^=http://www.rx8club.com/newreply.php?do=newreply&noquote=1&p=]");
    String pNumberHref = pNumber.attr("href");
    postNumber = pNumberHref.substring(pNumberHref.lastIndexOf("=") + 1);
    threadNumber = doc.select("input[name=searchthreadid]").attr("value");

    Elements posts = doc.select("div[id=posts]").select("div[id^=edit]");
    Log.v(TAG, String.format("Parsing through %d posts", posts.size()));
    for (Element post : posts) {
        try {
            Elements innerPost = post.select("table[id^=post]");

            // User Control Panel. first()/last() return null when nothing
            // matched; the resulting exception is what routes such posts
            // to the catch below.
            Elements userCp = innerPost.select("td[class=alt2]");
            Elements userDetail = userCp.select("div[class=smallfont]");
            Elements userSubDetail = userDetail.last().select("div");
            Elements userAvatar = userDetail.select("img[alt$=Avatar]");

            // User Information
            PostModel pv = new PostModel();
            pv.setUserName(userCp.select("div[id^=postmenu]").text());
            pv.setIsLoggedInUser(LoginFactory.getInstance().isLoggedIn()
                    && UserProfile.getInstance().getUsername().equals(pv.getUserName()));
            pv.setUserTitle(userDetail.first().text());
            pv.setUserImageUrl(userAvatar.attr("src"));
            pv.setPostDate(innerPost.select("td[class=thead]").first().text());
            pv.setPostId(Utils.parseInts(post.attr("id")));
            pv.setRootThreadUrl(currentPageLink);

            // get Likes if any exist
            List<String> likes = new ArrayList<String>();
            for (Element eLike : innerPost.select("div[class*=vbseo_liked] > a")) {
                likes.add(eLike.text());
            }
            pv.setLikes(likes);

            // Pick the location / post count / join date lines out of the
            // user detail block
            for (Element detail : userSubDetail) {
                String txt = detail.text();
                if (txt.contains("Location:")) {
                    pv.setUserLocation(txt);
                } else if (txt.contains("Posts:")) {
                    pv.setUserPostCount(txt);
                } else if (txt.contains("Join Date:")) {
                    pv.setJoinDate(txt);
                }
            }

            // User Post Content
            pv.setUserPost(formatUserPost(innerPost));

            // User signature (optional)
            Element userSig = innerPost.select("div[class=konafilter]").first();
            if (userSig != null) {
                pv.setUserSignature(userSig.html());
            }

            // Attachments are only set when at least one exists
            Elements postAttachments = innerPost.select("a[id^=attachment]");
            if (!postAttachments.isEmpty()) {
                ArrayList<String> attachments = new ArrayList<String>();
                for (Element postAttachment : postAttachments) {
                    attachments.add(postAttachment.attr("href"));
                }
                pv.setAttachments(attachments);
            }

            pv.setSecurityToken(securityToken);

            // Make sure we aren't adding a blank user post
            if (pv.getUserPost() != null) {
                postlist.add(pv);
            }
        } catch (Exception e) {
            Log.w(TAG, "Error Parsing Post...Probably Deleted");
        }
    }
}

From source file:com.normalexception.app.rx8club.fragment.thread.ThreadFragment.java

/**
 * Extract the message body of a post and clean it up: the duplicate
 * youtube embeds are dropped and the vb style quotes are reformatted.
 *
 * @param innerPost   The element that contains the inner post
 * @return            The formatted string, or null when the post has no
 *                    message body or parsing failed
 */
private String formatUserPost(Elements innerPost) {
    try {
        Elements messageCells = innerPost.select("td[class=alt1]");
        Element message = messageCells.select("div[id^=post_message]").first();

        // Nothing to key off of, so there is nothing to format
        if (message == null) {
            return null;
        }

        // Remove the duplicate youtube links (this is caused by a plugin on
        // the forum that embeds youtube videos automatically)
        Elements embeddedVideos = message.select("div[id^=ame_doshow_post_]");
        for (Element video : embeddedVideos) {
            video.remove();
        }

        // Remove the vbulletin quotes
        String cleaned = Utils.reformatQuotes(message.html());
        return cleaned;
    } catch (Exception e) {
        Log.e(TAG, "Error Parsing Post", e);
        return null;
    }
}

From source file:noThreads.Menu.java

/**
 * Show the console menu, read the user's selection(s) from standard input
 * and add the URL(s) of the chosen e-radio page(s) to {@code theUrls}.
 * <p>
 * The last number entered is left in the {@code choice} field. The JVM is
 * terminated with status 1 on unreadable or non-numeric input, on an
 * out-of-range selection, or when a page cannot be loaded, and with
 * status 0 when the user picks "Exit".
 *
 * @throws IOException            declared for callers; see {@code parseUrl}
 * @throws InterruptedException   declared for callers; see {@code parseUrl}
 */
public void createMenu() throws IOException, InterruptedException {
    Document doc;
    // A single reader serves every prompt: a fresh BufferedReader per prompt
    // could swallow input that an earlier reader had already buffered.
    BufferedReader br = new BufferedReader(new InputStreamReader(System.in));

    System.out.print("******************** Menu Options ******************** "
            + "\n1. Get a playlist for all the stations at <e-radio.gr>"
            + "\n2. View the available station Categories and get a playlist."
            + "\n3. View the available station Locations and get a playlist."
            + "\n4. View the station Ratings (Top) and get a playlist." + "\n5. Exit." + "\n\n"
            + "Please make a choice (1-5): ");

    choice = readMenuChoice(br);

    switch (choice) {
    case (1): //GET all the e-radio location links (in order to get all the links)
        doc = loadMainPageOrExit();

        Elements links = doc.select("div[id=paneContainer]").select("a[href*=/locations/]");

        for (Element link : links) {
            theUrls.add(link.attr("abs:href"));
        }
        System.out.println("...Processing <All e-radio> station links");
        break;

    case (2): //Get CATEGORIES
        doc = loadMainPageOrExit();

        Elements categoryLinks = doc.select("div[id=paneContainer]").select("a[href*=/categories/]");

        System.out.println("E-radio stations available categories: " + "\n");
        for (int i = 0; i < categoryLinks.size(); i++) {
            System.out.println(i + 1 + ".  " + StringEscapeUtils.unescapeHtml4(categoryLinks.get(i).html()));
        }
        System.out.print("\n" + "Please make a choise (1-" + categoryLinks.size() + "): ");

        choice = readMenuChoice(br);
        if (choice <= categoryLinks.size() && choice >= 1) {
            theUrls.add(categoryLinks.get(choice - 1).attr("abs:href"));
            System.out.println("...Processing the <"
                    + StringEscapeUtils.unescapeHtml4(categoryLinks.get(choice - 1).html()) + "> category");
        } else {
            exitWithMessage("Wrong selection...");
        }
        break;

    case (3): //Get LOCATIONS
        doc = loadMainPageOrExit();

        Elements locationLinks = doc.select("div[id=paneContainer]").select("a[href*=/locations/]");

        System.out.println("E-radio stations available locations: " + "\n");
        for (int i = 0; i < locationLinks.size(); i++) {
            System.out.println(i + 1 + ".  " + StringEscapeUtils.unescapeHtml4(locationLinks.get(i).html()));
        }
        System.out.print("\n" + "Please make a choise (1-" + locationLinks.size() + "): ");

        choice = readMenuChoice(br);
        if (choice <= locationLinks.size() && choice >= 1) {
            theUrls.add(locationLinks.get(choice - 1).attr("abs:href"));
            System.out.println("...Processing <"
                    + StringEscapeUtils.unescapeHtml4(locationLinks.get(choice - 1).html()) + "> locatino");
        } else {
            exitWithMessage("Wrong selection!");
        }
        break;

    case (4): //Get RATINGS
        // Menu entry that leads to a further page of sub-menus
        final int YEARLY_RATING = 10;
        doc = loadMainPageOrExit();

        Elements ratingsMenu = doc.select("div[class=menuFly]").select("li").select("a[class=hide]");

        print("\nStations ratings: \n");

        for (int i = 0; i < ratingsMenu.size(); i++) {
            System.out.println(i + 1 + ".  " + StringEscapeUtils.unescapeHtml4(ratingsMenu.get(i).html()));
        }
        System.out.print("\n" + "Please make a choise (1-" + ratingsMenu.size() + "): ");

        choice = readMenuChoice(br);

        /*
         * The html of the Ratings menu processed
         * has this structure:
         * <div>
         *    <ul>
         *       <li>
         *          <ul>
         *             ...
         *          </ul>
         *       </li>
         *       ...
         *    </ul>
         * </div>
         */
        if (choice <= ratingsMenu.size() && choice >= 1) {
            //Get the DIV element with class "menuFly"
            Elements div = doc.select("div[class=menuFly]");
            //div Elements list has only one element. So get the children of div
            Elements ul = div.get(0).children();
            //ul Elements list has only one element. So get the children of ul
            Elements li = ul.get(0).children();

            //remove blank elements; walk backwards so that a removal never
            //makes the loop skip the element that slides into the freed slot
            for (int j = li.size() - 1; j >= 0; j--) {
                if (!li.get(j).hasText()) {
                    li.remove(j);
                }
            }

            //get the title of user choice and print it out
            print("\n%s", StringEscapeUtils.unescapeHtml4(ratingsMenu.get(choice - 1).html()) + "\n");
            //check if there is a sub-menu
            Elements ulTag = li.get(choice - 1).select("ul");
            if (ulTag.hasText()) {
                Elements subMenu = ulTag.select("li").select("a[href]");

                //print the sub-menu
                for (int j = 0; j < subMenu.size(); j++) {
                    print("%s.  %s ", j + 1, StringEscapeUtils.unescapeHtml4(subMenu.get(j).html()));
                }

                System.out.print("\n" + "Please make a choise (1-" + subMenu.size() + "): ");

                choice = readMenuChoice(br);

                if (choice <= subMenu.size() && choice >= 1) {
                    theUrls.add(subMenu.get(choice - 1).attr("abs:href"));
                    System.out.println("...Processing the <"
                            + StringEscapeUtils.unescapeHtml4(subMenu.get(choice - 1).html()) + "> category");
                } else {
                    exitWithMessage("Wrong selection!");
                }
            } else if (choice == YEARLY_RATING) {
                addYearlyRatingUrl(li.get(choice - 1).select("a[href]").attr("abs:href"), br);
            } else {
                theUrls.add(li.get(choice - 1).select("a[href]").attr("abs:href"));
                System.out.println("...Processing the <"
                        + StringEscapeUtils.unescapeHtml4(ratingsMenu.get(choice - 1).html())
                        + "> category");
                print(li.get(choice - 1).select("a[href]").attr("abs:href"));
            }
        } else {
            exitWithMessage("Wrong selection!");
        }
        break;

    case (5):
        System.out.println("Exiting program...");
        System.exit(0);
        break;

    default:
        System.out.println("Invalid choice! Exiting...");
        System.exit(1);
        break;

    }
}

/**
 * Read one line from the console and parse it as a menu number. Prints
 * "Error!" and terminates the JVM with status 1 when the line cannot be
 * read, the input has ended, or the text is not a number.
 */
private int readMenuChoice(BufferedReader in) {
    try {
        String line = in.readLine();
        if (line != null) {
            return Integer.parseInt(line.trim());
        }
    } catch (IOException e) {
        // reported below
    } catch (NumberFormatException e) {
        // reported below
    }
    System.out.println("Error!");
    System.exit(1);
    return -1; // not reached, System.exit does not return
}

/**
 * Load the e-radio start page, terminating the JVM with status 1 when it
 * cannot be fetched.
 */
private Document loadMainPageOrExit() throws IOException, InterruptedException {
    Document page = parseUrl(URL, 0);
    if (page == null) {
        print("No connection to the server! Exiting...");
        System.exit(1);
    }
    return page;
}

/**
 * Print the given message followed by "Exiting program..." and terminate
 * the JVM with status 1.
 */
private void exitWithMessage(String message) {
    System.out.println(message);
    System.out.println("Exiting program...");
    System.exit(1);
}

/**
 * Handle the yearly rating entry: load its page, let the user pick one of
 * the tabs and add the resulting URL to {@code theUrls}.
 */
private void addYearlyRatingUrl(String url, BufferedReader in) throws IOException, InterruptedException {
    Document page = parseUrl(url, 0);
    if (page == null) {
        exitWithMessage("ERROR: Cannot get links from server!");
        return;
    }

    Elements yearTopSubMenu = page.select("div[id=maintabsid]").select("a[href]");

    //print the sub-menu
    for (int i = 0; i < yearTopSubMenu.size(); i++) {
        print("%s.  %s", i + 1, StringEscapeUtils.unescapeHtml4(yearTopSubMenu.get(i).html()));
    }

    System.out.print("\n" + "Please make a choise (1-" + yearTopSubMenu.size() + "): ");

    choice = readMenuChoice(in);
    if (choice > yearTopSubMenu.size() || choice < 1) {
        exitWithMessage("Wrong selection!");
        return;
    }

    Element selectedTab = yearTopSubMenu.get(choice - 1);
    if (choice == 1) {
        theUrls.add(selectedTab.attr("abs:href"));
        print("...Processing the <" + StringEscapeUtils.unescapeHtml4(selectedTab.html()) + "> category");
    } else {
        // the second tab filters by category, every later tab by location
        addFilteredYearlyUrl(selectedTab, choice == 2, in);
    }
}

/**
 * Load the page behind a yearly rating tab, let the user pick one of its
 * filter options and add the tab URL, with the option id inserted, to
 * {@code theUrls}.
 */
private void addFilteredYearlyUrl(Element tab, boolean byCategory, BufferedReader in)
        throws IOException, InterruptedException {
    String link = tab.attr("abs:href");
    Document page = parseUrl(link, 0);

    //print menu title
    print("\n%s", StringEscapeUtils.unescapeHtml4(tab.html()) + "\n");

    if (page == null) {
        exitWithMessage("ERROR: Cannot get links from server!");
        return;
    }

    // Prefix stripped from the option labels, both when listing them and
    // when confirming the selection
    String labelPrefix = byCategory ? "Select category: " : "Select location: ";

    Elements options = page.select("select[id=selectoption]").select("option[value]");
    ArrayList<Integer> ids = new ArrayList<Integer>();

    for (int i = 0; i < options.size(); i++) {
        //collect the option values and print the sub-menu
        ids.add(Integer.parseInt(options.get(i).attr("value")));
        print("%s.  %s", i + 1,
                StringEscapeUtils.unescapeHtml4(options.get(i).html().replace(labelPrefix, "")));
    }

    System.out.print("\n" + "Please make a choise (1-" + options.size() + "): ");

    choice = readMenuChoice(in);
    if (choice > options.size() || choice < 1) {
        exitWithMessage("Wrong selection!");
        return;
    }

    int id = ids.get(choice - 1);
    String filteredLink;
    if (byCategory) {
        // insert the id right after the max parameter
        filteredLink = link.replace("max=100&", "max=100&id=" + id + "&");
    } else {
        // replace the third query part with the chosen id
        String[] linkParts = link.split("&", 4);
        filteredLink = linkParts[0] + "&" + linkParts[1] + "&" + "id=" + id + "&" + linkParts[3];
    }

    theUrls.add(filteredLink);
    System.out.println("...Processing the <"
            + StringEscapeUtils.unescapeHtml4(options.get(choice - 1).html().replace(labelPrefix, ""))
            + "> category");
    print(options.get(choice - 1).select("a[href]").attr("abs:href"));
}

From source file:org.aliuge.crawler.jobconf.ExtractConfig.java

/**
 * ????/*from   w w  w  .j  a v  a2s.c om*/
 * @param doc
 * @return
 * @throws ConfigurationException
 */
public ExtractConfig loadConfig(Document doc) {
    Elements extractElement = doc.select("extract");
    super.setJobName(doc.select("job").attr("name"));
    super.setIndexName(doc.select("job").attr("indexName"));
    String temp = extractElement.select("threadNum").text();
    if (StringUtils.isNotBlank(temp)) {
        this.threadNum = Integer.parseInt(temp);
    }

    Elements templateElement = extractElement.select("extract").select("template");
    Iterator<Element> it = templateElement.iterator();

    while (it.hasNext()) {
        Element template = it.next();
        ExtractTemplate extractTemplate = new ExtractTemplate();
        // ?Url????
        Elements urlPatternElement = template.select("url");
        List<Pattern> patterns = Lists.newArrayList();
        for (Element urlElement : urlPatternElement) {
            patterns.add(Pattern.compile(urlElement.text()));
        }
        extractTemplate.setUrlPattern(patterns);
        extractTemplate.setName(template.attr("name"));
        // ???
        Elements selectElement = template.select("elements").first().children();
        for (Element element : selectElement) {
            if ("element".equals(element.tagName())) {
                AbstractElementCssSelector<?> selector = ElementCssSelectorFactory.create(element);
                extractTemplate.addCssSelector(selector);
            } else if ("if".equals(element.tagName())) {
                IFConditions ifConditions = IFConditions.create(element);
                extractTemplate.addConditions(ifConditions);
            }
        }
        super.setExtractConfig(this);
        this.templates.add(extractTemplate);
    }
    //super.setExtractConfig(this);
    return this;
}

From source file:org.aliuge.crawler.jobconf.FetchConfig.java

/**
 * ???/*  w  w w .  j  av a 2  s .c om*/
 * 
 * @param confFile
 * @return
 */
@SuppressWarnings("unchecked")
public FetchConfig loadConfig(Document confDoc) throws ConfigurationException {
    try {
        Document doc = confDoc;
        super.setJobName(doc.select("job").attr("name"));
        super.setIndexName(doc.select("job").attr("indexName"));
        Elements e = doc.select("fetch");
        this.type = e.select("type").text();
        this.agent = e.select("agent").text();
        String temp = e.select("threadNum").text();
        if (StringUtils.isNotBlank(temp)) {
            this.threadNum = Integer.parseInt(temp);
        }

        temp = e.select("delayBetweenRequests").text();
        if (StringUtils.isNotBlank(temp)) {
            this.delayBetweenRequests = Integer.parseInt(temp);
        }

        temp = e.select("maxDepthOfCrawling").text();
        if (StringUtils.isNotBlank(temp)) {
            this.maxDepthOfCrawling = Integer.parseInt(temp);
        }

        temp = e.select("fetchBinaryContent").text();
        if (StringUtils.isNotBlank(temp)) {
            this.fetchBinaryContent = Boolean.parseBoolean(temp);
        }

        if (StringUtils.isNotBlank(e.select("maxOutgoingLinksToFollow").text())) {
            this.maxOutgoingLinksToFollow = Integer.parseInt(e.select("maxOutgoingLinksToFollow").text());
        }

        temp = e.select("fileSuffix").text();
        if (StringUtils.isNotBlank(temp)) {
            this.fileSuffix = temp;
        }

        temp = e.select("maxDownloadSizePerPage").text();
        if (StringUtils.isNotBlank(temp)) {
            this.maxDownloadSizePerPage = Integer.parseInt(temp);
        }

        temp = e.select("https").text();
        if (StringUtils.isNotBlank(temp)) {
            this.https = Boolean.parseBoolean(temp);
        }

        temp = e.select("onlyDomain").text();
        if (StringUtils.isNotBlank(temp)) {
            this.onlyDomain = Boolean.parseBoolean(temp);
        }

        temp = e.select("socketTimeoutMilliseconds").text();
        if (StringUtils.isNotBlank(temp)) {
            this.socketTimeoutMilliseconds = Integer.parseInt(temp);
        }

        temp = e.select("connectionTimeout").text();
        if (StringUtils.isNotBlank(temp)) {
            this.connectionTimeout = Integer.parseInt(temp);
        }

        temp = e.select("maxTotalConnections").text();
        if (StringUtils.isNotBlank(temp)) {
            this.maxTotalConnections = Integer.parseInt(temp);
        }

        temp = e.select("maxConnectionsPerHost").text();
        if (StringUtils.isNotBlank(temp)) {
            this.maxConnectionsPerHost = Integer.parseInt(e.select("maxConnectionsPerHost").text());
        }

        temp = e.select("maxConnectionsPerHost").text();
        if (StringUtils.isNotBlank(temp)) {
            this.maxConnectionsPerHost = Integer.parseInt(temp);
        }

        temp = e.select("proxy").text();
        if (StringUtils.isNotBlank(temp)) {
            Properties p = PropertyConfigurationHelper.getProperties(temp);
            this.proxyIps = Lists.newLinkedList();
            for (Object o : p.keySet()) {
                proxyIps.add((String) p.get(o));
            }

        }

        // seed
        Elements seeds = doc.select("fetch seeds seed");
        for (Element element : seeds) {
            // WebURL url = new WebURL();
            String url = element.text();
            if (StringUtils.isBlank(url)) {
                continue;
            }
            url = url.trim();
            String area = element.attr("area");
            this.seeds.add(url);

            WebURL areaUrl = new WebURL(area, url);

            try {
                PendingManager.getPendingArea(super.getJobName()).addElement(areaUrl);
            } catch (QueueException e1) {
                log.error("", e1);
                e1.printStackTrace();
            }
            // BloomfilterHelper.getInstance().add(url.getURL());

        }

        /*
         * ??Url
         */
        Elements fetchUrlFilters = doc.select("fetchUrlFilters filter");
        for (Element element : fetchUrlFilters) {
            String tmp = element.text();
            if (StringUtils.isNoneBlank(tmp))
                this.fetchUrlFilters.add(element.text());
        }
        /*
         * ?????Url
         */
        Elements extractUrlfilters = doc.select("extractUrlfilters filter");
        for (Element element : extractUrlfilters) {
            String tmp = element.text();
            String tmp_rep = element.attr("replace");
            if (StringUtils.isNoneBlank(tmp))
                this.extractUrlfilters.add(new KeyValue(tmp, tmp_rep));
        }
    } catch (NumberFormatException e) {
        throw new ConfigurationException("?" + e.getMessage());
    }
    // super.setFetchConfig(this);
    return this;
}

From source file:org.aliuge.crawler.jobconf.StoreConfig.java

/**
 * Loads the storage settings of a crawl job from its parsed configuration document.
 * <p>
 * Reads the job name and index name from the {@code job} element and the store
 * type, thread count, plugin class, backend-specific settings (hbase, mongodb)
 * and id policy from the {@code store} element.
 *
 * @param confDoc the parsed job configuration document
 * @return this configuration instance
 * @throws StorageException if the store type is not a known {@code StorageType} value,
 *         or a required hbase/mongodb element is missing
 */
public StoreConfig loadConfig(Document confDoc) throws StorageException {
    Elements job = confDoc.select("job");
    super.setJobName(job.attr("name"));
    super.setIndexName(job.attr("indexName"));

    Elements store = confDoc.select("store");
    this.type = store.select("type").text();

    // threadNum and plugin are optional; blank values leave the fields unchanged.
    String threadNumText = store.select("threadNum").text();
    if (StringUtils.isNotBlank(threadNumText)) {
        this.threadNum = Integer.parseInt(threadNumText);
    }
    String className = store.select("plugin").text();
    if (StringUtils.isNotBlank(className)) {
        this.pluginClass = className;
    }

    if (!StorageType.containsValue(this.type)) {
        log.info("?" + this.type);
        throw new StorageException(
                "???store? mysql,hbase,elasticSearch,localFile,mongodb");
    }

    if (this.type.equalsIgnoreCase(StorageType.hbase.getValue())) {
        Elements tables = store.select("table");
        Elements families = store.select("family");
        // first() returns null on an empty selection; report the missing element explicitly.
        if (tables.isEmpty() || families.isEmpty()) {
            throw new StorageException("hbase store requires a <table name=...> and a <family> element");
        }
        String tName = tables.first().attr("name");
        String fName = families.first().text();
        this.hconfig = new HBaseConfig(tName, fName);
    } else if (this.type.equalsIgnoreCase(StorageType.mongodb.getValue())) {
        Elements dbs = store.select("db");
        Elements collections = store.select("collection");
        if (dbs.isEmpty() || collections.isEmpty()) {
            throw new StorageException("mongodb store requires a <db name=...> and a <collection> element");
        }
        String dbName = dbs.first().attr("name");
        String collection = collections.first().text();
        String port = store.select("port").text();
        String host = store.select("host").text();
        this.mongodbConfig = new MongodbConfig(dbName, collection, host, port);
    }
    // The remaining store types need no backend-specific settings here.

    // Id policy: any policy other than "auto" is expected to come with a <ref> value.
    String idPolicy = store.select("idPolicy").text();
    if (StringUtils.isNotBlank(idPolicy)) {
        id = EnumUtils.getEnum(IDPolicy.class, idPolicy);
        if (!IDPolicy.auto.equals(id)) {
            String pref = store.select("ref").text();
            if (StringUtils.isNotBlank(pref)) {
                this.policyRef = pref;
            }
            if (StringUtils.isBlank(this.policyRef)) {
                // Best effort, as before: the problem is reported but loading continues.
                // NOTE(review): consider failing with StorageException if a missing ref is fatal.
                log.error("ID??");
            }
        }
    }
    return this;
}

From source file:org.b3log.wordman.word.Main.java

/**
 * Program entry point.
 * <p>
 * For each course from 1 to {@code CLASS_NUM}, downloads the word list page from
 * word.iciba.com, turns every list item into a {@code Word}, and finally prints the
 * SQL statements produced by {@code Clazz.toSQLs()} and writes them, UTF-8 encoded,
 * to a {@code .sql} file named after {@code CLASS_NAME} in the root of drive C.
 *
 * @param args command line arguments (not used)
 * @throws java.lang.Exception if a page cannot be fetched or parsed, a spelling fails
 *         {@code checkWord}, or the output file cannot be written
 */
public static void main(final String[] args) throws Exception {
    final Clazz clazz = new Clazz();
    clazz.setId(CLASS_ID);
    clazz.setName(CLASS_NAME);
    final List<Word> classWords = new ArrayList<Word>();
    clazz.setWords(classWords);

    for (int clazzNum = 1; clazzNum <= CLASS_NUM; clazzNum++) {
        final Connection.Response response = Jsoup
                .connect("http://word.iciba.com/?action=words&class=" + clazz.getId() + "&course=" + clazzNum)
                .userAgent("Mozilla").timeout(TIMEOUT).execute();

        final Document document = response.parse();

        int classWordCnt = 0;
        // A course page holds its words in lists with ids word_list_1 .. word_list_PAGE.
        for (int i = 1; i <= PAGE; i++) {
            final Elements wordList = document.select("ul#word_list_" + i);
            final Elements wordLi = wordList.select("li");

            for (final Element li : wordLi) {
                final Word word = new Word();
                // Literal replace: no regular expression is needed to strip the dashes.
                word.setId(UUID.randomUUID().toString().replace("-", ""));

                final Element w = li.select("div.word_main_list_w").get(0);
                String spell = w.select("span").get(0).attr("title");

                // Strip asterisks, parenthesised remarks and backslashes from the spelling.
                spell = spell.replace("*", "").replaceAll("\\(.*\\)", "").replace("\\", "");

                spell = spell.trim();

                word.setWord(spell);
                if (!checkWord(spell)) {
                    // Abort the whole run on a spelling that checkWord rejects.
                    throw new IllegalStateException(" [" + spell + ']');
                }

                final Element y = li.select("div.word_main_list_y").get(0);
                word.setPhon(y.select("strong").get(0).text());
                word.setPron(y.select("a").get(0).id());

                final Element s = li.select("div.word_main_list_s").get(0);
                word.setPara(s.select("span").get(0).text());

                // These two fields are not available on the page and are left empty.
                word.setBuild("");
                word.setExample("");

                classWords.add(word);
                classWordCnt++;
            }
        }

        System.out.println("? [" + clazzNum + "] ??? [" + classWordCnt + "]");
    }

    final StringBuilder sqlBuilder = new StringBuilder();

    final List<String> sqls = clazz.toSQLs();
    for (final String sql : sqls) {
        System.out.println(sql);
        sqlBuilder.append(sql).append(IOUtils.LINE_SEPARATOR);
    }

    final OutputStream outputStream = new FileOutputStream(new File("C:\\" + CLASS_NAME + ".sql"));
    try {
        IOUtils.write(sqlBuilder.toString(), outputStream, "UTF-8");
    } finally {
        // Close in finally so the file handle is released even when the write fails.
        IOUtils.closeQuietly(outputStream);
    }
}