Example usage for org.jsoup.select Elements not

List of usage examples for org.jsoup.select Elements not

Introduction

In this page you can find the example usage for org.jsoup.select Elements not.

Prototype

public Elements not(String query) 

Source Link

Document

Remove elements from this list that match the Selector query.

Usage

From source file:com.ewcms.plugin.crawler.generate.EwcmsContentCrawler.java

/**
 * ?page??/*  w w w .ja va2  s . c o  m*/
 */
@Override
public void visit(Page page) {
    try {
        String url = page.getWebURL().getURL();

        page.setContentType("text/html; charset=" + gather.getEncoding());
        Document doc = Jsoup.connect(url).timeout(gather.getTimeOutWait().intValue() * 1000).get();

        String title = doc.title();
        if (gather.getTitleExternal() && gather.getTitleRegex() != null
                && gather.getTitleRegex().length() > 0) {
            Elements titleEles = doc.select(gather.getTitleRegex());
            if (!titleEles.isEmpty()) {
                String tempTitle = titleEles.text();
                if (tempTitle != null && tempTitle.length() > 0) {
                    title = tempTitle;
                }
            }
        }

        if (title != null && title.trim().length() > 0) {
            Elements elements = doc.select(matchRegex);
            if (filterRegex != null && filterRegex.trim().length() > 0) {
                elements = elements.not(filterRegex);
            }
            if (!elements.isEmpty()) {
                String subHtml = elements.html();
                Document blockDoc = Jsoup.parse(subHtml);
                String contentText = blockDoc.html();

                if (gather.getRemoveHref()) {
                    Document moveDoc = Jsoup.parse(contentText);
                    Elements moveEles = moveDoc.select("*").not("a");
                    contentText = moveEles.html();
                }
                if (gather.getRemoveHtmlTag())
                    contentText = doc.text();

                if (isLocal) {
                    contentText = doc.text();

                    Boolean isMatcher = true;
                    for (int i = 0; i < keys.length; i++) {
                        Boolean result = Pattern.compile(keys[i].trim()).matcher(contentText).find();
                        if (!result) {
                            isMatcher = false;
                            break;
                        }
                    }

                    if (isMatcher) {
                        Storage storage = new Storage();
                        storage.setGatherId(gather.getId());
                        storage.setGatherName(gather.getName());
                        storage.setTitle(title);
                        storage.setUrl(url);
                        try {
                            gatherService.addStorage(storage);
                        } catch (Exception e) {
                            logger.error("save storage error : {}", e.getLocalizedMessage());
                        } finally {
                            storage = null;
                        }
                    }
                } else {
                    Content content = new Content();
                    content.setDetail(contentText);
                    content.setPage(1);
                    List<Content> contents = new ArrayList<Content>();
                    contents.add(content);

                    Article article = new Article();
                    article.setTitle(title);
                    article.setContents(contents);

                    articleMainService.addArticleMainByCrawler(article, gather.getChannelId(),
                            CrawlerUtil.USER_NAME);
                }
            }
        }
    } catch (IOException e) {
        logger.warn(e.getLocalizedMessage());
    }
}