Example usage for org.jsoup.nodes Document select

List of usage examples for org.jsoup.nodes.Document.select

Introduction

This page collects usage examples for the select method of org.jsoup.nodes.Document.

Prototype

public Elements select(String cssQuery) 

Source Link

Document

Find elements that match the Selector CSS query, with this element as the starting context.

Usage

From source file:org.cellcore.code.engine.page.extractor.mtgf.MTGFPageDataExtractor.java

/**
 * Extracts the price from the page.
 *
 * Takes the text of the first ".price" element found inside the first
 * ".card-buy" element, passes it through cleanPriceString and parses the
 * result as a float.
 *
 * @param doc the parsed page
 * @return the parsed price
 */
@Override
protected float getPrice(Document doc) {
    final String rawPrice = doc.select(".card-buy").get(0)
            .select(".price").get(0)
            .text();
    final String cleanedPrice = cleanPriceString(rawPrice);
    return Float.parseFloat(cleanedPrice);
}

From source file:org.cellcore.code.engine.page.extractor.pkg.PKGPageDataExtractor.java

/**
 * Reads the stock level from the text of the last "option" element of the page.
 *
 * @param doc the parsed page
 * @return the text of the last option parsed as an int
 */
@Override
protected int getStock(Document doc) {
    return Integer.parseInt(doc.select("option").last().text());
}

From source file:io.seldon.importer.articles.ItemAttributesImporter.java

/**
 * Fetches the article at {@code url} and extracts its attributes using the
 * configured CSS selectors (image, title, lead text, publish date, link,
 * content, tags, category, sub-category) and, optionally, its domain.
 *
 * Requests are throttled: if less than {@code minFetchGapMsecs} has elapsed
 * since {@code lastUrlFetchTime}, the method sleeps for the remainder first.
 * The outcome is always reported through
 * {@code AttributesImporterUtils.logResult}.
 *
 * @param url              address of the article to fetch; also used as the client item id
 * @param existingCategory not used by this method
 * @return a map of the extracted attributes, or {@code null} when a required
 *         attribute (title, and image/category/domain when needed) is missing
 *         or when fetching/extraction throws
 */
public static Map<String, String> getAttributes(String url, String existingCategory) {
    ItemProcessResult itemProcessResult = new ItemProcessResult();
    itemProcessResult.client_item_id = url;
    itemProcessResult.extraction_status = "EXTRACTION_FAILED";

    logger.info("Trying to get attributes for " + url);
    Map<String, String> attributes = null;
    String title = "";
    String category = "";
    String subCategory = "";
    String img_url = "";
    String description = "";
    String tags = "";
    String leadtext = "";
    String link = "";
    String publishDate = "";
    String domain = "";
    try {
        // Throttle: keep at least minFetchGapMsecs between two fetches.
        long now = System.currentTimeMillis();
        long timeSinceLastRequest = now - lastUrlFetchTime;
        if (timeSinceLastRequest < minFetchGapMsecs) {
            long timeToSleep = minFetchGapMsecs - timeSinceLastRequest;
            logger.info(
                    "Sleeping " + timeToSleep + "msecs as time since last fetch is " + timeSinceLastRequest);
            Thread.sleep(timeToSleep);
        }
        Document articleDoc = Jsoup.connect(url).userAgent("SeldonBot/1.0").timeout(httpGetTimeout).get();
        lastUrlFetchTime = System.currentTimeMillis();
        //get IMAGE URL - try "content", then "src", then "href"
        if (StringUtils.isNotBlank(imageCssSelector)) {
            Element imageElement = articleDoc.select(imageCssSelector).first();
            if (imageElement != null && imageElement.attr("content") != null) {
                img_url = imageElement.attr("content");
            }
            if (imageElement != null && StringUtils.isBlank(img_url)) {
                img_url = imageElement.attr("src");
            }
            if (imageElement != null && StringUtils.isBlank(img_url)) {
                img_url = imageElement.attr("href");
            }

        }

        if (StringUtils.isBlank(img_url) && StringUtils.isNotBlank(defImageUrl)) {
            logger.info("Setting image to default: " + defImageUrl);
            img_url = defImageUrl;
        }
        img_url = StringUtils.strip(img_url);

        //get TITLE
        if (StringUtils.isNotBlank(titleCssSelector)) {
            Element titleElement = articleDoc.select(titleCssSelector).first();
            if ((titleElement != null) && (titleElement.attr("content") != null)) {
                title = titleElement.attr("content");
            }

            // if still blank get from text instead
            if (StringUtils.isBlank(title) && (titleElement != null)) {
                title = titleElement.text();
            }
        }

        //get LEAD TEXT
        if (StringUtils.isNotBlank(leadTextCssSelector)) {
            Element leadElement = articleDoc.select(leadTextCssSelector).first();
            if (leadElement != null && leadElement.attr("content") != null) {
                leadtext = leadElement.attr("content");
            }
        }

        //get publish date
        if (StringUtils.isNotBlank(publishDateCssSelector)) {
            //2013-01-21T10:40:55Z
            Element pubElement = articleDoc.select(publishDateCssSelector).first();
            if (pubElement != null && pubElement.attr("content") != null) {
                String pubtext = pubElement.attr("content");
                SimpleDateFormat dateFormatter = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
                DateFormat df = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss", Locale.ENGLISH);
                Date result = null;
                try {
                    result = df.parse(pubtext);
                } catch (ParseException e) {
                    logger.info("Failed to parse date withUTC format " + pubtext);
                }
                // Only fall back to the simpler format when the first one failed,
                // so a successful parse is not followed by a misleading failure log.
                if (result == null) {
                    df = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.ENGLISH);
                    try {
                        result = df.parse(pubtext);
                    } catch (ParseException e) {
                        logger.info("Failed to parse date " + pubtext);
                    }
                }

                if (result != null)
                    publishDate = dateFormatter.format(result);
                else
                    logger.error("Failed to parse date " + pubtext);
            }
        }

        //get Link
        if (StringUtils.isNotBlank(linkCssSelector)) {
            Element linkElement = articleDoc.select(linkCssSelector).first();
            if (linkElement != null && linkElement.attr("content") != null) {
                link = linkElement.attr("content");
            }
        }

        //get CONTENT - re-parse the element's HTML to obtain plain text
        if (StringUtils.isNotBlank(textCssSelector)) {
            Element descriptionElement = articleDoc.select(textCssSelector).first();
            if (descriptionElement != null)
                description = Jsoup.parse(descriptionElement.html()).text();
        }

        //get TAGS
        Set<String> tagSet = AttributesImporterUtils.getTags(articleDoc, tagsCssSelector, title);

        if (tagSet.size() > 0)
            tags = CollectionTools.join(tagSet, ",");

        //get CATEGORY - client specific
        if (StringUtils.isNotBlank(categoryCssSelector)) {
            Element categoryElement = articleDoc.select(categoryCssSelector).first();
            if (categoryElement != null && categoryElement.attr("content") != null) {
                category = categoryElement.attr("content");
                if (StringUtils.isNotBlank(category))
                    category = category.toUpperCase();
            }
        } else if (StringUtils.isNotBlank(categoryClassPrefix)) {
            // No selector configured: load a client-specific extractor class by name.
            String className = "io.seldon.importer.articles.category." + categoryClassPrefix
                    + "CategoryExtractor";
            Class<?> clazz = Class.forName(className);
            Constructor<?> ctor = clazz.getConstructor();
            CategoryExtractor extractor = (CategoryExtractor) ctor.newInstance();
            category = extractor.getCategory(url, articleDoc);
        }

        //get Sub CATEGORY - client specific
        if (StringUtils.isNotBlank(subCategoryCssSelector)) {
            Element subCategoryElement = articleDoc.select(subCategoryCssSelector).first();
            if (subCategoryElement != null && subCategoryElement.attr("content") != null) {
                subCategory = subCategoryElement.attr("content");
                // Upper-case the sub-category itself (previously this overwrote it
                // with the upper-cased category).
                if (StringUtils.isNotBlank(subCategory))
                    subCategory = subCategory.toUpperCase();
            }
        } else if (StringUtils.isNotBlank(subCategoryClassPrefix)) {
            String className = "io.seldon.importer.articles.category." + subCategoryClassPrefix
                    + "SubCategoryExtractor";
            Class<?> clazz = Class.forName(className);
            Constructor<?> ctor = clazz.getConstructor();
            CategoryExtractor extractor = (CategoryExtractor) ctor.newInstance();
            subCategory = extractor.getCategory(url, articleDoc);
        }

        // Get domain
        if (domainIsNeeded) {
            domain = getDomain(url);
        }

        // Title is always required; image, category and domain only when configured as needed.
        if ((StringUtils.isNotBlank(title) && (imageNotNeeded || StringUtils.isNotBlank(img_url))
                && (categoryNotNeeded || StringUtils.isNotBlank(category))
                && (!domainIsNeeded || StringUtils.isNotBlank(domain)))) {
            attributes = new HashMap<String, String>();
            attributes.put(TITLE, title);
            if (StringUtils.isNotBlank(category))
                attributes.put(CATEGORY, category);
            if (StringUtils.isNotBlank(subCategory))
                attributes.put(SUBCATEGORY, subCategory);
            if (StringUtils.isNotBlank(link))
                attributes.put(LINK, link);
            if (StringUtils.isNotBlank(leadtext))
                attributes.put(LEAD_TEXT, leadtext);
            if (StringUtils.isNotBlank(img_url))
                attributes.put(IMG_URL, img_url);
            if (StringUtils.isNotBlank(tags))
                attributes.put(TAGS, tags);
            attributes.put(CONTENT_TYPE, VERIFIED_CONTENT_TYPE);
            if (StringUtils.isNotBlank(description))
                attributes.put(DESCRIPTION, description);
            if (StringUtils.isNotBlank(publishDate))
                attributes.put(PUBLISH_DATE, publishDate);
            if (StringUtils.isNotBlank(domain))
                attributes.put(DOMAIN, domain);
            System.out.println("Item: " + url + "; Category: " + category + " SubCategory: " + subCategory);
            itemProcessResult.extraction_status = "EXTRACTION_SUCCEEDED";
        } else {
            logger.warn("Failed to get needed attributes for article " + url);
            logger.warn("[title=" + title + ", img_url=" + img_url + ", category=" + category + ", domain="
                    + domain + "]");
        }

        { // check for failures for the log result
            if (StringUtils.isBlank(title)) {
                itemProcessResult.attrib_failure_list = itemProcessResult.attrib_failure_list
                        + ((StringUtils.isBlank(itemProcessResult.attrib_failure_list)) ? "" : ",") + "title";
            }
            if (!imageNotNeeded && StringUtils.isBlank(img_url)) {
                itemProcessResult.attrib_failure_list = itemProcessResult.attrib_failure_list
                        + ((StringUtils.isBlank(itemProcessResult.attrib_failure_list)) ? "" : ",") + "img_url";
            }
            if (!categoryNotNeeded && StringUtils.isBlank(category)) {
                itemProcessResult.attrib_failure_list = itemProcessResult.attrib_failure_list
                        + ((StringUtils.isBlank(itemProcessResult.attrib_failure_list)) ? "" : ",")
                        + "category";
            }
        }
    } catch (InterruptedException e) {
        // Restore the interrupt flag so the calling thread can still observe the interruption.
        Thread.currentThread().interrupt();
        logger.error("Article: " + url + ". Attributes import FAILED", e);
        itemProcessResult.error = e.toString();
    } catch (Exception e) {
        logger.error("Article: " + url + ". Attributes import FAILED", e);
        itemProcessResult.error = e.toString();
    }

    AttributesImporterUtils.logResult(logger, itemProcessResult);

    return attributes;
}

From source file:org.cellcore.code.engine.page.extractor.mcc.MCCPageDataExtractor.java

/**
 * Extracts the card name from the page.
 *
 * Looks at the fourth "#blockContent" element (index 3), takes its first
 * "b" element and returns the "text" attribute of that element's first
 * child node.
 * NOTE(review): presumably the "text" attribute is how the jsoup version in
 * use exposes a text node's content - confirm against the library version.
 *
 * @param doc the parsed page
 * @return the value of the "text" attribute of the selected node
 */
@Override
protected String getName(Document doc) {
    final String nameText = doc.select("#blockContent").get(3)
            .select("b").get(0)
            .childNodes().get(0)
            .attr("text");
    return nameText;
}

From source file:de.geeksfactory.opacclient.apis.IOpac.java

/**
 * Parses the table of lent items that follows the {@code a[name=AUS]} anchor
 * and appends one {@link LentItem} per data row to {@code media}.
 *
 * The first table row is skipped. Column positions are read from the optional
 * "accounttable" object in {@code data}; when absent, the defaults are
 * title=0, author=1, format=2, prolongcount=3, returndate=4, prolongurl=5.
 * A negative index disables the corresponding column.
 *
 * The method returns without adding anything when the anchor is missing, when
 * no table follows the anchor, or when the table has fewer than two rows.
 *
 * @param media list that receives the parsed items
 * @param doc   parsed account page; its base URI is set from {@code data}'s "baseurl"
 * @param data  configuration object ("baseurl", "accounttable", "maxprolongcount")
 */
static void parseMediaList(List<LentItem> media, Document doc, JSONObject data) {
    if (doc.select("a[name=AUS]").size() == 0)
        return;

    // first() yields null when the anchor exists but no table follows it;
    // bail out instead of failing with a NullPointerException.
    Element table = doc.select("a[name=AUS] ~ table, a[name=AUS] ~ form table").first();
    if (table == null) {
        return;
    }
    Elements copytrs = table.select("tr");
    doc.setBaseUri(data.optString("baseurl"));

    DateTimeFormatter fmt = DateTimeFormat.forPattern("dd.MM.yyyy").withLocale(Locale.GERMAN);

    int trs = copytrs.size();
    if (trs < 2) {
        return;
    }

    JSONObject copymap = new JSONObject();
    try {
        if (data.has("accounttable")) {
            copymap = data.getJSONObject("accounttable");
        }
    } catch (JSONException ignored) {
        // "accounttable" is present but not a JSON object: keep the empty map
        // so that the default column positions below are used.
    }

    Pattern datePattern = Pattern.compile("\\d{2}\\.\\d{2}\\.\\d{4}");
    // Compiled once here rather than once per row.
    Pattern mediaNumberPattern = Pattern.compile("mednr=([^&]*)&");
    for (int i = 1; i < trs; i++) {
        Element tr = copytrs.get(i);
        LentItem item = new LentItem();

        if (copymap.optInt("title", 0) >= 0) {
            item.setTitle(tr.child(copymap.optInt("title", 0)).text().trim().replace("\u00a0", ""));
        }
        if (copymap.optInt("author", 1) >= 0) {
            item.setAuthor(tr.child(copymap.optInt("author", 1)).text().trim().replace("\u00a0", ""));
        }
        if (copymap.optInt("format", 2) >= 0) {
            item.setFormat(tr.child(copymap.optInt("format", 2)).text().trim().replace("\u00a0", ""));
        }
        int prolongCount = 0;
        if (copymap.optInt("prolongcount", 3) >= 0) {
            prolongCount = Integer
                    .parseInt(tr.child(copymap.optInt("prolongcount", 3)).text().trim().replace("\u00a0", ""));
            item.setStatus(String.valueOf(prolongCount) + "x verl.");
        }
        if (data.optInt("maxprolongcount", -1) != -1) {
            item.setRenewable(prolongCount < data.optInt("maxprolongcount", -1));
        }
        if (copymap.optInt("returndate", 4) >= 0) {
            String value = tr.child(copymap.optInt("returndate", 4)).text().trim().replace("\u00a0", "");
            // The cell may hold extra text; pick out the first dd.MM.yyyy date in it.
            Matcher matcher = datePattern.matcher(value);
            if (matcher.find()) {
                try {
                    item.setDeadline(fmt.parseLocalDate(matcher.group()));
                } catch (IllegalArgumentException e1) {
                    e1.printStackTrace();
                }
            }
        }
        if (copymap.optInt("prolongurl", 5) >= 0) {
            if (tr.children().size() > copymap.optInt("prolongurl", 5)) {
                Element cell = tr.child(copymap.optInt("prolongurl", 5));
                if (cell.select("input[name=MedNrVerlAll]").size() > 0) {
                    // new iOPAC Version 1.45 - checkboxes to prolong multiple items
                    // internal convention: We add "NEW" to the media ID to show that we have
                    // the new iOPAC version
                    Element input = cell.select("input[name=MedNrVerlAll]").first();
                    String value = input.val();
                    item.setProlongData("NEW" + value);
                    item.setId(value.split(";")[0]);
                    if (input.hasAttr("disabled"))
                        item.setRenewable(false);
                } else {
                    // previous versions - link for prolonging on every medium
                    String link = cell.select("a").attr("href");
                    item.setProlongData(link);
                    // find media number with regex
                    Matcher matcher = mediaNumberPattern.matcher(link);
                    if (matcher.find())
                        item.setId(matcher.group(1));
                }
            }
        }

        media.add(item);
    }
    assert (media.size() == trs - 1);

}

From source file:org.cellcore.code.engine.page.extractor.mb.MBPageDataExtractor.java

/**
 * Extracts the alternative name of the card.
 *
 * Takes the second ".text" element (index 1), reads the "text" attribute of
 * its first child node and trims it. The local name suggests this is the
 * French name - confirm against the scraped site.
 *
 * @param doc the parsed page
 * @return a single-element array holding the trimmed name
 */
protected String[] getOtherNames(Document doc) {
    final String rawName = doc.select(".text").get(1)
            .childNodes().get(0)
            .attr("text");
    final String[] otherNames = new String[1];
    otherNames[0] = rawName.trim();
    return otherNames;
}

From source file:org.cellcore.code.engine.page.extractor.mfrag.MFRAGPageDataExtractor.java

/**
 * Returns the stock of the cheapest offer listed in the "#Tableau" table.
 *
 * The rows are the children of the first child of the first "#Tableau"
 * element; the first row is skipped. For every other row the price is read
 * from the "strong" element in the fourth cell and the stock from the last
 * "option" in the fifth cell. The stock of the row with the strictly lowest
 * price wins; parentheses are stripped from the stock text before parsing.
 *
 * @param doc the parsed page
 * @return the stock of the lowest-priced row, or 0 when there is no such row
 */
@Override
protected int getStock(Document doc) {
    Elements rows = doc.select("#Tableau").get(0).children().get(0).children();
    float lowestPrice = Float.MAX_VALUE;
    int stockAtLowestPrice = 0;
    for (int rowIndex = 1; rowIndex < rows.size(); rowIndex++) {
        Element row = rows.get(rowIndex);
        String priceText = row.select("td").get(3).select("strong").get(0).childNodes().get(0).attr("text");
        String stockText = row.select("td").get(4).select("option").last().childNodes().get(0).attr("text");
        float rowPrice = Float.parseFloat(cleanPriceString(priceText));

        // Only a strictly cheaper row replaces the current best.
        if (!(rowPrice < lowestPrice)) {
            continue;
        }
        lowestPrice = rowPrice;
        String stockDigits = stockText.replaceAll("\\(", "").replaceAll("\\)", "");
        stockAtLowestPrice = Integer.parseInt(stockDigits);
    }
    return stockAtLowestPrice;
}

From source file:ddf.catalog.transformer.html.HtmlMetacardTransformerTest.java

/**
 * Transforming an empty metacard must yield HTML that contains exactly one
 * element matching METACARD_CLASS.
 */
@Test
public void testMetacardTransform() throws CatalogTransformerException, IOException {
    final Metacard emptyMetacard = new MetacardImpl();
    final HtmlMetacardTransformer transformer = new HtmlMetacardTransformer(EMPTY_CATEGORY_LIST);
    final BinaryContent rendered = transformer.transform(emptyMetacard, Collections.emptyMap());

    assertThat(getHtmlDocument(rendered).select(METACARD_CLASS), hasSize(1));
}

From source file:org.cellcore.code.engine.page.extractor.mtgf.MTGFPageDataExtractor.java

/**
 * Extracts the card name: the combined text of the "h1" elements inside the
 * first ".name" element.
 *
 * @param doc the parsed page
 * @return the card name text
 * @throws UnsupportedCardException declared by the signature; not thrown by this body
 */
@Override
protected String getName(Document doc) throws UnsupportedCardException {
    return doc.select(".name").get(0).select("h1").text();
}

From source file:org.cellcore.code.engine.page.extractor.mcc.MCCPageDataExtractor.java

/**
 * Extracts the alternative name of the card.
 *
 * Looks at the fifth "#blockContent" element (index 4), takes its first "b"
 * element and reads the "text" attribute of that element's first child node.
 * The local name suggests this is the French name - confirm against the
 * scraped site.
 *
 * @param doc the parsed page
 * @return a single-element array holding the extracted name
 */
@Override
protected String[] getOtherNames(Document doc) {
    final String otherName = doc.select("#blockContent").get(4)
            .select("b").get(0)
            .childNodes().get(0)
            .attr("text");
    final String[] otherNames = new String[1];
    otherNames[0] = otherName;
    return otherNames;
}