Example usage for org.jsoup.nodes Document getElementsByClass

List of usage examples for org.jsoup.nodes Document getElementsByClass

Introduction

In this page you can find the example usage for org.jsoup.nodes Document getElementsByClass.

Prototype

public Elements getElementsByClass(String className) 

Source Link

Document

Find elements that have this class, including or under this element.

Usage

From source file:Search.DataManipulation.DataParser.java

public String getName(Document dom) {
    Elements appNameClass = dom.getElementsByClass("document-title");
    return appNameClass.first().child(0).ownText();
}

From source file:Search.DataManipulation.DataParser.java

public String getBundleId(Document dom) {
    Elements bundleClass = dom.getElementsByClass("buy-button-container");
    return bundleClass.first().attr("data-docid");
}

From source file:Search.DataManipulation.DataParser.java

public String getDescription(Document dom) {
    Elements descClass = dom.getElementsByClass("id-app-orig-desc");
    return descClass.first().ownText();
}

From source file:Search.DataManipulation.DataParser.java

public String getThumbnails(Document dom) throws IOException {
    Elements thumbnailsClass = dom.getElementsByClass("thumbnails");
    Elements thumbnails = thumbnailsClass.first().children();

    List<String> imageArray = new ArrayList<String>();

    for (Element images : thumbnails) {
        String imageTagUrl = images.getElementsByTag("img").first().attr("src");
        byte[] imageByte = dataHandler.imageDownloader(imageTagUrl);

        if (imageByte.length == 0) {
            continue;
        }/*from ww w .  j av  a 2  s.co  m*/

        String imageTag = Base64.getEncoder().encodeToString(imageByte);
        imageArray.add(imageTag);
    }

    return JSONValue.toJSONString(imageArray);
}

From source file:Search.DataManipulation.DataParser.java

public Map<String, String> getMetaData(Document dom) {
    Elements details = null;/*from  w w w.j  a v a  2 s. co m*/
    Map<String, String> metaData = new HashMap<>();

    Elements detailClass = dom.getElementsByClass("details-section-contents");
    for (Element testClass : detailClass) {
        if (testClass.children().first().hasClass("meta-info")) {
            details = testClass.children();
        }
    }

    assert details != null;
    for (Element detailElement : details) {
        String area = detailElement.children().first().ownText();
        String value = detailElement.children().last().ownText();

        if (!(area.equals("Permissions") || area.equals("Report") || area.equals("Developer"))) {
            metaData.put(area, value);
        }
    }
    return metaData;
}

From source file:Search.DataManipulation.DataParser.java

public Map<String, String> getRatingData(Document dom) {
    Map<String, String> ratingData = new HashMap<>();

    Elements ratingClass = dom.getElementsByClass("score-container");
    Elements ratingDom = ratingClass.first().children();

    for (Element rating : ratingDom) {
        String item = rating.attr("itemprop");
        String content = rating.attr("content");

        if (item.equals("ratingValue")) {
            ratingData.put("rating", content);
        } else if (item.equals("ratingCount")) {
            ratingData.put(item, content);
        }//from   ww  w  .j a v  a2 s  .c o  m
    }
    return ratingData;
}

From source file:uk.co.certait.htmlexporter.demo.DemoTwo.java

public DemoTwo() throws Exception {
    Document document = Jsoup
            .parse(new URL("http://news.bbc.co.uk/sport1/hi/football/eng_prem/table/8102708.stm"), 10000);
    Elements elements = document.getElementsByClass("fulltable");

    String table = null;//from   w  w w.j a va  2 s. c  o m

    for (Element element : elements) {
        table = element.toString();
    }

    String html = generateHTML(table);
    saveFile("league.html", html.getBytes());

    new ExcelExporter().exportHtml(html, new File("./league.xlsx"));
    new OdsExporter().exportHtml(html, new File("./league.ods"));
}

From source file:us.colloquy.sandbox.TestExtractor.java

@Test
public void useJsoup() {

    String homeDir = System.getProperty("user.home");

    System.out.println(homeDir);/*w ww.j  ava  2 s .com*/

    //JSOUP API allows to extract all  elements of letters in files

    // File input = new File("samples/OEBPS/Text/0001_1006_2001.xhtml");

    File input = new File("samples/pisma-1904/OEBPS/Text/single_doc.html");

    try {
        Document doc = Jsoup.parse(input, "UTF-8");

        List<Letter> letters = new ArrayList<>(); //our model contains only a subset of fields

        String previousYear = "";

        for (Element element : doc.getElementsByClass("section")) {
            Letter letter = new Letter();

            StringBuilder content = new StringBuilder();

            for (Element child : element.children()) {

                for (Attribute att : child.attributes()) {
                    System.out.println(att.getKey() + " " + att.getValue());
                }

                if ("center".equalsIgnoreCase(child.className())) {
                    String toWhom = child.getElementsByTag("strong").text();

                    if (StringUtils.isEmpty(toWhom)) {
                        toWhom = child.text();
                        // System.out.println(toWhom);
                    }

                    String[] toWhomArray = toWhom.split("(\\s\\s)|(,)");

                    for (String to : toWhomArray) {
                        RussianDate.parseToWhom(letter, to); //here we need to recognize a russian name and store that but for now we store the content
                    }

                    //check if there is anything else here and find date and place - it will be replaced if exists below

                    String entireText = child.text();

                    String tail = entireText.replace(toWhom, "");

                    if (StringUtils.isNotEmpty(tail)) {
                        RussianDate.parseDateAndPlace(letter, tail, previousYear); //a parser that figures out date and place if they are present
                    }

                    // System.out.println("two whom\t " +  child.getElementsByTag("strong").text() );

                } else if ("Data".equalsIgnoreCase(child.className())) {

                    if (child.getElementsByTag("em") != null
                            && StringUtils.isNotEmpty(child.getElementsByTag("em").text())) {
                        RussianDate.parseDateAndPlace(letter, child.getElementsByTag("em").text(),
                                previousYear); //most often date and place are enclosed in em tag

                        if (letter.getDate() != null) {
                            LocalDate localDate = letter.getDate().toInstant().atZone(ZoneId.systemDefault())
                                    .toLocalDate();
                            int year = localDate.getYear();
                            previousYear = year + "";
                        }
                    }

                    // System.out.println("when and where\t " + child.getElementsByTag("em").text());

                } else if ("petit".equalsIgnoreCase(child.className())
                        || "Textpetit_otstup".equalsIgnoreCase(child.className())) {
                    letter.getNotes().add(child.text());

                } else {
                    //System.out.println(child.text() );

                    Elements elements = child.getElementsByTag("sup");

                    for (Element e : elements) {
                        String value = e.text();

                        e.replaceWith(new TextNode("[" + value + "]", null));
                    }

                    for (Element el : child.getAllElements()) {
                        // System.out.println(el.tagName());
                        if ("sup".equalsIgnoreCase(el.tagName())) {
                            content.append(" [" + el.text() + "] ");
                        } else {
                            content.append(el.text());
                        }

                    }

                    content.append("\n");

                }

                //                  System.out.println(child.tag() + "\n" );
                //                  System.out.println(child.outerHtml() + "\n" + child.text());
            }

            letter.setContent(content.toString());
            letters.add(letter);
        }

        ObjectWriter ow = new com.fasterxml.jackson.databind.ObjectMapper().writer().withDefaultPrettyPrinter();

        for (Letter letter : letters) {
            //                if (letter.getDate() == null)
            //                {

            //                        if (StringUtils.isNotEmpty(person.getLastName()))
            //                        {
            String json = ow.writeValueAsString(letter);

            System.out.println(json);
            //                        }

            //}

        }

    } catch (IOException e) {
        e.printStackTrace();
    }

}

From source file:us.colloquy.util.DiaryParser.java

@Test
    public void useJsoup() {
        //File input = new File(System.getProperty("user.home") + "/Documents/Tolstoy/openDiaries/dnevnik_1893(2)/OEBPS/Text/0001_1006_2001.xhtml");
        //   File input = new File(System.getProperty("user.home") + "/IdeaProjects/ElasticTest/temp/dnevnik_1862(1)/OEBPS/Text/0001_1006_2001.xhtml");

        File input = new File(System.getProperty("user.home")
                + "/Documents/Tolstoy/90-volume-set/diaries/uzip/dnevnik_1881-1887_vol_49/OEBPS/Text/0001_1011_2005.xhtml");

        String previousYear = "";

        String sourse = "pointer";

        List<DiaryEntry> diaryEntrys = new ArrayList<>();

        try {/*from w w  w  .jav  a 2 s . co  m*/
            Document doc = Jsoup.parse(input, "UTF-8");

            for (Element element : doc.getElementsByClass("section")) {
                DiaryEntry diaryEntry = null;

                StringBuilder contentBuilder = new StringBuilder();

                for (Element child : element.children()) {
                    //                    for (Attribute att : child.attributes())
                    //                    {
                    //                        //   System.out.println(att.getKey() + " " + att.getValue());
                    //                    }
                    //we need to assume that each element is a continuation unless the entry is a date that starts a new entry
                    //the problem is to distinguish between an entry that contains date and place vs date within an entry

                    //lets try to see if element is a date

                    DiaryEntry diaryEntryToCollectDate = new DiaryEntry();

                    //we send it in two cases when text matches year or when text has em element
                    Element em = child.select("em").first();

                    if (em == null && StringUtils.isNotEmpty(child.text())) {
                        Matcher m = yearPattern.matcher(child.text());

                        if (m.find()) {
                            child.text(m.group(1));
                            previousYear = parseDateAndPlace(previousYear, diaryEntryToCollectDate, child);
                        }
                    }

                    if (em != null) {
                        previousYear = parseDateAndPlace(previousYear, diaryEntryToCollectDate, child);
                    }

                    if (diaryEntryToCollectDate.getDate() != null) //this is the begginng of a new entry
                    {
                        System.out.println("Found date: " + diaryEntryToCollectDate.getDate());
                        //create new DiaryEntry
                        if (diaryEntry != null) {
                            diaryEntry.setEntry(contentBuilder.toString()); //add consecutive entries here
                            diaryEntrys.add(diaryEntry);
                        }

                        diaryEntry = new DiaryEntry();
                        diaryEntry.setSource(sourse);
                        diaryEntry.setDate(diaryEntryToCollectDate.getDate());
                        diaryEntry.setPlace(diaryEntryToCollectDate.getPlace());

                        contentBuilder = new StringBuilder();

                    }

                    if (StringUtils.isNotEmpty(child.text()) && child.text().length() > 8) {
                        contentBuilder.append(child.text() + "\n");

                    }
                    //
                    //                    System.out.println(child.tag() + "\n");
                    //                    System.out.println(child.outerHtml() + "\n" + child.text());
                }

                //whatever we still have, add here:
                if (StringUtils.isNotEmpty(contentBuilder.toString()) && diaryEntry != null) {
                    diaryEntry.setEntry(contentBuilder.toString());
                    diaryEntrys.add(diaryEntry);
                }
            }

        } catch (IOException e) {
            e.printStackTrace();
        }

        for (DiaryEntry diaryEntry : diaryEntrys) {
            System.out.println(diaryEntry.toString());
        }
    }

From source file:wo.trade.SearchPageScraper.java

public List<TradeItem> parse() {
    List<TradeItem> tradeItems = new LinkedList<>();
    Document doc = Jsoup.parse(page, "UTF-8");

    Element content = doc.getElementById("content");

    Elements items = null;//from   w  w  w  .  j av  a  2  s  .  c  om
    if (content == null) {
        items = doc.getElementsByClass("item");
    } else {
        items = content.getElementsByClass("item");
    }

    for (Element element : items) {

        TradeItem item = new TradeItem();

        item.id = element.attr("id");
        item.id = StringUtils.remove(item.id, "item-container-");
        item.seller = element.attr("data-seller");
        item.thread = element.attr("data-thread");
        item.sellerid = element.attr("data-sellerid");
        item.buyout = element.attr("data-buyout");
        item.ign = element.attr("data-ign");
        item.league = element.attr("data-league");
        item.name = element.attr("data-name");
        item.corrupted = element.getElementsByClass("corrupted").size() > 0;
        item.identified = element.getElementsByClass("item-unid").size() == 0;

        //         System.out.println(String.format("Now parsing item id %s name %s", item.id, item.name));

        Element sockElem = element.getElementsByClass("sockets-raw").get(0);
        item.socketsRaw = sockElem.text();

        Elements accntAgeElement = element.getElementsByAttributeValue("title",
                "account age and highest level");
        if (accntAgeElement != null && !accntAgeElement.isEmpty()) {
            item.ageAndHighLvl = accntAgeElement.get(0).text();
        }

        // ----- Requirements ----- //
        Element reqElem = element.getElementsByClass("requirements").get(0);
        List<TextNode> reqNodes = reqElem.textNodes();
        for (TextNode reqNode : reqNodes) {
            // sample [ Level:&nbsp;37 ,  Strength:&nbsp;42 ,  Intelligence:&nbsp;42 ] 
            String req = StringUtils.trimToEmpty(reqNode.getWholeText());
            req = req.replaceAll(regex_horizontal_whitespace, "");
            req = Util.removeThoseDamnWhiteSpace(req);
            String separator = ":";
            String reqType = trim(substringBefore(req, separator));
            switch (reqType) {
            case "Level":
                item.reqLvl = trim(substringAfter(req, separator));
                break;
            case "Strength":
                item.reqStr = trim(substringAfter(req, separator));
                break;
            case "Intelligence":
                item.reqInt = trim(substringAfter(req, separator));
                break;
            case "Dexterity":
                item.reqDex = trim(substringAfter(req, separator));
                break;
            }
        }
        item.mapQuantity = element.getElementsByAttributeValue("data-name", "mapq").stream().findFirst()
                .map(n -> n.text()).map(s -> substringAfter(s, "Item quantity:"))
                .map(s -> StringUtils.removePattern(s, "[^\\d]")).orElse("")
                .replaceAll(regex_horizontal_whitespace, "").trim();

        // ----- Rarity by checking the item name link class ----- //
        // itemframe0 - normal
        // itemframe1 - magic
        // itemframe2 - rare
        // itemframe3 - unique
        // itemframe4 - gems
        // itemframe5 - currency
        // itemframe6 - divination card
        String itemframeStr = element.getElementsByClass("title").stream().findFirst().map(n -> n.attr("class"))
                .orElse(null);
        itemframeStr = Util.regexMatch("itemframe(\\d)", itemframeStr, 1);
        if (itemframeStr != null) {
            int frame = Integer.parseInt(itemframeStr);
            item.rarity = Rarity.valueOf(frame);
        } else {
            item.rarity = Rarity.unknown;
        }

        // ----- Verify ----- //
        item.dataHash = element.getElementsByAttributeValue("onclick", "verify_modern(this)").stream()
                .findFirst().map(n -> n.attr("data-hash")).orElse("").trim();

        // ----- Mods ----- //
        Elements itemModsElements = element.getElementsByClass("item-mods");
        if (itemModsElements != null && itemModsElements.size() > 0) {
            Element itemMods = itemModsElements.get(0);
            if (itemMods.getElementsByClass("bullet-item").size() != 0) {
                Element bulletItem = itemMods.getElementsByClass("bullet-item").get(0);
                Elements ulMods = bulletItem.getElementsByTag("ul");
                if (ulMods.size() == 2) {
                    // implicit mod
                    Elements implicitLIs = ulMods.get(0).getElementsByTag("li");
                    Element implicitLi = implicitLIs.last();
                    Mod impMod = new Mod(implicitLi.attr("data-name"), implicitLi.attr("data-value"));
                    item.implicitMod = impMod;
                }
                int indexOfExplicitMods = ulMods.size() - 1;
                Elements modsLi = ulMods.get(indexOfExplicitMods).getElementsByTag("li");
                for (Element modLi : modsLi) {
                    // explicit mods
                    Mod mod = new Mod(modLi.attr("data-name"), modLi.attr("data-value"));
                    item.explicitMods.add(mod);
                }
            }
        }

        // ----- Properties ----- //
        // this is the third column data (the first col is the image, second is the mods, reqs)
        item.quality = element.getElementsByAttributeValue("data-name", "q").get(0).text()
                .replaceAll(regex_horizontal_whitespace, "").trim();
        item.physDmgRangeAtMaxQuality = element.getElementsByAttributeValue("data-name", "quality_pd").get(0)
                .text().replaceAll(regex_horizontal_whitespace, "").trim();
        item.eleDmgRange = element.getElementsByAttributeValue("data-name", "ed").get(0).text()
                .replaceAll(regex_horizontal_whitespace, "").trim();
        item.attackSpeed = element.getElementsByAttributeValue("data-name", "aps").get(0).text()
                .replaceAll(regex_horizontal_whitespace, "").trim();
        item.dmgAtMaxQuality = element.getElementsByAttributeValue("data-name", "quality_dps").get(0).text()
                .replaceAll(regex_horizontal_whitespace, "").trim();
        item.physDmgAtMaxQuality = element.getElementsByAttributeValue("data-name", "quality_pdps").get(0)
                .text().replaceAll(regex_horizontal_whitespace, "").trim();
        item.eleDmg = element.getElementsByAttributeValue("data-name", "edps").get(0).text()
                .replaceAll(regex_horizontal_whitespace, "").trim();
        item.armourAtMaxQuality = element.getElementsByAttributeValue("data-name", "quality_armour").get(0)
                .text().replaceAll(regex_horizontal_whitespace, "").trim();
        item.evasionAtMaxQuality = element.getElementsByAttributeValue("data-name", "quality_evasion").get(0)
                .text().replaceAll(regex_horizontal_whitespace, "").trim();
        item.energyShieldAtMaxQuality = element.getElementsByAttributeValue("data-name", "quality_shield")
                .get(0).text().replaceAll(regex_horizontal_whitespace, "").trim();
        item.block = element.getElementsByAttributeValue("data-name", "block").get(0).text()
                .replaceAll(regex_horizontal_whitespace, "").trim();
        item.crit = element.getElementsByAttributeValue("data-name", "crit").get(0).text()
                .replaceAll(regex_horizontal_whitespace, "").trim();
        item.level = element.getElementsByAttributeValue("data-name", "level").get(0).text()
                .replaceAll(regex_horizontal_whitespace, "").trim();
        item.imageUrl = element.getElementsByAttributeValue("alt", "Item icon").get(0).attr("src");
        item.stackSize = asList(split(trimToEmpty(item.imageUrl), '&')).stream()
                .filter(t -> t.startsWith("stackSize=")).findFirst().map(s -> substringAfter(s, "="))
                .orElse(null);

        Elements onlineSpans = element.getElementsMatchingText("online");
        if (!onlineSpans.isEmpty()) {
            item.online = "Online";
        } else {
            item.online = "";
        }

        tradeItems.add(item);
    }
    //      System.out.println("DONE --- Items");

    return tradeItems;
}