Usage examples for org.jsoup.nodes.Document#getElementsByClass
public Elements getElementsByClass(String className)
From source file:Search.DataManipulation.DataParser.java
/**
 * Reads the application name from a parsed page.
 *
 * <p>Looks up the elements carrying the {@code document-title} class, takes the
 * first match, and returns the own text of that element's first child.
 *
 * <p>NOTE(review): no guard exists for a page without a {@code document-title}
 * element; jsoup's {@code first()} is documented to yield {@code null} in that
 * case, which would surface here as a NullPointerException - confirm callers
 * only pass pages that contain it.
 *
 * @param dom parsed page to read from
 * @return own text of the first child of the first {@code document-title} element
 */
public String getName(Document dom) {
    return dom.getElementsByClass("document-title").first().child(0).ownText();
}
From source file:Search.DataManipulation.DataParser.java
/**
 * Reads the bundle identifier from a parsed page.
 *
 * <p>Takes the first element carrying the {@code buy-button-container} class and
 * returns the value of its {@code data-docid} attribute.
 *
 * <p>NOTE(review): assumes at least one {@code buy-button-container} element is
 * present; nothing here handles its absence - confirm against callers.
 *
 * @param dom parsed page to read from
 * @return value of the {@code data-docid} attribute on the first match
 */
public String getBundleId(Document dom) {
    return dom.getElementsByClass("buy-button-container").first().attr("data-docid");
}
From source file:Search.DataManipulation.DataParser.java
/**
 * Reads the description text from a parsed page.
 *
 * <p>Takes the first element carrying the {@code id-app-orig-desc} class and
 * returns its own text (text of nested elements is not included by
 * {@code ownText()}).
 *
 * <p>NOTE(review): assumes at least one {@code id-app-orig-desc} element is
 * present; nothing here handles its absence - confirm against callers.
 *
 * @param dom parsed page to read from
 * @return own text of the first {@code id-app-orig-desc} element
 */
public String getDescription(Document dom) {
    return dom.getElementsByClass("id-app-orig-desc").first().ownText();
}
From source file:Search.DataManipulation.DataParser.java
/**
 * Downloads every thumbnail referenced by the page and returns them as a JSON
 * array of Base64 strings.
 *
 * <p>For each child of the first {@code thumbnails} element, the {@code src} of
 * its first {@code img} tag is handed to {@code dataHandler.imageDownloader}.
 * Downloads that come back as a zero-length byte array are left out of the
 * result.
 *
 * @param dom parsed page to read from
 * @return JSON string produced by {@code JSONValue.toJSONString} from the list
 *         of Base64-encoded images
 * @throws IOException declared by this method; presumably raised by the
 *         download helper - confirm
 */
public String getThumbnails(Document dom) throws IOException {
    List<String> encodedImages = new ArrayList<String>();

    for (Element thumbnail : dom.getElementsByClass("thumbnails").first().children()) {
        String source = thumbnail.getElementsByTag("img").first().attr("src");
        byte[] rawImage = dataHandler.imageDownloader(source);

        // Only non-empty downloads make it into the result.
        if (rawImage.length != 0) {
            encodedImages.add(Base64.getEncoder().encodeToString(rawImage));
        }
    }

    return JSONValue.toJSONString(encodedImages);
}
From source file:Search.DataManipulation.DataParser.java
/**
 * Collects the key/value pairs listed in the page's meta-info section.
 *
 * <p>Among the elements carrying the {@code details-section-contents} class, the
 * last one whose first child has the {@code meta-info} class is selected. For
 * each child of that section, the own text of its first child is used as the key
 * and the own text of its last child as the value. Entries keyed
 * {@code "Permissions"}, {@code "Report"} or {@code "Developer"} are skipped.
 *
 * <p>The previous version relied on {@code assert details != null}, which is a
 * no-op unless the JVM runs with assertions enabled, so a page without a
 * meta-info section ended in a NullPointerException. Such a page now yields an
 * empty map, and elements without children are skipped instead of being
 * dereferenced.
 *
 * @param dom parsed page to read from
 * @return map from field label to field value; empty when no meta-info section
 *         is found
 */
public Map<String, String> getMetaData(Document dom) {
    Map<String, String> metaData = new HashMap<>();

    Elements details = null;
    for (Element section : dom.getElementsByClass("details-section-contents")) {
        Elements sectionChildren = section.children();
        // No early exit: a later matching section overrides an earlier one,
        // exactly as before.
        if (!sectionChildren.isEmpty() && sectionChildren.first().hasClass("meta-info")) {
            details = sectionChildren;
        }
    }

    if (details == null) {
        // Nothing to extract; do not depend on a disabled-by-default assert.
        return metaData;
    }

    for (Element detailElement : details) {
        Elements parts = detailElement.children();
        if (parts.isEmpty()) {
            // first()/last() would be null here.
            continue;
        }
        String area = parts.first().ownText();
        String value = parts.last().ownText();
        if (!(area.equals("Permissions") || area.equals("Report") || area.equals("Developer"))) {
            metaData.put(area, value);
        }
    }
    return metaData;
}
From source file:Search.DataManipulation.DataParser.java
public Map<String, String> getRatingData(Document dom) { Map<String, String> ratingData = new HashMap<>(); Elements ratingClass = dom.getElementsByClass("score-container"); Elements ratingDom = ratingClass.first().children(); for (Element rating : ratingDom) { String item = rating.attr("itemprop"); String content = rating.attr("content"); if (item.equals("ratingValue")) { ratingData.put("rating", content); } else if (item.equals("ratingCount")) { ratingData.put(item, content); }//from ww w .j a v a2 s .c o m } return ratingData; }
From source file:uk.co.certait.htmlexporter.demo.DemoTwo.java
/**
 * Fetches a league table page, wraps the table in generated HTML, and exports
 * it to disk in three formats.
 *
 * <p>The page is fetched and parsed with a 10000 ms timeout. The markup of the
 * last element carrying the {@code fulltable} class is passed to
 * {@code generateHTML} ({@code null} is passed when there is no such element).
 * The result is written to {@code league.html} via {@code saveFile} and then
 * exported to {@code ./league.xlsx} and {@code ./league.ods}.
 *
 * @throws Exception declared by this constructor; nothing is caught here
 */
public DemoTwo() throws Exception {
    URL tablePage = new URL("http://news.bbc.co.uk/sport1/hi/football/eng_prem/table/8102708.stm");
    Document document = Jsoup.parse(tablePage, 10000);

    // Each match overwrites the previous one, so the last match wins.
    String table = null;
    for (Element match : document.getElementsByClass("fulltable")) {
        table = match.toString();
    }

    String html = generateHTML(table);
    saveFile("league.html", html.getBytes());

    new ExcelExporter().exportHtml(html, new File("./league.xlsx"));
    new OdsExporter().exportHtml(html, new File("./league.ods"));
}
From source file:us.colloquy.sandbox.TestExtractor.java
/**
 * Exploratory test: parses a single HTML document of letters with jsoup, builds
 * one {@code Letter} per {@code section} element, and prints each letter as
 * pretty-printed JSON.
 *
 * <p>Children of a section are dispatched on their CSS class name
 * (case-insensitive): {@code center} is treated as the addressee line,
 * {@code Data} as the date/place line, {@code petit} and
 * {@code Textpetit_otstup} as notes, and anything else as body content.
 * The method makes no assertions; an {@code IOException} is printed, not
 * rethrown.
 */
@Test
public void useJsoup() {
    String homeDir = System.getProperty("user.home");
    System.out.println(homeDir);

    // The jsoup API allows extracting all elements of letters in files.
    // File input = new File("samples/OEBPS/Text/0001_1006_2001.xhtml");
    File input = new File("samples/pisma-1904/OEBPS/Text/single_doc.html");

    try {
        Document doc = Jsoup.parse(input, "UTF-8");

        List<Letter> letters = new ArrayList<>(); // our model contains only a subset of fields

        // Carried across children and sections; updated whenever a letter date
        // is parsed, and passed to the date parser on later calls.
        String previousYear = "";

        for (Element element : doc.getElementsByClass("section")) {
            Letter letter = new Letter();
            StringBuilder content = new StringBuilder();

            for (Element child : element.children()) {
                // Debug output: dump every attribute of the child.
                for (Attribute att : child.attributes()) {
                    System.out.println(att.getKey() + " " + att.getValue());
                }

                if ("center".equalsIgnoreCase(child.className())) {
                    // Addressee line: prefer the text inside <strong>, falling
                    // back to the whole text of the element.
                    String toWhom = child.getElementsByTag("strong").text();
                    if (StringUtils.isEmpty(toWhom)) {
                        toWhom = child.text();
                        // System.out.println(toWhom);
                    }

                    // Split on a double whitespace or a comma.
                    String[] toWhomArray = toWhom.split("(\\s\\s)|(,)");
                    for (String to : toWhomArray) {
                        RussianDate.parseToWhom(letter, to); // here we need to recognize a Russian name and store that, but for now we store the content
                    }

                    // Check if there is anything else here and find date and place - it will be replaced if it exists below.
                    String entireText = child.text();
                    String tail = entireText.replace(toWhom, "");
                    if (StringUtils.isNotEmpty(tail)) {
                        RussianDate.parseDateAndPlace(letter, tail, previousYear); // a parser that figures out date and place if they are present
                    }
                    // System.out.println("two whom\t " + child.getElementsByTag("strong").text() );
                } else if ("Data".equalsIgnoreCase(child.className())) {
                    // NOTE(review): the null check on getElementsByTag looks
                    // redundant (jsoup appears to return an empty Elements,
                    // not null) - confirm.
                    if (child.getElementsByTag("em") != null
                            && StringUtils.isNotEmpty(child.getElementsByTag("em").text())) {
                        RussianDate.parseDateAndPlace(letter, child.getElementsByTag("em").text(), previousYear); // most often date and place are enclosed in an em tag

                        if (letter.getDate() != null) {
                            // Remember the year of this letter (system default
                            // zone) for subsequent parseDateAndPlace calls.
                            LocalDate localDate =
                                    letter.getDate().toInstant().atZone(ZoneId.systemDefault())
                                            .toLocalDate();
                            int year = localDate.getYear();
                            previousYear = year + "";
                        }
                    }
                    // System.out.println("when and where\t " + child.getElementsByTag("em").text());
                } else if ("petit".equalsIgnoreCase(child.className())
                        || "Textpetit_otstup".equalsIgnoreCase(child.className())) {
                    letter.getNotes().add(child.text());
                } else {
                    // System.out.println(child.text() );

                    // Body content: first replace every <sup> element with a
                    // plain text node "[value]" (this mutates the parsed DOM).
                    Elements elements = child.getElementsByTag("sup");
                    for (Element e : elements) {
                        String value = e.text();
                        e.replaceWith(new TextNode("[" + value + "]", null));
                    }

                    // Then append the text of every element returned by
                    // getAllElements(); any element still tagged "sup" is
                    // wrapped in brackets with surrounding spaces.
                    for (Element el : child.getAllElements()) {
                        // System.out.println(el.tagName());
                        if ("sup".equalsIgnoreCase(el.tagName())) {
                            content.append(" [" + el.text() + "] ");
                        } else {
                            content.append(el.text());
                        }
                    }
                    content.append("\n");
                }
                // System.out.println(child.tag() + "\n" );
                // System.out.println(child.outerHtml() + "\n" + child.text());
            }

            letter.setContent(content.toString());
            letters.add(letter);
        }

        ObjectWriter ow = new com.fasterxml.jackson.databind.ObjectMapper().writer().withDefaultPrettyPrinter();

        for (Letter letter : letters) {
            // if (letter.getDate() == null)
            // {
            //     if (StringUtils.isNotEmpty(person.getLastName()))
            //     {
            String json = ow.writeValueAsString(letter);
            System.out.println(json);
            //     }
            // }
        }
    } catch (IOException e) {
        // Only reported to stderr; the test does not fail on I/O errors.
        e.printStackTrace();
    }
}
From source file:us.colloquy.util.DiaryParser.java
/**
 * Exploratory test: parses one diary XHTML file with jsoup, splits each
 * {@code section} element into {@code DiaryEntry} objects at the points where a
 * date is recognised, and prints every collected entry.
 *
 * <p>The input path is hard-coded under the current user's home directory. The
 * method makes no assertions; an {@code IOException} is printed, not rethrown.
 */
@Test
public void useJsoup() {
    //File input = new File(System.getProperty("user.home") + "/Documents/Tolstoy/openDiaries/dnevnik_1893(2)/OEBPS/Text/0001_1006_2001.xhtml");
    // File input = new File(System.getProperty("user.home") + "/IdeaProjects/ElasticTest/temp/dnevnik_1862(1)/OEBPS/Text/0001_1006_2001.xhtml");
    File input = new File(System.getProperty("user.home") + "/Documents/Tolstoy/90-volume-set/diaries/uzip/dnevnik_1881-1887_vol_49/OEBPS/Text/0001_1011_2005.xhtml");

    // Threaded through every parseDateAndPlace call and replaced by its return value.
    String previousYear = "";

    // Value stored as the source of every entry created below.
    String sourse = "pointer";

    List<DiaryEntry> diaryEntrys = new ArrayList<>();

    try {
        Document doc = Jsoup.parse(input, "UTF-8");

        for (Element element : doc.getElementsByClass("section")) {
            // Entry currently being accumulated; null until the first date in
            // this section is found.
            DiaryEntry diaryEntry = null;
            StringBuilder contentBuilder = new StringBuilder();

            for (Element child : element.children()) {
                // for (Attribute att : child.attributes())
                // {
                //     // System.out.println(att.getKey() + " " + att.getValue());
                // }

                // We need to assume that each element is a continuation unless the entry is a date that starts a new entry.
                // The problem is to distinguish between an entry that contains date and place vs. a date within an entry.
                // Let's try to see if the element is a date.
                DiaryEntry diaryEntryToCollectDate = new DiaryEntry();

                // We send it in two cases: when the text matches a year or when the text has an em element.
                Element em = child.select("em").first();

                if (em == null && StringUtils.isNotEmpty(child.text())) {
                    Matcher m = yearPattern.matcher(child.text());
                    if (m.find()) {
                        // Overwrites the element's text with the first capture
                        // group before handing the element to the parser.
                        child.text(m.group(1));
                        previousYear = parseDateAndPlace(previousYear, diaryEntryToCollectDate, child);
                    }
                }

                if (em != null) {
                    previousYear = parseDateAndPlace(previousYear, diaryEntryToCollectDate, child);
                }

                if (diaryEntryToCollectDate.getDate() != null) // this is the beginning of a new entry
                {
                    System.out.println("Found date: " + diaryEntryToCollectDate.getDate());

                    // Create a new DiaryEntry, flushing the one in progress first.
                    if (diaryEntry != null) {
                        diaryEntry.setEntry(contentBuilder.toString()); // add consecutive entries here
                        diaryEntrys.add(diaryEntry);
                    }

                    diaryEntry = new DiaryEntry();
                    diaryEntry.setSource(sourse);
                    diaryEntry.setDate(diaryEntryToCollectDate.getDate());
                    diaryEntry.setPlace(diaryEntryToCollectDate.getPlace());

                    contentBuilder = new StringBuilder();
                }

                // Only text longer than 8 characters is appended. This runs for
                // the date-bearing element as well, after the builder reset above.
                if (StringUtils.isNotEmpty(child.text()) && child.text().length() > 8) {
                    contentBuilder.append(child.text() + "\n");
                }

                // // System.out.println(child.tag() + "\n");
                // System.out.println(child.outerHtml() + "\n" + child.text());
            }

            // Whatever we still have, add here:
            if (StringUtils.isNotEmpty(contentBuilder.toString()) && diaryEntry != null) {
                diaryEntry.setEntry(contentBuilder.toString());
                diaryEntrys.add(diaryEntry);
            }
        }
    } catch (IOException e) {
        // Only reported to stderr; the test does not fail on I/O errors.
        e.printStackTrace();
    }

    for (DiaryEntry diaryEntry : diaryEntrys) {
        System.out.println(diaryEntry.toString());
    }
}
From source file:wo.trade.SearchPageScraper.java
public List<TradeItem> parse() { List<TradeItem> tradeItems = new LinkedList<>(); Document doc = Jsoup.parse(page, "UTF-8"); Element content = doc.getElementById("content"); Elements items = null;//from w w w . j av a 2 s . c om if (content == null) { items = doc.getElementsByClass("item"); } else { items = content.getElementsByClass("item"); } for (Element element : items) { TradeItem item = new TradeItem(); item.id = element.attr("id"); item.id = StringUtils.remove(item.id, "item-container-"); item.seller = element.attr("data-seller"); item.thread = element.attr("data-thread"); item.sellerid = element.attr("data-sellerid"); item.buyout = element.attr("data-buyout"); item.ign = element.attr("data-ign"); item.league = element.attr("data-league"); item.name = element.attr("data-name"); item.corrupted = element.getElementsByClass("corrupted").size() > 0; item.identified = element.getElementsByClass("item-unid").size() == 0; // System.out.println(String.format("Now parsing item id %s name %s", item.id, item.name)); Element sockElem = element.getElementsByClass("sockets-raw").get(0); item.socketsRaw = sockElem.text(); Elements accntAgeElement = element.getElementsByAttributeValue("title", "account age and highest level"); if (accntAgeElement != null && !accntAgeElement.isEmpty()) { item.ageAndHighLvl = accntAgeElement.get(0).text(); } // ----- Requirements ----- // Element reqElem = element.getElementsByClass("requirements").get(0); List<TextNode> reqNodes = reqElem.textNodes(); for (TextNode reqNode : reqNodes) { // sample [ Level: 37 , Strength: 42 , Intelligence: 42 ] String req = StringUtils.trimToEmpty(reqNode.getWholeText()); req = req.replaceAll(regex_horizontal_whitespace, ""); req = Util.removeThoseDamnWhiteSpace(req); String separator = ":"; String reqType = trim(substringBefore(req, separator)); switch (reqType) { case "Level": item.reqLvl = trim(substringAfter(req, separator)); break; case "Strength": item.reqStr = trim(substringAfter(req, separator)); 
break; case "Intelligence": item.reqInt = trim(substringAfter(req, separator)); break; case "Dexterity": item.reqDex = trim(substringAfter(req, separator)); break; } } item.mapQuantity = element.getElementsByAttributeValue("data-name", "mapq").stream().findFirst() .map(n -> n.text()).map(s -> substringAfter(s, "Item quantity:")) .map(s -> StringUtils.removePattern(s, "[^\\d]")).orElse("") .replaceAll(regex_horizontal_whitespace, "").trim(); // ----- Rarity by checking the item name link class ----- // // itemframe0 - normal // itemframe1 - magic // itemframe2 - rare // itemframe3 - unique // itemframe4 - gems // itemframe5 - currency // itemframe6 - divination card String itemframeStr = element.getElementsByClass("title").stream().findFirst().map(n -> n.attr("class")) .orElse(null); itemframeStr = Util.regexMatch("itemframe(\\d)", itemframeStr, 1); if (itemframeStr != null) { int frame = Integer.parseInt(itemframeStr); item.rarity = Rarity.valueOf(frame); } else { item.rarity = Rarity.unknown; } // ----- Verify ----- // item.dataHash = element.getElementsByAttributeValue("onclick", "verify_modern(this)").stream() .findFirst().map(n -> n.attr("data-hash")).orElse("").trim(); // ----- Mods ----- // Elements itemModsElements = element.getElementsByClass("item-mods"); if (itemModsElements != null && itemModsElements.size() > 0) { Element itemMods = itemModsElements.get(0); if (itemMods.getElementsByClass("bullet-item").size() != 0) { Element bulletItem = itemMods.getElementsByClass("bullet-item").get(0); Elements ulMods = bulletItem.getElementsByTag("ul"); if (ulMods.size() == 2) { // implicit mod Elements implicitLIs = ulMods.get(0).getElementsByTag("li"); Element implicitLi = implicitLIs.last(); Mod impMod = new Mod(implicitLi.attr("data-name"), implicitLi.attr("data-value")); item.implicitMod = impMod; } int indexOfExplicitMods = ulMods.size() - 1; Elements modsLi = ulMods.get(indexOfExplicitMods).getElementsByTag("li"); for (Element modLi : modsLi) { // explicit 
mods Mod mod = new Mod(modLi.attr("data-name"), modLi.attr("data-value")); item.explicitMods.add(mod); } } } // ----- Properties ----- // // this is the third column data (the first col is the image, second is the mods, reqs) item.quality = element.getElementsByAttributeValue("data-name", "q").get(0).text() .replaceAll(regex_horizontal_whitespace, "").trim(); item.physDmgRangeAtMaxQuality = element.getElementsByAttributeValue("data-name", "quality_pd").get(0) .text().replaceAll(regex_horizontal_whitespace, "").trim(); item.eleDmgRange = element.getElementsByAttributeValue("data-name", "ed").get(0).text() .replaceAll(regex_horizontal_whitespace, "").trim(); item.attackSpeed = element.getElementsByAttributeValue("data-name", "aps").get(0).text() .replaceAll(regex_horizontal_whitespace, "").trim(); item.dmgAtMaxQuality = element.getElementsByAttributeValue("data-name", "quality_dps").get(0).text() .replaceAll(regex_horizontal_whitespace, "").trim(); item.physDmgAtMaxQuality = element.getElementsByAttributeValue("data-name", "quality_pdps").get(0) .text().replaceAll(regex_horizontal_whitespace, "").trim(); item.eleDmg = element.getElementsByAttributeValue("data-name", "edps").get(0).text() .replaceAll(regex_horizontal_whitespace, "").trim(); item.armourAtMaxQuality = element.getElementsByAttributeValue("data-name", "quality_armour").get(0) .text().replaceAll(regex_horizontal_whitespace, "").trim(); item.evasionAtMaxQuality = element.getElementsByAttributeValue("data-name", "quality_evasion").get(0) .text().replaceAll(regex_horizontal_whitespace, "").trim(); item.energyShieldAtMaxQuality = element.getElementsByAttributeValue("data-name", "quality_shield") .get(0).text().replaceAll(regex_horizontal_whitespace, "").trim(); item.block = element.getElementsByAttributeValue("data-name", "block").get(0).text() .replaceAll(regex_horizontal_whitespace, "").trim(); item.crit = element.getElementsByAttributeValue("data-name", "crit").get(0).text() 
.replaceAll(regex_horizontal_whitespace, "").trim(); item.level = element.getElementsByAttributeValue("data-name", "level").get(0).text() .replaceAll(regex_horizontal_whitespace, "").trim(); item.imageUrl = element.getElementsByAttributeValue("alt", "Item icon").get(0).attr("src"); item.stackSize = asList(split(trimToEmpty(item.imageUrl), '&')).stream() .filter(t -> t.startsWith("stackSize=")).findFirst().map(s -> substringAfter(s, "=")) .orElse(null); Elements onlineSpans = element.getElementsMatchingText("online"); if (!onlineSpans.isEmpty()) { item.online = "Online"; } else { item.online = ""; } tradeItems.add(item); } // System.out.println("DONE --- Items"); return tradeItems; }