List of usage examples for org.jsoup.select.Elements#parents()
public Elements parents()
From source file:gov.medicaid.screening.dao.impl.OIGDAOBean.java
/** * Parses the excluded provider profile details page. * * @param page the details page/* ww w . ja v a 2 s . co m*/ * @return the parsed license details * @throws ParsingException if the expected tags were not found */ private ProviderProfile parseProfile(Document page) throws ParsingException { ProviderProfile profile = new ProviderProfile(); // name User user = new User(); profile.setUser(user); user.setLastName(page.select("th:containsOwn(Last Name) + td").text()); user.setFirstName(page.select("th:containsOwn(First Name) + td").text()); // business String businessName = page.select("th:containsOwn(Entity) + td").text(); if (!"N/A".equals(businessName)) { Business business = new Business(); profile.setBusiness(business); business.setName(businessName); } // DOB Date dob = parseDate(page.select("th:has(acronym:containsOwn(DOB)) + td").text(), DATE_FORMAT); if (dob != null) { profile.setDob(dob); } // exclusion type ExclusionType exclusionType = new ExclusionType(); profile.setExclusionType(exclusionType); exclusionType.setName(page.select("th:containsOwn(Excl. Type) + td").text()); // specialty List<Specialty> specialties = new ArrayList<Specialty>(); Specialty specialty = new Specialty(); specialties.add(specialty); specialty.setName(page.select("th:containsOwn(Specialty) + td").text()); profile.setSpecialties(specialties); // address Elements addrElement = page.select("th:containsOwn(Address) + td"); String addr = addrElement.text(); Element addrNextRow = addrElement.parents().first().nextElementSibling(); if ("".equals(addrNextRow.select("th").text())) { addr += " " + addrNextRow.select("td").text(); } Address address = new Address(); address.setLocation(addr); profile.setAddresses(Arrays.asList(new Address[] { address })); Date date = parseDate(page.select("th:containsOwn(Excl. Date) + td").text(), DATE_FORMAT); if (date != null) { profile.setRequestEffectiveDate(date); } return profile; }
From source file:xxx.web.comments.debates.impl.ProConOrgParser.java
@Override public Debate parseDebate(InputStream inputStream) throws IOException { Debate result = new Debate(); Document doc = Jsoup.parse(inputStream, "UTF-8", "http://www.procon.org/"); // Set the Url of the doc // title// w ww . j a va2 s . c om Element body = doc.body(); Elements debateTitleElements = body.select("h2"); // Elements debateTitleElements = body.select("p[class=title]").select("p[style]"); if (debateTitleElements.first() == null) { // not a debate return null; } String title = Utils.normalize(debateTitleElements.first().text()); result.setTitle(title); Elements proConTr = body.select("tr > td > b:contains(PRO \\(yes\\))"); if (proConTr == null || proConTr.parents() == null || proConTr.parents().first() == null || proConTr.parents().first().parents() == null || proConTr.parents().first().parents().first() == null || proConTr.parents().first().parents().first().nextElementSibling() == null) { // not a pro-con debate return null; } Element trAnswers = proConTr.parents().first().parents().first().nextElementSibling(); // the PRO side Element proTd = trAnswers.select("td").get(0); Element conTd = trAnswers.select("td").get(1); // System.out.println(proTd.select("blockquote").size()); // System.out.println(conTd.select("blockquote").size()); for (Element text : proTd.select("blockquote > div[class=editortext]")) { Argument argument = new Argument(); argument.setStance("pro"); argument.setText(extractPlainTextFromTextElement(text)); argument.setOriginalHTML(text.html()); // set ID idCounter++; argument.setId("pcq_" + idCounter); if (!argument.getText().isEmpty()) { result.getArgumentList().add(argument); } else { System.err.println("Failed to extract text from " + text.html()); } } for (Element text : conTd.select("blockquote > div[class=editortext]")) { Argument argument = new Argument(); argument.setStance("con"); argument.setText(extractPlainTextFromTextElement(text)); argument.setOriginalHTML(text.html()); idCounter++; argument.setId("pcq_" + 
idCounter); if (!argument.getText().isEmpty()) { result.getArgumentList().add(argument); } else { System.err.println("Failed to extract text from " + text.html()); } } // show some stats: Map<String, Integer> map = new HashMap<>(); map.put("pro", 0); map.put("con", 0); for (Argument argument : result.getArgumentList()) { map.put(argument.getStance(), map.get(argument.getStance()) + 1); } System.out.println(map); return result; }