List of usage examples for org.jsoup.nodes.Document.getAllElements()
public Elements getAllElements()
From source file:Main.java
/**
 * Convenience overload that runs the scrape over every element of a document.
 *
 * <p>Passes the result of {@code doc.getAllElements()} together with {@code query} to the
 * {@code getScrapeText} overload that accepts an element collection, and returns whatever
 * that overload returns.
 *
 * @param doc   document whose elements are handed to the delegate; must not be null
 * @param query forwarded unchanged to the delegate
 * @return the delegate's result
 */
public static String getScrapeText(Document doc, String query) { return getScrapeText(doc.getAllElements(), query); }
From source file:coyote.dx.web.TestHtmlWorker.java
@Test public void setHtmlGet() throws IOException { Resource resource = new Resource("http://localhost:" + port + "/data/test.html"); Response response = resource.request(); assertNotNull(response);//from ww w.ja v a 2 s . c om while (!response.isComplete()) { Thread.yield(); } Document doc = response.getDocument(); assertNotNull(doc); Elements elements = doc.getAllElements(); System.out.println(doc.toString()); System.out.println("Retrieved document contains " + elements.size() + " elements"); assertTrue(elements.size() >= 40); }
From source file:edu.usu.sdl.openstorefront.service.io.HelpImporter.java
/** * Accept a stream pointed to markdown//from ww w. j a v a 2 s. com * * @param in * @return */ public List<HelpSection> processHelp(InputStream in) { List<HelpSection> helpSections = new ArrayList<>(); String data = ""; try (BufferedReader bin = new BufferedReader(new InputStreamReader(in))) { data = bin.lines().collect(Collectors.joining("\n")); } catch (IOException e) { } PegDownProcessor pegDownProcessor = new PegDownProcessor(PROCESSING_TIMEOUT); String html = pegDownProcessor.markdownToHtml(data); Document doc = Jsoup.parse(html); Elements elements = doc.getAllElements(); Set<String> headerTags = new HashSet<>(); headerTags.add("h1"); headerTags.add("h2"); headerTags.add("h3"); headerTags.add("h4"); headerTags.add("h5"); headerTags.add("h6"); boolean capture = false; HelpSection helpSection = null; for (Element element : elements) { if (headerTags.contains(element.tagName().toLowerCase()) == false && capture) { if (helpSection != null) { if (helpSection.getContent().contains(element.outerHtml()) == false) { helpSection.setContent(helpSection.getContent() + element.outerHtml()); } } } if (headerTags.contains(element.tagName().toLowerCase())) { String title = element.html(); if (helpSection != null) { //save old section addHelpSection(helpSections, helpSection); } String titleSplit[] = title.split(" "); helpSection = new HelpSection(); helpSection.setTitle(title); helpSection.setHeaderLevel(Convert.toInteger(element.tagName().toLowerCase().replace("h", ""))); helpSection.setSectionNumber(titleSplit[0]); helpSection.setContent(""); if (title.contains("*")) { helpSection.setAdminSection(true); } else { helpSection.setAdminSection(false); } capture = true; } } //Add last section if (helpSection != null) { addHelpSection(helpSections, helpSection); } return helpSections; }
From source file:model.ParseInfoFromSite.java
/**
 * Collects the comment nodes of a page. The document itself is not modified.
 *
 * <p>Every element of the document is visited and each of its direct child nodes whose node
 * name is {@code #comment} is added to the result.
 *
 * @param doc parsed page to scan
 * @return an unmodifiable list of the comment nodes, in the order they were encountered
 */
public List<Comment> findAllComments(Document doc) {
    logger.info("run");

    final List<Comment> found = new ArrayList<>();
    for (Element parent : doc.getAllElements()) {
        for (Node child : parent.childNodes()) {
            if (!child.nodeName().equals("#comment")) {
                continue;
            }
            found.add((Comment) child);
        }
    }

    return Collections.unmodifiableList(found);
}
From source file:com.kingfong.webcrawler.util.DOMContentUtils.java
/** * This method finds all anchors below the supplied DOM * <code>node</code>, and creates appropriate {@link Outlink} * records for each (relative to the supplied <code>base</code> * URL), and adds them to the <code>outlinks</code> {@link * ArrayList}./*from w w w.j a v a 2 s. co m*/ * * <p> * * Links without inner structure (tags, text, etc) are discarded, as * are links which contain only single nested links and empty text * nodes (this is a common DOM-fixup artifact, at least with * nekohtml). */ public void getOutlinks(String html, URL url, HashSet<String> outlinks) { Document document = Jsoup.parse(html); Elements elements = document.getAllElements(); for (Element currentNode : elements) { String nodeName = currentNode.tagName(); // short nodeType = currentNode.; Elements children = currentNode.children(); nodeName = nodeName.toLowerCase(); LinkParams params = linkParams.get(nodeName); if (params != null) { // if (!shouldThrowAwayLink(currentNode, children, childLen, // params)) { // StringBuilder linkText = new StringBuilder(); // getText(linkText, currentNode, true); Attributes attrs = currentNode.attributes(); String target = null; boolean noFollow = false; boolean post = false; Iterator<Attribute> iterator = attrs.iterator(); while (iterator.hasNext()) { Attribute attr = iterator.next(); String attrName = attr.getKey(); if (params.attrName.equalsIgnoreCase(attrName)) { target = attr.getValue(); } else if ("rel".equalsIgnoreCase(attrName) && "nofollow".equalsIgnoreCase(attr.getValue())) { noFollow = true; } else if ("method".equalsIgnoreCase(attrName) && "post".equalsIgnoreCase(attr.getValue())) { post = true; } } if (StringUtils.startsWith(target, "/")) { target = url.getProtocol() + "://" + url.getHost() + target; } if (target != null && URLFilter.filt(target)) { outlinks.add(target); } // } // this should not have any children, skip them if (params.childLen == 0) continue; } } }
From source file:de.tudarmstadt.ukp.experiments.dip.wp1.documents.helpers.boilerplateremoval.impl.JusTextBoilerplateRemoval.java
/** * remove unwanted parts from a jsoup doc * * @param jsoupDoc//from w w w .j a va 2 s . co m * @return */ public Document cleanDom(Document jsoupDoc) { String[] tagsToRemove = { "head", "script", ".hidden", "embedded" }; for (String tag : tagsToRemove) { Elements selectedTags = jsoupDoc.select(tag); for (Element element : selectedTags) { element.remove(); } } //remove comments (might be slow) for (Element element : jsoupDoc.getAllElements()) { for (Node n : element.childNodes()) { NodeHelper.removeComments(n); } } return jsoupDoc; }
From source file:autoInsurance.BeiJPiccImpl.java
void init(Document doc) throws Exception { String str = "<select class=\"w_p80\" name=\"carKindCodeBak\" title=\" \" id=\"carKindCodeBak\"><option value=\"A01\"></option><option value=\"B01\"></option><option value=\"B02\"></option><option value=\"B11\"></option><option value=\"B12\"></option><option value=\"B13\"></option><option value=\"B21\"></option><option value=\"B91\"></option><option value=\"C01\"></option><option value=\"C02\"></option><option value=\"C03\"></option><option value=\"C04\"></option><option value=\"C11\"></option><option value=\"C20\"></option><option value=\"C22\"></option><option value=\"C23\"></option><option value=\"C24\"></option><option value=\"C25\"></option><option value=\"C26\"></option><option value=\"C27\"></option><option value=\"C28\"></option><option value=\"C29\"></option><option value=\"C30\"></option><option value=\"C31\"></option><option value=\"C39\"></option><option value=\"C41\"></option><option value=\"C42\"></option><option value=\"C43\"></option><option value=\"C44\"></option><option value=\"C45\"></option><option value=\"C46\"></option><option value=\"C47\"></option><option value=\"C48\"></option><option value=\"C49\"></option><option value=\"C50\"></option><option value=\"C51\">X</option><option value=\"C52\">/</option><option value=\"C53\">/</option><option value=\"C54\"></option><option value=\"C55\"></option><option value=\"C56\"></option><option value=\"C57\"></option><option value=\"C58\"></option><option value=\"C61\"></option><option value=\"C69\"></option><option value=\"C90\"></option><option value=\"D01\"></option><option value=\"D02\"></option><option value=\"D03\"></option><option value=\"E01\"></option><option value=\"E11\"></option><option value=\"E12\">/</option><option value=\"Z99\"></option></select>"; Document tmpDoc = Jsoup.parse(str); Elements els = tmpDoc.select("#carKindCodeBak> option"); for (Element el : els) { carTypeMap.put(el.attr("value"), el.text()); }/*from w ww.ja va 2 s . 
c o m*/ templateData = new HashMap<String, String>(); List<FormElement> forms = doc.getAllElements().forms(); for (FormElement form : forms) { List<KeyVal> datas = form.formData(); for (KeyVal item : datas) { templateData.put(item.key(), item.value()); //System.out.print(item.key()+"="+item.value() + "&"); } System.out.println("------"); } templateData.put("prpCmainCI.sumAmount", "122000"); templateData.put("prpCitemKindCI.familyNo", "1");//null templateData.put("prpCitemKindCI.amount", "122000");//0 templateData.put("prpCitemKindCI.adjustRate", "0.9");//1 }
From source file:org.norvelle.addressdiscoverer.parse.structured.BackwardsFlattenedDocumentIterator.java
/** * Generate the iterator and position its pointer so it can be walked backward * using next()//from w w w .j a v a 2 s . c o m * * @param soup * @param encoding * @param status * @throws java.io.UnsupportedEncodingException * @throws org.norvelle.addressdiscoverer.exceptions.EndNodeWalkingException */ public BackwardsFlattenedDocumentIterator(Document soup, String encoding, ExtractIndividualsStatusReporter status) throws UnsupportedEncodingException, EndNodeWalkingException { this.status = status; this.status.setTotalNumericSteps(soup.getAllElements().size()); // First we generate the flattened list of elements this.walkNodeBackwards(soup, encoding); this.status.reportProgressText("Backwards document iterator created successfully"); logger.log(Level.FINE, "Flattened document: \n{0}", StringUtils.join(this.elementsWithNames, "\n")); // Now, we set the cursor to the end so we can iterate backwards this.currPosition = this.elementsWithNames.size() - 1; }
From source file:org.norvelle.addressdiscoverer.parse.unstructured.ForwardsFlattenedDocumentIterator.java
/**
 * Builds the flattened element list for a document by walking it forwards, positions the
 * cursor, and files any still-pending intermediate values under the last name-containing
 * element.
 *
 * @param soup     document to flatten
 * @param encoding passed through to {@code walkNodeForwards}
 * @param status   reporter that receives the total step count and a progress message
 * @throws java.io.UnsupportedEncodingException declared for the node walk
 * @throws org.norvelle.addressdiscoverer.exceptions.EndNodeWalkingException declared for
 *         the node walk
 */
public ForwardsFlattenedDocumentIterator(Document soup, String encoding,
        ExtractIndividualsStatusReporter status)
        throws UnsupportedEncodingException, EndNodeWalkingException {
    this.status = status;
    this.status.setTotalNumericSteps(soup.getAllElements().size());

    // First we generate the flattened list of elements
    this.walkNodeForwards(soup, encoding);
    // The message used to say "Backwards", copied from the backwards iterator.
    this.status.reportProgressText("Forwards document iterator created successfully");
    logger.log(Level.FINE, "Flattened document: \n{0}", StringUtils.join(this.elementsWithNames, "\n"));

    // The cursor starts on the last entry of the flattened list.
    // NOTE(review): this mirrors the backwards iterator; confirm that a forwards iterator
    // is really meant to start at the end rather than at index 0.
    this.currPosition = this.elementsWithNames.size() - 1;

    // If we have any remaining Nodes to add as intermediates, add them to
    // the last name Node we found.
    if (!intermediateValuesList.isEmpty()) {
        this.intermediateElementMap.put(lastNameContainingElement, this.intermediateValuesList);
    }
}
From source file:org.tinymediamanager.scraper.ofdb.OfdbMetadataProvider.java
@Override public MediaMetadata getMetadata(MediaScrapeOptions options) throws Exception { LOGGER.debug("getMetadata() " + options.toString()); if (options.getType() != MediaType.MOVIE) { throw new UnsupportedMediaTypeException(options.getType()); }/*from w w w . j a v a 2s . co m*/ // we have 3 entry points here // a) getMetadata has been called with an ofdbId // b) getMetadata has been called with an imdbId // c) getMetadata has been called from a previous search String detailUrl = ""; // case a) and c) if (StringUtils.isNotBlank(options.getId(getProviderInfo().getId())) || options.getResult() != null) { if (StringUtils.isNotBlank(options.getId(getProviderInfo().getId()))) { detailUrl = "http://www.ofdb.de/view.php?page=film&fid=" + options.getId(getProviderInfo().getId()); } else { detailUrl = options.getResult().getUrl(); } } // case b) if (options.getResult() == null && StringUtils.isNotBlank(options.getId(MediaMetadata.IMDB))) { MediaSearchOptions searchOptions = new MediaSearchOptions(MediaType.MOVIE); searchOptions.setImdbId(options.getId(MediaMetadata.IMDB)); try { List<MediaSearchResult> results = search(searchOptions); if (results != null && !results.isEmpty()) { options.setResult(results.get(0)); detailUrl = options.getResult().getUrl(); } } catch (Exception e) { LOGGER.warn("failed IMDB search: " + e.getMessage()); } } // we can only work further if we got a search result on ofdb.de if (StringUtils.isBlank(detailUrl)) { throw new Exception("We did not get any useful movie url"); } MediaMetadata md = new MediaMetadata(providerInfo.getId()); // generic Elements used all over Elements el = null; String ofdbId = StrgUtils.substr(detailUrl, "film\\/(\\d+),"); if (StringUtils.isBlank(ofdbId)) { ofdbId = StrgUtils.substr(detailUrl, "fid=(\\d+)"); } Url url; try { LOGGER.trace("get details page"); url = new Url(detailUrl); InputStream in = url.getInputStream(); Document doc = Jsoup.parse(in, "UTF-8", ""); in.close(); if (doc.getAllElements().size() < 10) { 
throw new Exception("meh - we did not receive a valid web page"); } // parse details // IMDB ID "http://www.imdb.com/Title?1194173" el = doc.getElementsByAttributeValueContaining("href", "imdb.com"); if (!el.isEmpty()) { md.setId(MediaMetadata.IMDB, "tt" + StrgUtils.substr(el.first().attr("href"), "\\?(\\d+)")); } // title / year // <meta property="og:title" content="Bourne Vermchtnis, Das (2012)" /> el = doc.getElementsByAttributeValue("property", "og:title"); if (!el.isEmpty()) { String[] ty = parseTitle(el.first().attr("content")); md.setTitle(StrgUtils.removeCommonSortableName(ty[0])); try { md.setYear(Integer.parseInt(ty[1])); } catch (Exception ignored) { } } // another year position if (md.getYear() == 0) { // <a href="view.php?page=blaettern&Kat=Jahr&Text=2012">2012</a> el = doc.getElementsByAttributeValueContaining("href", "Kat=Jahr"); try { md.setYear(Integer.parseInt(el.first().text())); } catch (Exception ignored) { } } // original title (has to be searched with a regexp) // <tr valign="top"> // <td nowrap=""><font class="Normal" face="Arial,Helvetica,sans-serif" // size="2">Originaltitel:</font></td> // <td> </td> // <td width="99%"><font class="Daten" face="Arial,Helvetica,sans-serif" // size="2"><b>Brave</b></font></td> // </tr> String originalTitle = StrgUtils.substr(doc.body().html(), "(?s)Originaltitel.*?<b>(.*?)</b>"); if (!originalTitle.isEmpty()) { md.setOriginalTitle(StrgUtils.removeCommonSortableName(originalTitle)); } // Genre: <a href="view.php?page=genre&Genre=Action">Action</a> el = doc.getElementsByAttributeValueContaining("href", "page=genre"); for (Element g : el) { md.addGenre(getTmmGenre(g.text())); } // rating // <div itemtype="http://schema.org/AggregateRating" itemscope // itemprop="aggregateRating">Note: <span // itemprop="ratingValue">6.73</span><meta // itemprop="worstRating" content="1" /> el = doc.getElementsByAttributeValue("itemprop", "ratingValue"); if (!el.isEmpty()) { String r = el.text(); if (!r.isEmpty()) { try { 
md.setRating(Float.parseFloat(r)); } catch (Exception e) { LOGGER.debug("could not parse rating"); } } } // get PlotLink; open url and parse // <a href="plot/22523,31360,Die-Bourne-Identitt"><b>[mehr]</b></a> LOGGER.trace("parse plot"); el = doc.getElementsByAttributeValueMatching("href", "plot\\/\\d+,"); if (!el.isEmpty()) { String plotUrl = BASE_URL + "/" + el.first().attr("href"); try { url = new Url(plotUrl); in = url.getInputStream(); Document plot = Jsoup.parse(in, "UTF-8", ""); in.close(); Elements block = plot.getElementsByClass("Blocksatz"); // first // Blocksatz // is plot String p = block.first().text(); // remove all html stuff p = p.substring(p.indexOf("Mal gelesen") + 12); // remove "header" md.setPlot(p); } catch (Exception e) { LOGGER.error("failed to get plot page: " + e.getMessage()); } } // http://www.ofdb.de/view.php?page=film_detail&fid=226745 LOGGER.debug("parse actor detail"); String movieDetail = BASE_URL + "/view.php?page=film_detail&fid=" + ofdbId; doc = null; try { url = new Url(movieDetail); in = url.getInputStream(); doc = Jsoup.parse(in, "UTF-8", ""); in.close(); } catch (Exception e) { LOGGER.error("failed to get detail page: " + e.getMessage()); } if (doc != null) { parseCast(doc.getElementsContainingOwnText("Regie"), MediaCastMember.CastType.DIRECTOR, md); parseCast(doc.getElementsContainingOwnText("Darsteller"), MediaCastMember.CastType.ACTOR, md); parseCast(doc.getElementsContainingOwnText("Stimme/Sprecher"), MediaCastMember.CastType.ACTOR, md); parseCast(doc.getElementsContainingOwnText("Synchronstimme (deutsch)"), MediaCastMember.CastType.ACTOR, md); parseCast(doc.getElementsContainingOwnText("Drehbuchautor(in)"), MediaCastMember.CastType.WRITER, md); parseCast(doc.getElementsContainingOwnText("Produzent(in)"), MediaCastMember.CastType.PRODUCER, md); } } catch (Exception e) { LOGGER.error("Error parsing " + detailUrl); throw e; } return md; }