List of usage examples for org.jsoup.nodes.Element#previousElementSibling()
public Element previousElementSibling()
From source file:com.kantenkugel.discordbot.jdocparser.JDocParser.java
static void parse(final String jdocBase, final String name, final InputStream inputStream, Map<String, ClassDocumentation> docs) { final String[] pathSplits = name.split("/"); final String fileName = pathSplits[pathSplits.length - 1]; if (!Character.isUpperCase(fileName.charAt(0))) { //ignore jdoc structure html return;//from w w w. jav a 2 s. c om } final String[] nameSplits = fileName.split("\\."); final String className = nameSplits[nameSplits.length - 2]; final String fullName = fileName.substring(0, fileName.length() - nameSplits[nameSplits.length - 1].length() - 1); try (BufferedReader buffer = new BufferedReader(new InputStreamReader(inputStream))) { //create dom Document final String content = buffer.lines().collect(Collectors.joining("\n")); Document document = Jsoup.parse(content); //classDocument (classname, package, description) Element titleElem = getSingleElementByClass(document, "title"); final String classSig = JDocUtil.fixSpaces(titleElem.text()); Element packageElem = titleElem.previousElementSibling(); if (packageElem.children().size() > 1) { packageElem = packageElem.children().last(); } final String pack = JDocUtil.fixSpaces(packageElem.text()); final String link = JDocUtil.getLink(jdocBase, pack, fullName); Element descriptionElement = null; Elements descriptionCandidates = document.select(".description .block"); if (descriptionCandidates.size() > 1) { List<Element> removed = descriptionCandidates.stream().map(elem -> elem.child(0)) .filter(child -> child != null && !child.className().startsWith("deprecat")) .map(Element::parent).collect(Collectors.toList()); if (removed.size() != 1) throw new RuntimeException("Found too many description candidates"); descriptionElement = removed.get(0); } else if (descriptionCandidates.size() == 1) { descriptionElement = descriptionCandidates.get(0); } final String description = descriptionElement == null ? 
"" : JDocUtil.formatText(descriptionElement.html(), link); final ClassDocumentation classDoc = new ClassDocumentation(pack, fullName, classSig, description, classSig.startsWith("Enum")); //methods, fields final Element details = document.getElementsByClass("details").first(); if (details != null) { //methods Element tmp = getSingleElementByQuery(details, "a[name=\"method.detail\"]"); List<DocBlock> docBlock = getDocBlock(jdocBase, tmp, classDoc); if (docBlock != null) { for (DocBlock block : docBlock) { Set<MethodDocumentation> mdocs = classDoc.methodDocs .computeIfAbsent(block.title.toLowerCase(), key -> new HashSet<>()); mdocs.add(new MethodDocumentation(classDoc, block.signature, block.hashLink, block.description, block.fields)); } } //vars tmp = getSingleElementByQuery(details, "a[name=\"field.detail\"]"); docBlock = getDocBlock(jdocBase, tmp, classDoc); if (docBlock != null) { for (DocBlock block : docBlock) { classDoc.classValues.put(block.title.toLowerCase(), new ValueDocumentation(classDoc, block.title, block.hashLink, block.signature, block.description)); } } //enum-values tmp = getSingleElementByQuery(details, "a[name=\"enum.constant.detail\"]"); docBlock = getDocBlock(jdocBase, tmp, classDoc); if (docBlock != null) { for (DocBlock block : docBlock) { classDoc.classValues.put(block.title.toLowerCase(), new ValueDocumentation(classDoc, block.title, block.hashLink, block.signature, block.description)); } } } final Element methodSummary = getSingleElementByQuery(document, "a[name=\"method.summary\"]"); classDoc.inheritedMethods.putAll(getInheritedMethods(methodSummary)); //storing if (nameSplits.length > 2) { if (!docs.containsKey(nameSplits[0].toLowerCase())) docs.put(nameSplits[0].toLowerCase(), new ClassDocumentation(null, null, null, null, false)); ClassDocumentation parent = docs.get(nameSplits[0].toLowerCase()); for (int i = 1; i < nameSplits.length - 2; i++) { if (!parent.subClasses.containsKey(nameSplits[i].toLowerCase())) 
parent.subClasses.put(nameSplits[i].toLowerCase(), new ClassDocumentation(null, null, null, null, false)); parent = parent.subClasses.get(nameSplits[i].toLowerCase()); } if (parent.subClasses.containsKey(className.toLowerCase())) classDoc.subClasses.putAll(parent.subClasses.get(className.toLowerCase()).subClasses); parent.subClasses.put(className.toLowerCase(), classDoc); } if (docs.containsKey(fullName.toLowerCase())) { ClassDocumentation current = docs.get(fullName.toLowerCase()); if (current.classSig != null) throw new RuntimeException("Got a class-name conflict with classes " + classDoc.classSig + "(" + classDoc.className + ") AND " + current.classSig + "(" + current.className + ")"); classDoc.subClasses.putAll(current.subClasses); } docs.put(fullName.toLowerCase(), classDoc); } catch (final IOException | NullPointerException ex) { JDocUtil.LOG.error("Got excaption for element {}", fullName, ex); } try { inputStream.close(); } catch (final IOException e) { JDocUtil.LOG.error("Error closing inputstream", e); } }
From source file:net.pixomania.crawler.W3C.parser.rules.principalAuthors.PrincipalAuthorsRule1.java
/**
 * Extracts the persons listed in the {@code dd} elements that follow a "Principal Author"
 * {@code dt}.
 *
 * <p>Once a {@code dd} is found directly after a {@code dt} with a different label, entries are
 * skipped until an entry whose next sibling starts with "principal author" is reached. Each
 * remaining entry is split on commas and every piece is parsed into a {@link Person}; link
 * targets containing {@code @} become the e-mail address, all other non-empty ones are added as
 * websites.
 *
 * @param url address of the specification, only used for log output
 * @param doc parsed specification page
 * @return the persons found, or {@code null} if there are none
 */
@Override
public ArrayList<Person> run(String url, Document doc) {
    Elements candidates = doc.select("dt:contains(Principal Author) ~ dd");
    if (candidates.size() == 0) {
        return null;
    }

    ArrayList<Person> people = new ArrayList<>();
    boolean skipping = false;
    for (Element entry : candidates) {
        Element before = entry.previousElementSibling();
        if (before.tagName().equals("dt")
                && !before.text().trim().toLowerCase().startsWith("principal author")) {
            skipping = true;
        }

        if (skipping) {
            // stop skipping once the following sibling opens a "principal author" section
            Element after = entry.nextElementSibling();
            if (after != null && after.text().trim().toLowerCase().startsWith("principal author")) {
                skipping = false;
            }
            continue;
        }

        for (String piece : entry.html().split(",")) {
            if (piece.isEmpty()) {
                continue;
            }
            String lowered = piece.toLowerCase();
            boolean looksLikeReference = lowered.startsWith("(in alphabetic")
                    || lowered.startsWith("see acknowl")
                    || lowered.startsWith("the w3")
                    || lowered.startsWith("(see ac")
                    || lowered.startsWith("see participants")
                    || lowered.contains("note:");
            if (looksLikeReference) {
                Log.log("warning", "Spec " + url + " may refer to a different section!");
                continue;
            }
            if (piece.equals("WHATWG:") || piece.equals("W3C:")) {
                continue;
            }

            Document fragment = Jsoup.parse(piece.replaceAll("\n", ""));
            Person person = NameParser.parse(fragment.text());
            if (person == null) {
                continue;
            }
            for (Element anchor : fragment.select("a")) {
                String href = anchor.attr("href");
                if (href.isEmpty()) {
                    continue;
                }
                if (href.contains("@")) {
                    person.setEmail(href.replace("mailto:", ""));
                } else {
                    person.addWebsite(href);
                }
            }
            people.add(person);
        }
    }

    return people.size() == 0 ? null : people;
}
From source file:net.pixomania.crawler.W3C.parser.rules.editors.EditorsRule7.java
@Override public ArrayList<Person> run(String url, Document doc) { ArrayList<Person> editorList = new ArrayList<>(); Elements editors = doc.select("dt:contains(Authors/Editors) ~ dd, dt:contains(Author/Editor) ~ dd"); if (editors.size() == 0) return null; boolean skip = false; for (Element editor : editors) { Element prev = editor.previousElementSibling(); if (prev.tagName().equals("dt")) { if (!prev.text().trim().toLowerCase().startsWith("authors/editors") && !prev.text().trim().toLowerCase().startsWith("author/editor")) { skip = true;//from w w w .j a v a 2 s . co m } } if (skip) { Element next = editor.nextElementSibling(); if (next != null) { if (next.text().trim().toLowerCase().startsWith("authors/editors") || next.text().trim().toLowerCase().startsWith("author/editor")) { skip = false; continue; } } continue; } if (StringUtils.countMatches(editor.text(), " - ") > 2) { Log.log("warning", url + ": This editor may be a list of editors separated by - "); EditorsRule5 ed5 = new EditorsRule5(); return ed5.run(url, doc); } String[] splitted = editor.html().split("<br />|<br clear=\"none\" />"); if (splitted.length < 2) { if (editor.text().equals("WHATWG:") || editor.text().equals("W3C:")) continue; Person result = NameParser.parse(editor.text()); if (result == null) continue; for (int i = 0; i < editor.select("a").size(); i++) { if (!editor.select("a").get(i).attr("href").isEmpty()) { if (editor.select("a").get(i).attr("href").contains("@")) { result.setEmail(editor.select("a").get(i).attr("href").replace("mailto:", "")); } else { result.addWebsite(editor.select("a").get(i).attr("href")); } } } editorList.add(result); } else { for (String split : splitted) { if (!split.isEmpty()) { if (split.equals("WHATWG:") || split.equals("W3C:")) continue; Document newdoc = Jsoup.parse(split.replaceAll("\n", "")); Person result = NameParser.parse(newdoc.text()); if (result == null) continue; for (int i = 0; i < newdoc.select("a").size(); i++) { if 
(!newdoc.select("a").get(i).attr("href").isEmpty()) { if (newdoc.select("a").get(i).attr("href").contains("@")) { result.setEmail(newdoc.select("a").get(i).attr("href").replace("mailto:", "")); } else { result.addWebsite(newdoc.select("a").get(i).attr("href")); } } } editorList.add(result); } } } Element next = editor.nextElementSibling(); if (next != null) if (next.tag().getName().equals("dt")) break; } if (editorList.size() == 0) return null; return editorList; }
From source file:net.pixomania.crawler.W3C.parser.rules.editors.EditorsRule2.java
/**
 * Extracts the persons listed in the {@code dd} elements that follow an "Editor" or
 * "Edition Editor" {@code dt}.
 *
 * <p>Entries following a {@code dt} that is labelled differently, mentions "version" or ends with
 * "draft:" are skipped. An entry containing more than two {@code " - "} separators makes the whole
 * run delegate to {@link EditorsRule5}. Entries are split on line-break tags, pieces that look
 * like a pointer to another section are logged and ignored, and the remaining pieces are parsed
 * into {@link Person}s. Processing stops when the next sibling of a handled entry is a {@code dt}.
 *
 * @param url address of the specification, used for log output and delegation
 * @param doc parsed specification page
 * @return the persons found, or {@code null} if there are none
 */
@Override
public ArrayList<Person> run(String url, Document doc) {
    Elements candidates = doc.select("dt:contains(Editor) ~ dd, dt:contains(Edition Editor) ~ dd");
    if (candidates.size() == 0) {
        return null;
    }

    ArrayList<Person> people = new ArrayList<>();
    boolean skipping = false;
    for (Element entry : candidates) {
        Element before = entry.previousElementSibling();
        if (before.tagName().equals("dt")) {
            String label = before.text().trim().toLowerCase();
            boolean editorLabel = label.startsWith("editor") || label.startsWith("edition editor");
            if (!editorLabel || label.contains("version") || label.endsWith("draft:")) {
                skipping = true;
            }
        }

        if (skipping) {
            Element after = entry.nextElementSibling();
            if (after != null) {
                String afterLabel = after.text().trim().toLowerCase();
                if (afterLabel.startsWith("editor") || afterLabel.contains("edition editor")) {
                    skipping = false;
                }
            }
            continue;
        }

        if (StringUtils.countMatches(entry.text(), " - ") > 2) {
            Log.log("warning", "This editor may be a list of editors separated by - ");
            return new EditorsRule5().run(url, doc);
        }

        String[] pieces = entry.html().split("<br />|<br clear=\"none\" />");
        if (pieces.length < 2) {
            // single person in this entry: work on the element itself
            String lowered = entry.text().toLowerCase();
            if (lowered.startsWith("(in alphabetic")
                    || lowered.startsWith("see acknowl")
                    || lowered.startsWith("the w3")
                    || lowered.startsWith("(see ac")
                    || lowered.startsWith("see participants")
                    || lowered.contains("note:")) {
                Log.log("warning", "Spec " + url + " may refer to a different section!");
                continue;
            }
            if (entry.text().equals("WHATWG:") || entry.text().equals("W3C:")) {
                continue;
            }
            Person person = NameParser.parse(entry.text());
            if (person == null) {
                continue;
            }
            for (Element anchor : entry.select("a")) {
                String href = anchor.attr("href");
                if (href.isEmpty()) {
                    continue;
                }
                if (href.contains("@")) {
                    person.setEmail(href.replace("mailto:", ""));
                } else {
                    person.addWebsite(href);
                }
            }
            people.add(person);
        } else {
            // several persons separated by line breaks: parse every piece on its own
            for (String piece : pieces) {
                if (piece.isEmpty()) {
                    continue;
                }
                String lowered = piece.toLowerCase();
                if (lowered.startsWith("(in alphabetic")
                        || lowered.startsWith("see acknowl")
                        || lowered.startsWith("the w3")
                        || lowered.startsWith("(see ac")
                        || lowered.startsWith("see participants")
                        || lowered.contains("note:")) {
                    Log.log("warning", "Spec " + url + " may refer to a different section!");
                    continue;
                }
                if (piece.equals("WHATWG:") || piece.equals("W3C:")) {
                    continue;
                }
                Document fragment = Jsoup.parse(piece.replaceAll("\n", ""));
                Person person = NameParser.parse(fragment.text());
                if (person == null) {
                    continue;
                }
                for (Element anchor : fragment.select("a")) {
                    String href = anchor.attr("href");
                    if (href.isEmpty()) {
                        continue;
                    }
                    if (href.contains("@")) {
                        person.setEmail(href.replace("mailto:", ""));
                    } else {
                        person.addWebsite(href);
                    }
                }
                people.add(person);
            }
        }

        Element after = entry.nextElementSibling();
        if (after != null && after.tag().getName().equals("dt")) {
            break;
        }
    }

    return people.size() == 0 ? null : people;
}
From source file:net.pixomania.crawler.W3C.parser.rules.editors.EditorsRule8.java
@Override public ArrayList<Person> run(String url, Document doc) { ArrayList<Person> editorList = new ArrayList<>(); Elements editors = doc.select("h4:contains(Editor) ~ blockquote"); if (editors.size() == 0) return null; boolean skip = false; for (Element editor : editors) { Element prev = editor.previousElementSibling(); if (prev.tagName().equals("h4")) { if ((!prev.text().trim().toLowerCase().startsWith("editor") && !prev.text().trim().toLowerCase().startsWith("edition editor")) || prev.text().trim().toLowerCase().endsWith("version:") || prev.text().trim().toLowerCase().endsWith("draft:")) { skip = true;// ww w . j a v a 2 s . c o m } } if (skip) { Element next = editor.nextElementSibling(); if (next != null) { if (next.text().trim().toLowerCase().startsWith("editor") || next.text().trim().toLowerCase().contains("edition editor")) { skip = false; continue; } } continue; } if (StringUtils.countMatches(editor.text(), " - ") > 2) { Log.log("warning", "This editor may be a list of editors separated by - "); EditorsRule5 ed5 = new EditorsRule5(); return ed5.run(url, doc); } String[] splitted = editor.html().split("<br />|<br clear=\"none\" />"); if (splitted.length < 2) { if (editor.text().toLowerCase().startsWith("(in alphabetic") || editor.text().toLowerCase().startsWith("see acknowl") || editor.text().toLowerCase().startsWith("the w3") || editor.text().toLowerCase().startsWith("(see ac") || editor.text().toLowerCase().startsWith("see participants") || editor.text().toLowerCase().contains("note:")) { Log.log("warning", "Spec " + url + " may refer to a different section!"); continue; } if (editor.text().equals("WHATWG:") || editor.text().equals("W3C:")) continue; Person result = NameParser.parse(editor.text()); if (result == null) continue; for (int i = 0; i < editor.select("a").size(); i++) { if (!editor.select("a").get(i).attr("href").isEmpty()) { if (editor.select("a").get(i).attr("href").contains("@")) { 
result.setEmail(editor.select("a").get(i).attr("href").replace("mailto:", "")); } else { result.addWebsite(editor.select("a").get(i).attr("href")); } } } editorList.add(result); } else { for (String split : splitted) { if (!split.isEmpty()) { if (split.toLowerCase().startsWith("(in alphabetic") || split.toLowerCase().startsWith("see acknowl") || split.toLowerCase().startsWith("the w3") || split.toLowerCase().startsWith("(see ac") || split.toLowerCase().startsWith("see participants") || split.toLowerCase().contains("note:")) { Log.log("warning", "Spec " + url + " may refer to a different section!"); continue; } if (split.equals("WHATWG:") || split.equals("W3C:")) continue; Document newdoc = Jsoup.parse(split.replaceAll("\n", "")); Person result = NameParser.parse(newdoc.text()); if (result == null) continue; for (int i = 0; i < newdoc.select("a").size(); i++) { if (!newdoc.select("a").get(i).attr("href").isEmpty()) { if (newdoc.select("a").get(i).attr("href").contains("@")) { result.setEmail(newdoc.select("a").get(i).attr("href").replace("mailto:", "")); } else { result.addWebsite(newdoc.select("a").get(i).attr("href")); } } } editorList.add(result); } } } Element next = editor.nextElementSibling(); if (next != null) if (next.tag().getName().equals("h4")) break; } if (editorList.size() == 0) return null; return editorList; }
From source file:net.pixomania.crawler.W3C.parser.rules.editors.version.VersionEditorRule1.java
/**
 * Extracts the persons listed under "version 1" {@code dt} headings and tags each of them with
 * the text of the heading they were found under.
 *
 * <p>The selection contains both the matching {@code dt} elements and the {@code dd} elements
 * after them. A selected {@code dt} only updates the current version label. {@code dd} entries
 * following a differently labelled {@code dt} are skipped. Entries are split on line-break tags,
 * pieces that look like a pointer to another section are logged and ignored, and the remaining
 * pieces are parsed into {@link Person}s. Processing stops when the next sibling of a handled
 * entry is a {@code dt} that does not start with "editors (version 1".
 *
 * @param url address of the specification, only used for log output
 * @param doc parsed specification page
 * @return the persons found, or {@code null} if there are none
 */
@Override
public ArrayList<Person> run(String url, Document doc) {
    Elements candidates = doc.select("dt:contains(version 1), dt:contains(version 1) ~ dd");
    if (candidates.size() == 0) {
        return null;
    }

    ArrayList<Person> people = new ArrayList<>();
    boolean skipping = false;
    String currentVersion = "";
    for (Element entry : candidates) {
        Element before = entry.previousElementSibling();
        if (before != null) {
            if (before.tagName().equals("dt")) {
                String label = before.text().trim().toLowerCase();
                if (!label.startsWith("version 1") && !label.startsWith("editors (version 1")) {
                    skipping = true;
                }
            }
            if (skipping) {
                Element after = entry.nextElementSibling();
                if (after != null) {
                    String afterLabel = after.text().trim().toLowerCase();
                    if (afterLabel.startsWith("version 1") || afterLabel.startsWith("editors (version 1")) {
                        skipping = false;
                    }
                }
                continue;
            }
        }

        if (entry.tagName().equals("dt")) {
            // heading element: remember its text for the persons that follow
            currentVersion = entry.text();
            continue;
        }

        String[] pieces = entry.html().split("<br />|<br clear=\"none\" />");
        if (pieces.length < 2) {
            // single person in this entry: work on the element itself
            String lowered = entry.text().toLowerCase();
            if (lowered.startsWith("(in alphabetic")
                    || lowered.startsWith("see acknowl")
                    || lowered.startsWith("the w3")
                    || lowered.startsWith("(see ac")
                    || lowered.startsWith("see participants")
                    || lowered.contains("note:")) {
                Log.log("warning", "Spec " + url + " may refer to a different section!");
                continue;
            }
            if (entry.text().equals("WHATWG:") || entry.text().equals("W3C:")) {
                continue;
            }
            Person person = NameParser.parse(entry.text());
            if (person == null) {
                continue;
            }
            person.setVersion(currentVersion);
            for (Element anchor : entry.select("a")) {
                String href = anchor.attr("href");
                if (href.isEmpty()) {
                    continue;
                }
                if (href.contains("@")) {
                    person.setEmail(href.replace("mailto:", ""));
                } else {
                    person.addWebsite(href);
                }
            }
            people.add(person);
        } else {
            // several persons separated by line breaks: parse every piece on its own
            for (String piece : pieces) {
                if (piece.isEmpty()) {
                    continue;
                }
                String lowered = piece.toLowerCase();
                if (lowered.startsWith("(in alphabetic")
                        || lowered.startsWith("see acknowl")
                        || lowered.startsWith("the w3")
                        || lowered.startsWith("(see ac")
                        || lowered.startsWith("see participants")
                        || lowered.contains("note:")) {
                    Log.log("warning", "Spec " + url + " may refer to a different section!");
                    continue;
                }
                if (piece.equals("WHATWG:") || piece.equals("W3C:")) {
                    continue;
                }
                Document fragment = Jsoup.parse(piece.replaceAll("\n", ""));
                Person person = NameParser.parse(fragment.text());
                if (person == null) {
                    continue;
                }
                person.setVersion(currentVersion);
                for (Element anchor : fragment.select("a")) {
                    String href = anchor.attr("href");
                    if (href.isEmpty()) {
                        continue;
                    }
                    if (href.contains("@")) {
                        person.setEmail(href.replace("mailto:", ""));
                    } else {
                        person.addWebsite(href);
                    }
                }
                people.add(person);
            }
        }

        Element after = entry.nextElementSibling();
        if (after != null
                && after.tag().getName().equals("dt")
                && !after.text().trim().toLowerCase().startsWith("editors (version 1")) {
            break;
        }
    }

    return people.size() == 0 ? null : people;
}
From source file:com.jimplush.goose.ContentExtractor.java
/** * adds any siblings that may have a decent score to this node * * @param node/*from w w w . j ava2 s . c o m*/ * @return */ private Element addSiblings(Element node) { if (logger.isDebugEnabled()) { logger.debug("Starting to add siblings"); } int baselineScoreForSiblingParagraphs = getBaselineScoreForSiblings(node); Element currentSibling = node.previousElementSibling(); while (currentSibling != null) { if (logger.isDebugEnabled()) { logger.debug("SIBLINGCHECK: " + debugNode(currentSibling)); } if (currentSibling.tagName().equals("p")) { node.child(0).before(currentSibling.outerHtml()); currentSibling = currentSibling.previousElementSibling(); continue; } // check for a paraph embedded in a containing element int insertedSiblings = 0; Elements potentialParagraphs = currentSibling.getElementsByTag("p"); if (potentialParagraphs.first() == null) { currentSibling = currentSibling.previousElementSibling(); continue; } for (Element firstParagraph : potentialParagraphs) { WordStats wordStats = StopWords.getStopWordCount(firstParagraph.text()); int paragraphScore = wordStats.getStopWordCount(); if ((float) (baselineScoreForSiblingParagraphs * .30) < paragraphScore) { if (logger.isDebugEnabled()) { logger.debug("This node looks like a good sibling, adding it"); } node.child(insertedSiblings).before("<p>" + firstParagraph.text() + "<p>"); insertedSiblings++; } } currentSibling = currentSibling.previousElementSibling(); } return node; }
From source file:cn.edu.hfut.dmic.contentextractor.ContentExtractor.java
/**
 * Tries to determine the author of the document, using the position of the time string
 * {@code srcTime} as an anchor.
 *
 * <p>The candidates are tried in this order, the first non-blank result wins:
 * <ol>
 * <li>if {@code srcTime} is blank or not found in the body: the whole body HTML;</li>
 * <li>the parts of the anchor element's HTML around {@code srcTime};</li>
 * <li>the HTML of the closest ancestor for which {@code noText} is false;</li>
 * <li>the nearest previous, then the nearest next sibling for which {@code noText} is false;</li>
 * <li>that ancestor's HTML with {@code srcTime} blanked out;</li>
 * <li>the whole body HTML, and finally {@code author_bak}.</li>
 * </ol>
 *
 * @return the detected author, or {@code author_bak} if every attempt yields a blank value
 * @throws XpathSyntaxErrorException declared by this method; presumably raised by
 *                                   {@code getAuthor(String)} - confirm against that method
 */
private String getAuthor() throws XpathSyntaxErrorException {
    String author = "";
    if (StringUtils.isBlank(srcTime)) {
        author = getAuthor(doc.body().html());
        return author;
    }
    Element cur = doc.body().select("*:containsOwn(" + srcTime + ")").first();
    if (cur == null) {
        LOG.warn("?srcTime=" + srcTime);
        author = getAuthor(doc.body().html());
        return author;
    }
    if (!noText(cur)) {
        // srcTime is literal text, not a pattern: quote it so regex metacharacters in the
        // time string cannot change where the HTML is split or make split() throw
        String[] arr = cur.html().split(java.util.regex.Pattern.quote(srcTime));
        for (String text : arr) {
            author = getShortText(text);
            if (!StringUtils.isBlank(author)) {
                return author;
            }
        }
    }
    Element parent = cur.parent();
    while (parent != null && noText(parent)) {
        cur = parent;
        parent = parent.parent();
    }
    // the loop above may run past the root, in which case there is no ancestor to inspect
    if (parent != null) {
        author = getAuthor(parent.html());
        if (!StringUtils.isBlank(author)) {
            return author;
        }
    }
    Element pre = cur.previousElementSibling();
    while (pre != null && noText(pre)) {
        pre = pre.previousElementSibling();
    }
    if (pre != null) {
        author = getShortText(pre.text());
    }
    if (!StringUtils.isBlank(author)) {
        return author;
    }
    Element next = cur.nextElementSibling();
    while (next != null && noText(next)) {
        next = next.nextElementSibling();
    }
    if (next != null) {
        author = getShortText(next.text());
    }
    if (!StringUtils.isBlank(author)) {
        return author;
    }
    if (parent != null) {
        author = getShortText(parent.html().replace(srcTime, " "));
        if (!StringUtils.isBlank(author)) {
            return author;
        }
    }
    author = getAuthor(doc.body().html());
    if (StringUtils.isBlank(author)) {
        return author_bak;
    }
    return author;
}
From source file:org.asqatasun.rules.elementselector.LinkElementSelector.java
/** * /*from www.j a v a2 s. co m*/ * @param element * @return whether one of the preceding sibling is of heading type */ private boolean isOneOfPrecedingSiblingofHeadingType(Element element) { Element prevElementSibling = element.previousElementSibling(); while (prevElementSibling != null) { if (PREV_SIBLING_CONTEXT_ELEMENTS.contains(prevElementSibling.tagName()) || !prevElementSibling.select(CssLikeQueryStore.HEADINGS_CSS_LIKE_QUERY).isEmpty()) { return true; } prevElementSibling = prevElementSibling.previousElementSibling(); } return false; }