Example usage for org.jsoup.nodes Element previousElementSibling

List of usage examples for org.jsoup.nodes Element previousElementSibling

Introduction

On this page you can find example usages of org.jsoup.nodes Element previousElementSibling.

Prototype

public Element previousElementSibling() 

Source Link

Document

Gets the previous element sibling of this element.

Usage

From source file:com.kantenkugel.discordbot.jdocparser.JDocParser.java

/**
 * Parses a single javadoc HTML page and registers the resulting {@link ClassDocumentation}
 * in {@code docs}.
 *
 * <p>Only files whose name starts with an upper-case letter and has an extension are treated
 * as class pages; everything else is ignored. The class signature, package, description,
 * method/field/enum-constant details and inherited methods are read from the page. The result
 * is stored in {@code docs} under the lower-cased full name and, for nested classes
 * (file names with more than one dot-separated name part), also inside the
 * {@code subClasses} map of its enclosing class entry.
 *
 * @param jdocBase    base value handed to {@code JDocUtil.getLink} and {@code getDocBlock}
 * @param name        path of the file inside the javadoc tree, separated by {@code /}
 * @param inputStream stream providing the HTML; closed once a class page has been processed
 * @param docs        registry that receives the parsed documentation
 * @throws RuntimeException if the description cannot be determined unambiguously or if a
 *                          class with the same full name has already been registered
 */
static void parse(final String jdocBase, final String name, final InputStream inputStream,
        Map<String, ClassDocumentation> docs) {
    final String[] pathSplits = name.split("/");
    if (pathSplits.length == 0) {
        //name consisted of separators only, there is no file to look at
        return;
    }
    final String fileName = pathSplits[pathSplits.length - 1];
    if (fileName.isEmpty() || !Character.isUpperCase(fileName.charAt(0))) {
        //ignore jdoc structure html
        return;
    }
    final String[] nameSplits = fileName.split("\\.");
    if (nameSplits.length < 2) {
        //no extension: cannot be a class page, and the index arithmetic below needs two parts
        return;
    }
    final String className = nameSplits[nameSplits.length - 2];
    //file name without its extension, e.g. "Outer.Inner" for "Outer.Inner.html"
    final String fullName = fileName.substring(0,
            fileName.length() - nameSplits[nameSplits.length - 1].length() - 1);
    try (BufferedReader buffer = new BufferedReader(new InputStreamReader(inputStream))) {
        //create dom Document
        final String content = buffer.lines().collect(Collectors.joining("\n"));
        Document document = Jsoup.parse(content);

        //classDocument (classname, package, description)
        Element titleElem = getSingleElementByClass(document, "title");
        final String classSig = JDocUtil.fixSpaces(titleElem.text());
        Element packageElem = titleElem.previousElementSibling();
        if (packageElem.children().size() > 1) {
            packageElem = packageElem.children().last();
        }
        final String pack = JDocUtil.fixSpaces(packageElem.text());
        final String link = JDocUtil.getLink(jdocBase, pack, fullName);
        Element descriptionElement = null;
        Elements descriptionCandidates = document.select(".description .block");
        if (descriptionCandidates.size() > 1) {
            //drop blocks whose first child element is a deprecation marker; a block without
            //any child element (plain text) cannot be one and is kept
            List<Element> remaining = descriptionCandidates.stream()
                    .filter(elem -> elem.children().isEmpty()
                            || !elem.child(0).className().startsWith("deprecat"))
                    .collect(Collectors.toList());
            if (remaining.size() != 1)
                throw new RuntimeException("Found too many description candidates");
            descriptionElement = remaining.get(0);
        } else if (descriptionCandidates.size() == 1) {
            descriptionElement = descriptionCandidates.get(0);
        }
        final String description = descriptionElement == null ? ""
                : JDocUtil.formatText(descriptionElement.html(), link);
        final ClassDocumentation classDoc = new ClassDocumentation(pack, fullName, classSig, description,
                classSig.startsWith("Enum"));

        //methods, fields
        final Element details = document.getElementsByClass("details").first();
        if (details != null) {
            //methods
            Element tmp = getSingleElementByQuery(details, "a[name=\"method.detail\"]");
            List<DocBlock> docBlock = getDocBlock(jdocBase, tmp, classDoc);
            if (docBlock != null) {
                for (DocBlock block : docBlock) {
                    //overloads share one lower-cased title, hence a set per name
                    Set<MethodDocumentation> mdocs = classDoc.methodDocs
                            .computeIfAbsent(block.title.toLowerCase(), key -> new HashSet<>());
                    mdocs.add(new MethodDocumentation(classDoc, block.signature, block.hashLink,
                            block.description, block.fields));
                }
            }
            //vars
            tmp = getSingleElementByQuery(details, "a[name=\"field.detail\"]");
            docBlock = getDocBlock(jdocBase, tmp, classDoc);
            if (docBlock != null) {
                for (DocBlock block : docBlock) {
                    classDoc.classValues.put(block.title.toLowerCase(), new ValueDocumentation(classDoc,
                            block.title, block.hashLink, block.signature, block.description));
                }
            }
            //enum-values
            tmp = getSingleElementByQuery(details, "a[name=\"enum.constant.detail\"]");
            docBlock = getDocBlock(jdocBase, tmp, classDoc);
            if (docBlock != null) {
                for (DocBlock block : docBlock) {
                    classDoc.classValues.put(block.title.toLowerCase(), new ValueDocumentation(classDoc,
                            block.title, block.hashLink, block.signature, block.description));
                }
            }
        }
        final Element methodSummary = getSingleElementByQuery(document, "a[name=\"method.summary\"]");
        classDoc.inheritedMethods.putAll(getInheritedMethods(methodSummary));

        //storing
        if (nameSplits.length > 2) {
            //nested class: walk (and create placeholders for) the chain of enclosing classes
            if (!docs.containsKey(nameSplits[0].toLowerCase()))
                docs.put(nameSplits[0].toLowerCase(), new ClassDocumentation(null, null, null, null, false));
            ClassDocumentation parent = docs.get(nameSplits[0].toLowerCase());
            for (int i = 1; i < nameSplits.length - 2; i++) {
                if (!parent.subClasses.containsKey(nameSplits[i].toLowerCase()))
                    parent.subClasses.put(nameSplits[i].toLowerCase(),
                            new ClassDocumentation(null, null, null, null, false));
                parent = parent.subClasses.get(nameSplits[i].toLowerCase());
            }
            //keep sub classes that were registered on a placeholder before this page was parsed
            if (parent.subClasses.containsKey(className.toLowerCase()))
                classDoc.subClasses.putAll(parent.subClasses.get(className.toLowerCase()).subClasses);
            parent.subClasses.put(className.toLowerCase(), classDoc);
        }
        if (docs.containsKey(fullName.toLowerCase())) {
            ClassDocumentation current = docs.get(fullName.toLowerCase());
            //a non-null signature means a real class (not a placeholder) already uses this name
            if (current.classSig != null)
                throw new RuntimeException("Got a class-name conflict with classes " + classDoc.classSig + "("
                        + classDoc.className + ") AND " + current.classSig + "(" + current.className + ")");
            classDoc.subClasses.putAll(current.subClasses);
        }
        docs.put(fullName.toLowerCase(), classDoc);
    } catch (final IOException | NullPointerException ex) {
        //best effort: a page that lacks an expected element is logged and skipped
        JDocUtil.LOG.error("Got excaption for element {}", fullName, ex);
    }
    //the reader above already closed the stream; this second close is kept from the original
    try {
        inputStream.close();
    } catch (final IOException e) {
        JDocUtil.LOG.error("Error closing inputstream", e);
    }
}

From source file:net.pixomania.crawler.W3C.parser.rules.principalAuthors.PrincipalAuthorsRule1.java

/**
 * Collects the people listed in the {@code dd} entries that follow a "Principal Author"
 * {@code dt} heading.
 *
 * <p>Each entry's HTML is split on commas and every non-empty fragment is handed to
 * {@code NameParser}. Links found in a fragment are recorded as e-mail address (when the
 * href contains {@code @}) or as website.
 *
 * @param url address of the specification, only used in log output
 * @param doc parsed specification page
 * @return the people found, or {@code null} when no entry matched or nobody could be parsed
 */
@Override
public ArrayList<Person> run(String url, Document doc) {
    Elements candidates = doc.select("dt:contains(Principal Author) ~ dd");
    if (candidates.isEmpty()) {
        return null;
    }

    ArrayList<Person> authors = new ArrayList<>();
    boolean skipping = false;
    for (Element entry : candidates) {
        // A dt with a different label directly before this entry starts a foreign section.
        Element heading = entry.previousElementSibling();
        if (heading.tagName().equals("dt")
                && !heading.text().trim().toLowerCase().startsWith("principal author")) {
            skipping = true;
        }

        if (skipping) {
            // Stop skipping once the next sibling is a "principal author" heading again.
            Element following = entry.nextElementSibling();
            if (following != null
                    && following.text().trim().toLowerCase().startsWith("principal author")) {
                skipping = false;
            }
            continue;
        }

        for (String fragment : entry.html().split(",")) {
            if (fragment.isEmpty()) {
                continue;
            }
            String lowered = fragment.toLowerCase();
            if (lowered.startsWith("(in alphabetic") || lowered.startsWith("see acknowl")
                    || lowered.startsWith("the w3") || lowered.startsWith("(see ac")
                    || lowered.startsWith("see participants") || lowered.contains("note:")) {
                Log.log("warning", "Spec " + url + " may refer to a different section!");
                continue;
            }
            if (fragment.equals("WHATWG:") || fragment.equals("W3C:")) {
                continue;
            }

            Document parsed = Jsoup.parse(fragment.replaceAll("\n", ""));
            Person person = NameParser.parse(parsed.text());
            if (person == null) {
                continue;
            }

            for (Element anchor : parsed.select("a")) {
                String href = anchor.attr("href");
                if (href.isEmpty()) {
                    continue;
                }
                if (href.contains("@")) {
                    person.setEmail(href.replace("mailto:", ""));
                } else {
                    person.addWebsite(href);
                }
            }
            authors.add(person);
        }
    }

    return authors.isEmpty() ? null : authors;
}

From source file:net.pixomania.crawler.W3C.parser.rules.editors.EditorsRule7.java

/**
 * Collects the people listed in the {@code dd} entries that follow an "Authors/Editors" or
 * "Author/Editor" {@code dt} heading.
 *
 * <p>An entry is split on {@code <br>} markup; a single-part entry is parsed from its text,
 * a multi-part entry fragment by fragment. If an entry contains more than two {@code " - "}
 * separators the whole page is delegated to {@code EditorsRule5} instead. Processing stops
 * at the first handled entry that is directly followed by another {@code dt}.
 *
 * @param url address of the specification, used in log output and passed on when delegating
 * @param doc parsed specification page
 * @return the people found, or {@code null} when no entry matched or nobody could be parsed
 */
@Override
public ArrayList<Person> run(String url, Document doc) {
    Elements candidates = doc.select("dt:contains(Authors/Editors) ~ dd, dt:contains(Author/Editor) ~ dd");
    if (candidates.isEmpty()) {
        return null;
    }

    ArrayList<Person> people = new ArrayList<>();
    boolean skipping = false;
    for (Element entry : candidates) {
        // A dt with a different label directly before this entry starts a foreign section.
        Element heading = entry.previousElementSibling();
        if (heading.tagName().equals("dt")) {
            String headingText = heading.text().trim().toLowerCase();
            if (!(headingText.startsWith("authors/editors") || headingText.startsWith("author/editor"))) {
                skipping = true;
            }
        }

        if (skipping) {
            // Stop skipping once the next sibling is a matching heading again.
            Element following = entry.nextElementSibling();
            if (following != null) {
                String followingText = following.text().trim().toLowerCase();
                if (followingText.startsWith("authors/editors")
                        || followingText.startsWith("author/editor")) {
                    skipping = false;
                }
            }
            continue;
        }

        if (StringUtils.countMatches(entry.text(), " - ") > 2) {
            Log.log("warning", url + ": This editor may be a list of editors separated by  - ");
            return new EditorsRule5().run(url, doc);
        }

        String[] fragments = entry.html().split("<br />|<br clear=\"none\" />");
        if (fragments.length >= 2) {
            for (String fragment : fragments) {
                if (fragment.isEmpty() || fragment.equals("WHATWG:") || fragment.equals("W3C:")) {
                    continue;
                }
                Document parsed = Jsoup.parse(fragment.replaceAll("\n", ""));
                Person person = NameParser.parse(parsed.text());
                if (person == null) {
                    continue;
                }

                for (Element anchor : parsed.select("a")) {
                    String href = anchor.attr("href");
                    if (href.isEmpty()) {
                        continue;
                    }
                    if (href.contains("@")) {
                        person.setEmail(href.replace("mailto:", ""));
                    } else {
                        person.addWebsite(href);
                    }
                }
                people.add(person);
            }
        } else {
            String plain = entry.text();
            if (plain.equals("WHATWG:") || plain.equals("W3C:")) {
                continue;
            }
            Person person = NameParser.parse(plain);
            if (person == null) {
                continue;
            }

            for (Element anchor : entry.select("a")) {
                String href = anchor.attr("href");
                if (href.isEmpty()) {
                    continue;
                }
                if (href.contains("@")) {
                    person.setEmail(href.replace("mailto:", ""));
                } else {
                    person.addWebsite(href);
                }
            }
            people.add(person);
        }

        // The next dt opens another section, so the list of interest ends here.
        Element after = entry.nextElementSibling();
        if (after != null && after.tag().getName().equals("dt")) {
            break;
        }
    }

    return people.isEmpty() ? null : people;
}

From source file:net.pixomania.crawler.W3C.parser.rules.editors.EditorsRule2.java

/**
 * Collects the people listed in the {@code dd} entries that follow an "Editor" or
 * "Edition Editor" {@code dt} heading.
 *
 * <p>Headings that mention "version" or end with "draft:" are treated as foreign sections.
 * An entry is split on {@code <br>} markup; a single-part entry is parsed from its text,
 * a multi-part entry fragment by fragment. Text that looks like a pointer to another
 * section (for example "see acknowl...") is logged and ignored. If an entry contains more
 * than two {@code " - "} separators the whole page is delegated to {@code EditorsRule5}.
 * Processing stops at the first handled entry that is directly followed by another
 * {@code dt}.
 *
 * @param url address of the specification, used in log output and passed on when delegating
 * @param doc parsed specification page
 * @return the people found, or {@code null} when no entry matched or nobody could be parsed
 */
@Override
public ArrayList<Person> run(String url, Document doc) {
    Elements candidates = doc.select("dt:contains(Editor) ~ dd, dt:contains(Edition Editor) ~ dd");
    if (candidates.isEmpty()) {
        return null;
    }

    ArrayList<Person> people = new ArrayList<>();
    boolean skipping = false;
    for (Element entry : candidates) {
        Element heading = entry.previousElementSibling();
        if (heading.tagName().equals("dt")) {
            String headingText = heading.text().trim().toLowerCase();
            boolean editorHeading = headingText.startsWith("editor")
                    || headingText.startsWith("edition editor");
            if (!editorHeading || headingText.contains("version") || headingText.endsWith("draft:")) {
                skipping = true;
            }
        }

        if (skipping) {
            // Stop skipping once the next sibling looks like an editor heading again.
            Element following = entry.nextElementSibling();
            if (following != null) {
                String followingText = following.text().trim().toLowerCase();
                if (followingText.startsWith("editor") || followingText.contains("edition editor")) {
                    skipping = false;
                }
            }
            continue;
        }

        if (StringUtils.countMatches(entry.text(), " - ") > 2) {
            Log.log("warning", "This editor may be a list of editors separated by  - ");
            return new EditorsRule5().run(url, doc);
        }

        String[] fragments = entry.html().split("<br />|<br clear=\"none\" />");
        if (fragments.length >= 2) {
            for (String fragment : fragments) {
                if (fragment.isEmpty()) {
                    continue;
                }
                String lowered = fragment.toLowerCase();
                if (lowered.startsWith("(in alphabetic") || lowered.startsWith("see acknowl")
                        || lowered.startsWith("the w3") || lowered.startsWith("(see ac")
                        || lowered.startsWith("see participants") || lowered.contains("note:")) {
                    Log.log("warning", "Spec " + url + " may refer to a different section!");
                    continue;
                }
                if (fragment.equals("WHATWG:") || fragment.equals("W3C:")) {
                    continue;
                }
                Document parsed = Jsoup.parse(fragment.replaceAll("\n", ""));
                Person person = NameParser.parse(parsed.text());
                if (person == null) {
                    continue;
                }

                for (Element anchor : parsed.select("a")) {
                    String href = anchor.attr("href");
                    if (href.isEmpty()) {
                        continue;
                    }
                    if (href.contains("@")) {
                        person.setEmail(href.replace("mailto:", ""));
                    } else {
                        person.addWebsite(href);
                    }
                }
                people.add(person);
            }
        } else {
            String plain = entry.text();
            String lowered = plain.toLowerCase();
            if (lowered.startsWith("(in alphabetic") || lowered.startsWith("see acknowl")
                    || lowered.startsWith("the w3") || lowered.startsWith("(see ac")
                    || lowered.startsWith("see participants") || lowered.contains("note:")) {
                Log.log("warning", "Spec " + url + " may refer to a different section!");
                continue;
            }
            if (plain.equals("WHATWG:") || plain.equals("W3C:")) {
                continue;
            }
            Person person = NameParser.parse(plain);
            if (person == null) {
                continue;
            }

            for (Element anchor : entry.select("a")) {
                String href = anchor.attr("href");
                if (href.isEmpty()) {
                    continue;
                }
                if (href.contains("@")) {
                    person.setEmail(href.replace("mailto:", ""));
                } else {
                    person.addWebsite(href);
                }
            }
            people.add(person);
        }

        // The next dt opens another section, so the list of interest ends here.
        Element after = entry.nextElementSibling();
        if (after != null && after.tag().getName().equals("dt")) {
            break;
        }
    }

    return people.isEmpty() ? null : people;
}

From source file:net.pixomania.crawler.W3C.parser.rules.editors.EditorsRule8.java

/**
 * Collects the people listed in the {@code blockquote} entries that follow an "Editor"
 * {@code h4} heading.
 *
 * <p>Headings that end with "version:" or "draft:" are treated as foreign sections. An
 * entry is split on {@code <br>} markup; a single-part entry is parsed from its text, a
 * multi-part entry fragment by fragment. Text that looks like a pointer to another section
 * (for example "see acknowl...") is logged and ignored. If an entry contains more than two
 * {@code " - "} separators the whole page is delegated to {@code EditorsRule5}. Processing
 * stops at the first handled entry that is directly followed by another {@code h4}.
 *
 * @param url address of the specification, used in log output and passed on when delegating
 * @param doc parsed specification page
 * @return the people found, or {@code null} when no entry matched or nobody could be parsed
 */
@Override
public ArrayList<Person> run(String url, Document doc) {
    Elements candidates = doc.select("h4:contains(Editor) ~ blockquote");
    if (candidates.isEmpty()) {
        return null;
    }

    ArrayList<Person> people = new ArrayList<>();
    boolean skipping = false;
    for (Element entry : candidates) {
        Element heading = entry.previousElementSibling();
        if (heading.tagName().equals("h4")) {
            String headingText = heading.text().trim().toLowerCase();
            boolean editorHeading = headingText.startsWith("editor")
                    || headingText.startsWith("edition editor");
            if (!editorHeading || headingText.endsWith("version:") || headingText.endsWith("draft:")) {
                skipping = true;
            }
        }

        if (skipping) {
            // Stop skipping once the next sibling looks like an editor heading again.
            Element following = entry.nextElementSibling();
            if (following != null) {
                String followingText = following.text().trim().toLowerCase();
                if (followingText.startsWith("editor") || followingText.contains("edition editor")) {
                    skipping = false;
                }
            }
            continue;
        }

        if (StringUtils.countMatches(entry.text(), " - ") > 2) {
            Log.log("warning", "This editor may be a list of editors separated by  - ");
            return new EditorsRule5().run(url, doc);
        }

        String[] fragments = entry.html().split("<br />|<br clear=\"none\" />");
        if (fragments.length >= 2) {
            for (String fragment : fragments) {
                if (fragment.isEmpty()) {
                    continue;
                }
                String lowered = fragment.toLowerCase();
                if (lowered.startsWith("(in alphabetic") || lowered.startsWith("see acknowl")
                        || lowered.startsWith("the w3") || lowered.startsWith("(see ac")
                        || lowered.startsWith("see participants") || lowered.contains("note:")) {
                    Log.log("warning", "Spec " + url + " may refer to a different section!");
                    continue;
                }
                if (fragment.equals("WHATWG:") || fragment.equals("W3C:")) {
                    continue;
                }
                Document parsed = Jsoup.parse(fragment.replaceAll("\n", ""));
                Person person = NameParser.parse(parsed.text());
                if (person == null) {
                    continue;
                }

                for (Element anchor : parsed.select("a")) {
                    String href = anchor.attr("href");
                    if (href.isEmpty()) {
                        continue;
                    }
                    if (href.contains("@")) {
                        person.setEmail(href.replace("mailto:", ""));
                    } else {
                        person.addWebsite(href);
                    }
                }
                people.add(person);
            }
        } else {
            String plain = entry.text();
            String lowered = plain.toLowerCase();
            if (lowered.startsWith("(in alphabetic") || lowered.startsWith("see acknowl")
                    || lowered.startsWith("the w3") || lowered.startsWith("(see ac")
                    || lowered.startsWith("see participants") || lowered.contains("note:")) {
                Log.log("warning", "Spec " + url + " may refer to a different section!");
                continue;
            }
            if (plain.equals("WHATWG:") || plain.equals("W3C:")) {
                continue;
            }
            Person person = NameParser.parse(plain);
            if (person == null) {
                continue;
            }

            for (Element anchor : entry.select("a")) {
                String href = anchor.attr("href");
                if (href.isEmpty()) {
                    continue;
                }
                if (href.contains("@")) {
                    person.setEmail(href.replace("mailto:", ""));
                } else {
                    person.addWebsite(href);
                }
            }
            people.add(person);
        }

        // The next h4 opens another section, so the list of interest ends here.
        Element after = entry.nextElementSibling();
        if (after != null && after.tag().getName().equals("h4")) {
            break;
        }
    }

    return people.isEmpty() ? null : people;
}

From source file:net.pixomania.crawler.W3C.parser.rules.editors.version.VersionEditorRule1.java

/**
 * Collects the people listed under "version 1" {@code dt} headings, tagging each person
 * with the text of the heading they were found under.
 *
 * <p>The selection contains both the headings and the {@code dd} entries after them. A
 * heading updates the current version label; an entry is split on {@code <br>} markup and
 * parsed either from its text (single part) or fragment by fragment. Text that looks like
 * a pointer to another section (for example "see acknowl...") is logged and ignored.
 * Processing stops at the first handled entry that is directly followed by a {@code dt}
 * that does not start with "editors (version 1".
 *
 * @param url address of the specification, only used in log output
 * @param doc parsed specification page
 * @return the people found, or {@code null} when nothing matched or nobody could be parsed
 */
@Override
public ArrayList<Person> run(String url, Document doc) {
    Elements candidates = doc.select("dt:contains(version 1), dt:contains(version 1) ~ dd");
    if (candidates.isEmpty()) {
        return null;
    }

    ArrayList<Person> people = new ArrayList<>();
    boolean skipping = false;
    String version = "";
    for (Element entry : candidates) {
        // The skip handling only applies to elements that have a preceding sibling.
        Element heading = entry.previousElementSibling();
        if (heading != null) {
            if (heading.tagName().equals("dt")) {
                String headingText = heading.text().trim().toLowerCase();
                if (!(headingText.startsWith("version 1")
                        || headingText.startsWith("editors (version 1"))) {
                    skipping = true;
                }
            }

            if (skipping) {
                // Stop skipping once the next sibling is a version-1 heading again.
                Element following = entry.nextElementSibling();
                if (following != null) {
                    String followingText = following.text().trim().toLowerCase();
                    if (followingText.startsWith("version 1")
                            || followingText.startsWith("editors (version 1")) {
                        skipping = false;
                    }
                }
                continue;
            }
        }

        if (entry.tagName().equals("dt")) {
            // A heading only contributes the version label for the entries below it.
            version = entry.text();
            continue;
        }

        String[] fragments = entry.html().split("<br />|<br clear=\"none\" />");
        if (fragments.length >= 2) {
            for (String fragment : fragments) {
                if (fragment.isEmpty()) {
                    continue;
                }
                String lowered = fragment.toLowerCase();
                if (lowered.startsWith("(in alphabetic") || lowered.startsWith("see acknowl")
                        || lowered.startsWith("the w3") || lowered.startsWith("(see ac")
                        || lowered.startsWith("see participants") || lowered.contains("note:")) {
                    Log.log("warning", "Spec " + url + " may refer to a different section!");
                    continue;
                }
                if (fragment.equals("WHATWG:") || fragment.equals("W3C:")) {
                    continue;
                }
                Document parsed = Jsoup.parse(fragment.replaceAll("\n", ""));
                Person person = NameParser.parse(parsed.text());
                if (person == null) {
                    continue;
                }

                person.setVersion(version);
                for (Element anchor : parsed.select("a")) {
                    String href = anchor.attr("href");
                    if (href.isEmpty()) {
                        continue;
                    }
                    if (href.contains("@")) {
                        person.setEmail(href.replace("mailto:", ""));
                    } else {
                        person.addWebsite(href);
                    }
                }
                people.add(person);
            }
        } else {
            String plain = entry.text();
            String lowered = plain.toLowerCase();
            if (lowered.startsWith("(in alphabetic") || lowered.startsWith("see acknowl")
                    || lowered.startsWith("the w3") || lowered.startsWith("(see ac")
                    || lowered.startsWith("see participants") || lowered.contains("note:")) {
                Log.log("warning", "Spec " + url + " may refer to a different section!");
                continue;
            }
            if (plain.equals("WHATWG:") || plain.equals("W3C:")) {
                continue;
            }
            Person person = NameParser.parse(plain);
            if (person == null) {
                continue;
            }

            person.setVersion(version);
            for (Element anchor : entry.select("a")) {
                String href = anchor.attr("href");
                if (href.isEmpty()) {
                    continue;
                }
                if (href.contains("@")) {
                    person.setEmail(href.replace("mailto:", ""));
                } else {
                    person.addWebsite(href);
                }
            }
            people.add(person);
        }

        // Any dt other than an "editors (version 1" heading ends the list of interest.
        Element after = entry.nextElementSibling();
        if (after != null && after.tag().getName().equals("dt")
                && !after.text().trim().toLowerCase().startsWith("editors (version 1")) {
            break;
        }
    }

    return people.isEmpty() ? null : people;
}

From source file:com.jimplush.goose.ContentExtractor.java

/**
 * adds any siblings that may have a decent score to this node
 *
 * @param node/*from w  w w  . j  ava2 s  . c o  m*/
 * @return
 */
private Element addSiblings(Element node) {
    if (logger.isDebugEnabled()) {
        logger.debug("Starting to add siblings");
    }
    int baselineScoreForSiblingParagraphs = getBaselineScoreForSiblings(node);

    Element currentSibling = node.previousElementSibling();
    while (currentSibling != null) {
        if (logger.isDebugEnabled()) {
            logger.debug("SIBLINGCHECK: " + debugNode(currentSibling));
        }

        if (currentSibling.tagName().equals("p")) {

            node.child(0).before(currentSibling.outerHtml());
            currentSibling = currentSibling.previousElementSibling();
            continue;
        }

        // check for a paraph embedded in a containing element
        int insertedSiblings = 0;
        Elements potentialParagraphs = currentSibling.getElementsByTag("p");
        if (potentialParagraphs.first() == null) {
            currentSibling = currentSibling.previousElementSibling();
            continue;
        }
        for (Element firstParagraph : potentialParagraphs) {
            WordStats wordStats = StopWords.getStopWordCount(firstParagraph.text());

            int paragraphScore = wordStats.getStopWordCount();

            if ((float) (baselineScoreForSiblingParagraphs * .30) < paragraphScore) {
                if (logger.isDebugEnabled()) {
                    logger.debug("This node looks like a good sibling, adding it");
                }
                node.child(insertedSiblings).before("<p>" + firstParagraph.text() + "<p>");
                insertedSiblings++;
            }

        }

        currentSibling = currentSibling.previousElementSibling();
    }
    return node;

}

From source file:cn.edu.hfut.dmic.contentextractor.ContentExtractor.java

/**
 * Tries to determine the author by looking at the text around the publication time
 * ({@code srcTime}) in the document.
 *
 * <p>The candidates are tried in this order and the first non-blank result wins:
 * <ol>
 * <li>if {@code srcTime} is blank or not found in the body: the whole body HTML;</li>
 * <li>the pieces of the time element's own HTML before/after the time string;</li>
 * <li>the HTML of the nearest ancestor for which {@code noText} is false;</li>
 * <li>the closest previous sibling, then the closest next sibling, for which
 * {@code noText} is false;</li>
 * <li>that ancestor's HTML with the time string replaced by a space;</li>
 * <li>the whole body HTML, falling back to {@code author_bak}.</li>
 * </ol>
 *
 * @return the author text found, or {@code author_bak} if every strategy came up blank
 *         (the early whole-body lookups in step 1 return their result as is)
 * @throws XpathSyntaxErrorException declared by the original signature; presumably raised
 *                                   by {@code getAuthor(String)} - verify against that helper
 */
private String getAuthor() throws XpathSyntaxErrorException {
    String author = "";
    if (StringUtils.isBlank(srcTime)) {
        author = getAuthor(doc.body().html());
        return author;
    }
    Element cur = doc.body().select("*:containsOwn(" + srcTime + ")").first();
    if (cur == null) {
        LOG.warn("?srcTime=" + srcTime);
        author = getAuthor(doc.body().html());
        return author;
    }

    if (!noText(cur)) {
        // srcTime is literal text, not a pattern: quote it so characters such as '.' or
        // '(' are not interpreted as regex syntax.
        String arr[] = cur.html().split(java.util.regex.Pattern.quote(srcTime));
        for (String text : arr) {
            author = getShortText(text);
            if (!StringUtils.isBlank(author))
                return author;
        }
    }
    // Climb to the nearest ancestor that has text; cur ends up as its child on that path.
    Element parent = cur.parent();
    while (parent != null && noText(parent)) {
        cur = parent;
        parent = parent.parent();
    }
    // parent is null when the walk ran past the root without finding such an ancestor.
    if (parent != null) {
        author = getAuthor(parent.html());
        if (!StringUtils.isBlank(author))
            return author;
    }

    Element pre = cur.previousElementSibling();
    while (pre != null && noText(pre)) {
        pre = pre.previousElementSibling();
    }
    if (pre != null) {
        author = getShortText(pre.text());
    }
    if (!StringUtils.isBlank(author))
        return author;
    Element next = cur.nextElementSibling();
    while (next != null && noText(next)) {
        next = next.nextElementSibling();
    }
    if (next != null) {
        author = getShortText(next.text());
    }
    if (!StringUtils.isBlank(author))
        return author;

    if (parent != null) {
        author = getShortText(parent.html().replace(srcTime, " "));
        if (!StringUtils.isBlank(author))
            return author;
    }

    author = getAuthor(doc.body().html());
    if (StringUtils.isBlank(author)) {
        return author_bak;
    }
    return author;
}

From source file:org.asqatasun.rules.elementselector.LinkElementSelector.java

/**
 * /*from  www.j a v a2 s.  co m*/
 * @param element
 * @return whether one of the preceding sibling is of heading type
 */
private boolean isOneOfPrecedingSiblingofHeadingType(Element element) {
    Element prevElementSibling = element.previousElementSibling();
    while (prevElementSibling != null) {
        if (PREV_SIBLING_CONTEXT_ELEMENTS.contains(prevElementSibling.tagName())
                || !prevElementSibling.select(CssLikeQueryStore.HEADINGS_CSS_LIKE_QUERY).isEmpty()) {
            return true;
        }
        prevElementSibling = prevElementSibling.previousElementSibling();
    }
    return false;
}