Example usage for org.jsoup.nodes Element tag

List of usage examples for org.jsoup.nodes Element tag

Introduction

On this page you can find example usages of the org.jsoup.nodes Element tag method.

Prototype

public Tag tag()

To view the source code for the org.jsoup.nodes Element tag method, click Source Link.

Usage

From source file:net.pixomania.crawler.W3C.parser.rules.editors.EditorsRule7.java

/**
 * Collects the people listed in the {@code dd} entries that follow an
 * "Authors/Editors" or "Author/Editor" {@code dt} heading.
 *
 * <p>An entry whose text contains more than two " - " separators is treated as a whole
 * list of editors; in that case the complete document is handed to {@code EditorsRule5}
 * and its result is returned instead.
 *
 * @param url address of the document; used in log output and forwarded to {@code EditorsRule5}
 * @param doc parsed document to search
 * @return the parsed persons, or {@code null} if no entry matched or none could be parsed
 */
@Override
public ArrayList<Person> run(String url, Document doc) {
    ArrayList<Person> editorList = new ArrayList<>();

    Elements editors = doc.select("dt:contains(Authors/Editors) ~ dd, dt:contains(Author/Editor) ~ dd");
    if (editors.isEmpty()) {
        return null;
    }

    boolean skip = false;
    for (Element editor : editors) {
        // A dd directly below a dt that is not an author/editor heading belongs to another section.
        Element prev = editor.previousElementSibling();
        if (prev != null && prev.tagName().equals("dt")) {
            // Locale.ROOT keeps the keyword comparison independent of the JVM default locale.
            String heading = prev.text().trim().toLowerCase(java.util.Locale.ROOT);
            if (!heading.startsWith("authors/editors") && !heading.startsWith("author/editor")) {
                skip = true;
            }
        }

        if (skip) {
            // Stop skipping once the following sibling is an author/editor heading again.
            Element following = editor.nextElementSibling();
            if (following != null) {
                String followingText = following.text().trim().toLowerCase(java.util.Locale.ROOT);
                if (followingText.startsWith("authors/editors")
                        || followingText.startsWith("author/editor")) {
                    skip = false;
                }
            }
            continue;
        }

        String text = editor.text();
        if (StringUtils.countMatches(text, " - ") > 2) {
            Log.log("warning", url + ": This editor may be a list of editors separated by  - ");
            return new EditorsRule5().run(url, doc);
        }

        // Several people may share one dd, separated by line breaks.
        String[] splitted = editor.html().split("<br />|<br clear=\"none\" />");

        if (splitted.length < 2) {
            if (text.equals("WHATWG:") || text.equals("W3C:")) {
                continue;
            }
            Person result = NameParser.parse(text);
            if (result == null) {
                continue;
            }

            // Query the anchors once; hrefs containing '@' are stored as e-mail, all others as websites.
            for (Element anchor : editor.select("a")) {
                String href = anchor.attr("href");
                if (href.isEmpty()) {
                    continue;
                }
                if (href.contains("@")) {
                    result.setEmail(href.replace("mailto:", ""));
                } else {
                    result.addWebsite(href);
                }
            }

            editorList.add(result);
        } else {
            for (String split : splitted) {
                if (split.isEmpty()) {
                    continue;
                }
                if (split.equals("WHATWG:") || split.equals("W3C:")) {
                    continue;
                }
                // Parse the fragment on its own so text and links belong to this person only.
                Document newdoc = Jsoup.parse(split.replace("\n", ""));
                Person result = NameParser.parse(newdoc.text());
                if (result == null) {
                    continue;
                }

                for (Element anchor : newdoc.select("a")) {
                    String href = anchor.attr("href");
                    if (href.isEmpty()) {
                        continue;
                    }
                    if (href.contains("@")) {
                        result.setEmail(href.replace("mailto:", ""));
                    } else {
                        result.addWebsite(href);
                    }
                }

                editorList.add(result);
            }
        }

        // The next dt starts a new definition-list section, so the editor block ends here.
        Element upcoming = editor.nextElementSibling();
        if (upcoming != null && upcoming.tag().getName().equals("dt")) {
            break;
        }
    }

    return editorList.isEmpty() ? null : editorList;
}

From source file:net.pixomania.crawler.W3C.parser.rules.editors.EditorsRule2.java

/**
 * Collects the people listed in the {@code dd} entries that follow an "Editor" or
 * "Edition Editor" {@code dt} heading.
 *
 * <p>Headings that mention "version" or end with "draft:" are skipped, as are entries
 * that look like pointers to another section (for example "see acknowledgements").
 * An entry whose text contains more than two " - " separators is treated as a whole
 * list of editors; in that case the complete document is handed to {@code EditorsRule5}
 * and its result is returned instead.
 *
 * @param url address of the document; used in log output and forwarded to {@code EditorsRule5}
 * @param doc parsed document to search
 * @return the parsed persons, or {@code null} if no entry matched or none could be parsed
 */
@Override
public ArrayList<Person> run(String url, Document doc) {
    ArrayList<Person> editorList = new ArrayList<>();

    Elements editors = doc.select("dt:contains(Editor) ~ dd, dt:contains(Edition Editor) ~ dd");
    if (editors.isEmpty()) {
        return null;
    }

    boolean skip = false;
    for (Element editor : editors) {
        // A dd directly below a dt that is not a plain editor heading belongs to another section.
        Element prev = editor.previousElementSibling();
        if (prev != null && prev.tagName().equals("dt")) {
            // Locale.ROOT keeps the keyword comparison independent of the JVM default locale.
            String heading = prev.text().trim().toLowerCase(java.util.Locale.ROOT);
            boolean editorHeading = heading.startsWith("editor") || heading.startsWith("edition editor");
            if (!editorHeading || heading.contains("version") || heading.endsWith("draft:")) {
                skip = true;
            }
        }

        if (skip) {
            // Stop skipping once the following sibling is an editor heading again.
            Element following = editor.nextElementSibling();
            if (following != null) {
                String followingText = following.text().trim().toLowerCase(java.util.Locale.ROOT);
                if (followingText.startsWith("editor") || followingText.contains("edition editor")) {
                    skip = false;
                }
            }
            continue;
        }

        String text = editor.text();
        if (StringUtils.countMatches(text, " - ") > 2) {
            Log.log("warning", "This editor may be a list of editors separated by  - ");
            return new EditorsRule5().run(url, doc);
        }

        // Several people may share one dd, separated by line breaks.
        String[] splitted = editor.html().split("<br />|<br clear=\"none\" />");

        if (splitted.length < 2) {
            String lowered = text.toLowerCase(java.util.Locale.ROOT);
            if (lowered.startsWith("(in alphabetic")
                    || lowered.startsWith("see acknowl")
                    || lowered.startsWith("the w3")
                    || lowered.startsWith("(see ac")
                    || lowered.startsWith("see participants")
                    || lowered.contains("note:")) {
                Log.log("warning", "Spec " + url + " may refer to a different section!");
                continue;
            }
            if (text.equals("WHATWG:") || text.equals("W3C:")) {
                continue;
            }
            Person result = NameParser.parse(text);
            if (result == null) {
                continue;
            }

            // Query the anchors once; hrefs containing '@' are stored as e-mail, all others as websites.
            for (Element anchor : editor.select("a")) {
                String href = anchor.attr("href");
                if (href.isEmpty()) {
                    continue;
                }
                if (href.contains("@")) {
                    result.setEmail(href.replace("mailto:", ""));
                } else {
                    result.addWebsite(href);
                }
            }

            editorList.add(result);
        } else {
            for (String split : splitted) {
                if (split.isEmpty()) {
                    continue;
                }
                String loweredSplit = split.toLowerCase(java.util.Locale.ROOT);
                if (loweredSplit.startsWith("(in alphabetic")
                        || loweredSplit.startsWith("see acknowl")
                        || loweredSplit.startsWith("the w3")
                        || loweredSplit.startsWith("(see ac")
                        || loweredSplit.startsWith("see participants")
                        || loweredSplit.contains("note:")) {
                    Log.log("warning", "Spec " + url + " may refer to a different section!");
                    continue;
                }
                if (split.equals("WHATWG:") || split.equals("W3C:")) {
                    continue;
                }
                // Parse the fragment on its own so text and links belong to this person only.
                Document newdoc = Jsoup.parse(split.replace("\n", ""));
                Person result = NameParser.parse(newdoc.text());
                if (result == null) {
                    continue;
                }

                for (Element anchor : newdoc.select("a")) {
                    String href = anchor.attr("href");
                    if (href.isEmpty()) {
                        continue;
                    }
                    if (href.contains("@")) {
                        result.setEmail(href.replace("mailto:", ""));
                    } else {
                        result.addWebsite(href);
                    }
                }

                editorList.add(result);
            }
        }

        // The next dt starts a new definition-list section, so the editor block ends here.
        Element upcoming = editor.nextElementSibling();
        if (upcoming != null && upcoming.tag().getName().equals("dt")) {
            break;
        }
    }

    return editorList.isEmpty() ? null : editorList;
}

From source file:net.pixomania.crawler.W3C.parser.rules.editors.EditorsRule8.java

/**
 * Collects the people listed in the {@code blockquote} elements that follow an "Editor"
 * or "Edition Editor" {@code h4} heading.
 *
 * <p>Headings that end with "version:" or "draft:" are skipped, as are entries that look
 * like pointers to another section (for example "see acknowledgements"). An entry whose
 * text contains more than two " - " separators is treated as a whole list of editors; in
 * that case the complete document is handed to {@code EditorsRule5} and its result is
 * returned instead.
 *
 * @param url address of the document; used in log output and forwarded to {@code EditorsRule5}
 * @param doc parsed document to search
 * @return the parsed persons, or {@code null} if no entry matched or none could be parsed
 */
@Override
public ArrayList<Person> run(String url, Document doc) {
    ArrayList<Person> editorList = new ArrayList<>();

    Elements editors = doc.select("h4:contains(Editor) ~ blockquote");
    if (editors.isEmpty()) {
        return null;
    }

    boolean skip = false;
    for (Element editor : editors) {
        // A blockquote directly below an h4 that is not a plain editor heading belongs to another section.
        Element prev = editor.previousElementSibling();
        if (prev != null && prev.tagName().equals("h4")) {
            // Locale.ROOT keeps the keyword comparison independent of the JVM default locale.
            String heading = prev.text().trim().toLowerCase(java.util.Locale.ROOT);
            boolean editorHeading = heading.startsWith("editor") || heading.startsWith("edition editor");
            if (!editorHeading || heading.endsWith("version:") || heading.endsWith("draft:")) {
                skip = true;
            }
        }

        if (skip) {
            // Stop skipping once the following sibling is an editor heading again.
            Element following = editor.nextElementSibling();
            if (following != null) {
                String followingText = following.text().trim().toLowerCase(java.util.Locale.ROOT);
                if (followingText.startsWith("editor") || followingText.contains("edition editor")) {
                    skip = false;
                }
            }
            continue;
        }

        String text = editor.text();
        if (StringUtils.countMatches(text, " - ") > 2) {
            Log.log("warning", "This editor may be a list of editors separated by  - ");
            return new EditorsRule5().run(url, doc);
        }

        // Several people may share one blockquote, separated by line breaks.
        String[] splitted = editor.html().split("<br />|<br clear=\"none\" />");

        if (splitted.length < 2) {
            String lowered = text.toLowerCase(java.util.Locale.ROOT);
            if (lowered.startsWith("(in alphabetic")
                    || lowered.startsWith("see acknowl")
                    || lowered.startsWith("the w3")
                    || lowered.startsWith("(see ac")
                    || lowered.startsWith("see participants")
                    || lowered.contains("note:")) {
                Log.log("warning", "Spec " + url + " may refer to a different section!");
                continue;
            }
            if (text.equals("WHATWG:") || text.equals("W3C:")) {
                continue;
            }
            Person result = NameParser.parse(text);
            if (result == null) {
                continue;
            }

            // Query the anchors once; hrefs containing '@' are stored as e-mail, all others as websites.
            for (Element anchor : editor.select("a")) {
                String href = anchor.attr("href");
                if (href.isEmpty()) {
                    continue;
                }
                if (href.contains("@")) {
                    result.setEmail(href.replace("mailto:", ""));
                } else {
                    result.addWebsite(href);
                }
            }

            editorList.add(result);
        } else {
            for (String split : splitted) {
                if (split.isEmpty()) {
                    continue;
                }
                String loweredSplit = split.toLowerCase(java.util.Locale.ROOT);
                if (loweredSplit.startsWith("(in alphabetic")
                        || loweredSplit.startsWith("see acknowl")
                        || loweredSplit.startsWith("the w3")
                        || loweredSplit.startsWith("(see ac")
                        || loweredSplit.startsWith("see participants")
                        || loweredSplit.contains("note:")) {
                    Log.log("warning", "Spec " + url + " may refer to a different section!");
                    continue;
                }
                if (split.equals("WHATWG:") || split.equals("W3C:")) {
                    continue;
                }
                // Parse the fragment on its own so text and links belong to this person only.
                Document newdoc = Jsoup.parse(split.replace("\n", ""));
                Person result = NameParser.parse(newdoc.text());
                if (result == null) {
                    continue;
                }

                for (Element anchor : newdoc.select("a")) {
                    String href = anchor.attr("href");
                    if (href.isEmpty()) {
                        continue;
                    }
                    if (href.contains("@")) {
                        result.setEmail(href.replace("mailto:", ""));
                    } else {
                        result.addWebsite(href);
                    }
                }

                editorList.add(result);
            }
        }

        // The next h4 starts a new section, so the editor block ends here.
        Element upcoming = editor.nextElementSibling();
        if (upcoming != null && upcoming.tag().getName().equals("h4")) {
            break;
        }
    }

    return editorList.isEmpty() ? null : editorList;
}

From source file:net.pixomania.crawler.W3C.parser.rules.editors.version.VersionEditorRule1.java

/**
 * Collects the people listed under "version 1" headings of a definition list.
 *
 * <p>Both the matching {@code dt} headings and the {@code dd} entries after them are
 * visited: the text of the most recent {@code dt} is remembered and stored as the
 * version of every person parsed from the {@code dd} entries that follow it. Entries
 * that look like pointers to another section (for example "see acknowledgements") are
 * skipped.
 *
 * @param url address of the document; used in log output only
 * @param doc parsed document to search
 * @return the parsed persons, or {@code null} if no entry matched or none could be parsed
 */
@Override
public ArrayList<Person> run(String url, Document doc) {
    ArrayList<Person> editorList = new ArrayList<>();

    Elements editors = doc.select("dt:contains(version 1), dt:contains(version 1) ~ dd");
    if (editors.isEmpty()) {
        return null;
    }

    boolean skip = false;
    String version = "";
    for (Element editor : editors) {
        Element prev = editor.previousElementSibling();
        // The skip handling only applies to elements that have a preceding sibling.
        if (prev != null) {
            if (prev.tagName().equals("dt")) {
                // Locale.ROOT keeps the keyword comparison independent of the JVM default locale.
                String heading = prev.text().trim().toLowerCase(java.util.Locale.ROOT);
                if (!heading.startsWith("version 1") && !heading.startsWith("editors (version 1")) {
                    skip = true;
                }
            }

            if (skip) {
                // Stop skipping once the following sibling is a "version 1" heading again.
                Element following = editor.nextElementSibling();
                if (following != null) {
                    String followingText = following.text().trim().toLowerCase(java.util.Locale.ROOT);
                    if (followingText.startsWith("version 1")
                            || followingText.startsWith("editors (version 1")) {
                        skip = false;
                    }
                }
                continue;
            }
        }

        // A heading only updates the version label applied to the entries that follow it.
        if (editor.tagName().equals("dt")) {
            version = editor.text();
            continue;
        }

        // Several people may share one dd, separated by line breaks.
        String[] splitted = editor.html().split("<br />|<br clear=\"none\" />");

        if (splitted.length < 2) {
            String text = editor.text();
            String lowered = text.toLowerCase(java.util.Locale.ROOT);
            if (lowered.startsWith("(in alphabetic")
                    || lowered.startsWith("see acknowl")
                    || lowered.startsWith("the w3")
                    || lowered.startsWith("(see ac")
                    || lowered.startsWith("see participants")
                    || lowered.contains("note:")) {
                Log.log("warning", "Spec " + url + " may refer to a different section!");
                continue;
            }
            if (text.equals("WHATWG:") || text.equals("W3C:")) {
                continue;
            }
            Person result = NameParser.parse(text);
            if (result == null) {
                continue;
            }

            result.setVersion(version);
            // Query the anchors once; hrefs containing '@' are stored as e-mail, all others as websites.
            for (Element anchor : editor.select("a")) {
                String href = anchor.attr("href");
                if (href.isEmpty()) {
                    continue;
                }
                if (href.contains("@")) {
                    result.setEmail(href.replace("mailto:", ""));
                } else {
                    result.addWebsite(href);
                }
            }

            editorList.add(result);
        } else {
            for (String split : splitted) {
                if (split.isEmpty()) {
                    continue;
                }
                String loweredSplit = split.toLowerCase(java.util.Locale.ROOT);
                if (loweredSplit.startsWith("(in alphabetic")
                        || loweredSplit.startsWith("see acknowl")
                        || loweredSplit.startsWith("the w3")
                        || loweredSplit.startsWith("(see ac")
                        || loweredSplit.startsWith("see participants")
                        || loweredSplit.contains("note:")) {
                    Log.log("warning", "Spec " + url + " may refer to a different section!");
                    continue;
                }
                if (split.equals("WHATWG:") || split.equals("W3C:")) {
                    continue;
                }
                // Parse the fragment on its own so text and links belong to this person only.
                Document newdoc = Jsoup.parse(split.replace("\n", ""));
                Person result = NameParser.parse(newdoc.text());
                if (result == null) {
                    continue;
                }

                result.setVersion(version);
                for (Element anchor : newdoc.select("a")) {
                    String href = anchor.attr("href");
                    if (href.isEmpty()) {
                        continue;
                    }
                    if (href.contains("@")) {
                        result.setEmail(href.replace("mailto:", ""));
                    } else {
                        result.addWebsite(href);
                    }
                }

                editorList.add(result);
            }
        }

        // Any dt other than an "editors (version 1" heading ends the block.
        Element upcoming = editor.nextElementSibling();
        if (upcoming != null
                && upcoming.tag().getName().equals("dt")
                && !upcoming.text().trim().toLowerCase(java.util.Locale.ROOT)
                        .startsWith("editors (version 1")) {
            break;
        }
    }

    return editorList.isEmpty() ? null : editorList;
}

From source file:org.structr.web.common.microformat.MicroformatParser.java

/**
 * Walks an element tree and copies class-name-keyed content into {@code values}.
 *
 * <p>Elements without class names are ignored, including their children. For block-level
 * tags, the top-level element (depth 0) stores its class names under {@code "type"} and
 * its children are written into the same map; deeper block elements get a nested map
 * keyed by their first class name. For inline tags, the extracted content is stored
 * once per class name, and an {@code href} on an element with class {@code url} is
 * stored under {@code "url"}.
 *
 * @param element element to inspect
 * @param values  map that receives the extracted entries
 * @param depth   nesting level; 0 for the element the traversal started from
 */
private void recurse(final Element element, final Map<String, Object> values, final int depth) {

    final Tag tag = element.tag();
    final Set<String> classes = element.classNames();
    final String link = element.attr("href");
    final Object content = extractChildContent(element);

    if (classes.isEmpty()) {
        return;
    }

    removeEmpty(classes);

    // NOTE(review): removeEmpty presumably drops blank class names, which can leave the set
    // empty. Re-check so classes.iterator().next() below cannot throw NoSuchElementException;
    // such an element is then treated like one without any class names.
    if (classes.isEmpty()) {
        return;
    }

    // toplevel classes define type
    if (tag.isBlock()) {

        if (depth == 0) {

            // store type attribute
            values.put("type", classes);

            for (final Element child : element.children()) {
                recurse(child, values, depth + 1);
            }

        } else {

            // nested block elements get their own map, keyed by the first class name
            final Map<String, Object> childMap = new LinkedHashMap<>();
            values.put(classes.iterator().next(), childMap);

            if (content != null) {
                childMap.put("name", content);
            }

            for (final Element child : element.children()) {
                recurse(child, childMap, depth + 1);
            }
        }

    } else if (tag.isInline()) {

        // extract href and store as URL; "url" is removed so it is not also stored as content below
        if (classes.contains("url") && StringUtils.isNotBlank(link)) {

            values.put("url", link);
            classes.remove("url");
        }

        if (content != null) {

            for (final String type : classes) {
                values.put(type, content);
            }
        }
    }
}

From source file:org.tinymediamanager.scraper.imdb.ImdbMetadataProvider.java

@Override
public MediaMetadata getMetadata(MediaScrapeOptions options) throws Exception {
    LOGGER.debug("getMetadata() " + options.toString());
    // check if there is a md in the result
    if (options.getResult() != null && options.getResult().getMetadata() != null) {
        LOGGER.debug("IMDB: getMetadata from cache: " + options.getResult());
        return options.getResult().getMetadata();
    }/*from  w w w  .j  av a2s  . c om*/

    MediaMetadata md = new MediaMetadata(providerInfo.getId());
    String imdbId = "";

    // imdbId from searchResult
    if (options.getResult() != null) {
        imdbId = options.getResult().getIMDBId();
    }

    // imdbid from scraper option
    if (!MetadataUtil.isValidImdbId(imdbId)) {
        imdbId = options.getImdbId();
    }

    if (!MetadataUtil.isValidImdbId(imdbId)) {
        return md;
    }

    LOGGER.debug("IMDB: getMetadata(imdbId): " + imdbId);
    md.setId(MediaMetadata.IMDBID, imdbId);

    ExecutorCompletionService<Document> compSvcImdb = new ExecutorCompletionService<Document>(executor);
    ExecutorCompletionService<MediaMetadata> compSvcTmdb = new ExecutorCompletionService<MediaMetadata>(
            executor);

    // worker for imdb request (/combined) (everytime from akas.imdb.com)
    // StringBuilder sb = new StringBuilder(imdbSite.getSite());
    StringBuilder sb = new StringBuilder(ImdbSiteDefinition.IMDB_COM.getSite());
    sb.append("title/");
    sb.append(imdbId);
    sb.append("/combined");
    Callable<Document> worker = new ImdbWorker(sb.toString(), options.getLanguage().name(),
            options.getCountry().getAlpha2());
    Future<Document> futureCombined = compSvcImdb.submit(worker);

    // worker for imdb request (/plotsummary) (from chosen site)
    Future<Document> futurePlotsummary = null;
    sb = new StringBuilder(imdbSite.getSite());
    sb.append("title/");
    sb.append(imdbId);
    sb.append("/plotsummary");

    worker = new ImdbWorker(sb.toString(), options.getLanguage().name(), options.getCountry().getAlpha2());
    futurePlotsummary = compSvcImdb.submit(worker);

    // worker for tmdb request
    Future<MediaMetadata> futureTmdb = null;
    if (options.isScrapeImdbForeignLanguage() || options.isScrapeCollectionInfo()) {
        Callable<MediaMetadata> worker2 = new TmdbWorker(imdbId, options.getLanguage(), options.getCountry());
        futureTmdb = compSvcTmdb.submit(worker2);
    }

    Document doc;
    doc = futureCombined.get();

    /*
     * title and year have the following structure
     * 
     * <div id="tn15title"><h1>Merida - Legende der Highlands <span>(<a href="/year/2012/">2012</a>) <span class="pro-link">...</span> <span
     * class="title-extra">Brave <i>(original title)</i></span> </span></h1> </div>
     */

    // parse title and year
    Element title = doc.getElementById("tn15title");
    if (title != null) {
        Element element = null;
        // title
        Elements elements = title.getElementsByTag("h1");
        if (elements.size() > 0) {
            element = elements.first();
            String movieTitle = cleanString(element.ownText());
            md.storeMetadata(MediaMetadata.TITLE, movieTitle);
        }

        // year
        elements = title.getElementsByTag("span");
        if (elements.size() > 0) {
            element = elements.first();
            String content = element.text();

            // search year
            Pattern yearPattern = Pattern.compile("\\(([0-9]{4})|/\\)");
            Matcher matcher = yearPattern.matcher(content);
            while (matcher.find()) {
                if (matcher.group(1) != null) {
                    String movieYear = matcher.group(1);
                    md.storeMetadata(MediaMetadata.YEAR, movieYear);
                    break;
                }
            }
        }

        // original title
        elements = title.getElementsByAttributeValue("class", "title-extra");
        if (elements.size() > 0) {
            element = elements.first();
            String content = element.text();
            content = content.replaceAll("\\(original title\\)", "").trim();
            md.storeMetadata(MediaMetadata.ORIGINAL_TITLE, content);
        }
    }

    // poster
    Element poster = doc.getElementById("primary-poster");
    if (poster != null) {
        String posterUrl = poster.attr("src");
        posterUrl = posterUrl.replaceAll("SX[0-9]{2,4}_", "SX400_");
        posterUrl = posterUrl.replaceAll("SY[0-9]{2,4}_", "SY400_");
        processMediaArt(md, MediaArtworkType.POSTER, "Poster", posterUrl);
    }

    /*
     * <div class="starbar-meta"> <b>7.4/10</b> &nbsp;&nbsp;<a href="ratings" class="tn15more">52,871 votes</a>&nbsp;&raquo; </div>
     */

    // rating and rating count
    Element ratingElement = doc.getElementById("tn15rating");
    if (ratingElement != null) {
        Elements elements = ratingElement.getElementsByClass("starbar-meta");
        if (elements.size() > 0) {
            Element div = elements.get(0);

            // rating comes in <b> tag
            Elements b = div.getElementsByTag("b");
            if (b.size() == 1) {
                String ratingAsString = b.text();
                Pattern ratingPattern = Pattern.compile("([0-9]\\.[0-9])/10");
                Matcher matcher = ratingPattern.matcher(ratingAsString);
                while (matcher.find()) {
                    if (matcher.group(1) != null) {
                        float rating = 0;
                        try {
                            rating = Float.valueOf(matcher.group(1));
                        } catch (Exception e) {
                        }
                        md.storeMetadata(MediaMetadata.RATING, rating);
                        break;
                    }
                }
            }

            // count
            Elements a = div.getElementsByAttributeValue("href", "ratings");
            if (a.size() == 1) {
                String countAsString = a.text().replaceAll("[.,]|votes", "").trim();
                int voteCount = 0;
                try {
                    voteCount = Integer.parseInt(countAsString);
                } catch (Exception e) {
                }
                md.storeMetadata(MediaMetadata.VOTE_COUNT, voteCount);
            }
        }

        // top250
        elements = ratingElement.getElementsByClass("starbar-special");
        if (elements.size() > 0) {
            Elements a = elements.get(0).getElementsByTag("a");
            if (a.size() > 0) {
                Element anchor = a.get(0);
                Pattern topPattern = Pattern.compile("Top 250: #([0-9]{1,3})");
                Matcher matcher = topPattern.matcher(anchor.ownText());
                while (matcher.find()) {
                    if (matcher.group(1) != null) {
                        int top250 = 0;
                        try {
                            top250 = Integer.parseInt(matcher.group(1));
                        } catch (Exception e) {
                        }
                        md.storeMetadata(MediaMetadata.TOP_250, top250);
                    }
                }
            }
        }
    }

    // parse all items coming by <div class="info">
    Elements elements = doc.getElementsByClass("info");
    for (Element element : elements) {
        // only parse divs
        if (!"div".equals(element.tag().getName())) {
            continue;
        }

        // elements with h5 are the titles of the values
        Elements h5 = element.getElementsByTag("h5");
        if (h5.size() > 0) {
            Element firstH5 = h5.first();
            String h5Title = firstH5.text();

            // release date
            /*
             * <div class="info"><h5>Release Date:</h5><div class="info-content">5 January 1996 (USA)<a class="tn15more inline"
             * href="/title/tt0114746/releaseinfo"
             * onclick="(new Image()).src='/rg/title-tease/releasedates/images/b.gif?link=/title/tt0114746/releaseinfo';"> See more</a>&nbsp;</div></div>
             */
            if (h5Title.matches("(?i)" + ImdbSiteDefinition.IMDB_COM.getReleaseDate() + ".*")) {
                Elements div = element.getElementsByClass("info-content");
                if (div.size() > 0) {
                    Element releaseDateElement = div.first();
                    String releaseDate = cleanString(releaseDateElement.ownText().replaceAll("", ""));
                    Pattern pattern = Pattern.compile("(.*)\\(.*\\)");
                    Matcher matcher = pattern.matcher(releaseDate);
                    if (matcher.find()) {
                        try {
                            SimpleDateFormat sdf = new SimpleDateFormat("d MMM yyyy");
                            Date parsedDate = sdf.parse(matcher.group(1));
                            sdf = new SimpleDateFormat("dd-MM-yyyy");
                            md.storeMetadata(MediaMetadata.RELEASE_DATE, sdf.format(parsedDate));
                        } catch (Exception e) {
                        }
                    }
                }
            }

            /*
             * <div class="info"><h5>Tagline:</h5><div class="info-content"> (7) To Defend Us... <a class="tn15more inline"
             * href="/title/tt0472033/taglines" onClick= "(new Image()).src='/rg/title-tease/taglines/images/b.gif?link=/title/tt0472033/taglines';" >See
             * more</a>&nbsp;&raquo; </div></div>
             */
            // tagline
            if (h5Title.matches("(?i)" + ImdbSiteDefinition.IMDB_COM.getTagline() + ".*")
                    && !options.isScrapeImdbForeignLanguage()) {
                Elements div = element.getElementsByClass("info-content");
                if (div.size() > 0) {
                    Element taglineElement = div.first();
                    String tagline = cleanString(taglineElement.ownText().replaceAll("", ""));
                    md.storeMetadata(MediaMetadata.TAGLINE, tagline);
                }
            }

            /*
             * <div class="info-content"><a href="/Sections/Genres/Animation/">Animation</a> | <a href="/Sections/Genres/Action/">Action</a> | <a
             * href="/Sections/Genres/Adventure/">Adventure</a> | <a href="/Sections/Genres/Fantasy/">Fantasy</a> | <a
             * href="/Sections/Genres/Mystery/">Mystery</a> | <a href="/Sections/Genres/Sci-Fi/">Sci-Fi</a> | <a
             * href="/Sections/Genres/Thriller/">Thriller</a> <a class="tn15more inline" href="/title/tt0472033/keywords" onClick=
             * "(new Image()).src='/rg/title-tease/keywords/images/b.gif?link=/title/tt0472033/keywords';" > See more</a>&nbsp;&raquo; </div>
             */
            // genres are only scraped from akas.imdb.com
            if (h5Title.matches("(?i)" + imdbSite.getGenre() + "(.*)")) {
                Elements div = element.getElementsByClass("info-content");
                if (div.size() > 0) {
                    Elements a = div.first().getElementsByTag("a");
                    for (Element anchor : a) {
                        if (anchor.attr("href").matches("/Sections/Genres/.*")) {
                            md.addGenre(getTmmGenre(anchor.ownText()));
                        }
                    }
                }
            }
            // }

            /*
             * <div class="info"><h5>Runtime:</h5><div class="info-content">162 min | 171 min (special edition) | 178 min (extended cut)</div></div>
             */
            // runtime
            // if (h5Title.matches("(?i)" + imdbSite.getRuntime() + ".*")) {
            if (h5Title.matches("(?i)" + ImdbSiteDefinition.IMDB_COM.getRuntime() + ".*")) {
                Elements div = element.getElementsByClass("info-content");
                if (div.size() > 0) {
                    Element taglineElement = div.first();
                    String first = taglineElement.ownText().split("\\|")[0];
                    String runtimeAsString = cleanString(first.replaceAll("min", ""));
                    int runtime = 0;
                    try {
                        runtime = Integer.parseInt(runtimeAsString);
                    } catch (Exception e) {
                        // try to filter out the first number we find
                        Pattern runtimePattern = Pattern.compile("([0-9]{2,3})");
                        Matcher matcher = runtimePattern.matcher(runtimeAsString);
                        if (matcher.find()) {
                            runtime = Integer.parseInt(matcher.group(0));
                        }
                    }
                    md.storeMetadata(MediaMetadata.RUNTIME, runtime);
                }
            }

            /*
             * <div class="info"><h5>Country:</h5><div class="info-content"><a href="/country/fr">France</a> | <a href="/country/es">Spain</a> | <a
             * href="/country/it">Italy</a> | <a href="/country/hu">Hungary</a></div></div>
             */
            // country
            if (h5Title.matches("(?i)Country.*")) {
                Elements a = element.getElementsByTag("a");
                String countries = "";
                for (Element anchor : a) {
                    Pattern pattern = Pattern.compile("/country/(.*)");
                    Matcher matcher = pattern.matcher(anchor.attr("href"));
                    if (matcher.matches()) {
                        String country = matcher.group(1);
                        if (StringUtils.isNotEmpty(countries)) {
                            countries += ", ";
                        }
                        countries += country.toUpperCase();
                    }
                }
                md.storeMetadata(MediaMetadata.COUNTRY, countries);
            }

            /*
             * <div class="info"><h5>Language:</h5><div class="info-content"><a href="/language/en">English</a> | <a href="/language/de">German</a> | <a
             * href="/language/fr">French</a> | <a href="/language/it">Italian</a></div>
             */
            // Spoken languages
            if (h5Title.matches("(?i)Language.*")) {
                Elements a = element.getElementsByTag("a");
                String spokenLanguages = "";
                for (Element anchor : a) {
                    Pattern pattern = Pattern.compile("/language/(.*)");
                    Matcher matcher = pattern.matcher(anchor.attr("href"));
                    if (matcher.matches()) {
                        String langu = matcher.group(1);
                        if (StringUtils.isNotEmpty(spokenLanguages)) {
                            spokenLanguages += ", ";
                        }
                        spokenLanguages += langu;
                    }
                }
                md.storeMetadata(MediaMetadata.SPOKEN_LANGUAGES, spokenLanguages);
            }

            /*
             * <div class="info"><h5>Certification:</h5><div class="info-content"><a href="/search/title?certificates=us:pg">USA:PG</a> <i>(certificate
             * #47489)</i> | <a href="/search/title?certificates=ca:pg">Canada:PG</a> <i>(Ontario)</i> | <a
             * href="/search/title?certificates=au:pg">Australia:PG</a> | <a href="/search/title?certificates=in:u">India:U</a> | <a
             * href="/search/title?certificates=ie:pg">Ireland:PG</a> ...</div></div>
             */
            // certification
            // if (h5Title.matches("(?i)" + imdbSite.getCertification() + ".*")) {
            if (h5Title.matches("(?i)" + ImdbSiteDefinition.IMDB_COM.getCertification() + ".*")) {
                Elements a = element.getElementsByTag("a");
                for (Element anchor : a) {
                    // certification for the right country
                    if (anchor.attr("href").matches(
                            "(?i)/search/title\\?certificates=" + options.getCountry().getAlpha2() + ".*")) {
                        Pattern certificationPattern = Pattern.compile(".*:(.*)");
                        Matcher matcher = certificationPattern.matcher(anchor.ownText());
                        Certification certification = null;
                        while (matcher.find()) {
                            if (matcher.group(1) != null) {
                                certification = Certification.getCertification(options.getCountry(),
                                        matcher.group(1));
                            }
                        }

                        if (certification != null) {
                            md.addCertification(certification);
                            break;
                        }
                    }
                }
            }
        }

        /*
         * <div id="director-info" class="info"> <h5>Director:</h5> <div class="info-content"><a href="/name/nm0000416/" onclick=
         * "(new Image()).src='/rg/directorlist/position-1/images/b.gif?link=name/nm0000416/';" >Terry Gilliam</a><br/> </div> </div>
         */
        // director
        if ("director-info".equals(element.id())) {
            Elements a = element.getElementsByTag("a");
            for (Element anchor : a) {
                if (anchor.attr("href").matches("/name/nm.*")) {
                    MediaCastMember cm = new MediaCastMember(CastType.DIRECTOR);
                    cm.setName(anchor.ownText());
                    md.addCastMember(cm);
                }
            }
        }
    }

    /*
     * <table class="cast"> <tr class="odd"><td class="hs"><a href="http://pro.imdb.com/widget/resume_redirect/" onClick=
     * "(new Image()).src='/rg/resume/prosystem/images/b.gif?link=http://pro.imdb.com/widget/resume_redirect/';" ><img src=
     * "http://i.media-imdb.com/images/SF9113d6f5b7cb1533c35313ccd181a6b1/tn15/no_photo.png" width="25" height="31" border="0"></td><td class="nm"><a
     * href="/name/nm0577828/" onclick= "(new Image()).src='/rg/castlist/position-1/images/b.gif?link=/name/nm0577828/';" >Joseph Melito</a></td><td
     * class="ddd"> ... </td><td class="char"><a href="/character/ch0003139/">Young Cole</a></td></tr> <tr class="even"><td class="hs"><a
     * href="/name/nm0000246/" onClick= "(new Image()).src='/rg/title-tease/tinyhead/images/b.gif?link=/name/nm0000246/';" ><img src=
     * "http://ia.media-imdb.com/images/M/MV5BMjA0MjMzMTE5OF5BMl5BanBnXkFtZTcwMzQ2ODE3Mw@@._V1._SY30_SX23_.jpg" width="23" height="32"
     * border="0"></a><br></td><td class="nm"><a href="/name/nm0000246/" onclick=
     * "(new Image()).src='/rg/castlist/position-2/images/b.gif?link=/name/nm0000246/';" >Bruce Willis</a></td><td class="ddd"> ... </td><td
     * class="char"><a href="/character/ch0003139/">James Cole</a></td></tr> <tr class="odd"><td class="hs"><a href="/name/nm0781218/" onClick=
     * "(new Image()).src='/rg/title-tease/tinyhead/images/b.gif?link=/name/nm0781218/';" ><img src=
     * "http://ia.media-imdb.com/images/M/MV5BODI1MTA2MjkxM15BMl5BanBnXkFtZTcwMTcwMDg2Nw@@._V1._SY30_SX23_.jpg" width="23" height="32"
     * border="0"></a><br></td><td class="nm"><a href="/name/nm0781218/" onclick=
     * "(new Image()).src='/rg/castlist/position-3/images/b.gif?link=/name/nm0781218/';" >Jon Seda</a></td><td class="ddd"> ... </td><td
     * class="char"><a href="/character/ch0003143/">Jose</a></td></tr>...</table>
     */
    // cast
    elements = doc.getElementsByClass("cast");
    if (elements.size() > 0) {
        Elements tr = elements.get(0).getElementsByTag("tr");
        for (Element row : tr) {
            Elements td = row.getElementsByTag("td");
            MediaCastMember cm = new MediaCastMember();
            for (Element column : td) {
                // actor thumb
                if (column.hasClass("hs")) {
                    Elements img = column.getElementsByTag("img");
                    if (img.size() > 0) {
                        String thumbUrl = img.get(0).attr("src");
                        if (thumbUrl.contains("no_photo.png")) {
                            cm.setImageUrl("");
                        } else {
                            thumbUrl = thumbUrl.replaceAll("SX[0-9]{2,4}_", "SX400_");
                            thumbUrl = thumbUrl.replaceAll("SY[0-9]{2,4}_", "");
                            cm.setImageUrl(thumbUrl);
                        }
                    }
                }
                // actor name
                if (column.hasClass("nm")) {
                    cm.setName(cleanString(column.text()));
                }
                // character
                if (column.hasClass("char")) {
                    cm.setCharacter(cleanString(column.text()));
                }
            }
            if (StringUtils.isNotEmpty(cm.getName()) && StringUtils.isNotEmpty(cm.getCharacter())) {
                cm.setType(CastType.ACTOR);
                md.addCastMember(cm);
            }
        }
    }

    Element content = doc.getElementById("tn15content");
    if (content != null) {
        elements = content.getElementsByTag("table");
        for (Element table : elements) {
            // writers
            if (table.text().contains(ImdbSiteDefinition.IMDB_COM.getWriter())) {
                Elements anchors = table.getElementsByTag("a");
                for (Element anchor : anchors) {
                    if (anchor.attr("href").matches("/name/nm.*")) {
                        MediaCastMember cm = new MediaCastMember(CastType.WRITER);
                        cm.setName(anchor.ownText());
                        md.addCastMember(cm);
                    }
                }
            }

            // producers
            if (table.text().contains(ImdbSiteDefinition.IMDB_COM.getProducers())) {
                Elements rows = table.getElementsByTag("tr");
                for (Element row : rows) {
                    if (row.text().contains(ImdbSiteDefinition.IMDB_COM.getProducers())) {
                        continue;
                    }
                    Elements columns = row.children();
                    if (columns.size() == 0) {
                        continue;
                    }
                    MediaCastMember cm = new MediaCastMember(CastType.PRODUCER);
                    String name = cleanString(columns.get(0).text());
                    if (StringUtils.isBlank(name)) {
                        continue;
                    }
                    cm.setName(name);
                    if (columns.size() >= 3) {
                        cm.setPart(cleanString(columns.get(2).text()));
                    }
                    md.addCastMember(cm);
                }
            }
        }
    }

    // Production companies
    elements = doc.getElementsByClass("blackcatheader");
    for (Element blackcatheader : elements) {
        if (blackcatheader.ownText().equals(ImdbSiteDefinition.IMDB_COM.getProductionCompanies())) {
            Elements a = blackcatheader.nextElementSibling().getElementsByTag("a");
            StringBuilder productionCompanies = new StringBuilder();
            for (Element anchor : a) {
                if (StringUtils.isNotEmpty(productionCompanies)) {
                    productionCompanies.append(", ");
                }
                productionCompanies.append(anchor.ownText());
            }
            md.storeMetadata(MediaMetadata.PRODUCTION_COMPANY, productionCompanies.toString());
            break;
        }
    }

    /*
     * plot from /plotsummary
     */
    // build the url
    doc = null;
    doc = futurePlotsummary.get();

    // imdb.com has another site structure
    if (imdbSite == ImdbSiteDefinition.IMDB_COM) {
        Elements zebraList = doc.getElementsByClass("zebraList");
        if (zebraList != null && !zebraList.isEmpty()) {
            Elements odd = zebraList.get(0).getElementsByClass("odd");
            if (odd.isEmpty()) {
                odd = zebraList.get(0).getElementsByClass("even"); // sometimes imdb has even
            }
            if (odd.size() > 0) {
                Elements p = odd.get(0).getElementsByTag("p");
                if (p.size() > 0) {
                    String plot = cleanString(p.get(0).ownText());
                    md.storeMetadata(MediaMetadata.PLOT, plot);
                }
            }
        }
    } else {
        Element wiki = doc.getElementById("swiki.2.1");
        if (wiki != null) {
            String plot = cleanString(wiki.ownText());
            md.storeMetadata(MediaMetadata.PLOT, plot);
        }
    }

    // title also from chosen site if we are not scraping akas.imdb.com
    if (imdbSite != ImdbSiteDefinition.IMDB_COM) {
        title = doc.getElementById("tn15title");
        if (title != null) {
            Element element = null;
            // title
            elements = title.getElementsByClass("main");
            if (elements.size() > 0) {
                element = elements.first();
                String movieTitle = cleanString(element.ownText());
                md.storeMetadata(MediaMetadata.TITLE, movieTitle);
            }
        }
    }
    // }

    // get data from tmdb?
    if (options.isScrapeImdbForeignLanguage() || options.isScrapeCollectionInfo()) {
        MediaMetadata tmdbMd = futureTmdb.get();
        if (options.isScrapeImdbForeignLanguage() && tmdbMd != null
                && StringUtils.isNotBlank(tmdbMd.getStringValue(MediaMetadata.PLOT))) {
            // tmdbid
            md.setId(MediaMetadata.TMDBID, tmdbMd.getId(MediaMetadata.TMDBID));
            // title
            md.storeMetadata(MediaMetadata.TITLE, tmdbMd.getStringValue(MediaMetadata.TITLE));
            // original title
            md.storeMetadata(MediaMetadata.ORIGINAL_TITLE, tmdbMd.getStringValue(MediaMetadata.ORIGINAL_TITLE));
            // tagline
            md.storeMetadata(MediaMetadata.TAGLINE, tmdbMd.getStringValue(MediaMetadata.TAGLINE));
            // plot
            md.storeMetadata(MediaMetadata.PLOT, tmdbMd.getStringValue(MediaMetadata.PLOT));
            // collection info
            md.storeMetadata(MediaMetadata.COLLECTION_NAME,
                    tmdbMd.getStringValue(MediaMetadata.COLLECTION_NAME));
            md.storeMetadata(MediaMetadata.TMDBID_SET, tmdbMd.getIntegerValue(MediaMetadata.TMDBID_SET));
        }
        if (options.isScrapeCollectionInfo() && tmdbMd != null) {
            md.storeMetadata(MediaMetadata.TMDBID_SET, tmdbMd.getIntegerValue(MediaMetadata.TMDBID_SET));
            md.storeMetadata(MediaMetadata.COLLECTION_NAME,
                    tmdbMd.getStringValue(MediaMetadata.COLLECTION_NAME));
        }
    }

    // if we have still no original title, take the title
    if (StringUtils.isBlank(md.getStringValue(MediaMetadata.ORIGINAL_TITLE))) {
        md.storeMetadata(MediaMetadata.ORIGINAL_TITLE, md.getStringValue(MediaMetadata.TITLE));
    }

    return md;
}

From source file:org.tinymediamanager.scraper.imdb.ImdbParser.java

protected MediaMetadata parseReferencePage(Document doc, MediaScrapeOptions options, MediaMetadata md) {
    /*/*  ww w.  j  a  v  a 2  s . co m*/
     * title and year have the following structure
     * 
     * <div id="tn15title"><h1>Merida - Legende der Highlands <span>(<a href="/year/2012/">2012</a>) <span class="pro-link">...</span> <span
     * class="title-extra">Brave <i>(original title)</i></span> </span></h1> </div>
     */

    // title
    Element title = doc.getElementsByAttributeValue("name", "title").first();
    if (title != null) {
        String movieTitle = cleanString(title.attr("content"));
        int yearStart = movieTitle.lastIndexOf("(");
        if (yearStart > 0) {
            movieTitle = movieTitle.substring(0, yearStart - 1).trim();
            md.setTitle(movieTitle);
        }
    }

    // original title and year
    Element originalTitleYear = doc.getElementsByAttributeValue("property", "og:title").first();
    if (originalTitleYear != null) {
        String content = originalTitleYear.attr("content");
        int startOfYear = content.lastIndexOf("(");
        if (startOfYear > 0) {
            // noo - this is NOT the original title!!! (seems always english?) parse from AKAs page...
            // String originalTitle = content.substring(0, startOfYear - 1).trim();
            // md.setOriginalTitle(originalTitle);

            String yearText = content.substring(startOfYear);

            // search year
            Pattern yearPattern = Pattern.compile("[1-2][0-9]{3}");
            Matcher matcher = yearPattern.matcher(yearText);
            while (matcher.find()) {
                if (matcher.group(0) != null) {
                    String movieYear = matcher.group(0);
                    try {
                        md.setYear(Integer.parseInt(movieYear));
                        break;
                    } catch (Exception ignored) {
                    }
                }
            }
        }
    }

    // poster
    Element poster = doc.getElementsByAttributeValue("property", "og:image").first();
    if (poster != null) {
        String posterUrl = poster.attr("content");

        int fileStart = posterUrl.lastIndexOf("/");
        if (fileStart > 0) {
            int parameterStart = posterUrl.indexOf("_", fileStart);
            if (parameterStart > 0) {
                int startOfExtension = posterUrl.lastIndexOf(".");
                if (startOfExtension > parameterStart) {
                    posterUrl = posterUrl.substring(0, parameterStart) + posterUrl.substring(startOfExtension);

                }
            }
        }
        processMediaArt(md, MediaArtwork.MediaArtworkType.POSTER, posterUrl);
    }

    /*
     * <div class="starbar-meta"> <b>7.4/10</b> &nbsp;&nbsp;<a href="ratings" class="tn15more">52,871 votes</a>&nbsp;&raquo; </div>
     */

    // rating and rating count
    Element ratingElement = doc.getElementsByClass("ipl-rating-star__rating").first();
    if (ratingElement != null) {
        String ratingAsString = ratingElement.ownText().replace(",", ".");
        try {
            md.setRating(Float.valueOf(ratingAsString));
        } catch (Exception ignored) {
        }

        Element votesElement = doc.getElementsByClass("ipl-rating-star__total-votes").first();
        if (votesElement != null) {
            String countAsString = votesElement.ownText().replaceAll("[.,()\\u00a0]", "").trim();
            try {
                md.setVoteCount(Integer.parseInt(countAsString));
            } catch (Exception ignored) {
            }
        }
    }
    // top250
    Element topRatedElement = doc.getElementsByAttributeValue("href", "/chart/top").first();
    if (topRatedElement != null) {
        Pattern topPattern = Pattern.compile("Top Rated Movies: #([0-9]{1,3})");
        Matcher matcher = topPattern.matcher(topRatedElement.ownText());
        while (matcher.find()) {
            if (matcher.group(1) != null) {
                try {
                    String top250Text = matcher.group(1);
                    md.setTop250(Integer.parseInt(top250Text));
                } catch (Exception ignored) {
                }
            }
        }
    }

    // releasedate
    Element releaseDateElement = doc
            .getElementsByAttributeValue("href", "/title/" + options.getImdbId().toLowerCase() + "/releaseinfo")
            .first();
    if (releaseDateElement != null) {
        String releaseDateText = releaseDateElement.ownText();
        int startOfCountry = releaseDateText.indexOf("(");
        if (startOfCountry > 0) {
            releaseDateText = releaseDateText.substring(0, startOfCountry - 1).trim();
        }
        try {
            SimpleDateFormat sdf = new SimpleDateFormat("d MMMM yyyy", Locale.US);
            Date parsedDate = sdf.parse(releaseDateText);
            md.setReleaseDate(parsedDate);
        } catch (ParseException otherformat) {
            try {
                SimpleDateFormat sdf = new SimpleDateFormat("MMMM yyyy", Locale.US);
                Date parsedDate = sdf.parse(releaseDateText);
                md.setReleaseDate(parsedDate);
            } catch (ParseException ignored) {
            }
        }
    }

    Elements elements = doc.getElementsByClass("ipl-zebra-list__label");
    for (Element element : elements) {
        // only parse tds
        if (!"td".equals(element.tag().getName())) {
            continue;
        }

        String elementText = element.ownText();

        if (elementText.equals("Taglines")) {
            if (!ImdbMetadataProvider.providerInfo.getConfig().getValueAsBool("useTmdb")) {
                Element taglineElement = element.nextElementSibling();
                if (taglineElement != null) {
                    String tagline = cleanString(taglineElement.ownText().replaceAll("", ""));
                    md.setTagline(tagline);
                }
            }
        }

        if (elementText.equals("Genres")) {
            Element nextElement = element.nextElementSibling();
            if (nextElement != null) {
                Elements genreElements = nextElement.getElementsByAttributeValueStarting("href", "/genre/");

                for (Element genreElement : genreElements) {
                    String genreText = genreElement.ownText();
                    md.addGenre(getTmmGenre(genreText));
                }
            }
        }

        /*
         * Old HTML, but maybe the same content formart <div class="info"><h5>Runtime:</h5><div class="info-content">162 min | 171 min (special edition)
         * | 178 min (extended cut)</div></div>
         */
        if (elementText.equals("Runtime")) {
            Element nextElement = element.nextElementSibling();
            if (nextElement != null) {
                Element runtimeElement = nextElement.getElementsByClass("ipl-inline-list__item").first();
                if (runtimeElement != null) {
                    String first = runtimeElement.ownText().split("\\|")[0];
                    String runtimeAsString = cleanString(first.replaceAll("min", ""));
                    int runtime = 0;
                    try {
                        runtime = Integer.parseInt(runtimeAsString);
                    } catch (Exception e) {
                        // try to filter out the first number we find
                        Pattern runtimePattern = Pattern.compile("([0-9]{2,3})");
                        Matcher matcher = runtimePattern.matcher(runtimeAsString);
                        if (matcher.find()) {
                            runtime = Integer.parseInt(matcher.group(0));
                        }
                    }
                    md.setRuntime(runtime);
                }
            }
        }

        if (elementText.equals("Country")) {
            Element nextElement = element.nextElementSibling();
            if (nextElement != null) {
                Elements countryElements = nextElement.getElementsByAttributeValueStarting("href", "/country/");
                Pattern pattern = Pattern.compile("/country/(.*)");

                for (Element countryElement : countryElements) {
                    Matcher matcher = pattern.matcher(countryElement.attr("href"));
                    if (matcher.matches()) {
                        if (ImdbMetadataProvider.providerInfo.getConfig()
                                .getValueAsBool("scrapeLanguageNames")) {
                            md.addCountry(LanguageUtils.getLocalizedCountryForLanguage(
                                    options.getLanguage().getLanguage(), countryElement.text(),
                                    matcher.group(1)));
                        } else {
                            md.addCountry(matcher.group(1));
                        }
                    }
                }
            }
        }

        if (elementText.equals("Language")) {
            Element nextElement = element.nextElementSibling();
            if (nextElement != null) {
                Elements languageElements = nextElement.getElementsByAttributeValueStarting("href",
                        "/language/");
                Pattern pattern = Pattern.compile("/language/(.*)");

                for (Element languageElement : languageElements) {
                    Matcher matcher = pattern.matcher(languageElement.attr("href"));
                    if (matcher.matches()) {
                        if (ImdbMetadataProvider.providerInfo.getConfig()
                                .getValueAsBool("scrapeLanguageNames")) {
                            md.addSpokenLanguage(LanguageUtils.getLocalizedLanguageNameFromLocalizedString(
                                    options.getLanguage(), languageElement.text(), matcher.group(1)));
                        } else {
                            md.addSpokenLanguage(matcher.group(1));
                        }
                    }
                }
            }
        }

        if (elementText.equals("Certification")) {
            Element nextElement = element.nextElementSibling();
            if (nextElement != null) {
                String languageCode = options.getCountry().getAlpha2();
                Elements certificationElements = nextElement.getElementsByAttributeValueStarting("href",
                        "/search/title?certificates=" + languageCode);
                boolean done = false;
                for (Element certificationElement : certificationElements) {
                    String certText = certificationElement.ownText();
                    int startOfCert = certText.indexOf(":");
                    if (startOfCert > 0 && certText.length() > startOfCert + 1) {
                        certText = certText.substring(startOfCert + 1);
                    }

                    Certification certification = Certification.getCertification(options.getCountry(),
                            certText);
                    if (certification != null) {
                        md.addCertification(certification);
                        done = true;
                        break;
                    }
                }

                if (!done && languageCode.equals("DE")) {
                    certificationElements = nextElement.getElementsByAttributeValueStarting("href",
                            "/search/title?certificates=XWG");
                    for (Element certificationElement : certificationElements) {
                        String certText = certificationElement.ownText();
                        int startOfCert = certText.indexOf(":");
                        if (startOfCert > 0 && certText.length() > startOfCert + 1) {
                            certText = certText.substring(startOfCert + 1);
                        }

                        Certification certification = Certification.getCertification(options.getCountry(),
                                certText);
                        if (certification != null) {
                            md.addCertification(certification);
                            break;
                        }
                    }
                }

            }
        }
    }

    // director
    Element directorsElement = doc.getElementById("directors");
    while (directorsElement != null && directorsElement.tag().getName() != "header") {
        directorsElement = directorsElement.parent();
    }
    if (directorsElement != null) {
        directorsElement = directorsElement.nextElementSibling();
    }
    if (directorsElement != null) {
        for (Element directorElement : directorsElement.getElementsByClass("name")) {
            String director = directorElement.text().trim();

            MediaCastMember cm = new MediaCastMember(MediaCastMember.CastType.DIRECTOR);
            cm.setName(director);
            md.addCastMember(cm);
        }
    }

    // actors
    Element castTableElement = doc.getElementsByClass("cast_list").first();
    if (castTableElement != null) {
        Elements tr = castTableElement.getElementsByTag("tr");
        for (Element row : tr) {
            MediaCastMember cm = parseCastMember(row);
            if (cm != null && StringUtils.isNotEmpty(cm.getName())
                    && StringUtils.isNotEmpty(cm.getCharacter())) {
                cm.setType(MediaCastMember.CastType.ACTOR);
                md.addCastMember(cm);
            }
        }
    }

    // writers
    Element writersElement = doc.getElementById("writers");
    while (writersElement != null && writersElement.tag().getName() != "header") {
        writersElement = writersElement.parent();
    }
    if (writersElement != null) {
        writersElement = writersElement.nextElementSibling();
    }
    if (writersElement != null) {
        Elements writersElements = writersElement.getElementsByAttributeValueStarting("href", "/name/");

        for (Element writerElement : writersElements) {
            String writer = cleanString(writerElement.ownText());
            MediaCastMember cm = new MediaCastMember(MediaCastMember.CastType.WRITER);
            cm.setName(writer);
            md.addCastMember(cm);
        }
    }

    // producers
    Element producersElement = doc.getElementById("producers");
    while (producersElement != null && producersElement.tag().getName() != "header") {
        producersElement = producersElement.parent();
    }
    if (producersElement != null) {
        producersElement = producersElement.nextElementSibling();
    }
    if (producersElement != null) {
        Elements producersElements = producersElement.getElementsByAttributeValueStarting("href", "/name/");

        for (Element producerElement : producersElements) {
            String producer = cleanString(producerElement.ownText());
            MediaCastMember cm = new MediaCastMember(MediaCastMember.CastType.PRODUCER);
            cm.setName(producer);
            md.addCastMember(cm);
        }
    }

    // producers
    Elements prodCompHeaderElements = doc.getElementsByClass("ipl-list-title");
    Element prodCompHeaderElement = null;

    for (Element possibleProdCompHeaderEl : prodCompHeaderElements) {
        if (possibleProdCompHeaderEl.ownText().equals("Production Companies")) {
            prodCompHeaderElement = possibleProdCompHeaderEl;
            break;
        }
    }

    while (prodCompHeaderElement != null && prodCompHeaderElement.tag().getName() != "header") {
        prodCompHeaderElement = prodCompHeaderElement.parent();
    }
    if (prodCompHeaderElement != null) {
        prodCompHeaderElement = prodCompHeaderElement.nextElementSibling();
    }
    if (prodCompHeaderElement != null) {
        Elements prodCompElements = prodCompHeaderElement.getElementsByAttributeValueStarting("href",
                "/company/");

        for (Element prodCompElement : prodCompElements) {
            String prodComp = prodCompElement.ownText();
            md.addProductionCompany(prodComp);
        }
    }

    return md;
}

From source file:org.xlrnet.metadict.engines.woxikon.WoxikonEngine.java

/**
 * Converts one result table row into a bilingual entry and adds it to {@code resultBuilder}.
 * <p>
 * The row is skipped with a warning if it is not a {@code <tr>} element or if it does not contain exactly two
 * elements carrying the translation class.
 *
 * @param queryString
 *          the original query; handed on to the processing of both cells
 * @param entryNode
 *          the row to process
 * @param resultBuilder
 *          the builder receiving the created entry
 * @param sourceLanguage
 *          the language used for the first translation cell
 * @param targetLanguage
 *          the language used for the second translation cell
 */
private void processEntry(@NotNull String queryString, @NotNull Element entryNode,
        @NotNull BilingualQueryResultBuilder resultBuilder, @NotNull Language sourceLanguage,
        @NotNull Language targetLanguage) {
    String tagName = entryNode.tag().getName();
    if (!StringUtils.equals(tagName, "tr")) {
        LOGGER.warn("Expected <tr> tag - got <{}>", tagName);
        return;
    }

    Elements translationCells = entryNode.getElementsByClass(CLASS_TRANSLATION);
    int cellCount = translationCells.size();
    if (cellCount != 2) {
        LOGGER.warn("Expected 2 elements with class \"" + CLASS_TRANSLATION + "\" - got {}", cellCount);
        return;
    }

    // first cell: input side (also determines the entry type), second cell: output side
    Element sourceCell = translationCells.get(0);
    Element targetCell = translationCells.get(1);

    BilingualEntryBuilder builder = ImmutableBilingualEntry.builder();
    builder.setEntryType(detectEntryType(sourceCell));
    builder.setInputObject(processSingleNode(sourceCell, sourceLanguage, queryString));
    builder.setOutputObject(processSingleNode(targetCell, targetLanguage, queryString));

    resultBuilder.addBilingualEntry(builder.build());
}