Example usage for org.jsoup.nodes Document getElementsByTag

List of usage examples for org.jsoup.nodes Document getElementsByTag

Introduction

This page collects usage examples for the getElementsByTag method of org.jsoup.nodes.Document.

Prototype

public Elements getElementsByTag(String tagName) 

Source Link

Document

Finds elements, including and recursively under this element, with the specified tag name.

Usage

From source file:org.silverpeas.mobile.server.servlets.PublicationContentServlet.java

/**
 * Rewrites a WYSIWYG HTML fragment for the mobile client and writes it to the response.
 * <p>
 * The fragment is wrapped in a minimal html/body skeleton and parsed with jsoup. The {@code src}
 * of every {@code img} containing "/silverpeas" is replaced by the result of
 * {@code convertSpImageUrlToDataUrl}. Every {@code embed} whose markup mentions "flash" and
 * references an attachment with a supported audio or video content type is replaced by an
 * {@code audio} or {@code video} element pointing at the mobile attachment service. Embeds that
 * cannot be resolved (no attachment id in the markup, unknown attachment, unsupported type) are
 * left untouched.
 *
 * @param html the HTML fragment to display
 * @param request the current request, used to look up the user in session
 * @param response the response whose output stream receives the resulting page
 * @param instanceId the component instance id added to the attachment URLs
 * @throws IOException if writing to the response fails
 */
private void displayWysiwyg(String html, HttpServletRequest request, HttpServletResponse response,
        String instanceId) throws IOException {
    Document doc = Jsoup.parse("<html><body>" + html + "</body></html>");

    for (Element img : doc.getElementsByTag("img")) {
        String source = img.attr("src");
        String newSource = source;
        if (source.contains("/silverpeas")) {
            // need to convert in dataurl
            newSource = convertSpImageUrlToDataUrl(source);
        }
        img.attr("src", newSource);
    }

    final String idMarker = "attachmentId/";
    for (Element embed : doc.getElementsByTag("embed")) {
        String htmlPart = embed.outerHtml();
        if (!htmlPart.contains("flash")) {
            continue;
        }
        // The attachment id is the path segment that follows "attachmentId/"; without both
        // delimiters there is nothing to resolve, so the embed is kept as it is.
        int idStart = htmlPart.indexOf(idMarker);
        if (idStart < 0) {
            continue;
        }
        idStart += idMarker.length();
        int idEnd = htmlPart.indexOf('/', idStart);
        if (idEnd < 0) {
            continue;
        }
        String attachmentId = htmlPart.substring(idStart, idEnd);

        SimpleDocument attachment = AttachmentServiceProvider.getAttachmentService().searchDocumentById(
                new SimpleDocumentPK(attachmentId),
                getUserInSession(request).getUserPreferences().getLanguage());
        if (attachment == null) {
            continue;
        }
        String type = attachment.getContentType();
        if (type == null) {
            continue;
        }

        String url = getServletContext().getContextPath() + "/services/spmobile/Attachment";
        url = url + "?id=" + attachmentId + "&instanceId=" + instanceId + "&lang="
                + getUserInSession(request).getUserPreferences().getLanguage() + "&userId="
                + getUserInSession(request).getId();

        if (type.equals("audio/mpeg") || type.equals("audio/ogg") || type.equals("audio/wav")) {
            embed.parent().append("<audio controls><source src='" + url + "' type='" + type + "'></audio>");
            embed.remove();
        } else if (type.equals("video/mp4") || type.equals("video/ogg") || type.equals("video/webm")) {
            embed.parent()
                    .append("<video controls='controls'><source src='" + url + "' type='" + type + "' />");
            embed.remove();
        }
    }

    OutputStreamWriter out = new OutputStreamWriter(response.getOutputStream(), "UTF-8");
    writeContainer(out, doc.outerHtml());
    out.flush();
}

From source file:org.silverpeas.mobile.server.servlets.PublicationContentServlet.java

/**
 * Renders the form-based content of a publication as HTML adapted for the mobile client and
 * writes it to {@code out}.
 * <p>
 * The publication's view form is rendered into a buffer, parsed with jsoup and then rewritten:
 * preview images are removed, attachment images are converted through
 * {@code convertImageAttachmentUrl}, attachment links are redirected to the mobile attachment
 * service (and turned into a video element or a static placeholder when the link is a player),
 * and every script element is dropped.
 *
 * @param out the writer receiving the final page (via {@code writeContainer})
 * @param pub the publication whose form content is displayed
 * @param user the user whose language preference and id are used
 * @param ua not read anywhere in this method
 * @throws Exception on any failure of the template lookup, reflective call or rendering
 */
private void displayFormView(Writer out, PublicationDetail pub, UserDetail user, String ua) throws Exception {

    // The template is looked up by "<instanceId>:<infoId>"; the record by the publication id.
    PublicationTemplate pubTemplate = PublicationTemplateManager.getInstance()
            .getPublicationTemplate(pub.getInstanceId() + ":" + pub.getInfoId());
    DataRecord xmlData = pubTemplate.getRecordSet().getRecord(pub.getId());

    PagesContext xmlContext = new PagesContext("myForm", "0", user.getUserPreferences().getLanguage(), false,
            pub.getInstanceId(), "useless");
    xmlContext.setObjectId(pub.getId());
    xmlContext.setDesignMode(false);
    xmlContext.setBorderPrinted(false);
    xmlContext.setContentLanguage(user.getUserPreferences().getLanguage());
    xmlContext.setCreation(false);

    // The form is rendered into an in-memory buffer so that it can be post-processed below.
    StringWriter generatedHtml = new StringWriter();
    PrintWriter outTmp = new PrintWriter(generatedHtml);

    Form xmlForm = pubTemplate.getViewForm();
    if (xmlForm instanceof XmlForm) {
        // display(PrintWriter, PagesContext, DataRecord) is invoked reflectively after
        // setAccessible(true) -- presumably because it is not publicly accessible; TODO confirm.
        Method m = XmlForm.class.getDeclaredMethod("display",
                new Class[] { PrintWriter.class, PagesContext.class, DataRecord.class });
        m.setAccessible(true);
        m.invoke(xmlForm, outTmp, xmlContext, xmlData);
        outTmp.flush();
    } else if (xmlForm instanceof HtmlForm) {
        String html = ((HtmlForm) xmlForm).toString(xmlContext, xmlData);
        outTmp.write(html);
        outTmp.flush();
    }
    // If the form is neither an XmlForm nor an HtmlForm the buffer stays empty.
    String html = generatedHtml.toString();

    Document doc = Jsoup.parse(html);
    Elements images = doc.getElementsByTag("img");
    for (Element img : images) {
        if (img.attr("class").equals("preview-file")) {
            // remove preview for files
            img.remove();
        } else if (img.attr("src").startsWith("/silverpeas/attached_file/componentId/")) {
            // convert url to dataurl
            String data = img.attr("src");
            data = convertImageAttachmentUrl(data, data);
            img.attr("src", data);
        }
    }
    Elements links = doc.getElementsByTag("a");
    for (Element link : links) {
        if (link.attr("href").startsWith("/silverpeas/attached_file/componentId/")) {
            // link to file
            String url = link.attr("href");
            // The attachment id is the path segment following "attachmentId/".
            // NOTE(review): assumes the href contains "attachmentId/" followed by another "/";
            // otherwise the substring calls below work on the wrong text or throw.
            String attachmentId = url.substring(url.indexOf("attachmentId/") + "attachmentId/".length());
            attachmentId = attachmentId.substring(0, attachmentId.indexOf("/"));
            // Point the link at the mobile attachment service instead of the original path.
            url = getServletContext().getContextPath() + "/services/spmobile/Attachment";
            url = url + "?id=" + attachmentId + "&instanceId=" + pub.getInstanceId() + "&lang="
                    + user.getUserPreferences().getLanguage() + "&userId=" + user.getId();
            link.attr("href", url);
            link.attr("target", "_self");

            // Links whose id starts with "player" are treated as media players.
            if (link.attr("id").startsWith("player")) {

                boolean playable = false;

                SimpleDocument attachment = AttachmentServiceProvider.getAttachmentService().searchDocumentById(
                        new SimpleDocumentPK(attachmentId), user.getUserPreferences().getLanguage());
                String type = attachment.getContentType();
                // Playable when the content type mentions mp4, ogg or webm.
                if (type.contains("mp4") || type.contains("ogg") || type.contains("webm")) {
                    playable = true;
                }

                if (playable) {
                    // Width and height are read from the link's inline style: the text between
                    // "width"/"height" plus one separator character and the next "px".
                    // NOTE(review): assumes the style holds both as "<name>:<n>px" -- confirm.
                    String style = link.attr("style");
                    String width = style.substring(style.indexOf("width") + "width".length() + 1);
                    width = width.substring(0, width.indexOf("px"));
                    String height = style.substring(style.indexOf("height") + "height".length() + 1);
                    height = height.substring(0, height.indexOf("px"));
                    // The video element is appended to the link's parent, then the link is removed.
                    link.parent().append("<video width='" + width + "' height='" + height
                            + "' controls='controls'><source src='" + url + "' type='" + type + "' />");
                    link.remove();
                } else {
                    // display image instead of video player
                    // The placeholder is an inline JPEG data URI used as a 150x98 background image.
                    String style = "display:block; width:150px; height:98px; background-repeat: no-repeat; ";
                    style += "background-image: url(data:image/jpeg;base64," + "/9j/4AAQSkZJRgABAQEBLAEsAAD"
                            + "/4QYfRXhpZgAATU0AKgAAAAgAAAAAAA4AAgIBAAQAAAABAAAALAICAAQAAAABAAAF6wAAAAD/2P"
                            + "/gABBKRklGAAEBAAABAAEAAP/bAEMACAYGBwYFCAcHBwkJCAoMFA0MCwsMGRITDxQdGh8eHRocHCAkLicgIiwjHBwoNyksMDE0NDQfJzk9ODI8LjM0Mv/AAAsIAEAAYgEBEQD/xAAfAAABBQEBAQEBAQAAAAAAAAAAAQIDBAUGBwgJCgv/xAC1EAACAQMDAgQDBQUEBAAAAX0BAgMABBEFEiExQQYTUWEHInEUMoGRoQgjQrHBFVLR8CQzYnKCCQoWFxgZGiUmJygpKjQ1Njc4OTpDREVGR0hJSlNUVVZXWFlaY2RlZmdoaWpzdHV2d3h5eoOEhYaHiImKkpOUlZaXmJmaoqOkpaanqKmqsrO0tba3uLm6wsPExcbHyMnK0tPU1dbX2Nna4eLj5OXm5+jp6vHy8/T19vf4+fr/2gAIAQEAAD8A98orN1zXdO8OaVLqWqXS29tEOWPUnsAO5PoK8X1H9ouR7t00Xw600APEk8pDMP8AdUcfma3PCXx60nWr6Ow1qzbSp5CFWUvviLehOAV/Hj3r1me4htraS4mlSOGNS7yMcKqjnJPpXjPiL9oSxtL17XQNKfUQpI8+VyiN/uqASR+VQaJ+0TBJeLBr+iSWcZODPA5fb9UIB/In6V7TYX1tqVlDeWc6T20yho5EOQw9qs0UUUUV89/HO6udZ8f6D4WExitWWN+vG+Ryu78AP1Ne2eHvDGkeGNLisdLs4oY0UAuFG9z6sepNec/HHwVpVz4RuPEMFtFb6jZsrGSNQvmqWCkN6nnIPtXB6/4z1K7+Aei2ryuXuLp7SaXJy8cfIBP4r/3zXrnws8F6V4e8IabeR20cmoXlvHcTXLAFsuobaD2AzjitLx34K0nxb4fuoru2iF0kbNBchQHjYDI59OORXnf7OWs3NxpesaPM7NBaPHLDk527924fmoP4mvcKKKKKK8e+N3gXUNZis/EmixvLf2C7JI4/vtGCWDL6lSTx71m+Hf2hbKLT0tvEem3S3sQ2vLbKCrkdyCQVP51zfjH4iat8VpofDPhnS7hLSSQNIH5eTB4LY4VR1/AV6Br3wn+0fCO18OWbq+o2H+kxv0EsvJYfQ5IH0FcZ4K+MVz4KsV8N+K9Muz9i/dRugxLGo6KysRkDsc9KseMfjmdf06TRvCmnXgmvFMRmlUb8HghFUnk9M/pXefB3wNP4N8NSyagmzUr9lkmTr5ajO1frySfrXo1FFFFFcl8RPGsfgXwwdTNv9omklEEEZOAXIJyT6YBr5y1T4qPrFyZ7/wAJ+HZ5ScmRrZtx+pDc1e0v43arosBh0zQNCs4z1EFuy5+uG5q//wANE+Kv+fDTP+/b/wDxVZWrfGS+10D+1fDegXhHAaa2YsPod2abpPxeutDfdpfhjw/aueN8Vswb/vrdmve/hh4//wCE90S4nmtVtry0kCTIhJU5BIYZ+h49qf4n+KfhnwprEWl31xJJdMR5iQJv8oHpu9Pp1rtI5FljWRTlWGR9KdRRRXj37Rn/ACIun/8AYRX/ANFyV8yUUUUV23hD4iXngzw3q9jpkeL7UHj23B5EKqGBIHduePSuRE0lxfiaaRpJZJNzuxyWJPJJr7tsv+PGD/rmv8qnooorx79oz/kRdP8A+wkv/ot6+ZKKKKKKkt/+PiL/AHh/OvvGy/48YP8Armv8qnooorx/9osf8ULp/wD2El/9FyV8xmiiiivSvh58O4vHnhbXWhk8rVLSSI2zsflbIbKt9cDntXCXOm3elaw1jfQPBcwy7JI3GCpBr7msv+PG3/65r/Kp6KKKwPGHhLT/ABpoL6TqO9ULB45Iz80bjOGH5n868nP7Ndpk48STY/69R/8AFUf8M12n/QyTf+Ao/wDiqP8Ahmu0/wChkm/8BR/8VR/wzXaf9DJN/wCAo/8AiqP+Ga7T/oZJv/AUf/FV6f4H8Dab4E0d7GwZ5XlfzJp5PvSN0HA6ADt
V3UvCHh7WNUg1PUNJtri8gxsldMnjpn1x71tgADAGBRRX/9n/2wBDAAMCAgMCAgMDAwMEAwMEBQgFBQQEBQoHBwYIDAoMDAsKCwsNDhIQDQ4RDgsLEBYQERMUFRUVDA8XGBYUGBIUFRT/wgALCABiAJYBAREA/8QAHQABAAICAwEBAAAAAAAAAAAAAAcIAgYEBQkDAf/aAAgBAQAAAAG0o4cWdnJf71sXbBIOQB1VCZO6Dl2uolLekbxbQAgPVbTKAyDu9hvzzq9DeSAi6sV3+iorZmJrm6bSn0PyAK6V+59q5RrDB/eWzkAAUe0sDfrrxVt2zgw8t/kBz7fU2u/NoMPLf5AZMbyTYDDy3+QAvJNgMPLf5AcnYdVvJNgMfLrjAdn6GUKuNMAEL6mBukwfL6gAAB//xAAoEAABAwQBBAIBBQAAAAAAAAAFAwQGAAECByAWFzAyEzYQERIxNED/2gAIAQEAAQUC4OXaLJB1tWPNsxmxgBTO1/1tT4i2GIONsR5DMTPgZlTxEyKIlgcPE50XHaWeLIyDU5MQhrieLCXkgNox4S7elp6ZZ6ScqJSXWJKPIayni2Lrw7kdZJRnSo1G+P42QOSGS7ZJRZ1E9MjUkgdZWte0qaYg5Y2U+Zv4NjAszsX11MMIsRZv25FE9J2EcbOVnc4lGy4vdeJ6ymyUeVQcpOkpNMR8YbCmTmaynHH9uPhmWqkyiysMkY1QdrmQFlodBWkTTva17SzUnzrZxCRsMxeszxRSJQ9nE2nik2zipB/1yfrrk/XXJ+uuT9dcn665P11yfqJbKKNClSjYY6MLx6QN5KN5ZeufvzY/3dgbIxD2UUyWz1D9Q5Zeufvzte+N73vf8ah+ocsvXP38WofqHLL1z9+aCfzLSGOu40RrUP1Dlf8AhbG6avMbhkqQksaaSgcfj7qNkNWM12UT5yLVww887Js67Js67Js67Js67Js67Js67Js6jeshsed1mlgr/o//xABAEAACAQICAg0JBwQDAAAAAAABAgMABBESITEFEBMiMDI1QUJRYXGSFCBSc5GTocHRFSQzYnKx8AYjQFOBguH/2gAIAQEABj8C8xpriVIIl1vIcAKyi5eftiiOHxoIl8IZDqWdcnx1VjzbRmup47eIdKQ4VlWeWb80cRw+NCOC+VZTqjmBQ/Hg57y4bLDCuZqVcHkLthBax6l/nXQe8v47Zz0I03TD/nGnntpF2QiXSwjXB/DUWx19LnsJDkQufwT9Knvp9KxjQo1seYUBg1zcOd5EnEjHyFY3WyccUnoxxZx7cRTXCMt9bLxmiGDL2lai2I2QlMkUm9t5X1q3o93BQRLoEtwA3cAT9K2QvmAMwIhX8o1n+dm3eJCMkb5Zco5sRpr+mcxP3iLdpO1gi/U1c3uAM8spTN1KMNHx2tOmr6K13ghmzR4dHnFRSaiyg8DMkS5p4G3dB14Y4j2E1Ilzj5DcYCQjoEamoS206XEZ6cbYimlvJ1U9GIHF27hTMq/eLuTBV9Ec3sFWvkyljsaBgOfc8MD+wPtqWxvWyWUzZxJ/rf6fShLBIs0Z1NGcQaZriVXuMN5bq2/Y/LvpQ++e5lMkzDorjvqwGrgpLzYtlguX0vA/EY9nUaI+zrsHrgGYe1a02b24OuW6OX/2jIG8ovnGDzkauxeqsDqp7nYZkTHSbV9A/wCpor9nXqHn3JSR8KG6W3kcZ1yXJw+GuikH924f8W4YaW7OwcHILG5aztFYiMRaCw6ya5WuvHXK11465WuvHXK11465WuvHXK11465WuvHUEV/cNd2kjBG3TjL2g7Qt3z3Nz0o4cN530l7ah1jJylZBpB8803fwEH61/en2P2McSX2qSUaRF2fq/amd2LuxxLMcSaX17/Lzz3Ue/gMQcCK0nHaX17/Lzz3Ue/g19e/y8891Hv4BE1ZiFxp7S6TfDSrjiuvWNpfXv8uAdWGDAkEHm4C2RRmZpFAA76a2uRlYaY5RrjP85qa1vI8
rDiuOK46xUQniaIvIzqG9E8/ANdq8tlPIcX3PDKx68OuuU5/diuU5/diuU5/diuU5/diuU5/diuU5/diuU5/dilu80l3cJxDLhlTtA2hnRXw0jMP8j//EACkQAQABAgQFBQADAQAAAAAAAAERACExQVGhMGFxgcEQkbHw8SBA0eH/2gAIAQEAAT8h/gaMJJ65aQQ5hV90J7U7aYSl3eVAARVxMPTGLtFf9eVIIG38EUizoQToQBeQtTwpnFsYvI5tqGV8eQythOvVgVdRQs7kF+k9aLL8wM1xT2V5UgkaWK4Q+1MscmpJLprmw9Vqx5C25PIDFe7NEIvmcHd8FFc+XAmOg5i84pi4DU5RuawMxt04LavP4HPc9lM0dXFDet7vTGj7wTDcwjrL3qVZAjkA+2dSJrYXCjuU+3orABIRp09vYjpO1KwSedQeDFroZeIfQTFTExBy5KzLoxk5xCV65BbVBERI3J+TDmVPHGe5hjoN3k0AWlm4HvkjkUh5lCtAEvMBOkKJTci9MlSnchnJEZNVvhWj06LpcoLHagIwBAacJi+NsdirO9nlV9gwubuNBJywDvd2GsOBUoZlk3dqRBKIRMaVmWsRu1OjY1oksYkXe3enxWZZ+b2qaCHYG0fB78KdcCg98VdLRJuZV+9r97X6Wv3tfpa/W1+tqFk8hFYzlpLOPpnXvDOU1IXTHtikAIAHEjE66/z278VvHAcXY/yVckjtTa/TSlpwlA1XgTb58VvXAMIRImJSqom6vBm3z4rev6U2+fFb1wLv78IUh4sJyTfYbcGYyzUincGghUonAQFioFhV9IA+tPNJZgzGr8x2zvQpH6hYIh24C+wR0rGTcMV9x819x819x819x819x819x819x81LC0mCZAzNV9MYAsDDrWP9f//aAAgBAQAAABD8n/2U/wDyv/jH/uJ/4A//AP7/AL+X9/7+/wDf3+f8AP8Af/8A/8QAKBABAAIBAwMDBAMBAAAAAAAAAREhADFBYTBRcYGR8BAgocFAULHx/9oACAEBAAE/EPsQZM4/dUDxOO04Bp4IYQnEC1xJ7sAHSQtzMpHj6D+Gvm7E+wJwytc9+JPOWr4DzmYMlMkzBMHsXJ6nSchaXAKLdEHLi1Qt26BKRJXtJKAAaWmxxgfCDCFvdmMJBzDhMkIgeoNREEgwA3V5EFhXuQnYl2yaqUpIzBX/AFSTCClI3ezO4U9fU1QTXthgloQKzfy2JZQBhOl1+pvDo9yZDvkGsBClU3Jhr8mNxNxZOICEk5yNzIAHmFBMDYGExTJbxX5/dlUU3J0HUlBvHYfRHpiJEdRO2BxOJ71kePjwxUqD7LB6T0VXX3gAcpDhiXWmTB2SIbaBsMJC1ETypj2ybGUw8sXyeWEAO0Q5MeVTUTXNbmzloALE73AY05c5CSOtsFqDVz3NNiIx9HAWJN+gBU9hBsLE5woWZa7KTk74VtQSABABsRXRgNt5xXmVTJRiwqjcya8iaMGB4VzQKH58xLMEhiQit4lFmYJWAISLoAOybl5YWYJMstHYXYGIvWB69d4d3VDSdUkt5ws0MCCZC2Gw72lyeigBpqOBGfMFFaRuggTpnw79ZB878ZP8D8Z8O/WfEP1nyT9ZdPzPGBSPwdgNJozAE1v6CZSm0pKPVEuBlsY2IVuWWKhZ95OAbNN9BdRRc5J3o0JGsFd7bdeGjYSTZUKquq4JRtLf4EionmLogyImjjlCUFVdV/pVwKrgVOD2QSEAx64IcV9IY3oWk1sWdFfx0sWuMwJAdxOgJ8H1VEGTG1LCaJ9R0kh01hy2yMwfSnxKVUMb5SjLMsHRNu/QlEFAlKHSWwLt0O3bt27du2tci4ZBW2iE2jIO3OHkiM4WSU846pue/Qgutdeeh/37/wD/2Q==);";
                    link.attr("style", style);
                }
            }
        }
    }

    // remove all scripts
    Elements scripts = doc.getElementsByTag("script");
    for (Element script : scripts) {
        script.remove();
    }
    html = doc.outerHtml();
    writeContainer(out, html);
    out.flush();
}

From source file:org.tinymediamanager.scraper.imdb.ImdbMovieParser.java

/**
 * Reads the original title from an IMDb release-info page and stores it in the given metadata.
 * <p>
 * Two page layouts are tried in turn: first every table with the id "akas", then - only if no
 * original title is set afterwards - the rows carrying the class "aka-item". Rows that do not
 * have the expected cells are skipped.
 *
 * @param doc the parsed release-info page
 * @param options the scrape options (not read by this method)
 * @param md the metadata to update
 * @return the same {@code md} instance that was passed in
 */
private MediaMetadata parseReleaseinfoPageAKAs(Document doc, MediaScrapeOptions options, MediaMetadata md) {
    // <table id="akas" class="subpage_data spEven2Col">
    // <tr class="even">
    // <td>(original title)</td>
    // <td>Intouchables</td>
    // </tr>
    // need to search all tables for correct ID, since the UNIQUE id is used multiple times - thanks for nothing :p
    for (Element table : doc.getElementsByTag("table")) {
        if (!table.id().equalsIgnoreCase("akas")) {
            continue;
        }
        for (Element row : table.getElementsByTag("tr")) {
            Elements cells = row.getElementsByTag("td");
            if (cells.size() < 2) {
                // header rows or rows with a single cell cannot hold a label/title pair
                continue;
            }
            if (cells.get(0).text().toLowerCase(Locale.ROOT).contains("original title")) {
                md.setOriginalTitle(cells.get(1).text());
                break;
            }
        }
    }

    // alternative; new way with table classes
    // <tr class="ipl-zebra-list__item aka-item">
    // <td class="aka-item__name">Germany</td>
    // <td class="aka-item__title">Avatar - Aufbruch nach Pandora</td>
    // </tr>
    String currentTitle = md.getOriginalTitle();
    if (currentTitle == null || currentTitle.isEmpty()) {
        for (Element row : doc.getElementsByClass("aka-item")) {
            Element country = row.getElementsByClass("aka-item__name").first();
            Element title = row.getElementsByClass("aka-item__title").first();
            if (country == null || title == null) {
                // first() yields null when the row lacks the cell
                continue;
            }
            if (country.text().toLowerCase(Locale.ROOT).contains("original title")) {
                md.setOriginalTitle(title.text());
                break;
            }
        }
    }

    return md;
}

From source file:org.tinymediamanager.scraper.zelluloid.ZelluloidMetadataProvider.java

/**
 * Searches the provider's web site for movies matching the query or title in {@code options}.
 * <p>
 * The search page is fetched through {@code CachedUrl} and parsed with jsoup. When the page
 * contains no "hit.php" links and its title does not contain "Suche nach", the response is
 * treated as a redirect to a detail page and a single result is built from it. Otherwise every
 * hit link is turned into a result; links sharing the same movie id are merged into one
 * result. The returned list is sorted in descending natural order of the results.
 *
 * @param options the search options; QUERY is preferred, TITLE is the fallback
 * @return the results found; empty when no search term is given or the page cannot be loaded
 * @throws Exception declared by the interface; load and per-result parse errors are logged
 */
@Override
public List<MediaSearchResult> search(MediaSearchOptions options) throws Exception {
    LOGGER.debug("search() " + options.toString());
    List<MediaSearchResult> resultList = new ArrayList<MediaSearchResult>();
    String searchUrl = "";
    String searchTerm = "";
    // never populated by this method; kept so the IMDb handling below stays in place
    String imdb = "";

    // only title search
    if (StringUtils.isNotEmpty(options.get(MediaSearchOptions.SearchParam.QUERY))) {
        searchTerm = cleanSearch(options.get(MediaSearchOptions.SearchParam.QUERY));
        searchUrl = BASE_URL + "/suche/index.php3?qstring=" + URLEncoder.encode(searchTerm, "UTF-8");
        LOGGER.debug("search for everything: " + searchTerm);
    } else if (StringUtils.isNotEmpty(options.get(MediaSearchOptions.SearchParam.TITLE))) {
        searchTerm = cleanSearch(options.get(MediaSearchOptions.SearchParam.TITLE));
        searchUrl = BASE_URL + "/suche/index.php3?qstring=" + URLEncoder.encode(searchTerm, "UTF-8");
        LOGGER.debug("search with title: " + searchTerm);
    } else {
        LOGGER.debug("empty searchString");
        return resultList;
    }

    searchTerm = MetadataUtil.removeNonSearchCharacters(searchTerm);

    Document doc = null;
    try {
        Url url = new CachedUrl(searchUrl);
        InputStream in = url.getInputStream();
        try {
            doc = Jsoup.parse(in, PAGE_ENCODING, "");
        } finally {
            // close the stream even when parsing fails
            if (in != null) {
                in.close();
            }
        }
    } catch (Exception e) {
        LOGGER.error("failed to search for " + searchTerm + ": " + e.getMessage(), e);

        // clear cache
        CachedUrl.removeCachedFileForUrl(searchUrl);
    }

    if (doc == null) {
        return resultList;
    }

    // only look for movie links
    Elements filme = doc.getElementsByAttributeValueStarting("href", "hit.php");
    LOGGER.debug("found " + filme.size() + " search results");
    if (filme.isEmpty()) {
        if (!doc.getElementsByTag("title").text().contains("Suche nach")) {
            // redirected to detail page
            MediaSearchResult msr = new MediaSearchResult(providerInfo.getId());
            Elements el = doc.getElementsByAttributeValueStarting("href", "index.php3?id=");
            if (el.size() > 0) {
                msr.setId(StrgUtils.substr(el.get(0).attr("href"), "id=(\\d+)"));
            }
            msr.setTitle(StrgUtils.substr(doc.getElementsByTag("title").text(), "(.*?)\\|").trim());
            el = doc.getElementsByAttributeValueContaining("href", "az.php3?j=");
            if (el.size() == 1) {
                msr.setYear(el.get(0).text());
            }
            resultList.add(msr);
        }
        return resultList;
    }

    // <a
    // href="hit.php3?hit=d6900d7d9baf66ba77d8e59cc425da9e-movie-7614-17114331-1"
    // class="normLight">Avatar - Aufbruch nach Pandora</B>
    // <nobr>(2009)</nobr><br /><span class="smallLight"
    // style="color:#ccc;">Avatar</span></a>

    // map to merge 2 results :/
    Map<String, MediaSearchResult> res = new HashMap<String, MediaSearchResult>();

    for (Element a : filme) {
        try {
            String id = StrgUtils.substr(a.attr("href"), "-movie-(.*?)-");
            MediaSearchResult sr = new MediaSearchResult(providerInfo.getId());
            if (res.containsKey(id)) {
                LOGGER.debug("dupe found; merging with previous searchresult");
                sr = res.get(id);
            }

            if (StringUtils.isNotEmpty(imdb)) {
                sr.setIMDBId(imdb);
            }
            if (StringUtils.isEmpty(sr.getId())) {
                sr.setId(id);
            }
            if (StringUtils.isEmpty(sr.getTitle())) {
                if (a.html().contains("nobr")) {
                    sr.setTitle(a.ownText());
                } else {
                    sr.setTitle(a.text());
                }
            }
            LOGGER.debug("found movie " + sr.getTitle());
            if (StringUtils.isEmpty(sr.getOriginalTitle())) {
                sr.setOriginalTitle(a.getElementsByTag("span").text());
            }
            if (StringUtils.isEmpty(sr.getYear())) {
                // any 4 digit
                sr.setYear(StrgUtils.substr(a.getElementsByTag("nobr").text(), ".*(\\d{4}).*"));
            }
            sr.setMediaType(MediaType.MOVIE);
            sr.setUrl(BASE_URL + "/filme/index.php3?id=" + id);
            // sr.setPosterUrl(BASE_URL + "/images" + StrgUtils.substr(a.toString(),
            // "images(.*?)\\&quot"));

            // An empty IMDb id must not count as a match: two empty ids would otherwise compare
            // equal and every result would be scored as perfect.
            if (StringUtils.isNotEmpty(imdb) && imdb.equals(sr.getIMDBId())) {
                // perfect match
                sr.setScore(1);
            } else {
                // compare score based on names
                sr.setScore(MetadataUtil.calculateScore(searchTerm, sr.getTitle()));
            }

            // populate extra args
            MetadataUtil.copySearchQueryToSearchResult(options, sr);
            res.put(id, sr);
        } catch (Exception e) {
            LOGGER.warn("error parsing movie result: " + e.getMessage());
        }
    }
    resultList.addAll(res.values());
    Collections.sort(resultList);
    Collections.reverse(resultList);
    return resultList;
}

From source file:org.xlrnet.metadict.engines.leo.LeoEngine.java

/**
 * Builds a query result from a parsed response document.
 * <p>
 * Every {@code section} element is handed to {@code processSection}; the first {@code similar}
 * element, if there is one, is handed to {@code processSimilarities}.
 *
 * @param doc the parsed response document
 * @return the builder filled from the document
 */
private EngineQueryResultBuilder processDocument(Document doc) {
    EngineQueryResultBuilder resultBuilder = new EngineQueryResultBuilder();

    // Process sections one after another: every call writes into the same builder, and nothing
    // visible here shows that the builder tolerates concurrent writes.
    for (Element section : doc.getElementsByTag("section")) {
        processSection(section, resultBuilder);
    }

    // Process similarities only when the document has such a node; first() yields null for an
    // empty match instead of failing like get(0).
    Element similarityNode = doc.getElementsByTag("similar").first();
    if (similarityNode != null) {
        processSimilarities(similarityNode, resultBuilder);
    }

    // Find external contents:

    // Process external contents:

    return resultBuilder;
}

From source file:Project.FILER.java

/**
 * Parses an HTML file and collects its important strings.
 * <p>
 * As a side effect the static field {@code Text} is reset and then filled with the document
 * text followed by every non-empty image alt text.
 *
 * @param f the HTML file, read as UTF-8
 * @return a three-element array: [0] the document title, [1] the text of all h1..h6 headers
 *         (grouped by level, each followed by a space), [2] every non-empty img alt text (each
 *         preceded by a space)
 * @throws IOException if the file cannot be read
 */
public static String[] Dealing_Files(File f) throws IOException //return array of important strings in the file
{
    Text = "";
    String[] Importants = { "", "", "" }; //first element is the title,second is all headers,third is img alt
    org.jsoup.nodes.Document doc = Jsoup.parse(f, "UTF-8");
    Importants[0] = doc.title(); //get the title of the file

    // HTML defines the heading levels h1 to h6. Every level is visited, so a document that
    // skips a level (for example h2 without h1) still contributes its headers.
    StringBuilder allHeaders = new StringBuilder();
    for (int level = 1; level <= 6; level++) {
        for (Element header : doc.getElementsByTag("h" + level)) {
            allHeaders.append(header.text()).append(' ');
        }
    }
    Importants[1] = allHeaders.toString();

    Text = Text + " " + doc.text(); //get the text of the document

    // attr() returns an empty string for a missing attribute, so an emptiness check suffices
    StringBuilder altTexts = new StringBuilder();
    for (Element image : doc.getElementsByTag("img")) {
        String alt = image.attr("alt");
        if (!alt.isEmpty()) {
            altTexts.append(' ').append(alt);
        }
    }
    Text = Text + altTexts;
    Importants[2] = Importants[2] + altTexts;
    return Importants;
}

From source file:psef.handler.HTMLFilter.java

/**
 * Convert and download all scripts/*from  w  ww  .  jav  a2  s .  co m*/
 * @param doc the DOM document
 * @throws PsefException 
 */
private void filterScripts(Document doc) throws PsefException {
    Elements scripts = doc.getElementsByTag("script");
    for (Element script : scripts) {
        String scriptSrc = script.attr("src");
        try {
            if (scriptSrc != null && scriptSrc.length() > 0) {
                if (!scriptSrc.startsWith("http"))
                    scriptSrc = "http://" + host + base + relPath + "/" + scriptSrc;
                URL url = new URL(scriptSrc);
                if (url.getPath().startsWith(base)) {
                    String newPath = url.getPath().substring(base.length());
                    String newUrl = "scripts" + newPath;
                    script.attr("src", newUrl);
                    downloadResource(url, newUrl);
                }
                // else what?
            }
        } catch (Exception e) {
            throw new PsefException(e);
        }
    }
}

From source file:psef.handler.HTMLFilter.java

/**
 * Revise all "style" statements with @import directives
 * @param doc the DOM document/* w  ww . j  a va2 s .co m*/
 * @throws PsefException 
 */
private void filterStyles(Document doc) throws PsefException {
    try {
        Elements styles = doc.getElementsByTag("style");
        for (Element style : styles) {
            List<DataNode> data = style.dataNodes();
            String styleText = "";
            StringBuilder sb = new StringBuilder();
            for (DataNode node : data) {
                styleText = node.getWholeData();
                int pos = readAtImport(styleText);
                while (pos > 0) {
                    sb.append("@import ");
                    styleText = styleText.substring(pos);
                    CSSUrl cssu = new CSSUrl(styleText);
                    styleText = styleText.substring(cssu.getPos());
                    cssu.revise(host, base + relPath, "styles");
                    URL u = new URL(cssu.getUrl(host, base + relPath));
                    downloadResource(u, cssu.getLocalPath());
                    sb.append(cssu.toString());
                    sb.append("\n");
                    pos = readAtImport(styleText);
                }
                node.setWholeData(sb.toString());
            }
            sb.append(styleText);
            //style.( sb.toString() );
        }
    } catch (Exception e) {
        throw new PsefException(e);
    }
}

From source file:psef.handler.HTMLFilter.java

/**
 * Convert the link hrefs to their local equivalents
 * @param doc the document/*from w  w  w  .j  a  va 2 s .  c om*/
 */
private void filterLinks(Document doc) throws PsefException {
    try {
        Elements links = doc.getElementsByTag("link");
        for (Element link : links) {
            String type = link.attr("type");
            String href = link.attr("href");
            href = cleanHref(href);
            String folder = "other";
            if (type.equals("text/css"))
                folder = "styles";
            else if (type.equals("text/javascript"))
                folder = "scripts";
            else if (type.startsWith("image/"))
                folder = "corpix";
            URL u = new URL(href);
            String localPath = folder + "/" + cleanPath(u.getPath());
            downloadResource(u, localPath);
            link.attr("href", localPath);
        }
    } catch (MalformedURLException e) {
        throw new PsefException(e);
    }
}

From source file:psef.handler.HTMLFilter.java

/**
 * Update all the anchors: download each anchor target that belongs to
 * this site and point the anchor's href at the local copy. Anchors
 * starting with "#" and "http:" anchors outside this host and base path
 * are left alone.
 * @param doc the DOM document
 * @throws PsefException wrapping any exception raised while updating
 */
private void filterAnchors(Document doc) throws PsefException {
    try {
        for (Element anchor : doc.getElementsByTag("a")) {
            String href = anchor.attr("href");
            if (href.startsWith("#")) {
                continue;
            }

            // work out the local path; it stays empty for foreign absolute links
            String target = "";
            if (href.startsWith("http:")) {
                URL absolute = new URL(href);
                if (absolute.getHost().equals(host) && absolute.getPath().startsWith(base)) {
                    target = cleanPath(absolute.getPath());
                }
            } else {
                target = cleanPath(href);
            }
            if (target.length() == 0) {
                continue;
            }

            URL remote = new URL(cleanHref(href));
            if (!target.equals("index.html")) {
                target = "anchors" + target;
                String[] pieces = target.split("\\?");
                if (pieces.length == 2) {
                    // replace the query string by the base64-encoded MD5 of its bytes
                    MessageDigest md5 = MessageDigest.getInstance("MD5");
                    md5.update(pieces[1].getBytes());
                    byte[] encoded = Base64.encodeBase64(md5.digest());
                    target = pieces[0] + "_" + new String(encoded);
                }
            }
            downloadResource(remote, target);
            anchor.attr("href", target);
        }
    } catch (Exception e) {
        throw new PsefException(e);
    }
}