Example usage for org.jsoup.nodes Document select

List of usage examples for org.jsoup.nodes Document select

Introduction

On this page you can find example usages of org.jsoup.nodes.Document.select.

Prototype

public Elements select(String cssQuery) 

Source Link

Document

Find elements that match the Selector CSS query, with this element as the starting context.

Usage

From source file:net.sf.texprinter.utils.StringUtils.java

/**
 * Escapes HTML entities and tags to a TeX format. This method tries to
 * replace HTML code by the TeX equivalent macros.
 * <p>
 * Besides the textual conversion this method has side effects: every
 * {@code <img>} element found in the input is downloaded through
 * {@code Downloader.download} to the file name reported by its image group;
 * if that fails, an embedded Base64-encoded replacement image is written to
 * that file name instead. These failures are logged, not propagated.
 *
 * @param text The input text.
 * @return A new text formatted from HTML to TeX.
 */
public static String escapeHTMLtoTeX(String text) {

    // replace bold tags
    String newText = text.replaceAll("<b>", "\\\\textbf{");
    newText = newText.replaceAll("</b>", "}");

    // replace strong tags (rendered as bold as well)
    newText = newText.replaceAll("<strong>", "\\\\textbf{");
    newText = newText.replaceAll("</strong>", "}");

    // replace italic tags
    newText = newText.replaceAll("<i>", "\\\\textit{");
    newText = newText.replaceAll("</i>", "}");

    // replace emphasized tags
    newText = newText.replaceAll("<em>", "\\\\emph{");
    newText = newText.replaceAll("</em>", "}");

    // replace paragraph tags: the opening tag is dropped, the closing tag
    // becomes a blank line
    newText = newText.replaceAll("<p>", "");
    newText = newText.replaceAll("</p>", "\n\n");

    // replace ordered lists tags
    newText = newText.replaceAll("<ol>", "\\\\begin{enumerate}\n");
    newText = newText.replaceAll("</ol>", "\\\\end{enumerate}\n");

    // replace unordered lists tags
    newText = newText.replaceAll("<ul>", "\\\\begin{itemize}\n");
    newText = newText.replaceAll("</ul>", "\\\\end{itemize}\n");

    // replace item tags
    newText = newText.replaceAll("<li>", "\\\\item ");
    newText = newText.replaceAll("</li>", "\n");

    // replace blockquote tags
    newText = newText.replaceAll("<blockquote>", "\\\\begin{quotation}\n");
    newText = newText.replaceAll("</blockquote>", "\\\\end{quotation}\n");

    // replace code block tags; this must run before the inline <code>
    // replacement below, which would otherwise consume the inner tags
    // NOTE(review): ".*" in the second pattern is greedy, so on a single line
    // holding several <pre class="..."> blocks it can match across them - confirm
    newText = newText.replaceAll("<pre><code>", "\\\\begin{TeXPrinterListing}\n");
    newText = newText.replaceAll("<pre class=.*\"><code>", "\\\\begin{TeXPrinterListing}\n");
    newText = newText.replaceAll("</code></pre>", "\\\\end{TeXPrinterListing}\n\n");

    // replace inline code tags
    newText = newText.replaceAll("<code>", "\\\\lstinline|");
    newText = newText.replaceAll("</code>", "|");

    // strip alt attributes (including the trailing space) before the links
    // and images are handled
    // NOTE(review): greedy ".*" again - may remove more than one attribute
    newText = newText.replaceAll("alt=\".*\" ", "");

    // parse the converted text
    Document docLinks = Jsoup.parse(newText);

    // get all the links
    Elements links = docLinks.getElementsByTag("a");

    // if there are links
    if (links.size() > 0) {

        // for every link
        for (Element link : links) {

            // get the outer HTML as serialized by the parser
            String temp = link.outerHtml();

            // replace the first literal occurrence of that HTML by \href{url}{text}
            // NOTE(review): the replacement is not passed through
            // Matcher.quoteReplacement, so a '$' or '\' in the href or link
            // text is interpreted by replaceFirst - confirm inputs are safe
            newText = newText.replaceFirst(Pattern.quote(temp),
                    "\\\\href{" + link.attr("href") + "}{" + link.text() + "}");

        }
    }

    // create a list of images
    ArrayList<ImageGroup> images = new ArrayList<ImageGroup>();

    // parse the original input text (not the converted newText)
    Document doc = Jsoup.parse(text);

    // fetch all elements that carry a src attribute
    Elements media = doc.select("[src]");

    // for all media found
    for (Element m : media) {

        // if it's an image tag
        if (m.tagName().equals("img")) {

            // create a new image group with the image link
            ImageGroup image = new ImageGroup(m.attr("abs:src"));

            // add to the list of images
            images.add(image);

            // clears only the local reference; the list still holds the group
            image = null;
        }
    }

    // create a new loop saver
    LoopSaver lps = null;

    // for every image in the list of images
    for (ImageGroup img : images) {

        // create a new object
        lps = new LoopSaver();

        // while there are references for that image in the text
        // NOTE(review): the URL is spliced into the regex unquoted and only the
        // exact form <img src="URL" /> is replaced; when nothing matches,
        // newText is unchanged and this condition stays true - presumably
        // LoopSaver.tick() aborts such a loop, confirm
        while (newText.indexOf(img.getURL()) != -1) {

            // tick loop
            lps.tick();

            // replace the occurrence of that image by a figure environment
            newText = newText.replaceFirst("<img src=\"" + img.getURL() + "\" />",
                    "\\\\begin{figure}[h!]\n\\\\centering\n\\\\includegraphics[scale=0.5]{" + img.getName()
                            + "}\n\\\\end{figure}");
        }

        // lets try
        try {

            // finally, download the image to the file named by the image group
            Downloader.download(img.getURL(), img.getName());

        } catch (Exception exception) {

            // log message
            log.log(Level.WARNING,
                    "An error occurred while getting the current image. Trying to set the replacement image instead. MESSAGE: {0}",
                    StringUtils.printStackTrace(exception));

            // image could not be downloaded for any reason
            try {

                // open a file stream
                // NOTE(review): the stream is not closed if write() throws
                FileOutputStream f = new FileOutputStream(img.getName());

                // write a replacement image (embedded Base64 payload)
                f.write(Base64.decode(
                        "iVBORw0KGgoAAAANSUhEUgAAALAAAABKCAIAAACU3El2AAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAAJcEhZcwAADsMAAA7DAcdvqGQAAAcjSURBVHhe7VzrmeMgDExdKSj1pJptZovZi3lqhAQ4n7HhrPt1STAaRoOELa0ff/bPGCAMPIwNY4AyYIIwPQADJggThAnCNKAzYBHC1GERwjRgEcI00MmApYxOou4yzARxF093rtME0UnUXYaZIO7i6c51miA6ibrLMBPEXTzduc4vBPHzejwez/cvt+C+f7x+GpY7h/2+nz2zdS5z6LCFoLZ5qAli8wj/9xHCzIK4xDeXGG27No7YB68dIfx8JCJoguiHOG7kvsUfhOMSo/3Y98H7XhBvlyFIWAfDPjFIOWT/MLgC9IlW/E/xH0tqeWiR7tx1z7dfkE95dKo00TbF8/2TrMTsiIyLhsh8eJVHG00EJGAizUeScT+8KieSqL4VRGCOHggILySI/L5feNzYP0wThGBF2Q2frwPn0oBImuhgZ8T/ElzjZqK6JHPKhmhM/Xlt15e7wpkISIiFIBaF54AJ4LkP6vhmYPlWEFHT4JR0DPR4hINnWjSB3RxWFQRaaYdHIeGxpMjmyIqAS1EFwvk3jy5NaiYASSkaIpQULhR4sl+acvj7GyOItJuEnKGkDH57ohGupAwI9cKdDgnkhVQl97DDdNYvRHevRm1FwZAgUkSTU2uPILRrfVSob9QLBeFNs71HvgOfNYZpEQKP0dL2CSPoBGqEwD0nha12hBANCavTzuXdgmjCmytC5Exapb9zGGREksfFy0WqHc+eQymZ8S1MEzDdU/QMISdp2RALahsQxUSPILRr1QiBP7SCxKCUEU5HrbuMzmEkA4VTvveudHn6DjZRjrOvl7tXoI/V2jGdnNqer1e8l5EOoYqhjBTuWUhiIodK6fyLQmdpg8ILKxOimXqqA420BdGS1KDfBS8NstQ9rRbouydYYeC0gpiQ/QkhHS+xKQURA2yzLnI8H7UZTRDn8m3WZmBgyggxAzF3xWCCuKvnlXWbIEwQa9x2mp+uYaAeIbRndiOx7nsAsW/0cbilp+2HzX7Ior5EuIogeElH7kU4zCXNib6kuzmvGzCvIPrwHztKZqOsHfj65iHcfbGAmwqC0B3qNq1mHrKTc8GAbW94Vo8tQ6qLIXkRbzBBkOpG0fXHLJGqQ+oLVi5PgknXhIqGWJigdRahGk1KwNt07Ras2JgDvVUfSHWqOcJe0ddTBhdEKAtF3txyiaty/bFUEusbAEe6KYSWD7KIHkEoc4qooDzse7oqkDwQcg0tfArtSbwpKhBGCq6EOr9yuXwqfR/r/EINTEPYq4bPuJ2CaBfigu0MzW8DV110vEiRHhSB8qDzQSsb3YjNOUVUWPVksaZEIRQQs1tTrMjRK0+4/c9VWTecIdSmWny9pQUfl4uJCqnG/kyla60ikIMFgckh96yw/0EU5N24REEZuJx1YFvzc2euvQuoyp4u/XKPAp3B/c7yI673M7XPDLEVIowGb0PMis2IXAFlCAjs5ZgUkXx5yjlSEHSPZeQ0L0sdXn3hDFIGuYTYxM2Uxsio4s+ZNuVypkmBbmkTk95tL4XPF5up0Nsd0mNbEKy5Ja1FXpQWw/oo9qMOFwTJk879JEJSXJqD5bY7TKV0noKZ4k/HeIiOqIpdqkMqQ0R5hpCSaVj80+nBr+H5+ZAgdggCFIFJqOwBo0EBEO5QxJGCoGGYNCaxWIyHx9wzhE8Wcgj2i+mIEHlYmhT607eD65bI6eHDjcxVdg1qJDT9Do1b+GccoEh0S/gkd2+KKSPnqrAmgT3oAdMQdktieC1DCGOTtTl0c3WLgaMFgWf3VlS+BeVzL3K0IFK05/cSc9NyX3QnCOK+5K64chPEil4biNkEMZDcFac2QazotYGYTRADyV1x6l2CaD7dXZEBwww
MdD+pTM8B+TPEOQlltcs5Qc6IygQxo1cuxFQTRPHKppAyirdLffDTmqYUQ8jv8ck1LRxAETG/7ikUpppvf2J/CA4F1qIlQLLrC0/C+6M6lnah9waY3h8h6m+XgrceJbz08OFfskQfYpMiXXRlEA37qDY1lfNrKUOxGxs06i9ochf/55WY/YIoO3wY+SVt5WFU6iEoezz4G2g0Q8JhVxGEZld720ZzaQP26LVTHiEIVjRmJWWpM1ptBGIOkPxRvv1Jcr4sCNWuJojW0q513gjrhwmicvPB3RALXqwPMTUc5qgsCaI0JMyvtedLEaJ8oVgedb8b7cZzCCQEPpEPrao2eIycIcouo3qE6Ho1k59fe7ESXYLch4Zy1ZbWWvKIzXvKnK0HU+nAnk6CQpdw5LBsf0pryAd/7EpkjUANQeiGKvOzkAK3IM3mJc3ibQVxiirNyDwMtCLEPEgNySkMmCBOoXkdIyaIdXx1ClITxCk0r2PEBLGOr05BaoI4heZ1jJgg1vHVKUhNEKfQvI4RE8Q6vjoFqQniFJrXMWKCWMdXpyA1QZxC8zpGTBDr+OoUpP8Arv92hCPEu+kAAAAASUVORK5CYII="));

                // close the file
                f.close();

            } catch (IOException ioexception) {

                // log message
                log.log(Level.SEVERE,
                        "An IO exception occured while trying to create the image replacement. MESSAGE: {0}",
                        StringUtils.printStackTrace(ioexception));

            } catch (Exception except) {

                // log message
                log.log(Level.SEVERE,
                        "An error occured while trying to create the image replacement. MESSAGE: {0}",
                        StringUtils.printStackTrace(except));

            }

        }

    }

    // unescape all HTML entities left in the converted text
    newText = StringEscapeUtils.unescapeHtml(newText);

    // return new text
    return newText;
}

From source file:net.intelliant.util.UtilCommon.java

/**
 * Rewrites the {@code src} of image tags in the given HTML so that non-URL
 * locations point at the image server.
 * <p>
 * For every distinct {@code img} source ending in jpg, jpeg, png or gif that
 * is not already a URL, the file name after the last {@code /} (or, failing
 * that, the last {@code \}) is HTML-escaped and prefixed with
 * {@code campaignBaseURL + imageUploadWebApp}; every occurrence of the
 * original source string in the HTML is then replaced by that location.
 * JPEG compression is currently not performed by this method.
 *
 * @param html the HTML to process; returned untouched when empty
 * @return the HTML with the rewritten image locations
 * @throws IOException declared by the signature; not raised by the code visible here
 */
public static String parseHtmlAndGenerateCompressedImages(String html) throws IOException {
    if (UtilValidate.isEmpty(html)) {
        return html;
    }

    Elements images = Jsoup.parse(html).select("img[src~=(?i)\\.(jpg|jpeg|png|gif)]");
    if (images == null || images.size() == 0) {
        if (Debug.infoOn()) {
            Debug.logInfo("[parseHtmlAndGenerateCompressedImages] No jpeg images, doing nothing..", module);
        }
    } else {
        // sources already rewritten, so each distinct location is handled once
        Set<String> handledSources = new HashSet<String>();
        for (Element image : images) {
            String source = image.attr("src");
            if (handledSources.contains(source)) {
                continue;
            }
            if (Debug.infoOn()) {
                Debug.logInfo("[parseHtmlAndGenerateCompressedImages] originalSource >> " + source, module);
            }
            if (UtilValidate.isUrl(source)) {
                Debug.logWarning("[parseHtmlAndGenerateCompressedImages] ignoring encountered HTML URL..",
                        module);
                continue;
            }

            // locate the file name; fall back to a backslash separator in
            // case someone edited the HTML source by hand
            int separatorIndex = source.lastIndexOf("/");
            if (separatorIndex == -1) {
                separatorIndex = source.lastIndexOf("\\");
            }
            if (separatorIndex == -1) {
                continue;
            }

            // escape the file name so it can be embedded in the rewritten location
            String escapedFileName = StringEscapeUtils.escapeHtml(source.substring(separatorIndex + 1));
            String finalLocation = new StringBuilder(campaignBaseURL).append(imageUploadWebApp)
                    .append(escapedFileName).toString();

            html = StringUtil.replaceString(html, source, finalLocation);
            handledSources.add(source);
        }
    }

    if (Debug.infoOn()) {
        Debug.logInfo("[parseHtmlAndGenerateCompressedImages] returning html >> " + html, module);
    }
    return html;
}

From source file:com.ettoremastrogiacomo.sktradingjava.starters.Temp.java

/**
 * Downloads the Euronext equities directory export and logs its rows.
 * <p>
 * Steps: fetch the directory page, extract the escaped download-popup link
 * embedded in it, fetch that popup, copy the {@code form_build_id} and
 * {@code form_id} input values into the POST parameters, submit the form and
 * log (at debug level) the first six columns of every response line that has
 * exactly 13 tab-separated fields.
 *
 * @throws IllegalStateException if the download link cannot be located in
 *         the directory page
 * @throws Exception on any failure raised by the HTTP helpers used here
 */
public static void fetchEuroNext() throws Exception {

    String u0 = "https://www.euronext.com/en/equities/directory";
    com.ettoremastrogiacomo.utils.HttpFetch httpf = new com.ettoremastrogiacomo.utils.HttpFetch();
    if (Init.use_http_proxy.equals("true")) {
        httpf.setProxy(Init.http_proxy_host, Integer.parseInt(Init.http_proxy_port), Init.http_proxy_user,
                Init.http_proxy_password);
    }
    String s = new String(httpf.HttpGetUrl(u0, Optional.empty(), Optional.empty()));

    // The link is embedded with escaped slashes (\/en\/popup\/...); fail with
    // a clear message instead of an index error when the page layout changed.
    int k1 = s.indexOf("\\/en\\/popup\\/data\\/download?");
    if (k1 < 0) {
        throw new IllegalStateException("Download link not found in " + u0);
    }
    int k2 = s.indexOf("\"", k1);
    if (k2 < 0) {
        throw new IllegalStateException("Unterminated download link in " + u0);
    }
    String u1 = s.substring(k1, k2 - 1);

    // Unescape the link: decode \u0026, drop the forward slashes, then turn
    // the remaining backslashes into the real path separators.
    u1 = u1.replace("\\u0026", "&");
    u1 = "https://www.euronext.com" + u1.replace("/", "");
    u1 = u1.replace("\\", "/");
    LOG.debug(u1);

    s = new String(httpf.HttpGetUrl(u1, Optional.empty(), Optional.empty()));
    Document doc = Jsoup.parse(s);

    // Form fields for the export request; the two form ids are read from the popup.
    java.util.HashMap<String, String> vmap = new java.util.HashMap<>();
    vmap.put("format", "1");
    vmap.put("layout", "2");
    vmap.put("decimal_separator", "1");
    vmap.put("date_format", "1");
    vmap.put("op", "Go");
    doc.select("input[name=\"form_build_id\"]")
            .forEach(input -> vmap.put("form_build_id", input.attr("value")));
    doc.select("input[name=\"form_id\"]")
            .forEach(input -> vmap.put("form_id", input.attr("value")));

    HttpURLConnection post = httpf.sendPostRequest(u1, vmap);
    StringBuilder response = new StringBuilder();
    try (BufferedReader in = new BufferedReader(new InputStreamReader(post.getInputStream()))) {
        String inputLine;
        while ((inputLine = in.readLine()) != null) {
            response.append("\n").append(inputLine);
        }
    }

    // Only lines with exactly 13 tab-separated fields are treated as data rows.
    for (String line : response.toString().split("\n")) {
        String[] row = line.split("\t");
        if (row.length == 13) {
            LOG.debug(row[0] + "\t" + row[1] + "\t" + row[2] + "\t" + row[3] + "\t" + row[4] + "\t" + row[5]);
        }
    }
}

From source file:FILER.java

/**
 * Parses an HTML file and extracts the strings considered important.
 * <p>
 * Side effect: the static {@code Text} field is reset and then set to a
 * space, the full document text, and every non-empty image {@code alt} text
 * (each preceded by a space).
 *
 * @param f the HTML file, read as UTF-8
 * @return a four-element array: [0] the document title, [1] the text of all
 *         heading elements (each followed by a space), [2] the non-empty
 *         image {@code alt} texts (each preceded by a space), [3] always the
 *         empty string here (this method does not fill it)
 * @throws IOException if the file cannot be read or parsed
 */
public static String[] Dealing_Files(File f) throws IOException {
    Text = "";
    String[] Importants = { "", "", "", "" };
    org.jsoup.nodes.Document doc = Jsoup.parse(f, "UTF-8");
    Importants[0] = doc.title();

    // Collect the text of every heading level. A missing level no longer
    // stops the scan, so a document that starts at <h2> still contributes its
    // headings. The upper bound is kept from the original loop.
    StringBuilder allHeaders = new StringBuilder();
    for (int level = 1; level < 20; level++) {
        for (Element heading : doc.getElementsByTag("h" + level)) {
            allHeaders.append(heading.text()).append(' ');
        }
    }
    Importants[1] = allHeaders.toString();

    // Full document text followed by the alt text of every image that has one.
    StringBuilder fullText = new StringBuilder(" ").append(doc.text());
    StringBuilder altTexts = new StringBuilder();
    for (Element image : doc.getElementsByTag("img")) {
        String alt = image.attr("alt");
        if (alt != null && !alt.isEmpty()) {
            fullText.append(' ').append(alt);
            altTexts.append(' ').append(alt);
        }
    }
    Importants[2] = altTexts.toString();
    Text = fullText.toString();
    return Importants;
}

From source file:me.vertretungsplan.parser.DaVinciParser.java

/**
 * Determines the URLs to fetch for a page, depending on which layout the
 * document matches. The first matching layout wins; when none matches, the
 * page itself is returned as the only URL.
 *
 * @param url the URL the document was loaded from; links are resolved against it
 * @param doc the parsed page
 * @return the resolved URLs, or a list holding only {@code url}
 * @throws IOException if a link cannot be resolved into a valid URL
 */
@NotNull
static List<String> getDayUrls(String url, Document doc) throws IOException {
    List<String> dayUrls = new ArrayList<>();

    // List of classes
    if (!doc.select("ul.classes").isEmpty()) {
        for (Element classLink : doc.select("ul.classes li a")) {
            dayUrls.add(new URL(new URL(url), classLink.attr("href")).toString());
        }
        return dayUrls;
    }

    // List of days in calendar view: the target is taken from the onclick handler
    if (!doc.select("ul.month").isEmpty()) {
        for (Element dayInput : doc.select("ul.month li input[onclick]")) {
            String target = urlFromOnclick(dayInput.attr("onclick"));
            if (target != null) {
                dayUrls.add(new URL(new URL(url), target).toString());
            }
        }
        return dayUrls;
    }

    // List of days in list view
    if (!doc.select("ul.day-index").isEmpty()) {
        for (Element dayLink : doc.select("ul.day-index li a")) {
            dayUrls.add(new URL(new URL(url), dayLink.attr("href")).toString());
        }
        return dayUrls;
    }

    // Table of classes (DaVinci 5)
    Elements tableLinks = doc.select("table td[align=left] a");
    if (!tableLinks.isEmpty()) {
        for (Element classLink : tableLinks) {
            dayUrls.add(new URL(new URL(url), classLink.attr("href")).toString());
        }
        return dayUrls;
    }

    // Single day
    dayUrls.add(url);
    return dayUrls;
}

From source file:io.jari.geenstijl.API.API.java

/**
 * Get article and comments (note that getArticles doesn't get the comments).
 * <p>
 * Comments that cannot be parsed (missing footer or paragraph, fewer than
 * three {@code |}-separated footer fields, or an unparsable date) are skipped
 * instead of aborting the whole article.
 *
 * @param url The direct url to the geenstijl article
 * @param context Used to read the configured domain from the "geenstijl"
 *                shared preferences and passed on to the article parser
 * @return Artikel The fetched article, with its comments attached
 * @throws IOException if the page cannot be fetched
 * @throws ParseException declared by the signature; presumably raised by the
 *                        article parsing helper - comment dates that fail to
 *                        parse are skipped, not thrown
 */
public static Artikel getArticle(String url, Context context) throws IOException, ParseException {
    ensureCookies();
    domain = context.getSharedPreferences("geenstijl", 0).getString("gsdomain", "www.geenstijl.nl");
    Log.i(TAG, "GETARTICLE STEP 1/2: Getting/parsing article page & images... " + url);
    Document document = Jsoup.connect(url).get();
    Element artikel_el = document.select("#content>article").first();
    Artikel artikel = parseArtikel(artikel_el, context);

    Log.i(TAG, "GETARTICLE STEP 2/2: Parsing comments...");
    // One formatter for all comments; date and time tokens are concatenated
    // without a separator before parsing.
    SimpleDateFormat commentDateFormat = new SimpleDateFormat("dd-MM-yyHH:mm", Locale.US);
    ArrayList<Comment> comments = new ArrayList<Comment>();
    int i = 0;
    Elements comments_el = document.select("#comments article");
    for (Element comment_el : comments_el) {
        i++;

        Element footer = comment_el.select("footer").first();
        Element paragraph = comment_el.select("p").first();
        if (footer == null || paragraph == null) {
            // Incomplete markup: skip this comment rather than fail the article.
            continue;
        }

        // The footer is expected to hold "author | date | time".
        StringTokenizer footer_items = new StringTokenizer(footer.text(), "|");
        if (footer_items.countTokens() < 3) {
            continue;
        }

        Comment comment = new Comment();
        // The element id is the comment number preceded by one character.
        comment.id = Integer.parseInt(comment_el.attr("id").substring(1));
        comment.auteur = footer_items.nextToken().trim();

        try {
            comment.datum = commentDateFormat
                    .parse(footer_items.nextToken().trim() + footer_items.nextToken().trim());
        } catch (ParseException parseEx) {
            // A pipe character in the user name shifts the footer fields so
            // the date no longer parses; such comments are dropped.
            continue;
        }

        comment.inhoud = paragraph.html();

        Log.d(TAG + ".perf", "CommentParser: Parsed " + comment.id + ": " + i + "/" + comments_el.size());

        comments.add(comment);
    }

    artikel.comments = comments.toArray(new Comment[comments.size()]);

    Log.i(TAG, "GETARTICLE: DONE");

    return artikel;
}

From source file:com.normalexception.app.rx8club.html.HtmlFormUtils.java

/**
 * Report the value inside of an input element, looked up by its name
 * attribute.
 * @param pan   The panel where all of the input elements reside
 * @param name   The name of the input to get the value for
 * @return      The string value of the input, or an empty string when the
 *              panel is null
 */
public static String getInputElementValueByName(Document pan, String name) {
    // Explicit check instead of catching NullPointerException.
    if (pan == null) {
        return "";
    }
    return pan.select("input[name=" + name + "]").attr("value");
}

From source file:com.normalexception.app.rx8club.html.HtmlFormUtils.java

/**
 * Report the value inside of an input element, looked up by its id
 * attribute.
 * @param pan   The panel where all of the input elements reside
 * @param name   The id of the input to get the value for
 * @return      The string value of the input, or an empty string when the
 *              panel is null
 */
public static String getInputElementValueById(Document pan, String name) {
    // Explicit check instead of catching NullPointerException.
    if (pan == null) {
        return "";
    }
    return pan.select("input[id=" + name + "]").attr("value");
}

From source file:mailbox.CreationViaEmail.java

/**
 * Replaces {@code cid:} references in {@code src} and {@code href}
 * attributes with the URL of the matching attachment.
 * <p>
 * References whose content id has no entry in {@code attachments} are left
 * untouched.
 *
 * @param html the HTML to process
 * @param attachments attachments keyed by content id (the part after {@code cid:})
 * @return the inner HTML of the first {@code body} element after the
 *         replacement, or the whole document HTML when there is no body
 */
private static String replaceCidWithAttachments(String html, Map<String, Attachment> attachments) {
    final String cidScheme = "cid:";
    Document doc = Jsoup.parse(html);
    String[] attrNames = { "src", "href" };

    for (String attrName : attrNames) {
        Elements tags = doc.select("*[" + attrName + "]");
        for (Element tag : tags) {
            String uriString = tag.attr(attrName).trim();

            // Case-insensitive prefix test that does not depend on the default
            // locale (toLowerCase() maps "I" to a dotless i under a Turkish locale).
            if (!uriString.regionMatches(true, 0, cidScheme, 0, cidScheme.length())) {
                continue;
            }

            String cid = uriString.substring(cidScheme.length());

            // Single lookup; unknown ids and null entries are skipped.
            Attachment attachment = attachments.get(cid);
            if (attachment == null) {
                continue;
            }

            Long id = attachment.id;
            tag.attr(attrName, controllers.routes.AttachmentApp.getFile(id).url());
        }
    }

    Elements bodies = doc.getElementsByTag("body");

    if (bodies.size() > 0) {
        return bodies.get(0).html();
    } else {
        return doc.html();
    }
}

From source file:models.NotificationMail.java

/**
 * Make every link to be absolute and to have 'rel=noreferrer' if
 * necessary./*  w  w  w  . j a  v a2s.co  m*/
 */
public static void handleLinks(Document doc) {
    String hostname = Config.getHostname();
    String[] attrNames = { "src", "href" };
    Boolean noreferrer = play.Configuration.root().getBoolean("application.noreferrer", false);

    for (String attrName : attrNames) {
        Elements tags = doc.select("*[" + attrName + "]");
        for (Element tag : tags) {
            boolean isNoreferrerRequired = false;
            String uriString = tag.attr(attrName);

            if (noreferrer && attrName.equals("href")) {
                isNoreferrerRequired = true;
            }

            try {
                URI uri = new URI(uriString);

                if (!uri.isAbsolute()) {
                    tag.attr(attrName, Url.create(uriString));
                }

                if (uri.getHost() == null || uri.getHost().equals(hostname)) {
                    isNoreferrerRequired = false;
                }
            } catch (URISyntaxException e) {
                play.Logger.info("A malformed URI is detected while" + " checking an email to send", e);
            }

            if (isNoreferrerRequired) {
                tag.attr("rel", tag.attr("rel") + " noreferrer");
            }
        }
    }
}