Example usage for org.jsoup.nodes Document html

List of usage examples for org.jsoup.nodes Document html

Introduction

On this page you can find usage examples for the html() method of org.jsoup.nodes.Document.

Prototype

public String html() 

Source Link

Document

Retrieves the element's inner HTML.

Usage

From source file:Main.java

/**
 * Parses the file {@code t2.html} as UTF-8, inserts a new {@code <area>}
 * element after every element matching {@code area#area1}, and prints the
 * resulting document HTML to standard output.
 *
 * @param args command line arguments (not used)
 * @throws Exception if reading or parsing the file fails
 */
public static void main(String[] args) throws Exception {
    final File source = new File("t2.html");
    final Document parsed = Jsoup.parse(source, "UTF-8");

    // Insert the new sibling right after each matched area element.
    parsed.select("area#area1").after("<area id=\"newArea\" />");

    final String rendered = parsed.html();
    System.out.println(rendered);
}

From source file:isc_415_practica_1.ISC_415_Practica_1.java

/**
 * Reads a URL from standard input, downloads the page with jsoup and prints
 * the number of lines, {@code p}, {@code img} and {@code form} tags found.
 * For every form it also prints its {@code input} elements, then POSTs the
 * value {@code "PRACTICA1"} for each input that has an id or a name back to
 * the same URL and prints the response status line.
 *
 * @param args the command line arguments (not used)
 */
public static void main(String[] args) {
    String urlString;
    Scanner input = new Scanner(System.in);
    Document doc;

    try {
        urlString = input.next();
        // "servlet" is a shortcut for the local test servlet.
        if (urlString.equals("servlet")) {
            urlString = "http://localhost:8084/ISC_415_Practica1_Servlet/client";
        }
        // Only an actual scheme prefix counts; a scheme appearing later in the
        // input (e.g. inside a query parameter) must not suppress the default.
        if (!urlString.startsWith("http://") && !urlString.startsWith("https://")) {
            urlString = "http://" + urlString;
        }
        doc = Jsoup.connect(urlString).get();
    } catch (Exception ex) {
        System.out.println("El URL ingresado no es valido.");
        return;
    }

    // NOTE(review): "Nmero" below looks like "Número" with a lost accented
    // character; the strings are kept unchanged here - confirm the file encoding.
    String[] plainTextDoc = new TextNode(doc.html(), "").getWholeText().split("\n");
    System.out.println(String.format("Nmero de lineas del documento: %d", plainTextDoc.length));
    System.out.println(String.format("Nmero de p tags: %d", doc.select("p").size()));
    System.out.println(String.format("Nmero de img tags: %d", doc.select("img").size()));
    System.out.println(String.format("Nmero de form tags: %d", doc.select("form").size()));

    int index = 1;

    ArrayList<NameValuePair> urlParameters = new ArrayList<>();
    for (Element e : doc.select("form")) {
        System.out.println(String.format("Form %d: Nmero de Input tags %d", index, e.select("input").size()));
        System.out.println(e.select("input"));

        for (Element formInput : e.select("input")) {
            // Compare string contents, not references: the id is preferred and
            // the name is the fallback when no id is present.
            String id = formInput.attr("id");
            String name = formInput.attr("name");
            if (id != null && !id.isEmpty()) {
                urlParameters.add(new BasicNameValuePair(id, "PRACTICA1"));
            } else if (name != null && !name.isEmpty()) {
                urlParameters.add(new BasicNameValuePair(name, "PRACTICA1"));
            }
        }

        index++;
    }

    if (!urlParameters.isEmpty()) {
        // try-with-resources closes the client (and its connections) even
        // when the request fails.
        try (CloseableHttpClient httpclient = HttpClients.createDefault()) {
            UrlEncodedFormEntity entity = new UrlEncodedFormEntity(urlParameters, Consts.UTF_8);
            HttpPost httpPost = new HttpPost(urlString);
            httpPost.setHeader("User-Agent", USER_AGENT);
            httpPost.setEntity(entity);
            HttpResponse response = httpclient.execute(httpPost);
            System.out.println(response.getStatusLine());
        } catch (IOException ex) {
            Logger.getLogger(ISC_415_Practica_1.class.getName()).log(Level.SEVERE, null, ex);
        }
    }
}

From source file:Main.java

/**
 * Converts an HTML string to XML-syntax (XHTML-style) markup using jsoup.
 * Script elements are replaced by a placeholder comment before parsing.
 *
 * @param html the HTML markup to convert
 * @return the parsed document serialized with XML output syntax
 */
public static String toXHTML(String html) {
    // (?is): let '.' span line breaks and match tag names case-insensitively.
    // [^>]* also covers script tags that carry attributes (type, src, ...).
    // The placeholder uses the well-formed comment terminator "-->".
    html = html.replaceAll("(?is)<script\\b[^>]*>.*?</script\\s*>", "<!-- removed scripts -->");
    final Document document = Jsoup.parse(html);
    document.outputSettings().syntax(Document.OutputSettings.Syntax.xml);
    return document.html();
}

From source file:com.facultyshowcase.app.ui.UIUtil.java

/**
 * Prepares user-supplied HTML for rendering: first runs it through
 * {@code XMLRenderer.parseWithRoot}, then sanitizes the result against a
 * relaxed jsoup whitelist that forces {@code rel="nofollow"} on links.
 *
 * @param html the HTML to process
 * @param request the current request, passed on to the renderer
 * @param response the current response, passed on to the renderer
 * @return the sanitized HTML, or an empty string when {@code html} is empty
 */
public static String scrubHtml(String html, Request request, Response response) {
    if (StringFactory.isEmptyString(html)) {
        return "";
    }

    // Link processing (session ids in URLs, resource link conversion, CMS link
    // externalization, ...) happens first. It is *not* sanitation and belongs
    // to rendering, never to persisting. Doing it before sanitizing avoids
    // having to whitelist internal URL protocols.
    String processed = html;
    try {
        processed = XMLRenderer.parseWithRoot(html, request, response);
    } catch (IOException e) {
        // Best effort: fall through and sanitize the unprocessed input.
        _logger.error("Unable to accept HTML: " + html, e);
    }

    // The input is untrusted, so only whitelisted HTML survives.
    final Document untrusted = Jsoup.parseBodyFragment(processed, "");

    // Enforcing rel="nofollow" keeps users from using the site as a link farm.
    final Whitelist allowed = Whitelist.relaxed().addEnforcedAttribute("a", "rel", "nofollow");

    // NOTE(review): html() is called on the whole cleaned Document, not on its
    // body - confirm that callers expect the document shell in the result.
    return new Cleaner(allowed).clean(untrusted).html();
}

From source file:cognition.common.utils.StringTools.java

/**
 * Parses the given HTML and appends {@code metaData} as markup to the end of
 * the document's {@code head} element.
 *
 * @param htmlText the HTML document text
 * @param metaData the markup to append to the head
 * @return the HTML of the modified document
 */
public static String addMetaDataToHtml(String htmlText, String metaData) {
    final Document parsed = Jsoup.parse(htmlText);
    parsed.head().append(metaData);
    final String enriched = parsed.html();
    return enriched;
}

From source file:models.NotificationMail.java

/**
 * Builds the HTML body of a notification mail: renders the message as
 * Markdown, wraps it in the mail template, then post-processes the links and
 * images of the parsed document.
 *
 * @param lang the language passed to the template renderer
 * @param message the Markdown message text
 * @param urlToView the URL passed to the template renderer
 * @param resource the resource passed to the template renderer
 * @return the HTML of the processed document
 */
private static String getHtmlMessage(Lang lang, String message, String urlToView, Resource resource) {
    final String renderedMessage = Markdown.render(message);
    final String templated = getRenderedHTMLWithTemplate(lang, renderedMessage, urlToView, resource);

    final Document mail = Jsoup.parse(templated);
    handleLinks(mail);
    handleImages(mail);

    return mail.html();
}

From source file:automation.Launcher.java

/**
 * Strips all markup from the given HTML while keeping line breaks: a newline
 * is emitted for each {@code br} and {@code div}, and two for each {@code p}.
 *
 * @param html the HTML to convert, may be {@code null}
 * @return the text with newlines, or {@code null} when {@code html} is {@code null}
 */
public static String br2nl(String html) {
    if (html == null) {
        return null;
    }

    final Document parsed = Jsoup.parse(html);
    // Without pretty printing, html() preserves line breaks and spacing.
    parsed.outputSettings(new Document.OutputSettings().prettyPrint(false));

    // Insert the two-character marker "\n" (backslash + n) as text; it is
    // turned into real newlines once the document has been serialized.
    parsed.select("p").prepend("\\n\\n");
    parsed.select("div").prepend("\\n");
    parsed.select("br").append("\\n");

    final String withNewlines = parsed.html().replace("\\n", "\n");

    // Whitelist.none() removes every tag, leaving only the text.
    final Document.OutputSettings rawOutput = new Document.OutputSettings().prettyPrint(false);
    return Jsoup.clean(withNewlines, "", Whitelist.none(), rawOutput);
}

From source file:ru.xxlabaza.popa.pack.PackingService.java

/**
 * Serializes the document, removes its comments via the comment removal
 * service and unescapes the entities in the result.
 *
 * @param document the document to process
 * @return the comment-free HTML with entities unescaped
 */
private String processHtml(Document document) {
    final String rawHtml = document.html();
    final String withoutComments = commentRemoveService.removeComments(rawHtml, HTML);
    return Parser.unescapeEntities(withoutComments, false);
}

From source file:de.limod.portals.AutoScout.java

/**
 * Fetches the AutoScout result page and extracts the cars from the JSON array
 * assigned to {@code articlesFromServer} in the page source.
 *
 * <p>At most {@code getMaxHits()} cars are returned. Only the JSON keys
 * {@code mk}, {@code vr}, {@code price_raw} and {@code ei} are read.
 *
 * @return the cars found; empty when the page contains no matching JSON array
 */
@Override
public List<Car> getCars() {
    Document page = super.getPage("AutoScout", AutoScout.HOST, this.getQuery());

    String content = page.html();

    // Locate the "articlesFromServer = [{ ... }]" assignment in the page source.
    Pattern pattern = Pattern.compile("articlesFromServer\\s+=\\s+\\Q[{\\E.*\\Q}]\\E");

    Matcher matcher = pattern.matcher(content);
    List<Car> cars = new ArrayList<>();
    if (matcher.find()) {
        String json = matcher.group().trim();
        // Drop the "articlesFromServer =" prefix; the pattern guarantees a '['.
        json = json.substring(json.indexOf('['));
        JSONArray carsArray = new JSONArray(json);

        for (Object carEntry : carsArray) {
            // Check the limit before adding so that never more than
            // getMaxHits() cars are collected.
            if (cars.size() >= this.getMaxHits()) {
                break;
            }

            JSONObject obj = (JSONObject) carEntry;
            String title = obj.getString("mk") + " " + obj.getString("vr");
            String price = String.valueOf(obj.getDouble("price_raw"));
            String id = String.valueOf(obj.getInt("ei"));
            String url = String.format(AutoScout.URL, id);

            Car c = new Car(title, "--", price, url, "AutoScout", id);
            c.setFound(new Date());

            cars.add(c);
        }
    }

    return cars;
}

From source file:com.betel.flowers.pdf.util.XMLtoHtml.java

/**
 * Parses the given HTML with jsoup, serializes it using XML output syntax and
 * unescapes the HTML 4 entities in the result.
 *
 * @param htmlString the HTML to check
 * @return the XML-syntax markup with HTML 4 entities unescaped
 * @throws IOException kept in the signature for existing callers; nothing in
 *         this method throws it
 */
public String checkHTML(String htmlString) throws IOException {
    // No try/catch: the previous handler only rethrew the same exception.
    Document docHtml = Jsoup.parse(htmlString);
    docHtml.outputSettings().syntax(Document.OutputSettings.Syntax.xml);
    return StringEscapeUtils.unescapeHtml4(docHtml.html());
}