Example usage for org.jsoup.nodes Document html

List of usage examples for org.jsoup.nodes Document html

Introduction

On this page you can find example usages of the html() method of org.jsoup.nodes.Document.

Prototype

public String html() 

Source Link

Document

Retrieves the element's inner HTML.

Usage

From source file:org.b3log.symphony.service.LinkForgeMgmtService.java

/**
 * Forges the specified URL./* ww  w.  ja  v  a2 s  .  c  o m*/
 *
 * @param url the specified URL
 * @param userId the specified user id
 */
public void forge(final String url, final String userId) {
    String html;
    String baseURL;
    try {
        final Document doc = Jsoup.connect(url).timeout(5000).userAgent(Symphonys.USER_AGENT_BOT).get();

        doc.select("body").prepend("<a href=\"" + url + "\">" + url + "</a>"); // Add the specified URL itfself

        html = doc.html();

        baseURL = doc.baseUri();
    } catch (final Exception e) {
        LOGGER.log(Level.ERROR, "Parses link [" + url + "] failed", e);

        return;
    }

    final List<JSONObject> links = Links.getLinks(baseURL, html);
    final List<JSONObject> cachedTags = tagCache.getTags();

    final Transaction transaction = linkRepository.beginTransaction();
    try {
        for (final JSONObject lnk : links) {
            final String addr = lnk.optString(Link.LINK_ADDR);
            JSONObject link = linkRepository.getLink(addr);

            if (null == link) {
                link = new JSONObject();
                link.put(Link.LINK_ADDR, lnk.optString(Link.LINK_ADDR));
                link.put(Link.LINK_BAD_CNT, 0);
                link.put(Link.LINK_BAIDU_REF_CNT, 0);
                link.put(Link.LINK_CLICK_CNT, 0);
                link.put(Link.LINK_GOOD_CNT, 0);
                link.put(Link.LINK_SCORE, 0);
                link.put(Link.LINK_SUBMIT_CNT, 0);
                link.put(Link.LINK_TITLE, lnk.optString(Link.LINK_TITLE));
                link.put(Link.LINK_TYPE, Link.LINK_TYPE_C_FORGE);

                LOGGER.info(link.optString(Link.LINK_ADDR) + "____" + link.optString(Link.LINK_TITLE));
                linkRepository.add(link);

                final JSONObject linkCntOption = optionRepository.get(Option.ID_C_STATISTIC_LINK_COUNT);
                final int linkCnt = linkCntOption.optInt(Option.OPTION_VALUE);
                linkCntOption.put(Option.OPTION_VALUE, linkCnt + 1);
                optionRepository.update(Option.ID_C_STATISTIC_LINK_COUNT, linkCntOption);
            } else {
                link.put(Link.LINK_BAIDU_REF_CNT, lnk.optInt(Link.LINK_BAIDU_REF_CNT));
                link.put(Link.LINK_TITLE, lnk.optString(Link.LINK_TITLE));
                link.put(Link.LINK_SCORE, lnk.optInt(Link.LINK_BAIDU_REF_CNT)); // XXX: Need a score algorithm

                linkRepository.update(link.optString(Keys.OBJECT_ID), link);
            }

            final String linkId = link.optString(Keys.OBJECT_ID);
            final double linkScore = link.optDouble(Link.LINK_SCORE, 0D);
            String title = link.optString(Link.LINK_TITLE) + " " + link.optString(Link.LINK_T_KEYWORDS);
            title = Pangu.spacingText(title);
            String[] titles = title.split(" ");
            titles = Strings.trimAll(titles);

            for (final JSONObject cachedTag : cachedTags) {
                final String tagId = cachedTag.optString(Keys.OBJECT_ID);

                final String tagTitle = cachedTag.optString(Tag.TAG_TITLE);
                if (!Strings.containsIgnoreCase(tagTitle, titles)) {
                    continue;
                }

                final JSONObject tag = tagRepository.get(tagId);

                // clean
                tagUserLinkRepository.removeByTagIdUserIdAndLinkId(tagId, userId, linkId);

                // re-add
                final JSONObject tagLinkRel = new JSONObject();
                tagLinkRel.put(Tag.TAG_T_ID, tagId);
                tagLinkRel.put(UserExt.USER_T_ID, userId);
                tagLinkRel.put(Link.LINK_T_ID, linkId);
                tagLinkRel.put(Link.LINK_SCORE, linkScore);
                tagUserLinkRepository.add(tagLinkRel);

                // refresh link score
                tagUserLinkRepository.updateTagLinkScore(tagId, linkId, linkScore);

                // re-calc tag link count
                final int tagLinkCnt = tagUserLinkRepository.countTagLink(tagId);
                tag.put(Tag.TAG_LINK_CNT, tagLinkCnt);
                tagRepository.update(tagId, tag);
            }
        }

        transaction.commit();

        LOGGER.info("Forged link [" + url + "]");
    } catch (final Exception e) {
        if (transaction.isActive()) {
            transaction.rollback();
        }

        LOGGER.log(Level.ERROR, "Saves links failed", e);
    }
}

From source file:org.b3log.symphony.util.MarkdownsTestCase.java

/**
 * Parses a fixed HTML fragment with Jsoup, prints the serialized document and asserts that the output
 * contains the text {@code <body < body>}.
 * <p>
 * The fragment includes the malformed markup {@code <body </body>}; the assertion pins how Jsoup serializes
 * it (marked as a Jsoup bug by the original author). The literal below is a fixture and must be kept
 * character-for-character.
 */
@Test
public void jsoupParse() {
    final Document parse = Jsoup.parse("<p><strong><br>??????</strong></p>\n" + "<hr>\n"
            + "<p> <br> -?  java <br> -?  javascript<br> -???????<br> -?blog?<br> -  <br> -           </p>\n"
            + "<p> ???</p>\n"
            + "<p>1java ???a1?????<br>  A ) a1.java            B) a1.class             C) a1                    D) </p>\n"
            + "<p>2  Java<br>  A) ????<br>  B) ?????<br>  C) ????<br>  D)?</p>\n"
            + "<p>3  mainJava??main?<br>       A)public  static  void  main<br>      B)public  static  void   main String[]  args <br>   C)public  static int  mainString  [] arg <br>   D)public  void  mainString  arg[] </p>\n"
            + "<p>4 Java??????????????????? <br>A)??              B)               C)?         D)Java??</p>\n"
            + "<p>5  A?BB?CJava?? </p>\n"
            + "<pre><code>1.    A  a0=new  A(); \n" + "</code></pre><ol>\n" + "<li>A  a1 =new  B(); </li>\n"
            + "<li>A  a2=new  C();<br>       <br>A)?1<br>B)1?23<br>C)1?2?32?3?<br>D)1?23 </li>\n"
            + "</ol>\n"
            + "<p>6 ? <br>     1  String  s1=a?+b?;<br>    2   String  s2=new  Strings1<br>3    ifs1= =s2<br>4       System.out.println(= =  is succeeded?);<br>5     if (s1.equals(s2))<br>6        System.out.println(.equals() is succeeded?);<br>A)46<br>B)46?<br>C)64?<br> D)4?6? </p>\n"
            + "<p>7 ??18??</p>\n"
            + "<p>A)int B) char C) varchar D)text </p>\n"
            + "<p>8?<br>     A.)? B) C)  D) </p>\n"
            + "<p>9 HTML?Javascript?<br>   A)<javascript></javascript><br>   B)<script></script><br>   C) <head></head><br>   D) <body </body></p>\n"
            + "<p>10 ?</p>\n"
            + "<p><input id=\"btnGo\" type=\"button\" value=\"?\" class=\"btn\"/><br>A)  $(&quot;#btnGo&quot;)<br>B)   $(&quot;.btnGo&quot;)<br>C)  $(&quot;.btn&quot;)<br>    D)  $(&quot;input[type=&#39;button&#39;]&quot;)</p>\n"
            + "<p></p>\n"
            + "<p> <br>-Java?</p>\n"
            + "<p>-java?listMap,Set, Queue</p>\n"
            + "<p>-??<br>Spring:<br>springmvcstruts2):<br>Hibernatemybatis):<br>Jquery:<br>Bootstrap</p>\n"
            + "<p>-Javascript??</p>\n"
            + "<p>-sql<br>Select * from Table:<br>Where :<br>Having:<br>Group by:<br>Order by:</p>\n"
            + "<p> ?<br><br>-</p>\n"
            + "<p>-java(SSH)</p>\n"
            + "<p>-?</p>\n"
            + "<p>-?:</p>\n"
            + "<p>-?</p>\n" + "<hr>\n");

    // Serialize the parsed document back to HTML and dump it for manual inspection
    final String html = parse.html();
    System.out.println(html);
    Assert.assertTrue(html.contains("<body < body>")); // Jsoup bug
}

From source file:org.jasig.portlet.proxy.mvc.portlet.proxy.ProxyPortletController.java

/**
 * Renders the proxied content for this portlet.
 * <p>
 * Looks up the configured content service, builds and executes the proxy request, parses the returned content
 * with Jsoup, applies every configured {@link IDocumentFilter} in order and writes the resulting HTML to the
 * portlet output stream. Failures to build the request, fetch the content, parse it or write it are logged
 * and end the render without output; the proxy response is closed in every case once it has been obtained.
 *
 * @param request  the render request, also the source of the portlet preferences
 * @param response the render response the final HTML is written to
 */
@RenderMapping
public void showContent(final RenderRequest request, final RenderResponse response) {

    final PortletPreferences preferences = request.getPreferences();

    // locate the content service to use to retrieve our HTML content
    final String contentServiceKey = preferences.getValue(CONTENT_SERVICE_KEY, null);
    final IContentService contentService = applicationContext.getBean(contentServiceKey, IContentService.class);

    final IContentRequest proxyRequest;
    try {
        proxyRequest = contentService.getRequest(request);
    } catch (RuntimeException e) {
        // keep the cause in the log so a rejected request can be diagnosed
        log.error("URL was not in the proxy list", e);
        // TODO: how should we handle these errors?
        return;
    }

    // retrieve the HTML content
    final IContentResponse proxyResponse;
    try {
        proxyResponse = contentService.getContent(proxyRequest, request);
    } catch (Exception e) {
        log.error("Failed to proxy content", e);
        // TODO: error handling
        return;
    }

    // From here on the response is open: everything that can fail stays inside the try so that the
    // finally block always closes it (a failed filter bean lookup used to leak the response).
    try {
        // locate all filters configured for this portlet
        final List<IDocumentFilter> filters = new ArrayList<IDocumentFilter>();
        final String[] filterKeys = preferences.getValues(FILTER_LIST_KEY, new String[] {});
        for (final String filterKey : filterKeys) {
            final IDocumentFilter filter = applicationContext.getBean(filterKey, IDocumentFilter.class);
            filters.add(filter);
        }

        String sourceEncodingFormat = preferences.getValue(PREF_CHARACTER_ENCODING, CHARACTER_ENCODING_DEFAULT);
        final Document document = Jsoup.parse(proxyResponse.getContent(), sourceEncodingFormat,
                proxyResponse.getProxiedLocation());

        // apply each of the document filters in order
        for (final IDocumentFilter filter : filters) {
            filter.filter(document, proxyResponse, request, response);
        }

        // write out the final content
        OutputStream out = null;
        try {
            out = response.getPortletOutputStream();
            IOUtils.write(document.html(), out);
            out.flush();
        } catch (IOException e) {
            log.error("Exception writing proxied content", e);
        } finally {
            IOUtils.closeQuietly(out);
        }

    } catch (IOException e) {
        log.error("Error parsing HTML content", e);
    } finally {
        if (proxyResponse != null) {
            proxyResponse.close();
        }
    }

}

From source file:org.structr.web.function.HttpGetFunction.java

/**
 * Fetches the content behind an address.
 * <p>
 * Expected {@code sources}: {@code [address, contentType?, username-or-selector?, password?]}. When the
 * content type is {@code "text/html"} the page is fetched, parsed with Jsoup and returned as HTML; if a third
 * argument is present it is used as a selector and only the HTML of the selected elements is returned. For
 * any other content type the call is delegated to {@code getFromUrl} with the optional username/password.
 *
 * @param ctx     the action context, used to pick the usage/error flavour
 * @param entity  the entity the function is evaluated on, used for logging only
 * @param sources the function arguments as described above
 * @return the fetched content; an empty string if fetching failed (the failure is logged) or the HTML body
 *         was empty; the usage text if the arguments are invalid
 */
@Override
public Object apply(ActionContext ctx, final GraphObject entity, final Object[] sources) {

    if (sources == null || sources.length < 1 || sources.length > 4 || sources[0] == null) {

        logParameterError(entity, sources, ctx.isJavaScriptContext());

        return usage(ctx.isJavaScriptContext());
    }

    try {

        String address = sources[0].toString();
        String contentType = null;
        String username = null;
        String password = null;

        // The cases intentionally fall through: n arguments imply all arguments before them
        switch (sources.length) {

        case 4:
            password = sources[3].toString();
            // fall through
        case 3:
            username = sources[2].toString();
            // fall through
        case 2:
            contentType = sources[1].toString();
            break;
        }

        if (!"text/html".equals(contentType)) {

            return getFromUrl(ctx, address, username, password);
        }

        HttpClient client = getHttpClient();

        GetMethod get = new GetMethod(address);
        get.addRequestHeader("User-Agent", "curl/7.35.0");
        get.addRequestHeader("Connection", "close");
        get.getParams().setParameter("http.protocol.single-cookie-header", true);
        get.getParams().setCookiePolicy(CookiePolicy.BROWSER_COMPATIBILITY);

        get.setFollowRedirects(true);

        try {

            client.executeMethod(get);

            // The stream is null when the response carries no body
            final InputStream response = get.getResponseBodyAsStream();
            String code = response != null ? IOUtils.toString(response, "UTF-8") : "";

            // Nothing to parse; same result as before, but without raising and logging an exception
            if (code.isEmpty()) {

                return "";
            }

            // Skip BOM to workaround this Jsoup bug: https://github.com/jhy/jsoup/issues/348
            if (code.charAt(0) == '\uFEFF') {
                code = code.substring(1);
            }

            final Document doc = Jsoup.parse(code);

            if (sources.length > 2) {

                // third argument acts as a selector for HTML content
                return doc.select(sources[2].toString()).html();
            }

            return doc.html();

        } finally {

            // always hand the connection back to the connection manager, also on failure
            get.releaseConnection();
        }

    } catch (Throwable t) {

        logException(entity, t, sources);

    }

    return "";
}

From source file:org.uberfire.server.locale.GWTLocaleHeaderFilter.java

/**
 * Injects a {@code gwt:property} locale meta tag into the head of the response.
 * <p>
 * The rest of the chain runs against the wrapper returned by {@code getWrapper}; the wrapper's string form
 * is parsed with Jsoup, the meta tag for the locale returned by {@code getLocale} is appended to the head,
 * and the serialized document is written to the real response.
 *
 * @param request  the incoming request
 * @param response the real response that receives the modified document
 * @param chain    the remaining filter chain
 * @throws IOException      if the chain or writing the response fails
 * @throws ServletException if the chain fails
 */
@Override
public void doFilter(final ServletRequest request, final ServletResponse response, final FilterChain chain)
        throws IOException, ServletException {

    final CharResponseWrapper wrappedResponse = getWrapper((HttpServletResponse) response);
    chain.doFilter(request, wrappedResponse);

    final Locale locale = getLocale(request);
    final String injectedScript = "<meta name=\"gwt:property\" content=\"locale=" + locale.toString() + "\">";

    final Document document = Jsoup.parse(wrappedResponse.toString());
    document.head().append(injectedScript);
    final String output = document.html();

    // The content length is computed from the UTF-8 bytes, so the writer must encode as UTF-8 as well;
    // otherwise the declared length and the bytes actually sent can differ for non-ASCII content.
    // Must be set before getWriter() is called to take effect.
    response.setCharacterEncoding("UTF-8");

    final byte[] outputBytes = output.getBytes("UTF-8");
    response.setContentLength(outputBytes.length);
    response.getWriter().print(output);
}

From source file:org.xlrnet.metadict.engines.leo.LeoEngine.java

/**
 * The main method for querying a {@link SearchEngine}. This method will be called by the metadict core on incoming
 * search queries. The core will always try to parallelize the query as much as possible according to the specified
 * supported dictionaries of this engine.
 * <p>
 * Upon calling, the core will make sure that the language parameters of this method correspond exactly to a
 * supported {@link Dictionary} as described in the engine's {@link
 * FeatureSet}. However, an engine may also return results from a different
 * language. In this case, the core component will decide if the supplied results are useful.
 * <p>
 * Example:
 * If the engine says it supports a one-way german-english dictionary, this method will be called with the language
 * parameters inputLanguage=GERMAN, outputLanguage=ENGLISH and allowBothWay=false.
 * However, if the engine supports a bidirectional german-english dictionary, this method will be called with the
 * language parameters inputLanguage=GERMAN, outputLanguage=ENGLISH and allowBothWay=true.
 * <p>
 * This implementation builds a connection for the query, fetches the result document and hands it to
 * {@code processDocument}; {@code allowBothWay} is not used here.
 *
 * @param queryInput
 *         The query string i.e. word that should be looked up.
 * @param inputLanguage
 *         The input language of the query. This language must be specified as a dictionary's input language of
 *         this engine.
 * @param outputLanguage
 *         The expected output language of the query. This language must be specified as the output language of the
 *         same dictionary to which the given inputLanguage belongs.
 * @param allowBothWay
 *         True, if the engine may search in both directions. I.e. the queryInput can also be seen as the
 *         outputLanguage. The core will set this flag only if the engine declared a dictionary with matching input
 *         and output language. Otherwise the method will be called for each direction separately.
 * @return The results from the search query. You can use an instance of {@link EngineQueryResultBuilder}
 * to build this result list.
 * @throws Exception
 *         If fetching or processing the result document fails.
 */
@Override
public EngineQueryResult executeSearchQuery(String queryInput, Language inputLanguage, Language outputLanguage,
        boolean allowBothWay) throws Exception {
    Connection targetConnection = buildTargetConnection(queryInput, inputLanguage, outputLanguage);
    Document doc = targetConnection.get();

    // Serializing the whole document is expensive; only do it when the output is actually logged
    if (LOGGER.isDebugEnabled()) {
        LOGGER.debug(doc.html());
    }

    EngineQueryResultBuilder builder = processDocument(doc);

    return builder.build();
}

From source file:perflab.LoadrunnerWrapper.java

/**
 * Parses the LoadRunner analysis HTML report and records one {@code LoadRunnerTransaction} per summary row
 * that carries a transaction name.
 * <p>
 * Rows are taken from the table whose {@code summary} attribute is "Transactions statistics summary table".
 * An unreadable report is logged and otherwise ignored.
 *
 * @param htmlSummaryFile - load runner analysis html report file to parse
 * @param summaryFile     - location of summary file to be generated out of loadrunner html analysis
 *                        (not used by this method)
 */
protected void parseSummaryFile(String htmlSummaryFile, String summaryFile) {
    try {

        File input = new File(htmlSummaryFile);
        Document document = Jsoup.parse(input, "UTF-8");
        // NOTE(review): the report is parsed a second time from its own serialization; looks redundant,
        // kept unchanged to preserve the selection behaviour - confirm before removing.
        Document parse = Jsoup.parse(document.html());
        Elements table = parse.select("table").select("[summary=Transactions statistics summary table]");
        Elements rows = table.select("tr");

        getLog().info("number of rows in summary file=" + rows.size());

        for (Element row : rows) {

            String name = row.select("td[headers=LraTransaction Name]").select("span").text();

            if (!name.isEmpty()) {

                float avgRT = Float.valueOf(row.select("td[headers=LraAverage]").select("span").text());
                float minRT = Float.valueOf(row.select("td[headers=LraMinimum]").select("span").text());
                float maxRT = Float.valueOf(row.select("td[headers=LraMaximum]").select("span").text());
                // counts may contain thousands separators ("." or ","), strip them before parsing
                int passed = Integer.valueOf(row.select("td[headers=LraPass]").select("span").text()
                        .replace(".", "").replace(",", ""));
                int failed = Integer.valueOf(row.select("td[headers=LraFail]").select("span").text()
                        .replace(".", "").replace(",", ""));

                // Multiply before dividing: failed / total truncates to 0 in integer arithmetic.
                // A transaction without any executions is reported as 0% failed instead of dividing by zero.
                final long total = (long) failed + passed;
                int failedPrecentage = total == 0 ? 0 : (int) (failed * 100L / total);

                getLog().info("Saving Transaction [" + name + "]");
                this.transactions.add(
                        new LoadRunnerTransaction(name, minRT, avgRT, maxRT, passed, failed, failedPrecentage));
            }
        }

    } catch (IOException e) {
        getLog().error("Can't read LoadRunner Analysis html report " + e.getMessage());
    }

}

From source file:perflab.loadrunnerwrapperjenkins.LoadRunnerWrapper.java

/**
 * Reads the LoadRunner analysis HTML report and stores one {@code LoadRunnerTransaction} for every row of
 * the "Transactions statistics summary table" that has a transaction name.
 *
 * @param htmlSummaryFile - load runner analysis html report file to parse
 * @param summaryFile     - location of summary file to be generated out of loadrunner
 *                        html analysis
 */
protected void parseSummaryFile(String htmlSummaryFile, String summaryFile) {
    try {
        final Document rawReport = Jsoup.parse(new File(htmlSummaryFile), "UTF-8");
        final Document report = Jsoup.parse(rawReport.html());
        final Elements summaryRows = report.select("table")
                .select("[summary=Transactions statistics summary table]")
                .select("tr");

        logger.println("number of rows in summary file=" + summaryRows.size());

        for (final Element summaryRow : summaryRows) {
            final String transactionName = summaryRow.select("td[headers=LraTransaction Name]")
                    .select("span").text();

            // rows without a transaction name (e.g. header rows) carry no statistics
            if (transactionName.isEmpty()) {
                continue;
            }

            final float average = Float.parseFloat(
                    summaryRow.select("td[headers=LraAverage]").select("span").text());
            final float minimum = Float.parseFloat(
                    summaryRow.select("td[headers=LraMinimum]").select("span").text());
            final float maximum = Float.parseFloat(
                    summaryRow.select("td[headers=LraMaximum]").select("span").text());

            // strip "." and "," separators from the counts before parsing them
            final String passedText = summaryRow.select("td[headers=LraPass]").select("span").text();
            final int passedCount = Integer.parseInt(passedText.replace(".", "").replace(",", ""));
            final String failedText = summaryRow.select("td[headers=LraFail]").select("span").text();
            final int failedCount = Integer.parseInt(failedText.replace(".", "").replace(",", ""));

            this.transactions.add(new LoadRunnerTransaction(transactionName, minimum, average, maximum,
                    passedCount, failedCount));
        }
    } catch (IOException e) {
        logger.println("Can't read LoadRunner Analysis html report " + e.getMessage());
    }

}

From source file:psef.handler.HTMLFilter.java

/**
 * Filter the entire document/*w w w  . j a v a  2s .c  om*/
 * @return the filtered document
 * @throws PsefException 
 */
public String filter() throws PsefException {
    Document doc = Jsoup.parse(src);
    System.out.println("Filtering scripts");
    filterScripts(doc);
    System.out.println("Filtering styles");
    filterStyles(doc);
    System.out.println("Filtering links");
    filterLinks(doc);
    System.out.println("Filtering anchors");
    filterAnchors(doc);
    // write converted dom back to a string
    StringWriter sw = new StringWriter(src.length());
    PrintWriter writer = new PrintWriter(sw);
    writer.write(doc.html());
    writer.flush();
    writer.close();
    return sw.toString();
}

From source file:webcralwerproject1.Webcrawler.java

/**
 * Writes the given document as a UTF-8 HTML file into the repository directory of the current crawl.
 * <p>
 * Before writing, the {@code src} attribute of every {@code img} element is replaced with {@code "a"} and
 * {@code imagecount} is incremented once per image. Note that this modifies the passed document.
 * Failures are reported on standard output and not propagated.
 *
 * @param htmlDocument the document to store
 * @return the absolute path of the target file, or {@code null} if a failure occurred before the path was
 *         determined; the path is returned even if writing the file itself failed
 */
public String writeContent(Document htmlDocument) {
    String path = null;
    try {
        File file = new File(DirectoryName + "/" + crawlcount);
        if (!file.exists()) {
            // mkdirs also creates missing parent directories, so a fresh DirectoryName is not reported as
            // a failure
            if (file.mkdirs()) {
                System.out.println("Repository Directory is created!");
            } else {
                System.out.println("Failed to create directory!");
            }
        }
        File f = new File(file.getAbsolutePath() + "/" + MaxPage + "file.html");
        path = f.getAbsolutePath();

        // neutralise image sources and count the images seen
        Elements img = htmlDocument.getElementsByTag("img");
        for (Element el : img) {
            imagecount++;
            el.attr("src", "a");
        }

        FileUtils.writeStringToFile(f, htmlDocument.html(), "UTF-8");

    } catch (Exception e) {
        System.out.println("Inside writeContent Exception " + e);
    }
    System.out.println("Inside writeContent ");
    return path;
}