Example usage for org.jsoup.nodes Document body

List of usage examples for org.jsoup.nodes Document body

Introduction

This page collects usage examples for the body() method of org.jsoup.nodes.Document.

Prototype

public Element body() 

Source Link

Document

Accessor to the document's body element.

Usage

From source file:mml.handler.post.MMLPostHTMLHandler.java

/**
 * Handle the request by writing everything out to scratch space
 * @param request/*  w w  w . ja  v a  2s .  com*/
 * @param response
 * @param urn
 * @throws MMLException 
 */
public void handle(HttpServletRequest request, HttpServletResponse response, String urn) throws MMLException {
    try {
        parseRequest(request);
        StringBuilder log = new StringBuilder();
        Document doc = Jsoup.parseBodyFragment(html);
        Element body = doc.body();
        parseBody(body);
        int totalWait = 0;
        while (Autosave.inProgress && totalWait < 100000) {
            Thread.sleep(4000);
            totalWait += 400;
        }
        if (totalWait >= 100000)
            throw new DbException("Save timed out");
        if (!Autosave.inProgress) {
            Autosave.lock = true;
            saveCortex(log);
            saveCorcode(log);
            saveMetadata(log);
            Autosave.lock = false;
        }
        System.out.println(log.toString());
    } catch (Exception e) {
        System.out.println(e.getMessage());
        Autosave.lock = false;
        throw new MMLException(e);
    }
}

From source file:com.thesmartweb.swebrank.WebParser.java

/**
 * Parse the url and get all the content
 * @param link_html the url to parse/* ww  w  .  j a v  a  2s . c  o m*/
 * @return The content parsed
 */
public String cleanhtml(String link_html) {
    try {
        Document doc = Jsoup.connect(link_html).timeout(10 * 1000).get();
        String title = doc.title();
        String mainbody = doc.body().text();
        Elements links = doc.select("a[href]");
        Elements media = doc.select("[src]");
        //fix link html to remove https:// or http:// and simple /
        if (link_html.substring(link_html.length() - 1, link_html.length()).equalsIgnoreCase("/")) {
            link_html = link_html.substring(0, link_html.length() - 1);
        }
        if (link_html.substring(0, 5).equalsIgnoreCase("https")) {
            link_html = link_html.substring(8);
        } else if (link_html.substring(0, 4).equalsIgnoreCase("http")) {
            link_html = link_html.substring(7);
        }
        String anchortext = "";
        String alttext = "";
        //-----get the anchor text of internal links
        for (Element link : links) {
            String str_check = link.attr("abs:href").toString();
            if (link.attr("abs:href").contains(link_html) && link.text().length() > 1) {
                anchortext = anchortext + link.text() + " ";
            }
        }
        //-------get alt text to internal images links
        for (Element medi : media) {
            if (medi.getElementsByTag("img").attr("src").toString().contains(link_html)) {
                alttext = alttext + " " + medi.getElementsByTag("img").attr("alt").toString();
            }
            if (medi.getElementsByTag("img").attr("src").toString().startsWith("/")) {
                alttext = alttext + " " + medi.getElementsByTag("img").attr("alt").toString();
            }
        }
        String content = mainbody + title + anchortext + alttext;

        return content;
    } catch (IOException ex) {
        Logger.getLogger(com.thesmartweb.swebrank.WebParser.class.getName()).log(Level.SEVERE, null, ex);
        String check = null;
        return check;
    } catch (NullPointerException ex) {
        Logger.getLogger(com.thesmartweb.swebrank.WebParser.class.getName()).log(Level.SEVERE, null, ex);
        String check = null;
        return check;
    } catch (Exception ex) {
        Logger.getLogger(com.thesmartweb.swebrank.WebParser.class.getName()).log(Level.SEVERE, null, ex);
        String check = null;
        return check;
    }

}

From source file:info.mikaelsvensson.devtools.sitesearch.SiteSearchPlugin.java

/**
 * Builds the index entry for one HTML file.
 * <p>
 * The text is taken from the element with id "contentBox", or from the
 * document body when no such element exists. It is reduced to simple text,
 * counted and filtered before being stored with the page title and the
 * file's path relative to the site output folder.
 *
 * @param file the HTML file to index, read as UTF-8
 * @return the index entry, or null when no content element is available or
 * the file cannot be read
 */
private IndexEntry createIndexEntry(final File file) {
    try {
        final Document parsed = Jsoup.parse(file, "UTF-8", "http://invalid.host");
        final Element contentBox = parsed.getElementById("contentBox");
        final Element source = (contentBox != null) ? contentBox : parsed.body();
        if (source == null) {
            return null;
        }
        final String plainText = Jsoup.clean(source.html(), Whitelist.simpleText());
        final Collection<WordCount> filtered = filterWordCount(getWordCount(plainText));
        return new IndexEntry(parsed.title(), getRelativePath(getSiteOutputFolder(), file), filtered);
    } catch (IOException e) {
        e.printStackTrace();
        return null;
    }
}

From source file:gui.InboxPanel.java

/**
 * Returns the text of {@code BodyTextPane} with its HTML markup removed.
 *
 * @return the text content of the parsed body element
 */
private String getTextBody() {
    return Jsoup.parseBodyFragment(BodyTextPane.getText()).body().text();
}

From source file:com.crawler.app.run.CrawlSite.java

/**
 * This function is called when a page is fetched and ready to be processed
 * by your program./*from   www.  ja  v  a2s. c o  m*/
 */

public org.jsoup.nodes.Element convertUrlToDocument(String url) {
    try {

        Connection.Response response =

                Jsoup.connect(url)
                        //enable for error urls
                        .ignoreHttpErrors(true)
                        //MAXIMUN TIME
                        .timeout(50000)
                        //This is to prevent producing garbage by attempting to parse a JPEG binary image
                        .ignoreContentType(true).execute();

        int status = response.statusCode();
        //after done
        if (status == 200) {
            org.jsoup.nodes.Document doc = response.parse();
            Element body = doc.body();
            return body;
        } else {
            return null;
        }
    } catch (SocketTimeoutException se) {

        System.out.println("getContentOnly: SocketTimeoutException");
        System.out.println(se.getMessage());
        return null;
    }

    catch (Exception e) {

        System.out.println("getContentOnly: Exception");
        e.printStackTrace();
        return null;
    }
}

From source file:fr.eolya.extraction.tika.TikaWrapper.java

/**
 * Converts a PDF stream with the external pdftotext tool and stores the
 * outcome in the {@code text} and {@code meta2} fields.
 * <p>
 * Nothing happens when {@code input} is null or {@code pdfToTextPath} is
 * null or empty. On any exception the stack trace is printed and both
 * {@code text} and {@code meta2} are set to null. The temporary files are
 * removed on every exit path.
 *
 * @param input the PDF content; copied to a temporary file before conversion
 */
private void processWithPdfToText(InputStream input) {
    File tempFile = null;
    File tempFile2 = null;
    try {
        if (input != null && pdfToTextPath != null && !"".equals(pdfToTextPath)) {
            // Get a local copy of the file
            tempFile = createTempFile("tmp", ".pdf", tmpPath);
            if (!writeToFile(tempFile, input))
                return;

            meta2 = new HashMap<String, String>();
            meta2.put(META_CONTENTSIZE, String.valueOf(tempFile.length()));

            tempFile2 = createTempFile("tmp", ".html", tmpPath);

            Shell sh = new Shell();

            // Convert with PDFTOTEXT - pdftotext -enc UTF-8 -raw -q -htmlmeta -eol unix in.pdf out.html
            sh.exec(pdfToTextPath, "-enc", "UTF-8", "-raw", "-q", "-htmlmeta", "-eol", "unix",
                    tempFile.getAbsolutePath(), tempFile2.getAbsolutePath()).consumeAsString();

            // Load in string and add the <meta http-equiv='Content-Type' content='text/html; charset=utf-8'> line
            StringBuilder sb = new StringBuilder();
            BufferedReader br1 = new BufferedReader(
                    new InputStreamReader(new FileInputStream(tempFile2), "UTF-8"));
            try {
                // readLine() returning null is the end-of-file signal;
                // ready() only reports whether a read would block
                String line;
                while ((line = br1.readLine()) != null) {
                    sb.append(line).append("\n");
                    if ("</head>".equals(line)) {
                        sb.append("<meta http-equiv='Content-Type' content='text/html; charset=utf-8'>")
                                .append("\n");
                    }
                }
            } finally {
                br1.close();
            }

            meta2.put(META_CONTENTTYPE, CONTENT_TYPE_PDF);

            text = sb.toString();

            Document doc = Jsoup.parse(text);
            if (doc != null) {
                meta2.put(META_TITLE, doc.select("title").text());
                meta2.put(META_AUTHOR, getMetaContent(doc, "Author"));
                String creationDate = getMetaContent(doc, "CreationDate");
                if (creationDate != null) {
                    String created = pdfDateToIso(creationDate);
                    if (created != null) {
                        meta2.put(META_CREATED, created);
                    }
                }
                if (OUTPUT_FORMAT_TEXT.equals(outputFormat)) {
                    Document doc2 = new Cleaner(Whitelist.basic()).clean(doc);
                    text = doc2.body().text();
                }
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
        text = null;
        meta2 = null;
    } finally {
        // runs for the early return and for failures as well as for success
        if (tempFile != null && tempFile.exists())
            tempFile.delete();
        if (tempFile2 != null && tempFile2.exists())
            tempFile2.delete();
    }
}

/**
 * Reformats a PDF date such as 20130322143113Z00'00' or 20130322143113+02'00'
 * as 2013-03-22T14:31:13Z.
 * <p>
 * The fields are taken from the matched digits, so a value with leading
 * characters before the date is still read correctly. As before, a "+hh'mm'"
 * offset is not applied: the digits are copied unchanged and labelled "Z".
 *
 * @param creationDate the raw CreationDate value
 * @return the reformatted date, or null when the value has no such date
 */
private String pdfDateToIso(String creationDate) {
    Matcher m = Pattern.compile(
            "([0-9]{4})([0-9]{2})([0-9]{2})([0-9]{2})([0-9]{2})([0-9]{2})[Z+][0-9]{2}'[0-9]{2}'")
            .matcher(creationDate);
    if (!m.find()) {
        return null;
    }
    return String.format("%1$s-%2$s-%3$sT%4$s:%5$s:%6$sZ",
            m.group(1), m.group(2), m.group(3), m.group(4), m.group(5), m.group(6));
}

From source file:edu.ucla.cs.scai.swim.qa.ontology.dbpedia.DBpediaOntologyOld.java

/**
 * Loads the ontology.
 * <p>
 * Steps, in order: fetch the class page at {@code DBPEDIA_CLASSES_URL} and
 * traverse its hierarchy into {@code categoryMap}; index the categories by
 * uri; process every {@code *.csv.gz} file in {@code DBPEDIA_CSV_FOLDER}
 * whose name matches a category; create the typicality evaluator; read
 * {@code SUPERPAGES_FILE} to update the most popular entity per category.
 *
 * @throws IllegalStateException when the class page cannot be fetched
 * (previously this surfaced as a NullPointerException)
 */
private DBpediaOntologyOld() {

    Document doc;
    try {
        doc = Jsoup.connect(DBPEDIA_CLASSES_URL).get();
    } catch (IOException ex) {
        Logger.getLogger(DBpediaOntologyOld.class.getName()).log(Level.SEVERE, null, ex);
        // there is no hierarchy to traverse without the page; report the real cause
        throw new IllegalStateException("Could not load " + DBPEDIA_CLASSES_URL, ex);
    }

    traverseHierarchy(doc.body().children().get(1).children().get(1), root, categoryMap);

    for (DBpediaCategory c : categoryMap.values()) {
        categoriesByUri.put(c.uri, c);
    }

    File dir = new File(DBPEDIA_CSV_FOLDER);
    File[] csvFiles = dir.listFiles();
    if (csvFiles == null) {
        // listFiles() returns null when the path is not a readable directory
        Logger.getLogger(DBpediaOntologyOld.class.getName()).log(Level.SEVERE,
                "Cannot list files in " + DBPEDIA_CSV_FOLDER);
        csvFiles = new File[0];
    }
    for (File f : csvFiles) {
        if (f.isFile() && f.getName().endsWith(".csv.gz")) {
            String label = f.getName().replace(".csv.gz", "");
            DBpediaCategory category = categoryMap.get(label);
            System.out.println("Processing category " + label);
            if (category == null) {
                System.out.println("Category " + label + " not found");
                continue;
            }
            // the file is opened only once the category is known, so nothing
            // is left open for skipped files
            BufferedReader in = null;
            try {
                in = new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(f))));
                processFile(in, category, attributeMap);
            } catch (Exception ex) {
                Logger.getLogger(DBpediaOntologyOld.class.getName()).log(Level.SEVERE, null, ex);
            } finally {
                closeQuietly(in);
            }
            System.out.println(category.domainOfAttributes.size() + " attributes found");
            for (DBpediaAttribute a : category.domainOfAttributes) {
                System.out.println(a.getName());
            }
        }
    }
    try {
        typicalityEvaluator = new TypicalityEvaluator(DBPEDIA_CSV_FOLDER + "counts.bin");
    } catch (IOException ex) {
        Logger.getLogger(DBpediaOntologyOld.class.getName()).log(Level.SEVERE, null, ex);
    } catch (ClassNotFoundException ex) {
        Logger.getLogger(DBpediaOntologyOld.class.getName()).log(Level.SEVERE, null, ex);
    }

    BufferedReader superpages = null;
    try {
        superpages = new BufferedReader(new FileReader(SUPERPAGES_FILE));
        // each line holds: category, superpage, count - separated by tabs or spaces
        String l = superpages.readLine();
        while (l != null) {
            StringTokenizer st = new StringTokenizer(l, "\t ");
            String category = st.nextToken();
            String superpage = st.nextToken();
            Integer count = Integer.parseInt(st.nextToken());
            DBpediaCategory c = categoryMap.get(category);
            if (c != null) {
                c.updateMostPopularEntity(superpage, count);
            }
            l = superpages.readLine();
        }
        for (DBpediaCategory c : categoryMap.values()) {
            if (c.getMostPopularEntity() != null) {
                c.updateAncestorsPopularity();
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        closeQuietly(superpages);
    }
}

/**
 * Closes a reader, logging a failure to close instead of propagating it.
 *
 * @param reader the reader to close; may be null
 */
private static void closeQuietly(BufferedReader reader) {
    if (reader == null) {
        return;
    }
    try {
        reader.close();
    } catch (IOException ex) {
        Logger.getLogger(DBpediaOntologyOld.class.getName()).log(Level.WARNING, null, ex);
    }
}

From source file:me.vertretungsplan.parser.UntisCommonParser.java

/**
 * Splits a monitor page that may hold several days into one document per
 * day, parses each with {@code parseMonitorDay} and adds the result to the
 * schedule.
 * <p>
 * Three layouts are handled: more than one {@code .mon_head} element, more
 * than one {@code .mon_title} element, and otherwise the whole document is
 * parsed as a single day.
 *
 * @param v    the schedule that receives the parsed days
 * @param doc  the monitor page; NOTE(review): some nodes are appended to the
 *             per-day documents without being cloned, which in jsoup moves
 *             them out of {@code doc} - confirm callers do not reuse it
 * @param data passed through unchanged to {@code parseMonitorDay}
 * @throws JSONException declared by this method; not raised directly here
 * @throws CredentialInvalidException declared by this method; not raised directly here
 */
void parseMultipleMonitorDays(SubstitutionSchedule v, Document doc, JSONObject data)
        throws JSONException, CredentialInvalidException {
    if (doc.select(".mon_head").size() > 1) {
        // Layout 1: one .mon_head per day
        for (int j = 0; j < doc.select(".mon_head").size(); j++) {
            // empty html/head/body skeleton that collects the j-th day
            Document doc2 = Document.createShell(doc.baseUri());
            doc2.body().appendChild(doc.select(".mon_head").get(j).clone());
            Element next = doc.select(".mon_head").get(j).nextElementSibling();
            if (next != null && next.tagName().equals("center")) {
                // the day's title and tables sit inside the following <center>
                doc2.body().appendChild(next.select(".mon_title").first().clone());
                if (next.select("table:has(tr.list)").size() > 0) {
                    // not cloned: the table is taken out of the source document
                    doc2.body().appendChild(next.select("table:has(tr.list)").first());
                }
                if (next.select("table.info").size() > 0) {
                    // not cloned: the table is taken out of the source document
                    doc2.body().appendChild(next.select("table.info").first());
                }
            } else if (doc.select(".mon_title").size() - 1 >= j) {
                // no <center> wrapper: pair the j-th title with the j-th list table
                doc2.body().appendChild(doc.select(".mon_title").get(j).clone());
                doc2.body().appendChild(doc.select("table:has(tr.list)").get(j).clone());
            } else {
                // nothing to pair with this header
                continue;
            }
            SubstitutionScheduleDay day = parseMonitorDay(doc2, data);
            v.addDay(day);
        }
    } else if (doc.select(".mon_title").size() > 1) {
        // Layout 2: one .mon_title per day
        for (int j = 0; j < doc.select(".mon_title").size(); j++) {
            Document doc2 = Document.createShell(doc.baseUri());
            doc2.body().appendChild(doc.select(".mon_title").get(j).clone());
            Element next = doc.select(".mon_title").get(j).nextElementSibling();
            // Collect the siblings after the title up to the next <center>.
            // NOTE(review): the loop only advances because appendChild moves
            // `next` out of doc, so the same lookup then yields the following
            // sibling - do not add clone() here without changing the lookup.
            while (next != null && !next.tagName().equals("center")) {
                doc2.body().appendChild(next);
                next = doc.select(".mon_title").get(j).nextElementSibling();
            }
            SubstitutionScheduleDay day = parseMonitorDay(doc2, data);
            v.addDay(day);
        }
    } else {
        // Layout 3: the page holds a single day
        SubstitutionScheduleDay day = parseMonitorDay(doc, data);
        v.addDay(day);
    }
}

From source file:org.wallride.web.support.Posts.java

/**
 * Appends a width query parameter to the {@code src} of media images.
 * <p>
 * For every {@code img} whose src starts with the configured media url
 * prefix and whose style attribute contains {@code width: Npx;}, the src
 * becomes {@code src + "?w=" + 2N}. Other images are left unchanged.
 *
 * @param html the html to process
 * @return the inner html of the processed document's body
 */
protected String parse(String html) {
    Document document = Jsoup.parse(html);
    // compiled once instead of once per image
    Pattern widthPattern = Pattern.compile("width: ([0-9]+)px;");
    for (Element element : document.select("img")) {
        String src = element.attr("src");
        if (!src.startsWith(wallRideProperties.getMediaUrlPrefix())) {
            continue;
        }
        Matcher matcher = widthPattern.matcher(element.attr("style"));
        if (!matcher.find()) {
            continue;
        }
        try {
            // long arithmetic: doubling a large int width would overflow
            long doubledWidth = Long.parseLong(matcher.group(1)) * 2;
            element.attr("src", src + "?w=" + doubledWidth);
        } catch (NumberFormatException ignored) {
            // a width with too many digits to parse: leave this image's
            // src as it is instead of failing the whole conversion
        }
    }
    return document.body().html();
}

From source file:app.data.parse.WebPageUtil.java

/**
 * Looks up the title and description of a web page, using the cache when
 * one is supplied.
 * <p>
 * Title: og:title, then twitter:title, then the document title.
 * Description: og:description, then twitter:description, then the
 * description meta tag, then the first paragraph of at least 20 characters
 * (or the last paragraph when none is that long). The description is
 * passed through {@code ellipsis} with a limit of 140.
 *
 * @param url the page address
 * @param urlInfoCache optional cache keyed by address; may be null
 * @return the cached or newly built page info
 * @throws IOException declared for failures while requesting the page
 */
public static WebPageInfo parse(String url, Cache<String, WebPageInfo> urlInfoCache) throws IOException {
    // hit toutiao.io
    // fixme http://toutiao.io/shares/640539/url
    final String original = url.startsWith("https://toutiao.io/posts/")
            ? url.replace("/posts/", "/k/")
            : url;

    // check cache
    if (urlInfoCache != null) {
        WebPageInfo cached = urlInfoCache.getIfPresent(original);
        if (cached != null) {
            return cached;
        }
    }
    WebPageInfo info = new WebPageInfo();
    info.url = original;

    // attach url
    Document doc = requestUrl(info.url);
    info.url = doc.baseUri(); // or doc.location()

    // hit gold.xitu.io: follow the share link to the original page
    if (info.url.startsWith("http://gold.xitu.io/entry/")) {
        info.url = doc.select("div[class=ellipsis]").select("a[class=share-link]").attr("href");

        // reconnect
        doc = requestUrl(info.url);
        info.url = doc.baseUri(); // or doc.location()
    }

    info.url = smartUri(info.url);

    // get title: each source is consulted only while the title is still empty
    info.title = doc.select("meta[property=og:title]").attr("content");
    if (StringUtils.isEmpty(info.title)) {
        info.title = doc.select("meta[property=twitter:title]").attr("content");
    }
    if (StringUtils.isEmpty(info.title)) {
        info.title = doc.title();
    }

    // get desc: same fallback scheme as the title
    info.description = doc.select("meta[property=og:description]").attr("content");
    if (StringUtils.isEmpty(info.description)) {
        info.description = doc.select("meta[property=twitter:description]").attr("content");
    }
    if (StringUtils.isEmpty(info.description)) {
        info.description = doc.select("meta[name=description]").attr("content");
    }
    if (StringUtils.isEmpty(info.description)) {
        for (Element paragraph : doc.body().select("p")) {
            info.description = paragraph.text();
            if (info.description != null && info.description.length() >= 20) {
                break;
            }
        }
    }
    info.description = ellipsis(info.description, 140, "...");

    // cache info
    if (urlInfoCache != null) {
        urlInfoCache.put(original, info);
    }
    return info;
}