Example usage for org.jsoup.nodes Element parentNode

List of usage examples for org.jsoup.nodes Element parentNode

Introduction

On this page you can find example usage of the org.jsoup.nodes.Element method parentNode().

Prototype

public final Node parentNode() 

Source Link

Document

Gets this node's parent node.

Usage

From source file:webcralwerproject1.Webcrawler.java

/**
 * Strips boilerplate markup from every regular file found in the folder
 * {@code DirectoryName + "/" + crawlcount} and appends the remaining visible
 * text of each document to {@code "./crawler" + crawlcount + "content.html"}.
 *
 * <p>Each file is parsed with jsoup as UTF-8. Navigation, header/footer,
 * media, form and script elements are removed first; then anchors inside
 * {@code body} are dropped when their inner HTML is at most four characters
 * long or when they are the only child node of their parent. A {@code <br>}
 * separator is written after every directory entry, including entries that
 * are not regular files.
 *
 * <p>Any exception raised while reading or writing is reported on standard
 * output and processing stops; the output path is still returned in that
 * case. The writer is always closed, even when an exception occurs.
 *
 * @return the path of the output file, or {@code null} when the crawl folder
 *         does not exist
 */
public String contentprocessor() {
    File folder = new File(DirectoryName + "/" + crawlcount);
    if (!folder.exists()) {
        return null;
    }

    String contentprocessfile = "./crawler" + crawlcount + "content.html";

    // Applied strictly in this order: "div:empty" must run after the media
    // and form removals so that containers emptied by them are caught too.
    String[] boilerplateSelectors = {
        "nav",
        "head",
        "link",
        "style",
        "meta",
        "script",
        "figure",
        "img",
        "footer",
        "input[type = search]",
        "form",
        "button",
        "video",
        "div:empty",
        "div#footer",
        "div#id",
        "div#nav",
        "div#navigation",
        "div.footer",
        "div.header",
        "li > a[href]"
    };

    // try-with-resources guarantees the writer is closed on every path;
    // the file is opened in append mode so earlier output is preserved.
    try (FileWriter f_write = new FileWriter(contentprocessfile, true)) {
        File[] listOfFiles = folder.listFiles();
        if (listOfFiles == null) {
            // listFiles() yields null when the path is not a directory or
            // an I/O error occurs; treat that as "nothing to process".
            listOfFiles = new File[0];
        }

        // Open repo directory and loop through all files
        for (File file : listOfFiles) {
            if (file.isFile()) {
                Document doc = Jsoup.parse(file, "UTF-8");

                for (String selector : boilerplateSelectors) {
                    doc.select(selector).remove();
                }

                for (Element link : doc.select("body a[href]")) {
                    if (link.parentNode() == null) {
                        // Already detached from the tree; nothing to remove.
                        continue;
                    }
                    // Very short anchors (including empty <a></a>) carry no
                    // page title text.
                    boolean tooShort = link.html().length() <= 4;
                    // An anchor that is its parent's sole child node.
                    boolean onlyChild = link.parentNode().childNodeSize() == 1;
                    if (tooShort || onlyChild) {
                        link.remove();
                    }
                }
                f_write.write(doc.text());
            }
            f_write.write("<br>");
        }
    } catch (Exception e) {
        System.out.println("Inside Contentprocessor" + e);
    }

    return contentprocessfile;
}