List of usage examples for org.jsoup.nodes Element parentNode
public final Node parentNode()
From source file: webcralwerproject1.Webcrawler.java
public String contentprocessor() { File folder = new File(DirectoryName + "/" + crawlcount); FileWriter f_write = null;/*from w w w. ja v a 2 s . c o m*/ Elements p, c = null; String contentprocessfile = "./crawler" + crawlcount + "content.html"; if (!folder.exists()) { } else { try { File[] listOfFiles = folder.listFiles(); f_write = new FileWriter(contentprocessfile, true); //Open repo directory and loop through all files for (File file : listOfFiles) { if (file.isFile()) { File input = new File(file.getAbsolutePath()); Document doc = Jsoup.parse(input, "UTF-8"); String title = doc.select("title").toString(); Elements n = doc.select("nav").remove(); // String d =doc.select("div.id"); doc.select("head").remove(); doc.select("link").remove(); doc.select("style").remove(); doc.select("meta").remove(); doc.select("script").remove(); doc.select("figure").remove(); doc.select("img").remove(); doc.select("footer").remove(); doc.select("input[type = search]").remove(); doc.select("form").remove(); doc.select("button").remove(); doc.select("video").remove(); doc.select("div:empty").remove(); doc.select("div#footer").remove(); doc.select("div#id").remove(); doc.select("div#nav").remove(); doc.select("div#navigation").remove(); doc.select("div.footer").remove(); doc.select("div.header").remove(); doc.select("li > a[href]").remove(); Elements linksOnPage = doc.select("body a[href]"); for (Element link : linksOnPage) { if (link.html() == null) { link.remove();//<a></a> } else if (link.html().length() <= 4) {// does not contains title of the page link.remove(); } else { int child = link.parentNode().childNodeSize(); if (child == 1) {//only element remove link.remove(); } } } f_write.write(doc.text()); } f_write.write("<br>"); } f_write.close(); } catch (Exception e) { System.out.println("Inside Contentprocessor" + e); } return contentprocessfile; } return null; }