List of usage examples for org.jsoup.nodes.Document#getElementsByTag
public Elements getElementsByTag(String tagName)
From source file:ru.dmitry.mamishev.URLParse.HtmlString.java
public GazInfo getInfoBill() { String html = this.htmlString; Document doc = Jsoup.parse(html); Elements ul = doc.getElementsByTag("ul"); String numBill = ""; numBill = ul.eq(1).text();//from w w w . j a v a 2 s .co m Document bElements = Jsoup.parseBodyFragment(ul.toString()); Elements b = bElements.getElementsByTag("b"); GazInfo billInfo = null; String date = ""; String pay = ""; if (b.size() > 2) { String[] ss = SPLIT.split(b.get(2).text()); if (ss.length > 0) { date = ss[0]; pay = ss[1]; } billInfo = new GazInfo(b.get(0).text(), b.get(1).text(), date, pay, numBill); } else { billInfo = new GazInfo("", "", date, pay, ""); } return billInfo; }
From source file:solarrecorder.SolarRecorder.java
private void getProdData() throws IOException { org.jsoup.nodes.Document doc = Jsoup.connect("http://envoy/production").get(); Element h1 = doc.getElementsByTag("h1").first(); Element table = h1.nextElementSibling(); Elements alltr = table.getElementsByTag("tbody").first().getElementsByTag("tr"); for (Element tr : alltr) { Elements alltd = tr.getElementsByTag("td"); if (alltd.size() == 2) { String name = alltd.first().text(); String value = alltd.last().text(); switch (name) { case "Currently": case "Today": envoyData.add(new EnvoyData(name, value)); break; }//from w w w . j a va2 s . c o m } } }
From source file:solarrecorder.SolarRecorder.java
private void getSysData() throws IOException { org.jsoup.nodes.Document doc = Jsoup.connect("http://envoy").get(); Elements allh2 = doc.getElementsByTag("h2"); for (Element h2 : allh2) { if (h2.text().equals("System Statistics")) { Elements tables = h2.parent().getElementsByTag("table"); Elements alltr = tables.first().getElementsByTag("tbody").first().getElementsByTag("tr"); for (Element tr : alltr) { Elements alltd = tr.getElementsByTag("td"); String name = alltd.first().text(); String value = alltd.last().text(); if (name.equals("Number of Microinverters Online")) { envoyData.add(new EnvoyData(name, value)); }//from w ww . j ava 2s. c om } } } }
From source file:uk.co.certait.htmlexporter.writer.AbstractExporter.java
protected Elements getTables(String html) { Document document = Jsoup.parse(html);// FIXME parsing twice return document.getElementsByTag("table"); }
From source file:uk.co.certait.htmlexporter.writer.AbstractExporter.java
protected StyleMap getStyleMapper(String html) { Document document = Jsoup.parse(html); Elements styles = document.getElementsByTag("style");// FIXME parsing // twice//from ww w .j a v a2s.c o m StyleParser parser = new StyleParser(); StyleMap mapper = new StyleMap(parser.parseStyles(styles)); return mapper; }
From source file:us.colloquy.sandbox.FileProcessor.java
/**
 * Exploratory test: searches {@code ~/Documents/Tolstoy/openDiaries} (at most six directory
 * levels deep) for {@code .ncx} files, parses each one with jsoup, prints the text of every
 * {@code docTitle} child, and collects the {@code src} attribute of the {@code content}
 * elements under {@code navPoint} children whose label passes the filter pattern. The collected
 * paths go into a sorted set whose size and entries are printed at the end.
 *
 * <p>NOTE(review): this listing is collapsed onto two physical lines, so each {@code //}
 * comment now runs to the end of its line and the original statement boundaries cannot be
 * recovered with certainty from this text — restore the original line breaks before relying
 * on it (in particular, whether the else-branch "nav point" print is live needs confirming).
 *
 * <p>NOTE(review): the {@code "?"} and {@code "[?--?]"} parts of the regular expressions look
 * like non-ASCII characters lost to an encoding problem; as written they do not appear to be
 * valid patterns — confirm against the original source.
 */
@Test public void listAllUzipedFiles() { ///Documents/Tolstoy/diaries //System.getProperty("user.home") + "/Documents/Tolstoy/unzipLetters" Path pathToLetters = FileSystems.getDefault() .getPath(System.getProperty("user.home") + "/Documents/Tolstoy/openDiaries"); List<Path> results = new ArrayList<>(); int maxDepth = 6; try (Stream<Path> stream = Files.find(pathToLetters, maxDepth, (path, attr) -> { return String.valueOf(path).endsWith(".ncx"); })) {//from w w w. j a va2s . c om stream.forEach(results::add); // String joined = stream // .sorted() // .map(String::valueOf) // .collect(Collectors.joining("; ")); // // System.out.println("\nFound: " + joined); } catch (IOException e) { e.printStackTrace(); } System.out.println("files: " + results.size()); Set<String> uriList = new TreeSet<>(); try { for (Path res : results) { Path parent = res.getParent(); System.out.println("---------------------------------------------"); System.out.println(parent.toString()); //use jsoup to list all files that contain something useful Document doc = Jsoup.parse(res.toFile(), "UTF-8"); for (Element element : doc.getElementsByTag("docTitle")) { //Letter letter = new Letter(); // StringBuilder content = new StringBuilder(); for (Element child : element.children()) { System.out.println("Title: " + child.text()); } } for (Element element : doc.getElementsByTag("navPoint")) { //Letter letter = new Letter(); // StringBuilder content = new StringBuilder(); for (Element child : element.children()) { String label = child.text(); if (StringUtils.isNotEmpty(label)) { if (label.matches("?")) { System.out.println("------------------"); } String url = child.getElementsByTag("content").attr("src"); if (label.matches(".*\\d{1,3}.*[?--?]+.*") && StringUtils.isNotEmpty(url)) { uriList.add(parent.toString() + File.separator + url.replaceAll("#.*", "")); // System.out.println("nav point: " + label + " src " + parent.toString() // + System.lineSeparator() + url.replaceAll("#.*","")); } else { // 
System.out.println("nav point: " + label + " src " + child.getElementsByTag("content").attr("src")); } } } } } } catch (Exception e) { e.printStackTrace(); } System.out.println("Size: " + uriList.size()); for (String uri : uriList) { //parse and System.out.println(uri); } }
From source file:us.colloquy.sandbox.FileProcessor.java
@Test public void listAllUzipedFilesContent() { ///Documents/Tolstoy/diaries Path pathToLetters = FileSystems.getDefault() .getPath(System.getProperty("user.home") + "/Documents/Tolstoy/unzipLetters"); List<Path> results = new ArrayList<>(); int maxDepth = 6; try (Stream<Path> stream = Files.find(pathToLetters, maxDepth, (path, attr) -> { return String.valueOf(path).endsWith(".opf"); })) {/*from ww w .ja va2 s . c o m*/ stream.forEach(results::add); // String joined = stream // .sorted() // .map(String::valueOf) // .collect(Collectors.joining("; ")); // // System.out.println("\nFound: " + joined); } catch (IOException e) { e.printStackTrace(); } System.out.println("files: " + results.size()); Set<String> uriList = new TreeSet<>(); try { for (Path res : results) { Path parent = res.getParent(); System.out.println("---------------------------------------------"); System.out.println(parent.toString()); //use jsoup to list all files that contain something useful Document doc = Jsoup.parse(res.toFile(), "UTF-8"); for (Element element : doc.getElementsByTag("dc:title")) { //Letter letter = new Letter(); // StringBuilder content = new StringBuilder(); System.out.println(element.text()); // for (Element child : element.children()) // { // System.out.println(child.tagName() + "\t" + child.text()); // } } // for (Element element : doc.getElementsByTag("navPoint")) // { // //Letter letter = new Letter(); // // // StringBuilder content = new StringBuilder(); // // for (Element child : element.children()) // { // String label = child.text(); // // if (StringUtils.isNotEmpty(label)) // { // if (label.matches("?")) // { // System.out.println("------------------"); // } // // // String url = child.getElementsByTag("content").attr("src"); // // if (label.matches(".*\\d{1,3}.*[?--?]+.*") && // StringUtils.isNotEmpty(url) ) // { // // uriList.add(parent.toString() // + File.separator + url.replaceAll("#.*","")); //// System.out.println("nav point: " + label + " src " + 
parent.toString() //// + System.lineSeparator() + url.replaceAll("#.*","")); // // // } else // { // // System.out.println("nav point: " + label + " src " + child.getElementsByTag("content").attr("src")); // } // // // } // } // } } } catch (Exception e) { e.printStackTrace(); } System.out.println("Size: " + uriList.size()); for (String uri : uriList) { //parse and System.out.println(uri); } }
From source file:us.colloquy.sandbox.FileProcessor.java
@Test public void getURIForAllDiaries() { Set<DocumentPointer> uriList = new HashSet<>(); //String letterDirectory = System.getProperty("user.home") + "/Documents/Tolstoy/openDiaries"; ///*from w ww .j a va 2 s . c o m*/ String letterDirectory = System.getProperty("user.home") + "/Documents/Tolstoy/90-volume-set/diaries/uzip/dnevnik_1881-1887_vol_49"; Path pathToLetters = FileSystems.getDefault().getPath(letterDirectory); List<Path> results = new ArrayList<>(); int maxDepth = 6; try (Stream<Path> stream = Files.find(pathToLetters, maxDepth, (path, attr) -> { return String.valueOf(path).endsWith(".ncx"); })) { stream.forEach(results::add); } catch (IOException e) { e.printStackTrace(); } System.out.println("files: " + results.size()); try { for (Path res : results) { Path parent = res.getParent(); // System.out.println("---------------------------------------------"); // System.out.println(parent.toString()); //use jsoup to list all files that contain something useful Document doc = Jsoup.parse(res.toFile(), "UTF-8"); String title = ""; for (Element element : doc.getElementsByTag("docTitle")) { //Letter letter = new Letter(); // StringBuilder content = new StringBuilder(); for (Element child : element.children()) { title = child.text(); // System.out.println("Title: " + title); } } // System.out.println("========================== " + res.toString() + " =========================="); boolean startPrinting = false; boolean newFile = true; for (Element element : doc.getElementsByTag("navPoint")) { //get nav label and content Element navLabelElement = element.select("navLabel").first(); Element srsElement = element.select("content").first(); String navLabel = ""; String srs = ""; if (navLabelElement != null) { navLabel = navLabelElement.text().replaceAll("\\*", "").trim(); } if (srsElement != null) { srs = srsElement.attr("src"); } if ("??".matches(navLabel)) { startPrinting = false; // System.out.println("----------------- end of file pointer ---------------"); } if 
(StringUtils.isNotEmpty(navLabel) && navLabel.matches("??.*|?? ?.*") && newFile) { newFile = false; startPrinting = true; title = navLabel; } if (startPrinting) { // System.out.println("----------------- file pointer ---------------"); // System.out.println(navLabel + "\t" + srs); DocumentPointer documentPointer = new DocumentPointer( parent.toString() + File.separator + srs.replaceAll("#.*", ""), title); uriList.add(documentPointer); } // for (Element child : element.children()) // { // String label = child.text(); // // if (StringUtils.isNotEmpty(label)) // { // if (label.matches("??\\s\\d{4}.*")) // { // System.out.println("------------------"); // } // // String url = child.getElementsByTag("content").attr("src"); // // if (label.matches(".*\\d{1,3}.*[?--?]+.*") && // StringUtils.isNotEmpty(url)) // { // DocumentPointer letterPointer = new DocumentPointer(parent.toString() // + File.separator + url.replaceAll("#.*", ""), title); // // uriList.add(letterPointer); //// System.out.println("nav point: " + label + " src " + parent.toString() //// + System.lineSeparator() + url.replaceAll("#.*","")); // // // } else if (label.matches(".*\\d{1,3}.*") && // StringUtils.isNotEmpty(url) && useOnlyNumber) // { // DocumentPointer letterPointer = new DocumentPointer(parent.toString() // + File.separator + url.replaceAll("#.*", ""), title); // // uriList.add(letterPointer); //// System.out.println("nav point: " + label + " src " + parent.toString() //// + System.lineSeparator() + url.replaceAll("#.*","")); // // // } else // { // // System.out.println("nav point: " + label + " src " + child.getElementsByTag("content").attr("src")); // } // // // } // } } // System.out.println("========================== END OF FILE =========================="); } } catch (Exception e) { e.printStackTrace(); } System.out.println("Size: " + uriList.size()); for (DocumentPointer pointer : uriList) { //parse and System.out.println(pointer.getSourse() + "\t" + pointer.getUri()); } }
From source file:us.colloquy.util.EpubExtractor.java
public static void getURIForAllLetters(Set<DocumentPointer> uriList, String letterDirectory, boolean useOnlyNumber) { Path pathToLetters = FileSystems.getDefault().getPath(letterDirectory); List<Path> results = new ArrayList<>(); int maxDepth = 6; try (Stream<Path> stream = Files.find(pathToLetters, maxDepth, (path, attr) -> String.valueOf(path).endsWith(".ncx"))) { stream.forEach(results::add);/*from ww w . j a v a 2 s. c o m*/ // String joined = stream // .sorted() // .map(String::valueOf) // .collect(Collectors.joining("; ")); // // System.out.println("\nFound: " + joined); } catch (IOException e) { e.printStackTrace(); } System.out.println("files: " + results.size()); try { for (Path res : results) { Path parent = res.getParent(); // System.out.println("---------------------------------------------"); // System.out.println(parent.toString()); //use jsoup to list all files that contain something useful Document doc = Jsoup.parse(res.toFile(), "UTF-8"); String title = ""; for (Element element : doc.getElementsByTag("docTitle")) { //Letter letter = new Letter(); // StringBuilder content = new StringBuilder(); for (Element child : element.children()) { title = child.text(); // System.out.println("Title: " + title); } } for (Element element : doc.getElementsByTag("avantitul")) { for (Element child : element.children()) { String label = child.text(); if (StringUtils.isNotEmpty(label)) { if (label.matches( " ? ? .*")) { System.out.println("------------------ " + label); } } } } for (Element element : doc.getElementsByTag("navPoint")) { //Letter letter = new Letter(); // StringBuilder content = new StringBuilder(); for (Element child : element.children()) { String label = child.text(); if (StringUtils.isNotEmpty(label)) { if (label.matches("?")) { System.out.println("------------------ " + "?" 
+ " -------------------"); } else if (label.contains(" ?")) { break; } String url = child.getElementsByTag("content").attr("src"); if (label.matches(".*\\d{1,3}.*[?--?A-Za-z]+.*") && StringUtils.isNotEmpty(url)) { DocumentPointer documentPointer = new DocumentPointer( parent.toString() + File.separator + url.replaceAll("#.*", ""), title); uriList.add(documentPointer); // System.out.println("nav point: " + label + " src " + parent.toString() // + System.lineSeparator() + url.replaceAll("#.*","")); } else if (label.matches(".*\\d{1,3}.*") && StringUtils.isNotEmpty(url) && useOnlyNumber) { DocumentPointer documentPointer = new DocumentPointer( parent.toString() + File.separator + url.replaceAll("#.*", ""), title); uriList.add(documentPointer); // System.out.println("nav point: " + label + " src " + parent.toString() // + System.lineSeparator() + url.replaceAll("#.*","")); } else { // System.out.println("nav point: " + label + " src " + child.getElementsByTag("content").attr("src")); } } } } } } catch (Exception e) { e.printStackTrace(); } // System.out.println("Size: " + uriList.size()); // for (DocumentPointer pointer : uriList) // { // //parse and // System.out.println(pointer.getSourse() + "\t" + pointer.getUri()); // } }
From source file:webcralwerproject1.Webcrawler.java
public String writeContent(Document htmlDocument) {// throws IOException { FileWriter fWriter = null;/* w w w .jav a2 s.co m*/ BufferedWriter writer = null; String path = null; try { File file = new File(DirectoryName + "/" + crawlcount); if (!file.exists()) { if (file.mkdir()) { System.out.println("Repository Directory is created!"); } else { System.out.println("Failed to create directory!"); } } File f = new File(file.getAbsolutePath() + "/" + MaxPage + "file.html"); path = f.getAbsolutePath(); Elements img = htmlDocument.getElementsByTag("img"); Elements srcc = htmlDocument.getElementsByAttribute("src"); for (Element el : img) { imagecount++; el.attr("src", "a"); } // System.out.println("Imagecount : " + imagecount ); FileUtils.writeStringToFile(f, htmlDocument.html(), "UTF-8"); } catch (Exception e) { System.out.println("Inside writeContent Exception " + e); } System.out.println("Inside writeContent "); return path; }