List of usage examples for org.jsoup.nodes.Document#getElementsByTag
public Elements getElementsByTag(String tagName)
From source file:webscrap.WebScrap.java
/** * @param args the command line arguments *///from w ww. j a v a2s.c o m public static void main(String[] args) { // TODO code application logic here Document doc; try { doc = Jsoup.connect( "http://www.metmuseum.org/collection/the-collection-online/search/15538?pos=1&rpp=30&pg=1&rndkey=20150122&ft=*&deptids=2") .get(); File jsonFile = new File("Records.json"); FileWriter output = new FileWriter(jsonFile); JSONArray store = new JSONArray(); //Declarations for JSON output String nameTag = "Name"; String name; String artistTag = "Artist"; String artistName; String imgURLTag = "imgURL"; String imgsrc; String dateTag = "Date"; String date; String geoTag = "Geography"; String geoVal; String cultureTag = "Culture"; String culture; String mediumTag = "Medium"; String medium; String dimTag = "Dimension"; String dim; String classTag = "Classification"; String classification; String credit_line_tag = "Credit_Line"; String credit_line; String accessNumTag = "Accession_Number"; String accessNum; String RnRTag = "Rights_and_Reproduction"; String RnR; //trying to load the next urls String next = "http://www.metmuseum.org/collection/the-collection-online/search/11432?pos=1&rpp=30&pg=1&rndkey=20150123&ft=*&deptids=2"; int i = 500; while (i != 0) { name = ""; artistName = ""; imgsrc = ""; date = ""; //geoVal = "not available"; //culture = "not available"; medium = ""; dim = ""; classification = ""; credit_line = ""; accessNum = ""; //RnR = "not available"; doc = Jsoup.connect(next).get(); String o_title = doc.getElementsByTag("h2").text(); String[] part_o = o_title.split("Email"); String part_o1 = part_o[0]; String part_o2 = part_o[1]; //System.out.println(o_title); name = part_o1; //String artist = doc.getElementsByTag("h3").text(); //System.out.println(artist); //artistName = artist; Elements imgdiv = doc.select("div#inner-image-container img"); for (Element e : imgdiv) { imgsrc = e.absUrl("src"); } Elements divs; divs = doc.select("div.tombstone"); Elements divchild; divchild 
= divs.select("div"); int count = 0; for (Element div : divchild) { String info = div.text(); if (count != 0) { String[] parts = info.split(":"); String part1 = parts[0]; String part2 = parts[1]; switch (part1) { case "Artist": artistName = part2; break; case "Date": date = part2; break; case "Geography": geoVal = part2; break; case "Culture": culture = part2; break; case "Medium": medium = part2; break; case "Dimensions": dim = part2; break; case "Classification": classification = part2; break; case "Credit Line": credit_line = part2; break; case "Accession Number": accessNum = part2; break; case "Rights and Reproduction": RnR = part2; break; } } count++; } if (classification.equals(" Paintings")) { //System.out.println(nameTag+name); //System.out.println(artistTag+artistName); //System.out.println(imgURLTag+imgsrc); //System.out.println(dateTag+date); //System.out.println(mediumTag+medium); //System.out.println(dimTag+dim); //System.out.println(classTag+classification); //System.out.println(credit_line_tag+credit_line); //System.out.println(accessNumTag+accessNum); //System.out.println(i); //json writing JSONObject jsonObj = new JSONObject(); jsonObj.put(nameTag, name); jsonObj.put(artistTag, artistName); jsonObj.put(imgURLTag, imgsrc); jsonObj.put(dateTag, date); jsonObj.put(mediumTag, medium); jsonObj.put(dimTag, dim); jsonObj.put(classTag, classification); jsonObj.put(credit_line_tag, credit_line); jsonObj.put(accessNumTag, accessNum); store.add(jsonObj); i--; } //going to next page Element link = doc.select("a.next").first(); next = link.attr("abs:href"); } output.write(store.toJSONString()); output.write("\n"); output.flush(); output.close(); } catch (IOException e) { } }
From source file:wherehows.common.utils.GitUtil.java
/** * Crawlling the project page to get list of repositories, only works for Gitorious * @param projectUrl the project url e.g. https://git.example.com/project * @return List of path of repositories e.g. project/repo * @throws IOException/*www . j av a 2s .co m*/ */ public static Map<String, String> getRepoListFromProject(String projectUrl) throws IOException { Map<String, String> repoList = new HashMap<>(); Document doc = Jsoup.connect(projectUrl).data("format", "xml").get(); Elements repos = doc.getElementsByTag("repositories"); Elements mainlines = repos.first().getElementsByTag("mainlines"); Elements repo = mainlines.first().getElementsByTag("repository"); for (Element e : repo) { String repoName = e.getElementsByTag("name").first().text(); String repoUrl = e.getElementsByTag("clone_url").first().text(); repoList.put(repoName.trim(), repoUrl.trim()); } return repoList; }