List of usage examples for the org.jsoup.nodes.Document method getElementsByClass
public Elements getElementsByClass(String className)
From source file:faescapeplan.FAEscapePlanUI.java
@SuppressWarnings("unchecked") private void downloadJournals(ArrayList<String> journalList) { JSONArray jsonList = new JSONArray(); String downloadLoc = this.saveLocText.getText(); Path jsonPath = Paths.get(downloadLoc + "\\" + userData.getName() + "\\journals\\journals.json"); try {//from ww w.j a va 2s .c o m Files.deleteIfExists(jsonPath); Files.createFile(jsonPath); } catch (IOException ex) { Logger.getLogger(FAEscapePlanUI.class.getName()).log(Level.SEVERE, null, ex); JOptionPane.showMessageDialog(this, "A critical IO exception occurred in method: downloadJournals"); } for (String item : journalList) { try { Map<String, String> jsonMap = new LinkedHashMap<>(); Document doc = Jsoup.connect("http://www.furaffinity.net/journal/" + item + "/") .cookies(userData.getCookies()).userAgent(USER_AGENT).get(); String title = doc.title().split(" -- ")[0]; String date = doc.getElementsByClass("popup_date").get(0).attr("title"); String body = doc.getElementsByClass("journal-body").get(0).html(); jsonMap.put("title", title); jsonMap.put("date", date); jsonMap.put("body", body); jsonList.add(jsonMap); Path journalPath = Paths.get(downloadLoc, "\\" + userData.getName() + "\\journals\\" + item + "_" + title + ".txt"); String bodyParsed = removeHtmlTags(body); try (FileWriter journalWriter = new FileWriter(new File(journalPath.toString()))) { journalWriter.append(title + System.getProperty("line.separator")); journalWriter.append(date + System.getProperty("line.separator")); journalWriter.append(bodyParsed + System.getProperty("line.separator")); } } catch (FileAlreadyExistsException ex) { Logger.getLogger(FAEscapePlanUI.class.getName()).log(Level.SEVERE, null, ex); updateTextLog("File already exists"); } catch (IOException ex) { Logger.getLogger(FAEscapePlanUI.class.getName()).log(Level.SEVERE, null, ex); updateTextLog("An IO Exception occurred while downloading journal: " + item); } } String jsonString = JSONValue.toJSONString(jsonList); try { Files.write(jsonPath, 
Arrays.asList(jsonString), StandardOpenOption.WRITE); } catch (IOException ex) { Logger.getLogger(FAEscapePlanUI.class.getName()).log(Level.SEVERE, null, ex); } }
From source file:com.kantenkugel.discordbot.jdocparser.JDocParser.java
static void parse(final String jdocBase, final String name, final InputStream inputStream, Map<String, ClassDocumentation> docs) { final String[] pathSplits = name.split("/"); final String fileName = pathSplits[pathSplits.length - 1]; if (!Character.isUpperCase(fileName.charAt(0))) { //ignore jdoc structure html return;//from w w w.j a v a2 s . c o m } final String[] nameSplits = fileName.split("\\."); final String className = nameSplits[nameSplits.length - 2]; final String fullName = fileName.substring(0, fileName.length() - nameSplits[nameSplits.length - 1].length() - 1); try (BufferedReader buffer = new BufferedReader(new InputStreamReader(inputStream))) { //create dom Document final String content = buffer.lines().collect(Collectors.joining("\n")); Document document = Jsoup.parse(content); //classDocument (classname, package, description) Element titleElem = getSingleElementByClass(document, "title"); final String classSig = JDocUtil.fixSpaces(titleElem.text()); Element packageElem = titleElem.previousElementSibling(); if (packageElem.children().size() > 1) { packageElem = packageElem.children().last(); } final String pack = JDocUtil.fixSpaces(packageElem.text()); final String link = JDocUtil.getLink(jdocBase, pack, fullName); Element descriptionElement = null; Elements descriptionCandidates = document.select(".description .block"); if (descriptionCandidates.size() > 1) { List<Element> removed = descriptionCandidates.stream().map(elem -> elem.child(0)) .filter(child -> child != null && !child.className().startsWith("deprecat")) .map(Element::parent).collect(Collectors.toList()); if (removed.size() != 1) throw new RuntimeException("Found too many description candidates"); descriptionElement = removed.get(0); } else if (descriptionCandidates.size() == 1) { descriptionElement = descriptionCandidates.get(0); } final String description = descriptionElement == null ? 
"" : JDocUtil.formatText(descriptionElement.html(), link); final ClassDocumentation classDoc = new ClassDocumentation(pack, fullName, classSig, description, classSig.startsWith("Enum")); //methods, fields final Element details = document.getElementsByClass("details").first(); if (details != null) { //methods Element tmp = getSingleElementByQuery(details, "a[name=\"method.detail\"]"); List<DocBlock> docBlock = getDocBlock(jdocBase, tmp, classDoc); if (docBlock != null) { for (DocBlock block : docBlock) { Set<MethodDocumentation> mdocs = classDoc.methodDocs .computeIfAbsent(block.title.toLowerCase(), key -> new HashSet<>()); mdocs.add(new MethodDocumentation(classDoc, block.signature, block.hashLink, block.description, block.fields)); } } //vars tmp = getSingleElementByQuery(details, "a[name=\"field.detail\"]"); docBlock = getDocBlock(jdocBase, tmp, classDoc); if (docBlock != null) { for (DocBlock block : docBlock) { classDoc.classValues.put(block.title.toLowerCase(), new ValueDocumentation(classDoc, block.title, block.hashLink, block.signature, block.description)); } } //enum-values tmp = getSingleElementByQuery(details, "a[name=\"enum.constant.detail\"]"); docBlock = getDocBlock(jdocBase, tmp, classDoc); if (docBlock != null) { for (DocBlock block : docBlock) { classDoc.classValues.put(block.title.toLowerCase(), new ValueDocumentation(classDoc, block.title, block.hashLink, block.signature, block.description)); } } } final Element methodSummary = getSingleElementByQuery(document, "a[name=\"method.summary\"]"); classDoc.inheritedMethods.putAll(getInheritedMethods(methodSummary)); //storing if (nameSplits.length > 2) { if (!docs.containsKey(nameSplits[0].toLowerCase())) docs.put(nameSplits[0].toLowerCase(), new ClassDocumentation(null, null, null, null, false)); ClassDocumentation parent = docs.get(nameSplits[0].toLowerCase()); for (int i = 1; i < nameSplits.length - 2; i++) { if (!parent.subClasses.containsKey(nameSplits[i].toLowerCase())) 
parent.subClasses.put(nameSplits[i].toLowerCase(), new ClassDocumentation(null, null, null, null, false)); parent = parent.subClasses.get(nameSplits[i].toLowerCase()); } if (parent.subClasses.containsKey(className.toLowerCase())) classDoc.subClasses.putAll(parent.subClasses.get(className.toLowerCase()).subClasses); parent.subClasses.put(className.toLowerCase(), classDoc); } if (docs.containsKey(fullName.toLowerCase())) { ClassDocumentation current = docs.get(fullName.toLowerCase()); if (current.classSig != null) throw new RuntimeException("Got a class-name conflict with classes " + classDoc.classSig + "(" + classDoc.className + ") AND " + current.classSig + "(" + current.className + ")"); classDoc.subClasses.putAll(current.subClasses); } docs.put(fullName.toLowerCase(), classDoc); } catch (final IOException | NullPointerException ex) { JDocUtil.LOG.error("Got excaption for element {}", fullName, ex); } try { inputStream.close(); } catch (final IOException e) { JDocUtil.LOG.error("Error closing inputstream", e); } }
From source file:info.dolezel.fatrat.plugins.UloztoDownload.java
@Override public void processLink(String link) { //if (link.contains("/live/")) // link = link.replace("/live/", "/"); if (link.startsWith("http://uloz.to") || link.startsWith("https://uloz.to")) link = link.replace("https?://uloz.to", "https://www.uloz.to"); if (link.startsWith("http://m.uloz.to") || link.startsWith("https://m.uloz.to")) link = link.replace("https?://m.uloz.to", "https://www.uloz.to"); if (!logIn(link)) return;/*from w w w . ja v a2 s . c o m*/ final String downloadLink = link; // I can't make 'link' final fetchPage(link, new PageFetchListener() { @Override public void onCompleted(ByteBuffer buf, Map<String, String> headers) { try { if (headers.containsKey("location")) { String location = headers.get("location"); if (location.contains("smazano") || location.contains("nenalezeno")) setFailed("The file has been removed"); else processLink(location); return; } CharBuffer cb = charsetUtf8.decode(buf); if (cb.toString().contains("?disclaimer=1")) { processLink(downloadLink + "?disclaimer=1"); return; } final Document doc = Jsoup.parse(cb.toString()); final Element freeForm = doc.getElementById("frm-download-freeDownloadTab-freeDownloadForm"); final Element premiumLink = doc.getElementById("#quickDownloadButton"); boolean usePremium = usePremium(downloadLink); if (cb.toString().contains("Nem dostatek kreditu")) setMessage("Credit depleted, using FREE download"); else if (usePremium && premiumLink != null) { String msg = "Using premium download"; Elements aCredits = doc.getElementsByAttributeValue("href", "/kredit"); if (!aCredits.isEmpty()) msg += " (" + aCredits.get(0).ownText() + " left)"; setMessage(msg); startDownload("http://www.uloz.to" + premiumLink.attr("href")); return; } else if (loggedIn) setMessage("Login failed, using FREE download"); Elements aNames = doc.getElementsByClass("jsShowDownload"); if (!aNames.isEmpty()) reportFileName(aNames.get(0).ownText()); final PostQuery pq = new PostQuery(); final Map<String, String> hdr = new 
HashMap<String, String>(); Elements eHiddens = freeForm.select("input[type=hidden]"); hdr.put("X-Requested-With", "XMLHttpRequest"); hdr.put("Referer", downloadLink); hdr.put("Accept", "application/json, text/javascript, */*; q=0.01"); for (Element e : eHiddens) pq.add(e.attr("name"), e.attr("value")); fetchPage("https://uloz.to/reloadXapca.php?rnd=" + Math.abs(new Random().nextInt()), new PageFetchListener() { @Override public void onCompleted(ByteBuffer buf, Map<String, String> headers) { CharBuffer cb = charsetUtf8.decode(buf); String captchaUrl; try { JSONObject json = new JSONObject(cb.toString()); captchaUrl = "https:" + json.getString("image"); pq.add("hash", json.getString("hash")); pq.add("timestamp", "" + json.getInt("timestamp")); pq.add("salt", "" + json.getInt("salt")); } catch (JSONException e) { setFailed("Error parsing captcha JSON"); return; } solveCaptcha(captchaUrl, new CaptchaListener() { @Override public void onFailed() { setFailed("Failed to decode the captcha code"); } @Override public void onSolved(String text) { String action = freeForm.attr("action"); pq.add("captcha_value", text); fetchPage("https://www.uloz.to" + action, new PageFetchListener() { @Override public void onCompleted(ByteBuffer buf, Map<String, String> headers) { try { CharBuffer cb = charsetUtf8.decode(buf); JSONObject obj = new JSONObject(cb.toString()); startDownload(obj.getString("url")); } catch (Exception e) { setFailed("" + e); } } @Override public void onFailed(String error) { setFailed(error); } }, pq.toString(), hdr); } }); } @Override public void onFailed(String error) { setFailed("Failed to load captcha AJAX page"); } }); } catch (Exception e) { e.printStackTrace(); setFailed(e.toString()); } } @Override public void onFailed(String error) { setFailed("Failed to load the initial page"); } }, null); }
From source file:de.geeksfactory.opacclient.apis.SISIS.java
protected boolean login(Account acc) throws OpacErrorException { String html;//from w w w .j ava2s . c o m List<NameValuePair> nameValuePairs = new ArrayList<>(2); try { String loginPage; loginPage = httpGet(opac_url + "/userAccount.do?methodToCall=show&type=1", ENCODING); Document loginPageDoc = Jsoup.parse(loginPage); if (loginPageDoc.select("input[name=as_fid]").size() > 0) { nameValuePairs.add(new BasicNameValuePair("as_fid", loginPageDoc.select("input[name=as_fid]").first().attr("value"))); } } catch (IOException e1) { e1.printStackTrace(); } nameValuePairs.add(new BasicNameValuePair("username", acc.getName())); nameValuePairs.add(new BasicNameValuePair("password", acc.getPassword())); nameValuePairs.add(new BasicNameValuePair("CSId", CSId)); nameValuePairs.add(new BasicNameValuePair("methodToCall", "submit")); try { html = handleLoginMessage( httpPost(opac_url + "/login.do", new UrlEncodedFormEntity(nameValuePairs), ENCODING)); } catch (UnsupportedEncodingException e) { e.printStackTrace(); return false; } catch (ClientProtocolException e) { e.printStackTrace(); return false; } catch (IOException e) { e.printStackTrace(); return false; } Document doc = Jsoup.parse(html); if (doc.getElementsByClass("error").size() > 0) { throw new OpacErrorException(doc.getElementsByClass("error").get(0).text()); } logged_in = System.currentTimeMillis(); logged_in_as = acc; return true; }
From source file:de.geeksfactory.opacclient.apis.Bibliotheca.java
@Override public AccountData account(Account acc) throws IOException, JSONException, OpacErrorException { if (!initialised) { start();// w ww . j av a 2 s . co m } if (acc.getName() == null || acc.getName().equals("null")) { return null; } List<NameValuePair> nameValuePairs; String html = httpGet(opac_url + "/index.asp?kontofenster=start", "ISO-8859-1"); Document doc = Jsoup.parse(html); if (doc.select("input[name=AUSWEIS]").size() > 0) { // Login vonnten nameValuePairs = new ArrayList<>(); nameValuePairs.add(new BasicNameValuePair("AUSWEIS", acc.getName())); nameValuePairs.add(new BasicNameValuePair("PWD", acc.getPassword())); if (data.has("db")) { nameValuePairs.add(new BasicNameValuePair("vkontodb", data.getString("db"))); } nameValuePairs.add(new BasicNameValuePair("B1", "weiter")); nameValuePairs.add(new BasicNameValuePair("kontofenster", "true")); nameValuePairs.add(new BasicNameValuePair("target", "konto")); nameValuePairs.add(new BasicNameValuePair("type", "K")); html = httpPost(opac_url + "/index.asp", new UrlEncodedFormEntity(nameValuePairs), "ISO-8859-1", true); doc = Jsoup.parse(html); } if (doc.getElementsByClass("kontomeldung").size() == 1) { throw new OpacErrorException(doc.getElementsByClass("kontomeldung").get(0).text()); } logged_in_as = acc; logged_in = System.currentTimeMillis(); return parse_account(acc, doc, data); }
From source file:de.geeksfactory.opacclient.apis.SISIS.java
@Override public ReservationResult reservation(DetailledItem item, Account acc, int useraction, String selection) throws IOException { String reservation_info = item.getReservation_info(); final String branch_inputfield = "issuepoint"; Document doc = null; String action = "reservation"; if (reservation_info.contains("doBestellung")) { action = "order"; }/*from www. j av a 2 s . c o m*/ if (useraction == MultiStepResult.ACTION_CONFIRMATION) { List<NameValuePair> nameValuePairs = new ArrayList<>(2); nameValuePairs.add(new BasicNameValuePair("methodToCall", action)); nameValuePairs.add(new BasicNameValuePair("CSId", CSId)); String html = httpPost(opac_url + "/" + action + ".do", new UrlEncodedFormEntity(nameValuePairs), ENCODING); doc = Jsoup.parse(html); } else if (selection == null || useraction == 0) { String html = httpGet(opac_url + "/availability.do?" + reservation_info, ENCODING); doc = Jsoup.parse(html); if (doc.select("input[name=username]").size() > 0) { // Login vonnten List<NameValuePair> nameValuePairs = new ArrayList<>(2); nameValuePairs.add(new BasicNameValuePair("username", acc.getName())); nameValuePairs.add(new BasicNameValuePair("password", acc.getPassword())); nameValuePairs.add(new BasicNameValuePair("methodToCall", "submit")); nameValuePairs.add(new BasicNameValuePair("CSId", CSId)); nameValuePairs.add(new BasicNameValuePair("login_action", "Login")); html = handleLoginMessage( httpPost(opac_url + "/login.do", new UrlEncodedFormEntity(nameValuePairs), ENCODING)); doc = Jsoup.parse(html); if (doc.getElementsByClass("error").size() == 0) { logged_in = System.currentTimeMillis(); logged_in_as = acc; } } if (doc.select("input[name=expressorder]").size() > 0) { List<NameValuePair> nameValuePairs = new ArrayList<>(2); nameValuePairs.add(new BasicNameValuePair(branch_inputfield, selection)); nameValuePairs.add(new BasicNameValuePair("methodToCall", action)); nameValuePairs.add(new BasicNameValuePair("CSId", CSId)); nameValuePairs.add(new 
BasicNameValuePair("expressorder", " ")); html = httpPost(opac_url + "/" + action + ".do", new UrlEncodedFormEntity(nameValuePairs), ENCODING); doc = Jsoup.parse(html); } if (doc.select("input[name=" + branch_inputfield + "]").size() > 0) { List<Map<String, String>> branches = new ArrayList<>(); for (Element option : doc.select("input[name=" + branch_inputfield + "]").first().parent().parent() .parent().select("td")) { if (option.select("input").size() != 1) { continue; } String value = option.text().trim(); String key = option.select("input").val(); Map<String, String> selopt = new HashMap<>(); selopt.put("key", key); selopt.put("value", value); branches.add(selopt); } ReservationResult result = new ReservationResult(MultiStepResult.Status.SELECTION_NEEDED); result.setActionIdentifier(ReservationResult.ACTION_BRANCH); result.setSelection(branches); return result; } } else if (useraction == ReservationResult.ACTION_BRANCH) { List<NameValuePair> nameValuePairs = new ArrayList<>(2); nameValuePairs.add(new BasicNameValuePair(branch_inputfield, selection)); nameValuePairs.add(new BasicNameValuePair("methodToCall", action)); nameValuePairs.add(new BasicNameValuePair("CSId", CSId)); String html = httpPost(opac_url + "/" + action + ".do", new UrlEncodedFormEntity(nameValuePairs), ENCODING); doc = Jsoup.parse(html); } if (doc == null) { return new ReservationResult(MultiStepResult.Status.ERROR); } if (doc.getElementsByClass("error").size() >= 1) { return new ReservationResult(MultiStepResult.Status.ERROR, doc.getElementsByClass("error").get(0).text()); } if (doc.select("#CirculationForm p").size() > 0 && doc.select("input[type=button]").size() >= 2) { List<String[]> details = new ArrayList<>(); for (String row : doc.select("#CirculationForm p").first().html().split("<br>")) { Document frag = Jsoup.parseBodyFragment(row); if (frag.text().contains(":")) { String[] split = frag.text().split(":"); if (split.length >= 2) { details.add(new String[] { split[0].trim() + ":", 
split[1].trim() }); } } else { details.add(new String[] { "", frag.text().trim() }); } } ReservationResult result = new ReservationResult(Status.CONFIRMATION_NEEDED); result.setDetails(details); return result; } if (doc.select("#CirculationForm .textrot").size() >= 1) { String errmsg = doc.select("#CirculationForm .textrot").get(0).text(); if (errmsg.contains("Dieses oder andere Exemplare in anderer Zweigstelle ausleihbar")) { Copy best = null; for (Copy copy : item.getCopies()) { if (copy.getResInfo() == null) { continue; } if (best == null) { best = copy; continue; } try { if (Integer.parseInt(copy.getReservations()) < Long.parseLong(best.getReservations())) { best = copy; } else if (Integer.parseInt(copy.getReservations()) == Long .parseLong(best.getReservations())) { if (copy.getReturnDate().isBefore(best.getReturnDate())) { best = copy; } } } catch (NumberFormatException e) { } } if (best != null) { item.setReservation_info(best.getResInfo()); return reservation(item, acc, 0, null); } } return new ReservationResult(MultiStepResult.Status.ERROR, errmsg); } if (doc.select("#CirculationForm td[colspan=2] strong").size() >= 1) { return new ReservationResult(MultiStepResult.Status.OK, doc.select("#CirculationForm td[colspan=2] strong").get(0).text()); } return new ReservationResult(Status.OK); }
From source file:hu.tbognar76.apking.ApKing.java
public GoogleCategory getCategoryFromGooglePlayStore(String packageName) { GoogleCategory cc = new GoogleCategory(); cc.cat1 = "Unknown"; cc.cat2 = "Unknown"; String url = "https://play.google.com/store/apps/details?id=" + URI.create(packageName) + "&hl=en"; Document doc = null; try {// w w w . j av a 2 s.com doc = Jsoup.connect(url).get(); } catch (IOException e) { // TODO Auto-generated catch block // e.printStackTrace(); System.out.println("!! GooglePlay connect error with : " + url); return cc; } // <span itemprop="genre">letstlus</span> /* * Elements link = doc.select(".document-subtitle category"); String * linkHref = link.attr("href"); // "http://example.com/" String * linkText = link.text(); // "example"" */ Elements genres = doc.select("a[itemprop=genre]"); if (genres != null) { Element e = genres.first(); if (e != null) { cc.cat2 = e.text(); String hr = e.attr("href"); if (hr.indexOf("category/GAME") != -1 || hr.indexOf("category/FAMILY") != -1) { cc.cat1 = "Game"; } else { cc.cat1 = "Application"; } } else { System.out.println("!! GooglePlay parse error structure with : " + url); } } else { System.out.println("!! 
GooglePlay parse error with : " + url); } /* * for (Element e : genres) { // System.out.println(e.text()); if * (!out.equals("")) { out = out + " "; } out = out + e.text(); * * } */ // <div class="content" itemprop="softwareVersion"> 2.6.9.0 </div> // Elements versions = doc.select("div[itemprop=softwareVersion]"); // System.out.println(versions.first().text()); // <a class="document-subtitle category" // href="/store/apps/category/GAME_ADVENTURE"> <span // itemprop="genre">Kalandjtkok</span> </a> /* Elements maincat = doc.getElementsByClass("category"); if (maincat != null) { Element p = maincat.first(); if (p != null) { String href = maincat.attr("href"); if (href != null) { if ((href.lastIndexOf("GAME") != -1) || (href.lastIndexOf("FAMILY") != -1)) { cc.cat1 = "Game"; } else { cc.cat1 = "Application"; } } // cc.cat1 = maincat.attr("href"); } } */ // <img alt="PEGI 3" class="document-subtitle content-rating-badge" // src="//lpfw=h28"> // <span class="document-subtitle content-rating-title">PEGI 3</span> Elements pegi = doc.getElementsByClass("content-rating-title"); if (pegi != null) { Element p = pegi.first(); if (p != null) { // cc.cat1 = p.text(); } } return cc; }
From source file:crawler.HackerEarthCrawler.java
@Override public void crawl() { int flag = 0; //set of urls which should be crawled TreeSet<String> linksset = new TreeSet<String>(); TreeSet<String> tempset = new TreeSet<String>(); TreeSet<String> tutorialset = new TreeSet<String>(); //final set of problem urls TreeSet<String> problemset = new TreeSet<String>(); //visited for maintaing status of if url is already crawled or not TreeMap<String, Integer> visited = new TreeMap<String, Integer>(); //add base url linksset.add(baseUrl);// ww w .j av a2 s . c o m //mark base url as not crawled visited.put(baseUrl, 0); try { while (true) { flag = 0; tempset.clear(); for (String str : linksset) { //check if url is already crawled or not and it has valid domain name if ((visited.get(str) == 0) && (str.startsWith("https://www.hackerearth.com/"))) { System.out.println("crawling " + str); //retriving response of current url as document Document doc = Jsoup.connect(str).timeout(0).userAgent( "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/25.0") .referrer("http://www.google.com").ignoreHttpErrors(true).get(); //retriving all urls from current page Elements links = doc.select("a[href]"); //mark url as crawled visited.put(str, 1); //mark flag as url is crawled flag = 1; //retrive all urls for (Element link : links) { if (link.absUrl("href").endsWith("/tutorial/")) { tutorialset.add(link.absUrl("href")); } //check if url is problem url then add it in problemurlset if (link.absUrl("href").startsWith("https://www.hackerearth.com/") && isProblemUrl(link.absUrl("href"))) { problemset.add(link.absUrl("href")); } //check if url has valid domain and it has problem urls or not if (link.absUrl("href").contains(("https://www.hackerearth.com/")) && isCrawlable(link.absUrl("href"))) { //if link is not visited then mark it as uncrawled if (!visited.containsKey(link.absUrl("href"))) { visited.put(link.absUrl("href"), 0); } //add it in tempsetorary set tempset.add(link.absUrl("href")); //System.out.println("\n base: 
"+str+" ::: link : " + link.absUrl("href")); } } } } //if nothing is left to crawl break the loop if (flag == 0) { break; } //add all retrieved links to linksset linksset.addAll(tempset); } System.out.println("\n\ntotal problem urls " + problemset.size()); int i = 0; for (String str : problemset) { System.out.println("link " + i + " : " + str); i++; } } catch (IOException ex) { Logger.getLogger(HackerEarthCrawler.class.getName()).log(Level.SEVERE, null, ex); } //scrap and store into database //for every problem url scrap problem page for (String problemUrl : problemset) { System.out.println("problemUrl :" + problemUrl); try { //create problem class to store in database Problem problem = new Problem(); String problemSIOC = "", problemIOC = ""; String problemTitle = "", problemStatement = "", problemInput = "", problemOutput = "", problemConstraints = ""; String sampleInput = "", sampleOutput = ""; String problemExplanation = ""; //set default timelimit to 1 second double problemTimeLimit = 1.0; ArrayList<String> tags = new ArrayList<String>(); //get response for given problem url Response response = Jsoup.connect(problemUrl).execute(); Document doc = response.parse(); //retrieve problem title from page Element elementTitle = doc.getElementsByTag("title").first(); StringTokenizer stTitle = new StringTokenizer(elementTitle.text(), "|"); problemTitle = stTitle.nextToken().trim(); Element content = doc.getElementsByClass("starwars-lab").first(); problemSIOC = content.text(); Elements e = content.children(); //to find problem statement String breakloop[] = { "input", "input:", "input :", "input format:", "input format :", "input format", "Input and output", "constraints :", "constraints:", "constraints", "$$Input :$$" }; flag = 0; for (Element p : e) { String tempStatement = ""; for (Element pp : p.getAllElements()) { for (String strbreak : breakloop) { if (StringUtils.equalsIgnoreCase(pp.ownText(), strbreak)) { //System.out.println("strbreak :"+strbreak); tempStatement 
= p.text().substring(0, p.text().toLowerCase().indexOf(strbreak.toLowerCase())); // System.out.println("temp "+tempStatement); flag = 1; break; } } } if (flag == 1) { problemStatement += tempStatement; //remove extra space at end if (tempStatement.length() == 0) { problemStatement = problemStatement.substring(0, problemStatement.length() - 1); } break; } problemStatement += p.text() + " "; } System.out.println("problemSIOC :" + problemSIOC); System.out.println("problemStatement :" + problemStatement); if (problemStatement.length() <= problemSIOC.length()) { //remove problem statement from whole text and remove extra spaces at the beginning and the end problemIOC = problemSIOC.substring(problemStatement.length()).trim(); } else { problemIOC = ""; } System.out.println("problemIOC :" + problemIOC); //keywords for identifying input String decideInput[] = { "Input format :", "Input format:", "Input format", "inputformat:", "inputformat :", "inputformat", "input and output", "input :", "input:", "input" }; //keywords for identifying output String decideOutput[] = { "output format :", "output format:", "Output format", "outputformat:", "outputformat :", "outputformat", "output :", "output:", "output" }; //keywords for identifying constraint String decideConstraint[] = { "constraints:", "constraints :", "constraints", "Constraints :", "constraint:", "constraint :", "constraint", "Contraints :" }; int posin = 0, posoutput = 0, poscon = 0, idxin, idxout, idxcon, flaginput = 0, flagoutput = 0, flagcon = 0, inlen = 0, outlen = 0, conlen = 0; //find inputformat position,length of keyword for (idxin = 0; idxin < decideInput.length; idxin++) { if (StringUtils.containsIgnoreCase(problemIOC, decideInput[idxin])) { posin = problemIOC.toLowerCase().indexOf(decideInput[idxin].toLowerCase()); flaginput = 1; inlen = decideInput[idxin].length(); //decide it is keyowrd for actucal input or it is "sample input" if (StringUtils.containsIgnoreCase(problemIOC, "sample input")) { if (posin > 
problemIOC.toLowerCase().indexOf("sample input")) { flaginput = 0; inlen = 0; } else { break; } } else { break; } } } //find outputformat position,length of keyword for (idxout = 0; idxout < decideOutput.length; idxout++) { if (StringUtils.containsIgnoreCase(problemIOC, decideOutput[idxout])) { posoutput = problemIOC.toLowerCase().indexOf(decideOutput[idxout].toLowerCase()); flagoutput = 1; outlen = decideOutput[idxout].length(); break; } } //find constraint position,length of keyword for (idxcon = 0; idxcon < decideConstraint.length; idxcon++) { if (StringUtils.containsIgnoreCase(problemIOC, decideConstraint[idxcon])) { poscon = problemIOC.toLowerCase().indexOf(decideConstraint[idxcon].toLowerCase()); flagcon = 1; conlen = decideConstraint[idxcon].length(); break; } } System.out.println("input " + flaginput + " " + inlen + " " + posin); System.out.println("output " + flagoutput + " " + outlen + " " + posoutput); System.out.println("constraint " + flagcon + " " + conlen + " " + poscon); //retrieve problem input and output if present in problem page //if input format is present if (flaginput == 1) { //if input keyword is "input and output" and contraint is present in problem page if (idxin == 6 && flagcon == 1) { problemInput = problemIOC.substring(inlen, poscon); } //if input keyword is "input and output" and contraint is not present in problem page else if (idxin == 6 && flagcon == 0) { problemInput = problemIOC.substring(inlen); } //if output format and constraint is present else if (flagoutput == 1 && flagcon == 1) { //if constraint is present before input format if (poscon < posin) { problemInput = problemIOC.substring(posin + inlen, posoutput); problemOutput = problemIOC.substring(posoutput + outlen); } //if constraint is present before sample else if (poscon < posoutput) { problemInput = problemIOC.substring(inlen, poscon); problemOutput = problemIOC.substring(posoutput + outlen); } else { problemInput = problemIOC.substring(inlen, posoutput); problemOutput = 
problemIOC.substring(posoutput + outlen, poscon); } } //if constraint is not present else if (flagoutput == 1 && flagcon == 0) { problemInput = problemIOC.substring(inlen, posoutput); problemOutput = problemIOC.substring(posoutput + outlen); } else if (flagoutput == 0 && flagcon == 1) { if (poscon < posin) { problemInput = problemIOC.substring(posin + inlen); } else { problemInput = problemIOC.substring(poscon + conlen, posin); } problemOutput = ""; } else { problemInput = problemIOC.substring(inlen); problemOutput = ""; } } //if input format and output format is not present else { problemInput = ""; problemOutput = ""; } //if constraint is present if (flagcon == 1) { //if constraint is present before input format if (poscon < posin) { problemConstraints = problemIOC.substring(0, posin); } //if constraint is present before output format else if (poscon < posoutput) { problemConstraints = problemIOC.substring(poscon + conlen, posoutput); } else { problemConstraints = problemIOC.substring(poscon + conlen); } } System.out.println("problemInput :" + problemInput); System.out.println("problemOutput :" + problemOutput); System.out.println("problemConstraints :" + problemConstraints); //retrieve problem tags from problem page Element elementtag = doc.getElementsByClass("problem-tags").first().child(1); StringTokenizer st = new StringTokenizer(elementtag.text(), ","); while (st.hasMoreTokens()) { tags.add(st.nextToken().trim()); } //retrieve sample input sample output if present Element elementSIO = doc.getElementsByClass("input-output-container").first(); //if sample input output is present if (elementSIO != null) { //find position of sample output int soutpos = elementSIO.text().indexOf("SAMPLE OUTPUT"); sampleInput = elementSIO.text().substring(12, soutpos); sampleOutput = elementSIO.text().substring(soutpos + 13); System.out.println("Sample input :\n" + sampleInput + "\n\n\n"); System.out.println("Sample Output :\n" + sampleOutput); } else { sampleInput = ""; 
sampleOutput = ""; } //retrieve problem explanation from problem page if present Element elementExplanation = doc.getElementsByClass("standard-margin").first().child(0); if (elementExplanation.text().toLowerCase().contains("explanation")) { problemExplanation = elementExplanation.nextElementSibling().text(); } System.out.println("Explanation :" + problemExplanation); //retrieve timelimit Element elementTL = doc.getElementsByClass("problem-guidelines").first().child(0).child(1); StringTokenizer stTL = new StringTokenizer(elementTL.ownText(), " "); problemTimeLimit = Double.parseDouble(stTL.nextToken()); //System.out.println("problemTimeLimit :"+problemTimeLimit); //set all retrieved information to problem class problem.setProblemUrl(problemUrl); if (problemTitle.length() == 0) { problemTitle = null; } if (problemStatement.length() == 0) { problemStatement = null; } if (problemInput.length() == 0) { problemInput = null; } if (problemOutput.length() == 0) { problemOutput = null; } if (problemExplanation.length() == 0) { problemExplanation = null; } if (problemConstraints.length() == 0) { problemConstraints = null; } problem.setTitle(problemTitle); problem.setProblemUrl(problemUrl); problem.setProblemStatement(problemStatement); problem.setInputFormat(problemInput); problem.setOutputFormat(problemOutput); problem.setTimeLimit(problemTimeLimit); problem.setExplanation(problemExplanation); problem.setConstraints(problemConstraints); //set sample input output to problem class SampleInputOutput sampleInputOutput = new SampleInputOutput(problem, sampleInput, sampleOutput); problem.getSampleInputOutputs().add(sampleInputOutput); //set platform as hackerearth problem.setPlatform(Platform.HackerEarth); for (String strtag : tags) { problem.getTags().add(strtag); } //store in database Session session = null; Transaction transaction = null; try { //start session session = HibernateUtil.getSessionFactory().openSession(); transaction = session.beginTransaction(); //check if problem 
is already stored in database String hql = "FROM Problem p where p.problemUrl = :problem_url"; Problem oldProblem = (Problem) session.createQuery(hql).setString("problem_url", problemUrl) .uniqueResult(); String task; //if problem is present in database if (oldProblem != null) { //update the old problem task = "updated"; //retrieve id of old problem problem.setId(oldProblem.getId()); session.delete(oldProblem); session.flush(); session.save(problem); } else { task = "saved"; session.save(problem); } transaction.commit(); //log the info to console Logger.getLogger(CodeForcesCrawler.class.getName()).log(Level.INFO, "{0} {1}", new Object[] { task, problem.getProblemUrl() }); } catch (HibernateException ee) { if (transaction != null) { transaction.rollback(); } Logger.getLogger(CodeForcesCrawler.class.getName()).log(Level.SEVERE, "Cannot Insert/Update problem into databse: " + problemUrl, e); } finally { //close the session if (session != null) { session.close(); } } } catch (Exception ee) { System.out.println(ee.toString()); } } System.out.println("\n\n\n\ntutorial urls\n\n"); try { for (String tutorialurl : tutorialset) { //System.out.println(tutorialurl+"\n\n"); Response tutorialres = Jsoup.connect(tutorialurl).execute(); Document doc = tutorialres.parse(); Tutorial tutorial = new Tutorial(); tutorial.setContent(doc.getElementsByClass("tutorial").first().text()); tutorial.setName(baseUrl); tutorialurl = tutorialurl.substring(0, tutorialurl.length() - 10); StringTokenizer tutorialtok = new StringTokenizer(tutorialurl, "/"); String tempstr = ""; while (tutorialtok.hasMoreTokens()) { tempstr = tutorialtok.nextToken(); } Session session = null; Transaction transaction = null; try { //start session session = HibernateUtil.getSessionFactory().openSession(); transaction = session.beginTransaction(); //check if problem is already stored in database String hql = "FROM Tutorial p where p.name = :name"; Tutorial oldProblem = (Tutorial) 
session.createQuery(hql).setString("name", tempstr) .uniqueResult(); String task; //if problem is present in database if (oldProblem != null) { //update the old problem task = "updated"; //retrieve id of old problem tutorial.setName(oldProblem.getName()); session.delete(oldProblem); session.flush(); session.save(tutorial); } else { task = "saved"; tutorial.setName(tempstr); session.save(tutorial); } transaction.commit(); //log the info to console Logger.getLogger(CodeForcesCrawler.class.getName()).log(Level.INFO, "{0} {1}", new Object[] { task, tutorial.getName() }); } catch (HibernateException ee) { if (transaction != null) { transaction.rollback(); } Logger.getLogger(CodeForcesCrawler.class.getName()).log(Level.SEVERE, "Cannot Insert/Update problem into databse: " + tempstr, ee); } finally { //close the session if (session != null) { session.close(); } } } } catch (Exception e) { System.out.println(e.getMessage()); } }
From source file:de.geeksfactory.opacclient.apis.Bibliotheca.java
@Override public ReservationResult reservation(DetailledItem item, Account acc, int useraction, String selection) throws IOException { String reservation_info = item.getReservation_info(); Document doc = null; if (useraction == MultiStepResult.ACTION_CONFIRMATION) { List<NameValuePair> nameValuePairs = new ArrayList<>(2); nameValuePairs.add(new BasicNameValuePair("make_allvl", "Bestaetigung")); nameValuePairs.add(new BasicNameValuePair("target", "makevorbest")); httpPost(opac_url + "/index.asp", new UrlEncodedFormEntity(nameValuePairs), getDefaultEncoding()); return new ReservationResult(MultiStepResult.Status.OK); } else if (selection == null || useraction == 0) { String html = httpGet(opac_url + "/" + reservation_info, getDefaultEncoding()); doc = Jsoup.parse(html);/* w w w . ja v a 2 s . co m*/ if (doc.select("input[name=AUSWEIS]").size() > 0) { // Needs login List<NameValuePair> nameValuePairs = new ArrayList<>(2); nameValuePairs.add(new BasicNameValuePair("AUSWEIS", acc.getName())); nameValuePairs.add(new BasicNameValuePair("PWD", acc.getPassword())); if (data.has("db")) { try { nameValuePairs.add(new BasicNameValuePair("vkontodb", data.getString("db"))); } catch (JSONException e) { // TODO Auto-generated catch block e.printStackTrace(); } } nameValuePairs.add(new BasicNameValuePair("B1", "weiter")); nameValuePairs.add(new BasicNameValuePair("target", doc.select("input[name=target]").val())); nameValuePairs.add(new BasicNameValuePair("type", "VT2")); html = httpPost(opac_url + "/index.asp", new UrlEncodedFormEntity(nameValuePairs), getDefaultEncoding()); doc = Jsoup.parse(html); } if (doc.select("select[name=" + branch_inputfield + "]").size() == 0) { if (doc.select("select[name=VZST]").size() > 0) { branch_inputfield = "VZST"; } } if (doc.select("select[name=" + branch_inputfield + "]").size() > 0) { List<Map<String, String>> branches = new ArrayList<>(); for (Element option : doc.select("select[name=" + branch_inputfield + "]").first().children()) { String 
value = option.text().trim(); String key; if (option.hasAttr("value")) { key = option.attr("value"); } else { key = value; } Map<String, String> selopt = new HashMap<>(); selopt.put("key", key); selopt.put("value", value); branches.add(selopt); } _res_target = doc.select("input[name=target]").attr("value"); ReservationResult result = new ReservationResult(MultiStepResult.Status.SELECTION_NEEDED); result.setActionIdentifier(ReservationResult.ACTION_BRANCH); result.setSelection(branches); return result; } } else if (useraction == ReservationResult.ACTION_BRANCH) { List<NameValuePair> nameValuePairs = new ArrayList<>(2); nameValuePairs.add(new BasicNameValuePair(branch_inputfield, selection)); nameValuePairs.add(new BasicNameValuePair("button2", "weiter")); nameValuePairs.add(new BasicNameValuePair("target", _res_target)); String html = httpPost(opac_url + "/index.asp", new UrlEncodedFormEntity(nameValuePairs), getDefaultEncoding()); doc = Jsoup.parse(html); } if (doc == null) { return new ReservationResult(MultiStepResult.Status.ERROR); } if (doc.select("input[name=target]").size() > 0) { if (doc.select("input[name=target]").attr("value").equals("makevorbest")) { List<String[]> details = new ArrayList<>(); if (doc.getElementsByClass("kontomeldung").size() == 1) { details.add(new String[] { doc.getElementsByClass("kontomeldung").get(0).text().trim() }); } Pattern p = Pattern.compile("geb.hr", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE); for (Element div : doc.select(".kontozeile_center")) { for (String text : Jsoup.parse(div.html().replaceAll("(?i)<br[^>]*>", "br2n")).text() .split("br2n")) { if (p.matcher(text).find() && !text.contains("usstehend") && text.contains("orbestellung")) { details.add(new String[] { text.trim() }); } } } if (doc.select("#vorbest").size() > 0 && doc.select("#vorbest").val().contains("(")) { // Erlangen uses "Kostenpflichtige Vorbestellung (1 Euro)" // as the label of its reservation button details.add(new String[] { 
doc.select("#vorbest").val().trim() }); } for (Element row : doc.select(".kontozeile_center table tr")) { if (row.select(".konto_feld").size() == 1 && row.select(".konto_feldinhalt").size() == 1) { details.add(new String[] { row.select(".konto_feld").text().trim(), row.select(".konto_feldinhalt").text().trim() }); } } ReservationResult result = new ReservationResult(MultiStepResult.Status.CONFIRMATION_NEEDED); result.setDetails(details); return result; } } if (doc.getElementsByClass("kontomeldung").size() == 1) { return new ReservationResult(MultiStepResult.Status.ERROR, doc.getElementsByClass("kontomeldung").get(0).text()); } return new ReservationResult(MultiStepResult.Status.ERROR, stringProvider.getString(StringProvider.UNKNOWN_ERROR)); }
From source file:com.storm.function.GsxtFunction.java
private Map<String, Object> getHtmlInfoMapOfTianjin(String area, HtmlPage firstInfoPage, String keyword, ChannelLogger LOGGER) throws Exception { Map<String, Object> resultHtmlMap = new HashMap<String, Object>(); //????/*w w w.j a v a2s . com*/ // System.out.println(firstInfoPage.asXml()); WebWindow window = firstInfoPage.getWebClient().getCurrentWindow(); @SuppressWarnings("unchecked") List<HtmlAnchor> divByXPath = (List<HtmlAnchor>) firstInfoPage.getByXPath("//div[@class='result-item']"); HtmlElement firstByXPath = ((HtmlElement) firstInfoPage .getFirstByXPath("//div[@class='content']/div[@style='font-size:12px']")); if (divByXPath.size() == 0 && firstByXPath == null) { resultHtmlMap.put("statusCodeDef", StatusCodeDef.IMAGECODE_ERROR); } else { if (firstByXPath != null) { String textContent = firstByXPath.getTextContent(); if (textContent.indexOf("? 0 ?") > 0) { resultHtmlMap.put("statusCodeDef", StatusCodeDef.NO_DATA_FOUND); } else { resultHtmlMap.put("statusCodeDef", StatusCodeDef.SCCCESS); } } else { resultHtmlMap.put("statusCodeDef", StatusCodeDef.SCCCESS); } } @SuppressWarnings("unchecked") List<HtmlAnchor> anchors = (List<HtmlAnchor>) firstInfoPage.getByXPath("//div[@class='result-item']/div/a"); LOGGER.info(anchors.toString()); if (anchors != null && !anchors.isEmpty()) { boolean matchFlag = false; for (HtmlAnchor anchor : anchors) { String anchorTitle = anchor.getTextContent().toString().trim(); if (anchorTitle.contains(keyword)) { //???? matchFlag = true; //??? HtmlElement target_item_info = (HtmlElement) anchor.getParentNode().getParentNode(); resultHtmlMap.put("target_item_info", target_item_info.asXml()); //*****************? ***************** //?? ?->?->? //?entId //? 
String ent_id = anchor.getAttribute("href"); if (!StringUtils.isEmpty(ent_id)) { ent_id = ent_id.split("=")[1]; } if (!StringUtils.isEmpty(ent_id)) { /*HtmlPage gsgsxx_djxx_jbxx = anchor.click(); Thread.sleep(3000); resultHtmlMap.put("gsgsxx_djxx_jbxx", gsgsxx_djxx_jbxx.asXml()); */ String gsgsxx_djxx_jbxx_url = "http://tjcredit.gov.cn/platform/saic/baseInfo.json?entId=" + ent_id + "&departmentId=scjgw&infoClassId=dj"; Page gsgsxx_djxx_jbxx = firstInfoPage.getWebClient().getPage(window, new WebRequest(new URL(gsgsxx_djxx_jbxx_url))); resultHtmlMap.put("gsgsxx_djxx_jbxx", gsgsxx_djxx_jbxx.getWebResponse().getContentAsString("utf-8")); //?? ?->?->?->?-> Document qygsxxHtml = Jsoup .parseBodyFragment(gsgsxx_djxx_jbxx.getWebResponse().getContentAsString("utf-8")); if (qygsxxHtml != null) { Element qynbDiv = qygsxxHtml.getElementById("touziren"); if (qynbDiv != null) { Elements qynb_trs = qynbDiv.select("tbody").select("tr").select("td").select("a"); if (qynb_trs.size() != 0) { List<String> list = new ArrayList<String>(); for (int i = 0; i < qynb_trs.size(); i++) { //System.out.println(qynb_trs.get(i).toString()); //System.out.println(qynb_trs.get(i).toString().split("\\(\\'")[1].split("\\'\\)")[0].split("\\'\\,\\'")[0]); if (qynb_trs.get(i).toString() != null && qynb_trs.get(i).toString().split("\\(\\'")[1].split("\\'\\)")[0] .split("\\'\\,\\'")[0] != null) { String gsgsxx_djxx_gdxx_detail_url = "http://tjcredit.gov.cn/saicpf/gsgdcz?gdczid=" + qynb_trs.get(i).toString().split("\\(\\'")[1] .split("\\'\\)")[0].split("\\'\\,\\'")[0] + "&entid=" + ent_id + "&issaic=1&hasInfo=0"; Page gsgsxx_djxx_gdxx_detail = firstInfoPage.getWebClient().getPage( window, new WebRequest(new URL(gsgsxx_djxx_gdxx_detail_url))); list.add(gsgsxx_djxx_gdxx_detail.getWebResponse() .getContentAsString("utf-8")); } } if (list.size() != 0) { resultHtmlMap.put("gsgsxx_djxx_gdxx", list); } } } } //? ?->?->?? 
String gsgsxx_baxx_zyryxx_url = "http://tjcredit.gov.cn/platform/saic/baseInfo.json?entId=" + ent_id + "&departmentId=scjgw&infoClassId=ba"; Page gsgsxx_baxx_zyryxx = firstInfoPage.getWebClient().getPage(window, new WebRequest(new URL(gsgsxx_baxx_zyryxx_url))); resultHtmlMap.put("gsgsxx_baxx_zyryxx", gsgsxx_baxx_zyryxx.getWebResponse().getContentAsString("utf-8")); //? ?->?->? String gsgsxx_dcdydjxx_dcdydjxx_url = "http://tjcredit.gov.cn/platform/saic/baseInfo.json?entId=" + ent_id + "&departmentId=scjgw&infoClassId=dcdydjxx"; Page gsgsxx_dcdydjxx_dcdydjxx = firstInfoPage.getWebClient().getPage(window, new WebRequest(new URL(gsgsxx_dcdydjxx_dcdydjxx_url))); resultHtmlMap.put("gsgsxx_dcdydjxx_dcdydjxx", gsgsxx_dcdydjxx_dcdydjxx.getWebResponse().getContentAsString("utf-8")); //? ?->??->?? String gsgsxx_gqczdjxx_gqczdjxx_url = "http://tjcredit.gov.cn/platform/saic/baseInfo.json?entId=" + ent_id + "&departmentId=scjgw&infoClassId=gqczdjxx"; Page gsgsxx_gqczdjxx_gqczdjxx = firstInfoPage.getWebClient().getPage(window, new WebRequest(new URL(gsgsxx_gqczdjxx_gqczdjxx_url))); resultHtmlMap.put("gsgsxx_gqczdjxx_gqczdjxx", gsgsxx_gqczdjxx_gqczdjxx.getWebResponse().getContentAsString("utf-8")); //? ?->?->? 
String gsgsxx_xzcfxx_xzcfxx_url = "http://tjcredit.gov.cn/platform/saic/baseInfo.json?entId=" + ent_id + "&departmentId=scjgw&infoClassId=xzcf"; Page gsgsxx_xzcfxx_xzcfxx = firstInfoPage.getWebClient().getPage(window, new WebRequest(new URL(gsgsxx_xzcfxx_xzcfxx_url))); resultHtmlMap.put("gsgsxx_xzcfxx_xzcfxx", gsgsxx_xzcfxx_xzcfxx.getWebResponse().getContentAsString("utf-8")); // Document xzcfxxHtml = Jsoup.parseBodyFragment( gsgsxx_xzcfxx_xzcfxx.getWebResponse().getContentAsString("utf-8")); if (xzcfxxHtml != null) { Elements qynbDiv = xzcfxxHtml.getElementsByClass("result-table"); if (qynbDiv != null && qynbDiv.size() != 0) { Elements qynb_trs = qynbDiv.get(0).select("tbody").select("tr").select("td") .select("a"); if (qynb_trs.size() != 0) { List<String> list = new ArrayList<String>(); for (int i = 0; i < qynb_trs.size(); i++) { if (qynb_trs.get(i).toString() != null && qynb_trs.get(i).toString().split("\\(\\'")[1].split("\\'\\)")[0] .split("\\'\\,\\'")[0] != null) { String gsgsxx_djxx_gdxx_detail_url = "http://tjcredit.gov.cn/saicpf/gsxzcf?id=" + qynb_trs.get(i).toString().split("\\(\\'")[1] .split("\\'\\)")[0].split("\\'\\,\\'")[0] + "&entid=" + ent_id + "&issaic=1&hasInfo=0"; Page gsgsxx_djxx_gdxx_detail = firstInfoPage.getWebClient().getPage( window, new WebRequest(new URL(gsgsxx_djxx_gdxx_detail_url))); list.add(gsgsxx_djxx_gdxx_detail.getWebResponse() .getContentAsString("utf-8")); } } if (list.size() != 0) { resultHtmlMap.put("gsgsxx_xzcfxx_detail", list); } } } } //? ?->???->??? String gsgsxx_jyycxx_jyycxx_url = "http://tjcredit.gov.cn/platform/saic/baseInfo.json?entId=" + ent_id + "&departmentId=scjgw&infoClassId=qyjyycmlxx"; Page gsgsxx_jyycxx_jyycxx = firstInfoPage.getWebClient().getPage(window, new WebRequest(new URL(gsgsxx_jyycxx_jyycxx_url))); resultHtmlMap.put("gsgsxx_jyycxx_jyycxx", gsgsxx_jyycxx_jyycxx.getWebResponse().getContentAsString("utf-8")); //? ?->???->??? 
String gsgsxx_yzwfxx_yzwfxx_url = "http://tjcredit.gov.cn/platform/saic/baseInfo.json?entId=" + ent_id + "&departmentId=scjgw&infoClassId=yzwfqyxx"; Page gsgsxx_yzwfxx_yzwfxx = firstInfoPage.getWebClient().getPage(window, new WebRequest(new URL(gsgsxx_yzwfxx_yzwfxx_url))); resultHtmlMap.put("gsgsxx_yzwfxx_yzwfxx", gsgsxx_yzwfxx_yzwfxx.getWebResponse().getContentAsString("utf-8")); //? ?->?->? String gsgsxx_ccjcxx_ccjcxx_url = "http://tjcredit.gov.cn/platform/saic/baseInfo.json?entId=" + ent_id + "&departmentId=scjgw&infoClassId=ccjcxx"; Page gsgsxx_ccjcxx_ccjcxx = firstInfoPage.getWebClient().getPage(window, new WebRequest(new URL(gsgsxx_ccjcxx_ccjcxx_url))); resultHtmlMap.put("gsgsxx_ccjcxx_ccjcxx", gsgsxx_ccjcxx_ccjcxx.getWebResponse().getContentAsString("utf-8")); //*****************? ?***************** //*****************?? ***************** //? ??->?-> String qygsxx_qynb_list_url = "http://tjcredit.gov.cn/report/nblist?entid=" + ent_id; Page qygsxx_qynb_list = firstInfoPage.getWebClient().getPage(window, new WebRequest(new URL(qygsxx_qynb_list_url))); resultHtmlMap.put("qygsxx_qynb_list", qygsxx_qynb_list.getWebResponse().getContentAsString("utf-8")); //? ??->?->-> //? 
Document qynbHtml = Jsoup .parseBodyFragment(qygsxx_qynb_list.getWebResponse().getContentAsString("utf-8")); if (qynbHtml != null) { Element qynbDiv = qynbHtml.getElementById("touziren"); if (qynbDiv != null) { Elements qynb_trs = qynbDiv.select("tbody").select("tr").select("td").select("a"); if (qynb_trs.size() != 0) { List<String> list = new ArrayList<String>(); for (int i = 0; i < qynb_trs.size(); i++) { //System.out.println(qynb_trs.get(i).toString()); //System.out.println(qynb_trs.get(i).toString().split("\\(\\'")[1].split("\\'\\)")[0].split("\\'\\,\\'")[0]); if (qynb_trs.get(i).toString() != null && qynb_trs.get(i).toString().split("\\(\\'")[1].split("\\'\\)")[0] .split("\\'\\,\\'")[1] != null) { String gsgsxx_djxx_gdxx_detail_url = "http://tjcredit.gov.cn/report/annals?entid=" + ent_id + "&year=" + qynb_trs.get(i).toString().split("\\(\\'")[1] .split("\\'\\)")[0].split("\\'\\,\\'")[1] + "&hasInfo=0"; Page gsgsxx_djxx_gdxx_detail = firstInfoPage.getWebClient().getPage( window, new WebRequest(new URL(gsgsxx_djxx_gdxx_detail_url))); System.out.println(gsgsxx_djxx_gdxx_detail.getWebResponse() .getContentAsString("utf-8")); list.add(gsgsxx_djxx_gdxx_detail.getWebResponse() .getContentAsString("utf-8")); } } if (list.size() != 0) { resultHtmlMap.put("qygsxx_qynb_detail", list); } } } } //? ??->?? String qygsxx_xzxkxx_url = "http://tjcredit.gov.cn/report/xzxk?entid=" + ent_id; Page qygsxx_xzxkxx = firstInfoPage.getWebClient().getPage(window, new WebRequest(new URL(qygsxx_xzxkxx_url))); resultHtmlMap.put("qygsxx_xzxkxx", qygsxx_xzxkxx.getWebResponse().getContentAsString("utf-8")); //? ??->?? String qygsxx_gdjczxx_url = "http://tjcredit.gov.cn/report/gdcz?entid=" + ent_id; Page qygsxx_gdjczxx = firstInfoPage.getWebClient().getPage(window, new WebRequest(new URL(qygsxx_gdjczxx_url))); resultHtmlMap.put("qygsxx_gdjczxx", qygsxx_gdjczxx.getWebResponse().getContentAsString("utf-8")); //? ??->??? 
String qygsxx_gqbgxx_url = "http://tjcredit.gov.cn/report/gqbg?entid=" + ent_id; Page qygsxx_gqbgxx = firstInfoPage.getWebClient().getPage(window, new WebRequest(new URL(qygsxx_gqbgxx_url))); resultHtmlMap.put("qygsxx_gqbgxx", qygsxx_gqbgxx.getWebResponse().getContentAsString("utf-8")); //? ??->?? String qygsxx_zscqczdjxx_url = "http://tjcredit.gov.cn/report/zscq?entid=" + ent_id; Page qygsxx_zscqczdjxx = firstInfoPage.getWebClient().getPage(window, new WebRequest(new URL(qygsxx_zscqczdjxx_url))); resultHtmlMap.put("qygsxx_zscqczdjxx", qygsxx_zscqczdjxx.getWebResponse().getContentAsString("utf-8")); //? ??->? String qygsxx_xzcfxx_url = "http://tjcredit.gov.cn/report/xzcf?entid=" + ent_id; Page qygsxx_xzcfxx = firstInfoPage.getWebClient().getPage(window, new WebRequest(new URL(qygsxx_xzcfxx_url))); resultHtmlMap.put("qygsxx_xzcfxx", qygsxx_xzcfxx.getWebResponse().getContentAsString("utf-8")); //*****************?? ?***************** //*****************???? ***************** //? ????->?? String sfxzgsxx_gqdjxx_list_url = "http://tjcredit.gov.cn/report/gddjlist?entid=" + ent_id; Page sfxzgsxx_gqdjxx_list = firstInfoPage.getWebClient().getPage(window, new WebRequest(new URL(sfxzgsxx_gqdjxx_list_url))); resultHtmlMap.put("sfxzgsxx_gqdjxx_list", sfxzgsxx_gqdjxx_list.getWebResponse().getContentAsString("utf-8")); /* //? ????->??->-> @SuppressWarnings("unchecked") List<HtmlAnchor> anchors4 = (List<HtmlAnchor>) sfxzgsxx_gqdjxx_list_page.getByXPath("//table[@id='touziren']/tbody[@id='table2']/tr/td/a"); if (anchors4!=null && !anchors4.isEmpty()) { List<String> detail=new ArrayList<String>(); for (@SuppressWarnings("unused") HtmlAnchor anchor4 : anchors4) { HtmlPage sfxzgsxx_gqdjxx_detail = anchor4.click(); // System.out.println("gsgsxx_qynb_detail.asXml()="+gsgsxx_qynb_detail.asXml()); detail.add(sfxzgsxx_gqdjxx_detail.asXml()); } resultHtmlMap.put("sfxzgsxx_gqdjxx_details",detail); }*/ //? ??->?? 
String qygsxx_gdbgxx_list_url = "http://tjcredit.gov.cn/report/gdbglist?entid=" + ent_id; Page qygsxx_gdbgxx_list = firstInfoPage.getWebClient().getPage(window, new WebRequest(new URL(qygsxx_gdbgxx_list_url))); resultHtmlMap.put("qygsxx_gdbgxx_list", qygsxx_gdbgxx_list.getWebResponse().getContentAsString("utf-8")); //? ??->??->-> /* @SuppressWarnings("unchecked") List<HtmlAnchor> anchors5 = (List<HtmlAnchor>) qygsxx_gdbgxx_page.getByXPath("//table[@id='touziren']/tbody[@id='table2']/tr/td/a"); if (anchors5!=null && !anchors5.isEmpty()) { List<String> detail=new ArrayList<String>(); for (@SuppressWarnings("unused") HtmlAnchor anchor5 : anchors5) { HtmlPage qygsxx_gdbgxx_detail = anchor5.click(); // System.out.println("gsgsxx_qynb_detail.asXml()="+gsgsxx_qynb_detail.asXml()); detail.add(qygsxx_gdbgxx_detail.asXml()); } resultHtmlMap.put("qygsxx_gdbgxx_details",detail); }*/ //*****************???? ?***************** } break;// } } if (!matchFlag) { resultHtmlMap.put("statusCodeDef", StatusCodeDef.NO_DATA_FOUND); LOGGER.info("????"); } } return resultHtmlMap; }