List of usage examples for the org.jsoup.nodes.Document method text()
public String text()
From source file:explore.ArgminCorpusReader.java
@Override public void getNext(JCas aJcas) throws CollectionException { try {/*from w w w . j a v a 2 s.c om*/ Map<String, Object> jsonData = this.documentsIterator.next(); String htmlText = (String) jsonData.get(JsonCorpusUtil.TEXT); org.jsoup.nodes.Document cleanedText = Jsoup.parse(htmlText); String rawDocumentText = cleanedText.text(); String file = (String) jsonData.get(JsonCorpusUtil.FILE); String documentId = file.replace(".json", ""); String url = (String) jsonData.get(JsonCorpusUtil.URL); // original HTML version not required for TC experiment // JCas view = jCas.createView(JsonCorpusUtil.VIEW_ORIGINAL_HTML); // view.setDocumentText(htmlText); aJcas.setDocumentText(rawDocumentText); aJcas.setDocumentLanguage(this.language); DocumentMetaData metaData = DocumentMetaData.create(aJcas); metaData.setDocumentBaseUri(""); metaData.setDocumentUri("/" + documentId); metaData.setDocumentTitle(url); metaData.setDocumentId(documentId); Map<Integer, Token> idxToTokenMapping = this.createIndexToTokenMapping(rawDocumentText); @SuppressWarnings("unchecked") List<Map<String, Object>> userAnnotations = (List<Map<String, Object>>) jsonData .get(JsonCorpusUtil.USER_ANNOTATIONS); for (Map<String, Object> userAnnotation : userAnnotations) { String annotator = (String) userAnnotation.get(JsonCorpusUtil.ANNOTATOR); if (annotator.equals(this.annotator)) { @SuppressWarnings("unchecked") List<String> argUnits = (List<String>) userAnnotation.get(JsonCorpusUtil.ARGUMENTATION_UNITS); for (String argUnit : argUnits) { String cleanedArgUnit = argUnit.replaceAll("\\s+", ""); Matcher matcher = JsonCorpusUtil.getRecognitionPattern().matcher(cleanedArgUnit); if (!matcher.matches()) { this.getLogger() .warn(String.format("argument unit %s does not match the expected pattern %s", cleanedArgUnit, JsonCorpusUtil.getRecognitionPattern().pattern())); } else { // ************************************************** // coordinates of an argument unit: String label = matcher.group(1); String 
stringIndices = matcher.group(3).replaceAll("^,", ""); List<Integer> indices = CollectionUtils.parseIntList(stringIndices, ","); int firstIndex = Collections.min(indices); Token firstToken = idxToTokenMapping.get(firstIndex); int lastIndex = Collections.max(indices); Token lastToken = idxToTokenMapping.get(lastIndex); // ***************************************************** // Read argument unit as Paragraph annotation Paragraph para = new Paragraph(aJcas, firstToken.getBegin(), lastToken.getEnd()); para.addToIndexes(); // print some counts: System.out.println("annotator: " + annotator); counter++; System.out .println("AU " + counter + " -- argument unit text: " + para.getCoveredText()); System.out.println("label: " + label); if (label.contains("claim")) { claims++; } else { premises++; } System.out.println("premises " + premises + "\t claims " + claims); NamedEntity outcome = new NamedEntity(aJcas, firstToken.getBegin(), lastToken.getEnd()); outcome.setValue(label); outcome.addToIndexes(); } // matching was ok } // for argUnit : argUnits ++this.nextDocumentIdx; } // if annotator.equals(this.annotator) } } catch (final CASException e) { throw new CollectionException(e); } catch (final ResourceInitializationException e) { throw new CollectionException(e); } catch (final UIMAException e) { throw new CollectionException(e); } }
From source file:eu.sisob.uma.extractors.adhoc.websearchers.WebSearchersExtractor.java
/** * * @param nextLine/*from w w w . j av a2s . c o m*/ * @param idStaffIdentifier * @param idName * @param idFirstName * @param idLastName * @param idInitials * @param idSubject * @param idInstitutionName * @param idWebAddress * @param expression * @param params * @return */ @Override protected String get_result(String[] nextLine, int idStaffIdentifier, int idName, int idFirstName, int idLastName, int idInitials, int idSubject, int idInstitutionName, int idWebAddress, String expression, Object[] params) { String keywords = " (PROFILE OR PHD OR RESEARCHER OR FACULTY OR PROFESSOR OR RESEARCH) AND "; keywords = ""; String domain = clean_site(nextLine[idWebAddress]); String subject = nextLine[idSubject]; String and_institution_name = (idInstitutionName != -1 ? " AND " + nextLine[idInstitutionName] : ""); String expression_subject = expression + " AND " + subject; String expression_site = expression + " site: " + domain; String expression_inst_name = expression + and_institution_name; String expression_inst_name_and_subject = expression + and_institution_name + " AND " + subject; String url = ""; switch (search_patterns) { case P1: url = "https://duckduckgo.com/html/?q=" + keywords + expression; break; case P2: url = "https://duckduckgo.com/html/?q=" + keywords + expression_subject; break; case P3: url = "https://duckduckgo.com/html/?q=" + keywords + expression_site; break; case P4: url = "https://duckduckgo.com/html/?q=" + keywords + expression_inst_name; break; case P5: url = "https://duckduckgo.com/html/?q=" + keywords + expression_inst_name_and_subject; break; default: url = "https://duckduckgo.com/html/?q=" + keywords + expression_subject; break; } Logger.getRootLogger().info("Go with " + url); boolean again = false; Document doc = null; do { doc = getDocumentFromPage(url, 10, 1000, 5000); if (doc != null && doc.text().contains("If this error persists, please let us know")) { try { Thread.sleep(30000); } catch (InterruptedException ex) { } again = true; } else { 
again = false; } } while (again); String final_result = ""; if (doc != null && doc.select("div[class*=links_main] > a").size() > 0) { /* Write resercher founded */ Elements elements = doc.select("div[class*=links_main] > a"); /* We will take the first html page and the first pdf */ HashMap<String, String> results = new HashMap<String, String>(); int max_results = 2; int i_result = 0; for (Element e : elements) { if ((e.text().startsWith("[") //&& !e.text().startsWith("[PDF]") ) || e.absUrl("href").contains("duckduckgo.com/y.js") || e.absUrl("href").contains("wikipedia.") || e.absUrl("href").contains("facebook.com") || e.absUrl("href").contains("microsoft.com") || e.absUrl("href").contains("google.com") || e.absUrl("href").contains("linkedin") || e.absUrl("href").contains("www.biography.com") || e.absUrl("href").contains("biomedexperts.com") || e.absUrl("href").contains("www.experts.scival.com") || e.absUrl("href").contains("ratemyprofessors.com") || e.absUrl("href").contains("flickr.com") || e.absUrl("href").endsWith(".txt") || e.absUrl("href").endsWith(".csv") || e.absUrl("href").endsWith(".xml") || e.absUrl("href").endsWith(".doc") || e.absUrl("href").endsWith(".docx") || e.absUrl("href").endsWith(".xls") || e.absUrl("href").endsWith(".xlxs") || e.absUrl("href").contains("www.amazon")) { max_results++; continue; } boolean add = false; String score = ""; String ext = ""; if (!results.containsKey("HTML") && !e.text().startsWith("[")) { //results.put("html", ) File temp; try { temp = File.createTempFile("temp-file-name", ".tmp"); URL fetched_url = Downloader.fetchURL(e.absUrl("href")); FileUtils.copyURLToFile(fetched_url, temp); long sizeInBytes = temp.length(); long sizeInMb = sizeInBytes / (1024 * 1024); if (sizeInMb > 100) { score = "B"; } else { String content = FileUtils.readFileToString(temp); if (content.contains(nextLine[idLastName])) { score = "A"; } else { score = "B"; } } } catch (IOException ex) { score = "B"; } ext = "HTML"; add = true; } 
//if(!results.containsKey("PDF") && e.text().startsWith("[PDF]")){ // score = "A"; // ext = "PDF"; // add = true; //} if (add) { String result = ""; result += "\"" + nextLine[idStaffIdentifier] + "\";"; result += "\"" + nextLine[idLastName] + "\";"; result += "\"" + nextLine[idInitials] + "\";"; if (idFirstName != -1) result += "\"" + nextLine[idFirstName] + "\";"; if (idName != -1) result += "\"" + nextLine[idName] + "\";"; result += "\"" + e.absUrl("href") + "\";"; result += "\"" + ext + "\";"; result += "\"" + "CV" + "\";"; result += "\"" + score + "\""; result += "\r\n"; results.put(ext, result); Logger.getRootLogger().info("Select " + e.absUrl("href") + " - " + e.text()); } // if(results.containsKey("PDF") && results.containsKey("HTML")){ // break; // } i_result++; if (max_results <= i_result) { break; } } // if(results.containsKey("PDF")) // final_result = results.get("PDF"); // else if (results.containsKey("HTML")) final_result = results.get("HTML"); else final_result = ""; } return final_result; }
From source file:eu.sisob.uma.extractors.adhoc.websearchers_cv.WebSearchersCVExtractor.java
/** * * @param nextLine/* w w w . ja va 2 s .com*/ * @param idStaffIdentifier * @param idName * @param idFirstName * @param idLastName * @param idInitials * @param idSubject * @param idInstitutionName * @param idWebAddress * @param expression * @param params * @return */ @Override protected String get_result(String[] nextLine, int idStaffIdentifier, int idName, int idFirstName, int idLastName, int idInitials, int idSubject, int idInstitutionName, int idWebAddress, String expression, Object[] params) { String domain = clean_site(nextLine[idWebAddress]); String subject = nextLine[idSubject]; String expression_subject = expression + " " + subject + " " + files + " " + cv_keywords_in_query; expression_subject = expression_subject.replaceAll("\t", " "); expression_subject = expression_subject.replaceAll(" ", " "); String url = "https://duckduckgo.com/html/?q=" + expression_subject; Logger.getRootLogger().info("Go with " + url); boolean again = false; Document doc = null; do { doc = getDocumentFromPage(url, 10, 2000, 5000); if (doc != null && doc.text().contains("If this error persists, please let us know")) { try { Thread.sleep(30000); } catch (InterruptedException ex) { } again = true; } else { again = false; } } while (again); //if(doc.select("div[class*=links_main] > a[href*=" + domain + "]").size() > 0){ String final_result = ""; if (doc != null && doc.select("div[class*=links_main] > a").size() > 0) { /* Write resercher founded */ Elements elements = doc.select("div[class*=links_main] > a"); /* We will take the first html page and the first pdf */ List<String[]> results = new ArrayList<String[]>(); final int EXT_I = 0; final int SCORE_INT_I = 1; final int SCORE_LETTER_I = 2; final int RESULT_I = 3; final int WORST_SCORE = 67; //int max_results = elements.size(); //int i_result = 0; for (Element e : elements) { if ((e.text().startsWith("[") && !e.text().startsWith("[PDF]")) || e.absUrl("href").contains("duckduckgo.com/y.js") || 
e.absUrl("href").contains("wikipedia.") || e.absUrl("href").contains("microsoft.com") || e.absUrl("href").contains("google.com") || e.absUrl("href").contains("linkedin") || e.absUrl("href").contains("www.biography.com") || e.absUrl("href").contains("biomedexperts.com") || e.absUrl("href").contains("www.experts.scival.com") || e.absUrl("href").contains("ratemyprofessors.com") || e.absUrl("href").contains("flickr.com") || e.absUrl("href").endsWith(".txt") || e.absUrl("href").endsWith(".csv") || e.absUrl("href").endsWith(".xml") || e.absUrl("href").endsWith(".doc") || e.absUrl("href").endsWith(".docx") || e.absUrl("href").endsWith(".xls") || e.absUrl("href").endsWith(".xlxs") || e.absUrl("href").contains("www.amazon")) { continue; } boolean add = false; int score_int = WORST_SCORE; String score = ""; String ext = ""; if (e.text().startsWith("[PDF]") || e.text().startsWith("[DOCX]") || e.text().startsWith("[DOC]") || e.text().startsWith("[RTF]")) { String clean_name_1 = e.text().replaceAll("[^\\w\\s]", "").toLowerCase(); int i = e.absUrl("href").lastIndexOf("/"); int f = e.absUrl("href").lastIndexOf("."); String clean_name_2 = ""; if (i != -1 && f != -1) clean_name_2 = e.absUrl("href").substring(i, f).toLowerCase(); boolean b = false; for (String k : cv_keywords_in_name_list) { if (clean_name_1.contains(k) || clean_name_2.contains(k)) { b = true; break; } } if (b) { score_int--; } if (clean_name_1.contains(nextLine[idLastName]) || clean_name_2.contains(nextLine[idLastName])) { score_int--; } score = Character.toChars(score_int)[0] + ""; add = true; ext = "PDF"; } //if(!results.containsKey("HTML") && !e.text().startsWith("[")){ //} if (add) { String result = ""; result += "\"" + nextLine[idStaffIdentifier] + "\";"; result += "\"" + nextLine[idLastName] + "\";"; result += "\"" + nextLine[idInitials] + "\";"; if (idFirstName != -1) result += "\"" + nextLine[idFirstName] + "\";"; if (idName != -1) result += "\"" + nextLine[idName] + "\";"; result += "\"" + e.absUrl("href") 
+ "\";"; result += "\"" + ext + "\";"; result += "\"" + "CV" + "\";"; result += "\"" + score + "\""; result += "\r\n"; results.add(new String[] { ext, score_int + "", score, result }); Logger.getRootLogger().info("Select " + e.absUrl("href") + " - " + score + " - " + e.text()); } } final_result = ""; int best_score = WORST_SCORE; for (String[] result : results) { if (result[EXT_I].equals("PDF")) { int act_score = Integer.parseInt(result[SCORE_INT_I]); if (act_score < best_score) { best_score = act_score; final_result = result[RESULT_I]; } } } } return final_result; }
From source file:index.IndexManager.java
public static Triple<SolrInputDocument, Collection<String>, Collection<String>> index(Document document) { final SolrInputDocument index = new SolrInputDocument(); index.setField("id", document.location()); index.setField("time", String.valueOf(System.currentTimeMillis())); index.setField("title", document.title()); final Set<String> links = document.select("a[href]").stream().map(e -> e.attr("abs:href")) .collect(Collectors.toSet()); final Set<String> media = document.select("[src]").stream().map(e -> e.attr("abs:src")) .collect(Collectors.toSet()); links.forEach(link -> index.addField("link", link)); media.forEach(link -> index.addField("media", link)); formatText(document.getElementsByTag("h1").stream()).forEach(e -> index.addField("h1", e)); formatText(document.getElementsByTag("h2").stream()).forEach(e -> index.addField("h2", e)); formatText(document.getElementsByTag("h3").stream()).forEach(e -> index.addField("h3", e)); formatText(document.getElementsByTag("strong").stream()).forEach(e -> index.addField("strong", e)); formatText(document.getElementsByTag("em").stream()).forEach(e -> index.addField("em", e)); formatText(document.getElementsByTag("b").stream()).forEach(e -> index.addField("b", e)); formatText(document.getElementsByTag("u").stream()).forEach(e -> index.addField("u", e)); formatText(document.getElementsByTag("i").stream()).forEach(e -> index.addField("i", e)); int i = 0;//from ww w.j a va 2 s .c o m Collection<String> text = chunkToLength(document.text()); for (String chunk : text) index.addField(++i + "_text", chunk); return Triple.of(index, links, media); }
From source file:me.vertretungsplan.parser.TurboVertretungParser.java
/**
 * Parses one TurboVertretung day page and adds the resulting day to the schedule.
 *
 * <p>The date is read from the {@code .Titel} element and the last-change timestamp from
 * {@code .Stand}. If the page states "Kein Vertretungsplan", the day is added without further
 * content. Otherwise the optional message sections and every data row of the first table are
 * converted. A page without any table yields a day that carries only its messages.
 *
 * @param v   schedule that receives the parsed day
 * @param doc parsed day page
 */
private void parseTurboVertretungDay(SubstitutionSchedule v, Document doc) {
    SubstitutionScheduleDay day = new SubstitutionScheduleDay();

    // Accept both "für" and the umlaut-less "fr" after "Vertretungsplan"; the original pattern
    // only knew "fr" and therefore left "für " in front of the date.
    String date = doc.select(".Titel").text().replaceFirst("Vertretungsplan( f\u00fc?r)? ", "");
    day.setDate(DateTimeFormat.forPattern("EEEE, d. MMMM yyyy").withLocale(Locale.GERMAN).parseLocalDate(date));

    String lastChange = doc.select(".Stand").text().replace("Stand: ", "");
    day.setLastChange(DateTimeFormat.forPattern("dd.MM.yyyy HH:mm:ss").withLocale(Locale.GERMAN)
            .parseLocalDateTime(lastChange));

    if (doc.text().contains("Kein Vertretungsplan")) {
        v.addDay(day);
        return;
    }

    // Each optional section has a companion element "<selector>Label" holding its heading.
    String[] messageSelectors = { ".LehrerFrueher", ".LehrerVerplant", ".Abwesenheiten-Klassen" };
    for (String selector : messageSelectors) {
        if (doc.select(selector).size() > 0) {
            day.addMessage(doc.select(selector + "Label").text() + "\n" + doc.select(selector).text());
        }
    }

    Element table = doc.select("table").first();
    if (table != null) { // first() returns null when the page has no table
        for (Element row : table.select("tr:has(td)")) {
            String remark = row.select(query("Anmerkung")).text();

            Substitution substitution = new Substitution();
            substitution.setLesson(row.select(query("Stunde")).text());
            substitution.setPreviousTeacher(row.select(query("Lehrer")).text());
            substitution.setTeacher(row.select(query("Vertretung")).text());
            substitution.setClasses(new HashSet<>(Arrays.asList(row.select(query("Klasse")).text().split(" "))));
            substitution.setSubject(row.select(query("Fach")).text());
            substitution.setDesc(remark);
            substitution.setRoom(row.select(query("Raum")).text());

            String type = recognizeType(remark);
            if (type == null) {
                type = "Vertretung";
            }
            substitution.setType(type);
            substitution.setColor(colorProvider.getColor(type));
            day.addSubstitution(substitution);
        }
    }
    v.addDay(day);
}
From source file:com.spd.ukraine.lucenewebsearch1.web.IndexingController.java
/** * Method used to create the record of given web page in search database. * * @param webPage webPage.url is entered url * @return webPage for success or null for fail *//*from w ww . j a v a 2 s. c om*/ private WebPage createWebPageRecord(WebPage webPage) { try { Document html = Jsoup.connect(webPage.getUrl()).get(); referencedSites.clear(); indexElements(webPage, html, 0); System.out.println(html.text()); System.out.println("number of indexed fields is " + indexWriter.numDocs()); // indexWriter.commit(); indexWriter.close(); return webPage; } catch (Exception ex) { System.out.println("createWebPageRecord " + ex.getMessage()); // ex.printStackTrace(); return null; } }
From source file:me.vertretungsplan.parser.UntisMonitorParser.java
private void loadUrl(String url, String encoding, boolean following, List<Document> docs, String startUrl, int recursionDepth) throws IOException, CredentialInvalidException { String html;//w w w . ja va 2 s .c om if (url.equals(VALUE_URL_LOGIN_RESPONSE)) { html = loginResponse; } else { try { html = httpGet(url, encoding).replace(" ", ""); } catch (HttpResponseException e) { if (docs.size() == 0) { throw e; } else { return; // ignore if first page was loaded and redirect didn't work } } } Document doc = Jsoup.parse(html); doc.setBaseUri(url); if (doc.select(".mon_title").size() == 0) { // We have a problem - there seems to be no substitution schedule. Maybe it is hiding // inside a frame? if (doc.select("frameset frame[name").size() > 0) { for (Element frame : doc.select("frameset frame")) { if (frame.attr("src").matches(".*subst_\\d\\d\\d.html?") && recursionDepth < MAX_RECURSION_DEPTH) { String frameUrl = frame.absUrl("src"); loadUrl(frame.absUrl("src"), encoding, following, docs, frameUrl, recursionDepth + 1); } } } else if (doc.text().contains("registriert")) { throw new CredentialInvalidException(); } else { if (docs.size() == 0) { // ignore if first page was loaded and redirect didn't work throw new IOException( "Could not find .mon-title, seems like there is no Untis " + "schedule here"); } } } else { findSubDocs(docs, html, doc); if (following && doc.select("meta[http-equiv=refresh]").size() > 0) { Element meta = doc.select("meta[http-equiv=refresh]").first(); String attr = meta.attr("content").toLowerCase(); String redirectUrl = url.substring(0, url.lastIndexOf("/") + 1) + attr.substring(attr.indexOf("url=") + 4); if (!redirectUrl.equals(startUrl) && recursionDepth < MAX_RECURSION_DEPTH) { loadUrl(redirectUrl, encoding, true, docs, startUrl, recursionDepth + 1); } } } }
From source file:de.geeksfactory.opacclient.apis.IOpac.java
@Override public ProlongResult prolong(String media, Account account, int useraction, String Selection) throws IOException { // internal convention: We add "NEW" to the media ID to show that we have the new iOPAC // version//from ww w .j a va 2 s . c om if (media.startsWith("NEW")) { String mediaNr = media.substring(3); String html = httpGet( opac_url + "/cgi-bin/di.exe?mode=42&MedNrVerlAll=" + URLEncoder.encode(mediaNr, "UTF-8"), getDefaultEncoding()); Document doc = Jsoup.parse(html); if (doc.text().contains("1 Medium wurde verl")) { return new ProlongResult(MultiStepResult.Status.OK); } else { return new ProlongResult(MultiStepResult.Status.ERROR, doc.text()); } } else { String html = httpGet(opac_url + "/" + media, getDefaultEncoding()); Document doc = Jsoup.parse(html); if (doc.select("table th").size() > 0) { if (doc.select("h1").size() > 0) { if (doc.select("h1").first().text().contains("Hinweis")) { return new ProlongResult(MultiStepResult.Status.ERROR, doc.select("table th").first().text()); } } try { Element form = doc.select("form[name=form1]").first(); String sessionid = form.select("input[name=sessionid]").attr("value"); String mednr = form.select("input[name=mednr]").attr("value"); httpGet(opac_url + "/cgi-bin/di.exe?mode=8&kndnr=" + account.getName() + "&mednr=" + mednr + "&sessionid=" + sessionid + "&psh100=Verl%C3%A4ngern", getDefaultEncoding()); return new ProlongResult(MultiStepResult.Status.OK); } catch (Throwable e) { e.printStackTrace(); return new ProlongResult(MultiStepResult.Status.ERROR); } } return new ProlongResult(MultiStepResult.Status.ERROR); } }
From source file:com.spd.ukraine.lucenewebsearch1.web.IndexingController.java
/** * Method used to perform recursive creation indexing for a given web page * in search database./*w w w. jav a2s . c o m*/ * * @param webPage webPage.url is entered url * webPage.title is set * @param html Jsoup.Document of entered url * @param recursionNumber used to stop recursion at exceeding * MAX_RECURSION_SEARCH_NUMBER */ private void indexElements(WebPage webPage, Document html, final int recursionNumber) throws IOException, ParseException { String title = html.title(); if (referencedTitles.contains(title.trim())) { return; } referencedTitles.add(title.trim()); webPage.setTitle(title); if (containsPage(webPage)) { System.out.println(webPage.getUrl() + " is already indexed"); return; } Element prevElement = null; Elements elements = html.body().getAllElements(); //.getElementsByTag("a"); addDoc(webPage, html.text()); // for (Element element : elements) { //// System.out.println(element.nodeName() + " element.text() " //// + element.text() + " url " //// + element.absUrl("href")); // if (element.nodeName().equalsIgnoreCase("body")) { // addDoc(webPage, element.text()); // break; //// continue; // } // if (null == prevElement) { // prevElement = element; //// } else if (prevElementContainsElementText(prevElement, element)) { //// continue; // } //// if (null !== webPagesService.findWebPage(element.absUrl("href"))) // if (element.text().trim().isEmpty()) { // continue; // } //// StringTokenizer str = new StringTokenizer(element.text()); //// str. 
// addDoc(webPage, element.text()); // } if (recursionNumber > MAX_RECURSION_SEARCH_NUMBER || referencedSites.size() > MAX_NUMBER_SITES_INDEXED) { // System.out.println(recursionNumber + " " // + referencedSites.contains(webPage.getUrl())); return; } elements.parallelStream() .filter((Element e) -> e.nodeName().equalsIgnoreCase("a") && null != e.absUrl(HREF) && !e.absUrl(HREF).trim().isEmpty() && !referencedSites.contains(e.absUrl(HREF)) && !referencedSites.contains(removeSharpEtc(e.absUrl(HREF)))) .forEach((Element element) -> { WebPage webPage1 = new WebPage(element.absUrl(HREF)); String url1 = webPage1.getUrl(); // System.out.println(recursionNumber + " recursion for '" // + url1 + "'"); try { Document htmlR = Jsoup.connect(url1).get(); indexElements(webPage1, htmlR, recursionNumber + 1); } catch (IOException | ParseException e) { System.out.println("Exception " + e.getMessage()); } referencedSites.add(url1); }); // for (Element element : elements) { // if (!element.nodeName().equalsIgnoreCase("a")) { // continue; // } // WebPage webPage1 = new WebPage(element.absUrl("href")); // if (null == webPage1.getUrl() // || webPage1.getUrl().isEmpty() // || referencedSites.contains(webPage1.getUrl())) { // continue; // } // System.out.println(recursionNumber + "recursion for " // + element.absUrl("href")); // try { // Document htmlR = Jsoup.connect(webPage1.getUrl()).get(); // webPage1.setTitle(htmlR.title()); // indexElements(webPage1, htmlR, recursionNumber + 1); // } catch (IOException e) { // System.out.println("IOException " + e.getMessage()); // } // referencedSites.add(webPage1.getUrl()); // } }
From source file:de.geeksfactory.opacclient.apis.IOpac.java
@Override public ProlongAllResult prolongAll(Account account, int useraction, String selection) throws IOException { Document doc = getAccountPage(account); // Check if the iOPAC verion supports this feature if (doc.select("button.verlallbutton").size() > 0) { List<NameValuePair> params = new ArrayList<>(); params.add(new BasicNameValuePair("mode", "42")); for (Element checkbox : doc.select("input.VerlAllCheckboxOK")) { params.add(new BasicNameValuePair("MedNrVerlAll", checkbox.val())); }//from ww w.j a v a2 s .c o m String html = httpGet(opac_url + "/cgi-bin/di.exe?" + URLEncodedUtils.format(params, "UTF-8"), getDefaultEncoding()); Document doc2 = Jsoup.parse(html); Pattern pattern = Pattern.compile("(\\d+ Medi(?:en|um) wurden? verl.ngert)\\s*(\\d+ " + "Medi(?:en|um) wurden? nicht verl.ngert)?"); Matcher matcher = pattern.matcher(doc2.text()); if (matcher.find()) { String text1 = matcher.group(1); String text2 = matcher.group(2); List<Map<String, String>> list = new ArrayList<>(); Map<String, String> map1 = new HashMap<>(); // TODO: We are abusing the ProlongAllResult.KEY_LINE_ ... keys here because we // do not get information about all the media map1.put(ProlongAllResult.KEY_LINE_TITLE, text1); list.add(map1); if (text2 != null && !text2.equals("")) { Map<String, String> map2 = new HashMap<>(); map2.put(ProlongAllResult.KEY_LINE_TITLE, text2); list.add(map2); } return new ProlongAllResult(MultiStepResult.Status.OK, list); } else { return new ProlongAllResult(MultiStepResult.Status.ERROR, doc2.text()); } } else { return new ProlongAllResult(MultiStepResult.Status.ERROR, stringProvider.getString(StringProvider.UNSUPPORTED_IN_LIBRARY)); } }