List of usage examples for org.jsoup.nodes.Document.select
public Elements select(String cssQuery)
From source file:com.wheelermarine.publicAccessSites.Updater.java
@Override protected Integer doInBackground(URL... urls) { try {/*from ww w. j a v a 2 s. c o m*/ final DatabaseHelper db = new DatabaseHelper(context); SQLiteDatabase database = db.getWritableDatabase(); if (database == null) throw new IllegalStateException("Unable to open database!"); database.beginTransaction(); try { // Clear out the old data. database.delete(DatabaseHelper.PublicAccessEntry.TABLE_NAME, null, null); // Connect to the web server and locate the FTP download link. Log.v(TAG, "Finding update: " + urls[0]); activity.runOnUiThread(new Runnable() { @Override public void run() { progress.setMessage("Locating update..."); progress.setIndeterminate(true); } }); Document doc = Jsoup.connect(urls[0].toString()).timeout(timeout * 1000).userAgent(userAgent).get(); URL dataURL = null; for (Element element : doc.select("a")) { if (element.hasAttr("href") && element.attr("href").endsWith(".zip")) { dataURL = new URL(element.attr("href")); } } // Make sure the download URL was fund. if (dataURL == null) throw new FileNotFoundException("Unable to locate data URL."); // Connect to the FTP server and download the update. Log.v(TAG, "Downloading update: " + dataURL); activity.runOnUiThread(new Runnable() { @Override public void run() { progress.setMessage("Downloading update..."); progress.setIndeterminate(true); } }); HttpClient client = new DefaultHttpClient(); HttpGet get = new HttpGet(dataURL.toString()); HttpResponse response = client.execute(get); HttpEntity entity = response.getEntity(); if (entity == null) throw new IOException("Error downloading update."); Map<Integer, Location> locations = null; // Download the ZIP archive. Log.v(TAG, "Downloading: " + dataURL.getFile()); InputStream in = entity.getContent(); if (in == null) throw new FileNotFoundException(dataURL.getFile() + " was not found!"); try { ZipInputStream zin = new ZipInputStream(in); try { // Locate the .dbf entry in the ZIP archive. 
ZipEntry entry; while ((entry = zin.getNextEntry()) != null) { if (entry.getName().endsWith(entryName)) { readDBaseFile(zin, database); } else if (entry.getName().endsWith(shapeEntryName)) { locations = readShapeFile(zin); } } } finally { try { zin.close(); } catch (Exception e) { // Ignore this error. } } } finally { in.close(); } if (locations != null) { final int recordCount = locations.size(); activity.runOnUiThread(new Runnable() { @Override public void run() { progress.setIndeterminate(false); progress.setMessage("Updating locations..."); progress.setMax(recordCount); } }); int progress = 0; for (int recordNumber : locations.keySet()) { PublicAccess access = db.getPublicAccessByRecordNumber(recordNumber); Location loc = locations.get(recordNumber); access.setLatitude(loc.getLatitude()); access.setLongitude(loc.getLongitude()); db.updatePublicAccess(access); publishProgress(++progress); } } database.setTransactionSuccessful(); return db.getPublicAccessesCount(); } finally { database.endTransaction(); } } catch (Exception e) { error = e; Log.e(TAG, "Error loading data: " + e.getLocalizedMessage(), e); return -1; } }
From source file:com.github.hronom.scrape.dat.website.controllers.ScrapeButtonController.java
public void processByUi4j() { // Disable fields in view. scrapeView.setWebsiteUrlTextFieldEnabled(false); scrapeView.setSelectorTextFieldEnabled(false); scrapeView.setScrapeButtonEnabled(false); scrapeView.setWorkInProgress(true);//ww w . j ava 2s. co m scrapeView.setOutput(""); scrapeView.setProgressBarTaskText("initializing"); logger.info("Start processing..."); long beginTime = System.currentTimeMillis(); // Output input parameters. if (!scrapeView.getWebsiteUrl().isEmpty() && !scrapeView.getSelector().isEmpty()) { logger.info("Input parameters: \"" + scrapeView.getWebsiteUrl() + "\", \"" + scrapeView.getSelector() + "\", \""); } // Navigate to blank page. scrapeView.setProgressBarTaskText("requesting page"); logger.info("Requesting page..."); Page page = browserEngine.navigate(scrapeView.getWebsiteUrl()); //page.show(); logger.info("Requesting of page completed."); scrapeView.setProgressBarTaskText("viewing page as HTML"); logger.info("View page as HTML"); String html = page.getDocument().getBody().getInnerHTML(); // Unescape html. 
scrapeView.setProgressBarTaskText("unescaping HTML"); logger.info("Unescape html"); html = StringEscapeUtils.unescapeHtml4(html); logger.info("Get selector"); String selector = scrapeView.getSelector(); if (!html.isEmpty() && !selector.isEmpty()) { scrapeView.setProgressBarTaskText("parsing HTML"); logger.info("Parse HTML"); Document doc = Jsoup.parse(html); scrapeView.setProgressBarTaskText("selecting elements in HTML"); logger.info("select elements in HTML"); Elements selectedElements = doc.select(selector); if (!selectedElements.isEmpty()) { scrapeView.setProgressBarTaskText("parsing selected elements"); logger.info("Parse extracted elements"); StringBuilder sb = new StringBuilder(); for (Element element : selectedElements) { String body = element.html(); sb.append(body); sb.append("\n"); sb.append("\n"); } scrapeView.setOutput(sb.toString()); } } browserEngine.clearCookies(); long endTime = System.currentTimeMillis(); logger.info("Process time: " + (endTime - beginTime) + " ms."); logger.info("Processing complete."); // Enable fields in view. scrapeView.setWorkInProgress(false); scrapeView.setScrapeButtonEnabled(true); scrapeView.setSelectorTextFieldEnabled(true); scrapeView.setWebsiteUrlTextFieldEnabled(true); }
From source file:org.apache.karaf.cave.server.storage.CaveRepositoryImpl.java
/** * Populate the Cave repository using the given URL. * * @param url the "source" HTTP URL.// ww w .j av a2 s. c om * @param filter regex filter. Only artifacts URL matching the filter will be considered. * @param update true if the OBR metadata should be updated, false else. * @throws Exception in case of populate failure. */ private void populateFromHttp(String url, String filter, boolean update) throws Exception { LOGGER.debug("Populating from HTTP URL {}", url); HttpClient httpClient = new DefaultHttpClient(); HttpGet httpGet = new HttpGet(url); HttpResponse response = httpClient.execute(httpGet); HttpEntity entity = response.getEntity(); if (entity != null) { if (entity.getContentType().getValue().equals("application/java-archive") || entity.getContentType().getValue().equals("application/octet-stream")) { // I have a jar/binary, potentially a resource try { if ((filter == null) || (url.matches(filter))) { ResourceImpl resource = (ResourceImpl) new DataModelHelperImpl() .createResource(new URL(url)); if (resource != null) { LOGGER.debug("Copy {} into the Cave repository storage", url); int index = url.lastIndexOf("/"); if (index > 0) { url = url.substring(index); } File destination = new File(new File(this.getLocation()), url); FileOutputStream outputStream = new FileOutputStream(destination); entity.writeTo(outputStream); outputStream.flush(); outputStream.close(); if (update) { resource = (ResourceImpl) new DataModelHelperImpl() .createResource(destination.toURI().toURL()); LOGGER.debug("Update OBR metadata with {}", resource.getId()); this.addResource(resource); } } } } catch (IllegalArgumentException e) { LOGGER.warn(e.getMessage()); } } else { // try to find link to "browse" Document document = Jsoup.connect(url).get(); Elements links = document.select("a"); if (links.size() > 1) { for (int i = 1; i < links.size(); i++) { Element link = links.get(i); String absoluteHref = link.attr("abs:href"); this.populateFromHttp(absoluteHref, filter, update); } } } 
} }
From source file:com.github.hronom.scrape.dat.website.controllers.ScrapeButtonController.java
public void processByJxBrowser() { // Disable fields in view. scrapeView.setWebsiteUrlTextFieldEnabled(false); scrapeView.setSelectorTextFieldEnabled(false); scrapeView.setScrapeButtonEnabled(false); scrapeView.setWorkInProgress(true);// w w w.j a v a2 s . c o m scrapeView.setOutput(""); scrapeView.setProgressBarTaskText("initializing"); logger.info("Start processing..."); long beginTime = System.currentTimeMillis(); // Output input parameters. if (!scrapeView.getWebsiteUrl().isEmpty() && !scrapeView.getSelector().isEmpty()) { logger.info("Input parameters: \"" + scrapeView.getWebsiteUrl() + "\", \"" + scrapeView.getSelector() + "\", \""); } // Navigate to blank page. scrapeView.setProgressBarTaskText("requesting page"); logger.info("Requesting page..."); browser.loadURL(scrapeView.getWebsiteUrl()); // Wait for loading. while (browser.isLoading()) { try { Thread.sleep(1000); } catch (InterruptedException e) { e.printStackTrace(); } } logger.info("Requesting of page completed."); scrapeView.setProgressBarTaskText("viewing page as HTML"); logger.info("View page as HTML"); String html = browser.getHTML(); // Unescape html. 
scrapeView.setProgressBarTaskText("unescaping HTML"); logger.info("Unescape html"); html = StringEscapeUtils.unescapeHtml4(html); logger.info("Get selector"); String selector = scrapeView.getSelector(); if (!html.isEmpty() && !selector.isEmpty()) { scrapeView.setProgressBarTaskText("parsing HTML"); logger.info("Parse HTML"); Document doc = Jsoup.parse(html); scrapeView.setProgressBarTaskText("selecting elements in HTML"); logger.info("select elements in HTML"); Elements selectedElements = doc.select(selector); if (!selectedElements.isEmpty()) { scrapeView.setProgressBarTaskText("parsing selected elements"); logger.info("Parse extracted elements"); StringBuilder sb = new StringBuilder(); for (Element element : selectedElements) { String body = element.html(); sb.append(body); sb.append("\n"); sb.append("\n"); } scrapeView.setOutput(sb.toString()); } } browser.stop(); long endTime = System.currentTimeMillis(); logger.info("Process time: " + (endTime - beginTime) + " ms."); logger.info("Processing complete."); // Enable fields in view. scrapeView.setWorkInProgress(false); scrapeView.setScrapeButtonEnabled(true); scrapeView.setSelectorTextFieldEnabled(true); scrapeView.setWebsiteUrlTextFieldEnabled(true); }
From source file:de.tudarmstadt.ukp.dkpro.c4corpus.boilerplate.impl.JusTextBoilerplateRemoval.java
/** * remove unwanted parts from a jsoup doc */// w w w.j ava 2 s . c o m private Document cleanDom(Document jsoupDoc) { String[] tagsToRemove = { "head", "script", ".hidden", "embedded" }; for (String tag : tagsToRemove) { Elements selectedTags = jsoupDoc.select(tag); for (Element element : selectedTags) { element.remove(); } } return jsoupDoc; }
From source file:mobi.jenkinsci.alm.assembla.client.AssemblaClient.java
public void login() throws IOException { Document pinDoc = Jsoup.parse(getData(String.format(AUTH, appId), false)); if (getLatestRedirectedUrl().getPath().startsWith(LOGIN)) { pinDoc = postLoginForm(pinDoc);//from w w w .j a va 2 s . c o m } final Element pinBox = pinDoc.select("div[class=box]").first(); if (pinBox == null) { throw new IOException("Missing PIN code from Assembla auth response"); } final Element pinLabel = pinBox.select("p").first(); final Element pinValue = pinBox.select("h1").first(); if (pinLabel == null || pinValue == null) { throw new IOException("Missing PIN code from Assembla auth response"); } final String pin = pinValue.childNode(0).toString(); final HttpPost authPost = new HttpPost( String.format(ASSEMBLA_SITE_APP_AUTH, appId, appSecret) + String.format(PIN_AUTH, pin)); final HttpResponse pinResponse = httpClient.execute(authPost); try { if (pinResponse.getStatusLine().getStatusCode() != HttpURLConnection.HTTP_OK) { throw new IOException( "Post " + authPost.getURI() + " for a PIN failed: " + pinResponse.getStatusLine()); } accessToken = gson.fromJson( new JsonReader(new InputStreamReader(pinResponse.getEntity().getContent(), "UTF-8")), AssemblaAccessToken.class); } finally { authPost.releaseConnection(); } }
From source file:org.bungeni.ext.integration.bungeniportal.BungeniServiceAccess.java
/**
 * Extracts the authorise-form fields from an HTML page.
 *
 * <p>If the page has an {@code input[name=client_id]} element, the {@code value}
 * attributes of the {@code client_id}, {@code state}, {@code time}, {@code nonce} and
 * {@code form.actions.authorize} inputs are collected, keyed by input name.
 *
 * @param sBody the HTML of the page containing the form
 * @return the collected fields; empty when the page has no {@code client_id} input
 * @throws UnsupportedEncodingException if a value cannot be wrapped in a {@code StringBody}
 */
public HashMap<String, ContentBody> getAuthorizeFormFieldValues(String sBody)
        throws UnsupportedEncodingException {
    final HashMap<String, ContentBody> fields = new HashMap<String, ContentBody>();
    final Document doc = Jsoup.parse(sBody);

    // Without a client_id input this is not the authorise form.
    if (doc.select("input[name=client_id]").isEmpty()) {
        return fields;
    }

    final String[] inputNames = { "client_id", "state", "time", "nonce", "form.actions.authorize" };
    for (String inputName : inputNames) {
        final String value = getItemAttributeValue(doc, "input[name=" + inputName + "]", "value");
        fields.put(inputName, new StringBody(value));
    }
    return fields;
}
From source file:org.confab.PhpBB3Parser.java
public List<Forum> parseForums(Document root, BulletinBoard parent) { Utilities.debug("parseForums"); List<Forum> ret = new ArrayList<Forum>(); // get table/* ww w .ja v a2 s . c o m*/ Elements forum_tables = root.select("ul[class=topiclist forums]"); assert !forum_tables.isEmpty() : root.html(); for (Element forum_table : forum_tables) { Elements els_li = forum_table.select("li.row"); assert !els_li.isEmpty(); for (Element el_li : els_li) { Forum new_forum = new Forum(parent); // Get the forum url Elements els_a = el_li.select("a.forumtitle"); Element el_a = els_a.first(); assert el_a != null; new_forum.url = el_a.attr("href"); assert new_forum.url != null; Utilities.debug("new_forum.url : " + new_forum.url); // Get the title text new_forum.title = el_a.text(); assert new_forum.title != null; Utilities.debug("new_forum.title : " + new_forum.title); // Check for any subforums in remaining a elements els_a.remove(els_a.first()); for (Element _el_a : els_a) { Forum sub_forum = new Forum(parent); sub_forum.url = el_a.attr("href"); assert sub_forum.url != null; sub_forum.title = el_a.text(); assert sub_forum.title != null; new_forum.subForums.add(sub_forum); Utilities.debug("added subForum: " + sub_forum.title); } // Get the description/message of this topic String el_description = el_a.parent().text(); if (el_description != null) { new_forum.description = el_description; } else { new_forum.description = ""; } Utilities.debug("new_forum.description : " + new_forum.description); Utilities.debug("new_forum.parent.url : " + new_forum.parent.url); ret.add(new_forum); Utilities.debug("-----"); } } Utilities.debug("end parseForums"); return ret; }
From source file:gov.medicaid.screening.dao.impl.NursingLicenseDAOBean.java
/** * Parses the nursing license details page. * * @param page the details page/* ww w . ja va 2 s . c o m*/ * @param licenseType if user has multiple licenses, this one will be used * @return the parsed license details * @throws ParsingException if the page does not contain the expected elements */ private License parseLicense(Document page, String licenseType) throws ParsingException { if (!page.select("span#lblFormTitle").text().equals("License Details")) { throw new ParsingException(ErrorCode.MITA50002.getDesc()); } License license = new License(); ProviderProfile profile = new ProviderProfile(); license.setProfile(profile); String fullName = page.select("#_ctl7_lblName").text(); User user = new User(); profile.setUser(user); String[] nameParts = fullName.split(" "); user.setLastName(nameParts[nameParts.length - 1]); if (nameParts.length > 1) { user.setFirstName(nameParts[0]); } // everything else goes to middle name (per site behavior) if (nameParts.length > 2) { StringBuffer sb = new StringBuffer(); for (int i = 1; i < nameParts.length - 1; i++) { if (sb.length() > 0) { sb.append(" "); } sb.append(nameParts[i]); } user.setMiddleName(sb.toString()); } String dateOfBirth = page.select("#_ctl7_lblDOB").text(); if (Util.isNotBlank(dateOfBirth)) { profile.setDob(parseDate(dateOfBirth, DATE_FORMAT)); } String gender = page.select("#_ctl7_lblGender").text(); if (Util.isNotBlank(gender)) { if ("Female".equals(gender)) { profile.setSex(Sex.FEMALE); } else { profile.setSex(Sex.MALE); } } Elements licenses = page.select("#_ctl7_dgLicense tr.Normal"); for (Element row : licenses) { String licenseNumber = row.select("td:eq(0)").text(); if (licenseType != null && !licenseNumber.startsWith(licenseType)) { // user has multiple licenses, the results will show this user twice (search by name) continue; } String[] licenseParts = licenseNumber.split(" "); LicenseType type = new LicenseType(); type.setName(TYPES.get(licenseParts[0]) == null ? 
licenseParts[0] : TYPES.get(licenseParts[0])); license.setType(type); license.setLicenseNumber(licenseParts[1]); String issueDate = row.select("td:eq(1)").text(); if (Util.isNotBlank(issueDate)) { license.setOriginalIssueDate(parseDate(issueDate, DATE_FORMAT)); } String expirationDate = row.select("td:eq(2)").text(); if (Util.isNotBlank(expirationDate)) { license.setExpireDate(parseDate(expirationDate, DATE_FORMAT)); } } return license; }
From source file:com.github.hronom.scrape.dat.website.controllers.ScrapeButtonController.java
public void processByHtmlUnit() { // Disable fields in view. scrapeView.setWebsiteUrlTextFieldEnabled(false); scrapeView.setSelectorTextFieldEnabled(false); scrapeView.setScrapeButtonEnabled(false); scrapeView.setWorkInProgress(true);/*w w w .j a v a 2 s . c om*/ scrapeView.setOutput(""); scrapeView.setProgressBarTaskText("initializing"); logger.info("Start processing..."); long beginTime = System.currentTimeMillis(); // Output input parameters. if (!scrapeView.getWebsiteUrl().isEmpty() && !scrapeView.getSelector().isEmpty()) { logger.info("Input parameters: \"" + scrapeView.getWebsiteUrl() + "\", \"" + scrapeView.getSelector() + "\", \""); } // Process. try { URL url = new URL(scrapeView.getWebsiteUrl()); scrapeView.setProgressBarTaskText("requesting page"); logger.info("Requesting page..."); HtmlPage page = webClient.getPage(url); logger.info("Requesting of page completed."); scrapeView.setProgressBarTaskText("viewing page as XML"); logger.info("View page as XML"); String xml = page.asXml(); // Unescape html. 
scrapeView.setProgressBarTaskText("unescaping HTML"); logger.info("Unescape html"); xml = StringEscapeUtils.unescapeHtml4(xml); logger.info("Get selector"); String selector = scrapeView.getSelector(); if (!xml.isEmpty() && !selector.isEmpty()) { scrapeView.setProgressBarTaskText("parsing HTML"); logger.info("Parse HTML"); Document doc = Jsoup.parse(xml); scrapeView.setProgressBarTaskText("selecting elements in HTML"); logger.info("select elements in HTML"); Elements selectedElements = doc.select(selector); if (!selectedElements.isEmpty()) { scrapeView.setProgressBarTaskText("parsing selected elements"); logger.info("Parse extracted elements"); StringBuilder sb = new StringBuilder(); for (Element element : selectedElements) { String body = element.html(); sb.append(body); sb.append("\n"); sb.append("\n"); } scrapeView.setOutput(sb.toString()); } } } catch (Exception e) { logger.error(e); } webClient.close(); long endTime = System.currentTimeMillis(); logger.info("Process time: " + (endTime - beginTime) + " ms."); logger.info("Processing complete."); // Enable fields in view. scrapeView.setWorkInProgress(false); scrapeView.setScrapeButtonEnabled(true); scrapeView.setSelectorTextFieldEnabled(true); scrapeView.setWebsiteUrlTextFieldEnabled(true); }