Example usage for org.jsoup.nodes Document select

List of usage examples for org.jsoup.nodes Document select

Introduction

On this page you can find example usages of org.jsoup.nodes.Document.select.

Prototype

public Elements select(String cssQuery) 

Source Link

Document

Find elements that match the Selector CSS query, with this element as the starting context.

Usage

From source file:com.wheelermarine.publicAccessSites.Updater.java

/**
 * Downloads the latest public access data set and reloads the local database from it.
 * <p>
 * The page at {@code urls[0]} is scanned for a link whose {@code href} ends in {@code .zip}.
 * That archive is downloaded, and its entries whose names end with {@code entryName} or
 * {@code shapeEntryName} are handed to {@code readDBaseFile} and {@code readShapeFile}
 * respectively. All database work runs inside one transaction that is only marked successful
 * after every step has completed.
 *
 * @param urls the page to scan for the download link; only the first element is used
 * @return the value of {@code db.getPublicAccessesCount()} after the update, or {@code -1} if
 *         any step failed (the exception is then stored in {@code error})
 */
@Override
protected Integer doInBackground(URL... urls) {

    try {
        final DatabaseHelper db = new DatabaseHelper(context);

        SQLiteDatabase database = db.getWritableDatabase();
        if (database == null)
            throw new IllegalStateException("Unable to open database!");

        database.beginTransaction();
        try {
            // Clear out the old data.
            database.delete(DatabaseHelper.PublicAccessEntry.TABLE_NAME, null, null);

            // Connect to the web server and locate the download link.
            Log.v(TAG, "Finding update: " + urls[0]);
            activity.runOnUiThread(new Runnable() {
                @Override
                public void run() {
                    progress.setMessage("Locating update...");
                    progress.setIndeterminate(true);
                }
            });
            Document doc = Jsoup.connect(urls[0].toString()).timeout(timeout * 1000).userAgent(userAgent).get();
            URL dataURL = null;
            for (Element element : doc.select("a")) {
                String href = element.attr("href");
                if (href.endsWith(".zip")) {
                    // Resolve against the page's base URI so that relative links work too;
                    // new URL("data.zip") would fail with "no protocol".
                    String absoluteHref = element.absUrl("href");
                    // If several links match, the last one on the page wins.
                    dataURL = new URL(absoluteHref.length() > 0 ? absoluteHref : href);
                }
            }

            // Make sure the download URL was found.
            if (dataURL == null)
                throw new FileNotFoundException("Unable to locate data URL.");

            // Connect to the server and download the update.
            Log.v(TAG, "Downloading update: " + dataURL);
            activity.runOnUiThread(new Runnable() {
                @Override
                public void run() {
                    progress.setMessage("Downloading update...");
                    progress.setIndeterminate(true);
                }
            });
            HttpClient client = new DefaultHttpClient();
            HttpGet get = new HttpGet(dataURL.toString());
            HttpResponse response = client.execute(get);
            HttpEntity entity = response.getEntity();
            if (entity == null)
                throw new IOException("Error downloading update.");

            Map<Integer, Location> locations = null;

            // Download the ZIP archive.
            Log.v(TAG, "Downloading: " + dataURL.getFile());
            InputStream in = entity.getContent();
            if (in == null)
                throw new FileNotFoundException(dataURL.getFile() + " was not found!");
            try {
                ZipInputStream zin = new ZipInputStream(in);
                try {
                    // Walk the archive and process the entries we care about.
                    ZipEntry entry;
                    while ((entry = zin.getNextEntry()) != null) {
                        if (entry.getName().endsWith(entryName)) {
                            readDBaseFile(zin, database);
                        } else if (entry.getName().endsWith(shapeEntryName)) {
                            locations = readShapeFile(zin);
                        }
                    }
                } finally {
                    try {
                        zin.close();
                    } catch (Exception e) {
                        // Best effort: the underlying stream is closed again below.
                    }
                }
            } finally {
                in.close();
            }

            if (locations != null) {
                final int recordCount = locations.size();
                activity.runOnUiThread(new Runnable() {
                    @Override
                    public void run() {
                        progress.setIndeterminate(false);
                        progress.setMessage("Updating locations...");
                        progress.setMax(recordCount);
                    }
                });

                // Named differently from the "progress" dialog field to avoid shadowing it.
                int updatedCount = 0;
                for (Map.Entry<Integer, Location> located : locations.entrySet()) {
                    PublicAccess access = db.getPublicAccessByRecordNumber(located.getKey());
                    Location loc = located.getValue();
                    access.setLatitude(loc.getLatitude());
                    access.setLongitude(loc.getLongitude());
                    db.updatePublicAccess(access);
                    publishProgress(++updatedCount);
                }
            }
            database.setTransactionSuccessful();
            return db.getPublicAccessesCount();
        } finally {
            database.endTransaction();
        }
    } catch (Exception e) {
        // Remember the failure so it can be inspected after the task has finished.
        error = e;
        Log.e(TAG, "Error loading data: " + e.getLocalizedMessage(), e);
        return -1;
    }
}

From source file:com.github.hronom.scrape.dat.website.controllers.ScrapeButtonController.java

/**
 * Loads the configured website with {@code browserEngine}, applies the configured CSS selector
 * to the page body and shows the inner HTML of every matching element in the view.
 * <p>
 * The input controls are disabled while the page is processed. They are re-enabled in a
 * {@code finally} block so that a failure while loading or parsing the page does not leave
 * the view locked; the exception itself still propagates to the caller.
 */
public void processByUi4j() {
    // Disable fields in view.
    scrapeView.setWebsiteUrlTextFieldEnabled(false);
    scrapeView.setSelectorTextFieldEnabled(false);
    scrapeView.setScrapeButtonEnabled(false);
    scrapeView.setWorkInProgress(true);

    try {
        scrapeView.setOutput("");

        scrapeView.setProgressBarTaskText("initializing");
        logger.info("Start processing...");
        long beginTime = System.currentTimeMillis();

        // Output input parameters.
        if (!scrapeView.getWebsiteUrl().isEmpty() && !scrapeView.getSelector().isEmpty()) {
            logger.info("Input parameters: \"" + scrapeView.getWebsiteUrl() + "\", \"" + scrapeView.getSelector()
                    + "\", \"");
        }

        // Navigate to the requested page.
        scrapeView.setProgressBarTaskText("requesting page");
        logger.info("Requesting page...");
        Page page = browserEngine.navigate(scrapeView.getWebsiteUrl());
        logger.info("Requesting of page completed.");

        scrapeView.setProgressBarTaskText("viewing page as HTML");
        logger.info("View page as HTML");
        String html = page.getDocument().getBody().getInnerHTML();

        // Unescape html.
        // NOTE(review): this runs before parsing, so escaped text such as "&lt;b&gt;" becomes
        // real markup for jsoup - confirm that this is intended.
        scrapeView.setProgressBarTaskText("unescaping HTML");
        logger.info("Unescape html");
        html = StringEscapeUtils.unescapeHtml4(html);

        logger.info("Get selector");
        String selector = scrapeView.getSelector();
        if (!html.isEmpty() && !selector.isEmpty()) {
            scrapeView.setProgressBarTaskText("parsing HTML");
            logger.info("Parse HTML");
            Document doc = Jsoup.parse(html);

            scrapeView.setProgressBarTaskText("selecting elements in HTML");
            logger.info("select elements in HTML");
            Elements selectedElements = doc.select(selector);

            if (!selectedElements.isEmpty()) {
                scrapeView.setProgressBarTaskText("parsing selected elements");
                logger.info("Parse extracted elements");
                // Each match is followed by a blank line in the output.
                StringBuilder sb = new StringBuilder();
                for (Element element : selectedElements) {
                    sb.append(element.html());
                    sb.append("\n");
                    sb.append("\n");
                }
                scrapeView.setOutput(sb.toString());
            }
        }

        browserEngine.clearCookies();

        long endTime = System.currentTimeMillis();
        logger.info("Process time: " + (endTime - beginTime) + " ms.");
        logger.info("Processing complete.");
    } finally {
        // Enable fields in view, even when processing failed.
        scrapeView.setWorkInProgress(false);
        scrapeView.setScrapeButtonEnabled(true);
        scrapeView.setSelectorTextFieldEnabled(true);
        scrapeView.setWebsiteUrlTextFieldEnabled(true);
    }
}

From source file:org.apache.karaf.cave.server.storage.CaveRepositoryImpl.java

/**
 * Populate the Cave repository using the given URL.
 * <p>
 * If the response is a jar/binary ({@code application/java-archive} or
 * {@code application/octet-stream}) it is copied into the repository location. Otherwise the
 * response is treated as a browsable page and every link on it except the first one is
 * followed recursively.
 *
 * @param url    the "source" HTTP URL.
 * @param filter regex filter. Only artifacts URL matching the filter will be considered.
 * @param update true if the OBR metadata should be updated, false else.
 * @throws Exception in case of populate failure.
 */
private void populateFromHttp(String url, String filter, boolean update) throws Exception {
    LOGGER.debug("Populating from HTTP URL {}", url);
    HttpClient httpClient = new DefaultHttpClient();

    HttpGet httpGet = new HttpGet(url);
    HttpResponse response = httpClient.execute(httpGet);
    HttpEntity entity = response.getEntity();

    if (entity != null) {
        // The content type header is optional; a missing one is handled like a non-binary
        // response instead of failing with a NullPointerException.
        String contentType = entity.getContentType() == null ? null : entity.getContentType().getValue();
        if ("application/java-archive".equals(contentType)
                || "application/octet-stream".equals(contentType)) {
            // I have a jar/binary, potentially a resource
            try {
                if ((filter == null) || (url.matches(filter))) {
                    ResourceImpl resource = (ResourceImpl) new DataModelHelperImpl()
                            .createResource(new URL(url));
                    if (resource != null) {
                        LOGGER.debug("Copy {} into the Cave repository storage", url);
                        // Keep only the last path segment (including its leading slash) as
                        // the file name inside the repository location.
                        String fileName = url;
                        int index = url.lastIndexOf("/");
                        if (index > 0) {
                            fileName = url.substring(index);
                        }
                        File destination = new File(new File(this.getLocation()), fileName);
                        FileOutputStream outputStream = new FileOutputStream(destination);
                        try {
                            entity.writeTo(outputStream);
                            outputStream.flush();
                        } finally {
                            // Release the file handle even if the copy fails half-way.
                            outputStream.close();
                        }
                        if (update) {
                            resource = (ResourceImpl) new DataModelHelperImpl()
                                    .createResource(destination.toURI().toURL());
                            LOGGER.debug("Update OBR metadata with {}", resource.getId());
                            this.addResource(resource);
                        }
                    }
                }
            } catch (IllegalArgumentException e) {
                LOGGER.warn(e.getMessage());
            }
        } else {
            // try to find link to "browse"
            Document document = Jsoup.connect(url).get();

            Elements links = document.select("a");
            // The first link is skipped.
            // NOTE(review): presumably it is the parent-directory link of an index page - confirm.
            for (int i = 1; i < links.size(); i++) {
                Element link = links.get(i);
                String absoluteHref = link.attr("abs:href");
                this.populateFromHttp(absoluteHref, filter, update);
            }
        }
    }
}

From source file:com.github.hronom.scrape.dat.website.controllers.ScrapeButtonController.java

/**
 * Loads the configured website with {@code browser}, applies the configured CSS selector to
 * the resulting HTML and shows the inner HTML of every matching element in the view.
 * <p>
 * The input controls are disabled while the page is processed and re-enabled in a
 * {@code finally} block, so a failure does not leave the view locked. If the calling thread
 * is interrupted while waiting for the page to load, the interrupt flag is restored, the
 * browser is stopped and the method returns without producing output.
 */
public void processByJxBrowser() {
    // Disable fields in view.
    scrapeView.setWebsiteUrlTextFieldEnabled(false);
    scrapeView.setSelectorTextFieldEnabled(false);
    scrapeView.setScrapeButtonEnabled(false);
    scrapeView.setWorkInProgress(true);

    try {
        scrapeView.setOutput("");

        scrapeView.setProgressBarTaskText("initializing");
        logger.info("Start processing...");
        long beginTime = System.currentTimeMillis();

        // Output input parameters.
        if (!scrapeView.getWebsiteUrl().isEmpty() && !scrapeView.getSelector().isEmpty()) {
            logger.info("Input parameters: \"" + scrapeView.getWebsiteUrl() + "\", \"" + scrapeView.getSelector()
                    + "\", \"");
        }

        // Navigate to the requested page.
        scrapeView.setProgressBarTaskText("requesting page");
        logger.info("Requesting page...");
        browser.loadURL(scrapeView.getWebsiteUrl());
        // Wait for loading, polling once per second.
        while (browser.isLoading()) {
            try {
                Thread.sleep(1000);
            } catch (InterruptedException e) {
                // Swallowing the interrupt would make every following sleep() fail at once
                // or hide the cancellation from callers; restore the flag and give up.
                Thread.currentThread().interrupt();
                logger.info("Interrupted while waiting for the page to load.");
                browser.stop();
                return;
            }
        }
        logger.info("Requesting of page completed.");

        scrapeView.setProgressBarTaskText("viewing page as HTML");
        logger.info("View page as HTML");
        String html = browser.getHTML();

        // Unescape html.
        // NOTE(review): this runs before parsing, so escaped text such as "&lt;b&gt;" becomes
        // real markup for jsoup - confirm that this is intended.
        scrapeView.setProgressBarTaskText("unescaping HTML");
        logger.info("Unescape html");
        html = StringEscapeUtils.unescapeHtml4(html);

        logger.info("Get selector");
        String selector = scrapeView.getSelector();
        if (!html.isEmpty() && !selector.isEmpty()) {
            scrapeView.setProgressBarTaskText("parsing HTML");
            logger.info("Parse HTML");
            Document doc = Jsoup.parse(html);

            scrapeView.setProgressBarTaskText("selecting elements in HTML");
            logger.info("select elements in HTML");
            Elements selectedElements = doc.select(selector);

            if (!selectedElements.isEmpty()) {
                scrapeView.setProgressBarTaskText("parsing selected elements");
                logger.info("Parse extracted elements");
                // Each match is followed by a blank line in the output.
                StringBuilder sb = new StringBuilder();
                for (Element element : selectedElements) {
                    sb.append(element.html());
                    sb.append("\n");
                    sb.append("\n");
                }
                scrapeView.setOutput(sb.toString());
            }
        }

        browser.stop();

        long endTime = System.currentTimeMillis();
        logger.info("Process time: " + (endTime - beginTime) + " ms.");
        logger.info("Processing complete.");
    } finally {
        // Enable fields in view, even when processing failed or was interrupted.
        scrapeView.setWorkInProgress(false);
        scrapeView.setScrapeButtonEnabled(true);
        scrapeView.setSelectorTextFieldEnabled(true);
        scrapeView.setWebsiteUrlTextFieldEnabled(true);
    }
}

From source file:de.tudarmstadt.ukp.dkpro.c4corpus.boilerplate.impl.JusTextBoilerplateRemoval.java

/**
 * Strips unwanted parts from a jsoup document.
 * <p>
 * Every element matched by one of the selectors {@code head}, {@code script},
 * {@code .hidden} and {@code embedded} is detached from the document. The document is
 * modified in place.
 *
 * @param jsoupDoc the document to clean
 * @return the same {@code jsoupDoc} instance, after the removal
 */
private Document cleanDom(Document jsoupDoc) {
    final String[] unwantedSelectors = { "head", "script", ".hidden", "embedded" };

    for (int i = 0; i < unwantedSelectors.length; i++) {
        // Elements.remove() detaches each matched element from the DOM.
        jsoupDoc.select(unwantedSelectors[i]).remove();
    }

    return jsoupDoc;
}

From source file:mobi.jenkinsci.alm.assembla.client.AssemblaClient.java

/**
 * Obtains an access token through the PIN-based authorization pages.
 * <p>
 * The authorization page is fetched first; if that request ended up on the login path, the
 * login form is posted. The PIN is then read from the {@code h1} inside the
 * {@code div[class=box]} element and posted to the token endpoint, whose JSON response is
 * stored in {@code accessToken}.
 *
 * @throws IOException if the PIN cannot be found in the page or the token request does not
 *                     answer with HTTP 200
 */
public void login() throws IOException {
    Document authDoc = Jsoup.parse(getData(String.format(AUTH, appId), false));
    final boolean redirectedToLogin = getLatestRedirectedUrl().getPath().startsWith(LOGIN);
    if (redirectedToLogin) {
        authDoc = postLoginForm(authDoc);
    }

    // The PIN lives in a box that holds a label paragraph and the value as a heading.
    final Element box = authDoc.select("div[class=box]").first();
    if (box == null) {
        throw new IOException("Missing PIN code from Assembla auth response");
    }
    final Element label = box.select("p").first();
    final Element value = box.select("h1").first();
    if (!(label != null && value != null)) {
        throw new IOException("Missing PIN code from Assembla auth response");
    }

    final String pin = value.childNode(0).toString();
    final String tokenUrl = String.format(ASSEMBLA_SITE_APP_AUTH, appId, appSecret)
            + String.format(PIN_AUTH, pin);
    final HttpPost authPost = new HttpPost(tokenUrl);
    final HttpResponse pinResponse = httpClient.execute(authPost);
    try {
        final int status = pinResponse.getStatusLine().getStatusCode();
        if (status != HttpURLConnection.HTTP_OK) {
            throw new IOException(
                    "Post " + authPost.getURI() + " for a PIN failed: " + pinResponse.getStatusLine());
        }
        final JsonReader tokenReader = new JsonReader(
                new InputStreamReader(pinResponse.getEntity().getContent(), "UTF-8"));
        accessToken = gson.fromJson(tokenReader, AssemblaAccessToken.class);
    } finally {
        // Always hand the connection back, whether or not the token could be read.
        authPost.releaseConnection();
    }
}

From source file:org.bungeni.ext.integration.bungeniportal.BungeniServiceAccess.java

/**
 * Extracts the fields of the authorize form from an HTML page.
 * <p>
 * The page is only inspected further when it contains an {@code input[name=client_id]}
 * element. In that case the {@code value} attributes of the {@code client_id}, {@code state},
 * {@code time}, {@code nonce} and {@code form.actions.authorize} inputs are collected.
 *
 * @param sBody the HTML of the page
 * @return the form fields keyed by input name; empty if the page has no {@code client_id} input
 * @throws UnsupportedEncodingException if a {@code StringBody} cannot be created
 */
public HashMap<String, ContentBody> getAuthorizeFormFieldValues(String sBody)
        throws UnsupportedEncodingException {
    final HashMap<String, ContentBody> formFields = new HashMap<String, ContentBody>();
    final Document parsedBody = Jsoup.parse(sBody);

    if (parsedBody.select("input[name=client_id]").isEmpty()) {
        return formFields;
    }

    // Field name -> selector of the input element that carries its value.
    final String[][] fieldSelectors = {
            { "client_id", "input[name=client_id]" },
            { "state", "input[name=state]" },
            { "time", "input[name=time]" },
            { "nonce", "input[name=nonce]" },
            { "form.actions.authorize", "input[name=form.actions.authorize]" } };
    for (String[] field : fieldSelectors) {
        formFields.put(field[0], new StringBody(getItemAttributeValue(parsedBody, field[1], "value")));
    }
    return formFields;
}

From source file:org.confab.PhpBB3Parser.java

/**
 * Parses the forums listed on a board index page.
 * <p>
 * Every {@code li.row} inside a {@code ul[class=topiclist forums]} list yields one
 * {@link Forum}. The first {@code a.forumtitle} link of the row provides its URL and title;
 * any further {@code a.forumtitle} links of the same row are added as sub-forums.
 *
 * @param root   the parsed index page
 * @param parent the bulletin board that the created forums belong to
 * @return the forums found, in document order
 */
public List<Forum> parseForums(Document root, BulletinBoard parent) {
    Utilities.debug("parseForums");

    List<Forum> ret = new ArrayList<Forum>();

    // get the forum lists
    Elements forum_tables = root.select("ul[class=topiclist forums]");
    assert !forum_tables.isEmpty() : root.html();

    for (Element forum_table : forum_tables) {
        Elements els_li = forum_table.select("li.row");
        assert !els_li.isEmpty();
        for (Element el_li : els_li) {
            Forum new_forum = new Forum(parent);

            // Get the forum url
            Elements els_a = el_li.select("a.forumtitle");
            Element el_a = els_a.first();
            assert el_a != null;
            new_forum.url = el_a.attr("href");
            assert new_forum.url != null;
            Utilities.debug("new_forum.url : " + new_forum.url);

            // Get the title text
            new_forum.title = el_a.text();
            assert new_forum.title != null;
            Utilities.debug("new_forum.title : " + new_forum.title);

            // Check for any subforums in the remaining a elements. The list is walked from
            // index 1 instead of removing the first entry, so the selection is not mutated.
            for (int i = 1; i < els_a.size(); i++) {
                Element sub_a = els_a.get(i);
                Forum sub_forum = new Forum(parent);
                // Read from the sub-forum's own link, not from the main forum link.
                sub_forum.url = sub_a.attr("href");
                assert sub_forum.url != null;
                sub_forum.title = sub_a.text();
                assert sub_forum.title != null;
                new_forum.subForums.add(sub_forum);
                Utilities.debug("added subForum: " + sub_forum.title);
            }

            // Get the description/message of this forum: the text of the title link's parent
            String el_description = el_a.parent().text();
            if (el_description != null) {
                new_forum.description = el_description;
            } else {
                new_forum.description = "";
            }
            Utilities.debug("new_forum.description : " + new_forum.description);

            Utilities.debug("new_forum.parent.url : " + new_forum.parent.url);

            ret.add(new_forum);
            Utilities.debug("-----");
        }
    }
    Utilities.debug("end parseForums");
    return ret;
}

From source file:gov.medicaid.screening.dao.impl.NursingLicenseDAOBean.java

/**
 * Parses the nursing license details page.
 *
 * @param page the details page/* ww  w  .  ja  va 2 s . c  o  m*/
 * @param licenseType if user has multiple licenses, this one will be used
 * @return the parsed license details
 * @throws ParsingException if the page does not contain the expected elements
 */
private License parseLicense(Document page, String licenseType) throws ParsingException {
    if (!page.select("span#lblFormTitle").text().equals("License Details")) {
        throw new ParsingException(ErrorCode.MITA50002.getDesc());
    }

    License license = new License();
    ProviderProfile profile = new ProviderProfile();
    license.setProfile(profile);

    String fullName = page.select("#_ctl7_lblName").text();

    User user = new User();
    profile.setUser(user);
    String[] nameParts = fullName.split(" ");
    user.setLastName(nameParts[nameParts.length - 1]);
    if (nameParts.length > 1) {
        user.setFirstName(nameParts[0]);
    }
    // everything else goes to middle name (per site behavior)
    if (nameParts.length > 2) {
        StringBuffer sb = new StringBuffer();
        for (int i = 1; i < nameParts.length - 1; i++) {
            if (sb.length() > 0) {
                sb.append(" ");
            }
            sb.append(nameParts[i]);
        }
        user.setMiddleName(sb.toString());
    }

    String dateOfBirth = page.select("#_ctl7_lblDOB").text();
    if (Util.isNotBlank(dateOfBirth)) {
        profile.setDob(parseDate(dateOfBirth, DATE_FORMAT));
    }

    String gender = page.select("#_ctl7_lblGender").text();
    if (Util.isNotBlank(gender)) {
        if ("Female".equals(gender)) {
            profile.setSex(Sex.FEMALE);
        } else {
            profile.setSex(Sex.MALE);
        }
    }

    Elements licenses = page.select("#_ctl7_dgLicense tr.Normal");
    for (Element row : licenses) {
        String licenseNumber = row.select("td:eq(0)").text();
        if (licenseType != null && !licenseNumber.startsWith(licenseType)) {
            // user has multiple licenses, the results will show this user twice (search by name)
            continue;
        }

        String[] licenseParts = licenseNumber.split(" ");
        LicenseType type = new LicenseType();
        type.setName(TYPES.get(licenseParts[0]) == null ? licenseParts[0] : TYPES.get(licenseParts[0]));
        license.setType(type);
        license.setLicenseNumber(licenseParts[1]);

        String issueDate = row.select("td:eq(1)").text();
        if (Util.isNotBlank(issueDate)) {
            license.setOriginalIssueDate(parseDate(issueDate, DATE_FORMAT));
        }

        String expirationDate = row.select("td:eq(2)").text();
        if (Util.isNotBlank(expirationDate)) {
            license.setExpireDate(parseDate(expirationDate, DATE_FORMAT));
        }
    }
    return license;
}

From source file:com.github.hronom.scrape.dat.website.controllers.ScrapeButtonController.java

/**
 * Loads the configured website with {@code webClient}, applies the configured CSS selector to
 * the page's XML representation and shows the inner HTML of every matching element in the view.
 * <p>
 * The input controls are disabled while the page is processed. Any exception raised while
 * loading or parsing is logged, after which the web client is closed and the controls are
 * enabled again.
 */
public void processByHtmlUnit() {
    // Lock the view while the scrape is running.
    scrapeView.setWebsiteUrlTextFieldEnabled(false);
    scrapeView.setSelectorTextFieldEnabled(false);
    scrapeView.setScrapeButtonEnabled(false);
    scrapeView.setWorkInProgress(true);
    scrapeView.setOutput("");

    scrapeView.setProgressBarTaskText("initializing");
    logger.info("Start processing...");
    final long startedAt = System.currentTimeMillis();

    // Log the input parameters, but only when both of them are present.
    if (!(scrapeView.getWebsiteUrl().isEmpty() || scrapeView.getSelector().isEmpty())) {
        logger.info("Input parameters: \"" + scrapeView.getWebsiteUrl() + "\", \"" + scrapeView.getSelector()
                + "\", \"");
    }

    try {
        // Fetch the page.
        final URL target = new URL(scrapeView.getWebsiteUrl());
        scrapeView.setProgressBarTaskText("requesting page");
        logger.info("Requesting page...");
        final HtmlPage loadedPage = webClient.getPage(target);
        logger.info("Requesting of page completed.");

        scrapeView.setProgressBarTaskText("viewing page as XML");
        logger.info("View page as XML");
        final String rawXml = loadedPage.asXml();

        // Replace HTML entities by the characters they stand for.
        scrapeView.setProgressBarTaskText("unescaping HTML");
        logger.info("Unescape html");
        final String markup = StringEscapeUtils.unescapeHtml4(rawXml);

        logger.info("Get selector");
        final String cssQuery = scrapeView.getSelector();
        if (!(markup.isEmpty() || cssQuery.isEmpty())) {
            scrapeView.setProgressBarTaskText("parsing HTML");
            logger.info("Parse HTML");
            final Document parsed = Jsoup.parse(markup);

            scrapeView.setProgressBarTaskText("selecting elements in HTML");
            logger.info("select elements in HTML");
            final Elements matches = parsed.select(cssQuery);

            if (!matches.isEmpty()) {
                scrapeView.setProgressBarTaskText("parsing selected elements");
                logger.info("Parse extracted elements");
                // Each match is followed by a blank line in the output.
                final StringBuilder output = new StringBuilder();
                for (Element match : matches) {
                    output.append(match.html()).append("\n").append("\n");
                }
                scrapeView.setOutput(output.toString());
            }
        }
    } catch (Exception e) {
        logger.error(e);
    }

    webClient.close();

    final long elapsedMillis = System.currentTimeMillis() - startedAt;
    logger.info("Process time: " + elapsedMillis + " ms.");
    logger.info("Processing complete.");

    // Unlock the view again.
    scrapeView.setWorkInProgress(false);
    scrapeView.setScrapeButtonEnabled(true);
    scrapeView.setSelectorTextFieldEnabled(true);
    scrapeView.setWebsiteUrlTextFieldEnabled(true);
}