Example usage for org.jsoup.nodes Document getElementsByTag

Introduction

On this page you can find example usages of org.jsoup.nodes.Document.getElementsByTag.

Prototype

public Elements getElementsByTag(String tagName)

Source Link

Document

Finds elements, including and recursively under this element, with the specified tag name.

Usage

From source file:jp.mau.twappremover.MainActivity.java

/**
 * Fetches the top page, remembers the session and guest cookies returned with it, and
 * extracts the authenticity token from the first form that contains one.
 *
 * <p>If the page cannot be fetched, the stack trace is printed and the method returns
 * without touching any field.
 */
private void getTopPage() {
    final Document topPage;
    try {
        Connection connection = Jsoup.connect(TOP_PAGE);
        connection.header("User-Agent", USER_AGENT);
        connection.header("Connection", "keep-alive");
        topPage = connection.get();

        // Pick the session and guest identifiers out of the response cookies.
        Map<String, String> responseCookies = connection.response().cookies();
        for (Map.Entry<String, String> cookie : responseCookies.entrySet()) {
            String cookieName = cookie.getKey();
            if (cookieName.equals(KEY_SESSION_ID)) {
                _session_id = cookie.getValue();
            }
            if (cookieName.equals(KEY_GUEST_ID)) {
                _guest_id = cookie.getValue();
            }
        }
    } catch (IOException ioe) {
        ioe.printStackTrace();
        return;
    }
    if (topPage == null) {
        return;
    }

    // Scan every form on the page; the first one holding an "authenticity_token"
    // field supplies the token.
    for (Element form : topPage.getElementsByTag("form")) {
        Elements tokenFields = form.getElementsByAttributeValue("name", "authenticity_token");
        if (!tokenFields.isEmpty()) {
            _auth_token = tokenFields.first().attr("value");
            break;
        }
    }
}

From source file:gov.medicaid.screening.dao.impl.DieteticsAndNutritionPracticeLicenseDAOBean.java

/**
 * Performs a search for all possible results.
 *
 * <p>The landing page is loaded first to obtain its {@code __VIEWSTATE} value, the search form is
 * then posted, and result pages are read one after another for as long as a page contains a
 * link whose text is {@code "Next >>"}. The HTTP client created for the search is shut down
 * before the method returns, whether it completes normally or with an exception.
 *
 * @param identifier The value to be searched.
 * @return the search result for licenses
 * @throws URISyntaxException When an error occurs while building the URL.
 * @throws ClientProtocolException When client does not support protocol used.
 * @throws IOException When an error occurs while parsing response.
 * @throws ParseException When an error occurs while parsing response.
 * @throws PersistenceException for database related errors
 * @throws ServiceException for any other errors
 */
private SearchResult<License> getAllResults(String identifier) throws URISyntaxException,
        ClientProtocolException, IOException, ParseException, PersistenceException, ServiceException {
    DefaultHttpClient client = new DefaultHttpClient();
    try {
        URIBuilder builder = new URIBuilder(getSearchURL());
        String hostId = builder.build().toString();

        // The landing page supplies the __VIEWSTATE value that the search post must echo back.
        HttpGet httpget = new HttpGet(builder.build());
        HttpResponse landing = client.execute(httpget);
        Document document = Jsoup.parse(EntityUtils.toString(landing.getEntity()));

        HttpPost httppost = new HttpPost(builder.build());
        HttpEntity entity = postForm(hostId, client, httppost,
                new String[][] { { "_ctl0:_ctl1:_ctl0:txtCriteria", identifier },
                        { "_ctl0:_ctl1:_ctl0:btnSubmit", "Search" }, { "__EVENTTARGET", "" },
                        { "__EVENTARGUMENT", "" },
                        { "__VIEWSTATE", document.select("#Form input[name=__VIEWSTATE]").first().val() } },
                true);

        // licenses list
        List<License> licenseList = new ArrayList<License>();
        while (entity != null) {
            document = Jsoup.parse(EntityUtils.toString(entity));

            // select() yields an empty collection when nothing matches, so no null check is needed
            for (Element row : document.select(GRID_ROW_SELECTOR)) {
                licenseList.add(parseLicense(row.children()));
            }

            // done, check if there are additional results
            entity = null;
            for (Element anchor : document.getElementsByTag("a")) {
                if (anchor.text().equals("Next >>")) {
                    // NOTE(review): the pager control id below is hard-coded; confirm it is
                    // valid for every result page.
                    entity = postForm(hostId, client, httppost,
                            new String[][] { { "_ctl0:_ctl1:_ctl0:txtCriteria", identifier },
                                    { "__EVENTTARGET", "_ctl0:_ctl1:_ctl0:dgrdLicensee:_ctl29:_ctl1" },
                                    { "__EVENTARGUMENT", "" },
                                    { "__VIEWSTATE",
                                            document.select("#Form input[name=__VIEWSTATE]").first().val() } },
                            true);
                    break;
                }
            }
        }

        SearchResult<License> result = new SearchResult<License>();
        result.setItems(licenseList);
        return result;
    } finally {
        // Release the connections held by this per-call client.
        client.getConnectionManager().shutdown();
    }
}

From source file:mx.itdurango.rober.siitdocentes.ActivityAlumnos.java

/**
 * Permite descomponer el cdigo html que se enva con una estructura especifica para llenar los datos de la vista
 *
 * @param html cdigo html que se recibi de una peticin HttpGet, debe tener una estructura similar a la siguiente para que el proceso funcione correctamente
 *             <p/>/*ww  w .ja v a 2 s.  c om*/
 *             <input name="periodo" type="hidden" value="20141" />
 *             <input name="materia" type="hidden" value="SD2424" />
 *             <input name="grupo" type="hidden" value="5VR" />
 *             <input name="docente" type="hidden" value="LOQR841213822" />
 *             <input name="fecha_captura" type="hidden" value="2014/06/12" />
 *             <table>
 *             <tr>
 *             <td>No</td>
 *             <td>Noctrl</td>
 *             <td>Nombre</td>
 *             <td>Unidad 1</td>
 *             <td>Unidad 1</td>
 *             <td>Unidad 3</td>
 *             <td>...</td>
 *             <td>Unidad N</td>
 *             </tr>
 *             <tr>
 *             <td>1</td>
 *             <td>9999999</td>
 *             <td>XXXXXXXXXXXXXXXXXXXXX</td>
 *             <td><input type="text" name="calif[1][1]" value="999"/></td>
 *             <td><input type="text" name="calif[1][2]" value="999"/></td>
 *             <td><input type="text" name="calif[1][3]" value="999"/></td>
 *             <td>...</td>
 *             <td><input type="text" name="calif[1][N]" value="999"/></td>
 *             </tr>
 *             <tr>
 *             <td>2</td>
 *             <td>888888888</td>
 *             <td>YYYYYYYYYYYYYYYYYYYYY</td>
 *             <td><input type="text" name="calif[2][1]" value="999"/></td>
 *             <td><input type="text" name="calif[2][2]" value="999"/></td>
 *             <td><input type="text" name="calif[2][3]" value="999"/></td>
 *             <td>...</td>
 *             <td><input type="text" name="calif[2][N]" value="999"/></td>
 *             </tr>
 *             <tr>
 *             <td>M</td>
 *             <td>000000000</td>
 *             <td>ZZZZZZZZZZZZZZZZZZZZZZ</td>
 *             <td><input type="text" name="calif[M][1]" value="999"/></td>
 *             <td><input type="text" name="calif[M][2]" value="999"/></td>
 *             <td><input type="text" name="calif[M][3]" value="999"/></td>
 *             <td>...</td>
 *             <td><input type="text" name="calif[M][N]" value="999"/></td>
 *             </tr>
 *             </table>
 */
void llenaAlumnos(String html) {
    //Generar un archivo de documento para almacenar los datos del html de forma que se pueda
    //manipular facilmente usando la librera Jsoup
    Document doc = Jsoup.parse(html);

    try {
        //extraer los valores de los elementos del formulario y almacenarlos en los atributos correspondientes de la clase
        Elements e = doc.getElementsByAttributeValue("name", "periodo");
        periodo = e.get(0).attr("value");
        e = doc.getElementsByAttributeValue("name", "materia");
        materia = e.get(0).attr("value");
        e = doc.getElementsByAttributeValue("name", "grupo");
        grupo = e.get(0).attr("value");
        e = doc.getElementsByAttributeValue("name", "docente");
        docente = e.get(0).attr("value");
        e = doc.getElementsByAttributeValue("name", "fecha_captura");
        fecha_captura = e.get(0).attr("value");

        //extraer la tabla correspondiente al listado de alumnos en el caso del siit.itdurango.edu.mx,
        // corresponde a la tabla numero 2 y ya que la numeracin comienza en 0, la tabla que necesitamos est en el indice 1
        Element tabla = doc.getElementsByTag("table").get(1);
        //Extraer todos los elementos de tipo tr que pertenecen a la tabla y almacenarlos en una coleccion de tipo Elements.
        Elements renglones = tabla.getElementsByTag("tr");
        //Recorrer la coleccin de renglones y almacenar cada uno en un objeto
        for (Element tr : renglones) {
            //para cada objeto tr, extraer sus elementos td y almacenarlos en una coleccion
            Elements tds = tr.getElementsByTag("td");
            //permite llevar el control de la columna que se est leyendo, ya que las columnas no tienen un id o clase, se realiza el proceso a mano.
            int col = 1;
            //contenedor de tipo AlumosParciales para almacenar la informacin de cada alumno (tr)
            AlumnosParciales c = new AlumnosParciales();
            for (Element td : tds) {
                if (col == 1) {// la columna 1 corresponde al nmero consecutivo de la tabla
                    c.setNum(td.html());
                } else if (col == 2) {// la columna 2 corresponde al nmero de control del alumno
                    c.setControl(td.html());
                } else if (col == 3) {// la columna 3 corresponde al nombre del alumno
                    c.setNombre(Estaticos.sanitize(td.html()));
                } else { //el resto de las columnas pertenecen a las calificaciones parciales
                    //se extrae el elemento <input> de la columna y se obtiene el atributo valor para recuperar la calificacin en caso de que ya hubiera sido asignada
                    String cal = td.getElementsByTag("input").get(0).attr("value");

                    ArrayList<String> calif = c.getCalificaciones();
                    calif.add(cal);
                    //se agrega la nueva calificacin al conjunto de calificaciones del alumno
                    c.setCalificaciones(calif);
                }
                col++; //incrementa el numero de columa
            }
            if (c.getCalificaciones().size() > 0) { //para evitar agregar al listado de alumnos el encabezado de la tabla, validamos que existan calificaciones.
                gcs.add(c);
            }
        }

        //Llenamos el spinner de unidades a partir del numero de calificaciones que existen en el arreglo
        List<String> spinnerArray = new ArrayList<String>();
        for (int i = 1; i <= gcs.get(1).getCalificaciones().size() - 1; i++) {
            spinnerArray.add("Unidad " + i);
        }
        ArrayAdapter<String> adapter = new ArrayAdapter<String>(this, android.R.layout.simple_spinner_item,
                spinnerArray);
        adapter.setDropDownViewResource(android.R.layout.simple_spinner_dropdown_item);
        spn_unidad.setAdapter(adapter);

        //llenamos el listado de alumnos con la informacin que se obtuvo del proceso anterior
        alumnosParcialesAdapter = new AlumnosParcialesAdapter(this, gcs, unidad);
        lvAlumnos.setAdapter(alumnosParcialesAdapter);

    } catch (Exception e) {
        e.printStackTrace();
        Toast.makeText(this, getString(R.string.error_parser), Toast.LENGTH_SHORT).show();
        finish(); //finaliza el intent actual para desplegar el anterior
    }
}

From source file:gov.medicaid.screening.dao.impl.OptometryLicenseDAOBean.java

/**
 * Performs a search for all possible results.
 *
 * <p>The landing page is loaded first to obtain its {@code __VIEWSTATE} value, the search form is
 * then posted, and result pages are read one after another for as long as a page contains a
 * link whose text is {@code "Next >>"}. Only data-grid rows that are not styled as the header
 * and have exactly eight child cells are parsed as licenses. The HTTP client created for the
 * search is shut down before the method returns, whether it completes normally or with an
 * exception.
 *
 * @param identifier The value to be searched.
 * @return the search result for licenses
 * @throws URISyntaxException When an error occurs while building the URL.
 * @throws ClientProtocolException When client does not support protocol used.
 * @throws IOException When an error occurs while parsing response.
 * @throws ParseException When an error occurs while parsing response.
 * @throws PersistenceException for database related errors
 * @throws ServiceException for any other problems encountered
 */
private SearchResult<License> getAllResults(String identifier) throws URISyntaxException,
        ClientProtocolException, IOException, ParseException, PersistenceException, ServiceException {
    DefaultHttpClient client = new DefaultHttpClient();
    try {
        URIBuilder builder = new URIBuilder(getSearchURL()).setPath("/Default.aspx");
        // hostId is captured before the tab parameter is added, so it excludes the query string
        String hostId = builder.build().toString();
        builder.setParameter("tabid", "799");

        // The landing page supplies the __VIEWSTATE value that the search post must echo back.
        HttpGet httpget = new HttpGet(builder.build());
        HttpResponse landing = client.execute(httpget);
        Document document = Jsoup.parse(EntityUtils.toString(landing.getEntity()));

        HttpPost httppost = new HttpPost(builder.build());
        HttpEntity entity = postForm(hostId, client, httppost,
                new String[][] { { "_ctl0:_ctl1:_ctl0:txtCriteria", identifier },
                        { "_ctl0:_ctl1:_ctl0:btnSubmit", "Search" }, { "__EVENTTARGET", "" },
                        { "__EVENTARGUMENT", "" },
                        { "__VIEWSTATE", document.select("#Form input[name=__VIEWSTATE]").first().val() } },
                true);

        // licenses list
        List<License> licenseList = new ArrayList<License>();
        while (entity != null) {
            document = Jsoup.parse(EntityUtils.toString(entity));

            // select() yields an empty collection when nothing matches, so no null check is needed
            for (Element row : document.select("table.Datagrid tr")) {
                String cssClass = row.attr("class");
                if (!"DatagridHeaderStyle".equals(cssClass.trim()) && row.children().size() == 8) {
                    licenseList.add(parseLicense(row.children()));
                }
            }

            // done, check if there are additional results
            entity = null;
            for (Element anchor : document.getElementsByTag("a")) {
                if (anchor.text().equals("Next >>")) {
                    // NOTE(review): the pager control id below is hard-coded; confirm it is
                    // valid for every result page.
                    entity = postForm(hostId, client, httppost,
                            new String[][] { { "_ctl0:_ctl1:_ctl0:txtCriteria", identifier },
                                    { "__EVENTTARGET", "_ctl0:_ctl1:_ctl0:dgrdLicensee:_ctl29:_ctl1" },
                                    { "__EVENTARGUMENT", "" },
                                    { "__VIEWSTATE",
                                            document.select("#Form input[name=__VIEWSTATE]").first().val() } },
                            true);
                    break;
                }
            }
        }

        SearchResult<License> result = new SearchResult<License>();
        result.setItems(licenseList);
        return result;
    } finally {
        // Release the connections held by this per-call client.
        client.getConnectionManager().shutdown();
    }
}

From source file:org.jasig.portlet.proxy.service.proxy.document.URLRewritingFilter.java

/**
 * Rewrites the URL-bearing attributes of the proxied document in place.
 *
 * <p>For every configured tag/attribute pair, relative URLs are made absolute (when a base URL
 * could be computed for the proxied location) and URLs matching one of the
 * {@code whitelistRegexes} portlet preferences are replaced by a form, action or resource URL
 * that passes through this portlet. Each rewritten URL is recorded in the session-scoped map
 * stored under {@code REWRITTEN_URLS_KEY}. Values starting with the JavaScript prefix and blank
 * values are neither adjusted nor whitelisted. Elements that do not carry the attribute are
 * left untouched.
 *
 * @param document      parsed proxied content; modified in place
 * @param proxyResponse supplies the proxied location used to compute the base and relative URLs
 * @param elementSet    maps a tag name to the names of the attributes on that tag that hold URLs
 * @param request       current render request; provides the portlet session and preferences
 * @param response      current render response; handed to the URL-creation helpers
 * @param action        when {@code true}, whitelisted URLs on non-form elements become action
 *                      URLs; otherwise they become resource URLs
 */
protected void updateUrls(final Document document, final IContentResponse proxyResponse,
        final Map<String, Set<String>> elementSet, final RenderRequest request, final RenderResponse response,
        boolean action) {

    // attempt to retrieve the list of rewritten URLs from the session
    final PortletSession session = request.getPortletSession();
    ConcurrentMap<String, String> rewrittenUrls;
    synchronized (PortletUtils.getSessionMutex(session)) {
        rewrittenUrls = (ConcurrentMap<String, String>) session.getAttribute(REWRITTEN_URLS_KEY);

        // if the rewritten URLs list doesn't exist yet, create it
        if (rewrittenUrls == null) {
            rewrittenUrls = new ConcurrentHashMap<String, String>();
            session.setAttribute(REWRITTEN_URLS_KEY, rewrittenUrls);
        }
    }

    // get the list of configured whitelist regexes
    final PortletPreferences preferences = request.getPreferences();
    final String[] whitelistRegexes = preferences.getValues("whitelistRegexes", new String[] {});
    // One slot per configured regex, filled on first use so that each regex is compiled at most
    // once per call instead of once per element.
    final Pattern[] whitelistPatterns = new Pattern[whitelistRegexes.length];

    // If we're proxying a remote website (as opposed to a local file system
    // resources, we'll need to transform any relative URLs.  To do this,
    // we first compute the base and relative URLs for the page.
    String baseUrl = null;
    String relativeUrl = null;
    try {
        baseUrl = getBaseServerUrl(proxyResponse.getProxiedLocation());
        relativeUrl = getRelativePathUrl(proxyResponse.getProxiedLocation());
        LOG.trace("Computed base url {} and relative url {} for proxied url {}", baseUrl, relativeUrl,
                proxyResponse.getProxiedLocation());
    } catch (URISyntaxException e) {
        LOG.error(e.getMessage(), e);
    }

    for (final Map.Entry<String, Set<String>> elementEntry : elementSet.entrySet()) {
        for (final String attributeName : elementEntry.getValue()) {

            // get a list of elements for this element type and iterate through
            // them, updating the relevant URL attribute
            final Elements elements = document.getElementsByTag(elementEntry.getKey());
            for (Element element : elements) {

                String attributeUrl = element.attr(attributeName);
                LOG.trace("Considering element {}  with URL attribute {} of value {}", element, attributeName,
                        attributeUrl);

                // An element without the attribute must stay without it: writing the (empty)
                // value back would add e.g. href="" or src="" to elements that never had one.
                if (!element.hasAttr(attributeName)) {
                    continue;
                }

                // don't adjust or filter javascript url targets
                if (StringUtils.isNotBlank(attributeUrl) && !attributeUrl.startsWith(JAVASCRIPT_PREFIX)
                        && !attributeUrl.startsWith(JAVASCRIPT_PREFIX.toLowerCase())) {

                    // if we're proxying a remote website, adjust any
                    // relative URLs into absolute URLs
                    if (baseUrl != null) {

                        // (1) do not prefix absolute URLs
                        if (attributeUrl.contains("://") || attributeUrl.startsWith("//")) {
                            // do nothing...
                        }

                        // (2) if the URL is relative to the server base,
                        // prepend the base URL
                        else if (attributeUrl.startsWith("/")) {
                            attributeUrl = baseUrl.concat(attributeUrl);
                        }

                        // (3) otherwise use the full relative path; relativeUrl is still null
                        // if only its computation failed above, in which case the URL is kept
                        else if (relativeUrl != null) {
                            attributeUrl = relativeUrl.concat(attributeUrl);
                        }

                    }

                    // if this URL matches our whitelist regex, rewrite it
                    // to pass through this portlet
                    for (int i = 0; i < whitelistRegexes.length; i++) {

                        final String regex = whitelistRegexes[i];
                        if (StringUtils.isNotBlank(regex)) {
                            if (whitelistPatterns[i] == null) {
                                whitelistPatterns[i] = Pattern.compile(regex);
                            }
                            if (whitelistPatterns[i].matcher(attributeUrl).find()) {

                                // record that we've rewritten this URL
                                rewrittenUrls.put(attributeUrl, attributeUrl);

                                // TODO: the value in the rewritten URLs map needs to
                                // be a resource URL.  we also want to key URLs by a short
                                // string rather than the full URL

                                if (elementEntry.getKey().equals("form")) {
                                    // the form action needs to be set to POST to
                                    // properly pass through our portlet
                                    boolean isPost = "POST".equalsIgnoreCase(element.attr("method"));
                                    if (!isPost) {
                                        element.attr("method", "POST");
                                    }
                                    attributeUrl = createFormUrl(response, isPost, attributeUrl);
                                }

                                else if (action) {
                                    attributeUrl = createActionUrl(response, attributeUrl);
                                }

                                else {
                                    attributeUrl = createResourceUrl(response, attributeUrl);
                                }

                                // The URL now points at this portlet; stop here so a later
                                // regex cannot match the rewritten value and wrap it again.
                                break;
                            }
                        }
                    }

                }

                element.attr(attributeName, attributeUrl.replace("&amp;", "&"));

            }

        }

    }

}

From source file:crawler.HackerEarthCrawler.java

@Override
public void crawl() {

    int flag = 0;

    //set of urls which should be crawled
    TreeSet<String> linksset = new TreeSet<String>();
    TreeSet<String> tempset = new TreeSet<String>();
    TreeSet<String> tutorialset = new TreeSet<String>();
    //final set of problem urls
    TreeSet<String> problemset = new TreeSet<String>();
    //visited for maintaing status of if url is already crawled or not
    TreeMap<String, Integer> visited = new TreeMap<String, Integer>();

    //add base url
    linksset.add(baseUrl);/*from w  w  w  .  j  ava 2s.c  o  m*/
    //mark base url as not crawled
    visited.put(baseUrl, 0);

    try {
        while (true) {
            flag = 0;
            tempset.clear();

            for (String str : linksset) {
                //check if url is already crawled or not and it has valid domain name
                if ((visited.get(str) == 0) && (str.startsWith("https://www.hackerearth.com/"))) {
                    System.out.println("crawling  " + str);

                    //retriving response of current url as document
                    Document doc = Jsoup.connect(str).timeout(0).userAgent(
                            "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/25.0")
                            .referrer("http://www.google.com").ignoreHttpErrors(true).get();
                    //retriving all urls from current page
                    Elements links = doc.select("a[href]");

                    //mark url as crawled
                    visited.put(str, 1);

                    //mark flag as url is crawled
                    flag = 1;
                    //retrive all urls
                    for (Element link : links) {
                        if (link.absUrl("href").endsWith("/tutorial/")) {
                            tutorialset.add(link.absUrl("href"));
                        }
                        //check if url is problem url then add it in problemurlset
                        if (link.absUrl("href").startsWith("https://www.hackerearth.com/")
                                && isProblemUrl(link.absUrl("href"))) {
                            problemset.add(link.absUrl("href"));
                        }
                        //check if url has valid domain and it has problem urls or not
                        if (link.absUrl("href").contains(("https://www.hackerearth.com/"))
                                && isCrawlable(link.absUrl("href"))) {
                            //if link is not visited then mark it as uncrawled
                            if (!visited.containsKey(link.absUrl("href"))) {
                                visited.put(link.absUrl("href"), 0);
                            }
                            //add it in tempsetorary set
                            tempset.add(link.absUrl("href"));
                            //System.out.println("\n  base: "+str+" ::: link  : " + link.absUrl("href"));
                        }
                    }
                }
            }
            //if nothing is left to crawl break the loop
            if (flag == 0) {
                break;
            }
            //add all retrieved links to linksset
            linksset.addAll(tempset);
        }

        System.out.println("\n\ntotal problem urls " + problemset.size());

        int i = 0;
        for (String str : problemset) {
            System.out.println("link " + i + " : " + str);
            i++;
        }

    } catch (IOException ex) {
        Logger.getLogger(HackerEarthCrawler.class.getName()).log(Level.SEVERE, null, ex);
    }

    //scrap and store into database
    //for every problem url scrap problem page
    for (String problemUrl : problemset) {

        System.out.println("problemUrl :" + problemUrl);
        try {
            //create problem class to store in database
            Problem problem = new Problem();
            String problemSIOC = "", problemIOC = "";
            String problemTitle = "", problemStatement = "", problemInput = "", problemOutput = "",
                    problemConstraints = "";
            String sampleInput = "", sampleOutput = "";
            String problemExplanation = "";
            //set default timelimit to 1 second
            double problemTimeLimit = 1.0;
            ArrayList<String> tags = new ArrayList<String>();

            //get response for given problem url
            Response response = Jsoup.connect(problemUrl).execute();
            Document doc = response.parse();

            //retrieve problem title from page
            Element elementTitle = doc.getElementsByTag("title").first();
            StringTokenizer stTitle = new StringTokenizer(elementTitle.text(), "|");
            problemTitle = stTitle.nextToken().trim();

            Element content = doc.getElementsByClass("starwars-lab").first();
            problemSIOC = content.text();
            Elements e = content.children();

            //to find problem statement
            String breakloop[] = { "input", "input:", "input :", "input format:", "input format :",
                    "input format", "Input and output", "constraints :", "constraints:", "constraints",
                    "$$Input :$$" };
            flag = 0;
            for (Element p : e) {
                String tempStatement = "";
                for (Element pp : p.getAllElements()) {

                    for (String strbreak : breakloop) {
                        if (StringUtils.equalsIgnoreCase(pp.ownText(), strbreak)) {
                            //System.out.println("strbreak :"+strbreak);

                            tempStatement = p.text().substring(0,
                                    p.text().toLowerCase().indexOf(strbreak.toLowerCase()));
                            // System.out.println("temp "+tempStatement);
                            flag = 1;
                            break;
                        }
                    }
                }

                if (flag == 1) {
                    problemStatement += tempStatement;
                    //remove extra space at end
                    if (tempStatement.length() == 0) {
                        problemStatement = problemStatement.substring(0, problemStatement.length() - 1);
                    }
                    break;
                }
                problemStatement += p.text() + " ";
            }

            System.out.println("problemSIOC :" + problemSIOC);
            System.out.println("problemStatement :" + problemStatement);

            if (problemStatement.length() <= problemSIOC.length()) {
                //remove problem statement from whole text and remove extra spaces at the beginning and the end
                problemIOC = problemSIOC.substring(problemStatement.length()).trim();
            } else {
                problemIOC = "";
            }

            System.out.println("problemIOC :" + problemIOC);

            //keywords for identifying input
            String decideInput[] = { "Input format :", "Input format:", "Input format", "inputformat:",
                    "inputformat :", "inputformat", "input and output", "input :", "input:", "input" };
            //keywords for identifying output
            String decideOutput[] = { "output format :", "output format:", "Output format", "outputformat:",
                    "outputformat :", "outputformat", "output :", "output:", "output" };
            //keywords for identifying constraint
            String decideConstraint[] = { "constraints:", "constraints :", "constraints", "Constraints :",
                    "constraint:", "constraint :", "constraint", "Contraints :" };

            int posin = 0, posoutput = 0, poscon = 0, idxin, idxout, idxcon, flaginput = 0, flagoutput = 0,
                    flagcon = 0, inlen = 0, outlen = 0, conlen = 0;

            //find inputformat position,length of keyword
            for (idxin = 0; idxin < decideInput.length; idxin++) {
                if (StringUtils.containsIgnoreCase(problemIOC, decideInput[idxin])) {

                    posin = problemIOC.toLowerCase().indexOf(decideInput[idxin].toLowerCase());
                    flaginput = 1;
                    inlen = decideInput[idxin].length();

                    //decide it is keyowrd for actucal input or it is "sample input"
                    if (StringUtils.containsIgnoreCase(problemIOC, "sample input")) {
                        if (posin > problemIOC.toLowerCase().indexOf("sample input")) {
                            flaginput = 0;
                            inlen = 0;
                        } else {
                            break;
                        }
                    } else {
                        break;
                    }
                }
            }

            //find outputformat position,length of keyword
            for (idxout = 0; idxout < decideOutput.length; idxout++) {
                if (StringUtils.containsIgnoreCase(problemIOC, decideOutput[idxout])) {
                    posoutput = problemIOC.toLowerCase().indexOf(decideOutput[idxout].toLowerCase());
                    flagoutput = 1;
                    outlen = decideOutput[idxout].length();
                    break;
                }
            }

            //find constraint position,length of keyword
            for (idxcon = 0; idxcon < decideConstraint.length; idxcon++) {
                if (StringUtils.containsIgnoreCase(problemIOC, decideConstraint[idxcon])) {
                    poscon = problemIOC.toLowerCase().indexOf(decideConstraint[idxcon].toLowerCase());
                    flagcon = 1;
                    conlen = decideConstraint[idxcon].length();
                    break;
                }
            }

            System.out.println("input " + flaginput + " " + inlen + " " + posin);
            System.out.println("output " + flagoutput + " " + outlen + " " + posoutput);
            System.out.println("constraint " + flagcon + " " + conlen + " " + poscon);
            //retrieve problem input and output if present in problem page

            //if input format is present
            if (flaginput == 1) {
                //if input keyword is "input and output" and contraint is present in problem page
                if (idxin == 6 && flagcon == 1) {
                    problemInput = problemIOC.substring(inlen, poscon);
                }
                //if input keyword is "input and output" and contraint is not present in problem page
                else if (idxin == 6 && flagcon == 0) {
                    problemInput = problemIOC.substring(inlen);
                }
                //if output format and constraint is present
                else if (flagoutput == 1 && flagcon == 1) {
                    //if constraint is present before input format
                    if (poscon < posin) {
                        problemInput = problemIOC.substring(posin + inlen, posoutput);
                        problemOutput = problemIOC.substring(posoutput + outlen);
                    }
                    //if constraint is present before sample
                    else if (poscon < posoutput) {
                        problemInput = problemIOC.substring(inlen, poscon);
                        problemOutput = problemIOC.substring(posoutput + outlen);
                    } else {
                        problemInput = problemIOC.substring(inlen, posoutput);
                        problemOutput = problemIOC.substring(posoutput + outlen, poscon);
                    }
                }
                //if constraint is not present
                else if (flagoutput == 1 && flagcon == 0) {
                    problemInput = problemIOC.substring(inlen, posoutput);
                    problemOutput = problemIOC.substring(posoutput + outlen);
                } else if (flagoutput == 0 && flagcon == 1) {
                    if (poscon < posin) {
                        problemInput = problemIOC.substring(posin + inlen);
                    } else {
                        problemInput = problemIOC.substring(poscon + conlen, posin);
                    }
                    problemOutput = "";
                } else {
                    problemInput = problemIOC.substring(inlen);
                    problemOutput = "";
                }
            }
            //if input format and output format is not present
            else {
                problemInput = "";
                problemOutput = "";
            }

            //if constraint is present
            if (flagcon == 1) {
                //if constraint is present before input format
                if (poscon < posin) {
                    problemConstraints = problemIOC.substring(0, posin);
                }
                //if constraint is present before output format
                else if (poscon < posoutput) {
                    problemConstraints = problemIOC.substring(poscon + conlen, posoutput);
                } else {
                    problemConstraints = problemIOC.substring(poscon + conlen);
                }
            }

            System.out.println("problemInput :" + problemInput);
            System.out.println("problemOutput :" + problemOutput);
            System.out.println("problemConstraints :" + problemConstraints);

            //retrieve problem tags from problem page
            Element elementtag = doc.getElementsByClass("problem-tags").first().child(1);
            StringTokenizer st = new StringTokenizer(elementtag.text(), ",");
            while (st.hasMoreTokens()) {
                tags.add(st.nextToken().trim());
            }

            //retrieve sample input sample output if present
            Element elementSIO = doc.getElementsByClass("input-output-container").first();
            //if sample input output is present
            if (elementSIO != null) {
                //find position of sample output
                int soutpos = elementSIO.text().indexOf("SAMPLE OUTPUT");
                sampleInput = elementSIO.text().substring(12, soutpos);
                sampleOutput = elementSIO.text().substring(soutpos + 13);
                System.out.println("Sample input :\n" + sampleInput + "\n\n\n");
                System.out.println("Sample Output :\n" + sampleOutput);
            } else {
                sampleInput = "";
                sampleOutput = "";
            }

            //retrieve problem explanation from problem page if present
            Element elementExplanation = doc.getElementsByClass("standard-margin").first().child(0);
            if (elementExplanation.text().toLowerCase().contains("explanation")) {
                problemExplanation = elementExplanation.nextElementSibling().text();
            }
            System.out.println("Explanation :" + problemExplanation);

            //retrieve timelimit
            Element elementTL = doc.getElementsByClass("problem-guidelines").first().child(0).child(1);
            StringTokenizer stTL = new StringTokenizer(elementTL.ownText(), " ");
            problemTimeLimit = Double.parseDouble(stTL.nextToken());

            //System.out.println("problemTimeLimit :"+problemTimeLimit);
            //set all retrieved information to problem class
            problem.setProblemUrl(problemUrl);
            if (problemTitle.length() == 0) {
                problemTitle = null;
            }
            if (problemStatement.length() == 0) {
                problemStatement = null;
            }
            if (problemInput.length() == 0) {
                problemInput = null;
            }
            if (problemOutput.length() == 0) {
                problemOutput = null;
            }
            if (problemExplanation.length() == 0) {
                problemExplanation = null;
            }
            if (problemConstraints.length() == 0) {
                problemConstraints = null;
            }
            problem.setTitle(problemTitle);
            problem.setProblemUrl(problemUrl);
            problem.setProblemStatement(problemStatement);
            problem.setInputFormat(problemInput);
            problem.setOutputFormat(problemOutput);
            problem.setTimeLimit(problemTimeLimit);
            problem.setExplanation(problemExplanation);
            problem.setConstraints(problemConstraints);

            //set sample input output to problem class
            SampleInputOutput sampleInputOutput = new SampleInputOutput(problem, sampleInput, sampleOutput);
            problem.getSampleInputOutputs().add(sampleInputOutput);
            //set platform as hackerearth
            problem.setPlatform(Platform.HackerEarth);
            for (String strtag : tags) {
                problem.getTags().add(strtag);
            }

            //store in database
            Session session = null;
            Transaction transaction = null;
            try {
                //start session
                session = HibernateUtil.getSessionFactory().openSession();
                transaction = session.beginTransaction();

                //check if problem is already stored in database
                String hql = "FROM Problem p where p.problemUrl = :problem_url";
                Problem oldProblem = (Problem) session.createQuery(hql).setString("problem_url", problemUrl)
                        .uniqueResult();
                String task;

                //if problem is present in database
                if (oldProblem != null) {
                    //update the old problem
                    task = "updated";
                    //retrieve id of old problem
                    problem.setId(oldProblem.getId());
                    session.delete(oldProblem);
                    session.flush();
                    session.save(problem);
                } else {
                    task = "saved";
                    session.save(problem);
                }

                transaction.commit();
                //log the info to console
                Logger.getLogger(CodeForcesCrawler.class.getName()).log(Level.INFO, "{0} {1}",
                        new Object[] { task, problem.getProblemUrl() });
            } catch (HibernateException ee) {
                if (transaction != null) {
                    transaction.rollback();
                }
                Logger.getLogger(CodeForcesCrawler.class.getName()).log(Level.SEVERE,
                        "Cannot Insert/Update problem into databse: " + problemUrl, e);
            } finally {
                //close the session
                if (session != null) {
                    session.close();
                }
            }
        } catch (Exception ee) {
            System.out.println(ee.toString());
        }
    }

    System.out.println("\n\n\n\ntutorial urls\n\n");
    try {

        for (String tutorialurl : tutorialset) {
            //System.out.println(tutorialurl+"\n\n");
            Response tutorialres = Jsoup.connect(tutorialurl).execute();
            Document doc = tutorialres.parse();

            Tutorial tutorial = new Tutorial();
            tutorial.setContent(doc.getElementsByClass("tutorial").first().text());

            tutorial.setName(baseUrl);
            tutorialurl = tutorialurl.substring(0, tutorialurl.length() - 10);
            StringTokenizer tutorialtok = new StringTokenizer(tutorialurl, "/");

            String tempstr = "";
            while (tutorialtok.hasMoreTokens()) {
                tempstr = tutorialtok.nextToken();
            }

            Session session = null;
            Transaction transaction = null;
            try {
                //start session
                session = HibernateUtil.getSessionFactory().openSession();
                transaction = session.beginTransaction();

                //check if problem is already stored in database
                String hql = "FROM Tutorial p where p.name = :name";
                Tutorial oldProblem = (Tutorial) session.createQuery(hql).setString("name", tempstr)
                        .uniqueResult();
                String task;

                //if problem is present in database
                if (oldProblem != null) {
                    //update the old problem
                    task = "updated";
                    //retrieve id of old problem
                    tutorial.setName(oldProblem.getName());
                    session.delete(oldProblem);
                    session.flush();
                    session.save(tutorial);
                } else {
                    task = "saved";
                    tutorial.setName(tempstr);
                    session.save(tutorial);
                }

                transaction.commit();
                //log the info to console
                Logger.getLogger(CodeForcesCrawler.class.getName()).log(Level.INFO, "{0} {1}",
                        new Object[] { task, tutorial.getName() });
            } catch (HibernateException ee) {
                if (transaction != null) {
                    transaction.rollback();
                }
                Logger.getLogger(CodeForcesCrawler.class.getName()).log(Level.SEVERE,
                        "Cannot Insert/Update problem into databse: " + tempstr, ee);
            } finally {
                //close the session
                if (session != null) {
                    session.close();
                }
            }

        }
    } catch (Exception e) {
        System.out.println(e.getMessage());
    }
}

From source file:com.google.android.gms.example.bannerexample.CreateFile.java

/**
 * Parses the supplied HTML and, for each {@code <div>} element found in it,
 * appends that element's id followed by {@code ".txt"} to {@code dataList3}
 * (a list defined outside this method).
 *
 * @param data HTML markup to parse
 * @param tag  not read by this method
 * @return {@code dataList3}, including the entries appended by this call
 */
public ArrayList<String> parseHTML3(String data, String tag) {
    Elements divs = Jsoup.parse(data).getElementsByTag("div");
    for (int idx = 0; idx < divs.size(); idx++) {
        // One entry per <div>, in the order the elements are returned.
        dataList3.add(divs.get(idx).id() + ".txt");
    }
    return dataList3;
}