List of usage examples for org.jsoup.nodes.Document#getElementsByTag
public Elements getElementsByTag(String tagName)
From source file:jp.mau.twappremover.MainActivity.java
private void getTopPage() { Document doc = null; try {/*from w ww . ja v a2 s . c o m*/ Connection conn = Jsoup.connect(TOP_PAGE); conn.header("User-Agent", USER_AGENT); conn.header("Connection", "keep-alive"); doc = conn.get(); Response res = conn.response(); Map<String, String> cookies = res.cookies(); for (Map.Entry<String, String> e : cookies.entrySet()) { if (e.getKey().equals(KEY_SESSION_ID)) _session_id = e.getValue(); if (e.getKey().equals(KEY_GUEST_ID)) _guest_id = e.getValue(); } } catch (IOException e) { e.printStackTrace(); } if (doc == null) return; // parse top page and get authenticity token Elements forms = doc.getElementsByTag("form"); for (Element e : forms) { Elements auths = e.getElementsByAttributeValue("name", "authenticity_token"); if (auths.size() > 0) { _auth_token = auths.get(0).attr("value"); break; } } if (_auth_token == null) { return; } }
From source file:gov.medicaid.screening.dao.impl.DieteticsAndNutritionPracticeLicenseDAOBean.java
/** * Performs a search for all possible results. * * @param identifier The value to be searched. * @return the search result for licenses * @throws URISyntaxException When an error occurs while building the URL. * @throws ClientProtocolException When client does not support protocol used. * @throws IOException When an error occurs while parsing response. * @throws ParseException When an error occurs while parsing response. * @throws PersistenceException for database related errors * @throws ServiceException for any other errors *//* www .j a v a 2 s .co m*/ private SearchResult<License> getAllResults(String identifier) throws URISyntaxException, ClientProtocolException, IOException, ParseException, PersistenceException, ServiceException { DefaultHttpClient client = new DefaultHttpClient(); URIBuilder builder = new URIBuilder(getSearchURL()); String hostId = builder.build().toString(); HttpGet httpget = new HttpGet(builder.build()); HttpResponse landing = client.execute(httpget); Document document = Jsoup.parse(EntityUtils.toString(landing.getEntity())); HttpPost httppost = new HttpPost(builder.build()); HttpEntity entity = postForm(hostId, client, httppost, new String[][] { { "_ctl0:_ctl1:_ctl0:txtCriteria", identifier }, { "_ctl0:_ctl1:_ctl0:btnSubmit", "Search" }, { "__EVENTTARGET", "" }, { "__EVENTARGUMENT", "" }, { "__VIEWSTATE", document.select("#Form input[name=__VIEWSTATE]").first().val() } }, true); // licenses list List<License> licenseList = new ArrayList<License>(); while (entity != null) { String result = EntityUtils.toString(entity); document = Jsoup.parse(result); Elements trs = document.select(GRID_ROW_SELECTOR); if (trs != null) { for (Element element : trs) { licenseList.add(parseLicense(element.children())); } } // done, check if there are additional results entity = null; Elements elements = document.getElementsByTag("a"); for (Element element : elements) { if (element.text().equals("Next >>")) { entity = postForm(hostId, client, httppost, new 
String[][] { { "_ctl0:_ctl1:_ctl0:txtCriteria", identifier }, { "__EVENTTARGET", "_ctl0:_ctl1:_ctl0:dgrdLicensee:_ctl29:_ctl1" }, { "__EVENTARGUMENT", "" }, { "__VIEWSTATE", document.select("#Form input[name=__VIEWSTATE]").first().val() } }, true); break; } } } SearchResult<License> result = new SearchResult<License>(); result.setItems(licenseList); return result; }
From source file:mx.itdurango.rober.siitdocentes.ActivityAlumnos.java
/** * Permite descomponer el cdigo html que se enva con una estructura especifica para llenar los datos de la vista * * @param html cdigo html que se recibi de una peticin HttpGet, debe tener una estructura similar a la siguiente para que el proceso funcione correctamente * <p/>/*ww w .ja v a 2 s. c om*/ * <input name="periodo" type="hidden" value="20141" /> * <input name="materia" type="hidden" value="SD2424" /> * <input name="grupo" type="hidden" value="5VR" /> * <input name="docente" type="hidden" value="LOQR841213822" /> * <input name="fecha_captura" type="hidden" value="2014/06/12" /> * <table> * <tr> * <td>No</td> * <td>Noctrl</td> * <td>Nombre</td> * <td>Unidad 1</td> * <td>Unidad 1</td> * <td>Unidad 3</td> * <td>...</td> * <td>Unidad N</td> * </tr> * <tr> * <td>1</td> * <td>9999999</td> * <td>XXXXXXXXXXXXXXXXXXXXX</td> * <td><input type="text" name="calif[1][1]" value="999"/></td> * <td><input type="text" name="calif[1][2]" value="999"/></td> * <td><input type="text" name="calif[1][3]" value="999"/></td> * <td>...</td> * <td><input type="text" name="calif[1][N]" value="999"/></td> * </tr> * <tr> * <td>2</td> * <td>888888888</td> * <td>YYYYYYYYYYYYYYYYYYYYY</td> * <td><input type="text" name="calif[2][1]" value="999"/></td> * <td><input type="text" name="calif[2][2]" value="999"/></td> * <td><input type="text" name="calif[2][3]" value="999"/></td> * <td>...</td> * <td><input type="text" name="calif[2][N]" value="999"/></td> * </tr> * <tr> * <td>M</td> * <td>000000000</td> * <td>ZZZZZZZZZZZZZZZZZZZZZZ</td> * <td><input type="text" name="calif[M][1]" value="999"/></td> * <td><input type="text" name="calif[M][2]" value="999"/></td> * <td><input type="text" name="calif[M][3]" value="999"/></td> * <td>...</td> * <td><input type="text" name="calif[M][N]" value="999"/></td> * </tr> * </table> */ void llenaAlumnos(String html) { //Generar un archivo de documento para almacenar los datos del html de forma que se pueda //manipular facilmente usando la librera Jsoup 
Document doc = Jsoup.parse(html); try { //extraer los valores de los elementos del formulario y almacenarlos en los atributos correspondientes de la clase Elements e = doc.getElementsByAttributeValue("name", "periodo"); periodo = e.get(0).attr("value"); e = doc.getElementsByAttributeValue("name", "materia"); materia = e.get(0).attr("value"); e = doc.getElementsByAttributeValue("name", "grupo"); grupo = e.get(0).attr("value"); e = doc.getElementsByAttributeValue("name", "docente"); docente = e.get(0).attr("value"); e = doc.getElementsByAttributeValue("name", "fecha_captura"); fecha_captura = e.get(0).attr("value"); //extraer la tabla correspondiente al listado de alumnos en el caso del siit.itdurango.edu.mx, // corresponde a la tabla numero 2 y ya que la numeracin comienza en 0, la tabla que necesitamos est en el indice 1 Element tabla = doc.getElementsByTag("table").get(1); //Extraer todos los elementos de tipo tr que pertenecen a la tabla y almacenarlos en una coleccion de tipo Elements. Elements renglones = tabla.getElementsByTag("tr"); //Recorrer la coleccin de renglones y almacenar cada uno en un objeto for (Element tr : renglones) { //para cada objeto tr, extraer sus elementos td y almacenarlos en una coleccion Elements tds = tr.getElementsByTag("td"); //permite llevar el control de la columna que se est leyendo, ya que las columnas no tienen un id o clase, se realiza el proceso a mano. 
int col = 1; //contenedor de tipo AlumosParciales para almacenar la informacin de cada alumno (tr) AlumnosParciales c = new AlumnosParciales(); for (Element td : tds) { if (col == 1) {// la columna 1 corresponde al nmero consecutivo de la tabla c.setNum(td.html()); } else if (col == 2) {// la columna 2 corresponde al nmero de control del alumno c.setControl(td.html()); } else if (col == 3) {// la columna 3 corresponde al nombre del alumno c.setNombre(Estaticos.sanitize(td.html())); } else { //el resto de las columnas pertenecen a las calificaciones parciales //se extrae el elemento <input> de la columna y se obtiene el atributo valor para recuperar la calificacin en caso de que ya hubiera sido asignada String cal = td.getElementsByTag("input").get(0).attr("value"); ArrayList<String> calif = c.getCalificaciones(); calif.add(cal); //se agrega la nueva calificacin al conjunto de calificaciones del alumno c.setCalificaciones(calif); } col++; //incrementa el numero de columa } if (c.getCalificaciones().size() > 0) { //para evitar agregar al listado de alumnos el encabezado de la tabla, validamos que existan calificaciones. 
gcs.add(c); } } //Llenamos el spinner de unidades a partir del numero de calificaciones que existen en el arreglo List<String> spinnerArray = new ArrayList<String>(); for (int i = 1; i <= gcs.get(1).getCalificaciones().size() - 1; i++) { spinnerArray.add("Unidad " + i); } ArrayAdapter<String> adapter = new ArrayAdapter<String>(this, android.R.layout.simple_spinner_item, spinnerArray); adapter.setDropDownViewResource(android.R.layout.simple_spinner_dropdown_item); spn_unidad.setAdapter(adapter); //llenamos el listado de alumnos con la informacin que se obtuvo del proceso anterior alumnosParcialesAdapter = new AlumnosParcialesAdapter(this, gcs, unidad); lvAlumnos.setAdapter(alumnosParcialesAdapter); } catch (Exception e) { e.printStackTrace(); Toast.makeText(this, getString(R.string.error_parser), Toast.LENGTH_SHORT).show(); finish(); //finaliza el intent actual para desplegar el anterior } }
From source file:gov.medicaid.screening.dao.impl.OptometryLicenseDAOBean.java
/** * Performs a search for all possible results. * * @param identifier The value to be searched. * @return the search result for licenses * @throws URISyntaxException When an error occurs while building the URL. * @throws ClientProtocolException When client does not support protocol used. * @throws IOException When an error occurs while parsing response. * @throws ParseException When an error occurs while parsing response. * @throws PersistenceException for database related errors * @throws ServiceException for any other problems encountered *//*from w w w . j ava 2 s . c om*/ private SearchResult<License> getAllResults(String identifier) throws URISyntaxException, ClientProtocolException, IOException, ParseException, PersistenceException, ServiceException { DefaultHttpClient client = new DefaultHttpClient(); URIBuilder builder = new URIBuilder(getSearchURL()).setPath("/Default.aspx"); String hostId = builder.build().toString(); builder.setParameter("tabid", "799"); HttpGet httpget = new HttpGet(builder.build()); HttpResponse landing = client.execute(httpget); Document document = Jsoup.parse(EntityUtils.toString(landing.getEntity())); HttpPost httppost = new HttpPost(builder.build()); HttpEntity entity = postForm(hostId, client, httppost, new String[][] { { "_ctl0:_ctl1:_ctl0:txtCriteria", identifier }, { "_ctl0:_ctl1:_ctl0:btnSubmit", "Search" }, { "__EVENTTARGET", "" }, { "__EVENTARGUMENT", "" }, { "__VIEWSTATE", document.select("#Form input[name=__VIEWSTATE]").first().val() } }, true); // licenses list List<License> licenseList = new ArrayList<License>(); while (entity != null) { String result = EntityUtils.toString(entity); document = Jsoup.parse(result); Elements trs = document.select("table.Datagrid tr"); if (trs != null) { for (Element element : trs) { String cssClass = element.attr("class"); if (!"DatagridHeaderStyle".equals(cssClass.trim()) && element.children().size() == 8) { Elements tds = element.children(); licenseList.add(parseLicense(tds)); } } } // 
done, check if there are additional results entity = null; Elements elements = document.getElementsByTag("a"); for (Element element : elements) { if (element.text().equals("Next >>")) { entity = postForm(hostId, client, httppost, new String[][] { { "_ctl0:_ctl1:_ctl0:txtCriteria", identifier }, { "__EVENTTARGET", "_ctl0:_ctl1:_ctl0:dgrdLicensee:_ctl29:_ctl1" }, { "__EVENTARGUMENT", "" }, { "__VIEWSTATE", document.select("#Form input[name=__VIEWSTATE]").first().val() } }, true); break; } } } SearchResult<License> result = new SearchResult<License>(); result.setItems(licenseList); return result; }
From source file:org.jasig.portlet.proxy.service.proxy.document.URLRewritingFilter.java
protected void updateUrls(final Document document, final IContentResponse proxyResponse, final Map<String, Set<String>> elementSet, final RenderRequest request, final RenderResponse response, boolean action) { // attempt to retrieve the list of rewritten URLs from the session final PortletSession session = request.getPortletSession(); ConcurrentMap<String, String> rewrittenUrls; synchronized (PortletUtils.getSessionMutex(session)) { rewrittenUrls = (ConcurrentMap<String, String>) session.getAttribute(REWRITTEN_URLS_KEY); // if the rewritten URLs list doesn't exist yet, create it if (rewrittenUrls == null) { rewrittenUrls = new ConcurrentHashMap<String, String>(); session.setAttribute(REWRITTEN_URLS_KEY, rewrittenUrls); }/* w w w.j av a 2 s .c om*/ } // get the list of configured whitelist regexes final PortletPreferences preferences = request.getPreferences(); final String[] whitelistRegexes = preferences.getValues("whitelistRegexes", new String[] {}); // If we're proxying a remote website (as opposed to a local file system // resources, we'll need to transform any relative URLs. To do this, // we first compute the base and relative URLs for the page. 
String baseUrl = null; String relativeUrl = null; try { baseUrl = getBaseServerUrl(proxyResponse.getProxiedLocation()); relativeUrl = getRelativePathUrl(proxyResponse.getProxiedLocation()); LOG.trace("Computed base url {} and relative url {} for proxied url {}", baseUrl, relativeUrl, proxyResponse.getProxiedLocation()); } catch (URISyntaxException e) { LOG.error(e.getMessage(), e); } for (final Map.Entry<String, Set<String>> elementEntry : elementSet.entrySet()) { for (final String attributeName : elementEntry.getValue()) { // get a list of elements for this element type and iterate through // them, updating the relevant URL attribute final Elements elements = document.getElementsByTag(elementEntry.getKey()); for (Element element : elements) { String attributeUrl = element.attr(attributeName); LOG.trace("Considering element {} with URL attribute {} of value {}", element, attributeName, attributeUrl); // don't adjust or filter javascript url targets if (StringUtils.isNotBlank(attributeUrl) && !attributeUrl.startsWith(JAVASCRIPT_PREFIX) && !attributeUrl.startsWith(JAVASCRIPT_PREFIX.toLowerCase())) { // if we're proxying a remote website, adjust any // relative URLs into absolute URLs if (baseUrl != null) { // (1) do not prefix absolute URLs if (attributeUrl.contains("://") || attributeUrl.startsWith("//")) { // do nothing... 
} // (2) if the URL is relative to the server base, // prepend the base URL else if (attributeUrl.startsWith("/")) { attributeUrl = baseUrl.concat(attributeUrl); } // (3) otherwise use the full relative path else { attributeUrl = relativeUrl.concat(attributeUrl); } } // if this URL matches our whitelist regex, rewrite it // to pass through this portlet for (String regex : whitelistRegexes) { if (StringUtils.isNotBlank(regex)) { final Pattern pattern = Pattern.compile(regex); // TODO share compiled regexes if (pattern.matcher(attributeUrl).find()) { // record that we've rewritten this URL rewrittenUrls.put(attributeUrl, attributeUrl); // TODO: the value in the rewritten URLs map needs to // be a resource URL. we also want to key URLs by a short // string rather than the full URL if (elementEntry.getKey().equals("form")) { // the form action needs to be set to POST to // properly pass through our portlet boolean isPost = "POST".equalsIgnoreCase(element.attr("method")); if (!isPost) { element.attr("method", "POST"); } attributeUrl = createFormUrl(response, isPost, attributeUrl); } else if (action) { attributeUrl = createActionUrl(response, attributeUrl); } else { attributeUrl = createResourceUrl(response, attributeUrl); } } } } } element.attr(attributeName, attributeUrl.replace("&", "&")); } } } }
From source file:crawler.HackerEarthCrawler.java
@Override public void crawl() { int flag = 0; //set of urls which should be crawled TreeSet<String> linksset = new TreeSet<String>(); TreeSet<String> tempset = new TreeSet<String>(); TreeSet<String> tutorialset = new TreeSet<String>(); //final set of problem urls TreeSet<String> problemset = new TreeSet<String>(); //visited for maintaing status of if url is already crawled or not TreeMap<String, Integer> visited = new TreeMap<String, Integer>(); //add base url linksset.add(baseUrl);/*from w w w . j ava 2s.c o m*/ //mark base url as not crawled visited.put(baseUrl, 0); try { while (true) { flag = 0; tempset.clear(); for (String str : linksset) { //check if url is already crawled or not and it has valid domain name if ((visited.get(str) == 0) && (str.startsWith("https://www.hackerearth.com/"))) { System.out.println("crawling " + str); //retriving response of current url as document Document doc = Jsoup.connect(str).timeout(0).userAgent( "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/25.0") .referrer("http://www.google.com").ignoreHttpErrors(true).get(); //retriving all urls from current page Elements links = doc.select("a[href]"); //mark url as crawled visited.put(str, 1); //mark flag as url is crawled flag = 1; //retrive all urls for (Element link : links) { if (link.absUrl("href").endsWith("/tutorial/")) { tutorialset.add(link.absUrl("href")); } //check if url is problem url then add it in problemurlset if (link.absUrl("href").startsWith("https://www.hackerearth.com/") && isProblemUrl(link.absUrl("href"))) { problemset.add(link.absUrl("href")); } //check if url has valid domain and it has problem urls or not if (link.absUrl("href").contains(("https://www.hackerearth.com/")) && isCrawlable(link.absUrl("href"))) { //if link is not visited then mark it as uncrawled if (!visited.containsKey(link.absUrl("href"))) { visited.put(link.absUrl("href"), 0); } //add it in tempsetorary set tempset.add(link.absUrl("href")); //System.out.println("\n 
base: "+str+" ::: link : " + link.absUrl("href")); } } } } //if nothing is left to crawl break the loop if (flag == 0) { break; } //add all retrieved links to linksset linksset.addAll(tempset); } System.out.println("\n\ntotal problem urls " + problemset.size()); int i = 0; for (String str : problemset) { System.out.println("link " + i + " : " + str); i++; } } catch (IOException ex) { Logger.getLogger(HackerEarthCrawler.class.getName()).log(Level.SEVERE, null, ex); } //scrap and store into database //for every problem url scrap problem page for (String problemUrl : problemset) { System.out.println("problemUrl :" + problemUrl); try { //create problem class to store in database Problem problem = new Problem(); String problemSIOC = "", problemIOC = ""; String problemTitle = "", problemStatement = "", problemInput = "", problemOutput = "", problemConstraints = ""; String sampleInput = "", sampleOutput = ""; String problemExplanation = ""; //set default timelimit to 1 second double problemTimeLimit = 1.0; ArrayList<String> tags = new ArrayList<String>(); //get response for given problem url Response response = Jsoup.connect(problemUrl).execute(); Document doc = response.parse(); //retrieve problem title from page Element elementTitle = doc.getElementsByTag("title").first(); StringTokenizer stTitle = new StringTokenizer(elementTitle.text(), "|"); problemTitle = stTitle.nextToken().trim(); Element content = doc.getElementsByClass("starwars-lab").first(); problemSIOC = content.text(); Elements e = content.children(); //to find problem statement String breakloop[] = { "input", "input:", "input :", "input format:", "input format :", "input format", "Input and output", "constraints :", "constraints:", "constraints", "$$Input :$$" }; flag = 0; for (Element p : e) { String tempStatement = ""; for (Element pp : p.getAllElements()) { for (String strbreak : breakloop) { if (StringUtils.equalsIgnoreCase(pp.ownText(), strbreak)) { //System.out.println("strbreak :"+strbreak); 
tempStatement = p.text().substring(0, p.text().toLowerCase().indexOf(strbreak.toLowerCase())); // System.out.println("temp "+tempStatement); flag = 1; break; } } } if (flag == 1) { problemStatement += tempStatement; //remove extra space at end if (tempStatement.length() == 0) { problemStatement = problemStatement.substring(0, problemStatement.length() - 1); } break; } problemStatement += p.text() + " "; } System.out.println("problemSIOC :" + problemSIOC); System.out.println("problemStatement :" + problemStatement); if (problemStatement.length() <= problemSIOC.length()) { //remove problem statement from whole text and remove extra spaces at the beginning and the end problemIOC = problemSIOC.substring(problemStatement.length()).trim(); } else { problemIOC = ""; } System.out.println("problemIOC :" + problemIOC); //keywords for identifying input String decideInput[] = { "Input format :", "Input format:", "Input format", "inputformat:", "inputformat :", "inputformat", "input and output", "input :", "input:", "input" }; //keywords for identifying output String decideOutput[] = { "output format :", "output format:", "Output format", "outputformat:", "outputformat :", "outputformat", "output :", "output:", "output" }; //keywords for identifying constraint String decideConstraint[] = { "constraints:", "constraints :", "constraints", "Constraints :", "constraint:", "constraint :", "constraint", "Contraints :" }; int posin = 0, posoutput = 0, poscon = 0, idxin, idxout, idxcon, flaginput = 0, flagoutput = 0, flagcon = 0, inlen = 0, outlen = 0, conlen = 0; //find inputformat position,length of keyword for (idxin = 0; idxin < decideInput.length; idxin++) { if (StringUtils.containsIgnoreCase(problemIOC, decideInput[idxin])) { posin = problemIOC.toLowerCase().indexOf(decideInput[idxin].toLowerCase()); flaginput = 1; inlen = decideInput[idxin].length(); //decide it is keyowrd for actucal input or it is "sample input" if (StringUtils.containsIgnoreCase(problemIOC, "sample input")) { 
if (posin > problemIOC.toLowerCase().indexOf("sample input")) { flaginput = 0; inlen = 0; } else { break; } } else { break; } } } //find outputformat position,length of keyword for (idxout = 0; idxout < decideOutput.length; idxout++) { if (StringUtils.containsIgnoreCase(problemIOC, decideOutput[idxout])) { posoutput = problemIOC.toLowerCase().indexOf(decideOutput[idxout].toLowerCase()); flagoutput = 1; outlen = decideOutput[idxout].length(); break; } } //find constraint position,length of keyword for (idxcon = 0; idxcon < decideConstraint.length; idxcon++) { if (StringUtils.containsIgnoreCase(problemIOC, decideConstraint[idxcon])) { poscon = problemIOC.toLowerCase().indexOf(decideConstraint[idxcon].toLowerCase()); flagcon = 1; conlen = decideConstraint[idxcon].length(); break; } } System.out.println("input " + flaginput + " " + inlen + " " + posin); System.out.println("output " + flagoutput + " " + outlen + " " + posoutput); System.out.println("constraint " + flagcon + " " + conlen + " " + poscon); //retrieve problem input and output if present in problem page //if input format is present if (flaginput == 1) { //if input keyword is "input and output" and contraint is present in problem page if (idxin == 6 && flagcon == 1) { problemInput = problemIOC.substring(inlen, poscon); } //if input keyword is "input and output" and contraint is not present in problem page else if (idxin == 6 && flagcon == 0) { problemInput = problemIOC.substring(inlen); } //if output format and constraint is present else if (flagoutput == 1 && flagcon == 1) { //if constraint is present before input format if (poscon < posin) { problemInput = problemIOC.substring(posin + inlen, posoutput); problemOutput = problemIOC.substring(posoutput + outlen); } //if constraint is present before sample else if (poscon < posoutput) { problemInput = problemIOC.substring(inlen, poscon); problemOutput = problemIOC.substring(posoutput + outlen); } else { problemInput = problemIOC.substring(inlen, posoutput); 
problemOutput = problemIOC.substring(posoutput + outlen, poscon); } } //if constraint is not present else if (flagoutput == 1 && flagcon == 0) { problemInput = problemIOC.substring(inlen, posoutput); problemOutput = problemIOC.substring(posoutput + outlen); } else if (flagoutput == 0 && flagcon == 1) { if (poscon < posin) { problemInput = problemIOC.substring(posin + inlen); } else { problemInput = problemIOC.substring(poscon + conlen, posin); } problemOutput = ""; } else { problemInput = problemIOC.substring(inlen); problemOutput = ""; } } //if input format and output format is not present else { problemInput = ""; problemOutput = ""; } //if constraint is present if (flagcon == 1) { //if constraint is present before input format if (poscon < posin) { problemConstraints = problemIOC.substring(0, posin); } //if constraint is present before output format else if (poscon < posoutput) { problemConstraints = problemIOC.substring(poscon + conlen, posoutput); } else { problemConstraints = problemIOC.substring(poscon + conlen); } } System.out.println("problemInput :" + problemInput); System.out.println("problemOutput :" + problemOutput); System.out.println("problemConstraints :" + problemConstraints); //retrieve problem tags from problem page Element elementtag = doc.getElementsByClass("problem-tags").first().child(1); StringTokenizer st = new StringTokenizer(elementtag.text(), ","); while (st.hasMoreTokens()) { tags.add(st.nextToken().trim()); } //retrieve sample input sample output if present Element elementSIO = doc.getElementsByClass("input-output-container").first(); //if sample input output is present if (elementSIO != null) { //find position of sample output int soutpos = elementSIO.text().indexOf("SAMPLE OUTPUT"); sampleInput = elementSIO.text().substring(12, soutpos); sampleOutput = elementSIO.text().substring(soutpos + 13); System.out.println("Sample input :\n" + sampleInput + "\n\n\n"); System.out.println("Sample Output :\n" + sampleOutput); } else { sampleInput 
= ""; sampleOutput = ""; } //retrieve problem explanation from problem page if present Element elementExplanation = doc.getElementsByClass("standard-margin").first().child(0); if (elementExplanation.text().toLowerCase().contains("explanation")) { problemExplanation = elementExplanation.nextElementSibling().text(); } System.out.println("Explanation :" + problemExplanation); //retrieve timelimit Element elementTL = doc.getElementsByClass("problem-guidelines").first().child(0).child(1); StringTokenizer stTL = new StringTokenizer(elementTL.ownText(), " "); problemTimeLimit = Double.parseDouble(stTL.nextToken()); //System.out.println("problemTimeLimit :"+problemTimeLimit); //set all retrieved information to problem class problem.setProblemUrl(problemUrl); if (problemTitle.length() == 0) { problemTitle = null; } if (problemStatement.length() == 0) { problemStatement = null; } if (problemInput.length() == 0) { problemInput = null; } if (problemOutput.length() == 0) { problemOutput = null; } if (problemExplanation.length() == 0) { problemExplanation = null; } if (problemConstraints.length() == 0) { problemConstraints = null; } problem.setTitle(problemTitle); problem.setProblemUrl(problemUrl); problem.setProblemStatement(problemStatement); problem.setInputFormat(problemInput); problem.setOutputFormat(problemOutput); problem.setTimeLimit(problemTimeLimit); problem.setExplanation(problemExplanation); problem.setConstraints(problemConstraints); //set sample input output to problem class SampleInputOutput sampleInputOutput = new SampleInputOutput(problem, sampleInput, sampleOutput); problem.getSampleInputOutputs().add(sampleInputOutput); //set platform as hackerearth problem.setPlatform(Platform.HackerEarth); for (String strtag : tags) { problem.getTags().add(strtag); } //store in database Session session = null; Transaction transaction = null; try { //start session session = HibernateUtil.getSessionFactory().openSession(); transaction = session.beginTransaction(); //check if 
problem is already stored in database String hql = "FROM Problem p where p.problemUrl = :problem_url"; Problem oldProblem = (Problem) session.createQuery(hql).setString("problem_url", problemUrl) .uniqueResult(); String task; //if problem is present in database if (oldProblem != null) { //update the old problem task = "updated"; //retrieve id of old problem problem.setId(oldProblem.getId()); session.delete(oldProblem); session.flush(); session.save(problem); } else { task = "saved"; session.save(problem); } transaction.commit(); //log the info to console Logger.getLogger(CodeForcesCrawler.class.getName()).log(Level.INFO, "{0} {1}", new Object[] { task, problem.getProblemUrl() }); } catch (HibernateException ee) { if (transaction != null) { transaction.rollback(); } Logger.getLogger(CodeForcesCrawler.class.getName()).log(Level.SEVERE, "Cannot Insert/Update problem into databse: " + problemUrl, e); } finally { //close the session if (session != null) { session.close(); } } } catch (Exception ee) { System.out.println(ee.toString()); } } System.out.println("\n\n\n\ntutorial urls\n\n"); try { for (String tutorialurl : tutorialset) { //System.out.println(tutorialurl+"\n\n"); Response tutorialres = Jsoup.connect(tutorialurl).execute(); Document doc = tutorialres.parse(); Tutorial tutorial = new Tutorial(); tutorial.setContent(doc.getElementsByClass("tutorial").first().text()); tutorial.setName(baseUrl); tutorialurl = tutorialurl.substring(0, tutorialurl.length() - 10); StringTokenizer tutorialtok = new StringTokenizer(tutorialurl, "/"); String tempstr = ""; while (tutorialtok.hasMoreTokens()) { tempstr = tutorialtok.nextToken(); } Session session = null; Transaction transaction = null; try { //start session session = HibernateUtil.getSessionFactory().openSession(); transaction = session.beginTransaction(); //check if problem is already stored in database String hql = "FROM Tutorial p where p.name = :name"; Tutorial oldProblem = (Tutorial) 
session.createQuery(hql).setString("name", tempstr) .uniqueResult(); String task; //if problem is present in database if (oldProblem != null) { //update the old problem task = "updated"; //retrieve id of old problem tutorial.setName(oldProblem.getName()); session.delete(oldProblem); session.flush(); session.save(tutorial); } else { task = "saved"; tutorial.setName(tempstr); session.save(tutorial); } transaction.commit(); //log the info to console Logger.getLogger(CodeForcesCrawler.class.getName()).log(Level.INFO, "{0} {1}", new Object[] { task, tutorial.getName() }); } catch (HibernateException ee) { if (transaction != null) { transaction.rollback(); } Logger.getLogger(CodeForcesCrawler.class.getName()).log(Level.SEVERE, "Cannot Insert/Update problem into databse: " + tempstr, ee); } finally { //close the session if (session != null) { session.close(); } } } } catch (Exception e) { System.out.println(e.getMessage()); } }
From source file:com.google.android.gms.example.bannerexample.CreateFile.java
public ArrayList<String> parseHTML3(String data, String tag) { String title = null;//from w ww .j a va 2 s .c o m Document doc = Jsoup.parse(data); Elements elements = doc.getElementsByTag("div"); String value = null; for (int i = 0; i < elements.size(); i++) { value = elements.get(i).id(); dataList3.add(value + ".txt"); } return dataList3; }