List of usage examples for org.jsoup.nodes Element html
public String html()
From source file:module.entities.NameFinder.RegexNameFinder.java
/** * @param args the command line arguments *//* w ww .j av a 2 s . co m*/ public static void main(String[] args) throws SQLException, IOException { if (args.length == 1) { Config.configFile = args[0]; } long lStartTime = System.currentTimeMillis(); Timestamp startTime = new Timestamp(lStartTime); System.out.println("Regex Name Finder process started at: " + startTime); DB.initPostgres(); regexerId = DB.LogRegexFinder(lStartTime); initLexicons(); JSONObject obj = new JSONObject(); TreeMap<Integer, String> consultations = DB.getDemocracitConsultationBody(); Document doc; int count = 0; TreeMap<Integer, String> consFoundNames = new TreeMap<>(); TreeMap<Integer, String> consFoundRoles = new TreeMap<>(); for (int consId : consultations.keySet()) { String consBody = consultations.get(consId); String signName = "", roleName = ""; doc = Jsoup.parse(consBody); Elements allPars = new Elements(); Elements paragraphs = doc.select("p"); for (Element par : paragraphs) { if (par.html().contains("<br>")) { String out = "<p>" + par.html().replaceAll("<br>", "</p><p>") + "</p>"; Document internal_doc = Jsoup.parse(out); Elements subparagraphs = internal_doc.select("p"); allPars.addAll(subparagraphs); } else { allPars.add(par); } // System.out.println(formatedText); } String signature = getSignatureFromParagraphs(allPars); // System.out.println(signature); if (signature.contains("#")) { String[] sign_tokens = signature.split("#"); signName = sign_tokens[0]; if (sign_tokens.length > 1) { roleName = sign_tokens[1]; } consFoundNames.put(consId, signName.trim()); consFoundRoles.put(consId, roleName.trim()); count++; } else { System.err.println("--" + consId); } // } DB.insertDemocracitConsultationMinister(consFoundNames, consFoundRoles); TreeMap<Integer, String> consultationsCompletedText = DB.getDemocracitCompletedConsultationBody(); Document doc2; TreeMap<Integer, String> complConsFoundNames = new TreeMap<>(); int count2 = 0; for (int consId : consultationsCompletedText.keySet()) { String consBody = consultationsCompletedText.get(consId); String signName = "", roleName = ""; doc2 = Jsoup.parse(consBody); // if (doc.text().contains("<br>")) { // doc.text().replaceAll("(<[Bb][Rr]>)+", "<p>"); // } Elements allPars = new Elements(); Elements paragraphs = doc2.select("p"); for (Element par : paragraphs) { if (par.html().contains("<br>")) { String out = "<p>" + par.html().replaceAll("<br>", "</p><p>") + "</p>"; Document internal_doc = Jsoup.parse(out); Elements subparagraphs = internal_doc.select("p"); allPars.addAll(subparagraphs); } else { allPars.add(par); } } String signature = getSignatureFromParagraphs(allPars); if (signature.contains("#")) { String[] sign_tokens = signature.split("#"); signName = sign_tokens[0]; if (sign_tokens.length > 1) { roleName = sign_tokens[1]; } consFoundNames.put(consId, signName.trim()); consFoundRoles.put(consId, roleName.trim()); // System.out.println(consId); // System.out.println(signName.trim()); // System.out.println("***************"); count2++; } else { System.err.println("++" + consId); } } DB.insertDemocracitConsultationMinister(complConsFoundNames, consFoundRoles); long lEndTime = System.currentTimeMillis(); System.out.println("Regex Name Finder process finished at: " + startTime); obj.put("message", "Regex Name Finder finished with no errors"); obj.put("details", ""); DB.UpdateLogRegexFinder(lEndTime, regexerId, obj); DB.close(); }
From source file:com.mycollab.core.utils.StringUtils.java
/** * @param value// w ww . j ava2 s .c o m * @return */ public static String formatRichText(String value) { if (isBlank(value)) { return ""; } value = Jsoup.clean(value, relaxed().addTags("img") .addAttributes("img", "align", "alt", "height", "src", "title", "width", "style") .addProtocols("img", "src", "http", "https")); Document doc = Jsoup.parse(value); Element body = doc.body(); replaceHtml(body); String html = body.html(); return html.replace("\n", ""); }
From source file:com.geecko.QuickLyric.lyrics.LyricWiki.java
public static Lyrics fromURL(String url, String artist, String song) { if (url.endsWith("action=edit")) { return new Lyrics(NO_RESULT); }//from w w w .ja v a2 s .co m String text; try { //url = URLDecoder.decode(url, "utf-8"); Document lyricsPage = Jsoup.connect(url).get(); Element lyricbox = lyricsPage.select("div.lyricBox").get(0); lyricbox.after(lyricbox.childNode(0)); String lyricsHtml = lyricbox.html(); text = lyricsHtml.substring(0, lyricsHtml.indexOf("<!--")).replaceAll("<.*?>", "").replaceAll("\n", "<br />"); if (text.contains("&#")) text = Parser.unescapeEntities(text, true); } catch (IndexOutOfBoundsException | IOException e) { e.printStackTrace(); return new Lyrics(ERROR); } if (artist == null) artist = url.substring(24).replace("Gracenote:", "").split(":", 2)[0].replace('_', ' '); if (song == null) song = url.substring(24).replace("Gracenote:", "").split(":", 2)[1].replace('_', ' '); try { artist = URLDecoder.decode(artist, "UTF-8"); song = URLDecoder.decode(song, "UTF-8"); } catch (UnsupportedEncodingException e) { e.printStackTrace(); } if (text.contains( "Unfortunately, we are not licensed to display the full lyrics for this song at the moment.") || text.equals("Instrumental <br />")) { Lyrics result = new Lyrics(NEGATIVE_RESULT); result.setArtist(artist); result.setTitle(song); return result; } else if (text.equals("") || text.length() < 3) return new Lyrics(NO_RESULT); else { Lyrics lyrics = new Lyrics(POSITIVE_RESULT); lyrics.setArtist(artist); lyrics.setTitle(song); lyrics.setText(text); lyrics.setSource("LyricsWiki"); lyrics.setURL(url); return lyrics; } }
From source file:io.jari.geenstijl.API.API.java
private static Artikel parseArtikel(Element artikel_el, Context context) throws ParseException { Artikel artikel = new Artikel(); //id//from w ww . j a va2s . c o m artikel.id = Integer.parseInt(artikel_el.attr("id").substring(1)); //summary artikel.summary = artikel_el.select("a.more").first() != null; //titel artikel.titel = artikel_el.select("h1").text(); //plaatje if (PreferenceManager.getDefaultSharedPreferences(context).getBoolean("show_images", true)) { Element plaatje = artikel_el.select("img").first(); if (plaatje != null) { try { String url = plaatje.attr("src"); Log.d(TAG, "Downloading " + url); // artikel.plaatje = Drawable.createFromStream(((java.io.InputStream)new URL(plaatje.attr("src")).getContent()), null); artikel.plaatje = readBytes((InputStream) new URL(plaatje.attr("src")).getContent()); artikel.groot_plaatje = plaatje.hasClass("groot"); if (plaatje.hasAttr("width") && plaatje.hasAttr("height")) if (!plaatje.attr("width").equals("100") || !plaatje.attr("height").equals("100")) artikel.groot_plaatje = true; if (artikel.groot_plaatje) Log.i(TAG, " Done. Big image."); else Log.i(TAG, " Done."); } catch (Exception ex) { Log.w(TAG, "Unable to download image, Falling back... Reason: " + ex.getMessage()); artikel.plaatje = null; } } } //embed if (artikel_el.select("div.embed").first() != null) { //atm alleen support voor iframes Element frame = artikel_el.select("div.embed>iframe").first(); if (frame != null) artikel.embed = frame.attr("src"); } //embed (geenstijl.tv) if (!domain.equals("www.geenstijl.nl")) { //extract url from script Element scriptEl = artikel_el.select("script").first(); if (scriptEl != null) { String script = scriptEl.html(); Pattern pattern = Pattern.compile("'(.*)', fall"); Matcher matcher = pattern.matcher(script); if (matcher.find() && matcher.groupCount() == 1) { artikel.embed = matcher.group(1); } } } //footer shit Element footer = artikel_el.select("footer").first(); SimpleDateFormat simpleDateFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm", Locale.US); artikel.datum = simpleDateFormat.parse(footer.select("time").first().attr("datetime")); StringTokenizer footer_items = new StringTokenizer(footer.text(), "|"); artikel.auteur = footer_items.nextToken().trim(); artikel.reacties = Integer.parseInt(footer.select("a.comments").text().replace(" reacties", "")); artikel.link = footer.select("a").first().attr("href"); //clean up artikel_el.select("h1").remove(); artikel_el.select(".embed").remove(); artikel_el.select("img").remove(); artikel_el.select("footer").remove(); artikel_el.select("a.more").remove(); artikel_el.select("script").remove(); //inhoud artikel.inhoud = artikel_el.html(); return artikel; }
From source file:dsll.pinterest.crawler.Reduce.java
private static Text getPinContent(String url, DBCollection pinsCollection) throws JSONException { Document html = null;/*from w w w .j a v a 2s. c o m*/ JSONObject pin = new JSONObject(); try { html = Jsoup.connect(url).get(); } catch (Exception e) { return new Text("HTTP connection failed..."); } // Gather major pins data Element doc = html.select("body").first(); // Pin ID String id = (url.split("pin/")[1].split("/")[0]); pin.append("ID", id); // Pin image String imageURL = ""; Element tmp = doc.select("div[class=pinImageSourceWrapper]").first(); try { tmp = tmp.select("div[class=imageContainer]").select("img").first(); imageURL = tmp.attr("src"); } catch (Exception e) { } // try{ // ByteArrayOutputStream pimg=new ByteArrayOutputStream(), cimg = new ByteArrayOutputStream(); // for(int i=0; i<3; i++){ // BufferedImage img=dummyImage; // try{ // img = ImageIO.read(new URL(imageURL)); // // }catch(Exception e){} // ImageIO.write(img, "jpg", cimg); // if(pimg.size()<cimg.size()){ // pimg = cimg; // } // } // // save to hdfs // Configuration conf = new Configuration(); // FileSystem fs = FileSystem.get(conf); // Path outFile = new Path("/home/hadoop/"+id+".png"); // FSDataOutputStream out = fs.create(outFile); // out.write(pimg.toByteArray()); // // }catch(Exception e){ // e.printStackTrace(); // } pin.append("image", imageURL); //Pin name tmp = doc.select("h2[itemprop=name]").first(); String name = ""; if (tmp != null) { name = tmp.text().trim(); } pin.append("name", name); // Pin source Element sourceCont = doc.select("div[class=sourceFlagWrapper]").first(); JSONObject source = new JSONObject(); if (sourceCont != null) { String title = sourceCont.text().trim(); String src = sourceCont.select("a").first().attr("href"); source.append("title", title); source.append("src", src); } pin.append("source", source); //pin credit JSONObject pinCredit = new JSONObject(); Element credit = doc.select("div[class=pinCredits]").first(); String creditName = "", creditTitle = "", creditSource = ""; try { creditName = credit.select("div[class=creditName]").text().trim(); } catch (Exception e) { } try { creditTitle = credit.select("div[class=creditTitle]").text().trim(); } catch (Exception e) { } try { creditSource = credit.select("a").attr("href"); } catch (Exception e) { } pinCredit.append("name", creditName); pinCredit.append("title", creditTitle); pinCredit.append("src", creditSource); pin.append("credit", pinCredit); //comments JSONArray comments = new JSONArray(); Elements commentsConts = doc.select("div[class=commenterNameCommentText]"); for (Element commentCont : commentsConts) { JSONObject comment = new JSONObject(); Element creatorEle = commentCont.select("div[class=commenterWrapper] a").first(); String creatorName = creatorEle.text().trim(); String creatorSrc = creatorEle.attr("href"); String content = "", raw = ""; Element commentContent = commentCont.select(".commentDescriptionContent").first(); try { content = commentContent.text().trim(); raw = commentContent.html(); comment.append("creator", creatorName); comment.append("creator_url", creatorSrc); comment.append("content", content); comment.append("content_raw", raw); comments.put(comment); } catch (Exception e) { } } pin.append("comments", comments); //pin board link and related pins Element bottomDoc = doc.select("div[class=Module CloseupSidebar]").first(); //pin board JSONArray board = new JSONArray(); if (bottomDoc != null) { Element boardEle = bottomDoc.select("div[class=boardHeader]").first(); JSONObject b = new JSONObject(); String boardName = ""; try { boardName = boardEle.select("h3[class=title]").text().trim(); } catch (Exception ee) { } String boardSrc = ""; try { boardSrc = "https://www.pinterest.com" + boardEle.select("a").attr("href").trim(); } catch (Exception ee) { } b.append("name", boardName); b.append("src", boardSrc); board.put(b); } pin.append("board", board); //CAUTION: what if a pin shows up in different boards? //related pins bottomDoc = doc .select("div[class=closeupBottom] div[class=Module CloseupBottom] div[class=relatedPinsWrapper]") .first(); JSONArray relatedPins = new JSONArray(); if (bottomDoc != null) { Elements relatedPinsConts = bottomDoc.select("div[class=pinWrapper]"); for (Element relatedPinsCont : relatedPinsConts) { JSONObject relatedPin = new JSONObject(); try { relatedPin.append("src", "https://www.pinterest.com" + relatedPinsCont.select("div[class=pinHolder] > a").attr("href")); } catch (Exception e) { } relatedPins.put(relatedPin); } } pin.append("related_pins", relatedPins); // Optional: push data to database BasicDBObject dbObject = (BasicDBObject) JSON.parse(pin.toString()); pinsCollection.insert(dbObject); return new Text(pin.toString()); }
From source file:com.ferasinfotech.gwreader.ScreenSlidePageFragment.java
/** * Alternate Factory method for this fragment class. Constructs a new fragment for the given page number, * and HTML story element./* w ww .j av a 2s.co m*/ */ public static ScreenSlidePageFragment create(int pageNumber, int numPages, org.jsoup.nodes.Element story) { int story_id = -1; String name = ""; String summary = ""; String headline = ""; String cover_photo_url = ""; String story_string = ""; long createdAt; ScreenSlidePageFragment fragment = new ScreenSlidePageFragment(); Bundle args = new Bundle(); if (pageNumber == 0) { story_id = 0; name = "Grasswire Help"; headline = "Usage Instructions"; cover_photo_url = "android.resource://com.ferasinfotech.gwreader/" + R.drawable.gw_logo; summary = "Swipe right and left to read each story.\n\n" + "Scroll down to read facts and associated news items (tweets and links) for each story.\n\n" + "Tap on a news items within a story and you'll be able to follow web links, view tweets via the Twitter app, or watch videos.\n\n" + "A long press on a story's cover photo will launch the device browser to view or edit the story on the Grasswire mobile site.\n\n" + "A long press on the image above will launch the Grasswire main page.\n\n" + "App Version: " + BuildConfig.VERSION_NAME + "\n\n"; } else { // doing a story page, Element 'story' is the story data Elements e_list; org.jsoup.nodes.Element tag; story_id = Integer.valueOf(story.attr("data-story-id")); e_list = story.getElementsByClass("feature__tag"); tag = e_list.get(0); name = tag.text() + " (" + pageNumber + "/" + numPages + ")"; e_list = story.getElementsByClass("story__summary"); tag = e_list.get(0); summary = tag.html().replace("<br />", "\r"); e_list = story.getElementsByClass("feature__text"); tag = e_list.get(0); headline = tag.text(); e_list = story.getElementsByClass("feature__image"); tag = e_list.get(0); cover_photo_url = tag.attr("src"); story_string = story.toString(); } args.putInt(ARG_PAGE, pageNumber); args.putInt(ARG_STORY_ID, story_id); args.putString(ARG_TITLE, name); args.putString(ARG_SUMMARY, summary); args.putString(ARG_HEADLINE, headline); args.putString(ARG_COVER_PHOTO, cover_photo_url); args.putString(ARG_STORY_STRING, "<html><head></head><body>" + story_string + "</body></html>"); fragment.setArguments(args); return fragment; }
From source file:com.kantenkugel.discordbot.jdocparser.JDocParser.java
private static Element getSingleElementByQuery(Element root, String query) { Elements elementsByQuery = root.select(query); if (elementsByQuery.size() > 1) { String error = "Found " + elementsByQuery.size() + " elements matching query \"" + query + "\" inside of " + root.tagName() + "-" + root.className(); throw new RuntimeException(error + root.html()); }//from w ww. j av a2 s. c om return elementsByQuery.first(); }
From source file:com.nineash.hutsync.client.NetworkUtilities.java
/** * Perform 2-way sync with the server-side contacts. We send a request that * includes all the locally-dirty contacts so that the server can process * those changes, and we receive (and return) a list of contacts that were * updated on the server-side that need to be updated locally. * * @param account The account being synced * @param authtoken The authtoken stored in the AccountManager for this * account// w ww. java2 s .c o m * @param serverSyncState A token returned from the server on the last sync * @param dirtyContacts A list of the contacts to send to the server * @return A list of contacts that we need to update locally */ public static void syncCalendar(Context context, Account account, String authtoken, long serverSyncState) throws JSONException, ParseException, IOException, AuthenticationException { ArrayList<SerializableCookie> myCookies; CookieStore cookieStore = new BasicCookieStore(); DefaultHttpClient hClient = getHttpClient(context); mContentResolver = context.getContentResolver(); final String[] weeknames = { "rota_this_week", "rota_next_week" }; long calendar_id = getCalendar(account); if (calendar_id == -1) { Log.e("CalendarSyncAdapter", "Unable to create HutSync event calendar"); return; } try { myCookies = (ArrayList<SerializableCookie>) fromString(authtoken); } catch (final IOException e) { Log.e(TAG, "IOException when expanding authtoken", e); return; } catch (final ClassNotFoundException e) { Log.e(TAG, "ClassNotFoundException when expanding authtoken", e); return; } for (SerializableCookie cur_cookie : myCookies) { cookieStore.addCookie(cur_cookie.getCookie()); } hClient.setCookieStore(cookieStore); Log.i(TAG, "Syncing to: " + SYNC_CONTACTS_URI); HttpGet httpget = new HttpGet(SYNC_CONTACTS_URI); final HttpResponse resp = hClient.execute(httpget); final String response = EntityUtils.toString(resp.getEntity()); HashMap<Long, SyncEntry> localEvents = new HashMap<Long, SyncEntry>(); ArrayList<Event> events = new ArrayList<Event>(); Pattern p = Pattern.compile("background-color:(#[[a-f][A-F][0-9]]{6})"); Pattern ps = Pattern .compile(".calendar-key span.(\\S+) \\{ background-color:(#[[a-f][A-F][0-9]]{6}); color:#fff; \\}"); if (resp.getStatusLine().getStatusCode() == HttpStatus.SC_OK) { //check we are still logged in //if (resp.getStatusLine().getStatusCode() == HttpStatus.SC_UNAUTHORIZED) { // Log.e(TAG, "Authentication exception in sending dirty contacts"); // throw new AuthenticationException(); //} //if we are logged in Map<String, String> shift_types = new HashMap<String, String>(); int length = weeknames.length; Document doc = Jsoup.parse(response); String full_name = doc.select("a[href*=" + account.name + "/profile]").first().text(); AccountManager mAccountManager = AccountManager.get(context); Account[] the_accounts = mAccountManager.getAccountsByType(Constants.ACCOUNT_TYPE); boolean multiple_accounts = (the_accounts.length > 1); Elements the_styles = doc.select("style"); for (Element the_style : the_styles) { String st_txt = the_style.html(); Matcher ms = ps.matcher(st_txt); while (ms.find()) { // Find each match in turn; String can't do this. String cname = ms.group(1); // Access a submatch group; String can't do this. String ccol = ms.group(2); String rname = doc.select("span." + cname).first().text(); Log.i(TAG, "LOOK: " + cname + ", " + ccol + ", " + rname); shift_types.put(ccol, rname); } } for (int w = 0; w < weeknames.length; w++) { Elements the_dates = doc.select("div.homepage div.accord-content table[id=" + weeknames[w] + "] tr.heading th:not(.skipStyles)"); //for (Element hidden : the_dates) { //0 is Mon, 6 is Sun Element the_date = the_dates.first(); //figure out the year for the Monday. String str_v = the_date.text(); String[] str_sub = str_v.split(" "); str_sub[1] = str_sub[1].trim(); String[] date_split = str_sub[1].split("/"); Calendar c = Calendar.getInstance(); int this_month = c.get(Calendar.MONTH) + 1; int monday_month = Integer.parseInt(date_split[1]); int this_year = c.get(Calendar.YEAR); int monday_year = this_year; if (this_month > monday_month) { monday_year++; } else if (this_month < monday_month) { monday_year--; } SimpleDateFormat format = new SimpleDateFormat("dd/MM/yyyy"); Date date = new Date(); if (str_v != null && !str_v.isEmpty()) { String this_date = str_sub[1] + "/" + monday_year; //we need to figure out the year - sometimes its next year try { date = format.parse(this_date); } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } Log.i(TAG, "Dates: " + this_date + " - " + date); } //} for (int i = 1; i < 8; ++i) { //1 is monday, 7 is sunday Elements hiddens = doc.select("div.homepage div.accord-content table[id=" + weeknames[w] + "] td:eq(" + Integer.toString(i) + "):not(.skipStyles) div.timeElem"); int add_days = i - 1; for (Element hidden : hiddens) { String str = hidden.text(); if (str != null && !str.isEmpty()) { String style = hidden.attr("style"); String bg_col = ""; Matcher m = p.matcher(style); if (m.find()) { bg_col = m.group(1); // Access a submatch group; String can't do this. } Log.i(TAG, "Time: " + str + "(" + bg_col + ")"); String ev_description = ""; //Location too? if (multiple_accounts) ev_description += full_name + "\n\n"; String[] times = str.split(" - "); String[] start_time = times[0].split(":"); String[] end_time = times[1].split(":"); int add_start_hours = Integer.parseInt(start_time[0]); int add_start_minutes = Integer.parseInt(start_time[1]); int add_finish_hours = Integer.parseInt(end_time[0]); int add_finish_minutes = Integer.parseInt(end_time[1]); String ev_shiftType = ""; if (bg_col != null && !bg_col.isEmpty()) { ev_shiftType = (String) shift_types.get(bg_col); } else { ev_shiftType = "Other"; } String ev_title = ev_shiftType + " Shift"; c.setTime(date); c.add(Calendar.DATE, add_days); c.add(Calendar.HOUR_OF_DAY, add_start_hours); c.add(Calendar.MINUTE, add_start_minutes); Date startDate = c.getTime(); long ev_id = startDate.getTime(); c.setTime(date); c.add(Calendar.DATE, add_days); if (add_finish_hours < add_start_hours) { //shift rolls to next day c.add(Calendar.HOUR_OF_DAY, 24); ev_description += "Shift finishes at " + times[1] + " on the next day\n\n"; } else { c.add(Calendar.HOUR_OF_DAY, add_finish_hours); c.add(Calendar.MINUTE, add_finish_minutes); } Date endDate = c.getTime(); Event ev = new Event(ev_id, ev_title, startDate, endDate, ev_description, ev_shiftType); events.add(ev); Log.i(TAG, "Event: " + ev); } } } } //next merge adjacent shifts SimpleDateFormat timeFormat = new SimpleDateFormat("HH:mm"); Event prev_event = null; for (Iterator<Event> it = events.iterator(); it.hasNext();) { Event cur_event = it.next(); if (prev_event != null) { if (prev_event.getEndDate().compareTo(cur_event.getStartDate()) == 0) { prev_event.setDescription(prev_event.getDescription() + "Merged consecutive shifts:\n" + timeFormat.format(prev_event.getStartDate()) + " to " + timeFormat.format(prev_event.getEndDate()) + " (" + prev_event.getShiftType() + ")\n" + timeFormat.format(cur_event.getStartDate()) + " to " + timeFormat.format(cur_event.getEndDate()) + " (" + cur_event.getShiftType() + ")\n\n"); prev_event.setEndDate(cur_event.getEndDate()); //TODO: only merge if other + FOH/BOH, note times in new description it.remove(); } } prev_event = cur_event; } //next, load local events Cursor c1 = mContentResolver.query( Events.CONTENT_URI.buildUpon().appendQueryParameter(Events.ACCOUNT_NAME, account.name) .appendQueryParameter(Events.ACCOUNT_TYPE, account.type).build(), new String[] { Events._ID, Events._SYNC_ID }, Events.CALENDAR_ID + "=?", new String[] { String.valueOf(calendar_id) }, null); while (c1 != null && c1.moveToNext()) { //if(is_full_sync) { // deleteEvent(context, account, c1.getLong(0)); //} else { SyncEntry entry = new SyncEntry(); entry.raw_id = c1.getLong(0); localEvents.put(c1.getLong(1), entry); //} } c1.close(); try { ArrayList<ContentProviderOperation> operationList = new ArrayList<ContentProviderOperation>(); for (Event event : events) { if (localEvents.containsKey(Long.valueOf(event.getId()))) { SyncEntry entry = localEvents.get(Long.valueOf(event.getId())); operationList.add(updateEvent(calendar_id, account, event, entry.raw_id)); } else { operationList.add(updateEvent(calendar_id, account, event, -1)); } if (operationList.size() >= 50) { try { mContentResolver.applyBatch(CalendarContract.AUTHORITY, operationList); } catch (Exception e) { e.printStackTrace(); } operationList.clear(); } } if (operationList.size() > 0) { try { mContentResolver.applyBatch(CalendarContract.AUTHORITY, operationList); } catch (Exception e) { e.printStackTrace(); } } } catch (Exception e1) { // TODO Auto-generated catch block e1.printStackTrace(); return; } } else { Log.e(TAG, "Server error in sending dirty contacts: " + resp.getStatusLine()); throw new IOException(); } }
From source file:io.seldon.importer.articles.FileItemAttributesImporter.java
public static Map<String, String> getAttributes(String url, String existingCategory) { ItemProcessResult itemProcessResult = new ItemProcessResult(); itemProcessResult.client_item_id = url; itemProcessResult.extraction_status = "EXTRACTION_FAILED"; logger.info("Trying to get attributes for " + url); Map<String, String> attributes = null; String title = ""; String category = ""; String subCategory = ""; String img_url = ""; String description = ""; String tags = ""; String leadtext = ""; String link = ""; String publishDate = ""; String domain = ""; try {/*from w w w .j a va 2 s . c o m*/ long now = System.currentTimeMillis(); long timeSinceLastRequest = now - lastUrlFetchTime; if (timeSinceLastRequest < minFetchGapMsecs) { long timeToSleep = minFetchGapMsecs - timeSinceLastRequest; logger.info( "Sleeping " + timeToSleep + "msecs as time since last fetch is " + timeSinceLastRequest); Thread.sleep(timeToSleep); } Document articleDoc = Jsoup.connect(url).userAgent("SeldonBot/1.0").timeout(httpGetTimeout).get(); lastUrlFetchTime = System.currentTimeMillis(); //get IMAGE URL if (StringUtils.isNotBlank(imageCssSelector)) { Element imageElement = articleDoc.select(imageCssSelector).first(); if (imageElement != null) { if (imageElement.attr("content") != null) { img_url = imageElement.attr("content"); } if (StringUtils.isBlank(img_url) && imageElement.attr("src") != null) { img_url = imageElement.attr("src"); } if (StringUtils.isBlank(img_url) && imageElement.attr("href") != null) { img_url = imageElement.attr("href"); } } } if (StringUtils.isBlank(img_url) && StringUtils.isNotBlank(defImageUrl)) { logger.info("Setting image to default: " + defImageUrl); img_url = defImageUrl; } img_url = StringUtils.strip(img_url); //get TITLE if (StringUtils.isNotBlank(titleCssSelector)) { Element titleElement = articleDoc.select(titleCssSelector).first(); if (titleElement != null && titleElement.attr("content") != null) { title = titleElement.attr("content"); } } //get Lead Text if (StringUtils.isNotBlank(leadTextCssSelector)) { Element leadElement = articleDoc.select(leadTextCssSelector).first(); if (leadElement != null && leadElement.attr("content") != null) { leadtext = leadElement.attr("content"); } } //get publish date if (StringUtils.isNotBlank(publishDateCssSelector)) { //2013-01-21T10:40:55Z Element pubElement = articleDoc.select(publishDateCssSelector).first(); if (pubElement != null && pubElement.attr("content") != null) { String pubtext = pubElement.attr("content"); SimpleDateFormat dateFormatter = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); DateFormat df = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss", Locale.ENGLISH); Date result = null; try { result = df.parse(pubtext); } catch (ParseException e) { logger.info("Failed to parse date withUTC format " + pubtext); } //try a simpler format df = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.ENGLISH); try { result = df.parse(pubtext); } catch (ParseException e) { logger.info("Failed to parse date " + pubtext); } if (result != null) publishDate = dateFormatter.format(result); else logger.error("Failed to parse date " + pubtext); } } //get Link if (StringUtils.isNotBlank(linkCssSelector)) { Element linkElement = articleDoc.select(linkCssSelector).first(); if (linkElement != null && linkElement.attr("content") != null) { link = linkElement.attr("content"); } } //get CONTENT if (StringUtils.isNotBlank(textCssSelector)) { Element descriptionElement = articleDoc.select(textCssSelector).first(); if (descriptionElement != null) description = Jsoup.parse(descriptionElement.html()).text(); } //get TAGS Set<String> tagSet = AttributesImporterUtils.getTags(articleDoc, tagsCssSelector, title); if (tagSet.size() > 0) tags = CollectionTools.join(tagSet, ","); //get CATEGORY - client specific if (StringUtils.isNotBlank(categoryCssSelector)) { Element categoryElement = articleDoc.select(categoryCssSelector).first(); if (categoryElement != null && categoryElement.attr("content") != null) { category = categoryElement.attr("content"); if (StringUtils.isNotBlank(category)) category = category.toUpperCase(); } } else if (StringUtils.isNotBlank(categoryClassPrefix)) { String className = "io.seldon.importer.articles.category." + categoryClassPrefix + "CategoryExtractor"; Class<?> clazz = Class.forName(className); Constructor<?> ctor = clazz.getConstructor(); CategoryExtractor extractor = (CategoryExtractor) ctor.newInstance(); category = extractor.getCategory(url, articleDoc); } //get Sub CATEGORY - client specific if (StringUtils.isNotBlank(subCategoryCssSelector)) { Element subCategoryElement = articleDoc.select(subCategoryCssSelector).first(); if (subCategoryElement != null && subCategoryElement.attr("content") != null) { subCategory = subCategoryElement.attr("content"); if (StringUtils.isNotBlank(subCategory)) subCategory = category.toUpperCase(); } } else if (StringUtils.isNotBlank(subCategoryClassPrefix)) { String className = "io.seldon.importer.articles.category." + subCategoryClassPrefix + "SubCategoryExtractor"; Class<?> clazz = Class.forName(className); Constructor<?> ctor = clazz.getConstructor(); CategoryExtractor extractor = (CategoryExtractor) ctor.newInstance(); subCategory = extractor.getCategory(url, articleDoc); } // Get domain if (domainIsNeeded) { domain = getDomain(url); } if (StringUtils.isNotBlank(title) && (imageNotNeeded || StringUtils.isNotBlank(img_url)) && (categoryNotNeeded || StringUtils.isNotBlank(category)) && (!domainIsNeeded || StringUtils.isNotBlank(domain))) { attributes = new HashMap<String, String>(); attributes.put(TITLE, title); if (StringUtils.isNotBlank(category)) attributes.put(CATEGORY, category); if (StringUtils.isNotBlank(subCategory)) attributes.put(SUBCATEGORY, subCategory); if (StringUtils.isNotBlank(link)) attributes.put(LINK, link); if (StringUtils.isNotBlank(leadtext)) attributes.put(LEAD_TEXT, leadtext); if (StringUtils.isNotBlank(img_url)) attributes.put(IMG_URL, img_url); if (StringUtils.isNotBlank(tags)) attributes.put(TAGS, tags); attributes.put(CONTENT_TYPE, VERIFIED_CONTENT_TYPE); if (StringUtils.isNotBlank(description)) attributes.put(DESCRIPTION, description); if (StringUtils.isNotBlank(publishDate)) attributes.put(PUBLISH_DATE, publishDate); if (StringUtils.isNotBlank(domain)) attributes.put(DOMAIN, domain); System.out.println("Item: " + url + "; Category: " + category); itemProcessResult.extraction_status = "EXTRACTION_SUCCEEDED"; } else { logger.warn("Failed to get title for article " + url); logger.warn("[title=" + title + ", img_url=" + img_url + ", category=" + category + ", domain=" + domain + "]"); } { // check for failures for the log result if (StringUtils.isBlank(title)) { itemProcessResult.attrib_failure_list = itemProcessResult.attrib_failure_list + ((StringUtils.isBlank(itemProcessResult.attrib_failure_list)) ? "" : ",") + "title"; } if (!imageNotNeeded && StringUtils.isBlank(img_url)) { itemProcessResult.attrib_failure_list = itemProcessResult.attrib_failure_list + ((StringUtils.isBlank(itemProcessResult.attrib_failure_list)) ? "" : ",") + "img_url"; } if (!categoryNotNeeded && StringUtils.isBlank(category)) { itemProcessResult.attrib_failure_list = itemProcessResult.attrib_failure_list + ((StringUtils.isBlank(itemProcessResult.attrib_failure_list)) ? "" : ",") + "category"; } } } catch (Exception e) { logger.warn("Article: " + url + ". Attributes import FAILED", e); itemProcessResult.error = e.toString(); } AttributesImporterUtils.logResult(logger, itemProcessResult); return attributes; }
From source file:com.kantenkugel.discordbot.jdocparser.JDocParser.java
private static Element getSingleElementByClass(Element root, String className) { Elements elementsByClass = root.getElementsByClass(className); if (elementsByClass.size() != 1) { String error = "Found " + elementsByClass.size() + " elements with class " + className + " inside of " + root.tagName() + "-" + root.className(); throw new RuntimeException(error + root.html()); }// w w w. jav a 2 s. com return elementsByClass.first(); }